示例#1
0
def bm25_vectorizer(data):
    """Fit a BM25-weighted text vectorizer on *data*.

    A ``TfidfVectorizer`` is created with ``sublinear_tf=True``,
    ``max_df=0.5`` and ``stif_classifier_dataset.preprocessor``. Its private
    ``_tfidf`` attribute is then replaced by a
    ``bm25_tfidf.BM25Transformer(use_idf=True, k1=1.5, b=0.75)`` before the
    vectorizer is fitted.

    Args:
        data: Passed unchanged to ``fit_transform`` (presumably an iterable
            of raw text documents -- confirm against callers).

    Returns:
        A ``(vectorizer, X)`` tuple: the vectorizer that was fitted and the
        value returned by its ``fit_transform(data)`` call.
    """
    vectorizer_options = {
        "sublinear_tf": True,
        "max_df": 0.5,
        "preprocessor": stif_classifier_dataset.preprocessor,
    }
    bm25_options = {"use_idf": True, "k1": 1.5, "b": 0.75}

    tfidf = TfidfVectorizer(**vectorizer_options)
    # NOTE(review): this relies on the private ``_tfidf`` attribute still
    # being the transformer that ``fit_transform`` uses -- confirm for the
    # installed scikit-learn version.
    tfidf._tfidf = bm25_tfidf.BM25Transformer(**bm25_options)
    features = tfidf.fit_transform(data)
    return tfidf, features
    def from_path(cls, path, **shared):
        """Rebuild a vectorizer from the ``vectorizer.json`` file under *path*.

        The JSON document must provide the keys ``"config"``,
        ``"language_code"``, ``"builtin_entity_scope"`` and ``"vectorizer"``.
        When ``"vectorizer"`` is truthy it must in turn provide ``"vocab"``
        and ``"idf_diag"``, from which a scikit-learn TF-IDF vectorizer is
        reassembled without calling ``fit``.

        Args:
            path: Directory holding ``vectorizer.json``; it is wrapped in
                ``Path`` before use.
            **shared: Extra keyword arguments forwarded unchanged to ``cls``.

        Returns:
            The ``cls`` instance built from the stored config, with
            ``_language``, ``builtin_entity_scope`` and ``_tfidf_vectorizer``
            set from the file contents.

        Raises:
            LoadingError: If ``vectorizer.json`` does not exist under *path*.
        """
        import numpy as np
        import scipy.sparse as sp
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.feature_extraction.text import (
            TfidfVectorizer as SklearnTfidfVectorizer)

        json_file = Path(path) / "vectorizer.json"
        if not json_file.exists():
            raise LoadingError("Missing vectorizer model file: %s" %
                               json_file.name)
        with json_file.open("r", encoding="utf-8") as stream:
            payload = json.load(stream)

        restored = cls(payload["config"], **shared)
        restored._language = payload["language_code"]

        # A stored scope of None stays None; anything else becomes a set.
        scope = payload["builtin_entity_scope"]
        restored.builtin_entity_scope = None if scope is None else set(scope)

        sklearn_state = payload["vectorizer"]
        if not sklearn_state:
            # No usable state was stored: pass the falsy value through as-is.
            restored._tfidf_vectorizer = sklearn_state
            return restored

        vocabulary = sklearn_state["vocab"]
        idf_values = np.array(sklearn_state["idf_diag"])

        # Place the stored IDF values on the diagonal of an (n, n) CSR matrix.
        size = len(idf_values)
        diagonal = list(range(size))
        idf_matrix = sp.csr_matrix((idf_values, (diagonal, diagonal)),
                                   shape=(size, size))

        # NOTE(review): ``_idf_diag`` is a private scikit-learn attribute --
        # confirm it is still honoured by the installed version.
        transformer = TfidfTransformer()
        transformer._idf_diag = idf_matrix

        # The tokenizer reads ``_language`` from the restored instance each
        # time it is called.
        sklearn_vectorizer = SklearnTfidfVectorizer(
            tokenizer=lambda text: tokenize_light(text, restored._language))
        sklearn_vectorizer.vocabulary_ = vocabulary
        sklearn_vectorizer._tfidf = transformer

        restored._tfidf_vectorizer = sklearn_vectorizer
        return restored
def bm25_vectorizer(data):
    """Build a ``TfidfVectorizer`` that weights with BM25 and fit it on *data*.

    The vectorizer is configured with ``sublinear_tf=True``, ``max_df=0.5``
    and ``stif_classifier_dataset.preprocessor``. Before fitting, its private
    ``_tfidf`` attribute is overwritten with a
    ``bm25_tfidf.BM25Transformer`` using ``use_idf=True``, ``k1=1.5`` and
    ``b=0.75``.

    Args:
        data: Handed directly to ``fit_transform`` (presumably a collection
            of text documents -- confirm against callers).

    Returns:
        A 2-tuple of the vectorizer and the result of calling its
        ``fit_transform`` on *data*.
    """
    model = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        preprocessor=stif_classifier_dataset.preprocessor,
    )
    # NOTE(review): overriding ``_tfidf`` depends on scikit-learn internals;
    # verify the replacement is the transformer actually used during fitting.
    model._tfidf = bm25_tfidf.BM25Transformer(
        use_idf=True,
        k1=1.5,
        b=0.75,
    )
    return model, model.fit_transform(data)