def bm25_vectorizer(data):
    """Fit a BM25-weighted vectorizer on *data*.

    Returns
    -------
    tuple
        ``(vectorizer, X)`` — the fitted vectorizer and the
        BM25-transformed term-document matrix of *data*.
    """
    # Start from a plain TfidfVectorizer, then swap its internal
    # transformer for a BM25 one before fitting.  NOTE(review): this
    # relies on sklearn's private ``_tfidf`` attribute — confirm it is
    # still honored by the sklearn version pinned for this project.
    vec = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        preprocessor=stif_classifier_dataset.preprocessor,
    )
    vec._tfidf = bm25_tfidf.BM25Transformer(use_idf=True, k1=1.5, b=0.75)
    matrix = vec.fit_transform(data)
    return vec, matrix
def from_path(cls, path, **shared):
    """Load a vectorizer previously persisted to *path*.

    Reads ``<path>/vectorizer.json``, rebuilds the wrapper object via
    ``cls(config, **shared)`` and, when the serialized payload contains a
    fitted sklearn vectorizer, reconstructs it (vocabulary + idf diagonal)
    through sklearn's private attributes.

    NOTE(review): written as a ``cls``-first alternate constructor —
    presumably decorated with ``@classmethod`` at the definition site;
    confirm.

    :param path: directory containing ``vectorizer.json``
    :param shared: extra keyword arguments forwarded to ``cls``
    :raises LoadingError: if ``vectorizer.json`` is missing
    :return: the reconstructed vectorizer instance
    """
    # Local imports keep numpy/scipy/sklearn off the module import path.
    import numpy as np
    import scipy.sparse as sp
    from sklearn.feature_extraction.text import (TfidfTransformer, TfidfVectorizer as SklearnTfidfVectorizer)
    path = Path(path)
    model_path = path / "vectorizer.json"
    if not model_path.exists():
        raise LoadingError("Missing vectorizer model file: %s" % model_path.name)
    with model_path.open("r", encoding="utf-8") as f:
        vectorizer_dict = json.load(f)
    # Rebuild the wrapper from its persisted config, then restore the
    # attributes the serializer flattened out.
    vectorizer = cls(vectorizer_dict["config"], **shared)
    vectorizer._language = vectorizer_dict["language_code"]
    builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
    if builtin_entity_scope is not None:
        # Persisted as a JSON list; the live object uses a set.
        builtin_entity_scope = set(builtin_entity_scope)
    vectorizer.builtin_entity_scope = builtin_entity_scope
    vectorizer_ = vectorizer_dict["vectorizer"]
    # A falsy payload means the vectorizer was never fitted — leave the
    # underlying sklearn object unset in that case.
    if vectorizer_:
        vocab = vectorizer_["vocab"]
        idf_diag_data = vectorizer_["idf_diag"]
        idf_diag_data = np.array(idf_diag_data)
        # The idf vector was stored flat; sklearn keeps it as a square
        # diagonal sparse matrix (``_idf_diag``), so rebuild that shape.
        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        idf_diag = sp.csr_matrix((idf_diag_data, (row, col)), shape=idf_diag_shape)
        tfidf_transformer = TfidfTransformer()
        # NOTE(review): ``_idf_diag``, ``vocabulary_`` and ``_tfidf`` are
        # sklearn private/fitted attributes — this bypasses ``fit`` and is
        # tied to a specific sklearn version; verify against the pinned one.
        tfidf_transformer._idf_diag = idf_diag
        vectorizer_ = SklearnTfidfVectorizer(tokenizer=lambda x: tokenize_light(x, vectorizer._language))
        vectorizer_.vocabulary_ = vocab
        vectorizer_._tfidf = tfidf_transformer
        vectorizer._tfidf_vectorizer = vectorizer_
    return vectorizer
def bm25_vectorizer(data):
    """Build and fit a BM25 vectorizer over *data*.

    NOTE(review): this definition duplicates an earlier
    ``bm25_vectorizer`` in this file (whitespace aside) — consider
    keeping only one.

    Returns the fitted vectorizer together with the transformed matrix.
    """
    tfidf = TfidfVectorizer(sublinear_tf=True,
                            max_df=0.5,
                            preprocessor=stif_classifier_dataset.preprocessor)
    # Replace the default tf-idf transformer with BM25 weighting
    # (uses the private ``_tfidf`` slot of sklearn's TfidfVectorizer).
    bm25 = bm25_tfidf.BM25Transformer(use_idf=True, k1=1.5, b=0.75)
    tfidf._tfidf = bm25
    transformed = tfidf.fit_transform(data)
    return tfidf, transformed