def vectorizer(txt):
    """Create a ``MarisaTfidfVectorizer`` and fit it on ``txt``.

    The vectorizer is configured with n-gram range (1, 2), a minimum
    document frequency of 2, at most 1,000,000 features, no stop-word
    list, smoothed IDF, sublinear TF and L2 normalisation.

    NOTE(review): the keyword meanings presumably follow scikit-learn's
    ``TfidfVectorizer`` -- confirm against ``MarisaTfidfVectorizer``.

    :param txt: the corpus handed unchanged to ``fit``.
    :return: the fitted vectorizer instance.
    """
    settings = dict(
        min_df=2,
        max_features=1000000,
        stop_words=None,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 2),
    )
    tfidf = MarisaTfidfVectorizer(**settings)
    tfidf.fit(txt)
    return tfidf
def tfidf_vectorizer(txt):
    """Return a ``MarisaTfidfVectorizer`` fitted on ``txt``.

    Configuration: ``min_df=2``, ``max_features=1000000``, no stop words,
    smoothed IDF, L2 norm, sublinear TF, IDF weighting enabled and
    ``ngram_range=(1, 2)``.

    NOTE(review): parameter semantics are assumed to match scikit-learn's
    ``TfidfVectorizer`` -- verify against ``MarisaTfidfVectorizer``.

    :param txt: the corpus passed straight through to ``fit``.
    :return: the vectorizer after fitting.
    """
    fitted = MarisaTfidfVectorizer(
        min_df=2,
        max_features=1000000,
        stop_words=None,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 2),
    )
    fitted.fit(txt)
    return fitted
def vectorizer(df):
    """Fit a ``MarisaTfidfVectorizer`` on the text produced by ``iterText(df)``.

    Uses ``ngram_range=(1, 3)``, ``min_df=1``, no stop-word list, smoothed
    IDF, sublinear TF, L2 normalisation and a cap of 1,000,000 features.

    :param df: the object forwarded to ``iterText`` to obtain the corpus.
    :return: the fitted vectorizer instance.
    """
    # Why cap at 1M features: a one-vs-all model is expected to have at most
    # 184 classes, so its coef_ matrix is 1M * 184 * 8 bytes (about 1.5 GB),
    # which the original author judged to fit in memory comfortably.
    settings = dict(
        min_df=1,
        stop_words=None,
        max_features=1000000,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 3),
    )
    tfidf = MarisaTfidfVectorizer(**settings)
    documents = iterText(df)
    tfidf.fit(documents)
    return tfidf
def vectorizer(df):
    """Build a ``MarisaTfidfVectorizer`` and fit it on ``iterText(df)``.

    The vectorizer is set up with ``min_df=1``, no stop words, up to
    1,000,000 features, smoothed IDF, L2 norm, sublinear TF, IDF weighting
    and ``ngram_range=(1, 3)``.

    :param df: the input handed to ``iterText`` to yield the training text.
    :return: the vectorizer once ``fit`` has been called on it.
    """
    fitted = MarisaTfidfVectorizer(
        min_df=1,
        stop_words=None,
        # Memory budget behind the 1M cap: with at most 184 one-vs-all
        # classes, coef_ takes 1M * 184 * 8 bytes (roughly 1.5 GB), which
        # was considered easy to hold in memory.
        max_features=1000000,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 3),
    )
    fitted.fit(iterText(df))
    return fitted