Пример #1
0
def vectorizer(txt):
    """Build a MarisaTfidfVectorizer, fit it on ``txt`` and return it.

    Args:
        txt: The corpus handed straight to ``fit``; presumably an iterable
            of text documents -- confirm against the caller.

    Returns:
        The MarisaTfidfVectorizer instance after ``fit`` has been called.
    """
    # NOTE(review): the option names match scikit-learn's TfidfVectorizer;
    # this assumes MarisaTfidfVectorizer keeps the same semantics -- confirm.
    options = {
        'min_df': 2,
        'max_features': 1000000,
        'stop_words': None,
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': True,
        'use_idf': True,
        'ngram_range': (1, 2),  # (1, 2): presumably unigrams and bigrams
    }
    tfidf = MarisaTfidfVectorizer(**options)
    tfidf.fit(txt)
    return tfidf
Пример #2
0
def tfidf_vectorizer(txt):
    """Create a MarisaTfidfVectorizer and fit it on the given corpus.

    Args:
        txt: Passed unchanged to ``fit``; presumably an iterable of text
            documents -- verify against the caller.

    Returns:
        The vectorizer object on which ``fit(txt)`` was called.
    """
    # Keep the configuration in one place, separate from construction.
    # NOTE(review): keyword names look like scikit-learn's TfidfVectorizer
    # options; their exact meaning here depends on MarisaTfidfVectorizer.
    config = dict(
        min_df=2,
        max_features=1000000,
        stop_words=None,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 2),
    )
    fitted = MarisaTfidfVectorizer(**config)
    fitted.fit(txt)
    return fitted
Пример #3
0
def vectorizer(df):
    """Fit a MarisaTfidfVectorizer on the texts produced by ``iterText(df)``.

    Args:
        df: Forwarded to ``iterText``; presumably a data frame holding the
            text documents -- confirm against ``iterText``.

    Returns:
        The MarisaTfidfVectorizer instance after ``fit`` has been called.
    """
    # Memory budget behind max_features: with one-vs-all over at most 184
    # classes, the classifier's coef_ takes 1M * 184 * 8 B, roughly 1.5 GB,
    # which is expected to fit in memory comfortably.
    options = {
        'min_df': 1,
        'stop_words': None,
        'max_features': 1000000,
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': True,
        'use_idf': True,
        'ngram_range': (1, 3),
    }
    tfidf = MarisaTfidfVectorizer(**options)
    corpus = iterText(df)
    tfidf.fit(corpus)
    return tfidf
Пример #4
0
def vectorizer(df):
    """Return a MarisaTfidfVectorizer fitted on ``iterText(df)``.

    Args:
        df: Input handed to ``iterText``, whose result is the corpus used
            for fitting; presumably a data frame of documents -- verify
            against ``iterText``.

    Returns:
        The vectorizer object on which ``fit`` was called.
    """
    # Why 1M features: a one-vs-all model with at most 184 classes needs a
    # coef_ of 1M * 184 * 8 B (about 1.5 GB), which should fit in memory.
    feature_cap = 1000000
    config = dict(
        min_df=1,
        stop_words=None,
        max_features=feature_cap,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1, 3),
    )
    fitted = MarisaTfidfVectorizer(**config)
    fitted.fit(iterText(df))
    return fitted