Python TfidfVectorizer.max_df示例

编程语言: Python

命名空间/包名称: sklearn.feature_extraction.text

类/类型: TfidfVectorizer

方法/功能: max_df

hotexamples.com的示例: 5

Python TfidfVectorizer.max_df - 共找到5个示例。这些是从开源项目中提取的、最受好评的 sklearn.feature_extraction.text.TfidfVectorizer.max_df 真实 Python 代码示例。您可以对示例进行评分，以帮助我们提高示例质量。

常用方法

显示隐藏

fit(30)

get_stop_words(30)

TfidfVectorizer(30)

fit_transform(30)

get_feature_names(30)

inverse_transform(30)

build_analyzer(30)

build_tokenizer(29)

get_params(29)

get_feature_names_out(14)

__init__(12)

idf_(11)

build_preprocessor(8)

max_features(8)

_validate_vocabulary(3)

max_df(3)

fir(2)

N_(2)

fit_on_texts(2)

build_vocab(2)

decode(2)

_tfidf(2)

decode_error(1)

append(1)

_document_frequency(1)

_get_param_names(1)

kneighbors(1)

join(1)

_stop_words_id(1)

inv_vocabulary_(1)

input(1)

infer_vector(1)

idx_target_cache(1)

get_word_net_feature_vecs(1)

bert(1)

get_shape(1)

encode(1)

get_feautre_names(1)

cate_set(1)

get_feature_name(1)

fit_transfrorm(1)

fit_transfrom(1)

count(1)

fit_trainsform(1)

count_args(1)

count_chunks(1)

encoding(1)

mean(1)

示例#1

显示文件

文件： create_lda_data.py 项目： cuongnv-ict/news

def build_vocab(dataset, output_vocab, root_dir, title_map):
    """Fit a TF-IDF vocabulary on the files found under ``dataset`` and save it.

    Entries of ``dataset`` are processed from an explicit stack. Directories
    are handed to ``utils.push_data_to_stack`` (presumably to enqueue their
    children -- helper not shown here); every other entry is read as UTF-8
    text, lower-cased and collected together with its title.

    Args:
        dataset: directory whose entries are the documents to read.
        output_vocab: path of the file the vocabulary terms are written to,
            one term per line.
        root_dir: directory containing ``stopwords.txt``.
        title_map: mapping from a file's base name to its title; a missing
            base name raises ``KeyError``.

    Returns:
        A ``(contents, titles)`` tuple of lists filled in lock-step.
    """
    stopword_file = os.path.join(root_dir, 'stopwords.txt')
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 1),
        max_df=0.6,
        min_df=1,
        stop_words=utils.load_data_from_list(stopword_file),
    )

    pending = os.listdir(dataset)
    contents, titles = [], []
    while pending:
        entry = pending.pop()
        entry_path = os.path.join(dataset, entry)
        if os.path.isdir(entry_path):
            # A directory: let the helper push it onto the stack.
            utils.push_data_to_stack(pending, entry_path, entry)
            continue
        with open(entry_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
            titles.append(title_map[os.path.basename(entry)])
            contents.append(text.lower())

    # For small corpora (< 50 documents) lift the document-frequency cap;
    # per the original author's note this keeps documents from ending up
    # with zero length.
    if len(contents) < 50:
        vectorizer.max_df = 1.0
    vectorizer.fit(contents)

    with open(output_vocab, 'w', encoding='utf-8') as handle:
        # One vocabulary term per line, in the vocabulary's own key order.
        handle.write(u'\n'.join(vectorizer.vocabulary_.keys()))
    return contents, titles

示例#2

显示文件

文件： test_text.py 项目： BloodD/scikit-learn

def test_vectorizer():
    """Check CountVectorizer, TfidfTransformer and TfidfVectorizer end to end.

    Covers: fitting with ``max_df`` thresholding, reusing a fitted vocabulary,
    tf-idf and plain tf transforms, errors on unfitted / incompatible input,
    equivalence of TfidfVectorizer with CountVectorizer + TfidfTransformer,
    and errors raised for invalid ``strip_accents`` / ``analyzer`` settings.
    """
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    # (counts_test here is the value left over from the last loop pass, v2)
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    # a fresh iterator is needed: the first one was consumed by v1
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    assert_false(tv.fixed_vocabulary)

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    # Call set_params (rather than overwriting the method with a string) and
    # restore a valid strip_accents, so that the expected ValueError is
    # attributable to the analyzer value and not to the bad setting above.
    v3.set_params(strip_accents=None, analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)

示例#3

显示文件

文件： test_text.py 项目： lucidfrontier45/scikit-learn

def test_vectorizer():
    """Check count / tf-idf vectorization of the food documents end to end.

    All documents but the last are used for fitting; the last one is the
    held-out test sample. The test verifies term counts, vocabulary reuse,
    document-frequency based stop-word removal, tf-idf and tf transforms, and
    that TfidfVectorizer matches CountVectorizer followed by
    TfidfTransformer.
    """
    # Training documents are deliberately supplied as a one-shot iterator.
    docs_train = iter(ALL_FOOD_DOCS[:-1])
    docs_test = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # --- vectorizer that learns its own vocabulary ---
    learned = CountVectorizer(max_df=0.5, min_df=1)
    train_counts = learned.fit_transform(docs_train)
    if hasattr(train_counts, 'tocsr'):
        train_counts = train_counts.tocsr()
    assert_equal(train_counts[0, learned.vocabulary_[u"pizza"]], 2)

    # --- second vectorizer reusing the vocabulary learned above ---
    preset = CountVectorizer(vocabulary=learned.vocabulary_)

    # Both vectorizers must agree on the held-out sample.
    for vectorizer in (learned, preset):
        test_counts = vectorizer.transform(docs_test)
        if hasattr(test_counts, 'tocsr'):
            test_counts = test_counts.tocsr()

        vocab = vectorizer.vocabulary_
        for word in (u"salad", u"tomato", u"water"):
            assert_equal(test_counts[0, vocab[word]], 1)

        # "the" is a stop word from the fixed list.
        assert_false(u"the" in vocab)

        # "copyright" is dropped automatically by document-frequency
        # thresholding: terms frequent across the whole corpus are likely
        # uninformative (real stop words or extraction artifacts).
        assert_false(u"copyright" in vocab)

        # Known terms that do not occur in the test sample.
        for word in (u"coke", u"burger", u"beer", u"pizza"):
            assert_equal(test_counts[0, vocab[word]], 0)

    n_terms = len(learned.vocabulary_)

    # --- tf-idf on the training counts ---
    idf_model = TfidfTransformer(norm='l1')
    fitted_idf = idf_model.fit(train_counts)
    tfidf = fitted_idf.transform(train_counts).toarray()
    assert_equal(len(idf_model.idf_), n_terms)
    assert_equal(tfidf.shape, (n_train, n_terms))

    # --- tf-idf on unseen data (counts left over from the last loop pass) ---
    tfidf_test = idf_model.transform(test_counts).toarray()
    assert_equal(tfidf_test.shape, (len(docs_test), n_terms))

    # --- term frequencies only ---
    tf_model = TfidfTransformer(norm='l1', use_idf=False)
    fitted_tf = tf_model.fit(train_counts)
    tf = fitted_tf.transform(train_counts).toarray()
    assert_equal(tf_model.idf_, None)

    # Each L1-normalized term-frequency row sums to one.
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # --- direct tf-idf vectorizer ---
    # (equivalent to term count vectorizer + tfidf transformer)
    # The first iterator is exhausted, so build a fresh one.
    docs_train = iter(ALL_FOOD_DOCS[:-1])
    direct = TfidfVectorizer(norm='l1', min_df=1)
    assert_false(direct.fixed_vocabulary)

    direct.max_df = learned.max_df
    tfidf_direct = direct.fit_transform(docs_train).toarray()
    assert_array_almost_equal(tfidf, tfidf_direct)

    # Direct vectorizer on unseen data.
    tfidf_test_direct = direct.transform(docs_test).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test_direct)

    # An unfitted vectorizer with no vocabulary must refuse to transform.
    unfitted = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, unfitted.transform, docs_train)

示例#4

显示文件

def test_vectorizer():
    """Check CountVectorizer, TfidfTransformer and TfidfVectorizer end to end.

    Covers: fitting with ``max_df`` thresholding, reusing a fitted vocabulary,
    tf-idf and plain tf transforms, errors on unfitted / incompatible input,
    equivalence of TfidfVectorizer with CountVectorizer + TfidfTransformer,
    the ascii accent-stripping preprocessor, and errors raised for invalid
    ``strip_accents`` / ``analyzer`` settings.
    """
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    # (counts_test here is the value left over from the last loop pass, v2)
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    # a fresh iterator is needed: the first one was consumed by v1
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou  ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    # Call set_params (rather than overwriting the method with a string) and
    # restore a valid strip_accents, so that the expected ValueError is
    # attributable to the analyzer value and not to the bad setting above.
    v3.set_params(strip_accents=None, analyzer='_invalid_analyzer_type_')
    with pytest.raises(ValueError):
        v3.build_analyzer()

示例#5

显示文件

文件： test_text.py 项目： saeed-abdullah/scikit-learn

def test_vectorizer():
    """Check count / tf-idf vectorization of the food documents end to end.

    All documents but the last are used for fitting; the last one is the
    held-out test sample. The test verifies term counts, vocabulary reuse,
    document-frequency based stop-word removal, tf-idf and tf transforms, and
    that TfidfVectorizer matches CountVectorizer followed by
    TfidfTransformer.
    """
    # Training documents are deliberately supplied as a one-shot iterator.
    docs_train = iter(ALL_FOOD_DOCS[:-1])
    docs_test = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # --- vectorizer that learns its own vocabulary ---
    learned = CountVectorizer(max_df=0.5)
    train_counts = learned.fit_transform(docs_train)
    if hasattr(train_counts, 'tocsr'):
        train_counts = train_counts.tocsr()
    assert_equal(train_counts[0, learned.vocabulary_[u"pizza"]], 2)

    # --- second vectorizer reusing the vocabulary learned above ---
    preset = CountVectorizer(vocabulary=learned.vocabulary_)

    # Both vectorizers must agree on the held-out sample.
    for vectorizer in (learned, preset):
        test_counts = vectorizer.transform(docs_test)
        if hasattr(test_counts, 'tocsr'):
            test_counts = test_counts.tocsr()

        vocab = vectorizer.vocabulary_
        for word in (u"salad", u"tomato", u"water"):
            assert_equal(test_counts[0, vocab[word]], 1)

        # "the" is a stop word from the fixed list.
        assert_false(u"the" in vocab)

        # "copyright" is dropped automatically by document-frequency
        # thresholding: terms frequent across the whole corpus are likely
        # uninformative (real stop words or extraction artifacts).
        assert_false(u"copyright" in vocab)

        # Known terms that do not occur in the test sample.
        for word in (u"coke", u"burger", u"beer", u"pizza"):
            assert_equal(test_counts[0, vocab[word]], 0)

    n_terms = len(learned.vocabulary_)

    # --- tf-idf on the training counts ---
    idf_model = TfidfTransformer(norm='l1')
    fitted_idf = idf_model.fit(train_counts)
    tfidf = fitted_idf.transform(train_counts).toarray()
    assert_equal(len(idf_model.idf_), n_terms)
    assert_equal(tfidf.shape, (n_train, n_terms))

    # --- tf-idf on unseen data (counts left over from the last loop pass) ---
    tfidf_test = idf_model.transform(test_counts).toarray()
    assert_equal(tfidf_test.shape, (len(docs_test), n_terms))

    # --- term frequencies only ---
    tf_model = TfidfTransformer(norm='l1', use_idf=False)
    fitted_tf = tf_model.fit(train_counts)
    tf = fitted_tf.transform(train_counts).toarray()
    assert_equal(tf_model.idf_, None)

    # Each L1-normalized term-frequency row sums to one.
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # --- direct tf-idf vectorizer ---
    # (equivalent to term count vectorizer + tfidf transformer)
    # The first iterator is exhausted, so build a fresh one.
    docs_train = iter(ALL_FOOD_DOCS[:-1])
    direct = TfidfVectorizer(norm='l1')
    assert_false(direct.fixed_vocabulary)

    direct.max_df = learned.max_df
    tfidf_direct = direct.fit_transform(docs_train).toarray()
    assert_array_almost_equal(tfidf, tfidf_direct)

    # Direct vectorizer on unseen data.
    tfidf_test_direct = direct.transform(docs_test).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test_direct)

    # An unfitted vectorizer with no vocabulary must refuse to transform.
    unfitted = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, unfitted.transform, docs_train)