import io

import numpy as np
import pytest
from scipy.sparse import lil_matrix

from fastcountvectorizer import FastCountVectorizer


def test_fastcountvectorizer_save_stop_words():
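    # With min_df=2, the char grams "b" and "c" (document frequency 1) fall
    # below the threshold; save_stop_words=True collects them in stop_words_.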
    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=True)
    cv.fit(["ab", "ac"])
    assert hasattr(cv, "stop_words_")
    assert cv.stop_words_ == {"b", "c"}

    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=False)
    cv.fit(["ab", "ac"])
    assert not hasattr(cv, "stop_words_")
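
# NOTE: `check_cv` is a shared helper defined elsewhere in the original test
# suite. The version below is a minimal sketch of its assumed behavior
# (fit-transform the input, then compare the resulting matrix and learned
# vocabulary); it is an assumption, not the suite's actual implementation.
def check_cv(cv, input, output, vocab):
    result = cv.fit_transform(input)
    # Terms ordered by their assigned column index must match `vocab`.
    assert sorted(cv.vocabulary_, key=cv.vocabulary_.get) == vocab
    # Sparse-matrix equality: no entries may differ from the expected output.
    assert (result != output).nnz == 0
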
def test_fastcountvectorizer_char_ngram1_strip_accents_ascii():
    cv = FastCountVectorizer(strip_accents="ascii",
                             analyzer="char",
                             ngram_range=(1, 1))
    check_cv(cv,
             input=["ábc"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])
def test_unicode_decode_error_input_file_bytes():
    text = "àbć"

    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])

    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])


def test_fastcountvectorizer_validate_params():
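    # Defaults validate cleanly; parameter errors surface via
    # _validate_params() or fit(), not in __init__.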
    FastCountVectorizer().fit(["foo"])

    FastCountVectorizer(input="content")
    FastCountVectorizer(input="file")._validate_params()
    FastCountVectorizer(input="filename")._validate_params()
    with pytest.raises(ValueError):
        FastCountVectorizer(input="unsupported")._validate_params()

    FastCountVectorizer(analyzer="char").fit(["foo"])
    FastCountVectorizer(analyzer="word").fit(["foo"])
    with pytest.raises(ValueError):
        FastCountVectorizer(analyzer="char_wb").fit(["foo"])
    with pytest.raises(ValueError):
        FastCountVectorizer(input="unsupported").fit(["foo"])
def test_fastcountvectorizer_char_ngram1_unicode():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 1))
    check_cv(cv,
             input=["ǟƂƇ"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["Ƃ", "Ƈ", "ǟ"])
    check_cv(cv,
             input=["ƇƂǟ"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["Ƃ", "Ƈ", "ǟ"])
def test_fastcountvectorizer_char_ngram1_3():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 3))
    check_cv(
        cv,
        input=["abcef"],
        vocab=[
            "a", "ab", "abc", "b", "bc", "bce", "c", "ce", "cef", "e", "ef",
            "f"
        ],
        output=lil_matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]).tocsr(),
    )


def test_unicode_decode_error_input_content():
    text = "àbć"
    doc = text.encode("utf-8")

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def test_fastcountvectorizer_char_ngram1():
    cv = FastCountVectorizer(analyzer="char", ngram_range=(1, 1))
    check_cv(cv,
             input=["abc"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])
    check_cv(cv,
             input=["cba"],
             output=lil_matrix([[1, 1, 1]]).tocsr(),
             vocab=["a", "b", "c"])
    check_cv(
        cv,
        input=["cba", "ade"],
        output=lil_matrix([[1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]).tocsr(),
        vocab=["a", "b", "c", "d", "e"],
    )


def test_unicode_decode_error_input_filename(tmp_path):
    p = tmp_path / "input_file.txt"
    with p.open("w", encoding="utf-8") as f:
        text = "àbć"
        f.write(text)
    doc = str(p)

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def test_fastcountvectorizer_word_ngram1():
    cv = FastCountVectorizer(analyzer="word", ngram_range=(1, 1))
    check_cv(
        cv,
        input=["aaa bbb ccc"],
        output=lil_matrix([[1, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc"],
    )
    check_cv(
        cv,
        input=["bbb aaa ccc"],
        output=lil_matrix([[1, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc"],
    )
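    # Runs of whitespace (double space, tab) separate word tokens without
    # producing empty entries.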
    check_cv(
        cv,
        input=["ccc bbb aaa", "aaa  ddd\teee"],
        output=lil_matrix([[1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]).tocsr(),
        vocab=["aaa", "bbb", "ccc", "ddd", "eee"],
    )


def test_fastcountvectorizer_word_ngram1_3():
    cv = FastCountVectorizer(analyzer="word", ngram_range=(1, 3))
    check_cv(
        cv,
        input=["aaa bbb ccc eee fff"],
        vocab=[
            "aaa",
            "aaa bbb",
            "aaa bbb ccc",
            "bbb",
            "bbb ccc",
            "bbb ccc eee",
            "ccc",
            "ccc eee",
            "ccc eee fff",
            "eee",
            "eee fff",
            "fff",
        ],
        output=lil_matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]).tocsr(),
    )


def test_fastcountvectorizer_word_return_dtype():
    input = ["abc"]
    cv = FastCountVectorizer()
    result = cv.fit_transform(input)
    assert result.dtype == np.int64

    cv = FastCountVectorizer(dtype=np.int64)
    result = cv.fit_transform(input)
    assert result.dtype == np.int64

    cv = FastCountVectorizer(dtype=np.int32)
    result = cv.fit_transform(input)
    assert result.dtype == np.int32

    cv = FastCountVectorizer(dtype=np.float64)
    result = cv.fit_transform(input)
    assert result.dtype == np.float64
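
# `docs` is assumed to be a corpus defined elsewhere in the original module;
# the list below is a hypothetical stand-in so this helper runs standalone.
docs = ["aaa bbb ccc", "bbb ccc ddd", "ccc ddd eee"]

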
def run_fastcountvectorizer_fit(ngram_range):
    cv = FastCountVectorizer(ngram_range=ngram_range)
    cv.fit(docs)