import io

import pytest

# Assumed import path for the class under test, matching the package name.
from fastcountvectorizer import FastCountVectorizer


def test_fastcountvectorizer_save_stop_words():
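    """save_stop_words=True exposes terms pruned by min_df via stop_words_.

    With min_df=2 only "a" occurs in both documents, so "b" and "c" are
    pruned. With save_stop_words=False the attribute must not be set.
    """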
    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=True)
    cv.fit(["ab", "ac"])
    assert hasattr(cv, "stop_words_")
    assert cv.stop_words_ == {"b", "c"}

    cv = FastCountVectorizer(analyzer="char", min_df=2, save_stop_words=False)
    cv.fit(["ab", "ac"])
    assert not hasattr(cv, "stop_words_")


def test_unicode_decode_error_input_file_bytes():
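    """Non-ASCII bytes read from file objects must fail to decode as ASCII.

    The UTF-8 encoded text contains non-ASCII characters, so fitting with
    encoding="ascii" raises UnicodeDecodeError for both analyzers.
    """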
    text = "àbć"

    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])

    cv = FastCountVectorizer(encoding="ascii", input="file", analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([io.BytesIO(text.encode("utf-8"))])


def test_unicode_decode_error_input_content():
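    """Non-ASCII byte content must fail to decode as ASCII.

    With input="content", documents are passed as raw bytes; decoding the
    UTF-8 payload with encoding="ascii" raises UnicodeDecodeError for both
    the word and char analyzers.
    """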
    text = "àbć"
    doc = text.encode("utf-8")

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="content",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def test_unicode_decode_error_input_filename(tmp_path):
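    """Non-ASCII files on disk must fail to decode as ASCII.

    The file is written as UTF-8; reading it back with encoding="ascii"
    raises UnicodeDecodeError for both the word and char analyzers.
    """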
    p = tmp_path / "input_file.txt"
    with p.open("w", encoding="utf-8") as f:
        text = "àbć"
        f.write(text)
    doc = str(p)

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="word")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])

    cv = FastCountVectorizer(encoding="ascii",
                             input="filename",
                             analyzer="char")
    with pytest.raises(UnicodeDecodeError):
        cv.fit([doc])


def run_fastcountvectorizer_fit(ngram_range):
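    # `docs` is assumed to be a corpus defined at module level in the
    # original source; it is not part of this excerpt.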
    cv = FastCountVectorizer(ngram_range=ngram_range)
    cv.fit(docs)