Example #1
import numpy as np
from spacy.lang.vi import Vietnamese

def sent_tokenize(texts):
    # Build a bare Vietnamese pipeline with only sentence-boundary detection.
    # Note: nlp.create_pipe() is the spaCy v2 API; in v3 this would be
    # nlp.add_pipe("sentencizer").
    nlp = Vietnamese()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    docs = []
    for text in texts:
        text_tokenized = []
        if len(text) > 3:
            # text[1:-1] drops the first and last characters, presumably
            # the quote characters wrapping each input text
            for sentence in nlp(text.lower()[1:-1]).sents:
                # postprocess_token is assumed to be defined elsewhere in the project
                sent_tokens = np.array([postprocess_token(token.text) for token in sentence])
                text_tokenized.append(sent_tokens)
        else:
            text_tokenized.append([])
        docs.append(text_tokenized)

    return docs
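
Both this helper and the tokenize helper in Example #3 call a postprocess_token function that is not shown. Below is a minimal sketch of what such a helper might look like, plus a usage call; the cleanup rules here are assumptions for illustration, not the original implementation.

import string

def postprocess_token(token):
    # Hypothetical stand-in for the missing helper: strip whitespace and
    # collapse punctuation-only tokens into a placeholder symbol.
    token = token.strip()
    if token and all(ch in string.punctuation for ch in token):
        return "<punct>"
    return token

# Usage sketch: inputs are assumed to be quote-wrapped strings, which is
# why sent_tokenize slices off the first and last characters.
texts = ['"Xin chào. Tôi thích học tiếng Việt."']
docs = sent_tokenize(texts)  # one list of per-sentence token arrays per text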
Example #2
from spacy.lang.vi import Vietnamese

def test_vi_tokenizer_no_pyvi():
    """Test whitespace tokenization when pyvi is disabled."""
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    # Non-space tokens match a plain whitespace split; the double space
    # between "văn" and "bản" produces a standalone whitespace token at index 4.
    assert [t.text for t in doc if not t.is_space] == text.split()
    assert doc[4].text == " "
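
For contrast, here is a minimal sketch of the default pyvi-backed mode. This setup is an assumption for illustration and requires the pyvi package to be installed.

from spacy.lang.vi import Vietnamese

nlp = Vietnamese()  # use_pyvi defaults to True; requires the pyvi package
doc = nlp("Đây là một văn bản bằng tiếng Việt")
# With pyvi enabled, word segmentation can merge multi-syllable words
# such as "văn bản" into single tokens, unlike the whitespace split above.
print([t.text for t in doc])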
Example #3
import numpy as np
from spacy.lang.vi import Vietnamese

def tokenize(texts):
    nlp = Vietnamese()
    docs = []
    for text in texts:
        # Tokenize the lowercased text and slice off the first and last
        # tokens (presumably the surrounding quote characters);
        # postprocess_token is assumed to be defined elsewhere in the project.
        tokens = np.array([postprocess_token(token.text) for token in nlp(text.lower())[1:-1]])
        docs.append(tokens)

    return docs
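
Given the postprocess_token sketch after Example #1, a usage sketch for this flat variant (quote-wrapped inputs assumed, as in Example #1):

texts = ['"Xin chào thế giới"', '"Tôi thích học tiếng Việt"']
token_docs = tokenize(texts)
# Unlike sent_tokenize, each entry is a single flat array of tokens
# rather than one array per sentence.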
Example #4
from spacy.lang.vi import Vietnamese
from spacy.util import make_tempdir

def test_vi_tokenizer_serialize(vi_tokenizer):
    # vi_tokenizer is a pytest fixture that provides a pyvi-backed tokenizer
    # (see the fixture sketch after this example)
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi is False
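
This test assumes a vi_tokenizer pytest fixture. One plausible definition, modelled on spaCy's own test conftest (the exact scope and location are assumptions):

import pytest
from spacy.util import get_lang_class

@pytest.fixture
def vi_tokenizer():
    pytest.importorskip("pyvi")  # the default tokenizer mode needs pyvi
    return get_lang_class("vi")().tokenizer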