def sent_tokenize(texts):
    """Sentence-split and tokenize each text with a blank Vietnamese pipeline.

    Parameters:
        texts: iterable of raw strings.

    Returns:
        A list with one entry per input text; each entry is a list of
        ``np.array``s of post-processed token strings, one array per
        sentence. Texts of length <= 3 yield ``[[]]``.
    """
    nlp = Vietnamese()
    # spaCy v3 API: built-in components are added by name. The v2-style
    # nlp.add_pipe(nlp.create_pipe("sentencizer")) fails under v3, and the
    # rest of this file already relies on the v3-only Vietnamese.from_config.
    nlp.add_pipe("sentencizer")
    docs = []
    for text in texts:
        text_tokenized = []
        if len(text) > 3:
            # NOTE(review): [1:-1] drops the first and last character of the
            # lowercased text — presumably stripping surrounding quote marks;
            # confirm against the callers' input format.
            for sentence in nlp(text.lower()[1:-1]).sents:
                sent_tokens = np.array(
                    [postprocess_token(token.text) for token in sentence]
                )
                text_tokenized.append(sent_tokens)
        else:
            # Too short to segment: keep a single empty sentence placeholder.
            text_tokenized.append([])
        docs.append(text_tokenized)
    return docs
def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    config = {"nlp": {"tokenizer": {"use_pyvi": False}}}
    nlp = Vietnamese.from_config(config)
    text = "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    # Non-space tokens must line up exactly with a plain whitespace split.
    non_space = [token.text for token in doc if not token.is_space]
    assert non_space == text.split()
    # The tokenizer still keeps explicit space tokens in the Doc.
    assert doc[4].text == " "
def tokenize(texts):
    """Tokenize each text with a blank Vietnamese pipeline.

    Lowercases the text, drops the first and last token of each Doc,
    and returns one ``np.array`` of post-processed token strings per text.
    """
    nlp = Vietnamese()
    results = []
    for raw in texts:
        doc = nlp(raw.lower())
        # Skip the leading and trailing token of the document.
        token_strings = [postprocess_token(tok.text) for tok in doc[1:-1]]
        results.append(np.array(token_strings))
    return results
def test_vi_tokenizer_serialize(vi_tokenizer):
    """Round-trip the Vietnamese tokenizer through bytes and disk."""
    tokenizer_bytes = vi_tokenizer.to_bytes()

    # bytes round-trip preserves the serialized form and the pyvi flag
    loaded = Vietnamese()
    loaded.tokenizer.from_bytes(tokenizer_bytes)
    assert loaded.tokenizer.to_bytes() == tokenizer_bytes
    assert loaded.tokenizer.use_pyvi is True

    # disk round-trip behaves the same
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        loaded = Vietnamese()
        loaded.tokenizer.from_disk(file_path)
        assert loaded.tokenizer.to_bytes() == tokenizer_bytes
        assert loaded.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp_no_pyvi = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp_no_pyvi.to_bytes()
    restored = Vietnamese()
    restored.from_bytes(nlp_bytes)
    assert restored.to_bytes() == nlp_bytes
    assert restored.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp_no_pyvi.to_disk(d)
        restored = Vietnamese()
        restored.from_disk(d)
        assert restored.to_bytes() == nlp_bytes
        assert restored.tokenizer.use_pyvi is False