def test_unicode_segment_tokenize():
    tokenizer = UnicodeSegmentTokenizer(word_bounds=False)
    assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]

    tokenizer = UnicodeSegmentTokenizer(word_bounds=True)
    assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

    with pytest.raises(TypeError):
        UnicodeSegmentTokenizer(word_bounds=1)

    with pytest.raises(TypeError):
        UnicodeSegmentTokenizer().tokenize(2)
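
# Illustrative addition (a sketch, not part of the original suite): a
# property-based check in the same style as test_tokenize_edge_cases below,
# asserting that UnicodeSegmentTokenizer never yields empty tokens on
# arbitrary input. Assumes hypothesis and st are imported as elsewhere in
# this module; the test name is hypothetical.
@hypothesis.given(st.text())
def test_unicode_segment_tokenize_no_empty_tokens(txt):
    tokens = UnicodeSegmentTokenizer(word_bounds=True).tokenize(txt)
    assert all(len(tok) > 0 for tok in tokens)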
dataset_size = 91  # MB for the 20 newsgroups dataset

print("# Tokenizing {} documents".format(len(data)))


def pyre_tokenizer(txt):
    return list(re.compile(token_regexp).findall(txt))


db = [
    (r"Python re.findall(r'\b\w\w+\b', ...)", pyre_tokenizer),
    (
        r"RegexpTokenizer(r'\b\w\w+\b')",
        RegexpTokenizer(pattern=token_regexp).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=False)",
        UnicodeSegmentTokenizer(word_bounds=False).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=True)",
        UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
    ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
]

if sacremoses is not None:
    db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))

if spacy is not None:
    from spacy.lang.en import English

    db.append(("Spacy en", English().tokenizer))
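
# Illustrative sketch (an assumption, not the original benchmark loop): the
# (label, tokenizer) pairs in ``db`` could be timed over ``data`` and the
# throughput reported in MB/s using ``dataset_size`` defined above.
from time import time

for label, tokenize in db:
    t0 = time()
    n_tokens = sum(len(tokenize(doc)) for doc in data)
    dt = time() - t0
    print(
        "{:>50}: {:.2f}s [{:.1f} MB/s, {} tokens]".format(
            label, dt, dataset_size / dt, n_tokens
        )
    )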
("fr", "Sequoia"), ("de", "GSD"), # ("ru", "GSD"), ] def whitespace_split(x): return x.split(" ") tok_db = [ # ("whitespace", lambda lang: whitespace_split), ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall), ( "unicode-segmentation", lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize, ), ("vtext", lambda lang: VTextTokenizer(lang).tokenize), ] if sacremoses is not None: tok_db.append(("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize)) if spacy is not None: def spacy_tokenizer(lang): if lang == "en": from spacy.lang.en import English as Nlp elif lang == "de": from spacy.lang.de import German as Nlp elif lang == "fr":
"fox ", "ox c", "x ca", " can", "can'", "an't", ] @hypothesis.given(st.text()) @pytest.mark.parametrize( "tokenizer", [ RegexpTokenizer(), CharacterTokenizer(), UnicodeSegmentTokenizer(), VTextTokenizer("en"), VTextTokenizer("fr"), ], ids=_pytest_ids, ) def test_tokenize_edge_cases(tokenizer, txt): tokenizer.tokenize(txt) @pytest.mark.parametrize( "tokenizer, expected", [ (RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}), (CharacterTokenizer(), {"window_size": 4}), (UnicodeSegmentTokenizer(), {"word_bounds": True}),