def test_unicode_segment_tokenize():
    tokenizer = UnicodeSegmentTokenizer(word_bounds=False)
    assert tokenizer.tokenize("Today, tomorrow") == ["Today", "tomorrow"]

    tokenizer = UnicodeSegmentTokenizer(word_bounds=True)
    assert tokenizer.tokenize("Today, tomorrow") == ["Today", ",", "tomorrow"]

    with pytest.raises(TypeError):
        UnicodeSegmentTokenizer(word_bounds=1)

    with pytest.raises(TypeError):
        UnicodeSegmentTokenizer().tokenize(2)
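
# Illustrative addition (a sketch, not part of the original suite): a
# property-based check in the same style as test_tokenize_edge_cases below,
# asserting that UnicodeSegmentTokenizer never yields empty tokens on
# arbitrary input. Assumes hypothesis and st are imported as elsewhere in
# this module; the test name is hypothetical.
@hypothesis.given(st.text())
def test_unicode_segment_tokenize_no_empty_tokens(txt):
    tokens = UnicodeSegmentTokenizer(word_bounds=True).tokenize(txt)
    assert all(len(tok) > 0 for tok in tokens)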
dataset_size = 91  # MB for the 20 newsgroups dataset

print("# Tokenizing {} documents".format(len(data)))


def pyre_tokenizer(txt):
    return list(re.compile(token_regexp).findall(txt))


db = [
    (r"Python re.findall(r'\b\w\w+\b', ...)", pyre_tokenizer),
    (
        r"RegexpTokenizer(r'\b\w\w+\b')",
        RegexpTokenizer(pattern=token_regexp).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=False)",
        UnicodeSegmentTokenizer(word_bounds=False).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=True)",
        UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
    ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
]

if sacremoses is not None:
    db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))

if spacy is not None:
    from spacy.lang.en import English

    db.append(("Spacy en", English().tokenizer))
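
# Illustrative sketch (an assumption, not the original benchmark loop): the
# (label, tokenizer) pairs in ``db`` could be timed over ``data`` and the
# throughput reported in MB/s using ``dataset_size`` defined above.
from time import time

for label, tokenize in db:
    t0 = time()
    n_tokens = sum(len(tokenize(doc)) for doc in data)
    dt = time() - t0
    print(
        "{:>50}: {:.2f}s [{:.1f} MB/s, {} tokens]".format(
            label, dt, dataset_size / dt, n_tokens
        )
    )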
("fr", "Sequoia"), ("de", "GSD"), # ("ru", "GSD"), ] def whitespace_split(x): return x.split(" ") tok_db = [ # ("whitespace", lambda lang: whitespace_split), ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall), ( "unicode-segmentation", lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize, ), ("vtext", lambda lang: VTextTokenizer(lang).tokenize), ] if sacremoses is not None: tok_db.append(("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize)) if spacy is not None: def spacy_tokenizer(lang): if lang == "en": from spacy.lang.en import English as Nlp elif lang == "de": from spacy.lang.de import German as Nlp elif lang == "fr":
"fox ", "ox c", "x ca", " can", "can'", "an't", ] @hypothesis.given(st.text()) @pytest.mark.parametrize( "tokenizer", [ RegexpTokenizer(), CharacterTokenizer(), UnicodeSegmentTokenizer(), VTextTokenizer("en"), VTextTokenizer("fr"), ], ids=_pytest_ids, ) def test_tokenize_edge_cases(tokenizer, txt): tokenizer.tokenize(txt) @pytest.mark.parametrize( "tokenizer, expected", [ (RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}), (CharacterTokenizer(), {"window_size": 4}), (UnicodeSegmentTokenizer(), {"word_bounds": True}),