def test_character_tokenizer():
    tokenizer = CharacterTokenizer()
    assert tokenizer.tokenize("fox can't") == [
        "fox ",
        "ox c",
        "x ca",
        " can",
        "can'",
        "an't",
    ]

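# Hedged addition (not in the original suite): test_pickle_non_default_params
# below shows the window length is configurable via window_size, and the test
# above shows a stride-1 sliding window; a window of 2 over "fox" should then
# yield the two overlapping character bigrams.
def test_character_tokenizer_window_size():
    tokenizer = CharacterTokenizer(window_size=2)
    assert tokenizer.tokenize("fox") == ["fo", "ox"]
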
@hypothesis.given(st.text())
@pytest.mark.parametrize(
    "tokenizer",
    [
        RegexpTokenizer(),
        CharacterTokenizer(),
        UnicodeSegmentTokenizer(),
        VTextTokenizer("en"),
        VTextTokenizer("fr"),
    ],
    ids=_pytest_ids,
)
def test_tokenize_edge_cases(tokenizer, txt):
    tokenizer.tokenize(txt)


@pytest.mark.parametrize(
    "tokenizer, expected",
    [
        (RegexpTokenizer(), {"pattern": r"\b\w\w+\b"}),
        (CharacterTokenizer(), {"window_size": 4}),
    ],
)
def test_tokenize_get_params(tokenizer, expected):
    # the source was truncated after the parametrize list; the test name and
    # body are a minimal reconstruction, assuming get_params() (exercised in
    # test_pickle_non_default_params below) returns the default parameters
    assert tokenizer.get_params() == expected

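# Hedged addition (not in the original suite): none of the tokenizers above
# takes a random state, so tokenization is presumably deterministic; this
# property test makes that explicit on arbitrary text.
@hypothesis.given(st.text())
@pytest.mark.parametrize(
    "tokenizer",
    [RegexpTokenizer(), CharacterTokenizer(), UnicodeSegmentTokenizer()],
    ids=_pytest_ids,
)
def test_tokenize_deterministic(tokenizer, txt):
    assert tokenizer.tokenize(txt) == tokenizer.tokenize(txt)
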
def test_pickle_non_default_params():
    # check that pickling correctly stores estimator parameters
    est = CharacterTokenizer(window_size=10)
    est2 = pickle.loads(pickle.dumps(est))
    assert est2.get_params()["window_size"] == 10

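# Hedged addition (not in the original suite): beyond preserving parameters,
# an unpickled tokenizer should behave identically to the original one.
def test_pickle_roundtrip_tokenize():
    est = CharacterTokenizer(window_size=10)
    est2 = pickle.loads(pickle.dumps(est))
    txt = "the quick brown fox"
    assert est.tokenize(txt) == est2.tokenize(txt)
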
# (fragment: pyre_tokenizer, token_regexp, and the optional sacremoses /
# spacy / blingfire imports are defined earlier in the benchmark script)
db = [
    (r"Python re.findall(r'\b\w\w+\b', ...)", pyre_tokenizer),
    (
        r"RegexpTokenizer(r'\b\w\w+\b')",
        RegexpTokenizer(pattern=token_regexp).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=False)",
        UnicodeSegmentTokenizer(word_bounds=False).tokenize,
    ),
    (
        "UnicodeSegmentTokenizer(word_bounds=True)",
        UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
    ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
]

if sacremoses is not None:
    db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))

if spacy is not None:
    from spacy.lang.en import English

    db.append(("Spacy en", English().tokenizer))

if blingfire is not None:
    db.append(("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

for label, func in db:
    t0 = time()
    # the loop body was truncated in the source; a plausible completion,
    # assuming `data` holds the list of input documents loaded earlier
    n_tokens = sum(len(func(doc)) for doc in data)
    dt = time() - t0
    print(f"{label:45} {dt:.2f}s [{n_tokens / dt:.0f} tokens/s]")