def sent_tokenize(texts):
    """Sentence-split and tokenize each text with a blank Vietnamese pipeline.

    Parameters:
        texts: iterable of raw strings.

    Returns:
        A list with one entry per input text; each entry is a list of
        ``np.array``s of post-processed token strings, one array per
        sentence. Texts of length <= 3 yield ``[[]]``.
    """
    nlp = Vietnamese()
    # spaCy v3 API: built-in components are added by name. The v2-style
    # nlp.add_pipe(nlp.create_pipe("sentencizer")) fails under v3, and the
    # rest of this file already relies on the v3-only Vietnamese.from_config.
    nlp.add_pipe("sentencizer")
    docs = []
    for text in texts:
        text_tokenized = []
        if len(text) > 3:
            # NOTE(review): [1:-1] drops the first and last character of the
            # lowercased text — presumably stripping surrounding quote marks;
            # confirm against the callers' input format.
            for sentence in nlp(text.lower()[1:-1]).sents:
                sent_tokens = np.array(
                    [postprocess_token(token.text) for token in sentence]
                )
                text_tokenized.append(sent_tokens)
        else:
            # Too short to segment: keep a single empty sentence placeholder.
            text_tokenized.append([])
        docs.append(text_tokenized)
    return docs
def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    config = {"nlp": {"tokenizer": {"use_pyvi": False}}}
    nlp = Vietnamese.from_config(config)
    text = "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    # Non-space tokens must line up exactly with a plain whitespace split.
    non_space = [token.text for token in doc if not token.is_space]
    assert non_space == text.split()
    # The tokenizer still keeps explicit space tokens in the Doc.
    assert doc[4].text == " "
def tokenize(texts):
    """Tokenize each text with a blank Vietnamese pipeline.

    Lowercases the text, drops the first and last token of each Doc,
    and returns one ``np.array`` of post-processed token strings per text.
    """
    nlp = Vietnamese()
    results = []
    for raw in texts:
        doc = nlp(raw.lower())
        # Skip the leading and trailing token of the document.
        token_strings = [postprocess_token(tok.text) for tok in doc[1:-1]]
        results.append(np.array(token_strings))
    return results
def test_vi_tokenizer_serialize(vi_tokenizer):
    """Round-trip the Vietnamese tokenizer through bytes and disk."""
    tokenizer_bytes = vi_tokenizer.to_bytes()

    # bytes round-trip preserves the serialized form and the pyvi flag
    loaded = Vietnamese()
    loaded.tokenizer.from_bytes(tokenizer_bytes)
    assert loaded.tokenizer.to_bytes() == tokenizer_bytes
    assert loaded.tokenizer.use_pyvi is True

    # disk round-trip behaves the same
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        loaded = Vietnamese()
        loaded.tokenizer.from_disk(file_path)
        assert loaded.tokenizer.to_bytes() == tokenizer_bytes
        assert loaded.tokenizer.use_pyvi is True

    # mode is (de)serialized correctly
    nlp_no_pyvi = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp_no_pyvi.to_bytes()
    restored = Vietnamese()
    restored.from_bytes(nlp_bytes)
    assert restored.to_bytes() == nlp_bytes
    assert restored.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp_no_pyvi.to_disk(d)
        restored = Vietnamese()
        restored.from_disk(d)
        assert restored.to_bytes() == nlp_bytes
        assert restored.tokenizer.use_pyvi is False