def tokenize(self, text: str):
    tokens = []
    if self.with_postag:
        response = self.kytea.getTagsToString(text)
        # FIXME: The following dirty workaround is required to
        #        process inputs which include a whitespace themselves
        #        (e.g. "私 は猫").
        response = response.replace("\\ ", "<SPACE>").replace("  ", " <SPACE>")
        for elem in response.split(" ")[:-1]:
            # FIXME: If the input contains the character "/",
            #        KyTea outputs "//補助記号/・",
            #        which breaks the simple logic elem.split("/").
            pron, postag, surface = map(lambda e: e[::-1], elem[::-1].split("/", maxsplit=2))
            surface = surface.replace("<SPACE>", " ")
            tokens.append(Token(surface=surface, postag=postag, pron=pron))
    else:
        for surface in list(self.kytea.getWS(text)):
            tokens.append(Token(surface=surface))
    return tokens
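# Illustrative sketch (not part of the library): why tokenize() splits the
# *reversed* element above. KyTea emits "surface/postag/pron"; a naive
# elem.split("/") breaks when the surface itself is "/", whereas reversing,
# splitting at most twice, and reversing each part back recovers the fields.
# The helper name below is hypothetical; the second sample element is the one
# mentioned in the FIXME comment.
def _split_kytea_element(elem: str):
    pron, postag, surface = map(lambda e: e[::-1], elem[::-1].split("/", maxsplit=2))
    return surface, postag, pron


assert _split_kytea_element("猫/名詞/ねこ") == ("猫", "名詞", "ねこ")
assert _split_kytea_element("//補助記号/・") == ("/", "補助記号", "・")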
def tokenize(self, text: str) -> List[Token]:
    return_result = []
    parse_result = self.janome.tokenize(text)
    if self.with_postag:
        for morph in parse_result:
            surface = morph.surface
            postag, postag2, postag3, postag4 = morph.part_of_speech.split(",")
            inflection = morph.infl_type
            conjugation = morph.infl_form
            base_form = morph.base_form
            yomi = morph.reading
            pron = morph.phonetic
            token = Token(
                surface=surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=base_form,
                yomi=yomi,
                pron=pron,
            )
            return_result.append(token)
    else:
        for morph in parse_result:
            return_result.append(Token(surface=morph.surface))
    return return_result
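# Usage sketch (assumption, not taken from the library): the `self.janome`
# attribute read above is presumably a janome.tokenizer.Tokenizer instance.
# The attribute names used in tokenize() (surface, part_of_speech, infl_type,
# infl_form, base_form, reading, phonetic) are Janome's morpheme attributes.
from janome.tokenizer import Tokenizer

janome_tokenizer = Tokenizer()
for morph in janome_tokenizer.tokenize("吾輩は猫である"):
    # part_of_speech is a comma-separated string with four fields, e.g. "名詞,代名詞,一般,*"
    print(morph.surface, morph.part_of_speech, morph.base_form)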
def tokenize(self, text: str):
    """Tokenize."""
    result = []
    for token in self.tokenizer.tokenize(text, self.mode):
        surface = token.surface()
        if self.with_postag:
            postag, postag2, postag3, postag4, \
                inflection, conjugation = token.part_of_speech()
            base_form = token.dictionary_form()
            normalized_form = token.normalized_form()
            yomi = token.reading_form()
            result.append(Token(
                surface=surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=base_form,
                normalized_form=normalized_form,
                yomi=yomi,
            ))
        else:
            result.append(Token(surface=surface))
    return result
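# Construction sketch (assumption, not taken from the library): `self.tokenizer`
# and `self.mode` above are presumably built with SudachiPy along these lines.
from sudachipy import dictionary
from sudachipy import tokenizer as sudachi_tokenizer

sudachi = dictionary.Dictionary().create()
mode = sudachi_tokenizer.Tokenizer.SplitMode.A  # mode "A" gives the shortest split units
for morpheme in sudachi.tokenize("医薬品安全管理責任者", mode):
    # part_of_speech() returns a six-element tuple, matching the unpacking above
    print(morpheme.surface(), morpheme.part_of_speech(), morpheme.dictionary_form())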
def test_postagging_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_postagging_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("KyTea is not installed.")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_postagging_with_janome():
    """Test Janome tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("Janome is not installed.")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize("すもももももももものうち")
    assert expect == result
def test_word_tokenize_with_janome():
    """Test Janome tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("skip janome")

    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize(SENTENCE3)
    assert expect == result
def test_word_tokenize_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
def test_word_tokenize_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("skip mecab")

    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result
def test_word_tokenize_with_kytea():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("Mykytea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_word_tokenize_with_janome():
    try:
        import janome
        del janome
    except ImportError:
        pytest.skip("janome is not installed")

    tokenizer = WordTokenizer(tokenizer="Janome")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed")

    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は である")
    assert expect == result
def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")

    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_word_tokenize_with_sudachi_mode_a():
    try:
        import sudachipy
        del sudachipy
    except ImportError:
        pytest.skip("sudachipy is not installed")

    tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
    expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
def test_word_tokenize_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi", mode="A", with_postag=True)
    except ImportError:
        pytest.skip("skip sudachi")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize(SENTENCE2)
    assert expect == result
def test_postagging_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi", mode="A", with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")

    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result
def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")

    tokenizer = WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_token_with_postag2():
    token = Token(
        surface="大崎",
        postag="名詞",
        postag2="固有名詞,人名,姓",
        inflection="*",
        conjugation="*",
        base_form="大崎",
        yomi="オオサキ",
        pron="オーサキ",
    )
    truth = "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ"
    assert token.feature == truth
def tokenize(self, text: str) -> List[Token]:
    """Tokenize."""
    return_result = []
    parse_result = self.mecab.parse(text).rstrip(" ")
    if self.with_postag:
        for elem in parse_result.split("\n")[:-1]:
            (
                surface,
                postag,
                postag2,
                postag3,
                postag4,
                inflection,
                conjugation,
                base_form,
                yomi,
                pron,
            ) = self.parse_feature(elem)

            token = Token(
                surface=surface,
                postag=postag,
                postag2=postag2,
                postag3=postag3,
                postag4=postag4,
                inflection=inflection,
                conjugation=conjugation,
                base_form=base_form,
                yomi=yomi,
                pron=pron,
            )
            return_result.append(token)
    else:
        for surface in parse_result.split(" "):
            return_result.append(Token(surface=surface))
    return return_result
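# Hypothetical sketch of what `self.parse_feature` might do, assuming MeCab's
# default IPADIC output format "surface\tpos,pos2,pos3,pos4,infl,conj,base,yomi,pron".
# The real helper may differ (e.g. in how it pads missing fields for unknown words).
def parse_feature(elem: str):
    surface, feature = elem.split("\t")
    fields = feature.split(",")
    fields += ["*"] * (9 - len(fields))  # unknown words may lack yomi/pron
    (postag, postag2, postag3, postag4,
     inflection, conjugation, base_form, yomi, pron) = fields[:9]
    return (surface, postag, postag2, postag3, postag4,
            inflection, conjugation, base_form, yomi, pron)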
def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea",
            model_path="s3://konoha-demo/kytea/model.knm",
        )
    except ImportError:
        pytest.skip("skip kytea")

    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_sentencepiece_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("skip s3 test because of missing boto3")

    try:
        tokenizer = WordTokenizer(
            tokenizer="SentencePiece",
            model_path="s3://konoha-demo/sentencepiece/model.spm",
        )
    except ImportError:
        pytest.skip("skip sentencepiece")

    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_word_tokenize_with_whitespace():
    tokenizer = WordTokenizer(tokenizer="Whitespace")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩 は 猫 で ある")
    assert expect == result
def tokenize(self, text: str):
    result = []
    for subword in self.tokenizer.EncodeAsPieces(text):
        token = Token(surface=subword)
        result.append(token)
    return result
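# Construction sketch (assumption, not taken from the library): `self.tokenizer`
# above is presumably a sentencepiece.SentencePieceProcessor loaded from a
# trained model; "data/model.spm" mirrors the path used in the tests.
import sentencepiece

sp = sentencepiece.SentencePieceProcessor()
sp.load("data/model.spm")
# With the test model, the tests expect pieces like ['▁', '吾', '輩', 'は', '猫', 'である'].
print(sp.EncodeAsPieces("吾輩は猫である"))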
def tokenize(self, text: str):
    return [Token(surface=surface) for surface in text.split(" ")]
def test_token_without_feature():
    token = Token(surface="大崎")
    assert "大崎" == token.surface
    assert "" == token.feature
def tokenize(self, text: str):
    return [Token(surface=char) for char in list(text)]
def test_word_tokenize_with_character():
    tokenizer = WordTokenizer(tokenizer="Character")
    expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result
def test_token_with_postag():
    token = Token(surface="大崎", postag="名詞")
    assert "大崎" == token.surface
    assert "名詞" == token.feature
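# Hypothetical sketch of how Token.feature could be assembled, inferred from the
# Token tests above: join the non-empty postag/inflection/conjugation/base_form/
# yomi/pron fields with commas, so a lone postag yields "名詞" and no fields yield "".
def build_feature(postag="", postag2="", postag3="", postag4="",
                  inflection="", conjugation="", base_form="", yomi="", pron=""):
    fields = [postag, postag2, postag3, postag4,
              inflection, conjugation, base_form, yomi, pron]
    return ",".join(field for field in fields if field)


assert build_feature() == ""
assert build_feature(postag="名詞") == "名詞"
assert build_feature(postag="名詞", postag2="固有名詞,人名,姓",
                     inflection="*", conjugation="*", base_form="大崎",
                     yomi="オオサキ", pron="オーサキ") == "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ"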