Example #1
    def tokenize(self, text: str):
        """Tokenize the input text with Sudachi."""
        result = []
        for token in self.tokenizer.tokenize(text, self.mode):
            _token = Token(token.surface())
            if self.with_postag:
                # part_of_speech() returns the full POS field list; keep the coarse tag.
                _token.postag = token.part_of_speech()[0]
            result.append(_token)
        return result
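For context, a minimal sketch of the constructor this method presumably sits next to, assuming SudachiPy's dictionary API; the class name and keyword arguments are illustrative, not taken from the original source.

from sudachipy import dictionary, tokenizer as sudachi_tokenizer


class SudachiTokenizer:
    """Illustrative wrapper whose attributes match the tokenize method above."""

    def __init__(self, mode: str = "A", with_postag: bool = False):
        # SplitMode A/B/C controls how aggressively Sudachi splits compounds.
        self.mode = getattr(sudachi_tokenizer.Tokenizer.SplitMode, mode)
        self.tokenizer = dictionary.Dictionary().create()
        self.with_postag = with_postag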
Example #2
    def tokenize(self, text: str):
        """Tokenize the input text with MeCab."""
        return_result = []
        parse_result = self.mecab.parse(text)
        if self.with_postag:
            # Each line is "surface<TAB>feature1,feature2,..."; drop the trailing EOS line.
            for elem in parse_result.split("\n")[:-1]:
                surface, feature = elem.split("\t")
                postag = feature.split(",")[0]
                return_result.append(Token(surface=surface, postag=postag))
        else:
            # Wakati output is a single space-separated line; strip the trailing
            # whitespace so no empty token is produced.
            for surface in parse_result.strip().split(" "):
                return_result.append(Token(surface=surface))

        return return_result
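The branch above assumes MeCab's default output, roughly one surface<TAB>feature line per morpheme followed by an EOS line. A self-contained sketch of the same parsing on an illustrative string (hypothetical values, not output from any particular dictionary):

# Hypothetical sample in MeCab's default output shape.
parse_result = (
    "吾輩\t名詞,代名詞,一般,*,*,*,吾輩,ワガハイ,ワガハイ\n"
    "は\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n"
    "EOS"
)

for elem in parse_result.split("\n")[:-1]:
    surface, feature = elem.split("\t")
    print(surface, feature.split(",")[0])  # 吾輩 名詞 / は 助詞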
Example #3
    def tokenize(self, text: str):
        """Tokenize the input text with KyTea."""
        return_result = []

        if self.with_postag:
            response = self.kytea.getTagsToString(text)
            # Protect whitespace surfaces before splitting on single spaces.
            response = response.replace("  ", " <SPACE>")  # FIXME

            # Each element is "surface/postag/pronunciation"; the trailing piece
            # after the final delimiter is dropped.
            for elem in response.split(" ")[:-1]:
                surface, postag, _ = elem.split("/")
                surface = surface.replace("<SPACE>", " ")
                return_result.append(Token(surface=surface, postag=postag))

        else:
            for surface in list(self.kytea.getWS(text)):
                return_result.append(Token(surface=surface))

        return return_result
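Similarly, the KyTea branch assumes getTagsToString returns space-separated surface/postag/pronunciation triples with a trailing delimiter. A sketch of that parsing on an illustrative string (hypothetical values, not real KyTea output):

response = "吾輩/名詞/わがはい は/助詞/は 猫/名詞/ねこ "

for elem in response.split(" ")[:-1]:
    surface, postag, _ = elem.split("/")  # the pronunciation field is unused here
    print(surface, postag)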
Example #4
    def test_word_tokenize_with_character(self):
        """Test Character tokenizer."""
        tokenizer1 = WordTokenizer(tokenizer="Character")
        tokenizer2 = WordTokenizer(tokenizer="character")
        # assert tokenizer1 == tokenizer2
        expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]  # NOQA
        result1 = tokenizer1.tokenize(SENTENCE1)
        result2 = tokenizer2.tokenize(SENTENCE1)
        assert expect == result1  # NOQA
        assert result1 == result2
Example #5
    def test_word_tokenize_with_sudachi_mode_c(self):
        """Test Sudachi tokenizer."""
        try:
            tokenizer = WordTokenizer(tokenizer="Sudachi", mode="C")
        except ModuleNotFoundError:
            pytest.skip("skip sudachi")

        # Mode C keeps the whole compound as a single token.
        expect = [Token(surface=w) for w in "医薬品安全管理責任者".split(" ")]
        result = tokenizer.tokenize(SENTENCE2)
        self.assertEqual(expect, result)
Example #6
    def test_word_tokenize_with_mecab(self):
        """Test MeCab tokenizer."""
        try:
            tokenizer1 = WordTokenizer(tokenizer="MeCab")
            tokenizer2 = WordTokenizer(tokenizer="mecab")
        except ModuleNotFoundError:
            pytest.skip("skip mecab")

        expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]  # NOQA
        result1 = tokenizer1.tokenize(SENTENCE1)
        result2 = tokenizer2.tokenize(SENTENCE1)
        assert expect == result1  # NOQA
        assert result1 == result2
Example #7
    def test_word_tokenize_with_mecab_with_postag(self):
        """Test MeCab tokenizer with part-of-speech tags."""
        try:
            tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
        except ModuleNotFoundError:
            pytest.skip("skip mecab")

        words = "吾輩 は 猫 で ある".split(" ")  # NOQA
        postags = "名詞 助詞 名詞 助動詞 助動詞".split(" ")

        expect = [Token(surface=w, postag=p) for w, p in zip(words, postags)]
        result = tokenizer.tokenize(SENTENCE1)
        assert expect == result
Example #8
    def test_token_with_postag2(self):
        token = Token(
            surface="大崎",
            postag="名詞",
            postag2="固有名詞,人名,姓",
            conj_type="*",
            conj_form="*",
            origin_form="大崎",
            yomi="オオサキ",
            pron="オーサキ")

        self.assertEqual(
            "名詞,固有名詞,人名,姓,*,*,大崎,オオサキ,オーサキ",
            token.feature)
Example #9
    def test_word_tokenize_with_sentencepiece(self):
        """Test Sentencepiece tokenizer."""
        try:
            tokenizer1 = WordTokenizer(tokenizer="Sentencepiece",
                                       model_path="data/model.spm")
            tokenizer2 = WordTokenizer(tokenizer="sentencepiece",
                                       model_path="data/model.spm")
        except ModuleNotFoundError:
            pytest.skip("skip sentencepiece")

        expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]  # NOQA
        result1 = tokenizer1.tokenize(SENTENCE1)
        result2 = tokenizer2.tokenize(SENTENCE1)
        assert expect == result1  # NOQA
        assert result1 == result2
Example #10
    def test_word_tokenize_with_sudachi_mode_a(self):
        """Test Sudachi tokenizer."""
        try:
            tokenizer = WordTokenizer(tokenizer="sudachi",
                                      mode="A",
                                      with_postag=True)
        except ModuleNotFoundError:
            pytest.skip("skip sudachi")

        words = "医薬 品 安全 管理 責任 者".split(" ")  # NOQA
        postags = "名詞 接尾辞 名詞 名詞 名詞 接尾辞".split(" ")

        expect = [Token(surface=w, postag=p) for w, p in zip(words, postags)]
        result = tokenizer.tokenize(SENTENCE2)
        self.assertEqual(expect, result)
Example #11
    def test_token_without_feature(self):
        token = Token(surface="大崎")
        self.assertEqual("大崎", token.surface)
        self.assertEqual("", token.feature)
Example #12
    def test_token_with_postag(self):
        token = Token(surface="大崎", postag="名詞")
        self.assertEqual("大崎", token.surface)
        self.assertEqual("名詞", token.feature)
Example #13
    def tokenize(self, text: str):
        """Tokenize the input text into subword pieces with SentencePiece."""
        result = []
        for subword in self.tokenizer.EncodeAsPieces(text):
            token = Token(surface=subword)
            result.append(token)
        return result
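For reference, the self.tokenizer used in Example #13 can be a SentencePiece processor loaded from the model file the tests reference; a minimal sketch using the sentencepiece package:

import sentencepiece as spm

processor = spm.SentencePieceProcessor()
processor.Load("data/model.spm")  # model path taken from the tests above
print(processor.EncodeAsPieces("吾輩は猫である"))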
Example #14
    def tokenize(self, text: str):
        """Tokenize the input text into single characters."""
        return [Token(surface=char) for char in list(text)]
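Finally, a usage sketch tying WordTokenizer and Token together, based only on calls that appear in the tests above; the import path is an assumption, not confirmed by this section:

from konoha import WordTokenizer  # hypothetical import path

tokenizer = WordTokenizer(tokenizer="character")
tokens = tokenizer.tokenize("吾輩は猫である")
print([token.surface for token in tokens])
# ['吾', '輩', 'は', '猫', 'で', 'あ', 'る']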