import pytest

from konoha import WordTokenizer
from konoha.data.token import Token  # Token's import path is assumed; it may vary across konoha versions.

# janome_tokens_list, mecab_tokens_list, kytea_tokens_list, sudachi_tokens_list,
# and SENTENCE1-SENTENCE3 are defined elsewhere in the test suite.


def test_postagging_with_janome():
    """Test Janome tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("Janome is not installed.")
    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize("すもももももももものうち")
    assert expect == result

def test_postagging_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")
    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_mecab():
    """Test MeCab tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True)
    except ImportError:
        pytest.skip("natto-py is not installed.")
    expect = [Token(**kwargs) for kwargs in mecab_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result

def test_word_tokenize_with_kytea_postag():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("KyTea is not installed.")
    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize(SENTENCE1)
    assert expect == result

def test_word_tokenize_with_janome_postag():
    """Test Janome tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="janome", with_postag=True)
    except ImportError:
        pytest.skip("Janome is not installed.")
    expect = [Token(**kwargs) for kwargs in janome_tokens_list]
    result = tokenizer.tokenize(SENTENCE3)
    assert expect == result

def test_postagging_with_kytea():
    """Test KyTea tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="kytea", with_postag=True)
    except ImportError:
        pytest.skip("KyTea is not installed.")
    expect = [Token(**kwargs) for kwargs in kytea_tokens_list]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_kytea():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("Mykytea is not installed.")
    tokenizer = WordTokenizer(tokenizer="KyTea")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_kytea_using_custom_model():
    try:
        import Mykytea
        del Mykytea
    except ImportError:
        pytest.skip("KyTea is not installed.")
    tokenizer = WordTokenizer(tokenizer="KyTea", model_path="data/model.knm")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_sudachi_mode_a():
    try:
        import sudachipy
        del sudachipy
    except ImportError:
        pytest.skip("SudachiPy is not installed.")
    tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
    expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result

def test_word_tokenize_with_mecab_whitespace():
    try:
        import natto
        del natto
    except ImportError:
        pytest.skip("natto-py is not installed.")
    tokenizer = WordTokenizer(tokenizer="MeCab")
    expect = [Token(surface=w) for w in "吾輩 は で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は である")
    assert expect == result

def test_word_tokenize_with_janome():
    try:
        import janome
        del janome
    except ImportError:
        pytest.skip("Janome is not installed.")
    tokenizer = WordTokenizer(tokenizer="Janome")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_sentencepiece():
    try:
        import sentencepiece
        del sentencepiece
    except ImportError:
        pytest.skip("Sentencepiece is not installed.")
    tokenizer = WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

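# The test above relies on a pre-trained model at data/model.spm. A minimal
# sketch of building such a model with the sentencepiece trainer; the corpus
# path and vocab_size are illustrative assumptions, and the trainer writes
# data/model.model, which would then be renamed to data/model.spm.
#
#   import sentencepiece as spm
#   spm.SentencePieceTrainer.train(
#       input="data/corpus.txt", model_prefix="data/model", vocab_size=8000
#   )
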
def test_word_tokenize_with_sudachi_mode_a_postag():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi", mode="A", with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")
    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize(SENTENCE2)
    assert expect == result

def test_postagging_with_sudachi_mode_a():
    """Test Sudachi tokenizer."""
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi", mode="A", with_postag=True)
    except ImportError:
        pytest.skip("SudachiPy is not installed.")
    expect = [Token(**kwargs) for kwargs in sudachi_tokens_list]
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert expect == result

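# Sudachi supports three split granularities: "A" (short units, as above),
# "B" (middle), and "C" (long units). A hypothetical companion test, not part
# of the original suite, sketching mode "C", where this compound stays a
# single token per Sudachi's documented behavior.
def test_word_tokenize_with_sudachi_mode_c():
    try:
        tokenizer = WordTokenizer(tokenizer="sudachi", mode="C")
    except ImportError:
        pytest.skip("SudachiPy is not installed.")
    result = tokenizer.tokenize("医薬品安全管理責任者")
    assert [token.surface for token in result] == ["医薬品安全管理責任者"]
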
from typing import List, Optional

from allennlp.data.tokenizers import Token, Tokenizer
from overrides import overrides

from konoha import WordTokenizer


class KonohaTokenizer(Tokenizer):
    """A konoha tokenizer integration for AllenNLP."""

    def __init__(
        self,
        tokenizer_name: str = "mecab",
        with_postag: bool = False,
        user_dictionary_path: Optional[str] = None,
        system_dictionary_path: Optional[str] = None,
        model_path: Optional[str] = None,
        mode: Optional[str] = None,
        dictionary_format: Optional[str] = None,
        start_tokens: Optional[List[str]] = None,
        end_tokens: Optional[List[str]] = None,
    ) -> None:
        self._tokenizer = WordTokenizer(
            tokenizer=tokenizer_name,
            with_postag=with_postag,
            user_dictionary_path=user_dictionary_path,
            system_dictionary_path=system_dictionary_path,
            model_path=model_path,
            mode=mode,
            dictionary_format=dictionary_format,
        )
        # Start tokens are reversed so that inserting each one at
        # position 0 preserves their original order.
        self._start_tokens = start_tokens or []
        self._start_tokens.reverse()
        self._end_tokens = end_tokens or []

    @overrides
    def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
        return [self.tokenize(text) for text in texts]

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        # Map konoha tokens onto AllenNLP tokens, carrying over the
        # surface form, lemma, and part-of-speech tag.
        konoha_tokens = self._tokenizer.tokenize(text)
        tokens = [
            Token(text=token.surface, lemma_=token.base_form, pos_=token.postag)
            for token in konoha_tokens
        ]
        for start_token in self._start_tokens:
            tokens.insert(0, Token(start_token, 0))
        for end_token in self._end_tokens:
            tokens.append(Token(end_token, -1))
        return tokens

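# A hypothetical usage sketch for the integration above, not part of the
# original suite: it assumes a Janome backend and checks that konoha surfaces
# land in AllenNLP's Token.text, matching the split expected by
# test_word_tokenize_with_janome.
def test_konoha_tokenizer_surfaces_with_janome():
    try:
        import janome
        del janome
    except ImportError:
        pytest.skip("Janome is not installed.")
    tokenizer = KonohaTokenizer(tokenizer_name="janome")
    result = tokenizer.tokenize("吾輩は猫である")
    assert [token.text for token in result] == ["吾輩", "は", "猫", "で", "ある"]
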
def test_kytea_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("boto3 is not installed; skipping S3 test.")
    try:
        tokenizer = WordTokenizer(
            tokenizer="KyTea",
            model_path="s3://konoha-demo/kytea/model.knm",
        )
    except ImportError:
        pytest.skip("KyTea is not installed.")
    expect = [Token(surface=w) for w in "吾輩は 猫である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_sentencepiece_with_s3_model():
    try:
        import boto3
        del boto3
    except ImportError:
        pytest.skip("boto3 is not installed; skipping S3 test.")
    try:
        tokenizer = WordTokenizer(
            tokenizer="SentencePiece",
            model_path="s3://konoha-demo/sentencepiece/model.spm",
        )
    except ImportError:
        pytest.skip("SentencePiece is not installed.")
    expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]  # NOQA
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_character():
    tokenizer = WordTokenizer(tokenizer="Character")
    expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
    result = tokenizer.tokenize("吾輩は猫である")
    assert expect == result

def test_word_tokenize_with_whitespace():
    tokenizer = WordTokenizer(tokenizer="Whitespace")
    expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
    result = tokenizer.tokenize("吾輩 は 猫 で ある")
    assert expect == result

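# A hypothetical consolidation of the two rule-based tests above using
# pytest.mark.parametrize; the tokenizer names and expected splits are taken
# directly from those tests.
@pytest.mark.parametrize(
    "tokenizer_name, text, expected",
    [
        ("Character", "吾輩は猫である", "吾 輩 は 猫 で あ る"),
        ("Whitespace", "吾輩 は 猫 で ある", "吾輩 は 猫 で ある"),
    ],
)
def test_word_tokenize_rule_based(tokenizer_name, text, expected):
    tokenizer = WordTokenizer(tokenizer=tokenizer_name)
    expect = [Token(surface=w) for w in expected.split(" ")]
    assert tokenizer.tokenize(text) == expect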