Example #1
    def test_build_user2(self):
        # Build a system dictionary from the test resources
        sys_dic = tempfile.mktemp(prefix="sudachi_sy", suffix=".dic")
        self.tempfiles.append(sys_dic)
        sudachipy.sudachipy.build_system_dic(
            matrix=RESOURCES_PATH / "matrix.def",
            lex=[RESOURCES_PATH / "lex.csv"],
            output=sys_dic
        )

        # Build two user dictionaries on top of the system dictionary
        u1_dic = tempfile.mktemp(prefix="sudachi_u1", suffix=".dic")
        self.tempfiles.append(u1_dic)
        sudachipy.sudachipy.build_user_dic(
            system=sys_dic,
            lex=[RESOURCES_PATH / "user1.csv"],
            output=u1_dic
        )

        u2_dic = tempfile.mktemp(prefix="sudachi_u2", suffix=".dic")
        self.tempfiles.append(u2_dic)
        sudachipy.sudachipy.build_user_dic(
            system=sys_dic,
            lex=[RESOURCES_PATH / "user2.csv"],
            output=u2_dic
        )

        # Tokenize with both user dictionaries attached; かぼす should come from
        # the second user dictionary (dictionary_id 2: 0 = system, 1 = first
        # user dictionary)
        cfg = self.make_config(sys_dic, [u1_dic, u2_dic])
        dic = sudachipy.Dictionary(config_path=cfg)  # avoid shadowing built-in dict
        tok = dic.create()
        result = tok.tokenize("かぼすにいく")
        self.assertEqual(result.size(), 3)
        self.assertEqual(result[0].dictionary_id(), 2)
        self.assertEqual(result[0].part_of_speech()[0], "被子植物門")
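The make_config helper is not shown in these examples. A minimal sketch of what it might do, assuming Sudachi's standard JSON configuration keys (systemDict, userDict); the helper name and call shape are taken from the test above, everything else is a reconstruction:

    def make_config(self, system_dic, user_dics):
        # Hypothetical reconstruction: write a Sudachi JSON config pointing at
        # the freshly built dictionaries (requires import json, tempfile)
        cfg = tempfile.mktemp(suffix=".json")
        self.tempfiles.append(cfg)
        with open(cfg, "w", encoding="utf-8") as f:
            json.dump({
                "systemDict": str(system_dic),
                "userDict": [str(d) for d in user_dics],
            }, f)
        return cfg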
Example #2
# Assumed imports for this snippet; wl_conversion and init_spacy_models are
# project-internal helpers defined elsewhere
import botok
import nltk
import nltk.tokenize.nist
import pkuseg
import sacremoses
import sudachipy


def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    # Fall back to the generic entry for languages without a dedicated tokenizer
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
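Once initialized, the tokenizers are cached as attributes on main and reused across calls. A hedged usage sketch for the SudachiPy branch (the sentence is illustrative; Morpheme.surface() is part of SudachiPy's public API):

init_word_tokenizers(main, lang='jpn', word_tokenizer='sudachipy_jpn')
morphemes = main.sudachipy_word_tokenizer.tokenize('東京へ行く')
tokens = [m.surface() for m in morphemes]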
Example #3
    def test_build_system(self):
        # Build a system dictionary and check that build statistics are returned
        out_tmp = tempfile.mktemp(prefix="sudachi_sy", suffix=".dic")
        self.tempfiles.append(out_tmp)
        stats = sudachipy.sudachipy.build_system_dic(
            matrix=RESOURCES_PATH / "matrix.def",
            lex=[RESOURCES_PATH / "lex.csv"],
            output=out_tmp
        )
        self.assertIsNotNone(stats)

        # The freshly built dictionary should split 東京にいく into three morphemes
        cfg = self.make_config(out_tmp, [])
        dic = sudachipy.Dictionary(config_path=cfg)  # avoid shadowing built-in dict
        tok = dic.create()
        result = tok.tokenize("東京にいく")
        self.assertEqual(result.size(), 3)
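The three expected morphemes are 東京, に, and いく. A short sketch of inspecting them, using SudachiPy's documented Morpheme accessors:

for m in result:
    print(m.surface(), m.part_of_speech()[0], m.dictionary_form())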
Example #4
# Assumed imports for this snippet; init_spacy_models is a project-internal
# helper defined elsewhere
import pymorphy2
import sudachipy


def init_lemmatizers(main, lang, lemmatizer):
    # spaCy
    if lemmatizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            if 'pymorphy2_morphological_analyzer_rus' not in main.__dict__:
                main.pymorphy2_morphological_analyzer_rus = pymorphy2.MorphAnalyzer(lang = 'ru')
        elif lang == 'ukr':
            if 'pymorphy2_morphological_analyzer_ukr' not in main.__dict__:
                main.pymorphy2_morphological_analyzer_ukr = pymorphy2.MorphAnalyzer(lang = 'uk')
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
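A hedged sketch of how the cached pymorphy2 analyzer would then be used for lemmatization; parse() and normal_form are pymorphy2's documented API, and the word is illustrative:

init_lemmatizers(main, lang='rus', lemmatizer='pymorphy2_morphological_analyzer')
lemma = main.pymorphy2_morphological_analyzer_rus.parse('стали')[0].normal_form  # most probable lemma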
Example #5
    def __init__(self) -> None:
        # Create a tokenizer from the default (installed) SudachiPy dictionary
        self._tokenizer = sudachipy.Dictionary().create()
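A hedged sketch of a tokenize method for this wrapper class (the method name is an assumption; surface() is SudachiPy's documented Morpheme accessor):

    def tokenize(self, text: str) -> list:
        # Return the surface forms of the analyzed morphemes
        return [m.surface() for m in self._tokenizer.tokenize(text)]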
Example #6
    def setUp(self) -> None:
        # Load the default SudachiPy dictionary once per test
        self.dict = sudachipy.Dictionary()
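A hedged sketch of a test that could use this fixture, assuming SudachiPy 0.6's sudachipy.SplitMode enum and an illustrative compound word:

    def test_split_modes(self) -> None:
        tok = self.dict.create()
        # Split mode A yields the shortest units, mode C the longest
        short = tok.tokenize("外国人参政権", sudachipy.SplitMode.A)
        long_units = tok.tokenize("外国人参政権", sudachipy.SplitMode.C)
        self.assertGreaterEqual(short.size(), long_units.size())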