def test_tagging(self):
    # test_1
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_2
    output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text, lower=True)
    self.assertEqual(output, str(words))

    # test_3
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))

    # test_4
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', 'ニューラルネット'])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))

    # test_5
    text = '3月に見た「3月のライオン」'
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_6
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_7
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_8
    output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_9
    words = [' (人•ᴗ•♡)', 'こんばんは', '♪']
    output = ['補助記号', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_10
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)
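A minimal usage sketch of the nagisa API exercised by the test above; str() of a tagging result gives the word/POS format seen in the assertions, and the .words/.postags attributes expose the pieces separately (assuming the default model shipped with nagisa):

# Hedged usage sketch of the nagisa API exercised above (not part of the test suite).
import nagisa

words = nagisa.tagging('Pythonで簡単に使えるツールです')
print(words)          # Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞
print(words.words)    # ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']
print(words.postags)  # ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']

# POS-tag an already tokenized list of words
print(nagisa.postagging(['こんばんは', '♪']))  # ['感動詞', '補助記号']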
def wl_pos_tag(main, tokens, lang, pos_tagger='default', tagset='custom'):
    tokens_tagged = []

    # Check if the first token is empty
    if tokens and tokens[0] == '':
        first_token_empty = True
    else:
        first_token_empty = False

    tokens = [str(token) for token in tokens if token]

    if pos_tagger == 'default':
        pos_tagger = main.settings_custom['pos_tagging']['pos_taggers'][lang]

    wl_text_utils.check_pos_taggers(main, lang=lang, pos_tagger=pos_tagger)

    # Chinese
    if pos_tagger == main.tr('jieba - Chinese POS Tagger'):
        tokens_tagged = jieba.posseg.cut(' '.join(tokens))
    # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
    elif 'spaCy' in pos_tagger:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        nlp.tagger(doc)

        tokens_tagged = [(token.text, token.tag_) for token in doc]
    # English & Russian
    elif pos_tagger == main.tr('NLTK - Perceptron POS Tagger'):
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == main.tr('nagisa - Japanese POS Tagger'):
        import nagisa

        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    # Russian & Ukrainian
    elif pos_tagger == main.tr('pymorphy2 - Morphological Analyzer'):
        if lang == 'rus':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
        elif lang == 'ukr':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

        for token in tokens:
            tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron Tagger (ORCHID)'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron Tagger (PUD)'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == main.tr('botok - Tibetan POS Tagger'):
        wl_text_utils.check_word_tokenizers(main, lang='bod')

        tokens = main.botok_word_tokenizer.tokenize(' '.join(tokens))

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Convert to Universal Tagset
    if (tagset == 'custom' and main.settings_custom['pos_tagging']['to_universal_pos_tags']
            or tagset == 'universal'):
        mappings = {
            tag: tag_universal
            for tag, tag_universal, _, _ in main.settings_custom['tagsets']['mappings'][lang][pos_tagger]
        }

        tokens_tagged = list(tokens_tagged)

        # Issue warnings if any tag is missing from the mapping table
        for _, tag in tokens_tagged:
            if tag not in mappings:
                print(f'Warning: tag "{tag}" is missing from the {wl_conversion.to_lang_text(main, lang)} mapping table!')

        tokens_tagged = [(token, mappings.get(tag, 'X')) for token, tag in tokens_tagged]

    # Strip empty tokens and strip whitespace in tokens
    tokens_tagged = [(token.strip(), tag) for token, tag in tokens_tagged if token.strip()]

    # Add the first empty token (if any)
    if first_token_empty:
        tokens_tagged.insert(0, ('', ''))

    return tokens_tagged
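The universal-tagset conversion above warns about unmapped tags and falls back to 'X' instead of raising a KeyError; a standalone sketch of that fallback, with an illustrative mapping table and tags not taken from Wordless:

# Standalone sketch of the unmapped-tag fallback used above; the mapping table is illustrative.
def to_universal(tokens_tagged, mappings):
    for _, tag in tokens_tagged:
        if tag not in mappings:
            print(f'Warning: tag "{tag}" is missing from the mapping table!')

    # Unmapped tags default to 'X' (the Universal POS tag for "other")
    return [(token, mappings.get(tag, 'X')) for token, tag in tokens_tagged]

print(to_universal([('dog', 'NN'), ('foo', '???')], {'NN': 'NOUN'}))
# [('dog', 'NOUN'), ('foo', 'X')]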
def wordless_pos_tag(main, tokens, lang, pos_tagger='default', tagset='custom'):
    tokens_tagged = []

    tokens = [str(token) for token in tokens]

    if pos_tagger == 'default':
        pos_tagger = main.settings_custom['pos_tagging']['pos_taggers'][lang]

    wordless_text_utils.check_pos_taggers(main, lang=lang, pos_tagger=pos_tagger)

    # Chinese
    if pos_tagger == main.tr('jieba - Chinese POS Tagger'):
        tokens_tagged = jieba.posseg.cut(' '.join(tokens))
    # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
    elif 'spaCy' in pos_tagger:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
        nlp.tagger(doc)

        tokens_tagged = [(token.text, token.tag_) for token in doc]
    # English & Russian
    elif pos_tagger == main.tr('NLTK - Perceptron POS Tagger'):
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == main.tr('nagisa - Japanese POS Tagger'):
        import nagisa

        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    # Russian & Ukrainian
    elif pos_tagger == main.tr('pymorphy2 - Morphological Analyzer'):
        if lang == 'rus':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
        elif lang == 'ukr':
            morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

        for token in tokens:
            tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron POS Tagger - ORCHID Corpus'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
    elif pos_tagger == main.tr('PyThaiNLP - Perceptron POS Tagger - PUD Corpus'):
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == main.tr('pybo - Tibetan POS Tagger'):
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

        wordless_text_utils.check_pybo_tokenizers(main, word_tokenizer=word_tokenizer)

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
            tokens = main.pybo_tokenizer_tsikchen.tokenize(' '.join(tokens))

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Convert to Universal Tagset
    if (tagset == 'custom' and main.settings_custom['pos_tagging']['to_universal_pos_tags']
            or tagset == 'universal'):
        mappings = {
            tag: tag_universal
            for tag, tag_universal, _, _ in main.settings_custom['tagsets']['mappings'][lang][pos_tagger]
        }

        tokens_tagged = [(token, mappings[tag]) for token, tag in tokens_tagged]

    # Strip empty tokens and strip whitespace in tokens
    tokens_tagged = [(token.strip(), tag) for token, tag in tokens_tagged if token.strip()]

    # Check if the first token is empty
    if tokens[0] == '':
        tokens_tagged.insert(0, ('', ''))

    return tokens_tagged
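For reference, a minimal pymorphy2 call matching the Russian/Ukrainian branch above; the documented attribute is tag.POS, while the code above reads the private tag._POS (a sketch assuming pymorphy2 and its Russian dictionaries are installed):

# Minimal pymorphy2 sketch (assumes pymorphy2 and its Russian dictionaries are installed).
import pymorphy2

morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')

# parse() returns candidate analyses ordered by likelihood; take the top one
analysis = morphological_analyzer.parse('стали')[0]
print(analysis.tag.POS)  # e.g. 'VERB' or 'NOUN', depending on the top analysis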
def wl_pos_tag_tokens(main, tokens, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words=tokens, spaces=[False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a tagger component and Japanese POS tags are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117
        else:
            doc = nlp(''.join(tokens))

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(''.join(tokens))
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [
            (token.surface(), '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*']))
            for token in main.sudachipy_word_tokenizer.tokenize(''.join(tokens))
        ]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == 'pythainlp_perceptron_lst20':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='lst20')
    elif pos_tagger == 'pythainlp_perceptron_orchid':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
    elif pos_tagger == 'pythainlp_perceptron_pud':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [(str(token).strip(), tag) for token, tag in tokens_tagged if str(token).strip()]

    # Make sure that tokenization is not modified during POS tagging
    i_tokens = 0
    i_tokens_tagged = 0

    len_tokens = len(tokens)
    len_tokens_tagged = len(tokens_tagged)

    if len_tokens != len_tokens_tagged:
        tokens_tagged_modified = []

        while i_tokens < len_tokens and i_tokens_tagged < len_tokens_tagged:
            # Different token
            if len(tokens[i_tokens]) != len(tokens_tagged[i_tokens_tagged][0]):
                tokens_temp = [tokens[i_tokens]]
                tokens_tagged_temp = [tokens_tagged[i_tokens_tagged][0]]
                tags_temp = [tokens_tagged[i_tokens_tagged][1]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_tokens_tagged < len_tokens_tagged - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_tokens_tagged_temp = sum([len(token) for token in tokens_tagged_temp])

                    if len_tokens_temp > len_tokens_tagged_temp:
                        tokens_tagged_temp.append(tokens_tagged[i_tokens_tagged + 1][0])
                        tags_temp.append(tokens_tagged[i_tokens_tagged + 1][1])

                        i_tokens_tagged += 1
                    elif len_tokens_temp < len_tokens_tagged_temp:
                        tokens_temp.append(tokens[i_tokens + 1])

                        i_tokens += 1
                    else:
                        if len(tokens_temp) == len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tag)
                                for token, tag in zip(tokens_temp, tags_temp)
                            ])
                        elif len(tokens_temp) > len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([(token, tags_temp[0]) for token in tokens_temp])
                        else:
                            tokens_tagged_modified.append((tokens_temp[0], tags_temp[0]))

                        tokens_temp = []
                        tokens_tagged_temp = []
                        tags_temp = []

                        break

                if tokens_temp:
                    if len(tokens_temp) == len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tag)
                            for token, tag in zip(tokens_temp, tags_temp)
                        ])
                    elif len(tokens_temp) > len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([(token, tags_temp[0]) for token in tokens_temp])
                    else:
                        tokens_tagged_modified.append((tokens_temp[0], tags_temp[0]))
            else:
                tokens_tagged_modified.append((tokens[i_tokens], tokens_tagged[i_tokens_tagged][1]))

            i_tokens += 1
            i_tokens_tagged += 1

        len_tokens_tagged_modified = len(tokens_tagged_modified)

        if len_tokens < len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified[:len_tokens]
        elif len_tokens > len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified + [tokens_tagged_modified[-1]] * (len_tokens - len_tokens_tagged_modified)
        else:
            tokens_tagged = tokens_tagged_modified.copy()
    else:
        tokens_tagged = [(tokens[i], tokens_tagged[i][1]) for i in range(len(tokens))]

    return tokens_tagged
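The final length reconciliation above can be illustrated in isolation: when realignment still yields too few (token, tag) pairs, the last pair is repeated as padding; when it yields too many, the list is truncated (a simplified sketch with a hypothetical helper name, not the Wordless implementation):

# Simplified sketch of the final length reconciliation above (illustrative helper, not Wordless code).
def reconcile_length(tokens, tokens_tagged):
    if len(tokens_tagged) > len(tokens):
        # Too many pairs: truncate to the original number of tokens
        return tokens_tagged[:len(tokens)]
    if len(tokens_tagged) < len(tokens):
        # Too few pairs: pad by repeating the last (token, tag) pair
        return tokens_tagged + [tokens_tagged[-1]] * (len(tokens) - len(tokens_tagged))
    return tokens_tagged.copy()

print(reconcile_length(['a', 'b', 'c'], [('a', 'X'), ('b', 'Y')]))
# [('a', 'X'), ('b', 'Y'), ('b', 'Y')]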
def test_tagging(self):
    # test_1
    text = 'Pythonで簡単に使えるツールです'
    output = 'Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_2
    output = 'python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞'
    words = nagisa.tagging(text, lower=True)
    self.assertEqual(output, str(words))

    # test_3
    text = 'ニューラルネットワークを使ってます。'
    output = 'ニューラル/名詞 ネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(nagisa.tagging(text)))

    # test_4
    tagger_nn = nagisa.Tagger(single_word_list=['ニューラルネットワーク', 'ニューラルネット'])
    output = 'ニューラルネットワーク/名詞 を/助詞 使っ/動詞 て/助動詞 ます/助動詞 。/補助記号'
    self.assertEqual(output, str(tagger_nn.tagging(text)))

    # test_5
    text = '3月に見た「3月のライオン」'
    new_tagger = nagisa.Tagger(single_word_list=['3月のライオン'])
    output = '3/名詞 月/名詞 に/助詞 見/動詞 た/助動詞 「/補助記号 3月のライオン/名詞 」/補助記号'
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_6
    text = 'それが、iPhone XSです。'
    output = 'それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号'
    new_tagger = nagisa.Tagger(single_word_list=['iPhone[a-zA-Z0-9 ]+'])
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_7
    text = '1234abc ABC'
    output = '1234/名詞 abc ABC/名詞'
    new_tagger = nagisa.Tagger(single_word_list=['[a-zA-Z ]+', '[0-9]+'])
    self.assertEqual(output, str(new_tagger.tagging(text)))

    # test_8
    text = '(人•ᴗ•♡)こんばんは♪'
    output = '(人•ᴗ•♡)/補助記号 こんばんは/感動詞 ♪/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_9
    url = 'https://github.com/taishi-i/nagisaでコードを公開中(๑¯ω¯๑)'
    output = 'コード/名詞 公開/名詞 中/接尾辞'
    words = nagisa.filter(url, filter_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_10
    output = 'https://github.com/taishi-i/nagisa/URL で/助詞 を/助詞 (๑ ̄ω ̄๑)/補助記号'
    words = nagisa.extract(url, extract_postags=['URL', '補助記号', '助詞'])
    self.assertEqual(output, str(words))

    # test_11
    words = [' (人•ᴗ•♡)', 'こんばんは', '♪']
    output = ['補助記号', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_12
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_13
    words = [' (人•ᴗ•♡)', ' ', 'こんばんは', '♪']
    output = ['補助記号', '空白', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_14
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_15
    words = [' (人•ᴗ•♡)', ' ', 'こんばんは', '♪']
    output = ['補助記号', '空白', '感動詞', '補助記号']
    postags = nagisa.postagging(words)
    self.assertEqual(output, postags)

    # test_16
    postags = nagisa.decode(words)
    self.assertEqual(output, postags)

    # test_17
    text = 'こんばんは😀'
    output = 'こんばんは/感動詞 😀/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_18
    text = 'コンバンハ12345'
    output = 'コンバンハ/名詞 1/名詞 2/名詞 3/名詞 4/名詞 5/名詞'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))

    # test_19
    text = '𪗱𪘂𪘚𪚲'
    output = '𪗱/補助記号 𪘂/補助記号 𪘚/補助記号 𪚲/補助記号'
    words = nagisa.tagging(text)
    self.assertEqual(output, str(words))
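test_6 and test_7 above rely on nagisa accepting regular-expression patterns in single_word_list, so that any span matching a pattern is kept as a single token instead of being split; a brief sketch of that usage, grounded in the expectations asserted above (assuming the default model):

# Brief sketch of regex patterns in single_word_list, as exercised by test_6/test_7 above.
import nagisa

# Spans matching the pattern are kept as single tokens instead of being split
tagger = nagisa.Tagger(single_word_list=['iPhone[a-zA-Z0-9 ]+'])
print(tagger.tagging('それが、iPhone XSです。'))
# それ/代名詞 が/助詞 、/補助記号 iPhone XS/名詞 です/助動詞 。/補助記号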