def init_word_detokenizers(main, lang):
    if lang not in ['zho_cn', 'zho_tw', 'jpn', 'tha', 'bod']:
        # Sacremoses
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_detokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_detokenizer_{lang}'] = sacremoses.MosesDetokenizer(lang = lang_sacremoses)
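# Illustrative sketch (not part of the original module): the detokenizer cached above wraps
# sacremoses, whose API can be exercised on its own as follows.
def _demo_sacremoses_detokenizer():
    import sacremoses

    # 'en' stands in for the ISO 639-1 code that init_word_detokenizers derives for English
    detokenizer = sacremoses.MosesDetokenizer(lang = 'en')

    # Rejoins tokens and reattaches punctuation, e.g. ['Hello', ',', 'world', '!'] -> 'Hello, world!'
    return detokenizer.detokenize(['Hello', ',', 'world', '!'])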
def test_remove_lang_code_suffixes():
    for lang_code_639_3, lang_code_639_1 in TO_ISO_639_1.items():
        if lang_code_639_3.find('_') > -1:
            lang_code_639_3 = wl_conversion.remove_lang_code_suffixes(main, lang_code_639_3)

            assert lang_code_639_3.find('_') == -1

        if lang_code_639_1.find('_') > -1:
            lang_code_639_1 = wl_conversion.remove_lang_code_suffixes(main, lang_code_639_1)

            assert lang_code_639_1.find('_') == -1
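# Hypothetical reference implementation (not from the Wordless codebase) illustrating the
# behavior the test above asserts: everything after the first underscore is treated as a
# regional/script suffix and dropped, e.g. 'eng_us' -> 'eng', 'zho_cn' -> 'zho'.
def _demo_remove_lang_code_suffixes(lang_code):
    return lang_code.split('_')[0]


def test_demo_remove_lang_code_suffixes():
    assert _demo_remove_lang_code_suffixes('eng_us') == 'eng'
    assert _demo_remove_lang_code_suffixes('zho_cn') == 'zho'
    assert _demo_remove_lang_code_suffixes('eng') == 'eng'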
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
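# Minimal sketch (not from the original codebase) of the lazy-initialization pattern used
# above, with types.SimpleNamespace standing in for the application's `main` object.
def _demo_lazy_init_tweet_tokenizer():
    import types

    import nltk

    main = types.SimpleNamespace()

    # Construct the tokenizer once and cache it on `main`; later calls reuse the instance
    if 'nltk_tweet_tokenizer' not in main.__dict__:
        main.nltk_tweet_tokenizer = nltk.TweetTokenizer()

    return main.nltk_tweet_tokenizer.tokenize('Gr8 talk @conf #nlp :-)')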
def init_spacy_models(main, lang):
    # Chinese, English, German, Portuguese
    if not lang.startswith('srp_'):
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    if f'spacy_nlp_{lang}' not in main.__dict__:
        # Languages with models
        if lang in SPACY_LANGS:
            model = importlib.import_module(SPACY_LANGS[lang])

            main.__dict__[f'spacy_nlp_{lang}'] = model.load(disable = ['parser', 'ner'])
            # Add senter
            main.__dict__[f'spacy_nlp_{lang}'].enable_pipe('senter')
        # Languages without models
        else:
            # Serbian
            if lang == 'srp_cyrl':
                main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            elif lang == 'srp_latn':
                main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
            else:
                main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wl_conversion.to_iso_639_1(main, lang))

            # Add sentencizer and lemmatizer
            main.__dict__[f'spacy_nlp_{lang}'].add_pipe('sentencizer')

            if lang in SPACY_LANGS_LEMMATIZERS:
                main.__dict__[f'spacy_nlp_{lang}'].add_pipe('lemmatizer')

                main.__dict__[f'spacy_nlp_{lang}'].initialize()
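# Illustrative sketch of the fallback path above for a language without a packaged model:
# a blank spaCy pipeline plus the rule-based 'sentencizer' component.
def _demo_blank_spacy_pipeline():
    import spacy

    nlp = spacy.blank('en')      # analogous to spacy.blank(<ISO 639-1 code>) above
    nlp.add_pipe('sentencizer')  # rule-based sentence boundaries, no trained model required

    doc = nlp('This is one sentence. This is another.')

    return [sentence.text for sentence in doc.sents]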
def wl_word_tokenize(main, text, lang, word_tokenizer='default'):
    tokens_multilevel = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    wl_nlp_utils.init_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    if word_tokenizer.startswith('spacy_'):
        # Input of SudachiPy cannot be more than 49149 BYTES
        if word_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
            # Around 300 tokens per line, 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
            sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
        else:
            sections = wl_nlp_utils.split_into_chunks_text(text, section_size=main.settings_custom['files']['misc']['read_files_in_chunks'])
    else:
        sections = wl_nlp_utils.split_into_chunks_text(text, 1)

    for section in sections:
        # spaCy
        if word_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            tokens_multilevel.append([])

            len_sents = len(list(doc.sents))

            for i, sentence in enumerate(doc.sents):
                tokens_sentence = []

                tokens = [token.text for token in sentence]
                len_tokens = len(tokens)

                for j, token in enumerate(tokens):
                    # Split paragraphs by new line character
                    len_lines = len(re.findall(r'\n', token))

                    if len_lines:
                        # Check if the last paragraph is empty
                        if i == len_sents - 1 and j == len_tokens - 1 and token.endswith('\n'):
                            len_lines -= 1

                        if tokens_sentence:
                            tokens_multilevel[-1].append(tokens_sentence)

                            tokens_sentence = []

                        tokens_multilevel.extend([[] for j in range(len_lines)])
                    else:
                        if token.strip():
                            tokens_sentence.append(token)

                if tokens_sentence:
                    tokens_multilevel[-1].append(tokens_sentence)
        else:
            tokens_multilevel.append([])

            if section.strip():
                # NLTK
                if word_tokenizer.startswith('nltk_'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang)

                    if word_tokenizer == 'nltk_nist':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(main.nltk_nist_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_nltk':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(main.nltk_nltk_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_penn_treebank':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(main.nltk_treebank_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_tok_tok':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(main.nltk_toktok_tokenizer.tokenize(sentence))
                    elif word_tokenizer == 'nltk_twitter':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(main.nltk_tweet_tokenizer.tokenize(sentence))
                # Sacremoses
                elif word_tokenizer == 'sacremoses_moses':
                    lang = wl_conversion.remove_lang_code_suffixes(main, lang)

                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(main.__dict__[f'sacremoses_moses_tokenizer_{lang}'].tokenize(sentence, escape=False))
                # Chinese
                elif word_tokenizer == 'jieba_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(jieba.lcut(sentence))
                elif word_tokenizer == 'pkuseg_zho':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
                elif word_tokenizer == 'wordless_zho_char':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # English
                                    if wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(sentence) or not wl_checking_unicode.is_eng(sentence[i + j + 1]):
                                                tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng_us'))
                                                tokens = list(wl_misc.flatten_list(tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(sentence) or wl_checking_unicode.is_han(sentence[i + j + 1]):
                                                tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                                tokens = list(wl_misc.flatten_list(tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Japanese
                elif word_tokenizer == 'nagisa_jpn':
                    import nagisa

                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    for sentence in sentences:
                        tokens_multilevel[-1].append(nagisa.tagging(str(sentence)).words)
                elif word_tokenizer.startswith('sudachipy_jpn'):
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    if word_tokenizer == 'sudachipy_jpn_split_mode_a':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.tokenize(sentence, sudachipy.SplitMode.A)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_b':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.tokenize(sentence, sudachipy.SplitMode.B)
                            ])
                    elif word_tokenizer == 'sudachipy_jpn_split_mode_c':
                        for sentence in sentences:
                            tokens_multilevel[-1].append([
                                token.surface()
                                for token in main.sudachipy_word_tokenizer.tokenize(sentence, sudachipy.SplitMode.C)
                            ])
                elif word_tokenizer == 'wordless_jpn_kanji':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang=lang)

                    for sentence in sentences:
                        tokens = []
                        non_han_start = 0

                        for i, char in enumerate(sentence):
                            if i >= non_han_start:
                                if wl_checking_unicode.is_han(char):
                                    tokens.append(char)

                                    non_han_start += 1
                                else:
                                    # Japanese Kana
                                    if wl_checking_unicode.is_kana(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(sentence) or not wl_checking_unicode.is_kana(sentence[i + j + 1]):
                                                tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='jpn'))
                                                tokens = list(wl_misc.flatten_list(tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # English
                                    elif wl_checking_unicode.is_eng(char):
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(sentence) or not wl_checking_unicode.is_eng(sentence[i + j + 1]):
                                                tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='eng_us'))
                                                tokens = list(wl_misc.flatten_list(tokens))

                                                non_han_start = i + j + 1

                                                break
                                    # Other Languages
                                    else:
                                        for j, _ in enumerate(sentence[i:]):
                                            if i + j + 1 == len(sentence) or wl_checking_unicode.is_han(sentence[i + j + 1]):
                                                tokens.extend(wl_word_tokenize(main, sentence[non_han_start:i + j + 1], lang='other'))
                                                tokens = list(wl_misc.flatten_list(tokens))

                                                non_han_start = i + j + 1

                                                break

                        tokens_multilevel[-1].append(tokens)
                # Icelandic
                elif word_tokenizer == 'tokenizer_isl':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang='isl', sentence_tokenizer='tokenizer_isl')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token
                            for kind, token, val in tokenizer.tokenize(sentence)
                            if token
                        ])
                # Thai
                elif word_tokenizer.startswith('pythainlp_'):
                    # Preserve sentence boundaries
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang='tha')

                    if word_tokenizer == 'pythainlp_longest_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine='longest'))
                    elif word_tokenizer == 'pythainlp_max_matching':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine='mm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine='newmm'))
                    elif word_tokenizer == 'pythainlp_max_matching_tcc_safe_mode':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine='newmm-safe'))
                    elif word_tokenizer == 'pythainlp_nercut':
                        for sentence in sentences:
                            tokens_multilevel[-1].append(pythainlp.word_tokenize(sentence, engine='nercut'))
                # Tibetan
                elif word_tokenizer == 'botok_bod':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang='bod')

                    for sentence in sentences:
                        tokens_multilevel[-1].append([
                            token.text
                            for token in main.botok_word_tokenizer.tokenize(sentence)
                        ])
                # Vietnamese
                elif word_tokenizer == 'underthesea_vie':
                    sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, section, lang='vie', sentence_tokenizer='underthesea_vie')

                    for sentence in sentences:
                        tokens_multilevel[-1].append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for para in tokens_multilevel:
        for i, sentence in enumerate(para):
            para[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1], boundary='', sentence_ending=True)
    else:
        for para in tokens_multilevel:
            for sentence in para:
                if sentence:
                    sentence[-1] = wl_texts.Wl_Token(sentence[-1], boundary=' ', sentence_ending=True)

    return tokens_multilevel
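# Usage sketch: `main` is hypothetical here (in Wordless it is the main window carrying
# settings_global/settings_custom). wl_word_tokenize returns a three-level structure,
# paragraphs -> sentences -> tokens, with the last token of each sentence wrapped in
# wl_texts.Wl_Token; the helper below shows one way to flatten such a structure.
def _demo_flatten_tokens_multilevel(tokens_multilevel):
    return [
        token
        for para in tokens_multilevel
        for sentence in para
        for token in sentence
    ]

# e.g. _demo_flatten_tokens_multilevel([[['Hello', 'world', '.']], [['Bye', '.']]])
# -> ['Hello', 'world', '.', 'Bye', '.']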
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    wl_nlp_utils.init_sentence_tokenizers(main, lang=lang, sentence_tokenizer=sentence_tokenizer)

    # Input of SudachiPy cannot be more than 49149 BYTES
    if sentence_tokenizer == 'spacy_jpn' and len(text) > 49149 // 4:
        # Around 300 tokens per line, 4 characters per token and 4 bytes per character (≈ 49149 / 4 / 4 / 300)
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=10)
    else:
        sections = wl_nlp_utils.split_into_chunks_text(text, section_size=main.settings_custom['files']['misc']['read_files_in_chunks'])

    for section in sections:
        # NLTK
        if sentence_tokenizer == 'nltk_punkt':
            lang_texts = {
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                # English
                'eng_gb': 'english',
                'eng_us': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                # German
                'deu_at': 'german',
                'deu_de': 'german',
                'deu_ch': 'german',
                'ell': 'greek',
                'ita': 'italian',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'pol': 'polish',
                # Portuguese
                'por_br': 'portuguese',
                'por_pt': 'portuguese',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tur': 'turkish',
                # Other languages
                'other': 'english'
            }

            sentences.extend(nltk.sent_tokenize(section, language=lang_texts[lang]))
        # spaCy
        elif sentence_tokenizer.startswith('spacy_'):
            # Chinese, English, German, Portuguese
            if not lang.startswith('srp_'):
                lang = wl_conversion.remove_lang_code_suffixes(main, lang)

            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = nlp(section)

            sentences.extend([sentence.text for sentence in doc.sents])
        # Chinese & Japanese
        elif sentence_tokenizer in ['wordless_zho', 'wordless_jpn']:
            for line in section.splitlines():
                sentence_start = 0

                for i, char in enumerate(line):
                    if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                        for j, char_next in enumerate(line):
                            if j > i and char_next not in ['。', '!', '?', '!', '?', '’', '”', ')', ')']:
                                sentences.append(line[sentence_start:j])

                                sentence_start = j

                                break

                if sentence_start <= len(line):
                    sentences.append(line[sentence_start:])
        # Icelandic
        elif sentence_tokenizer == 'tokenizer_isl':
            for sentence in tokenizer.split_into_sentences(section):
                sentences.append(wl_word_detokenization.wl_word_detokenize(main, tokens=sentence.split(), lang='isl'))
        # Thai
        elif sentence_tokenizer == 'pythainlp_crfcut':
            sentences.extend(pythainlp.sent_tokenize(section))
        # Tibetan
        elif sentence_tokenizer == 'botok_bod':
            wl_nlp_utils.init_word_tokenizers(main, lang='bod')

            tokens = main.botok_word_tokenizer.tokenize(section)

            for sentence_tokens in botok.sentence_tokenizer(tokens):
                sentences.append(''.join([
                    sentence_token.text
                    for sentence_token in sentence_tokens['tokens']
                ]))
        # Vietnamese
        elif sentence_tokenizer == 'underthesea_vie':
            sentences.extend(underthesea.sent_tokenize(section))

    # Strip spaces
    sentences = [
        sentence_non_empty
        for sentence in sentences
        if (sentence_non_empty := sentence.strip())
    ]

    return sentences
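# Illustrative sketch of the NLTK branch above in isolation; nltk.sent_tokenize assumes the
# 'punkt' data package has been downloaded (nltk.download('punkt')). The language name
# corresponds to the lang_texts mapping above (e.g. 'eng_us' -> 'english').
def _demo_nltk_sentence_tokenize():
    import nltk

    text = 'Dr. Smith arrived at 9 a.m. He left an hour later.'

    return nltk.sent_tokenize(text, language='english')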
def wl_pos_tag_tokens(main, tokens, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    lang = wl_conversion.remove_lang_code_suffixes(main, lang)

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words=tokens, spaces=[False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a tagger component and Japanese POS tags are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1910117
        else:
            doc = nlp(''.join(tokens))

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(''.join(tokens))
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = zip(tokens, nagisa.postagging(tokens))
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [
            (token.surface(), '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*']))
            for token in main.sudachipy_word_tokenizer.tokenize(''.join(tokens))
        ]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger == 'pythainlp_perceptron_lst20':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='lst20')
    elif pos_tagger == 'pythainlp_perceptron_orchid':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
    elif pos_tagger == 'pythainlp_perceptron_pud':
        tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [
        (str(token).strip(), tag)
        for token, tag in tokens_tagged
        if str(token).strip()
    ]

    # Make sure that tokenization is not modified during POS tagging
    i_tokens = 0
    i_tokens_tagged = 0

    len_tokens = len(tokens)
    len_tokens_tagged = len(tokens_tagged)

    if len_tokens != len_tokens_tagged:
        tokens_tagged_modified = []

        while i_tokens < len_tokens and i_tokens_tagged < len_tokens_tagged:
            # Different token
            if len(tokens[i_tokens]) != len(tokens_tagged[i_tokens_tagged][0]):
                tokens_temp = [tokens[i_tokens]]
                tokens_tagged_temp = [tokens_tagged[i_tokens_tagged][0]]
                tags_temp = [tokens_tagged[i_tokens_tagged][1]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_tokens_tagged < len_tokens_tagged - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_tokens_tagged_temp = sum([len(token) for token in tokens_tagged_temp])

                    if len_tokens_temp > len_tokens_tagged_temp:
                        tokens_tagged_temp.append(tokens_tagged[i_tokens_tagged + 1][0])
                        tags_temp.append(tokens_tagged[i_tokens_tagged + 1][1])

                        i_tokens_tagged += 1
                    elif len_tokens_temp < len_tokens_tagged_temp:
                        tokens_temp.append(tokens[i_tokens + 1])

                        i_tokens += 1
                    else:
                        if len(tokens_temp) == len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([
                                (token, tag)
                                for token, tag in zip(tokens_temp, tags_temp)
                            ])
                        elif len(tokens_temp) > len(tokens_tagged_temp):
                            tokens_tagged_modified.extend([(token, tags_temp[0]) for token in tokens_temp])
                        else:
                            tokens_tagged_modified.append((tokens_temp[0], tags_temp[0]))

                        tokens_temp = []
                        tokens_tagged_temp = []
                        tags_temp = []

                        break

                if tokens_temp:
                    if len(tokens_temp) == len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([
                            (token, tag)
                            for token, tag in zip(tokens_temp, tags_temp)
                        ])
                    elif len(tokens_temp) > len(tokens_tagged_temp):
                        tokens_tagged_modified.extend([(token, tags_temp[0]) for token in tokens_temp])
                    else:
                        tokens_tagged_modified.append((tokens_temp[0], tags_temp[0]))
            else:
                tokens_tagged_modified.append((tokens[i_tokens], tokens_tagged[i_tokens_tagged][1]))

            i_tokens += 1
            i_tokens_tagged += 1

        len_tokens_tagged_modified = len(tokens_tagged_modified)

        if len_tokens < len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified[:len_tokens]
        elif len_tokens > len_tokens_tagged_modified:
            tokens_tagged = tokens_tagged_modified + [tokens_tagged_modified[-1]] * (len_tokens - len_tokens_tagged_modified)
        else:
            tokens_tagged = tokens_tagged_modified.copy()
    else:
        tokens_tagged = [(tokens[i], tokens_tagged[i][1]) for i in range(len(tokens))]

    return tokens_tagged
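# Illustrative sketch of the 'nltk_perceptron' branch above in isolation; it assumes the NLTK
# 'averaged_perceptron_tagger' data package has been downloaded. lang='eng' matches the
# suffix-stripped language code passed to nltk.pos_tag above.
def _demo_nltk_pos_tag():
    import nltk

    tokens = ['Wordless', 'tags', 'tokens', '.']

    return nltk.pos_tag(tokens, lang='eng')  # e.g. [('Wordless', 'NNP'), ('tags', 'VBZ'), ...]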
def wl_pos_tag_text(main, text, lang, pos_tagger, tagset):
    tokens_tagged = []

    if pos_tagger == 'nagisa_jpn':
        # Defer import to save loading time
        import nagisa

    # spaCy
    if pos_tagger.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        if tagset == 'default':
            tokens_tagged = [(token.text, token.tag_) for token in doc]
        elif tagset == 'universal':
            tokens_tagged = [(token.text, token.pos_) for token in doc]
    # Chinese
    elif pos_tagger == 'jieba_zho':
        tokens_tagged = jieba.posseg.cut(text)
    # English & Russian
    elif pos_tagger == 'nltk_perceptron':
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)
        tokens_tagged = nltk.pos_tag(tokens, lang=lang)
    # Japanese
    elif pos_tagger == 'nagisa_jpn':
        tokens_tagged = nagisa.tagging(text)
        tokens_tagged = zip(tokens_tagged.words, tokens_tagged.postags)
    elif pos_tagger == 'sudachipy_jpn':
        tokens_tagged = [
            (token.surface(), '-'.join([pos for pos in token.part_of_speech()[:4] if pos != '*']))
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif pos_tagger == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)

        for token in tokens:
            tokens_tagged.append((token, morphological_analyzer.parse(token)[0].tag._POS))
    # Thai
    elif pos_tagger.startswith('pythainlp_'):
        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang=lang)

        if pos_tagger == 'pythainlp_perceptron_lst20':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='lst20')
        elif pos_tagger == 'pythainlp_perceptron_orchid':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='orchid')
        elif pos_tagger == 'pythainlp_perceptron_pud':
            tokens_tagged = pythainlp.tag.pos_tag(tokens, engine='perceptron', corpus='pud')
    # Tibetan
    elif pos_tagger == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))
    # Vietnamese
    elif pos_tagger == 'underthesea_vie':
        tokens_tagged = underthesea.pos_tag(text)

    # Remove empty tokens and strip whitespace in tokens
    tokens_tagged = [
        (str(token).strip(), tag)
        for token, tag in tokens_tagged
        if str(token).strip()
    ]

    return tokens_tagged
def wl_get_stop_word_list(main, lang, stop_word_list='default'):
    if lang not in main.settings_global['stop_word_lists']:
        lang = 'other'

    if stop_word_list == 'default':
        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_lists'][lang]

    stop_words = []

    if stop_word_list == 'custom':
        stop_words = main.settings_custom['stop_word_lists']['custom_lists'][lang]
    else:
        lang_639_1 = wl_conversion.to_iso_639_1(main, lang)

        # Chinese (Simplified), English, German, Portuguese
        if lang != 'zho_tw' and not lang.startswith('srp_'):
            lang_639_1 = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            cc = opencc.OpenCC('s2twp')

            stop_words_zho_cn = wl_get_stop_word_list(main, lang='zho_cn', stop_word_list=stop_word_list.replace('zho_tw', 'zho_cn'))

            stop_words = [cc.convert(stop_word) for stop_word in stop_words_zho_cn]
        elif stop_word_list.startswith('cltk_'):
            stop_words = importlib.import_module(f'stop_word_lists.cltk.{lang}').STOPS
        # extra-stopwords
        elif stop_word_list.startswith('extra_stopwords_'):
            LANG_TEXTS = {
                'sqi': 'albanian',
                'ara': 'arabic',
                'hye': 'armenian',
                'eus': 'basque',
                'bel': 'belarusian',
                'ben': 'bengali',
                'bul': 'bulgarian',
                'cat': 'catalan',
                'zho': 'chinese',
                'hrv': 'croatian',
                'ces': 'czech',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'est': 'estonian',
                'fin': 'finnish',
                'fra': 'french',
                'glg': 'galician',
                'deu': 'german',
                'ell': 'greek',
                'hau': 'hausa',
                'heb': 'hebrew',
                'hin': 'hindi',
                'hun': 'hungarian',
                'isl': 'icelandic',
                'ind': 'indonesian',
                'gle': 'irish',
                'ita': 'italian',
                'jpn': 'japanese',
                'kor': 'korean',
                'kur': 'kurdish',
                'lav': 'latvian',
                'lit': 'lithuanian',
                'msa': 'malay',
                'mar': 'marathi',
                'mon': 'mongolian',
                'nep': 'nepali',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'fas': 'persian',
                'pol': 'polish',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                # Serbian
                'srp_cyrl': 'serbian-cyrillic',
                'srp_latn': 'serbian',
                'slk': 'slovak',
                'slv': 'slovenian',
                'spa': 'spanish',
                'swa': 'swahili',
                'swe': 'swedish',
                'tgl': 'tagalog',
                'tel': 'telugu',
                'tha': 'thai',
                'tur': 'turkish',
                'ukr': 'ukranian',
                'urd': 'urdu',
                'vie': 'vietnamese',
                'yor': 'yoruba'
            }

            with open(wl_misc.get_normalized_path(f'stop_word_lists/extra-stopwords/{LANG_TEXTS[lang]}'), 'r', encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f if not line.startswith('#')]
        # NLTK
        elif stop_word_list.startswith('nltk_'):
            LANG_TEXTS = {
                'ara': 'arabic',
                'aze': 'azerbaijani',
                'dan': 'danish',
                'nld': 'dutch',
                'eng': 'english',
                'fin': 'finnish',
                'fra': 'french',
                'deu': 'german',
                'ell': 'greek',
                'hun': 'hungarian',
                'ind': 'indonesian',
                'ita': 'italian',
                'kaz': 'kazakh',
                'nep': 'nepali',
                # Norwegian
                'nob': 'norwegian',
                'nno': 'norwegian',
                'por': 'portuguese',
                'ron': 'romanian',
                'rus': 'russian',
                'slv': 'slovene',
                'spa': 'spanish',
                'swe': 'swedish',
                'tgk': 'tajik',
                'tur': 'turkish'
            }

            stop_words = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
        # spaCy
        elif stop_word_list.startswith('spacy_'):
            # Serbian
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
                stop_words = wl_nlp_utils.to_srp_latn(stop_words)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_words = spacy_lang.STOP_WORDS
        # Stopwords ISO
        elif stop_word_list.startswith('stopwords_iso_'):
            # Greek (Ancient)
            if lang_639_1 == 'grc':
                lang_639_1 = 'el'

            # Norwegian
            if lang_639_1 in ['nb', 'nn']:
                lang_639_1 = 'no'

            with open(wl_misc.get_normalized_path('stop_word_lists/Stopwords ISO/stopwords_iso.json'), 'r', encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
        # Thai
        elif stop_word_list == 'pythainlp_tha':
            stop_words = pythainlp.corpus.common.thai_stopwords()

    # Remove empty tokens
    stop_words = [stop_word for stop_word in stop_words if stop_word.strip()]

    return sorted(set(stop_words))
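# Illustrative sketch of the spaCy branch above in isolation: every spacy.lang.<code> module
# exposes a STOP_WORDS set that can be imported without loading a trained model.
def _demo_spacy_stop_words():
    import importlib

    spacy_lang = importlib.import_module('spacy.lang.en')

    return sorted(spacy_lang.STOP_WORDS)[:5]  # first few English stop words, alphabetically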
def wl_lemmatize_text(main, text, lang, tokenized, tagged, lemmatizer):
    lemmas = []

    # spaCy
    if lemmatizer.startswith('spacy_'):
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)

        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, text,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        lemmas = [
            token.dictionary_form()
            for token in main.sudachipy_word_tokenizer.tokenize(text)
        ]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
    # Tibetan
    elif lemmatizer == 'botok_bod':
        tokens = main.botok_word_tokenizer.tokenize(text)

        for token in tokens:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)
    # Lemmatization Lists
    elif lemmatizer.startswith('lemmatization_lists_'):
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        tokens = wl_word_tokenization.wl_word_tokenize_flat(main, text, lang = lang)
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    lemmas = [
        str(lemma).strip()
        for lemma in lemmas
        if str(lemma).strip()
    ]

    return lemmas
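# Illustrative sketch of the 'nltk_wordnet' branch above in isolation; it assumes the NLTK
# 'wordnet' data package has been downloaded (nltk.download('wordnet')). The pos argument
# mirrors the universal-tag mapping used above (ADJ/NOUN/ADV/VERB).
def _demo_wordnet_lemmatize():
    import nltk

    word_net_lemmatizer = nltk.WordNetLemmatizer()

    return [
        word_net_lemmatizer.lemmatize('running', pos = nltk.corpus.wordnet.VERB),  # typically 'run'
        word_net_lemmatizer.lemmatize('geese', pos = nltk.corpus.wordnet.NOUN),    # typically 'goose'
        word_net_lemmatizer.lemmatize('better', pos = nltk.corpus.wordnet.ADJ)     # typically 'good'
    ]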
def wl_lemmatize_tokens(main, tokens, lang, tokenized, tagged, lemmatizer):
    empty_offsets = []
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags = wl_matching.get_re_tags(main, tag_type = 'body')

    if tagged == _tr('wl_lemmatize_tokens', 'Yes'):
        tags = [''.join(re.findall(re_tags, token)) for token in tokens]
        tokens = [re.sub(re_tags, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens with their tags
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            empty_offsets.append(i)

            del tokens[i]
            del tags[i]

    # spaCy
    if 'spacy' in lemmatizer:
        if not lang.startswith('srp_'):
            lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if lang != 'jpn':
            doc = spacy.tokens.Doc(nlp.vocab, words = tokens, spaces = [False] * len(tokens))

            for pipe_name in nlp.pipe_names:
                nlp.get_pipe(pipe_name)(doc)
        # The Japanese model does not have a lemmatizer component and Japanese lemmas are taken directly from SudachiPy
        # See: https://github.com/explosion/spaCy/discussions/9983#discussioncomment-1923647
        else:
            doc = nlp(''.join(tokens))

        lemma_tokens = [token.text for token in doc]
        lemmas = [token.lemma_ for token in doc]
    # English
    elif lemmatizer == 'nltk_wordnet':
        word_net_lemmatizer = nltk.WordNetLemmatizer()

        for token, pos in wl_pos_tagging.wl_pos_tag(
            main, tokens,
            lang = 'eng_us',
            pos_tagger = 'nltk_perceptron',
            tagset = 'universal'
        ):
            if pos == 'ADJ':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADJ))
            elif pos in ['NOUN', 'PROPN']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.NOUN))
            elif pos == 'ADV':
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.ADV))
            elif pos in ['VERB', 'AUX']:
                lemmas.append(word_net_lemmatizer.lemmatize(token, pos = nltk.corpus.wordnet.VERB))
            else:
                lemmas.append(word_net_lemmatizer.lemmatize(token))

        lemma_tokens = tokens.copy()
    # Japanese
    elif lemmatizer == 'sudachipy_jpn':
        tokens_retokenized = main.sudachipy_word_tokenizer.tokenize(''.join(tokens))

        lemma_tokens = [token.surface() for token in tokens_retokenized]
        lemmas = [token.dictionary_form() for token in tokens_retokenized]
    # Russian & Ukrainian
    elif lemmatizer == 'pymorphy2_morphological_analyzer':
        if lang == 'rus':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_rus
        elif lang == 'ukr':
            morphological_analyzer = main.pymorphy2_morphological_analyzer_ukr

        for token in tokens:
            lemmas.append(morphological_analyzer.parse(token)[0].normal_form)

        lemma_tokens = tokens.copy()
    # Tibetan
    elif lemmatizer == 'botok_bod':
        lemma_tokens = []
        tokens_retokenized = main.botok_word_tokenizer.tokenize(''.join(tokens))

        for token in tokens_retokenized:
            if token.lemma:
                lemmas.append(token.lemma)
            else:
                lemmas.append(token.text)

            lemma_tokens.append(token.text)
    # Lemmatization Lists
    elif 'lemmatization_lists' in lemmatizer:
        mapping_lemmas = {}

        lang = wl_conversion.to_iso_639_1(main, lang)
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        with open(wl_misc.get_normalized_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'), 'r', encoding = 'utf_8_sig') as f:
            for line in f:
                try:
                    lemma, word = line.rstrip().split('\t')

                    mapping_lemmas[word] = lemma
                except ValueError:
                    pass

        lemma_tokens = tokens.copy()
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Remove empty lemmas and strip whitespace in tokens
    for i, lemma in reversed(list(enumerate(lemmas))):
        lemma_tokens[i] = lemma_tokens[i].strip()
        lemmas[i] = lemma.strip()

        if not lemmas[i]:
            del lemmas[i]
            del lemma_tokens[i]

    # Make sure that tokenization is not modified during lemmatization
    i_tokens = 0
    i_lemmas = 0

    len_tokens = len(tokens)
    len_lemmas = len(lemmas)

    if len_tokens != len_lemmas:
        tags_modified = []
        lemmas_modified = []

        while i_tokens < len_tokens and i_lemmas < len_lemmas:
            # Different token
            if len(tokens[i_tokens]) != len(lemma_tokens[i_lemmas]):
                tokens_temp = [tokens[i_tokens]]
                tags_temp = [tags[i_tokens]]
                lemma_tokens_temp = [lemma_tokens[i_lemmas]]
                lemmas_temp = [lemmas[i_lemmas]]

                # Align tokens
                while i_tokens < len_tokens - 1 or i_lemmas < len_lemmas - 1:
                    len_tokens_temp = sum([len(token) for token in tokens_temp])
                    len_lemma_tokens_temp = sum([len(token) for token in lemma_tokens_temp])

                    if len_tokens_temp > len_lemma_tokens_temp:
                        lemma_tokens_temp.append(lemma_tokens[i_lemmas + 1])
                        lemmas_temp.append(lemmas[i_lemmas + 1])

                        i_lemmas += 1
                    elif len_tokens_temp < len_lemma_tokens_temp:
                        tokens_temp.append(tokens[i_tokens + 1])
                        tags_temp.append(tags[i_tokens + 1])

                        i_tokens += 1
                    else:
                        # Use lemmas in one-to-one
                        if len(tokens_temp) == len(lemma_tokens_temp):
                            tags_modified.extend(tags_temp)
                            lemmas_modified.extend(lemmas_temp)
                        # Use original tokens in many-to-one or one-to-many
                        else:
                            tags_modified.extend(tags)
                            lemmas_modified.extend(tokens_temp)

                        tokens_temp = []
                        tags_temp = []
                        lemma_tokens_temp = []
                        lemmas_temp = []

                        break

                if tokens_temp:
                    # Use lemmas in one-to-one
                    if len(tokens_temp) == len(lemma_tokens_temp):
                        tags_modified.extend(tags_temp)
                        lemmas_modified.extend(lemmas_temp)
                    # Use original tokens in many-to-one or one-to-many
                    else:
                        tags_modified.extend(tags)
                        lemmas_modified.extend(tokens_temp)
            else:
                tags_modified.extend(tags[i_tokens])
                lemmas_modified.append(lemmas[i_lemmas])

            i_tokens += 1
            i_lemmas += 1

        len_lemmas_modified = len(lemmas_modified)

        if len_tokens < len_lemmas_modified:
            tags = tags_modified[:len_tokens]
            lemmas = lemmas_modified[:len_tokens]
        elif len_tokens > len_lemmas_modified:
            tags = tags_modified + [tags_modified[-1]] * (len_tokens - len_lemmas_modified)
            lemmas = lemmas_modified + [lemmas_modified[-1]] * (len_tokens - len_lemmas_modified)
        else:
            tags = tags_modified.copy()
            lemmas = lemmas_modified.copy()

    # Insert empty lemmas and their tags after alignment of input and output
    for empty_offset in sorted(empty_offsets):
        lemmas.insert(empty_offset, '')
        tags.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
def wl_word_detokenize(main, tokens, lang):
    text = ''

    if lang == 'other':
        lang = 'eng_gb'

    wl_nlp_utils.init_word_detokenizers(main, lang=lang)

    # Chinese
    if lang.startswith('zho'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if wl_checking_unicode.has_han(token) or all(map(str.isnumeric, token)):
                    text += token

                    non_cjk_start += 1
                else:
                    # Non-Chinese
                    for j, _ in enumerate(tokens[i:]):
                        if i + j + 1 == len(tokens) or wl_checking_unicode.has_han(tokens[i + j + 1]):
                            text += wl_word_detokenize(main, tokens=tokens[non_cjk_start:i + j + 1], lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Japanese
    elif lang == 'jpn':
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (
                wl_checking_unicode.has_han(token)
                or wl_checking_unicode.has_kana(token)
                or all(map(str.isnumeric, token))
            ):
                text += token

                non_cjk_start = i + 1
            else:
                # Non-Japanese
                for j, _ in enumerate(tokens[i:]):
                    if (
                        i + j + 1 == len(tokens)
                        or wl_checking_unicode.has_han(tokens[i + j + 1])
                        or wl_checking_unicode.has_kana(tokens[i + j + 1])
                    ):
                        text += wl_word_detokenize(main, tokens=tokens[non_cjk_start:i + j + 1], lang='other')

                        non_cjk_start = i + j + 1

                        break
    # Thai
    elif lang == 'tha':
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wl_checking_unicode.has_thai(token):
                if type(token) == wl_texts.Wl_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # Non-Thai
                for j, _ in enumerate(tokens[i:]):
                    if i + j + 1 == len(tokens) or wl_checking_unicode.has_thai(tokens[i + j + 1]):
                        text += wl_word_detokenize(main, tokens=tokens[non_thai_start:i + j + 1], lang='other')

                        non_thai_start = i + j + 1

                        break
    # Tibetan
    elif lang == 'bod':
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wl_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                if i > 0 and text[-1] == '།' and token[0] == '།':
                    text += ' ' + token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # Non-Tibetan
                for j, _ in enumerate(tokens[i:]):
                    if i + j + 1 == len(tokens) or wl_checking_unicode.has_tibetan(tokens[i + j + 1]):
                        text += wl_word_detokenize(main, tokens=tokens[non_tibetan_start:i + j + 1], lang='other')

                        non_tibetan_start = i + j + 1

                        break
    else:
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        sentence_start = 0
        sentences = []

        for i, token in enumerate(tokens):
            if type(token) == wl_texts.Wl_Token and token.sentence_ending:
                sentences.append(tokens[sentence_start:i + 1])

                sentence_start = i + 1
            elif i == len(tokens) - 1:
                sentences.append(tokens[sentence_start:])

        for sentence in sentences:
            text += main.__dict__[f'sacremoses_moses_detokenizer_{lang}'].detokenize(sentence)

    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()
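# Illustrative sketch of the sentence-splitting step in the sacremoses branch above, using a
# hypothetical stand-in for wl_texts.Wl_Token (a str subclass carrying a sentence_ending flag).
def _demo_split_sentences_on_flags():
    class _Token(str):
        sentence_ending = True

    tokens = ['Hello', ',', 'world', _Token('.'), 'Bye', _Token('.')]

    sentences = []
    sentence_start = 0

    for i, token in enumerate(tokens):
        if getattr(token, 'sentence_ending', False):
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1

    return sentences  # [['Hello', ',', 'world', '.'], ['Bye', '.']]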