def testing_to_iso_639_1(lang_code):
    len_iso_639_3 = max(
        [len(lang_code) for lang_code in main.settings_global['lang_codes']])

    iso_639_1 = wordless_conversion.to_iso_639_1(main, lang_code)

    print(f'{lang_code:{len_iso_639_3}} -> {iso_639_1}')

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        # Reload the model if its pipeline does not match the requested one
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm

                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)
            # Other Languages
            elif lang == 'other':
                import en_core_web_sm

                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            # The ISO 639-1 code for Serbian is "sr" ("rs" is the country code,
            # which spacy.blank would reject)
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('sr')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('sr')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(
                wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
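# Illustrative sketch (not from the original module): a minimal, self-contained
# example of the spaCy 2.x pattern used by check_spacy_models() above, namely
# building a blank pipeline, attaching a sentencizer with nlp.create_pipe(),
# and splitting text into sentences. The language code 'en' and the sample text
# are arbitrary; spaCy 2.x is assumed (spaCy 3.x would use
# nlp.add_pipe('sentencizer') instead).
import spacy


def demo_blank_sentencizer():
    nlp = spacy.blank('en')

    if 'sentencizer' not in nlp.pipe_names:
        nlp.add_pipe(nlp.create_pipe('sentencizer'))

    doc = nlp('This is the first sentence. This is the second one.')

    # Print each detected sentence on its own line
    for sentence in doc.sents:
        print(sentence.text)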
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    if 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path('stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r', encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_abs_path('stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r', encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_abs_path('stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r', encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

            stop_words = spacy_lang.STOP_WORDS
    elif 'NLTK' in list_stop_words:
        lang_texts = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(lang_texts[lang])
    # Greek (Ancient)
    elif list_stop_words == main.tr('grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_abs_path('stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r', encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    return sorted(stop_words)
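# Illustrative sketch (not from the original module): wordless_get_stop_words()
# above mostly delegates to third-party stop word lists. This standalone helper
# shows the two library calls it relies on: nltk.corpus.stopwords.words()
# (requires the NLTK "stopwords" corpus to be downloaded) and the STOP_WORDS
# set exposed by each spacy.lang submodule. The language choices
# ('english' / 'en') are arbitrary examples.
import importlib

import nltk


def demo_stop_word_sources():
    # NLTK: returns a list of stop words for the given language name
    stop_words_nltk = nltk.corpus.stopwords.words('english')

    # spaCy: each language module exposes a STOP_WORDS set
    stop_words_spacy = importlib.import_module('spacy.lang.en').STOP_WORDS

    return sorted(set(stop_words_nltk) | set(stop_words_spacy))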
def wordless_lemmatize(main, tokens, lang,
                       text_type=('untokenized', 'untagged'), lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record empty tokens
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            tokens.remove(token)

            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(main, tokens,
                                               lang='eng',
                                               pos_tagger='NLTK - Perceptron POS Tagger',
                                               tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r', encoding='utf_8') as f:
                for line in f:
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(
                          f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'),
                      'r', encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except:
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Insert empty lemmas
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
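# Illustrative sketch (not from the original module): a self-contained version
# of the POS-aware WordNet lemmatization that wordless_lemmatize() performs for
# English. It tags tokens with the universal tagset, maps the universal tags to
# WordNet POS constants, and falls back to the default (noun) lemmatization
# otherwise. Requires the NLTK "wordnet", "averaged_perceptron_tagger" and
# "universal_tagset" data packages; the function name and sample tokens are
# only for illustration.
import nltk


def demo_wordnet_lemmatize(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    pos_map = {
        'ADJ': nltk.corpus.wordnet.ADJ,
        'NOUN': nltk.corpus.wordnet.NOUN,
        'ADV': nltk.corpus.wordnet.ADV,
        'VERB': nltk.corpus.wordnet.VERB
    }

    lemmas = []

    for token, pos in nltk.pos_tag(tokens, tagset='universal'):
        if pos in pos_map:
            lemmas.append(lemmatizer.lemmatize(token, pos=pos_map[pos]))
        else:
            lemmas.append(lemmatizer.lemmatize(token))

    return lemmas

# e.g. demo_wordnet_lemmatize(['The', 'cats', 'were', 'running']) is expected
# to lemmatize 'cats' -> 'cat', 'were' -> 'be', and 'running' -> 'run'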
def wordless_word_detokenize(main, tokens, lang, word_detokenizer='default'):
    sentence_start = 0
    sentences = []
    text = ''

    if lang not in main.settings_global['word_detokenizers']:
        lang = 'other'

    if word_detokenizer == 'default':
        word_detokenizer = main.settings_custom['word_detokenization']['word_detokenizers'][lang]

    for i, token in enumerate(tokens):
        if type(token) == wordless_text.Wordless_Token and token.sentence_ending:
            sentences.append(tokens[sentence_start:i + 1])

            sentence_start = i + 1
        elif i == len(tokens) - 1:
            sentences.append(tokens[sentence_start:])

    # English & Other Languages
    if word_detokenizer == main.tr('NLTK - Penn Treebank Detokenizer'):
        treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for sentence in sentences:
            text += treebank_detokenizer.detokenize(sentence)
    elif word_detokenizer == main.tr('Sacremoses - Moses Detokenizer'):
        moses_detokenizer = sacremoses.MosesDetokenizer(
            lang=wordless_conversion.to_iso_639_1(main, lang))

        for sentence in sentences:
            text += moses_detokenizer.detokenize(sentence)
    # Chinese
    elif word_detokenizer == main.tr('Wordless - Chinese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i >= non_cjk_start:
                if (wordless_checking_unicode.has_han(token) or
                        all(map(str.isnumeric, token))):
                    text += token

                    non_cjk_start += 1
                else:
                    # English
                    if wordless_checking_unicode.is_eng_token(token):
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens) or
                                    not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main, tokens[non_cjk_start:i + j + 1], lang='eng')

                                non_cjk_start = i + j + 1

                                break
                    # Other Languages
                    else:
                        for j, token in enumerate(tokens[i:]):
                            if (i + j + 1 == len(tokens) or
                                    wordless_checking_unicode.has_han(tokens[i + j + 1])):
                                text += wordless_word_detokenize(
                                    main, tokens[non_cjk_start:i + j + 1], lang='other')

                                non_cjk_start = i + j + 1

                                break
    # Japanese
    elif word_detokenizer == main.tr('Wordless - Japanese Word Detokenizer'):
        non_cjk_start = 0

        for i, token in enumerate(tokens):
            if i < non_cjk_start:
                continue

            if (wordless_checking_unicode.has_han(token) or
                    wordless_checking_unicode.has_kana(token) or
                    all(map(str.isnumeric, token))):
                text += token

                non_cjk_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_cjk_start:i + j + 1], lang='eng')

                            non_cjk_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_han(tokens[i + j + 1]) or
                                wordless_checking_unicode.has_kana(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_cjk_start:i + j + 1], lang='other')

                            non_cjk_start = i + j + 1

                            break
    # Thai
    elif word_detokenizer == main.tr('Wordless - Thai Word Detokenizer'):
        non_thai_start = 0

        for i, token in enumerate(tokens):
            if i < non_thai_start:
                continue

            if wordless_checking_unicode.has_thai(token):
                if type(token) == wordless_text.Wordless_Token:
                    text += token + token.boundary
                else:
                    text += token

                non_thai_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_thai_start:i + j + 1], lang='eng')

                            non_thai_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_thai(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_thai_start:i + j + 1], lang='other')

                            non_thai_start = i + j + 1

                            break
    # Tibetan
    elif word_detokenizer == main.tr('Wordless - Tibetan Word Detokenizer'):
        non_tibetan_start = 0

        for i, token in enumerate(tokens):
            if i < non_tibetan_start:
                continue

            if wordless_checking_unicode.has_tibetan(token):
                # Check for Tibetan Mark Shad
                # See: https://w3c.github.io/tlreq/#section_breaks
                # (both branches currently append the token unchanged)
                if i > 0 and token[0] == '།':
                    text += token
                else:
                    text += token

                non_tibetan_start = i + 1
            else:
                # English
                if wordless_checking_unicode.is_eng_token(token):
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                not wordless_checking_unicode.is_eng_token(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_tibetan_start:i + j + 1], lang='eng')

                            non_tibetan_start = i + j + 1

                            break
                # Other Languages
                else:
                    for j, token in enumerate(tokens[i:]):
                        if (i + j + 1 == len(tokens) or
                                wordless_checking_unicode.has_tibetan(tokens[i + j + 1])):
                            text += wordless_word_detokenize(
                                main, tokens[non_tibetan_start:i + j + 1], lang='other')

                            non_tibetan_start = i + j + 1

                            break

    return re.sub(r'\s{2,}', ' ', text)
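# Illustrative sketch (not from the original module): the English and
# other-language branches of wordless_word_detokenize() above wrap these two
# library detokenizers. This standalone helper shows the raw calls: both take
# a list of tokens and return a single string with punctuation reattached.
# The sample tokens are arbitrary.
import nltk.tokenize.treebank
import sacremoses


def demo_detokenizers():
    tokens = ['Hello', ',', 'world', '!']

    treebank_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
    moses_detokenizer = sacremoses.MosesDetokenizer(lang='en')

    # Both are expected to produce 'Hello, world!'
    return treebank_detokenizer.detokenize(tokens), moses_detokenizer.detokenize(tokens)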
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [list(itertools.chain.from_iterable(tokens_sentences))]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append([token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_eng(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            wordless_checking_unicode.is_han(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='other'))

                                        non_han_start = i + j + 1

                                        break

                # Append the sentence's tokens as one list to keep the
                # sentence-level structure expected below
                tokens_sentences.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_kana(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_eng(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            wordless_checking_unicode.is_han(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.append(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main, text, lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append(
                    [token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append(
                    [token.text for token in main.pybo_tokenizer_pos.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append(
                    [token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main, text, lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [token.strip() for token in tokens if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
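# Illustrative sketch (not from the original module): wordless_word_tokenize()
# above dispatches to many third-party tokenizers. This standalone helper shows
# the core pattern of its NLTK branch: split the text into sentences first
# (requires the NLTK "punkt" data package), then tokenize each sentence,
# keeping one token list per sentence. jieba is shown as the Chinese
# counterpart; the function names and sample text are only for illustration.
import jieba
import nltk


def demo_tokenize_per_sentence(text):
    treebank_tokenizer = nltk.TreebankWordTokenizer()

    # One list of word tokens per detected sentence
    return [treebank_tokenizer.tokenize(sentence)
            for sentence in nltk.sent_tokenize(text)]


def demo_tokenize_chinese(text):
    # jieba.cut returns a generator of word tokens
    return list(jieba.cut(text))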
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append([token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_eng(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            wordless_checking_unicode.is_han(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_kana(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            not wordless_checking_unicode.is_eng(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if (i + j + 1 == len(sentence) or
                                            wordless_checking_unicode.is_han(sentence[i + j + 1])):
                                        tokens.extend(wordless_word_tokenize(
                                            main, sentence[non_han_start:i + j + 1], lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main, text, lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main, text, lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1],
                                                            boundary='',
                                                            sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1],
                                                            boundary=' ',
                                                            sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
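# Illustrative sketch (not from the original module): the newer
# wordless_word_tokenize() above adds razdel (Russian) and syntok as backends.
# This standalone helper shows the raw calls it builds on: razdel.tokenize()
# yields substring objects with a .text attribute, and syntok's Tokenizer
# yields token objects whose surface form is .value. The sample strings are
# arbitrary.
import razdel
import syntok.tokenizer


def demo_razdel(text='Съешь ещё этих мягких французских булок.'):
    return [token.text for token in razdel.tokenize(text)]


def demo_syntok(text='Tokenize this sentence, please.'):
    tokenizer = syntok.tokenizer.Tokenizer()

    return [token.value for token in tokenizer.tokenize(text)]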
def wordless_get_stop_words(main, lang, list_stop_words='default'):
    if list_stop_words == 'default':
        list_stop_words = main.settings_custom['stop_words']['stop_words'][lang]

    lang_639_1 = wordless_conversion.to_iso_639_1(main, lang)

    # Chinese (Simplified)
    if lang_639_1 == 'zh_cn':
        lang_639_1 = 'zh'

    # extra-stopwords
    if 'extra-stopwords' in list_stop_words:
        LANG_TEXTS = {
            'sqi': 'albanian',
            'ara': 'arabic',
            'hye': 'armenian',
            'eus': 'basque',
            'bel': 'belarusian',
            'ben': 'bengali',
            'bul': 'bulgarian',
            'cat': 'catalan',
            'zho_cn': 'chinese',
            # Chinese (Traditional)
            'zho_tw': 'chinese-traditional',
            'hrv': 'croatian',
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'glg': 'galician',
            'deu': 'german',
            'ell': 'greek',
            'hau': 'hausa',
            'heb': 'hebrew',
            'hin': 'hindi',
            'hun': 'hungarian',
            'isl': 'icelandic',
            'ind': 'indonesian',
            'gle': 'irish',
            'ita': 'italian',
            'jpn': 'japanese',
            'kor': 'korean',
            'kur': 'kurdish',
            'lav': 'latvian',
            'lit': 'lithuanian',
            'msa': 'malay',
            'mar': 'marathi',
            'mon': 'mongolian',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'fas': 'persian',
            'pol': 'polish',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'srp_cyrl': 'serbian-cyrillic',
            'srp_latn': 'serbian',
            'slk': 'slovak',
            'slv': 'slovenian',
            'spa': 'spanish',
            'swa': 'swahili',
            'swe': 'swedish',
            'tgl': 'tagalog',
            'tel': 'telugu',
            'tha': 'thai',
            'tur': 'turkish',
            'ukr': 'ukranian',
            'urd': 'urdu',
            'vie': 'vietnamese',
            'yor': 'yoruba'
        }

        with open(wordless_misc.get_normalized_path(
                      f'stop_words/extra-stopwords/{LANG_TEXTS[lang]}'),
                  'r', encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f if not line.startswith('#')]
    # NLTK
    elif 'NLTK' in list_stop_words:
        LANG_TEXTS = {
            'ara': 'arabic',
            'aze': 'azerbaijani',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            'ell': 'greek',
            'hun': 'hungarian',
            'ind': 'indonesian',
            'ita': 'italian',
            'kaz': 'kazakh',
            'nep': 'nepali',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'por': 'portuguese',
            'ron': 'romanian',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tgk': 'tajik',
            'tur': 'turkish'
        }

        stop_words = nltk.corpus.stopwords.words(LANG_TEXTS[lang])
    # spaCy
    elif 'spaCy' in list_stop_words:
        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_normalized_path('stop_words/spaCy/stop_words_zh_tw.txt'),
                      'r', encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            # Serbian (Cyrillic) & Serbian (Latin)
            if lang_639_1 == 'sr_cyrl':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
            elif lang_639_1 == 'sr_latn':
                spacy_lang = importlib.import_module('spacy.lang.sr')

                stop_words = spacy_lang.STOP_WORDS
                stop_words = wordless_text_utils.to_srp_latn(stop_words)
            else:
                spacy_lang = importlib.import_module(f'spacy.lang.{lang_639_1}')

                stop_words = spacy_lang.STOP_WORDS
    # Stopwords ISO
    elif 'Stopwords ISO' in list_stop_words:
        # Norwegian Bokmål & Norwegian Nynorsk
        if lang_639_1 in ['nb', 'nn']:
            lang_639_1 = 'no'

        # Chinese (Traditional)
        if lang_639_1 == 'zh_tw':
            with open(wordless_misc.get_normalized_path(
                          'stop_words/Stopwords ISO/stop_words_zh_tw.txt'),
                      'r', encoding='utf_8') as f:
                stop_words = [line.rstrip() for line in f]
        else:
            with open(wordless_misc.get_normalized_path(
                          'stop_words/Stopwords ISO/stopwords_iso.json'),
                      'r', encoding='utf_8') as f:
                stop_words = json.load(f)[lang_639_1]
    # Greek (Ancient)
    elif list_stop_words == main.tr('grk-stoplist - Greek (Ancient) Stop Words'):
        with open(wordless_misc.get_normalized_path('stop_words/grk-stoplist/stoplist-greek.txt'),
                  'r', encoding='utf_8') as f:
            stop_words = [line.rstrip() for line in f]
    # Thai
    elif list_stop_words == main.tr('PyThaiNLP - Thai Stop Words'):
        stop_words = pythainlp.corpus.common.thai_stopwords()
    # Custom Lists
    elif list_stop_words == main.tr('Custom List'):
        stop_words = main.settings_custom['stop_words']['custom_lists'][lang]

    # Remove empty tokens
    stop_words = [stop_word for stop_word in stop_words if stop_word]

    return sorted(set(stop_words))
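# Illustrative sketch (not from the original module): the tail of
# wordless_get_stop_words() above normalizes whatever list a backend returns by
# dropping empty entries, deduplicating, and sorting. This standalone helper
# shows that normalization on its own; the function name and sample input are
# only for illustration.
def normalize_stop_words(stop_words):
    # Drop empty entries, deduplicate, and sort
    return sorted({stop_word for stop_word in stop_words if stop_word})

# e.g. normalize_stop_words(['the', '', 'a', 'the']) -> ['a', 'the']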
def test_to_iso_639_1(lang_code):
    len_iso_639_3 = max(
        [len(lang_code) for lang_code in main.settings_global['lang_codes']])

    iso_639_1 = wordless_conversion.to_iso_639_1(main, lang_code)

    assert iso_639_1 == main.settings_global['lang_codes'][lang_code]
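# Illustrative sketch (not from the original module): the test functions above
# take a language code as an argument, which suggests they are meant to be
# driven by a test runner. One way to do that, assuming pytest is available and
# a module-level `main` object exists (as the tests themselves already assume),
# is to parametrize over every code in main.settings_global['lang_codes'].
# The function name below is hypothetical and only for illustration.
import pytest


@pytest.mark.parametrize('lang_code', list(main.settings_global['lang_codes']))
def test_to_iso_639_1_parametrized(lang_code):
    assert (wordless_conversion.to_iso_639_1(main, lang_code)
            == main.settings_global['lang_codes'][lang_code])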