def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
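# The sketch below is not part of Wordless; it is a minimal, self-contained illustration of the
# lazy-initialization pattern used in init_word_tokenizers() above: a tokenizer is constructed
# the first time it is requested and cached as an attribute on the shared `main` object, so
# later calls reuse the same instance. `FakeMain` and `get_toktok_tokenizer` are hypothetical
# stand-ins; NLTK's ToktokTokenizer is used because it needs no extra data files.
import nltk

class FakeMain:
    """Hypothetical stand-in for the Wordless `main` application object."""

def get_toktok_tokenizer(main):
    # Mirrors the "if '...' not in main.__dict__" checks in init_word_tokenizers()
    if 'nltk_toktok_tokenizer' not in main.__dict__:
        main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()

    return main.nltk_toktok_tokenizer

fake_main = FakeMain()
tokenizer_1 = get_toktok_tokenizer(fake_main)
tokenizer_2 = get_toktok_tokenizer(fake_main)

assert tokenizer_1 is tokenizer_2  # constructed once, reused on every later call
print(tokenizer_1.tokenize('Lazily initialized, then cached on the main object.'))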
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [itertools.chain.from_iterable(tokens_sentences)]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append([token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_gmd.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_pos.tokenize(sentence)])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([token.text for token in main.pybo_tokenizer_tsikchen.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [token.strip() for token in tokens if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary='', sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1], boundary=' ', sentence_ending=True)

    return tokens_sentences
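# The sketch below is a simplified, hypothetical re-implementation of the character-scanning
# idea used by the 'Wordless - Chinese Character Tokenizer' branch above: each Han character
# becomes its own token, while runs of non-Han characters are buffered and handed to a fallback
# tokenizer. Unlike the original, it does not distinguish English runs from other scripts, its
# `is_han` is a code-point-range approximation rather than wordless_checking_unicode.is_han,
# and plain whitespace splitting stands in for the recursive wordless_word_tokenize() call.
def is_han(char):
    # Approximate check: CJK Unified Ideographs and Extension A
    return '\u4e00' <= char <= '\u9fff' or '\u3400' <= char <= '\u4dbf'

def tokenize_mixed(sentence, fallback_tokenize=str.split):
    tokens = []
    non_han = []  # buffer for the current run of non-Han characters

    for char in sentence:
        if is_han(char):
            if non_han:
                tokens.extend(fallback_tokenize(''.join(non_han)))
                non_han = []

            tokens.append(char)  # each Han character is its own token
        else:
            non_han.append(char)

    if non_han:
        tokens.extend(fallback_tokenize(''.join(non_han)))

    return tokens

print(tokenize_mixed('我喜欢Python和machine learning。'))
# ['我', '喜', '欢', 'Python', '和', 'machine', 'learning。']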
def wordless_word_tokenize(main, text, lang, word_tokenizer='default', flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main, lang=lang, word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append([token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append([token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append([token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or
          'nagisa' in word_tokenizer or
          'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr('Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_kana(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or not wordless_checking_unicode.is_eng(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(sentence) or wordless_checking_unicode.is_han(sentence[i + j + 1]):
                                        tokens.extend(wordless_word_tokenize(main, sentence[non_han_start : i + j + 1],
                                                                             lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(main, text, lang='tha',
                                               sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='vie',
                                                   sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip() for token in sentence if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
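# A small usage sketch of the syntok branch above, runnable on its own (assuming syntok is
# installed): syntok.tokenizer.Tokenizer().tokenize() gives the flat token stream, roughly what
# flat_tokens=True produces for this branch, while syntok.segmenter.analyze() yields paragraphs
# of sentences of tokens, matching the hierarchical output. The sample text is made up here.
import syntok.segmenter
import syntok.tokenizer

sample_text = 'Mr. Smith arrived. He sat down.'

# Flat tokens
flat = [token.value for token in syntok.tokenizer.Tokenizer().tokenize(sample_text)]
print(flat)

# Hierarchical tokens, one list per sentence
hierarchical = [
    [token.value for token in sentence]
    for paragraph in syntok.segmenter.analyze(sample_text)
    for sentence in paragraph
]
print(hierarchical)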
import nltk
from wordnik import swagger, WordApi, AccountApi

# Wordnik API client (key and endpoint as given)
client = swagger.ApiClient('dd3d32ae6b4709e1150040139c308fb77446e0a8ecc93db31',
                           'https://api.wordnik.com/v4')
word_api = WordApi.WordApi(client)
toktok = nltk.ToktokTokenizer()

words = ['paint', 'mimic', 'mimics', 'francie', 'frolic', 'funhouse']

for word in words:
    print('=== {} ==='.format(word))

    defs = word_api.getDefinitions(word)
    if not defs:
        print("no definitions")
        continue

    # Tokenize each definition with NLTK's Tok-tok tokenizer
    for def_ in defs:
        fmt_str = "{} --- {}"
        tokenized_def = toktok.tokenize(def_.text.lower())
        tokenized_def = [s.encode('utf-8') for s in tokenized_def]

        print(fmt_str.format(def_.sourceDictionary, tokenized_def))

# Check how many API calls remain on this key
account_api = AccountApi.AccountApi(client)
for i in range(5):
    print("Attempt {}".format(i))
    status = account_api.getApiTokenStatus()
    print("Remaining_calls: {}".format(status.remainingCalls))
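# An offline sketch of just the tokenization step above, so the behaviour can be checked without
# a Wordnik API key or network access; the sample definition text is made up for illustration.
import nltk

toktok = nltk.ToktokTokenizer()
sample_definition = 'To apply colour, pigment, or paint to a surface.'
print(toktok.tokenize(sample_definition.lower()))
# Punctuation such as the commas and the final period comes out as separate tokens.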