def run(self):
    """Generate lemmatization previews for each sample line and emit the results.

    Reads the preview language and sample text from the custom settings,
    lemmatizes every non-blank line, and emits ``worker_done`` with the
    original samples and the detokenized lemma strings (blank lines map
    to empty strings).
    """
    settings = self.main.settings_custom['lemmatization']
    lang = settings['preview_lang']
    samples = settings['preview_samples']

    results = []

    for sample_line in samples.split('\n'):
        sample_line = sample_line.strip()

        # Blank lines produce empty preview entries so the output stays
        # line-aligned with the input samples
        if not sample_line:
            results.append('')

            continue

        tokens = wl_word_tokenization.wl_word_tokenize(
            self.main,
            sample_line,
            lang = lang
        )
        tokens = wl_misc.flatten_list(tokens)

        lemmas = wl_lemmatization.wl_lemmatize(
            self.main,
            tokens,
            lang = lang,
            lemmatizer = self.lemmatizer
        )

        results.append(
            wl_word_detokenization.wl_word_detokenize(
                self.main,
                lemmas,
                lang = lang
            )
        )

    self.worker_done.emit(samples, results)
def run(self):
    """Generate word-detokenization previews for each sample line and emit the results.

    Each non-blank line of the preview samples is whitespace-split into
    tokens and detokenized with the selected detokenizer; blank lines map
    to empty strings so the results stay line-aligned with the samples.
    """
    settings = self.main.settings_custom['word_detokenization']
    lang = settings['preview_lang']
    samples = settings['preview_samples']

    results = []

    for raw_line in samples.splitlines():
        raw_line = raw_line.strip()

        results.append(
            wl_word_detokenization.wl_word_detokenize(
                self.main,
                tokens = raw_line.split(),
                lang = lang,
                word_detokenizer = self.word_detokenizer
            )
            if raw_line
            else ''
        )

    self.worker_done.emit(samples, results)
def test_word_detokenize(lang, word_detokenizer, show_results=False):
    """Tokenize a language's example sentence, detokenize it back, and assert the exact result.

    Parameters:
        lang: ISO 639-3 language code (e.g. 'cat', 'zho_cn', 'eng').
        word_detokenizer: display name of the detokenizer to test
            (e.g. 'NLTK - Penn Treebank Detokenizer', 'Sacremoses - Moses Detokenizer').
        show_results: if True, print the detokenized text for inspection.

    The expected strings are byte-exact snapshots of each detokenizer's
    output; the NLTK and Moses detokenizers differ mainly in spacing
    around brackets, apostrophes and abbreviations.
    """
    # NOTE(review): lang_text is computed but never used below — confirm
    # whether it is needed (e.g. for logging) or can be removed
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text=getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang=lang
    )
    text = wl_word_detokenization.wl_word_detokenize(
        main,
        tokens=tokens,
        lang=lang,
        word_detokenizer=word_detokenizer
    )

    if show_results:
        print(f'{lang} / {word_detokenizer}:')
        print(text)

    # Catalan
    if lang == 'cat':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11 ]"
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "El català (denominació oficial a Catalunya, a les Illes Balears, a Andorra, a la ciutat de l' Alguer i tradicional a Catalunya Nord) o valencià (denominació oficial al País Valencià i tradicional al Carxe) és una llengua romànica parlada a Catalunya, el País Valencià (tret d' algunes comarques i localitats de l' interior), les Illes Balears, Andorra, la Franja de Ponent (a l' Aragó), la ciutat de l' Alguer (a l' illa de Sardenya), la Catalunya del Nord,[8] el Carxe (un petit territori de Múrcia poblat per immigrats valencians),[9][10] i en petites comunitats arreu del món (entre les quals destaca la de l' Argentina, amb 195.000 parlants).[11]"
    # Chinese (Simplified)
    elif lang == 'zho_cn':
        assert text == '汉语,又称汉文、中文、中国话、中国语、华语、华文、唐话[2],或被视为一个语族,或被视为隶属于汉藏语系汉语族之一种语言。'
    # Chinese (Traditional)
    elif lang == 'zho_tw':
        assert text == '漢語,又稱漢文、中文、中國話、中國語、華語、華文、唐話[2],或被視為一個語族,或被視為隸屬於漢藏語系漢語族之一種語言。'
    # Czech
    elif lang == 'ces':
        assert text == 'Čeština neboli český jazyk je západoslovanský jazyk, nejbližší slovenštině, poté lužické srbštině a polštině.'
    # Dutch
    elif lang == 'nld':
        assert text == 'Het Nederlands is een West-Germaanse taal en de moedertaal van de meeste inwoners van Nederland, België en Suriname.'
    # English
    elif lang == 'eng':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'English is a West Germanic language that was first spoken in early medieval England and eventually became a global lingua franca.[4][5]'
    # Finnish
    elif lang == 'fin':
        assert text == 'Suomen kieli (suomi) on uralilaisten kielten itämerensuomalaiseen ryhmään kuuluva kieli.'
    # French
    elif lang == 'fra':
        assert text == 'Le français est une langue indo-européenne de la famille des langues romanes.'
    # German
    elif lang == 'deu':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([ dɔʏ̯t͡ʃ]; abgekürzt dt . oder dtsch .) ist eine westgermanische Sprache.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache.'
    # Greek (Modern)
    elif lang == 'ell':
        assert text == 'Η ελληνική γλώσσα ανήκει στην ινδοευρωπαϊκή οικογένεια[9] και συγκεκριμένα στον ελληνικό κλάδο, μαζί με την τσακωνική, ενώ είναι η επίσημη γλώσσα της Ελλάδος και της Κύπρου.'
    # Hungarian
    elif lang == 'hun':
        assert text == 'A magyar nyelv az uráli nyelvcsalád tagja, a finnugor nyelvek közé tartozó ugor nyelvek egyike.'
    # Icelandic
    elif lang == 'isl':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Íslenska er vesturnorrænt, germanskt og indóevrópskt tungumál sem er einkum talað og ritað á Íslandi og er móðurmál langflestra Íslendinga.[4]'
    # Irish
    elif lang == 'gle':
        assert text == 'Is ceann de na teangacha Ceilteacha í an Ghaeilge (nó Gaeilge na hÉireann mar a thugtar uirthi corruair), agus ceann den dtrí cinn de theangacha Ceilteacha ar a dtugtar na teangacha Gaelacha (.i. an Ghaeilge, Gaeilge na hAlban agus Gaeilge Mhanann) go háirithe.'
    # Italian
    elif lang == 'ita':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == "L' italiano ([ itaˈljaːno][Nota 1] ascolta[?·info] ) è una lingua romanza parlata principalmente in Italia."
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == "L'italiano ([itaˈljaːno][Nota 1] ascolta[?·info]) è una lingua romanza parlata principalmente in Italia."
    # Japanese
    elif lang == 'jpn':
        assert text == '日本語(にほんご、にっぽんご[注1])は、主に日本国内や日本人同士の間で使用されている言語である。'
    # Latvian
    elif lang == 'lav':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda . [3 ]'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Latviešu valoda ir dzimtā valoda apmēram 1,7 miljoniem cilvēku, galvenokārt Latvijā, kur tā ir vienīgā valsts valoda. [3]'
    # Lithuanian
    elif lang == 'lit':
        assert text == 'Lietuvių kalba – iš baltų prokalbės kilusi lietuvių tautos kalba, kuri Lietuvoje yra valstybinė, o Europos Sąjungoje – viena iš oficialiųjų kalbų.'
    # Polish
    elif lang == 'pol':
        assert text == 'Język polski, polszczyzna, skrót: pol. – język naturalny należący do grupy języków zachodniosłowiańskich (do której należą również czeski, słowacki, kaszubski, dolnołużycki, górnołużycki i wymarły połabski), stanowiącej część rodziny języków indoeuropejskich.'
    # Portuguese
    elif lang == 'por':
        assert text == 'A língua portuguesa, também designada português, é uma língua românica flexiva ocidental originada no galego-português falado no Reino da Galiza e no norte de Portugal.'
    # Romanian
    elif lang == 'ron':
        assert text == 'Limba română este o limbă indo - europeană, din grupul italic și din subgrupul oriental al limbilor romanice.'
    # Russian
    elif lang == 'rus':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Ру́сский язы́к ([ ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Ру́сский язы́к ([ˈruskʲɪi̯ jɪˈzɨk] Информация о файле слушать) [~ 3] [⇨] — один из восточнославянских языков, национальный язык русского народа.'
    # Slovak
    elif lang == 'slk':
        assert text == 'Slovenčina patrí do skupiny západoslovanských jazykov (spolu s češtinou, poľštinou, hornou a dolnou lužickou srbčinou a kašubčinou).'
    # Slovenian
    elif lang == 'slv':
        assert text == 'Slovenščina [slovénščina] / [sloˈʋenʃtʃina] je združeni naziv za uradni knjižni jezik Slovencev in skupno ime za narečja in govore, ki jih govorijo ali so jih nekoč govorili Slovenci.'
    # Spanish
    elif lang == 'spa':
        assert text == 'El español o castellano es una lengua romance procedente del latín hablado.'
    # Swedish
    elif lang == 'swe':
        if word_detokenizer == 'NLTK - Penn Treebank Detokenizer':
            assert text == 'Svenska (svenska (info) ) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
        elif word_detokenizer == 'Sacremoses - Moses Detokenizer':
            assert text == 'Svenska (svenska (info)) är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland.'
    # Tamil
    elif lang == 'tam':
        assert text == 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும்.'
    # Thai
    elif lang == 'tha':
        assert text == 'ภาษาไทยหรือภาษาไทยกลางเป็นภาษาราชการและภาษาประจำชาติของประเทศไทย'
    # Tibetan
    elif lang == 'bod':
        assert text == 'བོད་ཀྱི་སྐད་ཡིག་ནི་བོད་ཡུལ་དང་དེའི་ཉེ་འཁོར་གྱི་ས་ཁུལ་ཏེ།'
def wl_process_tokens_concordancer(text, token_settings):
    """Apply token settings to *text* for the concordancer and return the processed tokens.

    Parameters:
        text: a project text object; its ``tokens_flat``, ``offsets_*`` and
            tag attributes are mutated in place.
        token_settings: mapping of token-setting flags (deep-copied before use
            so the caller's settings are not modified).

    Returns:
        The processed flat token list (with tags merged in per the settings).

    Side effects:
        Rebuilds ``text.tokens_flat`` and the paragraph/sentence/clause
        offsets when punctuation is removed, and rewrites ``text.tokens_flat``
        with tag-merged strings.
    """
    main = text.main
    tokens = text.tokens_flat.copy()
    # Deep-copy so the tag-related overrides below do not leak back to the caller
    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        # When showing tags only, substitute the tag-specific ignore settings
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        tokens = [
            token
            for token in tokens
            if not wl_checking_token.is_token_punc(token)
        ]

        # Rebuild paragraph/sentence/clause offsets while merging each
        # punctuation mark into the preceding token via detokenization
        text.offsets_paras = []
        text.offsets_sentences = []
        text.offsets_clauses = []
        text.tokens_flat = []

        for para in text.tokens_multilevel:
            text.offsets_paras.append(len(text.tokens_flat))

            for sentence in para:
                text.offsets_sentences.append(len(text.tokens_flat))

                for clause in sentence:
                    text.offsets_clauses.append(len(text.tokens_flat))

                    for token in clause:
                        if text.tokens_flat:
                            if wl_checking_token.is_token_punc(token):
                                # Attach the punctuation mark to the previous token
                                text.tokens_flat[-1] = wl_word_detokenization.wl_word_detokenize(
                                    main,
                                    [text.tokens_flat[-1], token],
                                    lang=text.lang
                                )
                            else:
                                text.tokens_flat.append(token)
                        else:
                            # First token of the text is kept as-is even if it
                            # is punctuation (there is nothing to merge into)
                            text.tokens_flat.append(token)

        # Remove duplicate offsets
        text.offsets_paras = sorted(set(text.offsets_paras))
        text.offsets_sentences = sorted(set(text.offsets_sentences))
        text.offsets_clauses = sorted(set(text.offsets_clauses))

        # Check if the first token is a punctuation mark
        # NOTE(review): an empty list is inserted as a placeholder so that
        # `tokens` stays aligned with `text.tokens_flat` — confirm downstream
        # consumers expect this sentinel
        if wl_checking_token.is_token_punc(text.tokens_flat[0]):
            tokens.insert(0, [])

    # Ignore tags
    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            tokens = [(token, []) for token in tokens]
            text.tokens_flat = [(token, []) for token in text.tokens_flat]
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            tokens = [(token, tags) for token, tags in zip(tokens, text.tags_non_pos)]
            text.tokens_flat = [
                (token, tags)
                for token, tags in zip(text.tokens_flat, text.tags_non_pos)
            ]
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            tokens = [(token, tags) for token, tags in zip(tokens, text.tags_pos)]
            text.tokens_flat = [
                (token, tags)
                for token, tags in zip(text.tokens_flat, text.tags_pos)
            ]
    else:
        # Keep all tags
        tokens = [(token, tags) for token, tags in zip(tokens, text.tags_all)]
        text.tokens_flat = [
            (token, tags)
            for token, tags in zip(text.tokens_flat, text.tags_all)
        ]

    # Use tags only
    if settings['use_tags']:
        tokens = [''.join(tags) for _, tags in tokens]
        text.tokens_flat = [''.join(tags) for _, tags in text.tokens_flat]
    else:
        # Append the (possibly emptied) tags to each token
        tokens = [f"{token}{''.join(tags)}" for token, tags in tokens]
        text.tokens_flat = [
            f"{token}{''.join(tags)}"
            for token, tags in text.tokens_flat
        ]

    return tokens
def wl_sentence_tokenize(main, text, lang, sentence_tokenizer='default'):
    """Split *text* into sentences using the tokenizer configured for *lang*.

    Parameters:
        main: the application object holding global/custom settings and
            loaded NLP pipelines.
        text: the raw text to segment.
        lang: ISO 639-3 language code; falls back to 'other' when the
            language has no registered sentence tokenizers.
        sentence_tokenizer: display name of the tokenizer to use, or
            'default' to use the one from the custom settings.

    Returns:
        A list of stripped sentences, post-processed by
        ``wl_text_utils.record_boundary_sentences``.
    """
    sentences = []

    if lang not in main.settings_global['sentence_tokenizers']:
        lang = 'other'

    if sentence_tokenizer == 'default':
        sentence_tokenizer = main.settings_custom['sentence_tokenization']['sentence_tokenizers'][lang]

    # Ensure the required tokenizer models/resources are loaded
    wl_text_utils.check_sentence_tokenizers(
        main,
        lang=lang,
        sentence_tokenizer=sentence_tokenizer
    )

    # NLTK
    if sentence_tokenizer == main.tr('NLTK - Punkt Sentence Tokenizer'):
        # Map ISO 639-3 codes to the language names Punkt expects
        lang_texts = {
            'ces': 'czech',
            'dan': 'danish',
            'nld': 'dutch',
            'eng': 'english',
            'est': 'estonian',
            'fin': 'finnish',
            'fra': 'french',
            'deu': 'german',
            # Greek (Modern)
            'ell': 'greek',
            'ita': 'italian',
            # Norwegian Bokmål & Norwegian Nynorsk
            'nob': 'norwegian',
            'nno': 'norwegian',
            'pol': 'polish',
            'por': 'portuguese',
            'rus': 'russian',
            'slv': 'slovene',
            'spa': 'spanish',
            'swe': 'swedish',
            'tur': 'turkish',
            # Other languages
            'other': 'english'
        }

        sentences = nltk.sent_tokenize(text, language=lang_texts[lang])
    # spaCy
    elif sentence_tokenizer == main.tr('spaCy - Sentencizer'):
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        sentences = [sentence.text for sentence in doc.sents]
    # syntok
    elif sentence_tokenizer == main.tr('syntok - Sentence Segmenter'):
        for para in syntok.segmenter.analyze(text):
            for sentence in para:
                # Reassemble each sentence from its tokens and their
                # preserved leading whitespace
                sentences.append(''.join(
                    [token.spacing + token.value for token in sentence]
                ))
    # Chinese & Japanese
    elif sentence_tokenizer in [
        main.tr('Wordless - Chinese Sentence Tokenizer'),
        main.tr('Wordless - Japanese Sentence Tokenizer')
    ]:
        for line in text.splitlines():
            sentence_start = 0

            # Scan for sentence-final punctuation (both fullwidth and
            # halfwidth), then extend the boundary past any trailing
            # closing quotes/brackets
            for i, char in enumerate(line):
                if i >= sentence_start and char in ['。', '!', '?', '!', '?']:
                    for j, char in enumerate(line):
                        if j > i and char not in [
                            '。', '!', '?', '!', '?', '’', '”', ')', ')'
                        ]:
                            sentences.append(line[sentence_start:j])
                            sentence_start = j

                            break

            # Append the remainder of the line (text after the last
            # sentence boundary)
            if sentence_start <= len(line):
                sentences.append(line[sentence_start:])
    # Icelandic
    elif sentence_tokenizer == main.tr(
        'Tokenizer - Icelandic Sentence Tokenizer'
    ):
        for sentence in tokenizer.split_into_sentences(text):
            # The Icelandic tokenizer yields space-separated tokens, so
            # detokenize each sentence back into running text
            sentences.append(
                wl_word_detokenization.wl_word_detokenize(
                    main,
                    tokens=sentence.split(),
                    lang='isl'
                )
            )
    # Russian
    elif sentence_tokenizer == main.tr('razdel - Russian Sentenizer'):
        sentences = [sentence.text for sentence in razdel.sentenize(text)]
    # Thai
    elif sentence_tokenizer == main.tr('PyThaiNLP - CRFCut'):
        sentences = pythainlp.sent_tokenize(text)
    # Tibetan
    elif sentence_tokenizer == main.tr('botok - Tibetan Sentence Tokenizer'):
        # botok segments sentences over word tokens, so make sure the
        # Tibetan word tokenizer is loaded first
        wl_text_utils.check_word_tokenizers(main, lang='bod')

        tokens = main.botok_word_tokenizer.tokenize(text)

        for sentence_tokens in botok.sentence_tokenizer(tokens):
            sentences.append(''.join([
                sentence_token.text
                for sentence_token in sentence_tokens[1]
            ]))
    # Vietnamese
    elif sentence_tokenizer == main.tr(
        'Underthesea - Vietnamese Sentence Tokenizer'
    ):
        sentences = underthesea.sent_tokenize(text)

    # Strip spaces
    sentences = [sentence.strip() for sentence in sentences]

    sentences = wl_text_utils.record_boundary_sentences(sentences, text)

    return sentences