def run(self):
    preview_results = []

    preview_lang = self.main.settings_custom['lemmatization']['preview_lang']
    preview_samples = self.main.settings_custom['lemmatization']['preview_samples']

    for line in preview_samples.split('\n'):
        line = line.strip()

        if line:
            # Tokenize the sample, then flatten the nested token lists
            tokens = wl_word_tokenization.wl_word_tokenize(
                self.main, line,
                lang = preview_lang
            )
            tokens = wl_misc.flatten_list(tokens)

            # Lemmatize with the lemmatizer selected for the preview
            lemmas = wl_lemmatization.wl_lemmatize(
                self.main, tokens,
                lang = preview_lang,
                lemmatizer = self.lemmatizer
            )

            # Detokenize the lemmas back into running text
            text = wl_word_detokenization.wl_word_detokenize(
                self.main, lemmas,
                lang = preview_lang
            )

            preview_results.append(text)
        else:
            preview_results.append('')

    self.worker_done.emit(preview_samples, preview_results)
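# A minimal standalone sketch of the same preview pipeline (an assumption, not
# part of the worker above): tokenize each non-empty line, flatten the nested
# token lists, lemmatize, then detokenize back into running text. `main` is
# assumed to be an initialized Wordless main object, 'eng' a valid language
# code, and the `preview_lemmas` name is hypothetical.
def preview_lemmas(main, samples, lang = 'eng', lemmatizer = 'NLTK - WordNet Lemmatizer'):
    results = []

    for line in samples.split('\n'):
        line = line.strip()

        if line:
            tokens = wl_misc.flatten_list(
                wl_word_tokenization.wl_word_tokenize(main, line, lang = lang)
            )
            lemmas = wl_lemmatization.wl_lemmatize(main, tokens, lang = lang, lemmatizer = lemmatizer)

            results.append(wl_word_detokenization.wl_word_detokenize(main, lemmas, lang = lang))
        else:
            # Keep blank lines so the results stay aligned with the input samples
            results.append('')

    return results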
def test_lemmatize(lang, lemmatizer, show_results = False):
    lang_text = wl_conversion.to_lang_text(main, lang)

    tokens = wl_word_tokenization.wl_word_tokenize(
        main,
        text = getattr(wl_test_lang_examples, f'SENTENCE_{lang.upper()}'),
        lang = lang
    )

    lemmas = wl_lemmatization.wl_lemmatize(
        main,
        tokens = tokens,
        lang = lang,
        lemmatizer = lemmatizer
    )

    if show_results:
        print(f'{lang} / {lemmatizer}:')
        print(lemmas)

    # Asturian
    if lang == 'ast':
        assert lemmas == ["L'asturianu", 'ser', 'unu', 'llingua', 'romance', 'propiu', "d'Asturies,[1", ']', 'perteneciente', 'al', 'subgrupu', 'asturllionés', '.']
    # Bulgarian
    elif lang == 'bul':
        assert lemmas == ['Бъ̀лгарският', 'езѝк', 'съм', 'индоевропейски', 'език', 'от', 'група', 'на', 'южнославянските', 'език', '.']
    # Catalan
    elif lang == 'cat':
        assert lemmas == ['El', 'català', '(', 'denominació', 'oficial', 'a', 'Catalunya', ',', 'a', 'ell', 'Illes', 'Balears', ',', 'a', 'Andorra', ',', 'a', 'ell', 'ciutat', 'de', 'ell', 'Alguer', 'i', 'tradicional', 'a', 'Catalunya', 'Nord', ')', 'o', 'valencià', '(', 'denominació', 'oficial', 'al', 'País', 'Valencià', 'i', 'tradicional', 'al', 'Carxe', ')', 'ser', 'un', 'llengua', 'romànic', 'parlar', 'a', 'Catalunya', ',', 'ell', 'País', 'Valencià', '(', 'treure', 'de', 'algun', 'comarca', 'i', 'localitat', 'de', 'ell', 'interior', ')', ',', 'ell', 'Illes', 'Balears', ',', 'Andorra', ',', 'ell', 'Franja', 'de', 'Ponent', '(', 'a', 'ell', 'Aragó', ')', ',', 'ell', 'ciutat', 'de', 'ell', 'Alguer', '(', 'a', 'ell', 'illa', 'de', 'Sardenya', ')', ',', 'ell', 'Catalunya', 'del', 'Nord,[8', ']', 'ell', 'Carxe', '(', 'un', 'petit', 'territori', 'de', 'Múrcia', 'poblar', 'per', 'immigrar', 'valencians),[9][10', ']', 'i', 'en', 'petita', 'comunitat', 'arreu', 'del', 'món', '(', 'entrar', 'ell', 'qual', 'destacar', 'ell', 'de', 'ell', 'Argentina', ',', 'amb', '195.000', 'parlants).[11', ']']
    # Czech
    elif lang == 'ces':
        assert lemmas == ['Čeština', 'neboli', 'český', 'jazyk', 'on', 'západoslovanský', 'jazyk', ',', 'blízký', 'slovenštině', ',', 'poté', 'lužické', 'srbštině', 'a', 'polštině', '.']
    # Dutch
    elif lang == 'nld':
        assert lemmas == ['het', 'nederlands', 'zijn', 'een', 'west-germaans', 'taal', 'en', 'de', 'moedertaal', 'van', 'de', 'veel', 'inwoner', 'van', 'nederland', ',', 'belgië', 'en', 'suriname', '.']
    # English
    elif lang == 'eng':
        if lemmatizer == 'Lemmatization Lists - English Lemma List':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', '1', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'NLTK - WordNet Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'Germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
        elif lemmatizer == 'spaCy - English Lemmatizer':
            assert lemmas == ['English', 'be', 'a', 'West', 'germanic', 'language', 'that', 'be', 'first', 'speak', 'in', 'early', 'medieval', 'England', 'and', 'eventually', 'become', 'a', 'global', 'lingua', 'franca.[4][5', ']']
    # Estonian
    elif lang == 'est':
        assert lemmas == ['Eesti', 'kee', '(', 'varasem', 'nimetu', ':', 'maakeel', ')', 'olema', 'läänemeresoome', 'lõunarühma', 'kuuluma', 'kee', '.']
    # French
    elif lang == 'fra':
        if lemmatizer == 'Lemmatization Lists - French Lemma List':
            assert lemmas == ['Le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
        elif lemmatizer == 'spaCy - French Lemmatizer':
            assert lemmas == ['le', 'français', 'être', 'un', 'langue', 'indo-européen', 'de', 'le', 'famille', 'un', 'langue', 'roman', '.']
    # Galician
    elif lang == 'glg':
        assert lemmas == ['O', 'galego', '(', '[', 'ɡaˈleɣo̝', ']', ')', 'ser', 'un', 'lingua', 'indoeuropeo', 'que', 'pertencer', 'á', 'póla', 'de', 'lingua', 'románico', '.']
    # German
    elif lang == 'deu':
        if lemmatizer == 'Lemmatization Lists - German Lemma List':
            assert lemmas == ['Die', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
        elif lemmatizer == 'spaCy - German Lemmatizer':
            assert lemmas == ['der', 'deutsch', 'Sprache', 'bzw.', 'Deutsch', '(', '[', 'dɔʏ̯t͡ʃ', ']', ';', 'abkürzen', 'dt', '.', 'oder', 'dtsch', '.', ')', 'sein', 'einen', 'westgermanische', 'Sprache', '.']
    # Ancient Greek
    elif lang == 'grc':
        assert lemmas == ['Με', 'τον', 'όρο', 'αρχαία', 'ελληνική', 'γλώσσα', 'εννοείται', 'μια', 'μορφή', 'της', 'ελληνικής', 'γλώσσας', ',', 'πού', 'ομιλούνταν', 'κατά', 'τους', 'αρχαϊκούς', 'χρόνους', 'και', 'την', 'κλασική', 'αρχαιότητα', '.']
    # Greek
    elif lang == 'ell':
        assert lemmas == ['η', 'ελληνικός', 'γλώσσα', 'ανήκω', 'στην', 'ινδοευρωπαϊκός', 'οικογένεια[9', ']', 'και', 'συγκεκριμένα', 'στον', 'ελληνικό', 'κλάδο', ',', 'μαζί', 'με', 'την', 'τσακωνικός', ',', 'ενώ', 'είναι', 'η', 'επίσημη', 'γλώσσα', 'της', 'Ελλάδος', 'και', 'της', 'Κύπρου', '.']
    # Hungarian
    elif lang == 'hun':
        assert lemmas == ['A', 'magyar', 'nyelv', 'az', 'uráli', 'nyelvcsalád', 'tag', ',', 'a', 'finnugor', 'nyelv', 'köz', 'tartozó', 'ugor', 'nyelv', 'egyik', '.']
    # Irish
    elif lang == 'gle':
        assert lemmas == ['Is', 'ceann', 'de', 'na', 'teangach', 'Ceilteacha', 'í', 'an', 'Ghaeilge', '(', 'nó', 'Gaeilge', 'na', 'hÉireann', 'mar', 'a', 'tabhair', 'ar', 'corruair', ')', ',', 'agus', 'ceann', 'den', 'trí', 'ceann', 'de', 'teangach', 'Ceilteacha', 'air', 'a', 'tabhair', 'na', 'teangach', 'Gaelacha', '(', '.i.', 'an', 'Ghaeilge', ',', 'Gaeilge', 'na', 'hAlban', 'agus', 'Gaeilge', 'Mhanann', ')', 'go', 'áirithe', '.']
    # Italian
    elif lang == 'ita':
        assert lemmas == ["L'", 'italiano', '(', '[', 'itaˈljaːno][Nota', '1', ']', 'ascolta[?·info', ']', ')', 'essere', 'una', 'lingua', 'romanzo', 'parlato', 'principalmente', 'in', 'Italia', '.']
    # Lithuanian
    elif lang == 'lit':
        assert lemmas == ['lietuvė', 'kalbėti', '–', 'ižti', 'baltas', 'prokalbės', 'kilęs', 'lietuvė', 'tauta', 'kalbėti', ',', '-PRON-', 'Lietuvoje', 'būti', 'valstybinis', ',', 'o', 'Europos', 'sąjunga', '–', 'viena', 'ižti', 'oficialus', 'kalbus', '.']
    # Manx
    elif lang == 'glv':
        assert lemmas == ['She', 'Gaelg', '(', 'graït', ':', '/gɪlg/', ')', 'çhengey', 'Gaelagh', 'Mannin', '.']
    # Norwegian Bokmål
    elif lang == 'nob':
        assert lemmas == ['bokmål', 'være', 'en', 'varietet', 'av', 'norsk', 'språk', '.']
    # Persian
    elif lang == 'fas':
        assert lemmas == ['فارسی', 'یا', 'پارسی', 'یکی', 'از', 'زبان\u200cهای', 'هندواروپایی', 'در', 'شاخهٔ', 'زبان\u200cهای', 'ایرانی', 'جنوب', 'غربی', 'است', 'که', 'در', 'کشورهای', 'ایران', '،', 'افغانستان،[۳', ']', 'تاجیکستان[۴', ']', 'را', 'ازبکستان[۵', ']', 'به', 'آن', 'سخن', 'می\u200cگویند', '.']
    # Portuguese
    elif lang == 'por':
        assert lemmas == ['A', 'língua', 'portuguesar', ',', 'também', 'designar', 'português', ',', 'ser', 'umar', 'língua', 'românico', 'flexivo', 'ocidental', 'originar', 'o', 'galego-português', 'falar', 'o', 'Reino', 'da', 'Galiza', 'e', 'o', 'norte', 'de', 'Portugal', '.']
    # Romanian
    elif lang == 'ron':
        assert lemmas == ['Limba', 'român', 'fi', 'vrea', 'limbă', 'indo', '-', 'european', ',', 'din', 'grup', 'italic', 'și', 'din', 'subgrupul', 'oriental', 'al', 'limbă', 'romanice', '.']
    # Russian
    elif lang == 'rus':
        assert lemmas == ['ру́сский', 'язы́к', '(', '[', 'ˈruskʲɪi̯', 'jɪˈzɨk', ']', 'информация', 'о', 'файл', 'слушать', ')', '[', '~', '3', ']', '[', '⇨', ']', '—', 'один', 'из', 'восточнославянский', 'язык', ',', 'национальный', 'язык', 'русский', 'народ', '.']
    # Scottish Gaelic
    elif lang == 'gla':
        assert lemmas == ["'S", 'i', 'cànan', 'dùthchasach', 'na', 'h', '-', 'Alba', 'a', 'th', "'", 'anns', 'a', "'", 'Ghàidhlig', '.']
    # Slovak
    elif lang == 'slk':
        assert lemmas == ['Slovenčina', 'patriť', 'do', 'skupina', 'západoslovanský', 'jazyk', '(', 'spolu', 's', 'čeština', ',', 'poľština', ',', 'horný', 'as', 'dolný', 'lužickou', 'srbčina', 'as', 'kašubčinou', ')', '.']
    # Slovenian
    elif lang == 'slv':
        assert lemmas == ['Slovenščina', '[', 'slovénščina', ']', '/', '[', 'sloˈʋenʃtʃina', ']', 'onbiti', 'združen', 'naziv', 'za', 'uraden', 'knjižen', 'jezik', 'Slovenec', 'in', 'skupen', 'ime', 'za', 'narečje', 'in', 'govoriti', ',', 'ki', 'on', 'govoriti', 'ali', 'biti', 'on', 'nekoč', 'govoriti', 'Slovenec', '.']
    # Spanish
    elif lang == 'spa':
        assert lemmas == ['El', 'español', 'o', 'castellano', 'ser', 'uno', 'lengua', 'romance', 'procedente', 'del', 'latín', 'hablar', '.']
    # Swedish
    elif lang == 'swe':
        assert lemmas == ['Svenska', '(', 'svensk', '(', 'info', ')', ')', 'vara', 'en', 'östnordiskt', 'språka', 'som', 'tala', 'av', 'ungefär', 'tio', 'miljon', 'person', 'främst', 'i', 'Sverige', 'där', 'språk', 'hare', 'man', 'dominant', 'ställning', 'som', 'huvudspråk', ',', 'mena', 'även', 'som', 'en', 'en', 'nationalspråk', 'i', 'Finland', 'och', 'som', 'enda', 'officiell', 'språka', 'på', 'Åland', '.']
    # Tibetan
    elif lang == 'bod':
        assert lemmas == ['བོད་', 'ཀྱི་', 'སྐད་ཡིག་', 'ནི་', 'བོད་ཡུལ་', 'དང་', 'དེ་', 'གི་', 'ཉེ་འཁོར་', 'གྱི་', 'ས་ཁུལ་', 'ཏེ་', ' །']
    # Ukrainian
    elif lang == 'ukr':
        if lemmatizer == 'Lemmatization Lists - Ukrainian Lemma List':
            assert lemmas == ['Украї́нська', 'мо́ва', '(', 'МФА', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назвати', '—', 'ру́ська', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
        elif lemmatizer == 'pymorphy2 - Morphological Analyzer':
            assert lemmas == ['украї́нський', 'мо́вий', '(', 'мфа', ':', '[', 'ukrɑ̽ˈjɪnʲsʲkɑ̽', 'ˈmɔwɑ̽', ']', ',', 'історичний', 'назва', '—', 'ру́ський', ',', 'руси́нська[9][10][11', ']', '[', '*', '2', ']', ')', '—', 'національний', 'мова', 'українець', '.']
    # Welsh
    elif lang == 'cym':
        assert lemmas == ['Aelod', "o'r", 'cangen', 'Frythonaidd', "o'r", 'iaith', 'Celtaidd', 'a', 'siarad', 'bod', 'brodorol', 'yn', 'Nghymru', ',', 'can', 'Gymry', 'a', 'pobl', 'arall', 'aredig', 'gwasgar', 'bod', 'Lloegr', ',', 'a', 'can', 'cymuno', 'bechan', 'bod', 'Y', 'Wladfa', ',', 'gwybod', 'Ariannin[7', ']', "yw'r", 'Gymraeg', '(', 'hefyd', 'Cymraeg', 'heb', 'yr', 'bannod', ')', '.']
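# Hypothetical local driver (an assumption; the assertions above are normally
# exercised by the test runner): spot-check a few language/lemmatizer pairs
# named in the assertions and print the resulting lemmas.
if __name__ == '__main__':
    for lang, lemmatizer in [
        ('eng', 'NLTK - WordNet Lemmatizer'),
        ('eng', 'spaCy - English Lemmatizer'),
        ('fra', 'spaCy - French Lemmatizer'),
        ('ukr', 'pymorphy2 - Morphological Analyzer')
    ]:
        test_lemmatize(lang, lemmatizer, show_results = True)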
def wl_process_tokens(text, token_settings):
    main = text.main
    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        i_tokens = 0

        # Mark tokens to be removed
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_punc(token):
                            clause[i] = ''

                            text.tags_pos[i_tokens + i] = ''
                            text.tags_non_pos[i_tokens + i] = ''
                            text.tags_all[i_tokens + i] = ''

                    i_tokens += len(clause)

        # Remove punctuations
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token for token in clause if token]

        # Filter each tag list against itself (filtering tags_non_pos and
        # tags_all against tags_pos was a bug)
        text.tags_pos = [tags for tags in text.tags_pos if tags != '']
        text.tags_non_pos = [tags for tags in text.tags_non_pos if tags != '']
        text.tags_all = [tags for tags in text.tags_all if tags != '']

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_lemmatization.wl_lemmatize(main, clause, lang = text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = [token.lower() for token in clause]

        text.tags_pos = [[tag.lower() for tag in tags] for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags] for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags] for tags in text.tags_all]

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_lowercase(token):
                                clause[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_uppercase(token):
                                clause[i] = ''
        # Title Case
        if not settings['title_case']:
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            if wl_checking_token.is_token_word_title_case(token):
                                clause[i] = ''
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_word(token):
                            clause[i] = ''

    # Numerals
    if not settings['nums']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        if wl_checking_token.is_token_num(token):
                            clause[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for i, clause in enumerate(sentence):
                    sentence[i] = wl_stop_word_lists.wl_filter_stop_words(main, clause, lang = text.lang)

    # Ignore tags
    i_token = 0

    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, [])
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_non_pos[i_token + i])

                        i_token += len(clause)
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            for para in text.tokens_multilevel:
                for sentence in para:
                    for clause in sentence:
                        for i, token in enumerate(clause):
                            clause[i] = (token, text.tags_pos[i_token + i])

                        i_token += len(clause)
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = (token, text.tags_all[i_token + i])

                    i_token += len(clause)

    # Use tags only
    if settings['use_tags']:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = clause[i][1]
    else:
        for para in text.tokens_multilevel:
            for sentence in para:
                for clause in sentence:
                    for i, token in enumerate(clause):
                        clause[i] = f"{clause[i][0]}{''.join(clause[i][1])}"

    text.tokens_flat = list(wl_misc.flatten_list(text.tokens_multilevel))

    return text
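# A sketch of the token settings consulted by wl_process_tokens() above. The
# keys are exactly the ones the function reads; the values shown here are
# illustrative assumptions, not the application defaults.
example_token_settings = {
    'words': True,                  # keep word tokens
    'lowercase': True,              # keep all-lowercase words
    'uppercase': True,              # keep all-uppercase words
    'title_case': True,             # keep title-case words
    'nums': True,                   # keep numerals
    'puncs': False,                 # drop punctuation marks
    'treat_as_lowercase': False,    # fold tokens and tags to lowercase
    'lemmatize_tokens': False,      # lemmatize every token in place
    'filter_stop_words': False,     # remove stop words for text.lang
    'use_tags': False,              # operate on tags instead of tokens
    'ignore_tags': False,           # strip tags from the output
    'ignore_tags_type': 'all',      # which tags to ignore: all / POS / non-POS
    'ignore_tags_tags': False,      # counterparts consulted when use_tags is on
    'ignore_tags_type_tags': 'all'
}

# wl_process_tokens() mutates `text` in place and also returns it:
#     text = wl_process_tokens(text, example_token_settings)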
def match_ngrams(
    main, search_terms, tokens,
    lang, tokenized, tagged,
    token_settings, search_settings
):
    search_terms_matched = set()

    settings = copy.deepcopy(search_settings)
    re_tags = get_re_tags(main)

    search_term_tokens = [
        search_term_token
        for search_term in search_terms
        for search_term_token in search_term.split()
    ]

    if search_settings['use_regex']:
        regexes_matched = {search_term_token: set() for search_term_token in search_term_tokens}
        tokens_matched = {}
    else:
        tokens_matched = {search_term_token: set() for search_term_token in search_term_tokens}

    # Search Settings
    if settings['match_tags']:
        settings['match_inflected_forms'] = False

    # Token Settings
    if token_settings['use_tags']:
        settings['match_inflected_forms'] = False
        settings['match_tags'] = False
    else:
        if token_settings['ignore_tags']:
            settings['ignore_tags'] = False
            settings['match_tags'] = False

    # Match tags only & Ignore tags
    if settings['match_tags']:
        if tagged == 'No':
            tokens_searched = []
        else:
            tokens_searched = [''.join(re.findall(re_tags, token)) for token in tokens]
    else:
        # Strip tags only when the text is actually tagged
        if settings['ignore_tags'] and tagged == 'Yes':
            tokens_searched = [re.sub(re_tags, '', token) for token in tokens]
        else:
            tokens_searched = tokens

    if tokens_searched:
        if settings['use_regex']:
            for search_term_token in search_term_tokens:
                if settings['match_whole_words']:
                    regex = fr'(^|\s+){search_term_token}(\s+|$)'
                else:
                    regex = search_term_token

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        regexes_matched[search_term_token].add(token)
                        tokens_matched[token] = set()
        else:
            for search_term_token in search_term_tokens:
                regex = re.escape(search_term_token)

                if settings['match_whole_words']:
                    regex = fr'(^|\s+){regex}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        tokens_matched[search_term_token].add(token)

        # Extend each matched token with all tokens sharing its lemma
        if settings['match_inflected_forms']:
            lemmas_searched = wl_lemmatization.wl_lemmatize(main, tokens_searched, lang, tokenized, tagged)
            lemmas_matched = wl_lemmatization.wl_lemmatize(main, list(tokens_matched), lang, tokenized, tagged)

            for token_matched, lemma_matched in zip(list(tokens_matched), lemmas_matched):
                lemma_matched = re.escape(lemma_matched)
                lemma_matched = fr'(^|\s+){lemma_matched}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, lemma_searched in zip(tokens, lemmas_searched):
                    if re.search(lemma_matched, lemma_searched, flags = flags):
                        tokens_matched[token_matched].add(token)

    # Assemble matched n-grams as the Cartesian product of the tokens matched
    # for each search term token
    if search_settings['use_regex']:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set())

                for regex_matched in regexes_matched[search_term_token]:
                    search_term_tokens_matched[-1].add(regex_matched)
                    search_term_tokens_matched[-1] |= set(tokens_matched[regex_matched])

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)
    else:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set(tokens_matched[search_term_token]))

            for item in itertools.product(*search_term_tokens_matched):
                search_terms_matched.add(item)

    return search_terms_matched
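# Minimal usage sketch for match_ngrams() (an assumption about the call site,
# not code from the module above): plain, case-insensitive, whole-word matching
# of the 2-gram search term 'lingua franca' against an untagged token list.
# `main` is assumed to be an initialized Wordless main object, and the
# `example_match_ngrams` wrapper is hypothetical.
def example_match_ngrams(main):
    example_search_settings = {
        'use_regex': False,
        'match_whole_words': True,
        'ignore_case': True,
        'match_inflected_forms': False,
        'match_tags': False,
        'ignore_tags': False
    }

    return match_ngrams(
        main,
        search_terms = ['lingua franca'],
        tokens = ['English', 'is', 'a', 'global', 'lingua', 'franca', '.'],
        lang = 'eng',
        tokenized = 'Yes',
        tagged = 'No',
        token_settings = {'use_tags': False, 'ignore_tags': False},
        search_settings = example_search_settings
    )
    # Returns a set of matched n-gram tuples, here {('lingua', 'franca')}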