def testing_lemmatize(lang, lemmatizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {lemmatizer}:')

    wordless_text_utils.check_lemmatizers(main, lang, lemmatizer=lemmatizer)

    lemmas = wordless_text_processing.wordless_lemmatize(main, globals()[f'tokens_{lang}'],
                                                         lang=lang, lemmatizer=lemmatizer)

    print(f'\t{lemmas}')
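# Example invocation (a sketch, not part of the original test module: it assumes the
# surrounding test code has already set up `main`, a token list named `tokens_eng`,
# and the NLTK data required by the chosen lemmatizer):
#
#     testing_lemmatize('eng', 'NLTK - WordNet Lemmatizer')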
def wordless_lemmatize(main, tokens, lang,
                       text_type=('untokenized', 'untagged'), lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    # Split off tags so that only the bare tokens are lemmatized
    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Record the positions of empty tokens and drop them before lemmatization
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            tokens.remove(token)
            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']
            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(main, tokens,
                                               lang='eng',
                                               pos_tagger='NLTK - Perceptron POS Tagger',
                                               tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(word_net_lemmatizer.lemmatize(token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr('lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path('lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r', encoding='utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'),
                      'r', encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    except ValueError:
                        # Skip malformed lines that do not split into exactly two fields
                        pass
    else:
        lemmas = tokens

    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Re-insert empty lemmas at the positions of the removed empty tokens
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
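# The trickiest part of wordless_lemmatize above is the empty-token bookkeeping: empty
# tokens are removed before lemmatization and re-inserted afterwards so that the output
# stays aligned with the input, one lemma per token. A minimal, self-contained sketch of
# that pattern (the uppercasing stands in for the real lemmatizer call; the function
# name and data are illustrative, not part of Wordless):

def _demo_preserve_empty_positions(tokens):
    tokens = list(tokens)
    empty_offsets = []

    # Record the positions of empty tokens and drop them, iterating in reverse so
    # deletions do not invalidate the indices still to be visited
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            del tokens[i]
            empty_offsets.append(i)

    # Stand-in for the per-token transformation (lemmatization in the real code)
    results = [token.upper() for token in tokens]

    # Re-insert placeholders at the recorded positions
    for empty_offset in sorted(empty_offsets):
        results.insert(empty_offset, '')

    return results

# _demo_preserve_empty_positions(['the', '', 'cats', ' ']) == ['THE', '', 'CATS', '']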
def wordless_process_tokens(text, token_settings):
    main = text.main
    tokens = text.tokens.copy()

    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        for i, token in reversed(list(enumerate(tokens))):
            if wordless_checking_token.is_token_punc(token):
                del tokens[i]

                del text.tags_pos[i]
                del text.tags_non_pos[i]
                del text.tags_all[i]

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        wordless_text_utils.check_lemmatizers(main, lang=text.lang)

        tokens = wordless_text_processing.wordless_lemmatize(main, tokens, lang=text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        tokens = [token.lower() for token in tokens]

        text.tags_pos = [[tag.lower() for tag in tags] for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags] for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags] for tags in text.tags_all]

    text.tokens = copy.deepcopy(tokens)

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_lowercase(token):
                    tokens[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_uppercase(token):
                    tokens[i] = ''
        # Title Case
        if not settings['title_case']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_title_case(token):
                    tokens[i] = ''
    else:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_word(token):
                tokens[i] = ''

    # Numerals
    if not settings['nums']:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_num(token):
                tokens[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        tokens_filtered = wordless_text_processing.wordless_filter_stop_words(
            main, [token for token in tokens], lang=text.lang)

        for i, token in enumerate(tokens):
            if token not in tokens_filtered:
                tokens[i] = ''

    # Ignore tags
    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            tokens = [(token, []) for token in tokens]
            text.tokens = [(token, []) for token in text.tokens]
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            tokens = [(token, tags) for token, tags in zip(tokens, text.tags_non_pos)]
            text.tokens = [(token, tags) for token, tags in zip(text.tokens, text.tags_non_pos)]
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            tokens = [(token, tags) for token, tags in zip(tokens, text.tags_pos)]
            text.tokens = [(token, tags) for token, tags in zip(text.tokens, text.tags_pos)]
    else:
        tokens = [(token, tags) for token, tags in zip(tokens, text.tags_all)]
        text.tokens = [(token, tags) for token, tags in zip(text.tokens, text.tags_all)]

    return tokens
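# wordless_process_tokens filters by blanking tokens out ('' instead of deleting them),
# so the surviving tokens keep their original positions relative to text.tags_pos,
# text.tags_non_pos and text.tags_all. A self-contained sketch of the idea (the
# predicate and sample data are illustrative only):

def _demo_blank_out(tokens, keep):
    # Replace rejected tokens with '' rather than removing them, preserving indices
    return [token if keep(token) else '' for token in tokens]

# _demo_blank_out(['The', '42', 'cats'], keep=lambda token: not token.isdigit())
# == ['The', '', 'cats']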
def match_ngrams(main, search_terms, tokens,
                 lang, text_type, token_settings, search_settings):
    search_terms_matched = []

    settings = copy.deepcopy(search_settings)

    re_tags_all = get_re_tags(main, tags='all')
    re_tags_pos = get_re_tags(main, tags='pos')
    re_tags_non_pos = get_re_tags(main, tags='non_pos')

    search_term_tokens = [search_term_token
                          for search_term in search_terms
                          for search_term_token in search_term.split()]

    if search_settings['use_regex']:
        regexes_matched = {search_term_token: set() for search_term_token in search_term_tokens}
        tokens_matched = {}
    else:
        tokens_matched = {search_term_token: set() for search_term_token in search_term_tokens}

    # Search Settings
    if settings['match_tags']:
        settings['match_inflected_forms'] = False

        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Token Settings
    if token_settings['use_tags']:
        settings['match_inflected_forms'] = False
        settings['match_tags'] = False

        if token_settings['ignore_tags_tags']:
            settings['ignore_tags'] = False
    else:
        if token_settings['ignore_tags']:
            if token_settings['ignore_tags_type'] == main.tr('all'):
                settings['ignore_tags'] = False
                settings['match_tags'] = False

    # Match Tags Only & Ignore Tags
    if settings['match_tags']:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = []
            else:
                if settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = []
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = []
        else:
            if text_type[1] == 'untagged':
                tokens_searched = []
            elif text_type[1] == 'tagged_pos':
                tokens_searched = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
            elif text_type[1] == 'tagged_non_pos':
                tokens_searched = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
            elif text_type[1] == 'tagged_both':
                tokens_searched = [''.join(re.findall(re_tags_all, token)) for token in tokens]
    else:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = tokens
            else:
                if settings['ignore_tags_type'] == main.tr('all'):
                    if text_type[1] == 'tagged_both':
                        tokens_searched = [re.sub(re_tags_all, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = [re.sub(re_tags_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = [re.sub(re_tags_non_pos, '', token) for token in tokens]
                elif settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [re.sub(re_tags_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = tokens
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [re.sub(re_tags_non_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = tokens
        else:
            tokens_searched = tokens

    if tokens_searched:
        if settings['use_regex']:
            for search_term_token in search_term_tokens:
                if settings['match_whole_words']:
                    regex = fr'(^|\s+){search_term_token}(\s+|$)'
                else:
                    regex = search_term_token

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags=flags):
                        regexes_matched[search_term_token].add(token)
                        tokens_matched[token] = set()
        else:
            for search_term_token in search_term_tokens:
                regex = re.escape(search_term_token)

                if settings['match_whole_words']:
                    regex = fr'(^|\s+){regex}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags=flags):
                        tokens_matched[search_term_token].add(token)

        if settings['match_inflected_forms']:
            wordless_text_utils.check_lemmatizers(main, lang)

            lemmas_searched = wordless_text_processing.wordless_lemmatize(main, tokens_searched, lang, text_type)
            lemmas_matched = wordless_text_processing.wordless_lemmatize(main, list(tokens_matched), lang, text_type)

            for token_matched, lemma_matched in zip(list(tokens_matched), lemmas_matched):
                lemma_matched = re.escape(lemma_matched)
                lemma_matched = fr'(^|\s+){lemma_matched}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, lemma_searched in zip(tokens, lemmas_searched):
                    if re.search(lemma_matched, lemma_searched, flags=flags):
                        tokens_matched[token_matched].add(token)

    if search_settings['use_regex']:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set())

                for regex_matched in regexes_matched[search_term_token]:
                    search_term_tokens_matched[-1].add(regex_matched)
                    search_term_tokens_matched[-1] |= set(tokens_matched[regex_matched])

            search_terms_matched.extend(itertools.product(*search_term_tokens_matched))
    else:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set(tokens_matched[search_term_token]))

            search_terms_matched.extend(itertools.product(*search_term_tokens_matched))

    return search_terms_matched
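# The final step of match_ngrams expands each multi-token search term into every
# combination of the forms matched for its individual tokens, using itertools.product.
# A self-contained sketch with made-up matches (itertools is assumed to be imported at
# module level, as it already is for match_ngrams; the helper name is illustrative):

def _demo_expand_search_term(search_term, tokens_matched):
    # Cartesian product of the matched forms of each token in the search term
    return list(itertools.product(
        *[sorted(tokens_matched[search_term_token]) for search_term_token in search_term.split()]
    ))

# _demo_expand_search_term('take off', {'take': {'take', 'takes', 'took'}, 'off': {'off'}})
# == [('take', 'off'), ('takes', 'off'), ('took', 'off')]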