Example #1
def testing_lemmatize(lang, lemmatizer):
    lang_text = wordless_conversion.to_lang_text(main, lang)

    print(f'{lang_text} / {lemmatizer}:')

    wordless_text_utils.check_lemmatizers(main, lang, lemmatizer = lemmatizer)

    lemmas = wordless_text_processing.wordless_lemmatize(main, globals()[f'tokens_{lang}'],
                                                         lang = lang,
                                                         lemmatizer = lemmatizer)

    print(f"\t{lemmas}")
Example #2
def wordless_lemmatize(main,
                       tokens,
                       lang,
                       text_type=('untokenized', 'untagged'),
                       lemmatizer='default'):
    empty_offsets = []
    mapping_lemmas = {}
    lemmas = []

    tokens = [str(token) for token in tokens]

    re_tags_all = wordless_matching.get_re_tags(main, tags='all')
    re_tags_pos = wordless_matching.get_re_tags(main, tags='pos')
    re_tags_non_pos = wordless_matching.get_re_tags(main, tags='non_pos')

    if text_type[1] == 'tagged_both':
        tags = [''.join(re.findall(re_tags_all, token)) for token in tokens]
        tokens = [re.sub(re_tags_all, '', token) for token in tokens]
    elif text_type[1] == 'tagged_pos':
        tags = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
        tokens = [re.sub(re_tags_pos, '', token) for token in tokens]
    elif text_type[1] == 'tagged_non_pos':
        tags = [
            ''.join(re.findall(re_tags_non_pos, token)) for token in tokens
        ]
        tokens = [re.sub(re_tags_non_pos, '', token) for token in tokens]
    else:
        tags = [''] * len(tokens)

    # Remove empty tokens, recording their offsets so that empty lemmas
    # can be re-inserted at the same positions later
    for i, token in reversed(list(enumerate(tokens))):
        if not token.strip():
            del tokens[i]

            empty_offsets.append(i)

    wordless_text_utils.check_lemmatizers(main, lang)

    if tokens and lang in main.settings_global['lemmatizers']:
        if lemmatizer == 'default':
            lemmatizer = main.settings_custom['lemmatization']['lemmatizers'][
                lang]

        # Dutch, English, French, German, Greek (Modern), Italian, Portuguese, Spanish
        if 'spaCy' in lemmatizer:
            nlp = main.__dict__[f'spacy_nlp_{lang}']

            doc = spacy.tokens.Doc(nlp.vocab, words=tokens)
            nlp.tagger(doc)

            lemmas = [token.lemma_ for token in doc]
        # English
        elif lemmatizer == main.tr('NLTK - WordNet Lemmatizer'):
            word_net_lemmatizer = nltk.WordNetLemmatizer()

            for token, pos in wordless_pos_tag(
                    main,
                    tokens,
                    lang='eng',
                    pos_tagger='NLTK - Perceptron POS Tagger',
                    tagset='universal'):
                if pos == 'ADJ':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADJ))
                elif pos in ['NOUN', 'PROPN']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.NOUN))
                elif pos == 'ADV':
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.ADV))
                elif pos in ['VERB', 'AUX']:
                    lemmas.append(
                        word_net_lemmatizer.lemmatize(
                            token, pos=nltk.corpus.wordnet.VERB))
                else:
                    lemmas.append(word_net_lemmatizer.lemmatize(token))
        # Greek (Ancient)
        elif lemmatizer == main.tr(
                'lemmalist-greek - Greek (Ancient) Lemma List'):
            with open(wordless_misc.get_abs_path(
                    'lemmatization/lemmalist-greek/lemmalist-greek.txt'),
                      'r',
                      encoding='utf_8') as f:
                for line in f.readlines():
                    line = line.rstrip()

                    if line:
                        lemma, *words = line.split()

                        for word in words:
                            mapping_lemmas[word] = lemma
        # Russian & Ukrainian
        elif lemmatizer == main.tr('pymorphy2 - Morphological Analyzer'):
            if lang == 'rus':
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='ru')
            else:
                morphological_analyzer = pymorphy2.MorphAnalyzer(lang='uk')

            for token in tokens:
                lemmas.append(
                    morphological_analyzer.parse(token)[0].normal_form)
        # Tibetan
        elif lemmatizer == main.tr('pybo - Tibetan Lemmatizer'):
            word_tokenizer = main.settings_custom['word_tokenization'][
                'word_tokenizers'][lang]

            wordless_text_utils.check_pybo_tokenizers(
                main, word_tokenizer=word_tokenizer)

            if word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (GMD)'):
                tokens = main.pybo_tokenizer_gmd.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (POS)'):
                tokens = main.pybo_tokenizer_pos.tokenize(' '.join(tokens))
            elif word_tokenizer == main.tr(
                    'pybo - Tibetan Word Tokenizer (tsikchen)'):
                tokens = main.pybo_tokenizer_tsikchen.tokenize(
                    ' '.join(tokens))

            for token in tokens:
                if token.lemma:
                    lemmas.append(token.lemma)
                else:
                    lemmas.append(token.text)
        # Other Languages
        elif 'Lemmatization Lists' in lemmatizer:
            lang = wordless_conversion.to_iso_639_1(main, lang)

            with open(wordless_misc.get_abs_path(
                    f'lemmatization/Lemmatization Lists/lemmatization-{lang}.txt'
            ),
                      'r',
                      encoding='utf_8_sig') as f:
                for line in f:
                    try:
                        lemma, word = line.rstrip().split('\t')

                        mapping_lemmas[word] = lemma
                    # Skip lines that are not in the "lemma<TAB>word" format
                    except ValueError:
                        pass
    else:
        lemmas = tokens

    # For lemmatizers based on lemma lists, map each token to its lemma,
    # falling back to the token itself
    if mapping_lemmas:
        lemmas = [mapping_lemmas.get(token, token) for token in tokens]

    # Re-insert empty lemmas at their original offsets
    for empty_offset in empty_offsets:
        lemmas.insert(empty_offset, '')

    return [lemma + tag for lemma, tag in zip(lemmas, tags)]
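
The NLTK WordNet branch above can be reproduced outside of Wordless. Below is a minimal standalone sketch (not part of the Wordless codebase) that maps the universal POS tags from NLTK's perceptron tagger onto WordNet POS constants before lemmatizing; it assumes that nltk is installed together with its averaged_perceptron_tagger, universal_tagset, and wordnet data.

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Illustrative helper (hypothetical name): lemmatize a list of tokens with
# WordNet, choosing the WordNet POS from the universal POS tag of each token
def lemmatize_wordnet(tokens):
    lemmatizer = WordNetLemmatizer()

    # Universal POS tags mapped to WordNet POS constants; anything else
    # falls back to WordNet's default (noun) behavior
    pos_mapping = {
        'ADJ': wordnet.ADJ,
        'NOUN': wordnet.NOUN,
        'ADV': wordnet.ADV,
        'VERB': wordnet.VERB
    }

    lemmas = []

    for token, pos in nltk.pos_tag(tokens, tagset = 'universal'):
        wordnet_pos = pos_mapping.get(pos)

        if wordnet_pos:
            lemmas.append(lemmatizer.lemmatize(token, pos = wordnet_pos))
        else:
            lemmas.append(lemmatizer.lemmatize(token))

    return lemmas

print(lemmatize_wordnet(['The', 'dogs', 'were', 'barking', 'loudly']))
# Expected output (approximately): ['The', 'dog', 'be', 'bark', 'loudly']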
Example #3
def wordless_process_tokens(text, token_settings):
    main = text.main
    tokens = text.tokens.copy()

    settings = copy.deepcopy(token_settings)

    # Token Settings
    if settings['use_tags']:
        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Punctuations
    if not settings['puncs']:
        for i, token in reversed(list(enumerate(tokens))):
            if wordless_checking_token.is_token_punc(token):
                del tokens[i]

                del text.tags_pos[i]
                del text.tags_non_pos[i]
                del text.tags_all[i]

    # Lemmatize all tokens
    if not settings['use_tags'] and settings['lemmatize_tokens']:
        wordless_text_utils.check_lemmatizers(main,
                                              lang = text.lang)

        tokens = wordless_text_processing.wordless_lemmatize(main, tokens,
                                                             lang = text.lang)

    # Treat as all lowercase
    if settings['treat_as_lowercase']:
        tokens = [token.lower() for token in tokens]

        text.tags_pos = [[tag.lower() for tag in tags] for tags in text.tags_pos]
        text.tags_non_pos = [[tag.lower() for tag in tags] for tags in text.tags_non_pos]
        text.tags_all = [[tag.lower() for tag in tags] for tags in text.tags_all]

    text.tokens = copy.deepcopy(tokens)

    # Words
    if settings['words']:
        # Lowercase
        if not settings['lowercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_lowercase(token):
                    tokens[i] = ''
        # Uppercase
        if not settings['uppercase']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_uppercase(token):
                    tokens[i] = ''
        # Title Case
        if not settings['title_case']:
            for i, token in enumerate(tokens):
                if wordless_checking_token.is_token_word_title_case(token):
                    tokens[i] = ''
    else:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_word(token):
                tokens[i] = ''

    # Numerals
    if not settings['nums']:
        for i, token in enumerate(tokens):
            if wordless_checking_token.is_token_num(token):
                tokens[i] = ''

    # Filter stop words
    if settings['filter_stop_words']:
        tokens_filtered = wordless_text_processing.wordless_filter_stop_words(main, [token for token in tokens],
                                                                              lang = text.lang)

        for i, token in enumerate(tokens):
            if token not in tokens_filtered:
                tokens[i] = ''

    # Ignore tags
    if settings['ignore_tags']:
        # Ignore all tags
        if settings['ignore_tags_type'] == main.tr('all'):
            tokens = [(token, []) for token in tokens]
            text.tokens = [(token, []) for token in text.tokens]
        # Ignore POS tags
        elif settings['ignore_tags_type'] == main.tr('POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_non_pos)]
            text.tokens = [(token, tags)
                           for token, tags in zip(text.tokens, text.tags_non_pos)]
        # Ignore non-POS tags
        elif settings['ignore_tags_type'] == main.tr('non-POS'):
            tokens = [(token, tags)
                      for token, tags in zip(tokens, text.tags_pos)]
            text.tokens = [(token, tags)
                           for token, tags in zip(text.tokens, text.tags_pos)]
    else:
        tokens = [(token, tags)
                  for token, tags in zip(tokens, text.tags_all)]
        text.tokens = [(token, tags)
                       for token, tags in zip(text.tokens, text.tags_all)]

    return tokens
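
The punctuation-removal step above deletes entries from several parallel lists (the tokens plus the three tag lists) while iterating in reverse, so that deletions do not shift the indices that remain to be visited. A minimal standalone sketch of that pattern, using illustrative data rather than the Wordless API:

# Illustrative sketch: delete punctuation from parallel lists in reverse order
tokens = ['Hello', ',', 'world', '!']
tags_pos = ['_UH', '_,', '_NN', '_.']

for i, token in reversed(list(enumerate(tokens))):
    # Iterating in reverse keeps the remaining indices valid after each deletion
    if all(not char.isalnum() for char in token):
        del tokens[i]
        del tags_pos[i]

print(tokens)    # ['Hello', 'world']
print(tags_pos)  # ['_UH', '_NN']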
Example #4
def match_ngrams(main, search_terms, tokens,
                 lang, text_type, token_settings, search_settings):
    search_terms_matched = []

    settings = copy.deepcopy(search_settings)

    re_tags_all = get_re_tags(main, tags = 'all')
    re_tags_pos = get_re_tags(main, tags = 'pos')
    re_tags_non_pos = get_re_tags(main, tags = 'non_pos')

    search_term_tokens = [search_term_token
                          for search_term in search_terms
                          for search_term_token in search_term.split()]

    if search_settings['use_regex']:
        regexes_matched = {search_term_token: set() for search_term_token in search_term_tokens}
        tokens_matched = {}
    else:
        tokens_matched = {search_term_token: set() for search_term_token in search_term_tokens}

    # Search Settings
    if settings['match_tags']:
        settings['match_inflected_forms'] = False

        settings['ignore_tags'] = settings['ignore_tags_tags']
        settings['ignore_tags_type'] = settings['ignore_tags_type_tags']

    # Token Settings
    if token_settings['use_tags']:
        settings['match_inflected_forms'] = False
        settings['match_tags'] = False

        if token_settings['ignore_tags_tags']:
            settings['ignore_tags'] = False
    else:
        if token_settings['ignore_tags']:
            if token_settings['ignore_tags_type'] == main.tr('all'):
                settings['ignore_tags'] = False
                settings['match_tags'] = False

    # Match Tags Only & Ignore Tags
    if settings['match_tags']:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = []
            else:
                if settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = []
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = []
        else:
            if text_type[1] == 'untagged':
                tokens_searched = []
            elif text_type[1] == 'tagged_pos':
                tokens_searched = [''.join(re.findall(re_tags_pos, token)) for token in tokens]
            elif text_type[1] == 'tagged_non_pos':
                tokens_searched = [''.join(re.findall(re_tags_non_pos, token)) for token in tokens]
            elif text_type[1] == 'tagged_both':
                tokens_searched = [''.join(re.findall(re_tags_all, token)) for token in tokens]
    else:
        if settings['ignore_tags']:
            if text_type[1] == 'untagged':
                tokens_searched = tokens
            else:
                if settings['ignore_tags_type'] == main.tr('all'):
                    if text_type[1] == 'tagged_both':
                        tokens_searched = [re.sub(re_tags_all, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = [re.sub(re_tags_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = [re.sub(re_tags_non_pos, '', token) for token in tokens]
                elif settings['ignore_tags_type'] == main.tr('POS'):
                    if text_type[1] in ['tagged_both', 'tagged_pos']:
                        tokens_searched = [re.sub(re_tags_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_non_pos':
                        tokens_searched = tokens
                elif settings['ignore_tags_type'] == main.tr('non-POS'):
                    if text_type[1] in ['tagged_both', 'tagged_non_pos']:
                        tokens_searched = [re.sub(re_tags_non_pos, '', token) for token in tokens]
                    elif text_type[1] == 'tagged_pos':
                        tokens_searched = tokens
        else:
            tokens_searched = tokens

    if tokens_searched:
        if settings['use_regex']:
            for search_term_token in search_term_tokens:
                if settings['match_whole_words']:
                    regex = fr'(^|\s+){search_term_token}(\s+|$)'
                else:
                    regex = search_term_token

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        regexes_matched[search_term_token].add(token)
                        tokens_matched[token] = set()
        else:
            for search_term_token in search_term_tokens:
                regex = re.escape(search_term_token)

                if settings['match_whole_words']:
                    regex = fr'(^|\s+){regex}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, token_searched in zip(tokens, tokens_searched):
                    if re.search(regex, token_searched, flags = flags):
                        tokens_matched[search_term_token].add(token)

        if settings['match_inflected_forms']:
            wordless_text_utils.check_lemmatizers(main, lang)

            lemmas_searched = wordless_text_processing.wordless_lemmatize(main, tokens_searched, lang, text_type)
            lemmas_matched = wordless_text_processing.wordless_lemmatize(main, list(tokens_matched), lang, text_type)

            for token_matched, lemma_matched in zip(list(tokens_matched), lemmas_matched):
                lemma_matched = re.escape(lemma_matched)
                lemma_matched = fr'(^|\s+){lemma_matched}(\s+|$)'

                if settings['ignore_case']:
                    flags = re.IGNORECASE
                else:
                    flags = 0

                for token, lemma_searched in zip(tokens, lemmas_searched):
                    if re.search(lemma_matched, lemma_searched, flags = flags):
                        tokens_matched[token_matched].add(token)

    if search_settings['use_regex']:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set())

                for regex_matched in regexes_matched[search_term_token]:
                    search_term_tokens_matched[-1].add(regex_matched)
                    search_term_tokens_matched[-1] |= set(tokens_matched[regex_matched])

            search_terms_matched.extend(itertools.product(*search_term_tokens_matched))
    else:
        for search_term in search_terms:
            search_term_tokens_matched = []

            for search_term_token in search_term.split():
                search_term_tokens_matched.append(set(tokens_matched[search_term_token]))

            search_terms_matched.extend(itertools.product(*search_term_tokens_matched))

    return search_terms_matched
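
The whole-word matching used throughout match_ngrams wraps the (escaped) search term in (^|\s+) ... (\s+|$) and switches re.IGNORECASE on or off. A minimal standalone sketch of that matching step (illustrative only, not the Wordless API):

import re

# Hypothetical helper mirroring the whole-word matching above
def match_whole_word(search_term, token, ignore_case = True):
    regex = re.escape(search_term)
    regex = fr'(^|\s+){regex}(\s+|$)'

    flags = re.IGNORECASE if ignore_case else 0

    return bool(re.search(regex, token, flags = flags))

print(match_whole_word('cat', 'Cat'))           # True
print(match_whole_word('cat', 'catalog'))       # False: not a whole word
print(match_whole_word('cat', 'the cat sat'))   # True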