def find_route(text, objects_dict, duplicates_filtering_window=0,
               far_objects_filtering_dist=0.0, splitting_min_dist=None):
    pos = 0
    morf = morfeusz2.Morfeusz()
    sets = prepare_text(text, morf)
    route = []
    while pos < len(sets):
        matches = find_next_object(sets, pos, objects_dict)
        if len(matches) == 0:
            pos += 1
            continue
        objects, lengths, positions = zip(*matches)
        max_length = max(lengths)
        pos += max_length
        route.append([(m[0], m[2]) for m in matches if m[1] == max_length])
    route = swap_elements(route)
    route = filter_variants(route)
    remove_needless_fields(route)
    if far_objects_filtering_dist > 0:
        route = filter_far_objects(route, far_objects_filtering_dist)
    if duplicates_filtering_window != 0:
        route = filter_duplicates(route, duplicates_filtering_window)
    route = filter_duplicates(route, 1)
    if splitting_min_dist is not None:
        route = split_route(route, splitting_min_dist)
    return route
def __init__(self, punfile="../data/punctation.txt", stopfile="../data/stopwords.txt"):
    # Initialize Morfeusz
    if platform.system() == 'Windows':
        self.morf = morfeusz
    else:
        self.morf = morfeusz2.Morfeusz()
    # Initialize files
    self.__punfile = punfile
    self.__stopfile = stopfile
    self.tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                                re.UNICODE | re.VERBOSE | re.IGNORECASE)
    self.emoticon_re = re.compile(r'^' + emoticons_str + '$',
                                  re.UNICODE | re.VERBOSE | re.IGNORECASE)
    self.undef_re = re.compile(r'^' + regex_str[-1] + '$',
                               re.UNICODE | re.VERBOSE | re.IGNORECASE)
    self.men_re = re.compile(r'^' + regex_str[2] + '$',
                             re.UNICODE | re.VERBOSE | re.IGNORECASE)
    self.url_re = re.compile(r'(' + '|'.join([regex_str[1], regex_str[4]]) + ')',
                             re.UNICODE | re.VERBOSE | re.IGNORECASE)
    # Load the list of punctuation characters from file
    with open(self.__punfile) as punf:
        self.punctation = ast.literal_eval(punf.read())
    # Load the list of stop words from file
    with open(self.__stopfile) as stopf:
        self.stop = ast.literal_eval(stopf.read())
def __init__(self, nlp):
    if imported_Morfeusz:
        self.morf = morfeusz2.Morfeusz(generate=False,
                                       whitespace=morfeusz2.KEEP_WHITESPACES,
                                       expand_tags=True)
    self.imported_Morfeusz = imported_Morfeusz
    self.nlp = nlp
    self.toygger = Toygger()
    self.vocab = self.nlp.vocab
    # Reserve a custom attribute for storing morphological features
    Token.set_extension("feats", default="")

    # Map NKJP tags to UD POS
    self.tag_map = {
        'adj': 'ADJ', 'adja': 'ADJ', 'adjc': 'ADJ', 'adjp': 'ADJ',
        'adv': 'ADV', 'aglt': 'AUX', 'bedzie': 'VERB', 'brev': 'X',
        'burk': 'ADV', 'comp': 'SCONJ', 'conj': 'CCONJ', 'depr': 'NOUN',
        'fin': 'VERB', 'ger': 'NOUN', 'imps': 'VERB', 'impt': 'VERB',
        'inf': 'VERB', 'interj': 'INTJ', 'interp': 'PUNCT', 'num': 'NUM',
        'numcol': 'NUM', 'pact': 'VERB', 'pant': 'VERB', 'pcon': 'VERB',
        'ppas': 'VERB',  # past passive participle, mapped like the other participles
        'ppron12': 'PRON', 'ppron3': 'PRON', 'praet': 'VERB',
        'pred': 'VERB', 'prep': 'ADP', 'qub': 'PART', 'siebie': 'PRON',
        'subst': 'NOUN', 'winien': 'VERB', 'xxx': 'X',
        # additional tags beyond NKJP
        'part': 'PART',  # particle
        'ign': 'X', 'dig': 'NUM', 'romandig': 'NUM', 'frag': 'X',
        'pacta': 'VERB', 'numcomp': 'NUM'
    }
def Get_Case(keyword):
    '''Returns the grammatical case of the first interpretation of `keyword`.'''
    morf = morfeusz2.Morfeusz()
    list_of_morphosyntactic_forms = morf.analyse(keyword)
    element = list_of_morphosyntactic_forms[0][2][2]
    case = element.split(':')[2].split('.')[0]
    return case
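
# A minimal usage sketch for Get_Case. Morfeusz.analyse() returns items shaped
# (start_node, end_node, (orth, lemma, tag, common-name labels, qualifiers));
# the lemma and tag shown in the comments are illustrative and depend on the
# installed dictionary version.
import morfeusz2

morf = morfeusz2.Morfeusz()
for start, end, (orth, lemma, tag, posp, kwal) in morf.analyse('kotem'):
    print(orth, lemma, tag)   # e.g. kotem kot:s1 subst:sg:inst:m2
print(Get_Case('kotem'))      # picks tag field 2 of the first interpretation, e.g. 'inst'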
def count_nouns(titles: list):
    """Return a dictionary with occurrences of nouns (nominative) in article titles."""
    morf = morfeusz2.Morfeusz()
    unique_list: list = []
    for title in titles:
        sentence_analysis: list = morf.analyse(title)
        for id in range(len(sentence_analysis)):
            list_matching_words = [item for item in sentence_analysis if item[0] == id]
            unique_list.append(list_matching_words)

    words_to_check: list = []
    for unique_word in unique_list:
        for item in unique_word:
            compatible_types: bool = any(
                elem in item[-1][3]
                for elem in ['nazwisko', 'imiona', 'imię', 'nazwa_geograficzna'])
            if (('subst:sg:nom:f' in item[-1][2]
                 or 'subst:sg:nom:m' in item[-1][2]
                 or 'subst:pl:nom:f' in item[-1][2]
                 or 'subst:pl:nom:m' in item[-1][2])
                    and not (compatible_types or ':' in item[-1][1])):
                words_to_check.append(item[-1][1])

    results: dict = {}
    for word in words_to_check:
        results.setdefault(word, 0)
        results[word] += 1
    return results
def to_lemmas(words: List[str]) -> List[str]:
    w_str = " ".join(words)
    morf = morfeusz2.Morfeusz()  # (praet='composite')
    analysis = morf.analyse(w_str)
    prev = None
    result = []
    for i, j, (orth, base, tag, posp, kwal) in analysis:
        # keep only the first interpretation of each segment
        if i == prev:
            continue
        prev = i
        # strip homonym markers such as 'kot:s1' down to the bare lemma
        if ':' in base:
            result.append(re.findall('(.*):', base)[0])
        else:
            result.append(base)
    return result
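
# A quick, illustrative driver for to_lemmas; the exact lemmas returned
# depend on the installed Morfeusz dictionary.
print(to_lemmas(['Ala', 'ma', 'koty']))   # e.g. ['Ala', 'mieć', 'kot']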
def stem2(text):
    if type(text) is str:
        morf = morfeusz2.Morfeusz()
        result_words = []
        for word in text.split(' '):
            analysis = morf.analyse(word)
            for interpretation in analysis:
                result_words.append(interpretation[2][1])
        return ' '.join(result_words)
def infinitive_of_word(self, word):
    morf = morfeusz2.Morfeusz()
    analysis = morf.analyse(word)
    # All branches of the original if/elif/else reduce to taking
    # the lemma of the last interpretation.
    return analysis[-1][2][1]
def __init__(self, nlp):
    self.nlp = nlp
    try:
        self.nlp.tokenizer.morf.generate("")
    except RuntimeError:
        # Morfeusz does not have the generator dictionary loaded
        self.nlp.tokenizer.morf = morfeusz2.Morfeusz(
            expand_tags=True,
            whitespace=morfeusz2.KEEP_WHITESPACES,
            generate=True)
    self.morf = self.nlp.tokenizer.morf
def print_interpretation(df):
    morf = morfeusz2.Morfeusz()
    for line in df['text']:
        if type(line) is str:
            for word in line.split(' '):
                print("-----TEXT: ", word)
                analysis = morf.analyse(word)
                for interpretation in analysis:
                    print('-----INTERPRETATION: ', interpretation[2][1])
def get_morfeusz():
    import morfeusz2
    morf = morfeusz2.Morfeusz(
        analyse=True,        # load the analyser dictionary
        generate=False,      # don't load the generator dictionary
        expand_tags=True,    # expand tags (return tags without dots)
        aggl='isolated',     # token 'm' gets an aglt interpretation, token 'np' a brev interpretation
        praet='composite',   # aglt and 'by' are not split off
        # whitespace=morfeusz2.KEEP_WHITESPACES
    )
    return morf
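
# Usage sketch: with praet='composite', a past-tense form such as 'czytałem'
# should come back as a single segment instead of being split into 'czytał'
# plus the aglutinant 'em'. The tag in the comment is illustrative.
morf = get_morfeusz()
for start, end, (orth, lemma, tag, posp, kwal) in morf.analyse('czytałem'):
    print(orth, lemma, tag)   # e.g. czytałem czytać praet:sg:m1:pri:imperf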
def __init__(self):
    self._morf = morfeusz2.Morfeusz(
        dict_path=f'{base_dir}/third parties/morfeusz2-dictionary-polimorf',
        dict_name="polimorf")
    self._base_form_extension = None
    self.reset_base_form_extension()
    self._base_form_removals = None
    self.reset_base_form_removals()
    self._reinterpret_mapping = None
    self.reset_reinterpret_mapping()
class MorfeuszAnalyser():
    morf = morfeusz2.Morfeusz(generate=False)
    tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

    def __init__(self, ngram_range=(1, 3), split_to_sentences=True,
                 use_multiprocessing=True):
        self.ngram_range = ngram_range
        self.split_to_sentences = split_to_sentences
        self.use_multiprocessing = use_multiprocessing
        with open('polish_stopwords.txt') as f:
            self.stop_words = [x.strip() for x in f]
        self.ignore_tags = ['interp', 'interj', 'part', 'conj', 'comp', 'pred']

    def _analyse(self, text):
        analysis = [
            x for x in self.morf.analyse(text)
            if x[2][1].split(':')[0] not in self.stop_words
            and x[2][2] not in self.ignore_tags
        ]
        org_tokens = []
        lem_tokens = []
        curr_index = -1
        for word_index, _, tup in analysis:
            if curr_index == word_index:
                continue
            curr_index = word_index
            org_tokens.append(tup[0])
            lem_tokens.append(tup[1].split(':')[0])
        return lem_tokens + ngrams(org_tokens, self.ngram_range)

    def __call__(self, text):
        if self.split_to_sentences:
            sentences = self.tokenizer.tokenize(text)
            if self.use_multiprocessing:
                with Pool(cpu_count() - 1) as p:
                    sentence_tokens = p.map(self._analyse, sentences)
            else:
                sentence_tokens = [self._analyse(sentence) for sentence in sentences]
            return [token for tokens in sentence_tokens for token in tokens]
        else:
            return self._analyse(text)
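
# Because MorfeuszAnalyser is callable and returns a flat token list, it can
# plug into scikit-learn's CountVectorizer as a custom analyzer. A sketch,
# assuming scikit-learn is installed, 'polish_stopwords.txt' exists, and the
# ngrams() helper referenced above is available.
from sklearn.feature_extraction.text import CountVectorizer

analyser = MorfeuszAnalyser(ngram_range=(1, 2), use_multiprocessing=False)
vectorizer = CountVectorizer(analyzer=analyser)
X = vectorizer.fit_transform(['Ala ma kota.', 'Kot goni psa.'])
print(vectorizer.get_feature_names_out())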
def stem(text):
    if type(text) is str and ' ' in text:
        morf = morfeusz2.Morfeusz()
        result_words = []
        for word in text.split(' '):
            try:
                analysis = morf.analyse(word)
                if len(analysis) > 0:
                    result_words.append(analysis[0][2][1])
            except Exception:
                result_words.append(word)
        return ' '.join(result_words)
    else:
        return text
def preprocess_sents(sents, stop_words):
    morf = morfeusz2.Morfeusz(generate=False)
    res = []
    for sent in sents:
        analysis = morf.analyse(sent)
        # keep only the first interpretation of each segment
        brief_list = [
            next(t) for _, t in itertools.groupby(analysis, lambda x: x[0])
        ]
        words = list(
            filter(lambda x: x.isalpha() and x not in stop_words,
                   map(lambda x: x[2][1].lower(), brief_list)))
        if len(words) > 0:
            res.append(' '.join(words))
    return res
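
# Illustrative usage of preprocess_sents; the stop-word list is the caller's
# choice and the exact lemmas depend on the dictionary.
stop_words = {'być', 'mieć', 'i'}
print(preprocess_sents(['Ala ma kota.'], stop_words))   # e.g. ['ala kot']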
def main(file: str):
    morf = morfeusz2.Morfeusz()
    with open(file, 'r') as f:
        text = f.read()
    result = morf.analyse(text)
    line = []
    id = 0
    for word in result:
        if word[0] != id:
            print(", ".join(line))
            line = []
            id = word[0]
        line.append(f"{word[2][1]}:{word[2][2]}")
    # flush the interpretations of the final segment
    if line:
        print(", ".join(line))
def lemmatize(self, text):
    """Looks up lemmas of the words in the given text.

    Returns a dictionary built as `lemma: forms occurring in the text`.
    """
    morf = morfeusz2.Morfeusz(whitespace=morfeusz2.SKIP_WHITESPACES,
                              generate=False)
    analysis = morf.analyse(text)
    pairs = [
        (
            lemm[2][0],                  # form occurring in the text
            lemm[2][1].split(":")[0],    # lemma
        )
        for lemm in analysis
    ]
    lemmas = collections.defaultdict(set)
    # key by lemma so the result matches the documented scheme
    for form, lemma in pairs:
        lemmas[lemma].add(form)
    return lemmas
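
# Illustrative call; `self` is unused by lemmatize, so None stands in here.
# Assuming a typical dictionary, the result should look roughly like
# {'kot': {'Koty', 'kot'}, 'i': {'i'}}.
print(lemmatize(None, 'Koty i kot'))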
def __init__(self):
    # Morfeusz output column names (Polish): form, lemma, morphosyntactic
    # marker, proper-name classification, qualifiers
    self.column_names = [
        "Wykładnik formy", "Lemat", "Znacznik morfosyntaktyczny",
        "Klasyfikacja nazw własnych", "Kwalifikatory"
    ]
    self.VERBS_SYMBOLS = ["fin", "praet"]
    self.NOUNS_SYMBOLS = ["subst", "depr"]
    self.ADJECTIVES_SYMBOLS = ["adj", "adja", "adjp"]
    self.GRADES_OF_ADJECTIVES = {
        "equal": "pos",
        "higher": "com",
        "top": "sup"
    }
    self.MORPHOSYNTACTIC_MARKER = "Znacznik morfosyntaktyczny"
    self.morfeusz_object = morfeusz2.Morfeusz(praet='composite')
    self.nouns = []
    self.adjectives = []
    self.verbs = []
def tokenize_and_lemmatize(text):
    return_word_list = []
    next_word = 0
    try:
        for list_of_tuples in morfeusz2.Morfeusz().analyse(str(text)):
            morf_actual_word = list_of_tuples[0]
            # skip alternative interpretations of a segment already consumed
            if next_word > morf_actual_word:
                continue
            next_word = list_of_tuples[1]
            analyse_tuple = list_of_tuples[2]
            return_word_list.append(str(analyse_tuple[1]).lower())
    except Exception:
        print("Error:", text)
    return return_word_list
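
# Simple illustrative driver; the lowercased lemmas depend on the dictionary.
print(tokenize_and_lemmatize('Ala ma koty'))   # e.g. ['ala', 'mieć', 'kot']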
def parseString(ciag, pretty):
    objMorf = morfeusz2.Morfeusz()
    if pretty:
        print("How pretty")
    else:
        out = objMorf.analyse(ciag)
        print(out)
        for wyraz in out:
            print("------------------------------")
            print("Word: %s" % wyraz[2][0])
            print("Lexeme: %s" % wyraz[2][1])
            print("Notes: %s" % wyraz[2][3])
            print("Notes 2: %s" % wyraz[2][4])
            print("Morphology: %s" % wyraz[2][2])
            for el in wyraz[2][2].split(":"):
                print(el)
def prepare_objects(terms):
    morf = morfeusz2.Morfeusz()
    print(morf.dict_id())
    prepared_objects = []
    for term in terms:
        words = term['name'].split(' ')
        words_results = [morf.analyse(w) for w in words]
        prepared_words = []
        for word_result in words_results:
            info = process_word(word_result)
            prepared_result = [(w[1], w[2].split(':')[0], w[3]) for w in info]
            forms = set([r[0].split(':')[0].lower() for r in prepared_result])
            prepared_words.append(
                (forms, any([is_sufficient(t) for t in prepared_result])))
        prepared_objects.append({
            'name': term['name'],
            'keywords': prepared_words,
            'type': term['type'],
            'coords': (term['latitude'], term['longitude'])
        })
    return prepared_objects
def Decline_Noun(keyword, case):
    '''Decline a noun: return the singular form of `keyword` in the given case.'''
    result = ""
    morf = morfeusz2.Morfeusz()
    list_of_morphosyntactic_forms = morf.generate(keyword)
    # The accusative often shares its form with the nominative, which Morfeusz
    # encodes as a dotted tag such as 'subst:sg:nom.acc:...'.
    markers = {
        'nom': ["subst:sg:nom"],
        'gen': ["subst:sg:gen"],
        'dat': ["subst:sg:dat"],
        'acc': ["subst:sg:nom.acc", "subst:sg:acc"],
        'inst': ["subst:sg:inst"],
        'loc': ["subst:sg:loc"],
    }
    for form_tuple in list_of_morphosyntactic_forms:
        for element in form_tuple:
            if any(marker in element for marker in markers.get(case, [])):
                result = form_tuple[0]
    return result
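
# Usage sketch for Decline_Noun. Morfeusz.generate() yields tuples shaped like
# the analyser's interpretations, (orth, lemma, tag, labels, qualifiers); the
# forms in the comments are illustrative, and generation requires a dictionary
# built with generator data.
import morfeusz2

morf = morfeusz2.Morfeusz()
for form_tuple in morf.generate('kot'):
    print(form_tuple[0], form_tuple[2])   # e.g. kotem subst:sg:inst:m2
print(Decline_Noun('kot', 'inst'))        # e.g. 'kotem'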
# -*- coding: utf-8 -*-
import pandas as pd
import morfeusz2
import pickle

morf = morfeusz2.Morfeusz()

data = pd.read_excel("wykaz_miejscowosci.xlsx")
values = data[u'Nazwa miejscowości '].values.tolist()
values_lexical = []
for value in values:
    try:
        value = value.lower()
        lexical = morf.analyse(value)
        lst_word = -1
        name = []
        for i in lexical:
            if i[0] == lst_word:
                continue
            lst_word = i[0]
            v = i[2][1]
            v = v.split(":")[0]
            v = v.lower()
            name.append(v)
        name = " ".join(name)
        if u"góra" in name:
            print(name)
    except Exception:
        # assumed handler: the snippet was truncated here; skip values
        # Morfeusz cannot process
        continue
# encoding=utf8
import sys
from nltk.tokenize import word_tokenize
from tqdm import *
import morfeusz2
import os
import codecs
import json

if sys.platform.startswith('linux'):
    morfeusz = morfeusz2.Morfeusz()


class XmlParser:
    def get_stopwords(self):
        with open(self.stopwords_path) as f:
            words_list = f.readlines()[0]
        return words_list.split(", ")

    def __init__(self):
        self.stopwords_path = os.path.join(os.path.abspath('..'), "data", "stopwords.txt")
        self.stopwords = self.get_stopwords()
        self.special_char = "\'~*+§/\[](){}<>@=°„‚’\”&^|%_#-:;.!?,"
        self.xml_article_path = os.path.join(os.path.abspath('..'), "data", "wiki.xml")
        # self.articles_json_path = os.path.abspath('..') + '\\data\\articles.json'
* Paweł Płatek
"""
import argparse
import logging
import signal
import sys
from collections import defaultdict
from sys import exit
from typing import Callable, List, Optional, Set, Tuple

# http://morfeusz.sgjp.pl/download/
import morfeusz2  # type: ignore

# init morfeusz2 globally, because it is slow and leaks memory
morfeusz_analyser = morfeusz2.Morfeusz(whitespace=morfeusz2.KEEP_WHITESPACES)

# (start_segment, end_segment, (text_form, lemma, morphology marker, ordinariness, stylistic qualifiers))
Interpretation = Tuple[int, int, Tuple[str, str, str, List[str], List[str]]]
IsDiminutiveFunc = Callable[[str, List[Interpretation]], bool]

logging.basicConfig(format='%(message)s')
L = logging.getLogger(__name__)


def interrupt_handler(sig, frame):
    print('Exit')
    exit(0)


signal.signal(signal.SIGINT, interrupt_handler)

# http://www.ipipan.waw.pl/~wolinski/publ/znakowanie.pdf
GRAM_FLEX = defaultdict(lambda: 'nieznane', {
def __init__(self):
    self._analyzer = morfeusz2.Morfeusz()
def setUp(self):
    self.morfeusz = morfeusz2.Morfeusz()
import collections
import functools
import morfeusz2
import matplotlib.pyplot as plt

KSIĄŻKA = 'jadro_ciemnosci.txt'  # HERE(3): enter the name of the file with the book's text.

# Although the morfeusz2 library is usually used to analyse longer
# texts, we use it here only to analyse single words.
# Thanks to the praet='composite' parameter, past-tense and conditional
# forms are analysed as a single segment rather than as e.g.
# 'robił' + 'by' + 'm'.
MORFEUSZ = morfeusz2.Morfeusz(praet='composite')


def podaj_wyrazy(nazwa_pliku):
    # HERE(4): fill in according to the instructions.
    with open(nazwa_pliku, 'rt', encoding='utf-8') as plik:
        for czesc in plik.read().split():
            wyraz = czesc.strip(',.—;?!…:„”()*&-–/')
            if wyraz != '':
                yield wyraz


def wypisz_skrajne_znaki_wyrazów(nazwa_pliku):
    znaki = collections.Counter()
    for wyraz in podaj_wyrazy(nazwa_pliku):
        znaki[wyraz[0]] += 1
def diminutive_probability(word: str, interpretation: Interpretation,
                           allows_rerun: bool = True) -> float:
    """Returns probability of the word being diminutive, given its
    morphological interpretation.

    TODO: weights for sets of suffixes
    TODO: handle suffix combinations

    Args:
        word: word to check
        interpretation: one item from morfeusz2.analyse function
        allows_rerun: allows recursive calls to this function
    """
    _, _, word_morphology = interpretation
    text_form, lemma, morphology_marker, _, _ = word_morphology

    # remove homonym markers ("rozpodabniacze"), because words can have
    # completely different meanings,
    # e.g. kot:s1 == animal, kot:s2 == young soldier
    lemma = lemma.split(':')[0]

    L.debug('Probability for `%s` (%s, %s, %s)', word, text_form, lemma,
            morphology_marker)

    # find the word's part of speech
    is_noun = False
    is_adjective = False
    is_unknown = False
    # TODO: is the part of speech always listed first?
    marker = morphology_marker.split(':')[0]
    if GRAM_FLEX[marker] == 'rzeczownik':
        is_noun = True
    if GRAM_FLEX[marker] == 'przymiotnik':
        is_adjective = True
    if GRAM_FLEX[marker] == 'nieznane':
        is_unknown = True

    # sanity check
    if is_noun and is_adjective:
        L.warning('Strange, word `%s` is both noun and adjective', word)

    # results
    number_of_matches = 0
    number_of_checks = 0

    # general suffixes
    if is_noun or is_adjective or is_unknown:
        # Paweł Miczko
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_miczko_general, 'Paweł Miczko'):
            number_of_matches += 1

    # noun-only suffixes
    if is_noun:
        L.debug(' -> noun')

        # Długosz suffixes
        # find gender and grammatical number
        gender = None
        grammar_number = None
        subgender = None
        for marker_with_dots in morphology_marker.split(':'):
            for marker in marker_with_dots.split('.'):
                flex = GRAM_CATEGORY[marker]
                if flex == 'rodzaj':
                    gender = marker
                elif flex == 'liczba':
                    grammar_number = marker
                elif flex == 'przyrodzaj':
                    subgender = marker

        # suffixes valid for any gender/number
        suffixes_to_check = set()
        suffixes_to_check.update(suf_dlugosz_noun_other)

        # singular
        if grammar_number == 'sg':
            L.debug(' -> singular')
            if gender:
                # masculine
                if gender.startswith('m'):
                    L.debug(' -> masculine')
                    suffixes_to_check.update(suf_dlugosz_noun_masculine)
                # feminine
                elif gender.startswith('f'):
                    L.debug(' -> feminine')
                    suffixes_to_check.update(suf_dlugosz_noun_feminine)
                # neuter
                elif gender.startswith('n'):
                    L.debug(' -> neuter')
                    suffixes_to_check.update(suf_dlugosz_noun_neuter)
                # 'przymnogi' gender; TODO: a kind of plural? Check word or lemma?
                elif gender.startswith('p'):
                    L.debug(' -> przymnogi gender')
                    suffixes_to_check.update(
                        suf_dlugosz_noun_plural_and_plurale_tantum)
            # check lemma, as it always is plural
            number_of_checks += 1
            if has_diminutive_suffix(lemma, suffixes_to_check, 'Długosz'):
                number_of_matches += 1
        else:
            # plural
            if grammar_number:
                L.debug(' -> plural')
                suffixes_to_check.update(
                    suf_dlugosz_noun_plural_and_plurale_tantum)
            # plurale tantum
            elif subgender == 'pt':
                L.debug(' -> plurale tantum')
                suffixes_to_check.update(
                    suf_dlugosz_noun_plural_and_plurale_tantum)

            # check original word, not lemma, because lemma is singular
            number_of_checks += 1
            if has_diminutive_suffix(word, suffixes_to_check, 'Długosz'):
                number_of_matches += 1

            # run checks for the pluralized lemma
            if allows_rerun and lemma.lower() != word.lower():
                L.debug(' -> re-running checks for lemma!')
                L.debug('~*' * 5)
                number_of_checks += 1
                morf = morfeusz2.Morfeusz(
                    whitespace=morfeusz2.SKIP_WHITESPACES)
                lemma_segments = morf.analyse(lemma)
                if is_diminutive(lemma, lemma_segments, allows_rerun=False):
                    number_of_matches += 1
                L.debug('~*' * 5)

        # Grzegorczykowa and Puzynina, Dobrzyński, Kaczorowska
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_gpdk_noun, 'GPDK'):
            number_of_matches += 1

    # adjective-only suffixes
    elif is_adjective:
        L.debug(' -> adjective')

        # Grzegorczykowa
        number_of_checks += 1
        if has_diminutive_suffix(lemma, suf_grzeg_adjectives,
                                 'Grzegorczykowa'):
            number_of_matches += 1

    # we care only about nouns and adjectives
    else:
        pass

    probability = 0.0
    if number_of_checks != 0:
        probability = float(number_of_matches) / number_of_checks
    L.debug(' -> probability: %f', probability)
    return probability
        if ktext.tokens and ktext.tokens[-1].start_position == start_position \
                and ktext.tokens[-1].end_position == end_position:
            ktext.tokens[-1].add_interpretation(kinterpretation)
        else:
            ktoken = KToken(form, space_before=None, start_offset=None,
                            end_offset=None)
            ktoken.start_position = start_position
            ktoken.end_position = end_position
            ktoken.add_interpretation(kinterpretation)
            ktext.add_token(ktoken)
    return ktext


parser = ArgumentParser(description='Train')
parser.add_argument('jsonl_path', help='path to JSONL for getting text')
parser.add_argument('--dict_dir', default=None, help='path to directory with dict')
parser.add_argument('--dict_name', default=None, help='dict name')
parser.add_argument('output_path', help='path to merged JSONL')
args = parser.parse_args()

morfeusz = morfeusz2.Morfeusz(generate=False, expand_tags=True,
                              dict_name=args.dict_name,
                              dict_path=args.dict_dir)
# dict_name=None, dict_path=None
# --dict-dir /home/kwrobel/repos/poleval2020-task2/data/ --dict morfeusz-f19

with jsonlines.open(args.jsonl_path) as reader, \
        jsonlines.open(args.output_path, mode='w') as writer:
    for data in reader:
        original_ktext = KText.load(data)
        text = original_ktext.text
        ktext = morfeusz_tokenize(text, original_ktext)
        ktext.fix_offsets2()
        writer.write(ktext.save())