def __init__(self):
    """Initialise the lemmatizer and make sure the shared analyser exists.

    The Morfeusz instance is kept on the class attribute
    ``PolishLemmatizer._morph`` and is only constructed while that
    attribute is still ``None``.

    Raises:
        ImportError: if ``morfeusz2`` cannot be imported.
    """
    super(PolishLemmatizer, self).__init__()

    # Imported here rather than at module level so the dependency is only
    # needed once a lemmatizer is actually instantiated.
    try:
        from morfeusz2 import Morfeusz
    except ImportError:
        missing_dependency = (
            'The Polish lemmatizer requires the morfeusz2-python library')
        raise ImportError(missing_dependency)

    owner = PolishLemmatizer
    if owner._morph is not None:
        # An analyser is already stored on the class; nothing to build.
        return
    owner._morph = Morfeusz(dict_name='polimorf')
def process_request(params):
    """Parse request options from ``params`` and run the requested action.

    The options are read through ``MorfeuszOptionParser``. When the parser
    validates successfully, a ``Morfeusz`` instance is built from the parsed
    options and either ``analyse`` (action ``'analyze'``) or ``generate``
    (action ``'generate'``) is run, with every interpretation passed
    through ``tag_items``.

    Returns:
        A dict with a ``'results'`` list. After successful validation it
        also carries ``'version'``, ``'dictionaryId'`` and ``'copyright'``.
        When validation fails the dict is returned as ``validate`` left it
        (presumably with error details added by the parser -- confirm
        against ``MorfeuszOptionParser.validate``).
    """
    parser = MorfeuszOptionParser(params)

    # Request key -> Morfeusz option name, for the plain boolean switches.
    boolean_options = (
        ('expandDag', 'expand_dag'),
        ('expandTags', 'expand_tags'),
        ('expandDot', 'expand_dot'),
        ('expandUnderscore', 'expand_underscore'),
    )
    for request_key, option_name in boolean_options:
        parser.parse_bool(request_key, option_name)

    parser.parse_string('agglutinationRules', 'aggl', AGGLUTINATION_RULES)
    parser.parse_string('pastTenseSegmentation', 'praet',
                        PAST_TENSE_SEGMENTATION)

    parser.parse_enum('tokenNumbering', 'separate_numbering',
                      TokenNumbering, TokenNumbering.separate)
    parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    parser.parse_enum('whitespaceHandling', 'whitespace', WhitespaceHandling)

    parser.parse_actions('action')

    results = []
    response = {'results': results}

    if not parser.validate(response):
        return response

    parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
    morfeusz = Morfeusz(**parser.get_opts())

    if parser.action == 'analyze':
        for entry in morfeusz.analyse(parser.text):
            if isinstance(entry, list):
                # A list entry becomes a nested list of tagged items.
                results.append([tag_items(interp) for interp in entry])
            else:
                results.append(tag_items(entry))
    elif parser.action == 'generate':
        # One nested list of generated forms per requested title.
        for title in parser.titles:
            results.append(
                [tag_items(interp) for interp in morfeusz.generate(title)])

    response.update({
        'version': morfeusz2.__version__,
        'dictionaryId': morfeusz.dict_id(),
        'copyright': morfeusz.dict_copyright(),
    })
    return response
from sys import argv, exit #other imports

# Corpus file to process. An optional first command-line argument overrides
# the default, but only if it names an entry of the current directory.
corpus_filename = 'pl.txt'
try:
    filename = argv[1]
except IndexError:
    # No argument supplied: keep the default corpus file.
    pass
else:
    if filename not in listdir('.'):
        print('File %s not found in the current directory' % filename)
        exit(-1)
    corpus_filename = filename

exclude = string.digits  # unicode(string.digits)
# morph = Morfeusz()


def lemm(line):
    """Return one normalised form per analysable token of ``line``.

    The line is lower-cased and split on runs of digits, runs of non-word
    characters and underscores. Each non-empty token is passed to
    ``morph.analyse``; tokens for which the result is empty are dropped.
    For the rest, element ``[0][2][1]`` of the result is taken and cut at
    the first ``':'``.

    NOTE(review): per the Morfeusz 2 result layout this should be the lemma
    of the first interpretation with its homonym suffix stripped -- confirm.
    ``morph`` is expected to be a module-level analyser defined elsewhere
    (its creation is commented out above).

    Args:
        line: text of a single line.

    Returns:
        List of strings, in token order.
    """
    # Raw string: '\d' and '\W' are invalid escapes in a normal literal.
    tokens = re.split(r'\d+|\W+|_', line.lower(), flags=re.UNICODE)

    norm_sentence = []
    # Iterate the tokens directly; the previous xrange() index loop does not
    # exist on Python 3.
    for token in tokens:
        if token == u'':
            continue
        w_desc = morph.analyse(token)
        if not w_desc:
            continue
        norm_sentence.append(w_desc[0][2][1].split(':')[0])
    return norm_sentence
#! /usr/bin/python
# *-* coding: utf-8 *-*
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Demo: analyse one sentence with Morfeusz and disambiguate the result with
# Concraft, first through the object API and then through the string API.

sentence = u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.'

# Both objects are created BEFORE the try block. Previously they were created
# inside it, so a failure in either constructor reached
# `finally: server.terminate()` with `server` unbound, and the resulting
# NameError hid the real error.
morfeusz = Morfeusz(expand_tags=True)
server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
try:
    concraft = Concraft(port=3001)

    # Object API: pass the analysis result straight to disamb().
    dag = morfeusz.analyse(sentence)
    res = concraft.disamb(dag)
    print(res)

    # String API: convert the analysis to a string, then disambiguate that.
    dag = morfeusz.analyse(sentence)
    dag_str = concraft.dag_to_str(dag)
    dag_disamb_str = concraft.disamb_str(dag_str)
    print(dag_disamb_str)
finally:
    # Always terminate the server, even if the demo above fails.
    server.terminate()
import pathlib, random, re, sys from typing import Callable, Optional from morfeusz2 import Morfeusz from wordnet import query morfeusz = Morfeusz(analyse=False) DATASETS = ["new"] DICT_LINES = {} DICT_FUNCTIONS = {} THESAURUS = {} # Words from the thesaurus containing these tags will be ignored: BLACKLISTED_TAGS = [ "(bardzo potocznie)", "(potocznie)", "(częściej, ale wg niektórych niepoprawnie)", "(eufemistycznie)", # :( #"(nieco potocznie)", # Eh, it's fine "(obraźliwe)", "(obraźliwie)", #"(pieszczotliwie)", "(pogardliwie)", "(potoczne)", "(potocznie)", "(przestarzale)", "(ptoocznie)", "(regionalnie)", # Contains some inappropriate words "(rzadko, wg niektórych niepoprawnie)",
def __init__(self):
    """Create a ``Morfeusz`` instance (no arguments) and keep it as ``self.morf``."""
    self.morf = Morfeusz()
r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>', letter_contents) # print('one word tags') # for tag in tags_word: # print(tag) # print('\n') # print('two word tags') # for tag in tags_words: # print(tag) # print('\n') def remove_dashes(text): tmp_str = '' for letter in text: if letter != '-': tmp_str = tmp_str + letter return tmp_str letter1_no_tags = remove_tags(letters[0].contents) letter1_nt_str = ' '.join(letter1_no_tags) # letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8')) # print(letter_ntnd_str) morf = Morfeusz() # print(morf) print(letter1_nt_str.decode('utf8')) letter1_analysed = morf.analyse(letter1_nt_str) print(letter1_analysed)