# POS tags treated as function words / non-content tokens: interpretations
# with these tags carry no topical meaning and are dropped from the output.
# (The original inline list contained 'interj' twice; deduplicated here and
# hoisted to a frozenset for O(1) membership tests.)
_SKIP_POS = frozenset({
    'interj', 'conj', 'part', 'siebie', 'fin', 'bedzie', 'aglt', 'impt',
    'imps', 'inf', 'winien', 'pred', 'comp', 'interp',
})


def prepare_text(text: str, morf: 'morfeusz2.Morfeusz') -> List[Set[str]]:
    """Turn *text* into one set of candidate lemmas per input token.

    Each element of the returned list corresponds to one start node in the
    Morfeusz analysis DAG and contains the lowercased, alphanumeric-only
    base forms of every interpretation at that position.  Tokens whose POS
    tag is in ``_SKIP_POS`` (conjunctions, particles, punctuation, ...) are
    skipped entirely; empty sets are never emitted.

    Args:
        text: raw input text to analyse.
        morf: a ``morfeusz2.Morfeusz`` analyser (annotation kept as a
            forward-reference string so ``morfeusz2`` need not be imported
            at function-definition time).

    Returns:
        List of non-empty lemma sets, in token order.
    """
    pos = 0
    sets: List[Set[str]] = []
    current_set: Set[str] = set()
    for morf_tuple in morf.analyse(text):
        interp = morf_tuple[2]  # (wordform, lemma, tags, ...)
        part_of_speech = interp[2].split(':')[0]
        if part_of_speech in _SKIP_POS:
            continue
        # A new start node means we moved on to the next token: flush the
        # lemma set accumulated for the previous one.
        if morf_tuple[0] != pos:
            if current_set:
                sets.append(current_set)
            current_set = set()
            pos = morf_tuple[0]
        lemma = interp[1].split(':')[0]
        if part_of_speech == 'ign':
            # Unknown word: Morfeusz supplies no lemma, fall back to the
            # rule-based stemmer defined elsewhere in this project.
            lemma = stem(lemma)
        lemma = ''.join(c for c in lemma if c.isalnum())
        if lemma:
            current_set.add(lemma.lower())
    # BUG FIX: the original returned without flushing the last accumulated
    # set, so the final token of every text was silently dropped.
    if current_set:
        sets.append(current_set)
    return sets
def answer_sentiment_score(text: str, sentiment_dict: Dict[str, int], morf: morfeusz2.Morfeusz, verbose=False):
    """Compute the sentiment score of an answer *text*.

    Runs the Morfeusz morphological analysis, reduces it to a flat lemma
    list via ``lemmas_list`` and delegates the actual scoring against
    *sentiment_dict* to ``sentiment_score``.
    """
    morph_analysis = morf.analyse(text)
    return sentiment_score(lemmas_list(morph_analysis), sentiment_dict, verbose)
class MorfeuszLemmatizer(object):
    """Lemmatizer backed by the Morfeusz 2 morphological analyser."""

    def __init__(self):
        """Create the underlying Morfeusz analyser instance."""
        self.morf = Morfeusz()

    def lemmatize(self, form):
        """Return the base form of *form* according to Morfeusz.

        Only the first interpretation is consulted; returns ``None``
        when the analyser yields no interpretation at all.
        """
        first = next(iter(self.morf.analyse(form)), None)
        if first is None:
            return None
        _begin, _end, (_wordform, baseform, _tags, _commonness, _qualifiers) = first
        return baseform
def process_request(params):
    """Handle one Morfeusz service request described by *params*.

    Parses the request options, runs the requested action ('analyze' or
    'generate') against a freshly-configured Morfeusz instance and returns
    a JSON-serialisable response dict: ``results`` plus version and
    dictionary metadata.  If option validation fails, ``response`` is
    returned as populated by ``validate`` (presumably with error details;
    confirm against MorfeuszOptionParser).
    """
    option_parser = MorfeuszOptionParser(params)
    # Map the request's camelCase option names onto Morfeusz constructor
    # keyword arguments, with type/enum validation handled by the parser.
    option_parser.parse_bool('expandDag', 'expand_dag')
    option_parser.parse_bool('expandTags', 'expand_tags')
    option_parser.parse_bool('expandDot', 'expand_dot')
    option_parser.parse_bool('expandUnderscore', 'expand_underscore')
    option_parser.parse_string('agglutinationRules', 'aggl', AGGLUTINATION_RULES)
    option_parser.parse_string('pastTenseSegmentation', 'praet', PAST_TENSE_SEGMENTATION)
    option_parser.parse_enum('tokenNumbering', 'separate_numbering', TokenNumbering, TokenNumbering.separate)
    option_parser.parse_enum('caseHandling', 'case_handling', CaseHandling)
    option_parser.parse_enum('whitespaceHandling', 'whitespace', WhitespaceHandling)
    option_parser.parse_actions('action')
    results = []
    # `results` is aliased inside `response`, so appending below mutates the
    # response in place.
    response = {'results': results}
    if option_parser.validate(response):
        option_parser.set_dictionary_path('MORFEUSZ_DICT_PATH')
        morfeusz = Morfeusz(**option_parser.get_opts())
        if option_parser.action == 'analyze':
            # analyse() yields either interpretation lists (grouped per
            # token) or single interpretations, depending on DAG expansion.
            for interp_list in morfeusz.analyse(option_parser.text):
                if isinstance(interp_list, list):
                    subitem = []
                    results.append(subitem)
                    for item in interp_list:
                        subitem.append(tag_items(item))
                else:
                    results.append(tag_items(interp_list))
        elif option_parser.action == 'generate':
            # One result sub-list per requested title (base form).
            for title in option_parser.titles:
                subitem = []
                results.append(subitem)
                for interp_list in morfeusz.generate(title):
                    subitem.append(tag_items(interp_list))
        # Metadata uses the local `morfeusz`, so it is only attached when
        # validation succeeded and the instance exists.
        response['version'] = morfeusz2.__version__
        response['dictionaryId'] = morfeusz.dict_id()
        response['copyright'] = morfeusz.dict_copyright()
    return response
#! /usr/bin/python
# *-* coding: utf-8 *-*
# Smoke-test script: analyse a sentence with Morfeusz 2 and disambiguate
# the resulting DAG with a locally-started Concraft-pl server.
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# BUG FIX: `server` used to be assigned only inside the try block, so a
# failure in Morfeusz()/Server() construction made the finally clause raise
# NameError on `server.terminate()`, masking the original exception.
server = None
try:
    morfeusz = Morfeusz(expand_tags=True)
    server = Server(model_path="/home/kuba/work/ipipan/concraft/pre-trained/Sep-18/model-04-09-2018.gz", port=3001)
    concraft = Concraft(port=3001)
    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    res = concraft.disamb(dag)
    print(res)
    dag = morfeusz.analyse(u'W Szczebrzeszynie chrząszcz brzmi w trzcinie.')
    dag_str = concraft.dag_to_str(dag)
    dag_disamb_str = concraft.disamb_str(dag_str)
    print(dag_disamb_str)
finally:
    # Only shut the server down if it actually started.
    if server is not None:
        server.terminate()
r'<([a-z]*-[a-z]*)(\s+[a-z]*="\w*((\s+\w*)+)?")?>(.+?)<\/[a-z]*-[a-z]*>', letter_contents) # print('one word tags') # for tag in tags_word: # print(tag) # print('\n') # print('two word tags') # for tag in tags_words: # print(tag) # print('\n') def remove_dashes(text): tmp_str = '' for letter in text: if letter != '-': tmp_str = tmp_str + letter return tmp_str letter1_no_tags = remove_tags(letters[0].contents) letter1_nt_str = ' '.join(letter1_no_tags) # letter_ntnd_str = remove_dashes(letter1_nt_str.decode('utf-8')) # print(letter_ntnd_str) morf = Morfeusz() # print(morf) print(letter1_nt_str.decode('utf8')) letter1_analysed = morf.analyse(letter1_nt_str) print(letter1_analysed)
if 'generated_with_lsa.txt' in listdir('.'): remove('generated_with_lsa.txt') f = codecs.open(corpus_filename, encoding='utf-8') # main cycle for raw_line in f: sentence = re.split('\W+', raw_line.lower(), flags=re.UNICODE) ngrams2file = [] for ngram_tuple in ngrams( sentence, min(len(sentence[:-1]), max_n) ): # [:-1] because sentence has empty word as the last element ngram = list(ngram_tuple) ngrams2file.append(ngram) # initial form of ngram for c in xrange(0, len(ngram)): # step by word in ngram w_desc = morph.analyse(ngram[c]) if len(w_desc) > 0: init_form = w_desc[0][2][1].split(':')[0] #print 'init_form: ', type(init_form) try: index = dictionary.index(init_form) # + u'\n' #print 'index: ', index syns = find_syns(init_form, index, w_desc[0][2][2].split(':')) #print 'syns: ' for syn in syns: ngram2file = ngram[:] #print(syn) ngram2file[c] = syn ngrams2file.append(ngram2file) #u' '.join(ngram2file) #print 'n ', ngrams2file[-1]