def get_words(txt):
    words = nltk.word_tokenize(txt)
    words_1 = []
    for w in words:
        if analyse(w)[0][0][1] is not None:
            words_1.append(analyse(w)[0][0][1])
        else:
            words_1.append(w)
    words_1 = [w for w in words_1 if len(w) > 2]
    fdist = FreqDist(words_1)
    most = fdist.most_common(5)
    most = [m[0] for m in most]
    return most
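# Usage sketch for get_words (illustrative, not part of the original module):
# it assumes the module-level imports below and that NLTK's punkt tokenizer
# data is installed; the sample sentence and resulting lemmas are examples only.
import nltk
from nltk import FreqDist
from morfeusz import analyse  # the same analyse() called inside get_words

print(get_words('Ala ma kota, a kot ma Alę i psa.'))
# prints up to five of the most frequent lemmas longer than two characters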
def test1(self):
    text = 'Mama ma.'
    interps = morfeusz.analyse(text)
    if sgjp:
        self.assertEqual(interps.pop(),
                         [(u('Mama'), u('mama'), 'subst:sg:nom:f'),
                          (u('ma'), u('mój'), 'adj:sg:voc:f:pos'),
                          (u('.'), u('.'), 'interp')])
    self.assertEqual(interps, [
        [(u('Mama'), u('mama'), 'subst:sg:nom:f'),
         (u('ma'), u('mieć'), 'fin:sg:ter:imperf'),
         (u('.'), u('.'), 'interp')],
        [(u('Mama'), u('mama'), 'subst:sg:nom:f'),
         (u('ma'), u('mój'), 'adj:sg:nom:f:pos'),
         (u('.'), u('.'), 'interp')],
    ])
def test2(self):
    text = u('Miałem miał.')
    interps = morfeusz.analyse(text, dag=True)
    self.assertEqual(interps, [
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m1:imperf'))),
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m2:imperf'))),
        (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m3:imperf'))),
        (1, 2, (u('em'), u('być'), u('aglt:sg:pri:imperf:wok'))),
        (0, 2, (u('Miałem'), u('miał'), u('subst:sg:inst:m3'))),
        (2, 3, (u('miał'), u('miał'), u('subst:sg:nom:m3'))),
        (2, 3, (u('miał'), u('miał'), u('subst:sg:acc:m3'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m1:imperf'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m2:imperf'))),
        (2, 3, (u('miał'), u('mieć'), u('praet:sg:m3:imperf'))),
        (3, 4, (u('.'), u('.'), u('interp'))),
    ])
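# Sketch (not from the original tests) of consuming the two output shapes of
# morfeusz.analyse() exercised above; the input word is illustrative.
import morfeusz

# default: a list of whole-text interpretations, each a list of (form, lemma, tag)
for interpretation in morfeusz.analyse('mam'):
    for form, lemma, tag in interpretation:
        print(form, lemma, tag)

# dag=True: a flat list of (start_node, end_node, (form, lemma, tag)) edges
for start, end, (form, lemma, tag) in morfeusz.analyse('mam', dag=True):
    print(start, end, form, lemma, tag)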
def get_word_counts(text):
    words = defaultdict(int)
    tokens = nltk.wordpunct_tokenize(text)
    for word in tokens:
        # for every word in the given text -> lemmatize & count
        word = re.sub(r'[_+=:;"\'\?/>.<,\\]', ' ', word)
        if len(word) > 1:
            #print word.encode('utf8')
            res = morfeusz.analyse(word, expand_tags=False, dag=True)  # morphological analyzer for Polish
            try:
                base = res[0][2][1]
            except IndexError:
                base = None
            #list.append(tup[1])
            if base is not None:
                words[base] += 1  # increment the lemma count
            else:
                pass
    return words
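# Illustrative call of get_word_counts (requires the same nltk, re, defaultdict
# and morfeusz imports the function relies on); the text is a made-up sample.
counts = get_word_counts('Ala ma kota. Kot ma Alę.')
for lemma, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
    print(lemma, n)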
def single_word_lemma(word):
    # analysing sometimes fails
    try:
        interpretations = morfeusz.analyse(word)
    except:
        print('Parsing failed:', word, sys.exc_info(), file=sys.stderr)
        return word

    lemmas = set([i[0][1] for i in interpretations])

    if len(interpretations) == 1 and interpretations[0][0][2] == 'ign':
        # OOD - out of dictionary: maybe we can find this word in a different database
        # cases:
        #   [inflected] acronyms
        #   inflected names like Murphy'iego
        apos_agluts = "'a 'ego 'em 'er 'o 's 'u".split()
        lemma = interpretations[0][0][1]
        for ending in apos_agluts:
            if ending in lemma:
                return lemma.replace(ending, '')

    if word in lemmas:
        # print('Leaving:', word)
        return word

    if len(lemmas) == 1:
        lemma = list(lemmas)[0]
        # print('Single candidate:', lemma)
        splitted = lemma.split(':')  # there are some flags in Morfeusz, like Polska:s2
        return splitted[0].capitalize() if word[0].isupper() else splitted[0]

    # next step: strip marker suffixes from lemmas and keep only those whose
    # capitalization matches the input word
    matching_lemmas = set(
        lemma.split(':')[0] for lemma in lemmas
        if lemma[0].isupper() == word[0].isupper())
    if len(matching_lemmas) == 1:
        return list(matching_lemmas)[0]

    # fallback
    return word
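# Illustrative call of single_word_lemma; the exact lemmas returned depend on
# the installed Morfeusz dictionary, so the tokens below are examples only.
for token in ['kotem', "Murphy'ego", 'Polsce']:
    print(token, '->', single_word_lemma(token))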
def orphaned_examples(test_word=None,
                      hashtable=None,
                      online=False,
                      complete_overwrite=False,
                      onepage_testmode=False):
    buffer_size = 20  # how many words will be printed on one page

    if online:
        active_words = fetch_active_words()  # prepare only as many pages as we need at the moment
    else:
        active_words = {'active': [], 'inactive': [], 'under_review': []}

    edit_history = read_edit_history()
    excluded_words = active_words['active'] + edit_history['added']

    with open('output/empty_sections.txt', 'r') as g:
        empty_sections = g.readlines()
    random.shuffle(empty_sections)

    if not complete_overwrite:
        excluded_words += active_words['inactive']
    else:
        excluded_words += active_words['under_review']

    if not hashtable:
        authors_hashtable = read_author_hashtable()
    else:
        authors_hashtable = hashtable

    site = pwb.Site()

    # this is a dirty trick, because morfAnalyse() and wikilink() don't
    # really work as they should. The following regex extracts the first part
    # of [[these|links]]
    re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')

    words_count = 0
    with open('input/orphans.txt') as f,\
         open('output/empty_sections.txt', 'r') as g:
        # list of pages with no examples (obtained by empty_section.py)
        orphans = f.read()

        # for testing purposes
        if test_word:
            empty_sections = [test_word]

        pages_count = 666 if onepage_testmode else 0  # loop helper
        output = []  # list-container for examples

        for input_word in empty_sections:
            if not complete_overwrite and words_count > 2 * len(active_words['active']):
                with open('output/example_queue.json', 'w') as o:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)
                    o.write(formatted_output)
                return 2

            if (pages_count == 101) or (pages_count == 667 and onepage_testmode):
                return 0

            # dealing with various list formats, e.g. *[[word]]
            input_word = input_word.strip('*[]\n')
            if len(input_word) < 4 or input_word.upper() == input_word:
                continue
            if input_word in excluded_words:
                continue

            print(input_word)

            if complete_overwrite:
                # write to file/page every N words
                if len(output) == buffer_size:
                    formatted_output = json.dumps(ordermydict(output),
                                                  ensure_ascii=False,
                                                  indent=4)
                    if online:
                        while True:
                            output_page = pwb.Page(
                                site,
                                'Wikisłownik:Dodawanie przykładów/dane/{0:03d}'.format(pages_count - 1))
                            if pages_count == 666 or output_page.userName() == 'AlkamidBot':
                                output_page.text = formatted_output
                                output_page.save(comment='Pobranie nowych przykładów z NKJP.pl')
                                break
                            else:
                                pages_count += 1
                                if pages_count == 100:
                                    return 0
                    with open('output/json_examples_{0}.json'.format(pages_count), 'w') as o:
                        o.write(formatted_output)
                    pages_count += 1
                    output = []

            if input_word[0] == '-' or input_word[-1] == '-' or input_word[0].isupper():
                continue  # let's skip prefixes and suffixes for now, also whatever starts with a capital letter

            query = '{0}**'.format(input_word).replace(' ', '** ')
            result = nkjp_lookup(query)
            root = etree.parse(result).getroot()
            #print(xml.dom.minidom.parseString(etree.tostring(root)).toprettyxml())
            #return -1

            if root.find('concordance') is not None:
                found = 0
                found_orphan = 0
                defs = get_definitions(input_word)
                if defs == 0:
                    continue

                new_word = ExampleDict()
                new_word['title'] = input_word
                new_word['fetch_time'] = str(defs[1])
                new_word['definitions'] = defs[0]

                for line in root.find('concordance').findall('line'):
                    sentence = extract_one_sentence(line, input_word)

                    # NKJP treats gerunds as verb forms; we don't, so skip hits
                    # where every reading of the match is a gerund or noun for a verb entry
                    if '\'\'czasownik' in new_word['definitions'] and\
                       all(('ger:' in analysed[2] or 'subst:' in analysed[2])
                           for analysed in morfeusz.analyse(sentence[1])[0]):
                        continue

                    if check_sentence_quality(sentence) == 0:
                        continue

                    ref = get_reference(line, authors_hashtable)
                    if ref == '':
                        break

                    if len(new_word['examples']) < 2:
                        temp_example = {'verificator': 'None',
                                        'correct_num': 'None',
                                        'good_example': False,
                                        'bad_example': False}
                        #temp_example['left'] = line.find('left').text
                        #temp_example['right'] = line.find('right').text
                        temp_example['example'] = wikitext_one_sentence(sentence, input_word)
                        temp_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                        temp_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                        temp_example['source'] = ref
                        orphan_switch = check_if_includes_orphan(sentence, orphans, edit_history['orphans'])
                        temp_example['orphan'] = orphan_switch
                        new_word['examples'].append(temp_example)
                    else:
                        found_new = 0
                        wikified_example = wikitext_one_sentence(sentence, input_word)
                        for ex_ix, ex in enumerate(new_word['examples']):
                            neworphan = check_if_includes_orphan(sentence, orphans, edit_history['orphans'])
                            if neworphan:
                                if ex['orphan']:
                                    if wikified_proportion(ex['example']) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][ex_ix]
                                        found_new = 1
                                        orphan_switch = neworphan
                                        break
                                elif not orphan_switch:
                                    new_example = new_word['examples'][ex_ix]
                                    found_new = 1
                                    break
                            else:
                                if not ex['orphan']:
                                    if wikified_proportion(ex['example']) < wikified_proportion(wikified_example):
                                        new_example = new_word['examples'][ex_ix]
                                        found_new = 1
                                        break

                        if found_new:
                            new_example['orphan'] = neworphan
                            #new_example['left'] = line.find('left').text
                            #new_example['right'] = line.find('right').text
                            new_example['example'] = wikitext_one_sentence(sentence, input_word)
                            new_example['left_extra'] = phrases_wikilink(wikilink(sentence[3]))
                            new_example['right_extra'] = phrases_wikilink(wikilink(sentence[4]))
                            new_example['source'] = ref

                if new_word and len(new_word['examples']) > 0:
                    output.append(new_word)
                    words_count += 1
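# Small demonstration (not part of the original script) of what re_base_form in
# orphaned_examples() extracts from [[these|links]]; the strings are made up.
import re

re_base_form = re.compile(r'\[\[(.*?)(?:\||\]\])')
print(re_base_form.findall('Zobacz [[kot|kota]] i [[pies]].'))  # -> ['kot', 'pies']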
# else every exception is "during handling of OSError..."
restart = True

if restart:
    # keyboard interrupts jump out here if raised in wrapper
    # re-exec this script with LD_LIBRARY_PATH pointing at the script's own
    # directory so the Morfeusz shared library can be found
    env['LD_LIBRARY_PATH'] = os.path.realpath(os.path.dirname(__file__))
    print('Restarting with LD_LIBRARY_PATH =', env['LD_LIBRARY_PATH'])
    subprocess.call([sys.executable] + sys.argv, env=env)
    exit()

parser = argparse.ArgumentParser()
parser.add_argument('--dag', action='store_true',
                    help='use dag=True with morfeusz.analyse()')
args = parser.parse_args()

# simple interactive loop: read a line, print its morphological analysis
while True:
    try:
        data = input('morfeusz> ')
    except EOFError:
        print()
        break
    except KeyboardInterrupt:
        print()
        break
    if not data:
        break
    pprint.pprint(morfeusz.analyse(data, dag=args.dag))

print('bye')