Example #1
import estnltk
from estnltk.wordnet import wn


def process_frequency_list():
    freq_dict = {}
    print("Processing the frequency list")
    with open("freq_list_raw.txt", "r", encoding="UTF-8-SIG") as file:
        count = 0
        for line in file:
            # Skip lines containing unresolved characters.
            if "?" in line:
                continue
            count += 1
            print(count)
            parts = line.strip().split(" ")
            freq = int(parts[0])
            lemma = parts[1]
            word_analysis = estnltk.Text(lemma).tag_analysis()

            synsets = wn.synsets(lemma)
            # Keep only words with an allowed POS tag that are covered by WordNet.
            if is_tag_allowed(word_analysis) and len(synsets) > 0 and has_relations(synsets):
                freq_dict[lemma] = freq

    print("Building the processed frequency list")
    with open("freq_list_processed.txt", "w", encoding="UTF-8") as file:
        # Write words in descending order of frequency.
        for word in sorted(freq_dict, key=freq_dict.get, reverse=True):
            file.write(word + " " + str(freq_dict[word]) + "\n")
    freq_dict.clear()
    print("Frequency list processed")
Example #2
from estnltk import Text
from estnltk.wordnet import wn


def get_wordnet_distractors(keyword, sent):
    text = Text(sent)
    word_lower = Text(keyword).lower().text
    dataframe = text.get.word_texts.lemmas.postags.postag_descriptions.as_dict
    word_index = dataframe["word_texts"].index(word_lower)
    pos_tag = dataframe["postags"][word_index]
    pos_final = pos_mapping[pos_tag]

    print("pos ", pos_final, ", Estonian tag: ", pos_tag)
    if pos_final is not None and pos_final != "NUM":
        sets = wn.synsets(word_lower, pos=pos_final)
        print("initial synsets ", sets)
        if len(sets) == 0:
            # Fall back to the lemma when the surface form has no synsets.
            lemma = dataframe["lemmas"][word_index]
            lemma_lower = Text(lemma).lower().text
            sets = wn.synsets(lemma_lower, pos=pos_final)
            print("modified synsets ", sets)
    else:
        sets = wn.synsets(word_lower)

    distractors = []
    for syn in sets[:1]:
        # Collect co-hyponyms: siblings that share a hypernym with the synset.
        hypernyms = syn.hypernyms()
        for hypernym in hypernyms:
            for hyponym in hypernym.hyponyms():
                name = hyponym.name.split(".")[0]
                if name not in distractors:
                    distractors.append(name)

    # The keyword itself is not a valid distractor.
    if word_lower in distractors:
        distractors.remove(word_lower)
    return distractors
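get_wordnet_distractors depends on a module-level pos_mapping that is not shown. A plausible sketch, assuming estnltk morphological tags map to the WordNet POS constants, unlisted tags map to None, and numerals map to the "NUM" sentinel the function tests for, followed by a hypothetical call:

from collections import defaultdict

from estnltk.wordnet import wn

# Assumed mapping; None marks tags with no WordNet counterpart and "NUM"
# marks numerals, matching the checks in get_wordnet_distractors.
pos_mapping = defaultdict(lambda: None,
                          {"S": wn.NOUN, "V": wn.VERB,
                           "A": wn.ADJ, "D": wn.ADV, "N": "NUM"})

# Hypothetical usage; the keyword must appear in the sentence as written.
print(get_wordnet_distractors("koer", "Minu koer jookseb pargis."))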
Example #3
import estnltk
from estnltk.wordnet import wn

import dictionaries  # project-local module (not shown on this page)


def process_foreign_list():
    basic = dictionaries.get_basic_list("basic_processed.txt")
    foreign_words = {}

    count = 0
    print("Processing the list of foreign words with explanations")
    with open("foreign_meaning.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            parts = line.strip().split("\t")
            word = parts[0]

            if len(parts) > 1:
                # Keep the definition up to the first parenthesis.
                definition = parts[1].split("(")[0]
                foreign_words[word] = definition
            else:
                foreign_words[word] = None

    print("Processing the list of foreign words with keywords")
    with open("foreign_keywords.txt", "r", encoding="UTF-8") as file:
        count = 0
        for word in file:
            print(count)
            count += 1
            word = word.strip()
            if not word:
                continue
            # Skip prefixes and suffixes (entries starting or ending with "-").
            if word[0] == "-" or word[-1] == "-":
                continue
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)

            # Filter out all unnecessary words.
            if (is_tag_allowed(word_analysis) and word not in basic
                    and word not in foreign_words and has_relations(synsets)):
                foreign_words[word] = None

    print("Building the processed foreign-word list")
    with open("foreign_processed.txt", "w", encoding="UTF-8") as file:
        for word in foreign_words:
            # Keep single-word entries only.
            if len(word.split(" ")) <= 1:
                if foreign_words[word] is None:
                    file.write(word + "\n")
                else:
                    file.write(word + " " + foreign_words[word] + "\n")

    foreign_words.clear()
    print("Foreign-word list processed")
Example #4
import estnltk
from estnltk.wordnet import wn


def process_basic_list():
    basic_list = []
    print("Processing the basic vocabulary list")
    count = 0
    with open("basic_raw.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            word = line.strip()
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)

            # Keep only words with an allowed POS tag that are covered by WordNet.
            if is_tag_allowed(word_analysis) and len(synsets) > 0 and has_relations(synsets):
                basic_list.append(word)

    print("Building the processed basic vocabulary list")
    with open("basic_processed.txt", "w", encoding="UTF-8") as file:
        for word in basic_list:
            file.write(word + "\n")

    basic_list.clear()
    print("Basic vocabulary list processed")
Example #5
import itertools
from collections import defaultdict
from itertools import combinations

from estnltk.wordnet import wn


def hyper(text):
    global sent_count, hp_count
    WN_POS = {u'A', u'S', u'V', u'D'}

    words = text.words
    postags = text.postags
    lemmas_ = text.lemmas
    lemmas = []
    for lemma in lemmas_:
        # Keep only the first reading of ambiguous lemmas ("a|b" -> "a").
        if "|" in lemma:
            lemma = lemma[:lemma.index("|")]
        lemmas.append(lemma)
    lemma_pos = zip(lemmas, postags)

    # Group lemmas by part of speech, keeping only WordNet-compatible tags.
    pos2lemmas = defaultdict(set)
    for lemma, pos in lemma_pos:
        if pos in WN_POS:
            pos2lemmas[pos].add(lemma)

    # All unordered lemma pairs within each part of speech.
    pos2pairs = dict()
    for pos in pos2lemmas:
        if len(pos2lemmas[pos]) > 1:
            pos2pairs[pos] = list(combinations(pos2lemmas[pos], 2))

    # For each lemma pair, the cartesian product of their synsets.
    pos2pairs2 = {}
    for pos in pos2pairs:
        pairs_for_pos = []
        for lemma1, lemma2 in pos2pairs[pos]:
            synsets1 = wn.synsets(lemma1)
            synsets2 = wn.synsets(lemma2)
            pairs_for_pos.append(list(itertools.product(synsets1, synsets2)))
        pos2pairs2[pos] = pairs_for_pos

    # Takes the dictionary and returns three hypernym levels for each synset.
    syn_hyper = hyper_level3(pos2pairs2)
    sent_count += 1
    for key in pos2pairs:
        for idx in range(len(pos2pairs2[key])):
            value = pos2pairs2[key][idx]
            try:
                for syn1, syn2 in value:
                    if syn1 in syn_hyper[syn2]:
                        obj = {}
                        hp_count += 1
                        print("HP", syn_hyper[syn2].index(syn1) + 1)
                        print(syn_hyper[syn2])
                        lemma1, lemma2 = pos2pairs[key][idx]
                        print("WORDS", words[lemmas.index(lemma1)], words[lemmas.index(lemma2)])
                        print("LEMMAS", lemma1, lemma2)
                        print(text)
                        obj["type"] = "HP" + str(syn_hyper[syn2].index(syn1) + 1)
                        obj["lemmas"] = lemma1, lemma2
                        obj["start"] = words[lemmas.index(lemma1)]["start"], words[lemmas.index(lemma2)]["start"]
                        obj["end"] = words[lemmas.index(lemma1)]["end"], words[lemmas.index(lemma2)]["end"]
                        return obj
                    elif syn2 in syn_hyper[syn1]:
                        obj = {}
                        hp_count += 1
                        print("HP", syn_hyper[syn1].index(syn2) + 1)
                        print(syn_hyper[syn1])
                        lemma1, lemma2 = pos2pairs[key][idx]
                        print("WORDS", words[lemmas.index(lemma1)], words[lemmas.index(lemma2)])
                        print("LEMMAS", lemma2, lemma1)
                        print(text)
                        obj["type"] = "HP" + str(syn_hyper[syn1].index(syn2) + 1)
                        obj["lemmas"] = lemma2, lemma1
                        obj["start"] = words[lemmas.index(lemma1)]["start"], words[lemmas.index(lemma2)]["start"]
                        obj["end"] = words[lemmas.index(lemma1)]["end"], words[lemmas.index(lemma2)]["end"]
                        return obj

            except IndexError:
                continue

    print(sent_count)
    print("HP percentage", (hp_count / sent_count) * 100)

    return None
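hyper_level3 is not shown on this page; the translated comment says it returns three hypernym levels for each synset. A sketch under that reading, where the return shape, a dict from each synset to a flat list of its hypernyms up to three levels above it (nearest first), is an assumption inferred from the syn_hyper[syn2].index(syn1) lookups above:

def hyper_level3(pos2pairs2):
    # Hypothetical reconstruction: for every synset appearing in any pair,
    # collect its hypernyms up to three levels up, nearest level first.
    syn_hyper = {}
    for pos in pos2pairs2:
        for pair_list in pos2pairs2[pos]:
            for syn1, syn2 in pair_list:
                for syn in (syn1, syn2):
                    if syn in syn_hyper:
                        continue
                    levels, frontier = [], [syn]
                    for _ in range(3):
                        frontier = [h for s in frontier for h in s.hypernyms()]
                        levels.extend(frontier)
                    syn_hyper[syn] = levels
    return syn_hyper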
Example #6
from estnltk.wordnet import wn
from pprint import pprint

# Total number of synsets in the Estonian WordNet.
print(len(wn.all_synsets()))

# Look up synsets, optionally restricted to a part of speech.
pprint(wn.synsets("koer", pos=wn.VERB))
pprint(wn.synsets('koer'))

# Basic synset attributes.
synset = wn.synset("king.n.01")
pprint(synset.name)
pprint(synset.pos)
pprint(synset.definition())
pprint(synset.examples())

# Semantic relations of the synset.
pprint(synset.hypernyms())
pprint(synset.hyponyms())
pprint(synset.meronyms())
pprint(synset.holonyms())

# Other relations can be queried by name.
pprint(synset.get_related_synsets('fuzzynym'))

# Similarity measures between two synsets.
target_synset = wn.synset('kinnas.n.01')
pprint(synset.path_similarity(target_synset))
pprint(synset.lch_similarity(target_synset))
pprint(synset.wup_similarity(target_synset))
pprint(synset.lowest_common_hypernyms(target_synset))
Example #7
            # Check whether the word is a foreign word and whether a native
            # equivalent exists for it.
            if is_foreign(word[LEMMA]) and foreign_dict[word[LEMMA]] is not None:
                # Add the lemma to the result; the frequency lower bound is 0,
                # because this is a trusted replacement.
                add_lemma_to_result(foreign_dict[word[LEMMA]],
                                    replacement_list,
                                    similarity=0)

            # Part of speech.
            tag = word[POSTAG]
            # Check whether the part of speech is suitable.
            if tag in wn_pos:
                # Check whether the word's lemma needs simplification.
                if needs_replacing(word[LEMMA]):
                    # Find the set of full synonyms.
                    syn_sets = wn.synsets(word[LEMMA], pos=wn_pos[tag])
                    # If there are no synsets, there is nothing to analyse.
                    if len(syn_sets) > 0:
                        # Order the synsets by similarity.
                        ordered_synsets = get_best_syn_set_from_prev_and_next(
                            prev_word, next_word, syn_sets)
                        # Go through each of the ordered synsets.
                        for syn_set in ordered_synsets:
                            # If a suitable replacement has not been found yet,
                            # try to find one.
                            if needs_further_simplification(replacement_list):
                                # Look for a simpler lemma.
                                find_replacement(syn_set, replacement_list)
                # If a suitable word has already been found, stop analysing.
                else:
                    break
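This fragment also relies on a wn_pos mapping (and on helpers such as needs_replacing, get_best_syn_set_from_prev_and_next, needs_further_simplification, find_replacement and add_lemma_to_result) defined elsewhere. A plausible sketch of the mapping alone, assuming the usual estnltk tag-to-WordNet correspondence:

from estnltk.wordnet import wn

# Assumed: the morphological tags for which a WordNet lookup makes sense.
wn_pos = {"S": wn.NOUN, "V": wn.VERB, "A": wn.ADJ, "D": wn.ADV}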