def analyse_words(vocab):
    analysis_types = set()
    problem_words = set()
    word_analyses = dict()

    for word in vocab:
        e_output = estnltk.Text(word, **estnltk_options)
        # mostly accents within a word, causing estnltk to detect multiple words
        if len(e_output.analysis) > 1:
            problem_words.add(word)
            continue
        elif len(e_output.analysis) == 0:
            raise Exception(
                "no analysis returned from the analyzer for word: %s" % word)

        word_analyses[word] = list()

        for e_analysis in e_output.analysis[0]:
            analysis_string = get_analysis_string(e_analysis)
            analysis_types.add(analysis_string)
            if analysis_string not in word_analyses[word]:
                word_analyses[word].append(analysis_string)

        e_cap_output = estnltk.Text(word.capitalize(), **estnltk_options)
        if len(e_cap_output.analysis) != 1:
            raise Exception("problem in capital word analysis for word %s" %
                            word)
        for e_analysis in e_cap_output.analysis[0]:
            analysis_string = get_analysis_string(e_analysis)
            analysis_types.add(analysis_string)
            if analysis_string not in word_analyses[word]:
                word_analyses[word].append(analysis_string)

    return word_analyses, analysis_types, problem_words
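A minimal usage sketch, assuming the estnltk 1.x morphological analyzer; the module-level estnltk_options and get_analysis_string referenced above are not shown on this page, so the stand-ins below are hypothetical:

import estnltk

estnltk_options = {}  # hypothetical: extra keyword arguments for estnltk.Text

def get_analysis_string(e_analysis):
    # hypothetical helper: collapse one morphological analysis into a single string
    return "%s|%s" % (e_analysis["partofspeech"], e_analysis["form"])

word_analyses, analysis_types, problem_words = analyse_words(["koer", "maja"])
print(word_analyses)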
Example #2
def process_frequency_list():
    freq_dict = {}
    print("Töötlen sagedusloendit")
    with open("freq_list_raw.txt", "r", encoding="UTF-8-SIG") as file:
        count = 0
        for line in file:
            if "?" in line:
                continue
            count += 1
            print(count)
            parts = line.strip().split(" ")
            freq = int(parts[0])
            lemma = parts[1]
            word_analysis = estnltk.Text(lemma).tag_analysis()

            synsets = wn.synsets(lemma)
            if (is_tag_allowed(word_analysis) and len(synsets) > 0
                    and has_relations(synsets)):
                freq_dict[lemma] = freq

    print("Koostan töödeldud sagedusloendit")
    with open("freq_list_processed.txt", "w", encoding="UTF-8") as file:
        for word in sorted(freq_dict, key=freq_dict.get, reverse=True):
            file.write(word + " " + str(freq_dict[word]) + "\n")
    freq_dict.clear()
    print("Sagedusloend töödeldud")
Example #3
def analyze_title(title):
    tc = 0
    article["title"] = title
    article["scount"] += 1

    # strip markup tags
    detagged_title = re.sub("<.*?>", "", title)

    # split into words
    title_words = estnltk.Text(
        re.sub(r"[\s,.!?;:\xAB\xBB\-\"()„’́'“«»]+", " ", detagged_title))

    title_lemmas = title_words.lemmas

    for lemma in title_lemmas:
        if len(lemma) > 0:

            key = lemma.lower()
            if key not in words:
                words[key] = 0

            # title lemmas get a higher weight (5) in the global word table
            words[key] += 5
            tc += 1

    article["tlength"] = tc
    article["wcount"] += tc
Example #4
def word_based_score():
    norm_word_weights()
    for elem in article["body"]:
        content = elem["content"]
        # strip markup tags
        content = re.sub("<.*?>", "", content)
        # split into words
        content_words = estnltk.Text(
            re.sub(r"[\s,.!?;:\xAB\xBB\-\"()„’́'“«»]+", " ", content))
        # lemmatize
        content_lemmas = content_words.lemmas

        for lemma in content_lemmas:

            lemma = lemma.lower()
            if lemma in words:
                elem["wrdsc"] += words[lemma]

        if elem["wcount"] != 0:
            elem["wrdsc"] /= elem["wcount"]
        else:
            elem["wrdsc"] = 0

    # normalize the word-frequency-based sentence scores
    normalize_score("wrdsc")
Exemplo n.º 5
0
def avg_char_count_in_sentence(text) -> float:
    words = (estnltk.Text(text)).tokenize_words().words
    count_of_words = len(words)
    if count_of_words == 0:
        return 0.0
    count_of_chars = 0
    for w in words:
        single_word = str(w["text"])
        # count characters only for purely alphabetic tokens
        if single_word.isalpha():
            count_of_chars += len(single_word)
    return count_of_chars / count_of_words
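A quick usage example; note that punctuation tokens (if the tokenizer emits them separately) still count towards count_of_words, only their characters are skipped:

print(avg_char_count_in_sentence("Koer jookseb pargis."))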
Example #6
def order_by_similarity(processed_list, original_word_lemma):
    if len(processed_list) < 2:
        return processed_list

    # score each candidate by the similarity of its lemma to the original word's lemma
    temp_dict = {}
    for temp_word in processed_list:
        temp_dict[temp_word] = get_word_similarity(
            original_word_lemma,
            estnltk.Text(temp_word).lemmas[0])
    return sorted(temp_dict, key=temp_dict.get, reverse=True)
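get_word_similarity is another helper not shown here; one plausible sketch, assuming a pre-trained Estonian word2vec model loaded with gensim (the model file name is hypothetical):

from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format("lemmas.cbow.s100.w2v.bin", binary=True)

def get_word_similarity(lemma_a, lemma_b, default=-1):
    # hypothetical: cosine similarity between the two lemma vectors,
    # falling back to a default when either lemma is out of vocabulary
    try:
        return w2v_model.similarity(lemma_a, lemma_b)
    except KeyError:
        return default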
Example #7
def getParahgraphsDetails(inputText):
    document = estnltk.Text(inputText)
    total_parahs = len(document.paragraph_texts)
    lengths = []
    for parah in document.paragraph_texts:
        words = text.text_to_word_sequence(
            parah,
            filters=' !#$%&()*+,-./:;<=>?@[\\]^_{|}~\t\n"',
            lower=True,
            split=" ")
        lengths.append(len(words))
    lengths = np.array(lengths)
    average_length = lengths.mean()
    return [total_parahs, average_length]
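This snippet presumably relies on numpy and the Keras text utilities for text.text_to_word_sequence; a minimal usage sketch under that assumption:

import numpy as np
from keras.preprocessing import text

total_paragraphs, avg_words = getParahgraphsDetails("Esimene lõik.\n\nTeine lõik on veidi pikem.")
print(total_paragraphs, avg_words)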
Example #8
def process_foreign_list():
    basic = dictionaries.get_basic_list("basic_processed.txt")
    foreign_words = {}

    count = 0
    print("Töötlen selgitustega võõrsõnade loendit")
    with open("foreign_meaning.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            parts = line.strip().split("\t")
            word = parts[0]

            if len(parts) > 1:
                definition = parts[1].split("(")[0]
                foreign_words[word] = definition
            else:
                foreign_words[word] = None

    print("Töötlen märksõnadega võõrsõnade loendit")
    with open("foreign_keywords.txt", "r", encoding="UTF-8") as file:
        count = 0
        for word in file:
            print(count)
            count += 1
            word = word.strip()
            # skip prefixes/suffixes written with a leading or trailing hyphen
            if not word or word[0] == "-" or word[-1] == "-":
                continue
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)

            # filter out all unnecessary words
            if (is_tag_allowed(word_analysis) and word not in basic
                    and word not in foreign_words and has_relations(synsets)):
                foreign_words[word] = None

    print("Koostan töödeldud võõrsõnade loendit")
    with open("foreign_processed.txt", "w", encoding="UTF-8") as file:
        for word in foreign_words:
            # keep only single-word entries
            if len(word.split(" ")) <= 1:
                if foreign_words[word] is None:
                    file.write(word + "\n")
                else:
                    file.write(word + " " + foreign_words[word] + "\n")

    foreign_words.clear()
    print("Võõrsõnade loend töödeldud")
Example #9
def get_most_similar(word_list, original_lemma):
    if len(word_list) == 1:
        return word_list[0]
    # word2vec does not handle inflected word forms, so this is not optimal
    best_similarity = -1
    best_lemma = None
    for word in word_list:
        word_analysis = estnltk.Text(word).analysis[0][0]
        similarity = get_word_similarity(original_lemma,
                                         word_analysis[LEMMA],
                                         default=0)
        if similarity > best_similarity:
            best_lemma = word
            best_similarity = similarity
    return best_lemma
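LEMMA looks like the analysis-key constant from estnltk 1.x (from estnltk.names import LEMMA, i.e. the string "lemma"); a usage sketch under that assumption, reusing the hypothetical get_word_similarity sketched above:

from estnltk.names import LEMMA

print(get_most_similar(["koertele", "kassile", "hobusele"], "koer"))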
Example #10
def process_basic_list():
    basic_list = []
    print("Töötlen põhisõnavara loendit")
    count = 0
    with open("basic_raw.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            word = line.strip()
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)

            if (is_tag_allowed(word_analysis) and len(synsets) > 0
                    and has_relations(synsets)):
                basic_list.append(word)

    print("Koostan töödeldud põhisõnavara loendit")
    with open("basic_processed.txt", "w", encoding="UTF-8") as file:
        for word in basic_list:
            file.write(word + "\n")

    basic_list.clear()
    print("Põhisõnavara loend töödeldud")
Example #11
def analyze_line(line):
    global sentence
    global parnr
    global divnr

    if bool(re.search("<p>", line)):
        article["pcount"] += 1
        parnr += 1
        return

    if re.search("</p>", line) or re.search("</div", line):
        return

    # sentence information
    sentence = {
        "wcount": 0,
        "parnr": 0,
        "divnr": 0,
        "subhead": 0,
        "bibl": 0,
        "caption": 0,
        "possc": 0,
        "forsc": 0,
        "wrdsc": 0,
        "score": 0,
        "content": ""
    }

    if bool(re.search("<div", line)):

        if len(article["body"]) != 0:
            article["divcount"] += 1
            divnr += 1
        sentence["subhead"] = 1
        article["pcount"] += 1
        parnr += 1

        # strip markup
        line = re.sub("<.*?>", "", line)

    if bool(re.search("<bibl>", line)):
        sentence["bibl"] = 1

    if bool(re.search("Pildi allkiri", line)):
        sentence["caption"] = 1

    sentence["content"] = line
    sentence["parnr"] = parnr
    sentence["divnr"] = divnr
    # strip markup tags
    line = re.sub("<.*?>", "", line)
    # split into words
    line_words = estnltk.Text(
        re.sub(r"[\s,.!?;:\xAB\xBB\-\"()„’́'“«»]+", " ", line))

    line_lemmas = line_words.lemmas

    sentence["wcount"] += len(line_lemmas)
    # lemmas from subheadings get double weight
    weight = 1
    if sentence["subhead"] > 0:
        weight += 1

    for lemma in line_lemmas:
        lemma = lemma.lower()
        if lemma.isalpha():

            if lemma not in words:
                words[lemma] = 0

            words[lemma] += weight

    article["body"].append(sentence)
    article["wcount"] += sentence["wcount"]