Example #1
def read_dict(file):
    # Count lemma frequencies over the corpus file, keeping only tokens whose
    # part-of-speech tag is in the whitelist and which are not stop words.
    counts = {}
    with open(file, 'r') as fin:
        for sent in fin:
            line = sent.strip().lower().translate(refer.middle_punctuation_remover)
            line = line.translate(refer.end_punctuation_remover)
            words = line.split(' ')
            for word in words:
                word, pos = morph.choose_lemma(word, -1)
                if pos in ["S", "A", "V", "NI"] and word not in refer.silly_words:
                    if word not in counts:
                        counts[word] = (1., pos)
                    else:
                        old_f, pos = counts[word]
                        counts[word] = (old_f + 1., pos)
    # Keep only frequent lemmas (at least 10 occurrences) and assign each a
    # column index; adjectives and verbs also get a negated "не ..." variant.
    n = 0
    res = {}
    for key in counts:
        if counts[key][0] >= 10.:
            res[key] = (1., n)
            n += 1
            if counts[key][1] in ["A", "V"]:
                res["не " + key] = (1., n)
                n += 1
    return res
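The mapping returned here goes from each frequent lemma (plus a negated "не ..." variant for adjectives and verbs) to a (label, column index) pair, which is exactly what Example #5 unpacks as `lbl, idx = dict[lemma]`. A minimal usage sketch; the corpus path is a placeholder, not part of the original code:

# Hypothetical usage sketch; 'corpus.txt' is a placeholder path.
feature_dict = read_dict('corpus.txt')
print(len(feature_dict))          # number of feature columns for Example #5
# Values are (label, column_index) pairs, unpacked in extract_features as
# lbl, idx = dict[lemma]; adjectives and verbs also appear as "не <lemma>".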
Example #2
def build_dictionary(text):
    # Count how often each lemma occurs in the text, skipping stop words,
    # digits and very short tokens.
    text = text.translate(end_punctuation_remover)
    counts = {}
    words = text.split(" ")
    for word in words:
        if word in silly_words or word.isdigit() or len(word) < 3:
            continue
        word, _ = morph.choose_lemma(word, -1)
        counts[word] = counts.get(word, 0) + 1
    return counts
Example #3
def build_bigrams(text):
    # Count unordered pairs of adjacent lemmas, bridging over stop words.
    text = text.translate(end_punctuation_remover)
    bigrams = {}
    words = text.split(" ")
    prev = ''
    for word in words:
        if word in silly_words:
            continue
        word, _ = morph.choose_lemma(word, -1)
        if prev != '':
            # Store each pair in a canonical (alphabetical) order.
            pair = (prev, word) if prev < word else (word, prev)
            bigrams[pair] = bigrams.get(pair, 0) + 1
        prev = word
    return bigrams
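Both counters operate on the same preprocessed text, so they are naturally used together. A short sketch under that assumption (`text` is taken to be already preprocessed, as in Example #4):

# Hypothetical usage sketch; `text` is assumed to be preprocessed already.
counts = build_dictionary(text)
bigrams = build_bigrams(text)
# Keys of `bigrams` are alphabetically ordered lemma pairs, e.g.
# ('lemma_a', 'lemma_b'), so any lookup must order the pair the same way.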
Example #4
def build_with_frequencies(model, text):
    text = preproocess_text(text)
    freqs = build_dictionary(text)
    closest_dict = get_closest_words(model, freqs, 3)
    bigrams = build_bigrams(text)
    sentences = get_sentences(text)

    if len(sentences) == 0:
        return text[:LIMIT]

    # Score each sentence by the frequencies of its words, by their closest
    # neighbours in the model and by bigram counts with the previous word.
    weights = []
    for sentence in sentences:
        weight = 0.
        words = sentence.translate(end_punctuation_remover).split(" ")
        # Skip very short sentences and sentences opening with an anaphor.
        if len(words) < 4 or find_anafor(words[:4]) is not None:
            weights.append(0)
            continue

        prev = ''
        for word in words:
            word, _ = morph.choose_lemma(word, -1)
            if freqs.get(word) is None:
                continue
            weight += freqs[word]
            # Semantically close words contribute a third of their frequency.
            closest = closest_dict.get(word)
            if closest is not None:
                for word2 in closest:
                    weight += freqs[word2] / 3.
            # Add the co-occurrence count for the pair with the previous word.
            pair = (prev, word) if prev < word else (word, prev)
            if bigrams.get(pair) is not None:
                weight += bigrams[pair]
            prev = word
        # Normalize by sentence length in characters.
        weight = weight / len(sentence)
        # weight = weight + 1. / 3. if sent_len <= text_len / 3. else weight
        weights.append(weight)

    # Slightly boost sentences from the first third of the text.
    for i in range(len(weights) // 3):
        weights[i] += 1. / 2.
    # Take the highest-weighted sentences until the length limit is reached.
    weights, sentences = zip(*sorted(zip(weights, sentences), reverse=True))
    length = 0
    result = ""
    for i in range(len(sentences)):
        result += sentences[i]
        length += len(sentences[i])
        if length >= LIMIT:
            break
    return result
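This function is effectively an extractive summarizer: sentences are weighted, then concatenated from the highest weight down until LIMIT characters are collected. A minimal call sketch; loading the model with gensim and the file names are assumptions here, since the snippet only requires whatever get_closest_words expects:

# Hypothetical usage sketch: the gensim loading line and file names are
# assumptions; only the build_with_frequencies call comes from the code above.
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
article_text = open('article.txt', encoding='utf-8').read()  # placeholder input
summary = build_with_frequencies(model, article_text)
print(summary)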
Example #5
from scipy.sparse import csr_matrix  # `refer` and `morph` are project-specific helpers

def extract_features(file, dict, extra_lines, annotated_dict=None):
    # Build a sparse bag-of-words matrix: one row per corpus line,
    # one column per dictionary lemma.
    words_num = len(dict)
    rows = []
    cols = []
    data = []
    n = 0
    with open(file, 'r') as fin_corp:
        for line in fin_corp:
            words = line.strip().translate(refer.middle_punctuation_remover).translate(refer.end_punctuation_remover).split(' ')
            # Exclamatory lines get double weight.
            weight = 2. if line.strip().endswith('!') else 1.
            line_data = [0.] * words_num
            nonzero_num = 0.
            prev = ""
            for word in words:
                lemma, pos = morph.choose_lemma(word.lower(), -1)
                # Merge "не" with a following adjective/verb into one feature.
                if prev == "не" and pos in ["A", "V"]:
                    lemma = "не " + lemma
                    #lbl *= 0.
                if dict.get(lemma) is not None:
                    #lbl, idx = dict[lemma]
                    #if not (annotated_dict is None or annotated_dict.get(lemma) is None):
                    #    lbl, _ = annotated_dict[lemma]
                    lbl, idx = dict[lemma]
                    if line_data[idx] == 0.:
                        nonzero_num += 1.
                    # Fully upper-case words count twice as much.
                    if word == word.upper():
                        line_data[idx] += weight * 2. #* lbl
                    else:
                        line_data[idx] += weight #* lbl
                prev = word
            # Normalize each row by its number of distinct features.
            for j in range(words_num):
                if line_data[j] == 0.:
                    continue
                rows.append(n)
                cols.append(j)
                data.append(line_data[j] / nonzero_num)
            n += 1
    # extra_lines reserves additional empty rows at the bottom of the matrix.
    data = csr_matrix((data, (rows, cols)), shape=(n + extra_lines, words_num))
    return data
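The returned csr_matrix can be passed directly to a scikit-learn estimator. The sketch below is an assumption about the downstream use (the classifier choice and the placeholder labels are not in the original); only extract_features and read_dict come from the snippets above:

# Hypothetical usage sketch: the classifier and placeholder labels are
# assumptions; only extract_features / read_dict come from the snippets above.
import numpy as np
from sklearn.linear_model import LogisticRegression

feature_dict = read_dict('corpus.txt')                        # see Example #1
X = extract_features('corpus.txt', feature_dict, extra_lines=0)
y = np.random.randint(0, 2, size=X.shape[0])                  # placeholder labels
clf = LogisticRegression().fit(X, y)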