def read_dict(file):
    """Read a corpus file and build a pruned lemma-index dictionary.

    First pass counts lemma frequencies over the whole file (keeping only
    nouns/adjectives/verbs/NI and skipping stop words); second pass keeps
    only lemmas seen at least 10 times and assigns each a fresh sequential
    index.  Adjectives and verbs additionally get a negated entry
    ("не " + lemma) with its own index.

    Returns a dict: lemma -> (1.0, index).
    """
    counts = {}
    # Context manager guarantees the file is closed even on exceptions
    # (the original leaked the handle if an error occurred mid-read).
    with open(file, 'r') as fin:
        for sent in fin:
            line = sent.strip().lower().translate(refer.middle_punctuation_remover)
            line = line.translate(refer.end_punctuation_remover)
            for word in line.split(' '):
                lemma, pos = morph.choose_lemma(word, -1)
                # Keep only content words; drop stop words.
                if pos in ("S", "A", "V", "NI") and lemma not in refer.silly_words:
                    # Preserve the POS recorded at first sighting.
                    old_f, first_pos = counts.get(lemma, (0., pos))
                    counts[lemma] = (old_f + 1., first_pos)
    res = {}
    n = 0
    for key, (freq, pos) in counts.items():
        # Frequency threshold: drop rare lemmas.
        if freq >= 10.:
            res[key] = (1., n)
            n += 1
            if pos in ("A", "V"):
                # Negated variant for adjectives/verbs.
                res["не " + key] = (1., n)
                n += 1
    return res
def build_dictionary(text):
    """Count lemma frequencies in *text*.

    Words shorter than 3 characters, pure digits and stop words are
    filtered out on the surface form, before lemmatization.

    Returns a dict: lemma -> occurrence count.
    """
    text = text.translate(end_punctuation_remover)
    # Renamed from `dict` — don't shadow the builtin.
    freq = {}
    for word in text.split(" "):
        # Filter on the surface form, before lemmatization.
        if word in silly_words or word.isdigit() or len(word) < 3:
            continue
        lemma, _ = morph.choose_lemma(word, -1)
        freq[lemma] = freq.get(lemma, 0) + 1
    return freq
def build_bigrams(text):
    """Count unordered lemma bigrams over adjacent non-stop words.

    Each pair is stored in sorted order so (a, b) and (b, a) collapse
    into one key.  Stop words are skipped without breaking the chain,
    so a pair may bridge over a stop word.

    Returns a dict: (lemma_a, lemma_b) -> count.

    Bug fix: in the original, ``prev = word`` sat at the end of the loop
    body, but the very first iteration hit ``prev == ''`` and
    ``continue``d past it — ``prev`` therefore stayed ``''`` forever and
    the function always returned an empty dict.
    """
    text = text.translate(end_punctuation_remover)
    bigrams = {}
    prev = ''
    for word in text.split(" "):
        if word in silly_words:
            # Skip stop words without forming a pair (and without
            # resetting the chain).
            continue
        lemma, _ = morph.choose_lemma(word, -1)
        if prev != '':
            # Canonical (sorted) order so direction doesn't matter.
            pair = (prev, lemma) if prev < lemma else (lemma, prev)
            bigrams[pair] = bigrams.get(pair, 0) + 1
        prev = lemma
    return bigrams
def build_with_frequencies(model, text):
    """Extractive summarizer: score sentences by word/bigram frequency.

    Each sentence is scored by the frequencies of its lemmas, of their
    semantically closest words (weighted 1/3) and of its lemma bigrams,
    normalized by sentence length in characters.  Sentences in the first
    third of the text get a +0.5 positional bonus.  The top-scoring
    sentences are concatenated until LIMIT characters are reached.

    Bug fix: ``prev`` was never updated inside the word loop, so the
    bigram lookup always probed ``('', word)`` and bigram frequencies
    never contributed to the score.
    """
    text = preproocess_text(text)
    # Renamed from `dict` — don't shadow the builtin.
    freq = build_dictionary(text)
    closest_dict = get_closest_words(model, freq, 3)
    bigrams = build_bigrams(text)
    sentences = get_sentences(text)
    if len(sentences) == 0:
        # No sentence boundaries found — fall back to a raw prefix.
        return text[:LIMIT]
    weights = []
    for sentence in sentences:
        weight = 0.
        words = sentence.translate(end_punctuation_remover).split(" ")
        # Skip very short sentences and those opening with an anaphor
        # (they would read poorly out of context).
        if len(words) < 4 or find_anafor(words[:4]) is not None:
            weights.append(0)
            continue
        prev = ''
        for word in words:
            word, _ = morph.choose_lemma(word, -1)
            if freq.get(word) is None:
                continue
            weight += freq[word]
            closest = closest_dict.get(word)
            if closest is not None:
                # Semantically close words contribute at 1/3 weight.
                for word2 in closest:
                    weight += freq[word2] / 3.
            pair = (prev, word) if prev < word else (word, prev)
            if bigrams.get(pair) is not None:
                weight += bigrams[pair]
            prev = word  # bug fix: track previous lemma for bigram lookup
        # Normalize by sentence length in characters.
        weight = weight * 1. / len(sentence)
        weights.append(weight)
    # Positional bonus for the first third of the sentences.
    for i in range(len(weights) // 3):
        weights[i] += 1. / 2.
    # Sort sentences by descending weight (ties fall back to string order).
    weights, sentences = zip(*sorted(zip(weights, sentences), reverse=True))
    length = 0
    result = ""
    for i in range(len(sentences)):
        result += sentences[i]
        length += len(sentences[i])
        if length >= LIMIT:
            break
    return result
def extract_features(file, dict, extra_lines, annotated_dict=None):
    """Build a sparse bag-of-words feature matrix from a corpus file.

    Each line of *file* becomes one row; columns are the lemma indices
    from *dict* (lemma -> (label, index)).  Cell values are occurrence
    weights — doubled for ALL-CAPS words and doubled again for lines
    ending in '!' — normalized by the number of distinct lemmas in the
    line.  A "не" particle preceding an adjective/verb folds into a
    negated lemma ("не " + lemma), matching the dedicated negation
    entries produced by read_dict.  *annotated_dict* is currently
    unused (kept for interface compatibility).

    Returns a scipy.sparse.csr_matrix of shape
    (lines_read + extra_lines, len(dict)).

    Bug fixes: (1) the exclamation check ran on the raw line, which
    still carried its trailing newline, so ``endswith('!')`` never
    matched — it now uses the stripped line; (2) ``prev`` kept the
    original case, so a capitalized "Не" was never recognized — it is
    now lowercased.  Also replaced the dense per-line buffer of size
    len(dict) with a small dict (O(nonzeros) instead of O(lines*vocab)).
    """
    words_num = len(dict)
    rows = []
    cols = []
    data = []
    n = 0
    # Context manager guarantees the file is closed on all paths.
    with open(file, 'r') as fin_corp:
        for raw in fin_corp:
            stripped = raw.strip()
            words = stripped.translate(refer.middle_punctuation_remover) \
                            .translate(refer.end_punctuation_remover).split(' ')
            # Exclamation lines are weighted double (check the stripped
            # line — the raw one ends with '\n').
            weight = 2. if stripped.endswith('!') else 1.
            line_counts = {}  # column index -> accumulated weight
            prev = ""
            for word in words:
                lemma, pos = morph.choose_lemma(word.lower(), -1)
                if prev == "не" and pos in ["A", "V"]:
                    # Fold the negation particle into the lemma.
                    lemma = "не " + lemma
                if dict.get(lemma) is not None:
                    lbl, idx = dict[lemma]
                    # ALL-CAPS words count double.
                    w = weight * 2. if word == word.upper() else weight
                    line_counts[idx] = line_counts.get(idx, 0.) + w
                prev = word.lower()  # lowercase so "Не" is also detected
            # Normalize by the number of distinct lemmas in the line.
            unzero_num = float(len(line_counts))
            for idx, value in line_counts.items():
                rows.append(n)
                cols.append(idx)
                data.append(value / unzero_num)
            n += 1
    return csr_matrix((data, (rows, cols)), shape=(n + extra_lines, words_num))