def get_score(text):
    """
    Share of the words of the text that stand before its first comma.

    :param text: string;
    :return: 0 when ``get_words`` finds no words in the text, otherwise
        the number of words before the first comma divided by the number
        of words in the whole text.
    """
    word_count = len(get_words(text))
    if not word_count:
        return 0
    # Everything up to (not including) the first comma; the whole text
    # when there is no comma at all.
    head = text.partition(',')[0]
    leading = len(get_words(head))
    return leading / max(1, word_count)
def get_score(text):
    """
    Largest distance between two sense nodes of a single word of the text.

    For every word, the ids of the senses of all its normal forms tagged
    'ADJF' by ``get_pos_tag`` are collected (the leading character of each
    id is dropped and the rest is read as an int).  ``get_dist`` is then
    queried for every unordered pair of nodes of that word.

    :param text: string;
    :return: the largest distance reported by ``get_dist``; 0 when no pair
        of nodes was found.
    """
    words = get_words(text)
    found = False
    best = 0
    with open('tools/distances.txt', 'rb') as distances:
        for word in words:
            # Node ids in first-seen order, without duplicates.
            nodes = []
            for parse in morph.parse(word):
                lemma = parse.normal_form
                if get_pos_tag(lemma) != 'ADJF' or lemma not in senses:
                    continue
                for sense_id in senses[lemma].values():
                    node = int(sense_id[1:])
                    if node not in nodes:
                        nodes.append(node)
            for pos, first in enumerate(nodes):
                for second in nodes[pos + 1:]:
                    dist = get_dist(distances, first, second)
                    # The very first distance is always accepted, whatever
                    # its value; later ones only when strictly larger.
                    if not found or dist > best:
                        found = True
                        best = dist
    return best
def get_score(text):
    """
    Tells whether the text contains one of the listed conjunctions.

    :param text: string;
    :return: 1 when at least one of 'а', 'но', 'да', 'зато', 'однако' is
        among the words of the text, otherwise 0.
    """
    words = get_words(text)
    conjunctions = ('а', 'но', 'да', 'зато', 'однако')
    return int(any(conjunction in words for conjunction in conjunctions))
def get_score(text):
    """
    Tells whether the text contains a numeric word.

    :param text: string;
    :return: 1 when ``str.isnumeric`` holds for at least one word of the
        text, otherwise 0.
    """
    has_number = any(token.isnumeric() for token in get_words(text))
    return 1 if has_number else 0
def get_score(text):
    """
    Average length of the words of the text, in characters.

    :param text: string;
    :return: 0 for a text without words, otherwise total length of all
        words divided by the number of words.
    """
    words = get_words(text)
    word_count = len(words)
    if word_count == 0:
        return 0
    total_chars = sum(len(token) for token in words)
    return total_chars / word_count
def get_score(text):
    """
    Counts synonym pairs among the words of the text.

    The counting itself is delegated to ``count_pair_relation`` with the
    'synonym' relation (RuWordNet relations, according to the original
    documentation of this feature).

    :param text: string;
    :return: int.
    """
    return count_pair_relation(get_words(text), 'synonym')
def get_score(text):
    """
    Largest number of senses registered for a noun reading of any word.

    Every parse of every word is inspected; a parse counts when its POS
    tag is 'NOUN' and its normal form is a key of ``senses``.

    :param text: string;
    :return: int, never less than 1 (the value used when nothing matches).
    """
    best = 1
    for word in get_words(text):
        for parse in morph.parse(word):
            lemma = parse.normal_form
            if parse.tag.POS != 'NOUN' or lemma not in senses:
                continue
            sense_count = len(senses[lemma])
            if sense_count > best:
                best = sense_count
    return best
def get_score(text):
    """
    Largest number of distinct POS tags a single word of the text can take.

    Only tags contained in ``good_tags`` are counted.

    :param text: string;
    :return: int; 0 when the text has no words or no parse has a tag from
        ``good_tags``.
    """
    best = 0
    for word in get_words(text):
        distinct_tags = {
            parse.tag.POS
            for parse in morph.parse(word)
            if parse.tag.POS in good_tags
        }
        best = max(best, len(distinct_tags))
    return best
def get_word_part_lengths(text):
    """
    Relative sizes, in words, of the punctuation-delimited parts of the text.

    The text is cut with ``split_by_every`` on the characters ',.!?…—';
    empty parts are dropped, the remaining parts are measured in words.

    :param text: string;
    :return: list with one fraction (words in the part / words in all
        parts) per non-empty part; empty list when no part has any word.
    """
    words_per_part = [
        len(get_words(part))
        for part in split_by_every(text, ',.!?…—')
        if len(part) > 0
    ]
    total_words = sum(words_per_part)
    if total_words == 0:
        return []
    return [count / total_words for count in words_per_part]
def get_score(text):
    """
    Share of the words of the text that have a verb-like reading.

    A word counts when at least one of its parses with score of 0.1 or
    more carries one of the POS tags 'VERB', 'INFN', 'GRND', 'PRTF',
    'PRTS'.

    :param text: string;
    :return: 0 for a text without words, otherwise a float in [0, 1].
    """
    words = get_words(text)
    if len(words) == 0:
        return 0
    verb_like = {'VERB', 'INFN', 'GRND', 'PRTF', 'PRTS'}
    matched = sum(
        1
        for word in words
        if any(
            parse.score >= 0.1 and parse.tag.POS in verb_like
            for parse in morph.parse(word)
        )
    )
    return 1.0 * matched / len(words)
def get_score(text):
    """
    Share of the words of the text that have a reading tagged from good_tags.

    A word counts when at least one of its parses with score of 0.1 or
    more carries a POS tag contained in ``good_tags``.

    :param text: string;
    :return: 0 for a text without words, otherwise a float in [0, 1].
    """
    words = get_words(text)
    if len(words) == 0:
        return 0
    matched = sum(
        1
        for word in words
        if any(
            parse.score >= 0.1 and parse.tag.POS in good_tags
            for parse in morph.parse(word)
        )
    )
    return matched / len(words)
def get_score(text):
    """
    Share of the words of the text that have an adjective reading.

    A word counts when at least one of its parses with score of 0.1 or
    more carries one of the POS tags 'ADJF', 'ADJS', 'COMP'.

    :param text: string;
    :return: 0 for a text without words, otherwise a float in [0, 1].
    """
    words = get_words(text)
    if len(words) == 0:
        return 0
    adjective_tags = {'ADJF', 'ADJS', 'COMP'}
    matched = sum(
        1
        for word in words
        if any(
            parse.score >= 0.1 and parse.tag.POS in adjective_tags
            for parse in morph.parse(word)
        )
    )
    return 1.0 * matched / len(words)
def get_score(text):
    """
    Counts the words of the text that belong to more than one domain.

    A word is counted when ``norm_forms`` yields something for it and
    ``collect_by_relation`` returns more than one item for the 'domain'
    relation (RuWordNet relations, according to the original
    documentation of this feature).

    :param text: string;
    :return: int.
    """
    multi_domain_words = 0
    for word in get_words(text):
        norms = norm_forms(word)
        if not norms:
            continue
        if len(collect_by_relation(norms, 'domain')) > 1:
            multi_domain_words += 1
    return multi_domain_words
def get_score(text):
    """
    Counts the distinct domains touched by the words of the text.

    For every word the 'domain' relation is collected with
    ``collect_by_relation`` (RuWordNet relations, according to the
    original documentation of this feature) and merged into one set.

    :param text: string;
    :return: int - size of the merged set of domains.
    """
    seen_domains = set()
    for word in get_words(text):
        found = collect_by_relation(norm_forms(word), 'domain')
        if not found:
            continue
        seen_domains.update(found)
    return len(seen_domains)
def get_word_array(text):
    """
    Builds, for every word of the text, a weighted map of its vocabulary forms.

    Parses with score below 0.1 are ignored.  For the remaining parses
    ``get_pos_form`` is applied to the normal form; when the form it
    returns is not None and is present in ``wv.vocab``, the product of the
    score returned by ``get_pos_form`` and the parse score is added to the
    weight of that form.

    :param text: string;
    :return: list of ``(word, {form: weight})`` tuples in text order;
        words that end up with no forms are left out.
    """
    result = []
    for word in get_words(text):
        weights = {}
        for parse in morph.parse(word):
            if parse.score < 0.1:
                continue
            vocab_form, pos_score = get_pos_form(parse.normal_form)
            if vocab_form is None or vocab_form not in wv.vocab:
                continue
            weights[vocab_form] = (
                weights.get(vocab_form, 0) + pos_score * parse.score
            )
        if weights:
            result.append((word, weights))
    return result
def get_score(text):
    """
    Counts the distinct labelled normal forms found in the text.

    Every word is reduced to its normal forms with ``norm_forms``; a
    normal form is counted once when ``LABELS_DICT`` holds a non-empty
    entry for it.

    :param text: string;
    :return: int - number of distinct normal forms of the words of the
        text that have labels in ``LABELS_DICT``.
    """
    labelled_norms = set()
    for word in get_words(text):
        # ``norm_forms`` may give nothing for a word; treat that as "no forms".
        for norm in norm_forms(word) or ():
            if LABELS_DICT.get(norm):
                labelled_norms.add(norm)
    return len(labelled_norms)
def get_syllables_counts(text):
    """
    Counts vowels in every word of the text that is not a function word.

    A word is skipped when one of its parses with score of 0.1 or more is
    tagged 'PREP', 'CONJ', 'PRCL' or 'INTJ'.  For the remaining words the
    occurrences of every letter of ``VOWELS`` are summed.

    :param text: string;
    :return: list of ints, one per kept word, in text order.
    """
    function_tags = {'PREP', 'CONJ', 'PRCL', 'INTJ'}
    counts = []
    for word in get_words(text):
        is_function_word = any(
            parse.score >= 0.1 and parse.tag.POS in function_tags
            for parse in morph.parse(word)
        )
        if is_function_word:
            continue
        counts.append(sum(word.count(vowel) for vowel in VOWELS))
    return counts
def get_score(text):
    """
    Normalised, idf-weighted sum of the word vectors of the text.

    Parses with score below 0.1 are ignored.  For every remaining parse
    whose form returned by ``get_pos_form`` is not None and is present in
    ``wv.vocab``, the vector of that form is added with the weight
    ``parse score * log(192689044 / (frequency of the normal form + 1))``;
    normal forms missing from ``frequencies`` get frequency 0.

    :param text: string;
    :return: list of 300 numbers: the accumulated vector divided by its
        norm, or 300 zeros when the norm is below 1e-4.
    """
    accumulated = np.zeros(300, dtype=np.float32)
    for word in get_words(text):
        for parse in morph.parse(word):
            if parse.score < 0.1:
                continue
            vocab_form, _ = get_pos_form(parse.normal_form)
            if vocab_form is None or vocab_form not in wv.vocab:
                continue
            idf = log(192689044 / (frequencies.get(parse.normal_form, 0) + 1))
            vector = wv[vocab_form]
            accumulated += parse.score * idf * vector
    norm = np.linalg.norm(accumulated)
    if norm < 1e-4:
        return [0] * 300
    return list(accumulated / norm)
def get_score(text):
    """
    Frequency of the rarest normal form among the words of the text.

    Parses with score below 0.1 are ignored.  As soon as a remaining
    parse has a normal form that is missing from ``frequencies`` the
    result is 0.

    :param text: string;
    :return: 0 when the text has no words, when no parse was considered,
        or when some considered normal form is unknown to ``frequencies``;
        otherwise the smallest frequency seen.
    """
    words = get_words(text)
    if len(words) == 0:
        return 0
    rarest = None
    for word in words:
        for parse in morph.parse(word):
            if parse.score < 0.1:
                continue
            lemma = parse.normal_form
            if lemma not in frequencies:
                return 0
            frequency = frequencies[lemma]
            if rarest is None or frequency < rarest:
                rarest = frequency
    return 0 if rarest is None else rarest
def get_score(text):
    """
    Smallest similarity between words describing the senses of one word.

    For every word of the text the sense names of its normal forms are
    collected from ``senses`` (normal forms whose ``get_pos_tag`` value is
    in ``good_tags`` are skipped).  Each sense name is stripped of
    non-alphanumeric characters, lower-cased and split; its first three
    tokens are lemmatised with the best-scoring parse and converted with
    ``get_pos_form``.  Forms present in ``wv.vocab`` make up the word's
    bag, and ``wv.similarity`` is evaluated for every pair of different
    forms of that bag.

    :param text: string;
    :return: the smallest similarity seen, capped from above by the
        initial value 1.
    """
    lowest = 1
    for word in get_words(text):
        # Sense names in first-seen order, without duplicates.
        sense_names = []
        for parse in morph.parse(word):
            lemma = parse.normal_form
            if get_pos_tag(lemma) in good_tags or lemma not in senses:
                continue
            for sense_name in senses[lemma]:
                if sense_name not in sense_names:
                    sense_names.append(sense_name)
        if not sense_names:
            continue

        bag = set()
        for sense_name in sense_names:
            # Punctuation and other non-alphanumerics become separators.
            cleaned = ''.join(
                char if char.isalnum() else ' ' for char in sense_name
            )
            for token in cleaned.lower().split()[:3]:
                best_parse = max(morph.parse(token), key=lambda p: p.score)
                vocab_form, _ = get_pos_form(best_parse.normal_form)
                if vocab_form is not None and vocab_form in wv.vocab:
                    bag.add(vocab_form)

        for first in bag:
            for second in bag:
                if first != second:
                    lowest = min(lowest, wv.similarity(first, second))
    return lowest
def get_score(text):
    """
    Largest similarity between words of two different senses of one word.

    For every word of the text the sense names of its normal forms are
    collected from ``senses``.  Each sense name is stripped of
    non-alphanumeric characters and split; every token is lemmatised with
    the best-scoring parse and converted with ``get_pos_form``.  Then, for
    every pair of different senses of the word, ``wv.similarity`` is
    evaluated for every pair of different converted words that are both
    present in ``wv.vocab``.

    :param text: string;
    :return: the largest similarity seen; -1 when no pair was compared.
    """
    highest = -1
    for word in get_words(text):
        # Sense names in first-seen order, without duplicates.
        sense_names = []
        for parse in morph.parse(word):
            lemma = parse.normal_form
            if lemma not in senses:
                continue
            for sense_name in senses[lemma]:
                if sense_name not in sense_names:
                    sense_names.append(sense_name)
        if not sense_names:
            continue

        # One list of converted words (possibly containing None) per sense.
        sense_words = []
        for sense_name in sense_names:
            cleaned = ''.join(
                char if char.isalnum() else ' ' for char in sense_name
            )
            converted = []
            for token in cleaned.split():
                best_parse = max(morph.parse(token), key=lambda p: p.score)
                vocab_form, _ = get_pos_form(best_parse.normal_form)
                converted.append(vocab_form)
            sense_words.append(converted)

        for index, first_sense in enumerate(sense_words):
            for second_sense in sense_words[index + 1:]:
                for w1 in first_sense:
                    if w1 is None or w1 not in wv.vocab:
                        continue
                    for w2 in second_sense:
                        if w2 is None or w1 == w2 or w2 not in wv.vocab:
                            continue
                        highest = max(highest, wv.similarity(w1, w2))
    return highest
def get_score(text):
    """
    Log sense counts of the adjectives of the text, overall and per half.

    Every word contributes ``log(m)``, where ``m`` is the largest number
    of senses among its normal forms that ``get_pos_tag`` tags 'ADJF' and
    that are keys of ``senses`` (``m`` is 1 when there are none).  The
    words are split in the middle; with an odd number of words the extra
    word goes to the right half.

    :param text: string;
    :return: list ``[left + right, left, right]`` of the summed
        contributions.
    """
    def word_weight(word):
        # Log of the largest adjective sense count among the word's parses.
        most_senses = 1
        for parse in morph.parse(word):
            lemma = parse.normal_form
            if get_pos_tag(lemma) == 'ADJF' and lemma in senses:
                most_senses = max(most_senses, len(senses[lemma]))
        return log(most_senses)

    words = get_words(text)
    middle = len(words) // 2

    left_total = 0
    for word in words[:middle]:
        left_total += word_weight(word)

    right_total = 0
    for word in words[middle:]:
        right_total += word_weight(word)

    return [left_total + right_total, left_total, right_total]
def get_score(text):
    """
    Tells whether the text contains the word 'если' or the word 'когда'.

    :param text: string;
    :return: 1 when at least one of the two words is among the words of
        the text, otherwise 0.
    """
    words = get_words(text)
    markers = ('если', 'когда')
    return int(any(marker in words for marker in markers))