def getMorphemes(word):
    """Return the morpheme analysis for ``word``.

    The dictionary entry is returned when ``WordMorphemeDicts`` contains the
    word; otherwise the word is segmented with ``segment_to_morphemes``.

    Returns:
        The dictionary entry or the segmentation result, or ``None`` when
        the lookup or the segmentation raised.  The failure is reported on
        stdout and not propagated (best-effort behaviour).
    """
    try:
        if WordMorphemeDicts.contains(word):
            return WordMorphemeDicts.get(word)
        return segment_to_morphemes(word)
    except Exception as exc:
        # %-formatting keeps the handler itself from raising when ``word`` is
        # not a string; %r keeps the cause printable whatever its message is.
        print("Error to get morphemes from word '%s': %r" % (word, exc))
        return None
def form_roots_vects(self):
    """Build a mapping from each root to the mean of its words' vectors.

    Every word from ``WordMorphemeDicts.words()`` that has an entry in
    ``self.modelWord_from_word`` contributes ``self.model[model_word]`` once
    for each root listed in its dictionary entry.

    Returns:
        dict: root -> ``sum(vectors) / len(vectors)`` over the vectors
        collected for that root.
    """
    vectors_by_root = defaultdict(list)
    for word in WordMorphemeDicts.words():
        if word not in self.modelWord_from_word:
            continue
        model_word = self.modelWord_from_word[word]
        for root in WordMorphemeDicts.get(word).roots:
            vectors_by_root[root].append(self.model[model_word])

    return {
        root: sum(vectors) / len(vectors)
        for root, vectors in vectors_by_root.items()
    }
def get_morph(x, w, fs, d, filter_key):
    """Count occurrences of substring ``x`` in dictionary word ``w``.

    The word (with ``-`` removed) is scanned left to right.  At every
    position where the substring equals ``x`` a feature key is built by
    calling each function in ``fs`` as ``f(len(x), w, position)``.  When that
    key equals ``filter_key``, ``d[key]['ALL']`` is incremented, and
    ``d[key][tag]`` is incremented as well if the substring equals the
    morpheme currently tracked while walking ``all_in_order``.

    Args:
        x: substring to look for.
        w: dictionary word; must be known to ``WordMorphemeDicts``.
        fs: iterable of feature functions ``f(length, word, position)``.
        d: nested counter mapping, updated in place; ``d[key][tag]`` must
            support ``+= 1`` for unseen keys (e.g. nested ``defaultdict``).
        filter_key: only positions whose feature key equals this are counted.

    Raises:
        StopIteration: if the word's ``all_in_order`` sequence is empty.
        Errors raised during the scan itself are swallowed after printing
        the word.
    """
    morphemes = iter(WordMorphemeDicts.get(w).all_in_order)
    length = len(x)
    # Built-in next() instead of the Python-2-only ``.next()`` method.
    tag, morph = next(morphemes)
    morph_len = len(morph)
    w = w.replace(u'-', '')
    try:
        for inx in range(len(w) - length + 1):
            sub = w[inx: inx + length]
            if morph_len == 0:
                # The tracked morpheme is used up: move on to the next one.
                tag, morph = next(morphemes)
                morph_len = len(morph)
            if sub == x:
                # Match, then build key from features
                key = tuple([f(length, w, inx) for f in fs])
                if key == filter_key:
                    if sub == morph:
                        d[key][tag] += 1
                    d[key]['ALL'] += 1
            morph_len -= 1
    except Exception:
        # Best effort: report the offending word and keep the caller going.
        print(w)
def getSplits(word):
    """Return the morpheme substrings of ``word`` as a list.

    For a word contained in ``WordMorphemeDicts`` the substrings are taken,
    in order, from the ``(tag, substring)`` pairs of its ``all_in_order``
    sequence.  Otherwise the word is passed to ``parse`` and the third
    element of its result is returned in reversed order.

    Returns:
        list of substrings, or ``None`` when the lookup or parsing raised.
        The failure is reported on stdout and not propagated.
    """
    try:
        if WordMorphemeDicts.contains(word):
            entry = WordMorphemeDicts.get(word)
            return [sub for _tag, sub in entry.all_in_order]
        _p, _k, s = parse(word)
        # Materialise the reversed view so both branches return a list.
        return list(reversed(s))
    except Exception as exc:
        # The original built this message but never printed it.
        print("Error to get morphemes from word '%s': %r" % (word, exc))
        return None
def get_morph_by_pos(x, w, fs, d, filter_key, pos):
    """Count substring ``x`` in word ``w`` at one fixed position.

    The start index is ``len(w) + pos``.  If the substring of ``w`` at that
    index equals ``x`` and the feature key built from ``fs`` equals
    ``filter_key``, ``d[key]['ALL']`` is incremented.  ``d[key][tag]`` is
    incremented too when all letters of the substring carry the same tag in
    ``tag_for_letter`` and ``(tag, substring)`` occurs in ``all_in_order``.

    Args:
        x: substring to look for.
        w: dictionary word; must be known to ``WordMorphemeDicts``.
        fs: iterable of feature functions ``f(length, word, position)``.
        d: nested counter mapping, updated in place; ``d[key][tag]`` must
            support ``+= 1`` for unseen keys (e.g. nested ``defaultdict``).
        filter_key: the feature key that has to match for counting.
        pos: offset added to ``len(w)`` to obtain the start index
            (``parse_word`` passes a negative offset from the word end).
    """
    entry = WordMorphemeDicts.get(w)
    length = len(x)
    pos = len(w) + pos
    if pos < 0:
        # The word is shorter than the requested offset.  A negative index
        # would silently wrap around and match at the wrong position.
        return

    sub = w[pos: pos + length]
    if x != sub:
        return

    key = tuple([f(length, w, pos) for f in fs])
    if key != filter_key:
        return

    tags = set(entry.tag_for_letter[pos: pos + length])
    # Check the size first: next() on an empty set would raise StopIteration.
    if len(tags) == 1:
        tag = next(iter(tags))
        if (tag, sub) in entry.all_in_order:
            d[key][tag] += 1
    d[key]["ALL"] += 1
def parse_word(word, start_from):
    """Guess a tag for every substring of ``word`` ending before its tail.

    For ``inx`` = 1, 2, ... the substring of length ``inx`` that ends
    ``start_from`` characters before the end of ``word`` is examined.  Its
    feature key is looked up in the module-level statistics ``D``; when the
    key is missing, statistics are collected over all dictionary words
    (``get_morph`` for ``start_from > 4``, ``get_morph_by_pos`` otherwise),
    and when it is present the module-level ``matches`` counter is
    incremented.  The tag with the highest relative frequency is chosen.

    Args:
        word: the word to analyse.
        start_from: number of trailing characters to leave out.

    Returns:
        list of ``[probability, tag, substring]`` triples, one per examined
        substring, shortest first.  With no statistics the triple defaults
        to probability ``0.000001`` and tag ``'R'``.
    """
    # ``matches`` is rebound below; ``D`` is only mutated in place.
    global matches

    # NOTE(review): ``get`` is defined elsewhere; presumably it returns the
    # character at the given index with out-of-range handling -- confirm.
    features_f = [
        lambda _, w, pos: get(w, pos - 1),
        lambda length, w, pos: w[pos: pos + length],
        lambda length, w, pos: get(w, pos + length + 1),
    ]
    small_feature = start_from > 4
    if small_feature:
        # Far from the word end the preceding-character feature is dropped.
        features_f = features_f[1:]

    result = []
    for inx in range(1, len(word) - start_from + 1):
        offset = inx + start_from
        # ``word[a:-0]`` would be empty, so use an open end for start_from 0.
        end = -start_from if start_from != 0 else None
        sub = word[-offset:end]

        begin = len(word) - offset
        sub_key = tuple([f(len(sub), word, begin) for f in features_f])
        if sub_key in D:
            matches += 1
        else:
            # get statistic for concrete substring
            for w in WordMorphemeDicts.words():
                if small_feature:
                    get_morph(sub, w, features_f, D, sub_key)
                else:
                    get_morph_by_pos(sub, w, features_f, D, sub_key, -offset)

        # apply statistic to determine max probability
        stats = D[sub_key]
        total = stats.get('ALL', 0)
        max_prob, max_key = 0.000001, 'R'
        if total:
            # items() works on both Python 2 and 3 (iteritems() is 2-only).
            for key, value in stats.items():
                if key == 'ALL':
                    continue
                p = (value + 0.0) / total
                if p > max_prob:
                    max_prob, max_key = p, key
        result.append([max_prob, max_key, sub])
    return result
print '/'.join(slovorod_d[word]), ' | ', '/'.join(orig_list) #if mismatch_num > 100: # break # if # if # for print mismatch_num """ # This code to get the values after the root, and roots out = codecs.open('init_split', 'w', encoding='utf-8') for w in WordMorphemeDicts.words(): res = [] for t, s in WordMorphemeDicts.get(w).all_in_order: res.append(s) # for out.write(w + ' : ' + ','.join(res) + '\n') # for out.close() #for w in after_root: # out.write(w + '\n') ## for
return segment_to_morphemes(word) except Exception: print ("Error to get morphemes from word:" + word) # def if __name__ == "__main__": all_count = 0 test_count = 0 false_negative = 0 root_count = 0 pref_count = 0 suf_count = 0 equal_count = 0 for word in WordMorphemeDicts.words(): m_orig = WordMorphemeDicts.get(word) m_test = getMorphemes(word) all_count += 3 is_equal = True count = 0 n_count = 0 for r in m_test.roots: if r in m_orig.roots: count += 1 else: n_count += 1 # if # for if len(m_orig.roots):