def calculate_params(self, doc: Document, raw_modeldata: RawModelData, modeldata: ModelData):
    apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
    theta = HashSuffixTree.calculate_theta(apriori_probs)
    lemma_suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
    lemma_prob = raw_modeldata.lemma_freq_tree.create_guesser(theta)
    lemma_unigram_model = raw_modeldata.lemma_unigram_model
    # Interpolation weights for the suffix, unigram and lemma-frequency
    # models, initialised with add-one style smoothing.
    lambda_s = 1.0
    lambda_u = 1.0
    lambda_l = 1.0
    for sentence in doc.sentences():
        for tok in sentence:
            # Candidate lemmas (token -> (transformation, log prob)) proposed
            # by the suffix guesser for this word form.
            suffix_probs = lemma.batch_convert(
                lemma_suffix_guesser.tag_log_probabilities(tok.token),
                tok.token, modeldata.tag_vocabulary)
            # Score every candidate with the lemma unigram model.
            uni_probs = {t: lemma_unigram_model.log_prob(t.stem)
                         for t in suffix_probs.keys()}
            # Score every candidate with the lemma-frequency model,
            # conditioned on the candidate's main POS tag.
            lemma_probs = {t: lemma_prob.tag_log_probability(t.stem, lemma.main_pos_tag(t.tag))
                           for t in suffix_probs.keys()}
            # Best-scoring candidate under each of the three models.
            uni_max = max(uni_probs.items(), key=lambda e: e[1])
            t = max(suffix_probs.items(), key=lambda e: e[1][1])
            suffix_max = (t[0], t[1][1])
            lemma_max = max(lemma_probs.items(), key=lambda e: e[1])
            # Scores the three models assign to the gold-standard lemma.
            act_uni_prob = lemma_unigram_model.log_prob(tok.stem)
            act_lemma_prob = lemma_prob.tag_log_probability(tok.stem, lemma.main_pos_tag(tok.tag))
            if tok in suffix_probs.keys():
                act_suff_prob = suffix_probs[tok][1]
            else:
                # Gold lemma was not among the guesser's candidates.
                act_suff_prob = UNKOWN_VALUE
            # Margin of the gold lemma against each model's own best guess
            # (non-positive log-probability differences).
            uni_prop = act_uni_prob - uni_max[1]
            suff_prop = act_suff_prob - suffix_max[1]
            lemma_prop = act_lemma_prob - lemma_max[1]
            # The model that ranks the gold lemma closest to its own top
            # candidate absorbs that margin into its weight.
            if uni_prop > suff_prop and uni_prop > lemma_prop:
                lambda_u += uni_prop
            elif suff_prop > uni_prop and suff_prop > lemma_prop:
                lambda_s += suff_prop
            elif lemma_prop > uni_prop and lemma_prop > suff_prop:
                lambda_l += lemma_prop
    # Normalise the three weights so that they sum to one.
    s = lambda_u + lambda_s + lambda_l
    lambda_u /= s
    lambda_s /= s
    lambda_l /= s
    self.lambdas.append(lambda_u)
    self.lambdas.append(lambda_s)
    self.lambdas.append(lambda_l)
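# A minimal sketch (not part of the class above) of how the three normalised
# weights appended to self.lambdas would typically be applied at tagging time:
# a log-linear combination of a candidate lemma's unigram, suffix-guesser and
# lemma-frequency log scores. The function name and argument layout here are
# illustrative assumptions, not the library's actual API.
def interpolated_lemma_score(lambdas, uni_log_prob, suff_log_prob, lemma_log_prob):
    # lambdas holds (lambda_u, lambda_s, lambda_l) in the order they were
    # appended by calculate_params above (hypothetical unpacking).
    lambda_u, lambda_s, lambda_l = lambdas
    return (lambda_u * uni_log_prob
            + lambda_s * suff_log_prob
            + lambda_l * lemma_log_prob)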
def find_best_lemma(self, t: Token, position: int) -> Token:
    # Prefer precomputed analyses queued for this position; otherwise ask the
    # morphological analyser directly.
    self.is_last_guessed = False
    if util.analysis_queue.has_anal(position):
        stems = self.simplify_lemma(util.analysis_queue.analysises(position))
    else:
        stems = self.analyser.analyse(t.token)
    # Candidate lemmas (with transformations and log probs) from the guesser.
    tag_log_probs = self.model.compiled_data.lemma_guesser.tag_log_probabilities(t.token)
    lemma_suff_probs = batch_convert(tag_log_probs, t.token, self.model.data.tag_vocabulary)
    use_morph = True
    if len(stems) == 0:
        # No morphological analysis available: fall back to the guessed candidates.
        self.is_last_guessed = True
        use_morph = False
        stems = set(lemma_suff_probs.keys())
    # Keep only candidates whose tag agrees with the tag assigned to the token.
    possible_stems = [ct for ct in stems if t.tag == ct.tag]
    if len(possible_stems) == 0:
        return Token(t.token, t.token, t.tag)
    if len(possible_stems) == 1 and t.token == t.token.lower():
        best = possible_stems[0]
    else:
        if self.stem_filter is not None:
            possible_stems = self.stem_filter.filter_stem(possible_stems)
        comp = []
        for poss_tok in possible_stems:
            # Use the guesser's transformation if it proposed this candidate;
            # otherwise fall back to the default lemma representation.
            pair = lemma_suff_probs.get(poss_tok)
            if pair is not None:
                traf = pair[0]
            else:
                traf = def_lemma_representation_by_token(poss_tok, self.model.data)
            comp.append((poss_tok, traf))
            if not use_morph:
                # For guessed lemmas, also consider the lower-cased variant.
                lower_tok = Token(poss_tok.token, poss_tok.stem.lower(), poss_tok.tag)
                comp.append((lower_tok, traf))
        best = (max(comp, key=self.lemma_comparator))[0]
    return self.decode_lemma(best)
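# Hypothetical usage sketch: find_best_lemma runs once per token after tagging,
# with each token already carrying its chosen POS tag. The lemmatiser handle
# and variable names below are assumptions for illustration only.
def lemmatise_sentence(lemmatiser, tagged_tokens):
    # Replace each tagged token with a Token whose stem is the best lemma.
    return [lemmatiser.find_best_lemma(tok, position)
            for position, tok in enumerate(tagged_tokens)]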