def calculate_params(self, doc: Document, raw_modeldata: RawModelData,
                     modeldata: ModelData):
    """Estimate the three combination weights and store them on the instance.

    Every token of every sentence in ``doc`` is scored by three models built
    from ``raw_modeldata``: the lemma unigram model, the lemma suffix guesser
    and the lemma frequency guesser.  For each model the score of the token's
    own analysis is compared with the best score among the candidate
    analyses; the model whose difference is strictly the largest has that
    difference added to its weight.  Ties leave all weights untouched.

    After the whole document has been processed the weights are normalised
    by their sum and appended to ``self.lambdas`` in the order
    unigram, suffix, lemma.

    :param doc: object whose ``sentences()`` yields iterables of tokens;
        each token exposes ``token``, ``stem`` and ``tag``.
    :param raw_modeldata: supplies ``tag_ngram_model``, ``lemma_suffix_tree``,
        ``lemma_freq_tree`` and ``lemma_unigram_model``.
    :param modeldata: supplies ``tag_vocabulary`` for candidate conversion.
    """
    theta = HashSuffixTree.calculate_theta(
        raw_modeldata.tag_ngram_model.word_apriori_probs())
    suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
    freq_guesser = raw_modeldata.lemma_freq_tree.create_guesser(theta)
    unigram_model = raw_modeldata.lemma_unigram_model

    # Every weight starts from 1.0 and is adjusted token by token.
    weight_uni = 1.0
    weight_suffix = 1.0
    weight_lemma = 1.0

    for sentence in doc.sentences():
        for tok in sentence:
            # Candidate analyses for this word form.  The values are indexed
            # with [1] below, i.e. the score sits in the second position.
            candidates = lemma.batch_convert(
                suffix_guesser.tag_log_probabilities(tok.token),
                tok.token,
                modeldata.tag_vocabulary)

            unigram_scores = {
                cand: unigram_model.log_prob(cand.stem)
                for cand in candidates.keys()
            }
            lemma_scores = {
                cand: freq_guesser.tag_log_probability(
                    cand.stem, lemma.main_pos_tag(cand.tag))
                for cand in candidates.keys()
            }

            # Best score each model gives to any candidate.
            best_uni = max(unigram_scores.values())
            best_suffix = max(entry[1] for entry in candidates.values())
            best_lemma = max(lemma_scores.values())

            # Scores of the analysis actually carried by the token.
            actual_uni = unigram_model.log_prob(tok.stem)
            actual_lemma = freq_guesser.tag_log_probability(
                tok.stem, lemma.main_pos_tag(tok.tag))
            actual_suffix = (candidates[tok][1] if tok in candidates
                             else UNKOWN_VALUE)

            uni_diff = actual_uni - best_uni
            suffix_diff = actual_suffix - best_suffix
            lemma_diff = actual_lemma - best_lemma

            # Only a strict winner is credited; on a tie nothing changes.
            if uni_diff > suffix_diff and uni_diff > lemma_diff:
                weight_uni += uni_diff
            elif suffix_diff > uni_diff and suffix_diff > lemma_diff:
                weight_suffix += suffix_diff
            elif lemma_diff > uni_diff and lemma_diff > suffix_diff:
                weight_lemma += lemma_diff

    # Normalise by the sum of the three weights.
    total = weight_uni + weight_suffix + weight_lemma
    self.lambdas.extend((
        weight_uni / total,
        weight_suffix / total,
        weight_lemma / total,
    ))
def combine(self, token: Token, lem_transf: BaseLemmaTransformation,
            compiled_modeldata: CompiledModelData,
            modeldata: ModelData) -> float:
    """Return the weighted sum of three log-probability scores for ``token``.

    The scores come from ``compiled_modeldata``: the unigram lemma model
    (for ``token.stem``), the lemma guesser (for ``token.token`` with
    ``lem_transf``) and the suffix lemma model (for ``token.stem`` with the
    main POS tag of ``token.tag``).  They are multiplied by
    ``self.lambdas[0]``, ``self.lambdas[1]`` and ``self.lambdas[2]``
    respectively and added together.

    :param token: the analysis to score; exposes ``token``, ``stem``, ``tag``.
    :param lem_transf: lemma transformation passed to the lemma guesser.
    :param compiled_modeldata: provides the three models queried here.
    :param modeldata: not used by this method.
    :return: the combined score.
    """
    uni_score = compiled_modeldata.unigram_lemma_model.log_prob(token.stem)
    suffix_score = compiled_modeldata.lemma_guesser.tag_log_probability(
        token.token, lem_transf)
    lemma_score = compiled_modeldata.suffix_lemma_model.tag_log_probability(
        token.stem, lemma.main_pos_tag(token.tag))

    weighted_uni = uni_score * self.lambdas[0]
    weighted_suffix = suffix_score * self.lambdas[1]
    weighted_lemma = lemma_score * self.lambdas[2]
    return weighted_uni + weighted_suffix + weighted_lemma