def compile(self) -> CompiledModelData:
    # Bundle the raw counts into the probability models used at tagging time.
    c = CompiledModelData()
    c.unigram_lemma_model = self.lemma_unigram_model
    c.tag_transition_model = self.tag_ngram_model.create_probability_model()
    c.standard_emission_model = self.std_emission_ngram_model.create_probability_model()
    c.spec_tokens_emission_model = self.spec_emission_ngram_model.create_probability_model()
    c.apriori_tag_probs = self.tag_ngram_model.word_apriori_probs()
    # theta weights the suffix guessers according to the a-priori tag distribution.
    theta = HashSuffixTree.calculate_theta(c.apriori_tag_probs)
    c.lower_case_suffix_guesser = self.lower_suffix_tree.create_guesser(theta)
    c.upper_case_suffix_guesser = self.upper_suffix_tree.create_guesser(theta)
    c.lemma_guesser = self.lemma_suffix_tree.create_guesser(theta)
    c.suffix_lemma_model = self.lemma_freq_tree.create_guesser(theta)
    c.combiner = self.combiner
    return c
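# Hedged sketch: HashSuffixTree.calculate_theta is not shown in this file; it
# is assumed here to follow the TnT-style suffix-guessing heuristic, where
# theta is the standard deviation of the a-priori tag probabilities. A
# stand-alone version under that assumption (name and signature hypothetical):
def _calculate_theta_sketch(apriori_tag_probs: dict) -> float:
    probs = list(apriori_tag_probs.values())
    mean = sum(probs) / len(probs)
    # Sample standard deviation, as in Brants' TnT suffix weighting.
    variance = sum((p - mean) ** 2 for p in probs) / max(len(probs) - 1, 1)
    return variance ** 0.5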
def calculate_params(self, doc: Document, raw_modeldata: RawModelData, modeldata: ModelData):
    # Estimate the interpolation weights (lambdas) of the three lemma models:
    # the lemma unigram model, the suffix guesser and the suffix-frequency model.
    apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
    theta = HashSuffixTree.calculate_theta(apriori_probs)
    lemma_suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
    lemma_prob = raw_modeldata.lemma_freq_tree.create_guesser(theta)
    lemma_unigram_model = raw_modeldata.lemma_unigram_model
    lambda_s = 1.0
    lambda_u = 1.0
    lambda_l = 1.0
    for sentence in doc.sentences():
        for tok in sentence:
            # Candidate (lemma, tag) analyses with their suffix-guesser log probabilities.
            suffix_probs = lemma.batch_convert(
                lemma_suffix_guesser.tag_log_probabilities(tok.token),
                tok.token, modeldata.tag_vocabulary)
            # Score the same candidates with the unigram and suffix-frequency models.
            uni_probs = dict()
            for t in suffix_probs.keys():
                uni_probs[t] = lemma_unigram_model.log_prob(t.stem)
            lemma_probs = dict()
            for t in suffix_probs.keys():
                lemma_probs[t] = lemma_prob.tag_log_probability(t.stem, lemma.main_pos_tag(t.tag))
            # Best-scoring candidate under each model.
            uni_max = max(uni_probs.items(), key=lambda e: e[1])
            best = max(suffix_probs.items(), key=lambda e: e[1][1])
            suffix_max = (best[0], best[1][1])
            lemma_max = max(lemma_probs.items(), key=lambda e: e[1])
            # Scores of the gold-standard analysis under each model.
            act_uni_prob = lemma_unigram_model.log_prob(tok.stem)
            act_lemma_prob = lemma_prob.tag_log_probability(tok.stem, lemma.main_pos_tag(tok.tag))
            if tok in suffix_probs.keys():
                act_suff_prob = suffix_probs[tok][1]
            else:
                act_suff_prob = UNKOWN_VALUE
            # Margin between the gold analysis and each model's own best candidate;
            # the model that separates the gold analysis best has its weight adjusted
            # by that margin.
            uni_prop = act_uni_prob - uni_max[1]
            suff_prop = act_suff_prob - suffix_max[1]
            lemma_prop = act_lemma_prob - lemma_max[1]
            if uni_prop > suff_prop and uni_prop > lemma_prop:
                lambda_u += uni_prop
            elif suff_prop > uni_prop and suff_prop > lemma_prop:
                lambda_s += suff_prop
            elif lemma_prop > uni_prop and lemma_prop > suff_prop:
                lambda_l += lemma_prop
    # Normalise the weights so they sum to one.
    s = lambda_u + lambda_s + lambda_l
    lambda_u /= s
    lambda_s /= s
    lambda_l /= s
    self.lambdas.append(lambda_u)
    self.lambdas.append(lambda_s)
    self.lambdas.append(lambda_l)
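# Illustrative sketch (not part of this class): once calculate_params has
# normalised the weights, the three lemma log scores would be combined by a
# weighted (log-linear) sum roughly as below. The function and its parameter
# names are hypothetical, for exposition only.
def _combine_lemma_scores(lambdas, uni_log_prob, suff_log_prob, lemma_log_prob):
    lambda_u, lambda_s, lambda_l = lambdas
    return (lambda_u * uni_log_prob
            + lambda_s * suff_log_prob
            + lambda_l * lemma_log_prob)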