예제 #1
0
 def calculate_params(self, doc: Document,
                      raw_modeldata: RawModelData,
                      modeldata: ModelData):
     """Estimate three lemma-model weights from ``doc`` and store them.

     For every token of every sentence, candidate lemmas are obtained by
     passing the suffix guesser's output through ``lemma.batch_convert``.
     Each candidate is scored by three models: the lemma unigram model, the
     suffix guesser (second element of the candidate's value pair) and the
     lemma frequency guesser. The token's own stem/tag is scored by the same
     three models; when the token is not among the candidates its suffix
     score falls back to ``UNKOWN_VALUE``.

     For each model the difference "token's own score minus best candidate
     score" is computed. The model whose difference is strictly greater than
     both others has that difference added to its weight (all weights start
     at 1.0); on ties nothing is updated. Finally the weights are divided by
     their sum and appended to ``self.lambdas`` in the order
     unigram, suffix, lemma frequency.

     :param doc: document whose ``sentences()`` yield iterables of tokens
         exposing ``token``, ``stem`` and ``tag``.
     :param raw_modeldata: provides ``tag_ngram_model``,
         ``lemma_suffix_tree``, ``lemma_freq_tree`` and
         ``lemma_unigram_model``.
     :param modeldata: provides ``tag_vocabulary`` for the conversion step.
     :raises ValueError: from ``max()`` if a token yields no candidates.
     """
     apriori_probs = raw_modeldata.tag_ngram_model.word_apriori_probs()
     theta = HashSuffixTree.calculate_theta(apriori_probs)
     suffix_guesser = raw_modeldata.lemma_suffix_tree.create_guesser(theta)
     freq_guesser = raw_modeldata.lemma_freq_tree.create_guesser(theta)
     unigram_model = raw_modeldata.lemma_unigram_model

     # Running, not yet normalised, weights of the three models.
     weight_unigram = 1.0
     weight_suffix = 1.0
     weight_lemma = 1.0

     for sentence in doc.sentences():
         for tok in sentence:
             guesses = suffix_guesser.tag_log_probabilities(tok.token)
             candidates = lemma.batch_convert(
                 guesses, tok.token, modeldata.tag_vocabulary)

             # Score every candidate with the unigram model first, then with
             # the lemma frequency guesser (same call order as two passes).
             unigram_scores = [unigram_model.log_prob(cand.stem)
                               for cand in candidates]
             freq_scores = [
                 freq_guesser.tag_log_probability(
                     cand.stem, lemma.main_pos_tag(cand.tag))
                 for cand in candidates
             ]

             # Best score any candidate reaches under each model.
             best_unigram = max(unigram_scores)
             best_suffix = max(pair[1] for pair in candidates.values())
             best_freq = max(freq_scores)

             # Scores of the token's own annotated stem and tag.
             gold_unigram = unigram_model.log_prob(tok.stem)
             gold_freq = freq_guesser.tag_log_probability(
                 tok.stem, lemma.main_pos_tag(tok.tag))
             gold_suffix = (candidates[tok][1] if tok in candidates
                            else UNKOWN_VALUE)

             unigram_delta = gold_unigram - best_unigram
             suffix_delta = gold_suffix - best_suffix
             freq_delta = gold_freq - best_freq

             # Only a strictly dominating model gets its weight updated.
             if unigram_delta > suffix_delta and unigram_delta > freq_delta:
                 weight_unigram += unigram_delta
             elif suffix_delta > unigram_delta and suffix_delta > freq_delta:
                 weight_suffix += suffix_delta
             elif freq_delta > unigram_delta and freq_delta > suffix_delta:
                 weight_lemma += freq_delta

     total = weight_unigram + weight_suffix + weight_lemma
     for raw_weight in (weight_unigram, weight_suffix, weight_lemma):
         self.lambdas.append(raw_weight / total)
예제 #2
0
    def find_best_lemma(self, t: Token, position: int) -> Token:
        """Choose the lemma for the tagged token ``t`` at ``position``.

        Lemma candidates come from ``util.analysis_queue`` when it holds an
        analysis for ``position``, otherwise from ``self.analyser``. If
        neither yields anything, the keys of the suffix-guesser conversion
        are used instead and ``self.is_last_guessed`` is set to ``True``
        (it is ``False`` otherwise).

        Only candidates whose tag equals ``t.tag`` are kept:

        * none left: a ``Token`` built from ``(t.token, t.token, t.tag)`` is
          returned directly, without ``decode_lemma``;
        * exactly one left and ``t.token`` is all lower case: that candidate
          is decoded and returned;
        * otherwise the candidates are optionally narrowed by
          ``self.stem_filter``, paired with a lemma transformation and the
          pair maximising ``self.lemma_comparator`` (used as the ``key`` of
          ``max``) is decoded and returned. For guessed candidates a variant
          with a lower-cased stem competes as well.

        :param t: token carrying ``token`` and ``tag``.
        :param position: index used to look up the analysis queue.
        :return: the selected lemma token.
        """
        queue = util.analysis_queue
        if queue.has_anal(position):
            stems = self.simplify_lemma(queue.analysises(position))
        else:
            stems = self.analyser.analyse(t.token)
        self.is_last_guessed = False

        suffix_guesses = self.model.compiled_data.lemma_guesser.tag_log_probabilities(t.token)
        guessed_lemmas = batch_convert(suffix_guesses, t.token, self.model.data.tag_vocabulary)

        # Without any analysis, fall back to the suffix guesser's candidates.
        has_analysis = len(stems) != 0
        if not has_analysis:
            self.is_last_guessed = True
            stems = set(guessed_lemmas)

        candidates = [cand for cand in stems if t.tag == cand.tag]

        if not candidates:
            # Nothing matches the tag: the word form stands in for the lemma.
            return Token(t.token, t.token, t.tag)

        if len(candidates) == 1 and t.token == t.token.lower():
            return self.decode_lemma(candidates[0])

        if self.stem_filter is not None:
            candidates = self.stem_filter.filter_stem(candidates)

        scored = []
        for cand in candidates:
            guess = guessed_lemmas.get(cand)
            transformation = (
                guess[0] if guess is not None
                else def_lemma_representation_by_token(cand, self.model.data)
            )
            scored.append((cand, transformation))
            if not has_analysis:
                # Guessed stems additionally compete in lower-cased form.
                lowered = Token(cand.token, cand.stem.lower(), cand.tag)
                scored.append((lowered, transformation))

        winner = max(scored, key=self.lemma_comparator)
        return self.decode_lemma(winner[0])