Пример #1
0
    def _align_source_target(self, un_match, un_pos, position, tgt_text,
                             tgt_pos, align_features):
        """Return the target word that scores highest against a source word.

        The target text is lower-cased and split into entries by
        ``TMUtilsMatching.segment_2_universal``; each entry is indexed as
        ``entry[0]`` (word) and ``entry[1]`` (POS tag). Every entry gets a
        score equal to the sum of the features named in ``align_features``.
        The entry with the highest score wins; on a tie the earliest entry
        in the target sentence is kept.

        Args:
            un_match: Source-side word; compared with each target word by
                the ``'word_ter'`` feature.
            un_pos: POS tag of ``un_match``; compared with each target tag
                by the ``'posTag'`` feature.
            position: Position of ``un_match``; compared with each target
                index by the ``'position'`` feature.
            tgt_text: Target sentence text (lower-cased before segmentation).
            tgt_pos: Forwarded unchanged to ``segment_2_universal`` together
                with ``tgt_text`` -- presumably the target POS tags.
            align_features: Iterable of feature names. ``'word_ter'``,
                ``'posTag'`` and ``'position'`` are recognised; any other
                name adds nothing to the score.

        Returns:
            ``(word, index)`` for the best-scoring target entry, where
            ``index`` is the entry's position in the segmented target, or
            ``(None, None)`` when segmentation yields no entries.
        """
        tgt_word_pos = TMUtilsMatching.segment_2_universal(
            tgt_text.lower(), tgt_pos, self.tgt_lang)
        if not tgt_word_pos:
            # No target tokens: there is no candidate to return.
            return None, None

        best_word, best_index, best_score = None, None, None
        for index, entry in enumerate(tgt_word_pos):
            score = 0
            for feature in align_features:
                if feature == 'word_ter':  # TER between words
                    score += TMUtilsMatching.ter_distance(un_match, entry[0])
                elif feature == 'posTag':  # Boolean POS-tag comparison
                    score += TMUtilsMatching.pos_bool(un_pos, entry[1])
                elif feature == 'position':  # Word position
                    score += TMUtilsMatching.position_distance(
                        position, index)
            # Strict '>' keeps the first entry among equal scores, which is
            # what a stable descending sort followed by [0] would select.
            if best_score is None or score > best_score:
                best_word, best_index, best_score = entry[0], index, score
        return best_word, best_index
Пример #2
0
    def _align_source_target(self, un_match, un_pos, position, tgt_word_pos,
                             align_features):  #tgt_text, tgt_pos,
        related_words = []
        tgt_dic = {}  # list of pairs of words

        equal_posTag = [[
            position_tgt, word, pos
        ] for position_tgt, [word, pos] in list(enumerate(tgt_word_pos))
                        if pos == un_pos.strip(' ') or pos == 'VERB'
                        or pos == 'NOUN' or pos == 'ADJ']
        #print('*************')
        #print(equal_posTag)
        if not equal_posTag:
            return None, None

        else:
            if 'glossary' in align_features:
                related_words = self.search_exact_value(un_match, 10)
            for i in range(0, len(equal_posTag)):
                value_similarity = 0
                for f in align_features:
                    if f == 'word_ter':  # TER between words
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.un_match_distance(
                                un_match, equal_posTag[i][1]))
                    if f == 'posTag':  # Boolean PosTag
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.pos_bool(
                                un_pos, equal_posTag[i][2]))
                    if f == 'position':  # Word position
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.position_distance(
                                position, equal_posTag[i][0]))
                    if f == 'glossary':  # search word on elasticTM
                        if equal_posTag[i][1] in related_words:
                            is_related = 1
                        else:
                            is_related = 0
                        value_similarity = value_similarity + (
                            0.25 * is_related
                        )  #target_importance(un_word, tgt_word_pos[i][0], segment,best_segments)
                # Dictionary have the target word and the position of the word in the target sentence --> Low is the best
                tgt_dic[(equal_posTag[i][1],
                         equal_posTag[i][0])] = value_similarity
            tgt_align = sorted(tgt_dic.items(),
                               key=lambda item: item[1],
                               reverse=True)[0]  # Select the highest score
            print(
                sorted(tgt_dic.items(), key=lambda item: item[1],
                       reverse=True))
            if tgt_align[1] > G_CONFIG.get_src_tgt_threshold():
                return tgt_align[0][0], tgt_align[0][1]
            else:
                return None, None