Example #1
    def process(self, query_dic, src_text, tgt_text):  #, src_pos, tgt_pos

        # pre-process and apply regex to tm_src
        src_re = self.re_pp[self.src_lang].process(src_text)

        if src_text != src_re:  # The regex changed the source, so apply it to the target too

            tgt_re = self.re_pp[self.tgt_lang].process(tgt_text)

            ter = TMUtilsMatching._ter_score(query_dic['query_re'], src_re)
            # Extract (find, replace) pattern values from the query and the TM source (sketched after this function)
            src_query_f, src_query_r = TMRegexMatch._extract_find_replace(
                query_dic['tokenizer'].split(' '),
                query_dic['query_re'].split(' '))
            tgt_query_f = src_query_f.copy()
            tgt_query_r = src_query_r.copy()
            src_f, src_r = TMRegexMatch._extract_find_replace(
                src_text.split(' '), src_re.split(' '))
            ter = ter - len(src_f)
            src_text = TMRegexMatch._replace_values(src_query_f, src_query_r,
                                                    src_re.split(' '), src_f,
                                                    src_r)

            tgt_f, tgt_r = TMRegexMatch._extract_find_replace(
                tgt_text.split(' '), tgt_re.split(' '))
            tgt_text = TMRegexMatch._replace_values(tgt_query_f, tgt_query_r,
                                                    tgt_re.split(' '), tgt_f,
                                                    tgt_r)

        else:
            ter = TMUtilsMatching._ter_score(query_dic['tokenizer'],
                                             src_text)  # Regex was not applied
        return tgt_text, src_text, ter  #, src_pos, tgt_pos
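
A minimal, self-contained sketch of the pattern-transfer step used above. The real helpers (TMRegexMatch._extract_find_replace and TMRegexMatch._replace_values) are not shown in this excerpt, so the behavior below is an assumption for illustration only: the regex pre-processor rewrites some tokens into placeholders, and the query's concrete values are later copied back into the TM segment at those positions.

# Illustration only (not from the original source): assumed behavior of the
# find/replace extraction.
def extract_find_replace_sketch(raw_tokens, re_tokens):
    # Keep only the positions where the regex pre-processor changed a token:
    # 'find' holds the placeholder, 'replace' holds the concrete value.
    find, replace = [], []
    for raw, norm in zip(raw_tokens, re_tokens):
        if raw != norm:
            find.append(norm)
            replace.append(raw)
    return find, replace

# e.g. with a regex that maps numbers to a NUM placeholder:
# extract_find_replace_sketch('pay 42 euros'.split(), 'pay NUM euros'.split())
# -> (['NUM'], ['42'])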
Example #2
    def _match_rank_concordance(self, best_segments):  # , output
        # Score each candidate segment against the query with TER, then return
        # the candidates sorted best-first by that score (a standalone sketch
        # of this pattern follows the function).
        self.timer.start("ter")
        l_ter_score = [
            TMUtilsMatching._ter_score(self.query, segment[0].source_text)
            for segment in best_segments
        ]
        self.timer.stop("ter")
        l_best_sort = sorted(zip(best_segments, l_ter_score),
                             key=operator.itemgetter(1),
                             reverse=True)
        return [(segment[0][0], segment[1]) for segment in l_best_sort]
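
The descending sort above implies that _ter_score returns a similarity (higher is better), which also matches the `ter < threshold` check in execute() below. A standalone sketch of the same rank-by-score pattern, with a hypothetical toy scorer:

import operator

# Illustration only (not from the original source).
def rank_by_score(query, sources, score_fn):
    # Pair each candidate with its score, then sort best-first.
    scores = [score_fn(query, source) for source in sources]
    return sorted(zip(sources, scores), key=operator.itemgetter(1), reverse=True)

# Toy scorer standing in for a TER-based similarity:
# rank_by_score('open the door',
#               ['open a door', 'close the window'],
#               lambda q, s: 100 - abs(len(q) - len(s)))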
Example #3
    def execute(self, threshold, l_best_segments, match_process,
                align_features, concordance):  #, output
        self.timer.start("preprocess")
        query_dic = self._preprocess(
            self.query,
            self.src_lang)  # Tokenize, POS-tag and build the universal query string
        self.timer.stop("preprocess")

        if concordance:
            return self._match_rank_concordance(l_best_segments)
        else:
            rank_segments = self._match_rank(l_best_segments, threshold)
            trans_segments = []
            # Check whether each retrieved segment is a 100% match or needs transformations
            for segment in rank_segments:
                if segment.source_text == self.query:  # 100% match --> Return match considering domain
                    ter = 100
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                else:
                    # Pre-process the source and target texts
                    tgt_text = TMUtilsMatching.pre_process(
                        segment.target_text, self.tgt_lang, 'tokenizer',
                        {})  # Pre-process tgt
                    src_text = TMUtilsMatching.pre_process(
                        segment.source_text, self.src_lang, 'tokenizer',
                        {})  # Tokenize tm_src
                    if 'regex' in match_process:
                        if query_dic['tokenizer'] == query_dic['query_re']:
                            ter = TMUtilsMatching._ter_score(
                                query_dic['tokenizer'],
                                src_text)  # Regex was not applied to the query
                        else:
                            self.timer.start("_regx_match")
                            tgt_text, src_text, ter = self._regx_match(
                                query_dic, src_text, tgt_text
                            )  #, segment.source_pos, segment.target_pos
                            self.timer.stop("_regx_match")
                            logging.info(
                                "Applied Regex Segment: {} {} {}".format(
                                    tgt_text, src_text, str(ter)))
                    else:
                        ter = TMUtilsMatching._ter_score(
                            query_dic['tokenizer'],
                            src_text)  # Regex was not requested in match_process
                    if ter < threshold:
                        logging.info("TER less threshold: {} ".format(
                            str(ter)))
                        continue
                    if 'posTag' in match_process and ter != 100:  # Check segments with only one difference
                        if segment.source_pos is not None and segment.target_pos is not None:  # This step needs the POS tagger annotations
                            self.timer.start("fuzzy_match")
                            # tgt_word (token to replace, insert or delete), tgt_position,
                            # operation ('R', 'I' or 'D'), src_un_match (may carry source
                            # or query information), src_position
                            tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(
                                query_dic, tgt_text, src_text,
                                segment.source_pos, segment.target_pos,
                                align_features)

                            logging.info("Un_match: {} {} ".format(
                                tgt_word, operation))

                            if src_un_match is not None:
                                src_text = self._create_target_expression(
                                    src_text, src_position, operation,
                                    src_un_match, 'source')
                            if tgt_word is not None:
                                tgt_text = self._create_target_expression(
                                    tgt_text, tgt_position, operation,
                                    src_un_match, 'target')

                            # Stop the timer on every path, not only when a target edit was made
                            self.timer.stop("fuzzy_match")
                    segment.source_text = TMUtilsMatching.pre_process(
                        src_text.split(' '), self.src_lang, 'untokenizer', {})
                    segment.target_text = TMUtilsMatching.pre_process(
                        tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
                    logging.info("Target segment: {}".format(
                        segment.target_text))
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                        segment.target_text = segment.target_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                        segment.target_text = segment.target_text.lower()
                trans_segments.append((segment, ter))
            return trans_segments
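
The case-restoration rule appears twice in execute() (once for 100% matches, once for fuzzy matches). A minimal sketch of that rule factored into a standalone helper; the original applies it inline to the segment fields:

# Illustration only (not from the original source).
def match_query_case(query, text):
    # Mirror an all-upper or all-lower query onto the retrieved text;
    # mixed-case queries leave the text unchanged.
    if query.isupper():
        return text.upper()
    if query.islower():
        return text.lower()
    return text

# match_query_case('HELLO WORLD', 'hola mundo')  -> 'HOLA MUNDO'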