def _align_source_target(self, un_match, un_pos, position, tgt_text, tgt_pos, align_features):
    tgt_dic = {}
    # List of [word, pos] pairs for the target segment
    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), tgt_pos, self.tgt_lang)
    for i in range(0, len(tgt_word_pos)):
        value_similarity = 0
        for f in align_features:
            if f == 'word_ter':  # TER between words
                value_similarity = value_similarity + TMUtilsMatching.ter_distance(un_match, tgt_word_pos[i][0])
            if f == 'posTag':  # Boolean PosTag comparison
                value_similarity = value_similarity + TMUtilsMatching.pos_bool(un_pos, tgt_word_pos[i][1])
            if f == 'position':  # Word position
                value_similarity = value_similarity + TMUtilsMatching.position_distance(position, i)
            #if f == 'frequency':  # Frequency of pairs of words
            #    value_similarity = value_similarity + self.target_importance(un_word, tgt_word_pos[i][0], segment, best_segments)
        # The dictionary maps (target word, position in the target sentence) to its accumulated score
        tgt_dic[(tgt_word_pos[i][0], i)] = value_similarity
    tgt_align = sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True)[0]  # Select the highest score
    return tgt_align[0][0], tgt_align[0][1]  # Return the word with the highest score and its position
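# --- Illustrative sketch (not part of the original class) ---
# A minimal, self-contained example of the scoring scheme used by _align_source_target:
# each candidate target word accumulates one score per alignment feature and the
# highest-scoring (word, position) pair is selected. The scorer lambdas below are
# hypothetical stand-ins for TMUtilsMatching.ter_distance / pos_bool / position_distance.
def _example_pick_best_candidate():
    candidates = [('gato', 'NOUN'), ('negro', 'ADJ'), ('duerme', 'VERB')]  # hypothetical [word, pos] pairs
    scorers = {
        'word_ter': lambda word, pos, idx: 1.0 if word == 'gato' else 0.0,  # stand-in for a TER-based score
        'posTag': lambda word, pos, idx: 1.0 if pos == 'NOUN' else 0.0,     # stand-in for the boolean PoS check
        'position': lambda word, pos, idx: 1.0 / (1 + abs(idx - 0)),        # stand-in for the position distance
    }
    scores = {}
    for i, (word, pos) in enumerate(candidates):
        scores[(word, i)] = sum(f(word, pos, i) for f in scorers.values())
    best = sorted(scores.items(), key=lambda item: item[1], reverse=True)[0]
    return best[0]  # -> ('gato', 0)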
def _preprocess(self, text, lang):
    dic_query = {}
    s_tags = XmlUtils.extract_tags(text)
    if not s_tags:
        dic_query['query'] = text
    else:
        dic_query['query'] = XmlUtils.strip_tags(text)  # Strip the tags to do the match
    dic_query['tokenizer'] = TMUtilsMatching.pre_process(dic_query['query'], self.src_lang, 'tokenizer', {})
    dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], lang, 'pos_tagger', {})
    dic_query['universal'] = TMUtilsMatching.segment_2_universal(dic_query['tokenizer'].lower(), dic_query['pos'], lang)  # universal_text[0]
    dic_query['universal'] = dic_query['pos']  # Note: overrides the previous value with the raw PoS string
    regex_class = TMRegexMatch(self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
    dic_query['query_re'] = TMUtilsMatching.pre_process(dic_query['tokenizer'], self.src_lang, 'reg_exp', regex_class.re_pp)
    return dic_query
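# --- Illustrative sketch (not part of the original class) ---
# Hypothetical shape of the dictionary built by _preprocess for an English query;
# the actual values depend on the tokenizer, PoS tagger and regex pre-processing
# configured in TMUtilsMatching / TMRegexMatch.
#   {
#     'query':     'Order 25 units by 2020-01-01',
#     'tokenizer': 'Order 25 units by 2020-01-01',
#     'pos':       'VERB NUM NOUN ADP NUM',
#     'universal': 'VERB NUM NOUN ADP NUM',   # overridden with the PoS string (see above)
#     'query_re':  'Order D units by D'       # hypothetical simplified regex form
#   }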
def process(self, query_dic, tgt_text, src_text, src_pos, tgt_pos, align_features):
    tgt_word = None
    tgt_position = None
    operation = None
    un_match = None
    src_position = None
    # [word, pos] pairs of the TM source segment
    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), src_pos, self.src_lang)  #self._segment_2_universal(segment.source_text, segment.source_pos, self.src_lang)
    query_universal = []
    # Check if the segments are equal or differ by only one word (PosTag)
    query_tok = query_dic['tokenizer'].lower()
    for i in range(0, len(query_dic['universal'].split(' '))):
        query_universal.append([query_tok.split(' ')[i], query_dic['universal'].split(' ')[i]])  #list(zip(query_dic['tokenizer'].split(' '), query_dic['universal'].split(' ')))
    logging.info("Differences between PosTag: {} ".format(TMUtilsMatching.len_compare(query_universal, src_word_pos)))
    if TMUtilsMatching.len_compare(query_universal, src_word_pos) is True and (query_dic['tokenizer'] != src_text):
        # Obtain the unmatched word and its features
        if len(query_universal) == len(src_word_pos):
            operation = 'R'
            # Load the mismatch between query and src --> un_match = un_match_q _ un_match_s
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(query_universal, src_word_pos)  # Replace (query and src)
            if un_match is not None:
                tgt_word, tgt_position = self._align_source_target(un_match.split('_')[1], un_pos.split('_')[1], src_position.split('_')[1], tgt_text, tgt_pos, align_features)
                tgt_word = un_match.split('_')[0]
        elif len(query_universal) > len(src_word_pos):  # Insert a new word in target
            operation = 'I'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(query_universal, src_word_pos)  # Insert --> return word from query
            tgt_word = un_match
            tgt_position = src_position
        else:  # Delete a word in target
            operation = 'D'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(src_word_pos, query_universal)  # Delete --> return word from src
            if un_match is not None:
                tgt_word, tgt_position = self._align_source_target(un_match, un_pos, src_position, tgt_text, tgt_pos, align_features)
    return tgt_word, tgt_position, operation, un_match, src_position
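# --- Illustrative sketch (not part of the original class) ---
# Minimal example of how the edit operation is chosen in process(): equal lengths
# -> 'R' (replace), longer query -> 'I' (insert into target), shorter query ->
# 'D' (delete from target). Purely illustrative; it does not call
# _get_src_unmatch or _align_source_target.
def _example_choose_operation(query_words, src_words):
    if len(query_words) == len(src_words):
        return 'R'
    elif len(query_words) > len(src_words):
        return 'I'
    else:
        return 'D'
# e.g. _example_choose_operation(['the', 'black', 'cat'], ['the', 'cat']) -> 'I'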
def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''
    editD = ini_editD
    status_tokenizer = False
    if equal:
        if self.query == src_text:
            return segment, editD, 'find', equal, status_tokenizer
        else:
            equal = False
    if not equal:
        for op in self.pipe:  # Operations are indicated by parameters
            if op == 'regex':
                if self.query_dic['query'] != self.query_dic['query_re']:  # If the query contains a regex  #and not TMMatching.check_upper_equal(self.query_dic['query'], self.query_dic['query_re'])
                    logging.info("Applied Regex")
                    self.timer.start("_regx_match")
                    # Compare query_re with src_re --> simplified
                    match = ini_editD
                    if src_re != src_text:
                        if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With the simplified regular expression and in lowercase
                            match = 100  # Perfect match
                        tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                        ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)  # match
                    logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                    if match == 100:
                        status = 'find'
                    self.timer.stop("_regx_match")
            if op == 'tags':
                logging.info("Delete Tags")
                self.timer.start("_tags_match")
                src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
                logging.info("After applied Tags: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("_tags_match")
            if op == 'posTag':
                self.timer.start("fuzzy_match")
                upper = False
                if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the PoS tagger annotation
                    squery, tok_query, pos_query = self.check_query_parameters()
                    logging.info("Apply posTag matching")
                    self.timer.start("fuzzy_preprocess")
                    if status_tokenizer == False:  # Tokenize source and target
                        tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tgt
                        src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
                        self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
                        status_tokenizer = True
                    if 'universal' not in self.query_dic:
                        self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
                    #print(self.query_dic['universal'])
                    src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] tm_src segment
                    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] tm_tgt segment
                    self.timer.stop("fuzzy_preprocess")
                    if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):
                        logging.info("Check unmatch word --> PosTag")
                        if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                            logging.info("Query and source have the same length or only one difference")
                            self.timer.start("search unmatch")
                            tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                            self.timer.stop("search unmatch")
                            logging.info("Unmatch word and operation: {} {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                            self.timer.start("create target unmatch")
                            if src_un_match is not None:
                                # Create the new src
                                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                                # Improve the edit distance
                                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                                penalize_match = self._improve_match(src_un_match, operation)
                                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match  # match
                            # Create the new tgt
                            if tgt_un_match is not None:
                                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)  # tgt_word,
                            self.timer.stop("create target unmatch")
                    logging.info("After applied posTag: {} {} {}".format(safe_str(src_text + ' -- '), safe_str(tgt_text + ' -- '), safe_str(ini_editD)))
                self.timer.stop("fuzzy_match")
            # Check if some transformation found a match or should break
            if ini_editD > editD:
                editD = ini_editD
            if status == 'find' or status == 'break':
                segment.source_text = src_text
                segment.target_text = tgt_text
                return segment, editD, status, equal, status_tokenizer
        if editD >= self.min_match:
            segment.source_text = src_text
            segment.target_text = tgt_text
            status = 'find'
        else:
            # Call the split rules
            if 'split' in self.pipe and not self.trans_segments:  # Apply split if a PoS tagger exists for the source language  # and self.query_dic['pos']
                src_text = None
                tgt_text = None
                editSplit = 0
                # Split by sentences
                list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
                logging.info("split by Sentences : {} ".format(list_sentences))
                # Check sentences first
                if len(list_sentences) > 1:
                    split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
                    src_text, tgt_text, editSplit = split_match._match()
                    #print('*****Only sentences *****')
                    #print(src_text)
                    #print(tgt_text)
                    #print(editSplit)
                if editSplit >= self.min_match:  # Check if the split method returned segments from ActivaTM
                    segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
                else:  # Split into smaller phrases
                    # Check if split rules exist for this specific language pair
                    lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)
                    if lang_class:
                        logging.info("Split Query by Phrase")
                        all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)
                        # Check if any split rule was applied
                        if len(all_split) > 1:
                            # print(list_query_split)
                            split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                            src_text, tgt_text, editSplit = split_match._match()
                            if editSplit >= self.min_match:  # Check if the split method returned segments from ActivaTM
                                segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
            if editD >= self.min_match:
                status = 'find'
                status_tokenizer = True
            else:
                if not self.trans_segments:
                    # If no match was found, prepare the segment for automatic translation.
                    # If there is no automatic translation, then return []
                    #logging.info("Prepare Automatic Translation : ")
                    self.trans_segments.append((segment, editD))
                status = 'break'  # If there is a segment on the list, break the for loop; there is no translation
    return segment, editD, status, equal, status_tokenizer
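# --- Illustrative sketch (not part of the original class) ---
# Hypothetical driver showing how a caller might iterate candidate TM segments
# and use the status returned by execute_segment: 'find' accepts the (possibly
# repaired) segment, 'break' stops the search and falls back to the segments
# queued for machine translation. Names and signature are assumptions, not the
# actual ActivaTM API.
def _example_drive_pipeline(matcher, candidates, src_re, src_re_reduce, ini_editD, align_features):
    results = []
    for segment in candidates:
        segment, editD, status, equal, _ = matcher.execute_segment(
            segment, src_re, src_re_reduce, ini_editD, align_features, equal=False)
        if status == 'find':
            results.append((segment, editD))
        elif status == 'break':
            break
    return results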