Пример #1
0
    def _align_source_target(self, un_match, un_pos, position, tgt_text,
                             tgt_pos, align_features):
        """Pick the target word that best aligns with an unmatched word.

        Each word of the target segment gets a score that is the sum of the
        features enabled in ``align_features``:

        * ``'word_ter'``: ``TMUtilsMatching.ter_distance`` between
          ``un_match`` and the target word.
        * ``'posTag'``: ``TMUtilsMatching.pos_bool`` between ``un_pos`` and
          the target word's tag.
        * ``'position'``: ``TMUtilsMatching.position_distance`` between
          ``position`` and the target word's index.

        Any other feature name contributes nothing to the score.

        Args:
            un_match: The unmatched word to align.
            un_pos: Tag of the unmatched word.
            position: Position of the unmatched word in its own segment.
            tgt_text: Target segment text; it is lower-cased before tagging.
            tgt_pos: Tag annotation of the target segment, forwarded to
                ``TMUtilsMatching.segment_2_universal``.
            align_features: Iterable of feature names to add up.

        Returns:
            A ``(word, index)`` tuple for the highest-scoring target word.
            When several words share the best score, the one that appears
            first in the target wins. ``(None, None)`` is returned when the
            target yields no words to align against.
        """
        tgt_word_pos = TMUtilsMatching.segment_2_universal(
            tgt_text.lower(), tgt_pos, self.tgt_lang)
        if not tgt_word_pos:
            # Nothing to align against: report "no alignment" instead of
            # failing while selecting the best candidate.
            return None, None

        def similarity(index):
            """Return the summed feature score of the target word at index."""
            candidate = tgt_word_pos[index]
            total = 0
            for feature in align_features:
                if feature == 'word_ter':  # TER between words
                    total = total + TMUtilsMatching.ter_distance(
                        un_match, candidate[0])
                elif feature == 'posTag':  # Boolean tag agreement
                    total = total + TMUtilsMatching.pos_bool(
                        un_pos, candidate[1])
                elif feature == 'position':  # Word position
                    total = total + TMUtilsMatching.position_distance(
                        position, index)
            return total

        # max() keeps the first of several equal maxima, which is the same
        # winner a stable descending sort would put at index 0.
        best_index = max(range(len(tgt_word_pos)), key=similarity)
        return tgt_word_pos[best_index][0], best_index
Пример #2
0
    def _preprocess(self, text, lang):
        """Build the dictionary of pre-processed views of a query text.

        Args:
            text: Raw query text, possibly containing XML tags.
            lang: Language passed to the POS tagger.

        Returns:
            A dict with these keys:

            * ``'query'``: ``text`` with tags stripped when
              ``XmlUtils.extract_tags`` finds any, otherwise ``text`` itself.
            * ``'tokenizer'``: result of the ``'tokenizer'`` pre-processing
              step applied to ``'query'``.
            * ``'pos'``: result of the ``'pos_tagger'`` step applied to
              ``'tokenizer'``.
            * ``'universal'``: the very same value as ``'pos'``.
            * ``'query_re'``: result of the ``'reg_exp'`` step applied to
              ``'tokenizer'``.
        """
        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        # Matching is done on plain text, so tags are removed when present.
        dic_query['query'] = XmlUtils.strip_tags(text) if s_tags else text

        # NOTE(review): the tokenizer and regex steps use self.src_lang while
        # the tagger uses ``lang`` -- confirm the two are meant to differ.
        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], self.src_lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], lang, 'pos_tagger', {})
        # 'universal' mirrors the tagger output. A segment_2_universal()
        # result used to be stored here first, but it was overwritten by this
        # assignment before anyone could read it, so that call was dropped.
        dic_query['universal'] = dic_query['pos']

        regex_class = TMRegexMatch(
            self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], self.src_lang, 'reg_exp',
            regex_class.re_pp)
        return dic_query
Пример #3
0
    def process(self, query_dic, tgt_text, src_text, src_pos, tgt_pos,
                align_features):
        """Find the single word edit that turns a TM source into the query.

        The query is paired word-by-word with its tags and compared with the
        tagged TM source. When ``TMUtilsMatching.len_compare`` accepts the
        pair and the tokenized query differs from ``src_text``, one of three
        operations is chosen from the relative lengths:

        * ``'R'`` (replace): same length.
        * ``'I'`` (insert): the query is longer than the source.
        * ``'D'`` (delete): the source is longer than the query.

        Args:
            query_dic: Query dictionary; ``'tokenizer'`` and ``'universal'``
                are read and both are split on single spaces.
            tgt_text: TM target text.
            src_text: TM source text.
            src_pos: Tag annotation of the TM source.
            tgt_pos: Tag annotation of the TM target.
            align_features: Feature names forwarded to
                ``_align_source_target``.

        Returns:
            ``(tgt_word, tgt_position, operation, un_match, src_position)``.
            All five are ``None`` when no operation applies.

        Raises:
            IndexError: If the tokenized query has fewer space-separated
                items than ``query_dic['universal']``.
        """
        tgt_word = None
        tgt_position = None
        operation = None
        un_match = None
        src_position = None

        # [word, tag] pairs of the TM source segment.
        src_word_pos = TMUtilsMatching.segment_2_universal(
            src_text.lower(), src_pos, self.src_lang)

        # Split once up front instead of re-splitting on every iteration.
        query_words = query_dic['tokenizer'].lower().split(' ')
        query_tags = query_dic['universal'].split(' ')
        # Indexing (not zip) is deliberate: a query with fewer words than
        # tags must keep failing loudly instead of being truncated.
        query_universal = [[query_words[i], tag]
                           for i, tag in enumerate(query_tags)]

        # Evaluate the comparison once and reuse it for the log and the test.
        comparable = TMUtilsMatching.len_compare(query_universal,
                                                 src_word_pos)
        logging.info("Differences between PosTag: {} ".format(comparable))
        if comparable is not True or query_dic['tokenizer'] == src_text:
            return tgt_word, tgt_position, operation, un_match, src_position

        if len(query_universal) == len(src_word_pos):
            # Replace. Per the original note the unmatch values come back as
            # "<query part>_<source part>" -- TODO confirm with
            # TMPosMatch._get_src_unmatch.
            operation = 'R'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(
                query_universal, src_word_pos)
            if un_match is not None:
                # Only the aligned position is needed: the word placed in
                # the target is the first part of un_match.
                _, tgt_position = self._align_source_target(
                    un_match.split('_')[1],
                    un_pos.split('_')[1],
                    src_position.split('_')[1], tgt_text, tgt_pos,
                    align_features)
                tgt_word = un_match.split('_')[0]
        elif len(query_universal) > len(src_word_pos):
            # Insert: the extra query word goes into the target as-is.
            operation = 'I'
            un_match, _, src_position = TMPosMatch._get_src_unmatch(
                query_universal, src_word_pos)
            tgt_word = un_match
            tgt_position = src_position
        else:
            # Delete: locate the target word aligned with the extra source
            # word.
            operation = 'D'
            un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(
                src_word_pos, query_universal)
            if un_match is not None:
                tgt_word, tgt_position = self._align_source_target(
                    un_match, un_pos, src_position, tgt_text, tgt_pos,
                    align_features)
        return tgt_word, tgt_position, operation, un_match, src_position
Пример #4
0
  def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    """Run the configured match pipeline against one translation-memory segment.

    The operations listed in ``self.pipe`` ('regex', 'tags', 'posTag') are
    applied in order. Each one may rewrite the working source/target text and
    produce a new score; the highest score seen so far is kept and finally
    compared with ``self.min_match``. When the score stays below that
    threshold, 'split' is in ``self.pipe`` and ``self.trans_segments`` is
    empty, the query is split into sentences, then into phrases, and matched
    through ``TMSplitMatch``.

    Side effects: ``segment.source_text`` / ``segment.target_text`` may be
    overwritten, ``self.query_dic`` may gain the 'query_re_reduce_tok' and
    'universal' entries, and the segment is appended to
    ``self.trans_segments`` when nothing matched and that list is empty.

    Args:
      segment: TM segment. ``source_text`` and ``target_text`` are read and
        may be replaced; ``source_pos`` and ``target_pos`` are read by the
        'posTag' step, which is skipped when either is None.
      src_re: Regex-processed form of the segment source; compared with the
        source text to decide whether the 'regex' step transforms anything.
      src_re_reduce: Simplified form of ``src_re``; compared case-insensitively
        with ``self.query_dic['query_re_reduce']``.
      ini_editD: Score of the segment before any transformation (higher is
        better; 100 is treated as a perfect match by the 'regex' step).
      align_features: Forwarded to ``self._combine_feature_match``.
      equal: When truthy, the segment is first tested for an exact match
        with ``self.query``.

    Returns:
      ``(segment, editD, status, equal, status_tokenizer)`` where ``status``
      is 'find' or 'break', ``equal`` is False once the exact-match test has
      failed, and ``status_tokenizer`` is True when the 'posTag' step
      tokenized the texts or when the split fallback produced a match.
    """
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''

    editD = ini_editD
    status_tokenizer = False
    if equal:
      if self.query == src_text:
        return segment, editD, 'find', equal, status_tokenizer
      equal = False

    for op in self.pipe:  # Operations are chosen by configuration
      if op == 'regex':
        if self.query_dic['query'] != self.query_dic['query_re']:  # The query contains a regex pattern
          logging.info("Applied Regex")
          self.timer.start("_regx_match")
          # Compare the simplified query regex with the simplified source regex
          match = ini_editD
          if src_re != src_text:
            if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # Simplified regular expressions, in lowercase
              match = 100  # Perfect match
            tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
            ini_editD = self._tm_edit_distance(self.query_dic['query'], src_text, self.query_dic['query_re_reduce'], src_re_reduce)
            logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          if match == 100:
            status = 'find'
          self.timer.stop("_regx_match")
      elif op == 'tags':
        logging.info("Delete Tags")
        self.timer.start("_tags_match")
        # The fourth value returned by _match_tags is not used here.
        src_text, tgt_text, status, _, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
        logging.info("After applied Tags: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
        self.timer.stop("_tags_match")
      elif op == 'posTag':
        self.timer.start("fuzzy_match")
        upper = False
        if segment.source_pos is not None and segment.target_pos is not None:  # This step needs the POS annotation
          # Only the tokenized query and its tags are needed from the triple.
          _, tok_query, pos_query = self.check_query_parameters()
          logging.info("Apply posTag matching")
          self.timer.start("fuzzy_preprocess")
          if not status_tokenizer:  # Tokenize source, target and simplified query only once
            tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})
            src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})
            self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})
            status_tokenizer = True

          if 'universal' not in self.query_dic:
            self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
          src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] of the TM source
          tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] of the TM target

          self.timer.stop("fuzzy_preprocess")
          # Continue only when all three tagged views are lists.
          if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):

            logging.info("Check unmatch word --> PosTag")
            if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
              logging.info("Query and source have same length or only one difference")

              self.timer.start("search unmatch")
              tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
              self.timer.stop("search unmatch")
              # Two placeholders, two values: the stray third argument was never printed.
              logging.info("Unmatch word and operation: {} {}".format(safe_str(src_un_match), safe_str(operation)))
              self.timer.start("create target unmatch")

              if src_un_match is not None:
                # Create the new source
                src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                # Re-score the rewritten source, minus the penalty for the edit
                src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                src_re_reduce = TMRegexMatch.simplified_name(src_re)
                penalize_match = self._improve_match(src_un_match, operation)
                ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match
              if tgt_un_match is not None:
                # Create the new target
                tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)
              self.timer.stop("create target unmatch")
              # Third placeholder added so the score is logged like in the other steps.
              logging.info("After applied posTag: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
        self.timer.stop("fuzzy_match")

      # Keep the best score and stop early when a step settled the outcome
      if ini_editD > editD:
        editD = ini_editD
      if status == 'find' or status == 'break':
        segment.source_text = src_text
        segment.target_text = tgt_text
        return segment, editD, status, equal, status_tokenizer

    if editD >= self.min_match:
      segment.source_text = src_text
      segment.target_text = tgt_text
      status = 'find'
    else:
      # Fall back to the split rules, only while no segment is queued for automatic translation
      if 'split' in self.pipe and not self.trans_segments:

        src_text = None
        tgt_text = None
        editSplit = 0

        # Split by sentences.
        list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
        logging.info("split by Sentences : {} ".format(list_sentences))

        # Try whole sentences first
        if len(list_sentences) > 1:
          split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
          src_text, tgt_text, editSplit = split_match._match()

        if editSplit >= self.min_match:  # The sentence split produced a good enough match
          segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit
        else:  # Split into smaller phrases
          # Phrase rules exist only for specific language pairs
          lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)

          if lang_class:
            logging.info("Split Query by Phrase")
            all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)

            # Continue only when some split rule was applied
            if len(all_split) > 1:
              split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
              src_text, tgt_text, editSplit = split_match._match()

              if editSplit >= self.min_match:  # The phrase split produced a good enough match
                segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

      if editD >= self.min_match:
        status = 'find'
        status_tokenizer = True
      else:
        if not self.trans_segments:  # Nothing matched: queue the segment for automatic translation
          self.trans_segments.append((segment, editD))
        status = 'break'  # A segment is already queued (or was just queued): stop without a translation
    return segment, editD, status, equal, status_tokenizer