示例#1
0
    def _preprocess(self, text, lang):
        """Build a dict of query representations used by the matcher.

        :param text: raw query text, possibly containing XML-like tags
        :param lang: language code passed to the POS tagger / universal mapping
        :return: dict with keys 'query' (tag-stripped text), 'tokenizer',
                 'pos', 'universal' and 'query_re' (regex-preprocessed form)
        """
        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        if not s_tags:
            dic_query['query'] = text
        else:
            dic_query['query'] = XmlUtils.strip_tags(
                text)  # split tag to do the match

        # Tokenize with the source language; POS-tag with the caller-supplied lang.
        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], self.src_lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'],
                                                       lang, 'pos_tagger', {})
        dic_query['universal'] = TMUtilsMatching.segment_2_universal(
            dic_query['tokenizer'].lower(), dic_query['pos'],
            lang)  # universal_text[0]
        # NOTE(review): this overwrites the universal mapping computed just
        # above, making the segment_2_universal call a dead store -- confirm
        # which value is actually intended here.
        dic_query['universal'] = dic_query['pos']

        regex_class = TMRegexMatch(
            self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], self.src_lang, 'reg_exp',
            regex_class.re_pp)
        return dic_query
示例#2
0
 def check_query_parameters(self):
   """Return the query text, its tokenized form and its POS tags.

   Tokenization and POS tagging are computed lazily: once present in
   self.query_dic they are reused on every subsequent call.
   """
   if 'pos' not in self.query_dic:  # POS (and its tokenizer prerequisite) computed only the first time
     if 'tokenizer' not in self.query_dic:
       stripped_query = XmlUtils.replace_tags(self.query_dic['query'])
       self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(stripped_query, self.src_lang, 'tokenizer', {})
     self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
   return (self.query_dic['query'],
           self.query_dic['tokenizer'],
           self.query_dic['pos'])
示例#3
0
  def _preprocess(self):
    """Normalize the query: uniform tag names, then regex pre-processing."""
    self.query_dic['query'] = self.query

    # Uniform tags: "Yo tengo un <b>gato</b>." --> "Yo tengo un <T1>gato</T1>"
    if re.search("<.*>", self.query):
      uniform = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})
      self.query_dic['query_tags'] = uniform
      self.query_dic['query'] = uniform  # query now carries the <T1>-style tags

    # Regex pre-processing only when the pipe asks for it; otherwise pass through.
    if 'regex' in self.pipe:
      self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else:
      self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])

    return self.query_dic
示例#4
0
 def _deals_output(self, segment, editD, trans_segments, status_tokenizer, status):
   """Append a matched segment to the output list, fixing tokenization per output mode.

   :return: (trans_segments, status) where status is 'break' or 'continue'
   """
   if self.out == 'moses': # Moses output is tokenizer
     # Moses expects tokenized text; tokenize now if it has not been done yet.
     if status_tokenizer == False:
       segment.source_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})
       segment.target_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})
     trans_segments.append((segment, editD))
     return trans_segments, 'break'
   # TM output is untokenized; undo tokenization when it was applied.
   if status_tokenizer == True:
     segment.target_text = TMUtilsMatching.pre_process(segment.target_text.split(' '), self.tgt_lang, 'untokenizer', {})
     segment.source_text = TMUtilsMatching.pre_process(segment.source_text.split(' '), self.src_lang, 'untokenizer', {})
   trans_segments.append((segment, editD))
   status = 'break' if status == 'translate' else 'continue'
   #if editD == 100: # Add this if to obtain better matching time
   #  status = 'break'
   logging.info("Final Output (Query -- Source -- Target): {} {} {}".format(safe_str(self.query_dic['query'] + ' -- '), safe_str(segment.source_text + ' -- '), safe_str(segment.target_text)))
   return trans_segments, status
示例#5
0
文件: TMSplit.py 项目: MittagQI/nectm
    def clause_chunk(self, text):
        """Split a POS-tagged segment into clauses by applying chunk rules in order.

        :param text: list of (word, pos) pairs for one segment
        :return: the list of chunks produced by the rule named 'last'
        """
        dicSegment_Rules = {}
        # Check if need to transfer to universal posTag
        if self.class_lang == 'generic_geral' and self.lang.upper(
        ) not in TMUtilsMatching.pre_process(' ', self.lang.upper(),
                                             'get_lang_universalPOS', {}):
            text_Universal = TMUtilsMatching.pre_process(
                [[[word, pos] for word, pos in text]], self.lang,
                'universal_pos_tagger', {})

            if not text_Universal:  # If there is some problem with the universal posTag, return the segment unsplit
                return [Tree('S', text)]

            text = [(word, pos) for word, pos in text_Universal]

        # Run each rule in the configured order
        for r in self.order:
            if r == 'initial':
                lSentences = [
                    text
                ]  # --> initial list of segments to be processed
            else:
                chunkO = RegexpChunkParser(
                    self.rules[r].get_rule_obj(),
                    chunk_label='splitPoint',
                    root_label='S')  # Create chunk Object

                # Apply the rule recursively until a fixed point: stop when a
                # pass no longer increases the number of chunks.
                lChunk_Segments = lSentences

                len_actual = 0  #--> Control the split number
                len_previous = len(lSentences)

                while len_actual != len_previous:
                    len_previous = len(lChunk_Segments)
                    lChunk_Segments = self._recursive_rule(
                        lChunk_Segments, chunkO)
                    len_actual = len(lChunk_Segments)

                dicSegment_Rules[r] = lChunk_Segments
                lSentences = lChunk_Segments  # --> load all chunks obtained by this rule
        self.timer.print()
        # NOTE(review): assumes self.order always contains a rule named 'last' -- verify
        return dicSegment_Rules['last']
示例#6
0
  def _validate_pipe(self, pipe):
    """Probe the matchers required by *pipe* and drop the unsupported steps.

    Tries to build the regex matcher, the tokenizer output and the posTag
    matcher for the current language pair; each step that fails is removed
    from *pipe* (posTag also depends on the tokenizer, so a tokenizer failure
    drops posTag as well).

    :param pipe: list of transformation names (e.g. ['regex', 'tags', 'posTag'])
    :return: (match_process, pipe) where match_process maps step name to its
             matcher object (None when unavailable)
    """
    match_process = {
      'regex': None,
      'posTag': None,
      'tags': TMTags()
    }

    try:
      match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
      logging.info("Loading regex for matching")
    except ValueError:
      if 'regex' in pipe:
        pipe.remove('regex')
        logging.info("Unsupported regex for matching")

    query_out_tags = XmlUtils.replace_tags(self.query)

    try:
      if 'tokenizer' not in self.query_dic:
        self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
      logging.info("Loading Tokenizer for {}".format(self.src_lang))

      try:
        if 'pos' not in self.query_dic:
          self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
        match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
        # Fixed log message: this branch loads posTag, not regex.
        logging.info("Loading posTag for matching")
      except Exception:
        if 'posTag' in pipe:
          pipe.remove('posTag')
          logging.info("Unsupported posTag for matching")
    except Exception:
      # Tokenizer failed; posTag needs tokenized input, so drop it too.
      if 'posTag' in pipe:
        pipe.remove('posTag')
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))

    return match_process, pipe
示例#7
0
文件: TMDbApi.py 项目: MittagQI/nectm
    def _prepare_target_text(self, query, segment, translation, source_lang,
                             target_lang):
        segment.source_text = query
        segment.domain = []
        segment.file_name = []

        if re.search("</?[^<>]+/?>",
                     query) is not None:  # If there are tags on query
            tgt_tags = TMUtilsMatching.transfer_tags(
                segment.source_text, translation, (source_lang, target_lang))
            segment.target_text = TMUtilsMatching.pre_process(
                tgt_tags.split(' '), target_lang, 'untokenizer', {})
        else:
            segment.target_text = translation.strip('\n')
        logging.info("Translate less minumum_match : {} {}".format(
            segment.source_text + ' -- ', translation))

        return segment
示例#8
0
  def _match_rank(self, best_segments):
    """Score each candidate segment against the query and rank them.

    Extends each candidate in-place with its regex-processed source forms and
    returns (candidate, score) pairs sorted by descending edit-distance score.
    """
    self.timer.start("rank segments")

    # Compare against the tag-reduced query when the query carries tags.
    if 'query_tags' in self.query_dic:
      query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags'])  # "Yo tengo un <T1>gato</T1>." -> "Yo tengo un T gato T."
    else:
      query = self.query_dic['query']

    scores = []
    for idx in range(len(best_segments)):
      candidate = best_segments[idx]
      tm_segment = candidate[0]
      # Simplify <T1>-style tags in the TM source so it is comparable.
      source = tm_segment.source_text
      if re.search("</?T[0-9]*/?>", source):
        source = TMUtilsMatching.reduce_tags(source)
      # Regex pre-processing only when requested by the pipe.
      if 'regex' in self.pipe:
        source_re = TMUtilsMatching.pre_process(source, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
      else:
        source_re = source
      source_re_reduce = TMRegexMatch.simplified_name(source_re)
      best_segments[idx] = (tm_segment, candidate[1], source_re, source_re_reduce)
      # Edit distance over both raw and regex-simplified forms.
      scores.append(self._tm_edit_distance(query, source, self.query_dic['query_re_reduce'], source_re_reduce))

    self.timer.stop("rank segments")
    return sorted(zip(best_segments, scores), key=operator.itemgetter(1), reverse=True)
示例#9
0
  def _match(self):
    """Query the TM for every split part and join the results back together.

    :return: (joined_source, joined_target, overall_match) where overall_match
             is the floor of the average per-part match score
    """

    #Create dictionary con query info (posTag and universal)

    if self.split_type == 'sentence':
      list_info_query = [{'tokenizer': self.list_query[j]} for j in range(0, len(self.list_query))]
    else:
      list_info_query = [{'tokenizer': self.list_query[j], 'pos': self.list_pos [j]} for j in range(0, len(self.list_query))]

    # Query Elasticsearch --> out=moses to return only one segment
    l_best_segments = self.tmdb_api.query([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in self.list_query], list_info_query, (self.src_lang, self.tgt_lang), pipe=['regex', 'tags', 'posTag'], out='moses', limit=5, domains=None, min_match=80, concordance=False, aut_trans=False, exact_length=False)

    join_source = ''
    join_target = ''
    total_match = 0
    for i in range(0, len(l_best_segments)):
      if l_best_segments[i]:
        # TM hit: take the best (single) returned segment and its score.
        segment, match = l_best_segments[i][0]
        join_source = join_source + ' ' + segment.source_text
        join_target = join_target + ' ' + segment.target_text
      else:
          # No hit: copy the query part verbatim on both sides, score 0.
          join_source = join_source + ' ' + self.list_query[i]
          join_target = join_target + ' ' + self.list_query[i]
          match = 0
      total_match = total_match + match

      if self.split_type == 'phrase':
        # Re-insert the separator mark that followed this phrase, if any.
        if self.list_marks:
          if self.list_marks[0]:
            mark = self.list_marks.pop(0)[0]
            join_source = join_source + ' ' + mark
            join_target = join_target + ' ' + mark

    # Average of the per-part scores.
    total_match = total_match/len(self.list_query)
    #print(join_source + ' ---- ' + join_target + ' ---- ' + str(total_match))
    return join_source, join_target, int(math.floor(total_match))
示例#10
0
  def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    """Run the match pipe (regex / tags / posTag, plus optional split) on one TM candidate.

    :param segment: TM candidate with source/target text and optional POS annotations
    :param src_re: regex-preprocessed form of the candidate's source text
    :param src_re_reduce: simplified form of src_re
    :param ini_editD: initial edit-distance score for the candidate
    :param align_features: alignment features used by the posTag step
    :param equal: True when the caller believes the query equals the source
    :return: (segment, editD, status, equal, status_tokenizer) where status is
             '', 'find' or 'break' and status_tokenizer tells whether the
             texts were left tokenized
    """
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''

    editD = ini_editD
    status_tokenizer = False
    # Fast path: exact equality with the query needs no transformation.
    if equal:
      if self.query == src_text:
        return segment, editD, 'find', equal, status_tokenizer
      else:
        equal = False
    if not equal:
      for op in self.pipe: #Indicate by parameters
        if op == 'regex':
          if self.query_dic['query'] != self.query_dic['query_re']: # If query has regex   #and not TMMatching.check_upper_equal(self.query_dic['query'], self.query_dic['query_re'])
              logging.info("Applied Regex")
              self.timer.start("_regx_match")
              # ************************** Compare query_re with src_re --> simplified
              match = ini_editD
              if src_re != src_text:
                if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With simplified regular expression and in lowercase
                  match = 100  # Perfect match
                tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                ini_editD = self._tm_edit_distance(self.query_dic['query'],src_text, self.query_dic['query_re_reduce'], src_re_reduce) #match
                logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
              if match == 100:
                status = 'find'
              self.timer.stop("_regx_match")
        if op == 'tags':
          logging.info("Delete Tags")
          self.timer.start("_tags_match")
          src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
          logging.info("After applied Tags: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("_tags_match")

        if op == 'posTag':
          self.timer.start("fuzzy_match")
          upper = False
          if segment.source_pos is not None and segment.target_pos is not None:  # This part need the pos tagger annotation
            squery, tok_query, pos_query = self.check_query_parameters()
            logging.info("Apply posTag matching")
            self.timer.start("fuzzy_preprocess")
            if status_tokenizer == False:  # Tokenize source and target
              tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tgt
              src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
              self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
              status_tokenizer = True

            if 'universal' not in self.query_dic:
              self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
            #print(self.query_dic['universal'])
            src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] tm_src segment
            tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] tm_tgt segment

            self.timer.stop("fuzzy_preprocess")
            if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):

              logging.info("Check unmatch word --> PosTag")
              if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                logging.info("Query and source have same length or only one difference")

                self.timer.start("search unmatch")
                tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                self.timer.stop("search unmatch")
                logging.info("Unmatch word and operation: {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                self.timer.start("create target unmatch")

                if src_un_match is not None:
                  # Create new src
                  src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                  # Improve edit distance
                  src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                  src_re_reduce = TMRegexMatch.simplified_name(src_re)
                  penalize_match = self._improve_match(src_un_match, operation)
                  ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match  # match
                  # Create new tgt
                if tgt_un_match is not None:
                  tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)  # tgt_word,
                self.timer.stop("create target unmatch")
                logging.info("After applied posTag: {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("fuzzy_match")

        # Check if find or break some transformation
        if ini_editD > editD:
          editD = ini_editD
        if status == 'find' or status == 'break':
          segment.source_text = src_text
          segment.target_text = tgt_text
          return segment, editD, status, equal, status_tokenizer
      if editD >= self.min_match:
        segment.source_text = src_text
        segment.target_text = tgt_text
        status = 'find'
      else:
        #Call split rules
        if 'split' in self.pipe and not self.trans_segments: # Applied split if exist posTagger for source language  and self.query_dic['pos']

          src_text = None
          tgt_text = None
          editSplit = 0

          # Split by sentences.
          list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
          logging.info("split by Sentences : {} ".format(list_sentences))

          # Check sentence first
          if len(list_sentences) > 1:

            split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
            src_text, tgt_text, editSplit = split_match._match()
            #print('*****Only sentences *****')
            #print(src_text)
            #print(tgt_text)
            #print(editSplit)

          if editSplit >= self.min_match:  # Check if split method return segments from ActivaTM
            segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

          else: # Split in small phrase
            # Check if exist split for an especific pairs of languages
            lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)

            if lang_class:
              logging.info("Split Query by Phrase")
              all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)

              # Check if any split rule was applied
              if len(all_split) > 1:
                  # print(list_query_split)
                split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                src_text, tgt_text, editSplit = split_match._match()

                if editSplit >= self.min_match: #Check if split method return segments from ActivaTM
                  segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

        if editD >= self.min_match:
          status = 'find'
          status_tokenizer = True
        else:
          if not self.trans_segments:  #If doesn't found any match, prepare segment to automatic translation. If there aren't automatic translation, then return []
            #logging.info("Prepare Automatic Translation : ")
            self.trans_segments.append((segment, editD))
          status = 'break' # If exist segment on the list, break the for and there aren't translation
    return segment, editD, status, equal, status_tokenizer
示例#11
0
    def execute(self, threshold, l_best_segments, match_process,
                align_features, concordance):  #, output
        """Rank candidate segments and apply regex/posTag transformations.

        :param threshold: minimum TER score a candidate must reach
        :param l_best_segments: candidate segments returned by the TM query
        :param match_process: transformation names enabled for this run
        :param align_features: alignment features used by the posTag step
        :param concordance: when True, delegate to concordance ranking
        :return: list of (segment, ter) pairs
        """
        self.timer.start("preprocess")
        query_dic = self._preprocess(
            self.query,
            self.src_lang)  # Tokenize, posTag and universal query string
        self.timer.stop("preprocess")

        if concordance:
            return self._match_rank_concordance(l_best_segments)
        else:
            rank_segments = self._match_rank(l_best_segments, threshold)
            trans_segments = []
            # Check if the retrieve segments are 100% match or apply transformations
            for segment in rank_segments:
                #segment = segment[0]
                if segment.source_text == self.query:  # 100% match --> Return match considering domain
                    ter = 100
                    # Mirror the query's casing on an exact match.
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                    #trans_segments.append((segment,ter))
                else:
                    #Pre-process source and target
                    tgt_text = TMUtilsMatching.pre_process(
                        segment.target_text, self.tgt_lang, 'tokenizer',
                        {})  # Pre-process tgt
                    src_text = TMUtilsMatching.pre_process(
                        segment.source_text, self.src_lang, 'tokenizer',
                        {})  # Tokenize tm_src
                    if 'regex' in match_process:
                        if (query_dic['tokenizer'] == query_dic['query_re']):
                            ter = TMUtilsMatching._ter_score(
                                query_dic['tokenizer'],
                                src_text)  # Regex did't applied on query
                        else:
                            self.timer.start("_regx_match")
                            tgt_text, src_text, ter = self._regx_match(
                                query_dic, src_text, tgt_text
                            )  #, segment.source_pos, segment.target_pos
                            self.timer.stop("_regx_match")
                            logging.info(
                                "Applied Regex Segment: {} {} {}".format(
                                    tgt_text, src_text, str(ter)))
                    else:
                        ter = TMUtilsMatching._ter_score(
                            query_dic['tokenizer'],
                            src_text)  # Regex did't enter as a parameter
                    # Candidates below the threshold are discarded.
                    if ter < threshold:
                        logging.info("TER less threshold: {} ".format(
                            str(ter)))
                        continue
                    if 'posTag' in match_process and ter != 100:  #Check segments with only one difference
                        if segment.source_pos is not None and segment.target_pos is not None:  #This part need the pos tagger annotation
                            self.timer.start("fuzzy_match")
                            #target_word (to D, R, or I), target_position, operation(R I or D),src_un_match(some time have source or query information)
                            tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(
                                query_dic, tgt_text, src_text,
                                segment.source_pos, segment.target_pos,
                                align_features)

                            logging.info("Un_match: {} {} ".format(
                                tgt_word, operation))

                            if src_un_match is not None:
                                src_text = self._create_target_expression(
                                    src_text, src_position, operation,
                                    src_un_match, 'source')  #src_un_match,
                                # src_text = src_text.split(' ')
                                # if operation == 'R':
                                #   src_text[int(src_position.split(' _ ')[1])] = tgt_word
                                # if operation == 'I':
                                #   new_src_text = src_text[:int(src_position)] + [src_un_match] + src_text[int(src_position):]
                                #   #new_src_text.append(src_un_match)
                                #   #new_src_text = new_src_text + src_text[int(src_position):]
                                #   src_text = new_src_text
                                # if operation == 'D':
                                #   src_text.pop(int(src_position))
                                # src_text = ' '.join(src_text)
                            if tgt_word is not None:
                                tgt_text = self._create_target_expression(
                                    tgt_text, tgt_position, operation,
                                    src_un_match, 'target')  #tgt_word,

                                self.timer.stop("fuzzy_match")
                    # Return untokenized text to the caller.
                    segment.source_text = TMUtilsMatching.pre_process(
                        src_text.split(' '), self.src_lang, 'untokenizer', {})
                    segment.target_text = TMUtilsMatching.pre_process(
                        tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
                    logging.info("Target segment: {}".format(
                        segment.target_text))
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                        segment.target_text = segment.target_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                        segment.target_text = segment.target_text.lower()
                trans_segments.append((segment, ter))
            return trans_segments
示例#12
0
        # Tokenizer
        tok = TMTokenizer(lang.upper()).tokenizer
        pos = TMPosTagger(lang.upper())

        for eline in file.readlines():
            tok_sentences = tok.process(eline)
            print(tok_sentences)
            pos_sentence = [
                element
                for word, element in pos.tag_segments([tok_sentences])[0]
            ]

            # Split several steps

            list_sentences = TMUtilsMatching.pre_process(
                tok_sentences, args.source, 'split_sentences', {})
            #print('+++++++++++++++++')
            #print(list_sentences)

            list_word_pos = []
            if list_sentences:
                i = 0
                for each_sent in list_sentences:
                    # Create word_pos
                    len_e = len(each_sent.split())
                    list_word_pos.append([
                        (w, p)
                        for w, p in zip(each_sent.split(), pos_sentence[i:i +
                                                                        len_e])
                    ])
                    i = i + len_e