Example #1
    def _preprocess(self, text, lang):

        dic_query = {}
        s_tags = XmlUtils.extract_tags(text)
        if not s_tags:
            dic_query['query'] = text
        else:
            dic_query['query'] = XmlUtils.strip_tags(
                text)  # strip tags so the match runs on plain text

        dic_query['tokenizer'] = TMUtilsMatching.pre_process(
            dic_query['query'], self.src_lang, 'tokenizer', {})
        dic_query['pos'] = TMUtilsMatching.pre_process(dic_query['tokenizer'],
                                                       lang, 'pos_tagger', {})
        dic_query['universal'] = TMUtilsMatching.segment_2_universal(
            dic_query['tokenizer'].lower(), dic_query['pos'],
            lang)  # universal_text[0]
        dic_query['universal'] = dic_query['pos']  # overwrites the universal tags with the raw POS string

        regex_class = TMRegexMatch(
            self.src_lang, self.tgt_lang)  # Class to improve fuzzy match
        dic_query['query_re'] = TMUtilsMatching.pre_process(
            dic_query['tokenizer'], self.src_lang, 'reg_exp',
            regex_class.re_pp)
        return dic_query
Example #2
 def check_query_parameters(self):
   if 'pos' not in self.query_dic:  # Apply POS and universal tags to the query --> only the first time
     if 'tokenizer' not in self.query_dic:  # POS tagging is the first transformation --> nothing else has been applied yet
       query_out_tags = XmlUtils.replace_tags(self.query_dic['query'])
       self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
     self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
   return self.query_dic['query'], self.query_dic['tokenizer'], self.query_dic['pos']
Example #3
    def _align_source_target(self, un_match, un_pos, position, tgt_text,
                             tgt_pos, align_features):

        tgt_dic = {}  # maps (target word, position) to its similarity score

        tgt_word_pos = TMUtilsMatching.segment_2_universal(
            tgt_text.lower(), tgt_pos, self.tgt_lang)
        for i in range(0, len(tgt_word_pos)):
            value_similarity = 0
            for f in align_features:
                if f == 'word_ter':  # TER between words
                    value_similarity = value_similarity + TMUtilsMatching.ter_distance(
                        un_match, tgt_word_pos[i][0])
                if f == 'posTag':  # Boolean PosTag
                    value_similarity = value_similarity + TMUtilsMatching.pos_bool(
                        un_pos, tgt_word_pos[i][1])
                if f == 'position':  # Word position
                    value_similarity = value_similarity + TMUtilsMatching.position_distance(
                        position, i)
                #if f == 'frequency':  # frequency of pairs of words
                #  value_similarity = value_similarity + self.target_importance(un_word, tgt_word_pos[i][0], segment,best_segments)
            # The dictionary maps (target word, position in the target sentence) to its score
            tgt_dic[(tgt_word_pos[i][0], i)] = value_similarity
        tgt_align = sorted(tgt_dic.items(),
                           key=lambda item: item[1],
                           reverse=True)[0]  # Select the highest score
        return tgt_align[0][0], tgt_align[0][
            1]  # un_word, un_position  # Return the word with the highest score and its position
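
A minimal, self-contained sketch of the selection pattern used above: every candidate target word receives a summed feature score and the highest-scoring (word, position) pair wins. The names pick_best_candidate and feature_fns are illustrative only and do not appear in the original module, which relies on TMUtilsMatching for the real feature functions.

def pick_best_candidate(candidates, feature_fns):
    """candidates: list of (word, pos_tag); feature_fns: callables (word, index) -> float."""
    scores = {}
    for i, (word, pos_tag) in enumerate(candidates):
        # Sum every feature value for this candidate, keyed by (word, position)
        scores[(word, i)] = sum(fn(word, i) for fn in feature_fns)
    best = sorted(scores.items(), key=lambda item: item[1], reverse=True)[0]
    return best[0]  # (word, position) with the highest combined score

# Toy usage: prefer words starting with 'g' and earlier positions
# pick_best_candidate([('gato', 'NOUN'), ('negro', 'ADJ')],
#                     [lambda w, i: 1.0 if w.startswith('g') else 0.0,
#                      lambda w, i: 1.0 / (1 + i)])  # -> ('gato', 0)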
Example #4
    def process(self, query_dic, tgt_text, src_text, src_pos, tgt_pos,
                align_features):

        tgt_word = None
        tgt_position = None
        operation = None
        un_match = None
        src_position = None

        src_word_pos = TMUtilsMatching.segment_2_universal(
            src_text.lower(), src_pos, self.src_lang
        )  #self._segment_2_universal(segment.source_text, segment.source_pos, self.src_lang) # [word, pos] tm_src segment
        query_universal = []
        # Check whether the segments are equal or differ by only one word (POS tag)
        query_tok = query_dic['tokenizer'].lower()
        for i in range(0, len(query_dic['universal'].split(' '))):
            query_universal.append([
                query_tok.split(' ')[i], query_dic['universal'].split(' ')[i]
            ])
        #list(zip(query_dic['tokenizer'].split(' '), query_dic['universal'].split(' ')))
        logging.info("Differences between PosTag: {} ".format(
            TMUtilsMatching.len_compare(query_universal, src_word_pos)))
        if TMUtilsMatching.len_compare(
                query_universal,
                src_word_pos) is True and (query_dic['tokenizer'] != src_text):
            # Obtain un_match word and its features

            if len(query_universal) == len(src_word_pos):
                operation = 'R'  # Load the unmatch between query and src --> un_match = un_match_q _ un_match_s
                un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(
                    query_universal, src_word_pos)  # Replace (query and src)
                if un_match is not None:
                    tgt_word, tgt_position = self._align_source_target(
                        un_match.split('_')[1],
                        un_pos.split('_')[1],
                        src_position.split('_')[1], tgt_text, tgt_pos,
                        align_features)
                    tgt_word = un_match.split('_')[0]
            elif len(query_universal) > len(
                    src_word_pos):  # Insert a new word in target
                operation = 'I'
                un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(
                    query_universal,
                    src_word_pos)  # Insert --> return word from query
                tgt_word = un_match
                tgt_position = src_position
            else:  # Delete a new word in target
                operation = 'D'
                un_match, un_pos, src_position = TMPosMatch._get_src_unmatch(
                    src_word_pos,
                    query_universal)  # Delete --> return word from src
                if un_match is not None:
                    tgt_word, tgt_position = self._align_source_target(
                        un_match, un_pos, src_position, tgt_text, tgt_pos,
                        align_features)
        return tgt_word, tgt_position, operation, un_match, src_position
Example #5
    def _align_source_target(self, un_match, un_pos, position, tgt_word_pos,
                             align_features):  #tgt_text, tgt_pos,
        related_words = []
        tgt_dic = {}  # maps (target word, position) to its combined score

        equal_posTag = [[
            position_tgt, word, pos
        ] for position_tgt, [word, pos] in list(enumerate(tgt_word_pos))
                        if pos == un_pos.strip(' ') or pos == 'VERB'
                        or pos == 'NOUN' or pos == 'ADJ']
        #print('*************')
        #print(equal_posTag)
        if not equal_posTag:
            return None, None

        else:
            if 'glossary' in align_features:
                related_words = self.search_exact_value(un_match, 10)
            for i in range(0, len(equal_posTag)):
                value_similarity = 0
                for f in align_features:
                    if f == 'word_ter':  # TER between words
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.un_match_distance(
                                un_match, equal_posTag[i][1]))
                    if f == 'posTag':  # Boolean PosTag
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.pos_bool(
                                un_pos, equal_posTag[i][2]))
                    if f == 'position':  # Word position
                        value_similarity = value_similarity + (
                            0.25 * TMUtilsMatching.position_distance(
                                position, equal_posTag[i][0]))
                    if f == 'glossary':  # search word on elasticTM
                        if equal_posTag[i][1] in related_words:
                            is_related = 1
                        else:
                            is_related = 0
                        value_similarity = value_similarity + (
                            0.25 * is_related
                        )  #target_importance(un_word, tgt_word_pos[i][0], segment,best_segments)
                # The dictionary maps (target word, position in the target sentence) to its combined score
                tgt_dic[(equal_posTag[i][1],
                         equal_posTag[i][0])] = value_similarity
            tgt_align = sorted(tgt_dic.items(),
                               key=lambda item: item[1],
                               reverse=True)[0]  # Select the highest score
            print(
                sorted(tgt_dic.items(), key=lambda item: item[1],
                       reverse=True))
            if tgt_align[1] > G_CONFIG.get_src_tgt_threshold():
                return tgt_align[0][0], tgt_align[0][1]
            else:
                return None, None
Example #6
  def _preprocess(self):
    self.query_dic['query'] = self.query

    if re.search("<.*>", self.query):  # Uniform tags --> # Yo tengo un <b>gato</b>. --> Yo tengo un <T1>gato</T1>
      self.query_dic['query_tags'] = TMUtilsMatching.pre_process(self.query, (self.src_lang, self.tgt_lang), 'tags', {})

      self.query_dic['query'] = self.query_dic['query_tags']  # query now has the uniform tags <T1>gato</T1>

    if 'regex' in self.pipe: self.query_dic['query_re'] = TMUtilsMatching.pre_process(self.query_dic['query'], self.src_lang, 'reg_exp', self.match['regex'].re_pp)
    else: self.query_dic['query_re'] = self.query_dic['query']
    self.query_dic['query_re_reduce'] = TMRegexMatch.simplified_name(self.query_dic['query_re'])

    return self.query_dic
Example #7
    def _match_tags(query, src_text, tgt_text):

        match = 0
        query_strip_tags = TMUtilsMatching.strip_tags(
            query)  #Strip tags from query
        src_text_strip_tags = TMUtilsMatching.strip_tags(
            src_text).strip()  # Strip tags from src
        tgt_text_strip_tags = TMUtilsMatching.strip_tags(
            tgt_text).strip()  # Strip tags from tgt

        if query_strip_tags == src_text_strip_tags:  # query and src_tm are equal
            match = 100
        return query_strip_tags, src_text_strip_tags, tgt_text_strip_tags, match
Example #8
 def style_string(self, src_text, tgt_text, status_tokenizer):
   #Check upper and lower case
   if src_text and tgt_text:
     src_text, tgt_text = self._transform_case(src_text, tgt_text)
     # Transfer XML tags (if needed)
     self.timer.start("transfer_tags")
     if re.search("</?[^<>]+/?>", self.query) is not None: # transfer tags only if query has and tgt and src don't
       status_tokenizer = True
       if (re.search("</?[^<>]+/?>", src_text) is None):
         src_text = TMUtilsMatching.transfer_tags(self.query, src_text, (self.src_lang, self.tgt_lang))
       if (re.search("</?[^<>]+/?>", tgt_text) is None):
         tgt_text = TMUtilsMatching.transfer_tags(self.query, tgt_text, (self.src_lang, self.tgt_lang))
     self.timer.stop("transfer_tags")
   return src_text, tgt_text, status_tokenizer
Example #9
    def __init__(self):  # language,#
        self.order = ['comma', 'conjunction', 'compose_sub',
                      'last']  # 'subordinate', --> Last = subordinate

        self.sw = TMUtilsMatching.check_stopwords(
            'EN'
        )  #TMTextProcessors.stop_words('english') #stopwords.words('english')
        self.ut = TMTextProcessors.univ_pos_tagger(
            'EN')  #TMUniversalPosTag('EN')
        self.rules = {
            #pattern ---> left ---> right
            'conjunction':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<CC><.*>*<V.*><.*>*'),  #, '<CC>','',''
            'last':
            RulesPattern('<.*>*<V.*><.*>*<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*',
                         '<WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            'compose_sub':
            RulesPattern(
                '<.*>*<V.*><.*>*<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                '<CC|\,|\;|\:|\-><WDT|WP|WRB|IN/that><.*>*<V.*><.*>*'),
            # --> wh_words               <V> <NP|PP>*
            'comma':
            RulesPattern(
                '<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                '<\,|\;|\:|\-><.*>*<V.*><.*>*'),  #, '<\,|\;|\:|\->', '', ''
        }
Example #10
  def __init__(self, list_query, list_marks, src_lang, tgt_lang, split_type, aut_trans, domain):
    self.src_lang = src_lang
    self.tgt_lang = tgt_lang
    self.aut_trans = aut_trans
    self.domain = domain

    self.tmdb_api = TMDbApi.TMDbApi.TMDbApi()

    self.split_type = split_type

    if self.split_type == 'sentence':
      self.list_query = list_query
    if self.split_type == 'phrase':

      if not TMUtilsMatching.empty_list(list_marks):  # Look up their target-language equivalents
        self.list_marks = self.tgt_list_marks(list_marks)  # [[], [], ('and', 'CC')]
      else:
        self.list_marks = list_marks
      '''
      [[('-', ':'), ('A', 'DT'), ('framework', 'NN'), ('for', 'IN'), ('the', 'DT'), ('measurement', 'NN'), ('of', 'IN'),
        ('greenhouse', 'NN'), ('gas', 'NN'), ('concentrations', 'NNS'), ('is', 'VBZ'), ('in', 'IN'), ('place', 'NN')],
       [('to', 'TO'), ('understand', 'VV'), ('their', 'PP$'), ('sources', 'NNS')],
       [('sinks', 'NNS'), ('requires', 'VVZ'), ('measuring', 'VVG'), ('transport', 'NN'), ('and', 'CC'), ('flux', 'NN'),
        ('in', 'IN'), ('both', 'CC'), ('the', 'DT'), ('horizontal', 'JJ'), ('and', 'CC'), ('vertical', 'JJ'),
        ('.', 'SENT')]]
      '''
      self.list_query = [' '.join([word for word, post in part]) for part in list_query]
      self.list_pos = [' '.join([pos for word, pos in part]) for part in list_query]
      logging.info("After Split Each parts: {} {}".format(self.list_query, self.list_pos))
Example #11
  def _improve_match(self, query_info, operation):
    query_word = query_info.split(' _ ')

    if operation == 'R': #Estimate editD between the words
      return (TMUtilsMatching._edit_distance(query_word[0], query_word[1]) / 2)
    else:
      return (len(query_word[0]) / 2) # For I/D the edit distance equals the word length (all characters added or deleted); penalize by half of it
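
For reference, a self-contained sketch of this penalty logic under the assumption that TMUtilsMatching._edit_distance is a plain character-level Levenshtein distance (the helper below is a stand-in, not the library's implementation):

def levenshtein(a, b):
    # Classic dynamic-programming character edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def improve_match_penalty(query_info, operation):
    # 'R' (replace): half the edit distance between the two words.
    # 'I'/'D' (insert/delete): half the length of the single word involved.
    query_word = query_info.split(' _ ')
    if operation == 'R':
        return levenshtein(query_word[0], query_word[1]) / 2
    return len(query_word[0]) / 2

# improve_match_penalty('house _ home', 'R')  # -> 1.0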
Example #12
    def __init__(self):
        self.order = ['comma', 'conjunction', 'compose_sub',
                      'last']  #subordinate

        self.sw = TMUtilsMatching.check_stopwords(
            'ES'
        )  #TMTextProcessors.stop_words('spanish')#stopwords.words('spanish')
        self.ut = TMTextProcessors.univ_pos_tagger(
            'ES')  #TMUniversalPosTag('ES')

        self.rules = {
            'conjunction':
            RulesPattern(
                '<.*>*<V.*><.*>*<CC|CCNEG|CCAD><.*>*<V.*><.*>*',  #, '<CC|CCNEG|CCAD>','',''
                '<.*>*<V.*><.*>*',
                '<CC|CCNEG|CCAD><.*>*<V.*><.*>*'
            ),  #'?!<V.*>', '<CC|CCNEG|CCAD><.*>*'
            'comma':
            RulesPattern(
                '<.*>*<V.*><.*>*<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*', '<CM|COLON|DASH|SEMICOLON><.*>*<V.*><.*>*'
            ),  #, '<CM|COLON|DASH|SEMICOLON>','',''
            'compose_sub':
            RulesPattern(
                '<.*>*<V.*><.*>*<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',
                '<CC|CCNEG|CCAD|CM|COLON|DASH|SEMICOLON|PREP|PDEL><CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*'
            ),  # --> subordinate
            'last':
            RulesPattern(
                '<.*>*<V.*><.*>*<CQUE|CSUBF|CSUBI|CSUBX><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*', '<CQUE|CSUBF|CSUBI|CSUBX|CQ><.*>*<V.*><.*>*'
            ),  #--> subordinate
        }
Example #13
 def _deals_output(self, segment, editD, trans_segments, status_tokenizer, status):
   if self.out == 'moses':  # Moses output is tokenized
     if status_tokenizer == False:  # tokenize the output
       segment.source_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})
       segment.target_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})
     trans_segments.append((segment, editD))
     return trans_segments, 'break'
   else:
     if status_tokenizer == True:  # TM output is untokenized
       segment.target_text = TMUtilsMatching.pre_process(segment.target_text.split(' '), self.tgt_lang, 'untokenizer', {})
       segment.source_text = TMUtilsMatching.pre_process(segment.source_text.split(' '), self.src_lang, 'untokenizer', {})
     trans_segments.append((segment, editD))
     if status == 'translate': status = 'break'
     else: status = 'continue'
     #if editD == 100: # Add this if to obtain better matching time
     #  status = 'break'
   logging.info("Final Output (Query -- Source -- Target): {} {} {}".format(safe_str(self.query_dic['query'] + ' -- '), safe_str(segment.source_text + ' -- '), safe_str(segment.target_text)))
   return trans_segments, status
Example #14
    def _prepare_target_text(self, query, segment, translation, source_lang,
                             target_lang):
        segment.source_text = query
        segment.domain = []
        segment.file_name = []

        if re.search("</?[^<>]+/?>",
                     query) is not None:  # If there are tags on query
            tgt_tags = TMUtilsMatching.transfer_tags(
                segment.source_text, translation, (source_lang, target_lang))
            segment.target_text = TMUtilsMatching.pre_process(
                tgt_tags.split(' '), target_lang, 'untokenizer', {})
        else:
            segment.target_text = translation.strip('\n')
        logging.info("Translate less minumum_match : {} {}".format(
            segment.source_text + ' -- ', translation))

        return segment
Example #15
    def clause_chunk(self, text):
        dicSegment_Rules = {}
        # Check whether the tags need to be converted to universal POS tags
        if self.class_lang == 'generic_geral' and self.lang.upper(
        ) not in TMUtilsMatching.pre_process(' ', self.lang.upper(),
                                             'get_lang_universalPOS', {}):
            text_Universal = TMUtilsMatching.pre_process(
                [[[word, pos] for word, pos in text]], self.lang,
                'universal_pos_tagger', {})

            if not text_Universal:  # If there was a problem with the universal POS tagger
                return [Tree('S', text)]

            text = [(word, pos) for word, pos in text_Universal]

        #Run each rule
        for r in self.order:
            if r == 'initial':
                lSentences = [
                    text
                ]  # --> Initial list of segments to be processed
            else:
                chunkO = RegexpChunkParser(
                    self.rules[r].get_rule_obj(),
                    chunk_label='splitPoint',
                    root_label='S')  # Create chunk Object

                # Chunk the segments --> apply each rule recursively
                lChunk_Segments = lSentences

                len_actual = 0  # --> Tracks the number of splits (iterate until it stops changing)
                len_previous = len(lSentences)

                while len_actual != len_previous:
                    len_previous = len(lChunk_Segments)
                    lChunk_Segments = self._recursive_rule(
                        lChunk_Segments, chunkO)
                    len_actual = len(lChunk_Segments)

                dicSegment_Rules[r] = lChunk_Segments
                lSentences = lChunk_Segments  # --> Carry over all chunks obtained by this rule
        self.timer.print()
        return dicSegment_Rules['last']
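
The while loop above is a fixpoint iteration: each rule is re-applied until it no longer changes the number of segments. A self-contained sketch of the same control flow, where split_once is a hypothetical stand-in for the RegexpChunkParser step used by _recursive_rule:

def apply_until_stable(segments, split_once):
    # Re-apply the splitting step until the segment count stops changing.
    previous = -1
    while len(segments) != previous:
        previous = len(segments)
        segments = split_once(segments)
    return segments

# Toy usage: split comma-separated pieces one level per pass.
# apply_until_stable(['a, b, c'],
#                    lambda segs: [p for s in segs for p in s.split(', ', 1)])
# -> ['a', 'b', 'c']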
Example #16
    def process(self, query_dic, src_text, tgt_text):  #, src_pos, tgt_pos

        # pre-process and apply regex to tm_src
        src_re = self.re_pp[self.src_lang].process(src_text)

        if src_text != src_re:  # A regex was applied to src

            tgt_re = self.re_pp[self.tgt_lang].process(tgt_text)

            # if query_dic['query_re'] == query_dic['tokenizer']: # Was not applied regular expression on query
            #   if src_re is not None and src_pos is not None:
            #     src_text, src_pos = TMRegexMatch._delete_elements(src_re.split(' '), src_pos.split(' '))
            #   if tgt_re is not None and tgt_pos is not None:
            #     tgt_text, tgt_pos = TMRegexMatch._delete_elements(tgt_re.split(' '), tgt_pos.split(' '))

            #ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)
            #else: #Transform target into query

            ter = TMUtilsMatching._ter_score(query_dic['query_re'], src_re)
            #Extract patterns (find and replace) value
            src_query_f, src_query_r = TMRegexMatch._extract_find_replace(
                query_dic['tokenizer'].split(' '),
                query_dic['query_re'].split(' '))
            tgt_query_f = src_query_f.copy()
            tgt_query_r = src_query_r.copy()
            src_f, src_r = TMRegexMatch._extract_find_replace(
                src_text.split(' '), src_re.split(' '))
            ter = ter - len(src_f)
            src_text = TMRegexMatch._replace_values(src_query_f, src_query_r,
                                                    src_re.split(' '), src_f,
                                                    src_r)

            tgt_f, tgt_r = TMRegexMatch._extract_find_replace(
                tgt_text.split(' '), tgt_re.split(' '))
            tgt_text = TMRegexMatch._replace_values(tgt_query_f, tgt_query_r,
                                                    tgt_re.split(' '), tgt_f,
                                                    tgt_r)

        else:
            ter = TMUtilsMatching._ter_score(query_dic['tokenizer'],
                                             src_text)  # Regex was not applied
        return tgt_text, src_text, ter  #, src_pos, tgt_pos
Example #17
 def _match_rank_concordance(self, best_segments):  # , output
     self.timer.start("ter")
     l_ter_score = [
         TMUtilsMatching._ter_score(self.query, segment[0].source_text)
         for segment in best_segments
     ]
     self.timer.stop("ter")
     l_best_sort = sorted(zip(best_segments, l_ter_score),
                          key=operator.itemgetter(1),
                          reverse=True)
     return [(segment[0][0], segment[1]) for segment in l_best_sort]
Example #18
 def _match_tags(self, src_text, src_re_reduce, tgt_text, status, ini_editD):
   reduce = False
   out_tags_query, src_text, tgt_text, match = TMTags._match_tags(self.query_dic['query'], src_text, tgt_text)
   if match == 100:
     status = 'find'
   else:
     match = self._tm_edit_distance(out_tags_query, src_text, TMUtilsMatching.strip_tags(self.query_dic['query_re_reduce']).strip(), TMUtilsMatching.strip_tags(src_re_reduce).strip())
     if self.query_dic['query'] != out_tags_query:
       self.query_dic['query'] = out_tags_query
       reduce = True
     if match >= ini_editD:
       ini_editD = match
   return src_text, tgt_text, status, reduce, ini_editD
Example #19
  def _only_word_sequence(text, lang): # Receive original sequence
    only_word = []
    only_st = []
    l_src_st = TMUtilsMatching.check_stopwords(lang)
    for match in re.finditer(r'[a-zA-Z0-9\u4e00-\u9fff\u3040-\u309Fー\u30A0-\u30FF]+', text):  # Get all the words and numbers

      if l_src_st:  # Some languages have no stopword list
        if match.group() in l_src_st:
          only_st.append(match.group())
        else:
          only_st.append('P')
          only_word.append(match.group())

    return only_word, only_st
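
A self-contained sketch of the same extraction, with a small hard-coded stopword set standing in for TMUtilsMatching.check_stopwords (the set below is an assumption for illustration, not the real list):

import re

TOY_STOPWORDS = frozenset({'the', 'a', 'of', 'in'})

def split_words_and_stopwords(text, stopwords=TOY_STOPWORDS):
    only_word, only_st = [], []
    for match in re.finditer(r'[a-zA-Z0-9]+', text):  # words and numbers only
        token = match.group()
        if token.lower() in stopwords:
            only_st.append(token)   # keep the stop word itself
        else:
            only_st.append('P')     # placeholder marking a content word
            only_word.append(token)
    return only_word, only_st

# split_words_and_stopwords('The price of 3 units')
# -> (['price', '3', 'units'], ['The', 'P', 'of', 'P', 'P'])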
Example #20
  def _validate_pipe(self, pipe):
    match_process = {
      'regex': None,
      'posTag': None,
      'tags': TMTags()
    }

    try:
      match_process['regex'] = TMRegexMatch(self.src_lang, self.tgt_lang)
      logging.info("Loading regex for matching")
    except ValueError:
      if 'regex' in pipe:
        pipe.pop(pipe.index('regex'))
        logging.info("Unsupported regex for matching")

    query_out_tags = XmlUtils.replace_tags(self.query)

    try:
      if 'tokenizer' not in self.query_dic:
        self.query_dic['tokenizer'] = TMUtilsMatching.pre_process(query_out_tags, self.src_lang, 'tokenizer', {})
      logging.info("Loading Tokenizer for {}".format(self.src_lang))

      try:
        if 'pos' not in self.query_dic:
          self.query_dic['pos'] = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'pos_tagger', {})
        match_process['posTag'] = TMPosMatch(self.src_lang, self.tgt_lang)
        logging.info("Loading regex for matching")
      except Exception as e:
        if 'posTag' in pipe:
          pipe.pop(pipe.index('posTag'))
          logging.info("Unsupported posTag for matching")
    except Exception as e:
      if 'posTag' in pipe:
        pipe.pop(pipe.index('posTag'))
        logging.info("Unsupported Tokenizer for {}".format(self.src_lang))

    return match_process, pipe
Example #21
  def _match_rank(self, best_segments):
    self.timer.start("rank segments")
    editD_score = []
    if 'query_tags' in self.query_dic: # Simplified tags
      query = TMUtilsMatching.reduce_tags(self.query_dic['query_tags']) # Yo tengo un <T1>gato</T1>. Yo tengo un T gato T.
    else:
      query = self.query_dic['query']

    for i in range(0, len(best_segments)):
      segment = best_segments[i]
      # Simplified tags in tm source
      if re.search("</?T[0-9]*/?>", segment[0].source_text):
        src_text = TMUtilsMatching.reduce_tags(segment[0].source_text) # Simplified tags in tm source and target
      else: src_text = segment[0].source_text
      # Applied Regex and simplified
      if 'regex' in self.pipe: src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
      else: src_re = src_text


      src_re_reduce = TMRegexMatch.simplified_name(src_re)
      best_segments[i] = (segment[0], segment[1], src_re, src_re_reduce)
      editD_score.append(self._tm_edit_distance(query, src_text, self.query_dic['query_re_reduce'], src_re_reduce))  # EditD with simplified tags; TMUtilsMatching._edit_distance(query, src_text)
    self.timer.stop("rank segments")
    return sorted(zip(best_segments, editD_score), key=operator.itemgetter(1), reverse=True)
Example #22
 def _match_rank(self, best_segments, threshold):  #, output
     segments = []
     self.timer.start("ter")
     l_ter_score = [
         TMUtilsMatching._edit_distance(self.query, segment[0].source_text)
         for segment in best_segments
     ]
     self.timer.stop("ter")
     l_best_sort = sorted(zip(best_segments, l_ter_score),
                          key=operator.itemgetter(1),
                          reverse=True)
     for segment, ter in l_best_sort:  # TM output --> only keep segments scoring within 10 points of the threshold
         if ter >= threshold - 10:
             segments.append((segment[0]))
         else:
             break
     return segments
Example #23
    def __init__(self):  # language,#
        self.order = ['comma', 'last']  # 'conjunction',

        self.sw = TMUtilsMatching.check_stopwords(
            'EN'
        )  #TMTextProcessors.stop_words('english')  # stopwords.words('english')
        self.ut = TMTextProcessors.univ_pos_tagger(
            'EN')  # TMUniversalPosTag('EN')
        self.rules = {
            # pattern ---> left ---> right
            'comma':
            RulesPattern('<.*>*<V.*><.*>*<CC><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', '<CC><.*>*<V.*><.*>*'),
            'last':
            RulesPattern('<.*>*<V.*><.*>*<\,|\;|\:|\-><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', '<\,|\;|\:|\-><.*>*<V.*><.*>*'),
        }
Example #24
    def __init__(self):
        self.order = ['comma', 'last']  # last = conjuntion

        self.sw = TMUtilsMatching.check_stopwords('FR')  # stop words
        self.ut = TMTextProcessors.univ_pos_tagger(
            'FR')  # TMUniversalPosTag('FR')

        self.rules = {
            'last':
            RulesPattern('<.*>*<V.*><.*>*<KON><.*>*<V.*><.*>*',
                         '<.*>*<V.*><.*>*', '<KON><.*>*<V.*><.*>*'),
            'comma':
            RulesPattern(
                '<.*>*<V.*><.*>*<PUN|SENT><.*>*<V.*><.*>*',
                '<.*>*<V.*><.*>*',  # Punctuation marks
                '<PUN|SENT><.*>*<V.*><.*>*')
        }
Example #25
    def __init__(self, lang):
        self.order = ['comma', 'last']  # subordinate

        self.sw = TMUtilsMatching.check_stopwords(
            lang)  # stopwords.words('spanish')
        self.ut = TMTextProcessors.univ_pos_tagger(
            lang)  # TMUniversalPosTag('ES')

        self.rules = {
            'last':
            RulesPattern(
                '<.*>*<VERB><.*>*<CONJ|SCONJ><.*>*<VERB><.*>*',  # conjunctions
                '<.*>*<VERB><.*>*',
                '<CONJ><.*>*<VERB><.*>*'),
            'comma':
            RulesPattern('<.*>*<VERB><.*>*<\.><.*>*<VERB><.*>*',
                         '<.*>*<VERB><.*>*', '<\.><.*>*<VERB><.*>*'),  # comma
        }
Example #26
  def _match(self):

    # Create a dictionary with the query info (posTag and universal)

    if self.split_type == 'sentence':
      list_info_query = [{'tokenizer': self.list_query[j]} for j in range(0, len(self.list_query))]
    else:
      list_info_query = [{'tokenizer': self.list_query[j], 'pos': self.list_pos [j]} for j in range(0, len(self.list_query))]

    # Query Elasticsearch --> out=moses to return only one segment
    l_best_segments = self.tmdb_api.query([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in self.list_query], list_info_query, (self.src_lang, self.tgt_lang), pipe=['regex', 'tags', 'posTag'], out='moses', limit=5, domains=None, min_match=80, concordance=False, aut_trans=False, exact_length=False)

    join_source = ''
    join_target = ''
    total_match = 0
    for i in range(0, len(l_best_segments)):
      if l_best_segments[i]:
        segment, match = l_best_segments[i][0]
        join_source = join_source + ' ' + segment.source_text
        join_target = join_target + ' ' + segment.target_text
      else:
        join_source = join_source + ' ' + self.list_query[i]
        join_target = join_target + ' ' + self.list_query[i]
        match = 0
      total_match = total_match + match

      if self.split_type == 'phrase':
        if self.list_marks:
          if self.list_marks[0]:
            mark = self.list_marks.pop(0)[0]
            join_source = join_source + ' ' + mark
            join_target = join_target + ' ' + mark

    total_match = total_match/len(self.list_query)
    #print(join_source + ' ---- ' + join_target + ' ---- ' + str(total_match))
    return join_source, join_target, int(math.floor(total_match))
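
A simplified, self-contained sketch of the joining logic above: per-part matches are concatenated, unmatched parts fall back to the query text with a score of 0, and the final score is the floor of the average. The name join_parts and the (src, tgt, score) tuple shape are illustrative assumptions, not the original API:

import math

def join_parts(parts, matches):
    # parts: list of query strings; matches: parallel list of (src, tgt, score) or None.
    join_source, join_target, total = '', '', 0
    for part, hit in zip(parts, matches):
        if hit:
            src, tgt, score = hit
        else:
            src, tgt, score = part, part, 0  # no match: keep the untranslated part
        join_source += ' ' + src
        join_target += ' ' + tgt
        total += score
    return join_source.strip(), join_target.strip(), int(math.floor(total / len(parts)))

# join_parts(['good morning', 'world'], [('good morning', 'buenos dias', 100), None])
# -> ('good morning world', 'buenos dias world', 50)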
Example #27
    def execute(self, threshold, l_best_segments, match_process,
                align_features, concordance):  #, output
        self.timer.start("preprocess")
        query_dic = self._preprocess(
            self.query,
            self.src_lang)  # Tokenize, posTag and universal query string
        self.timer.stop("preprocess")

        if concordance:
            return self._match_rank_concordance(l_best_segments)
        else:
            rank_segments = self._match_rank(l_best_segments, threshold)
            trans_segments = []
            # Check if the retrieve segments are 100% match or apply transformations
            for segment in rank_segments:
                #segment = segment[0]
                if segment.source_text == self.query:  # 100% match --> Return match considering domain
                    ter = 100
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                    #trans_segments.append((segment,ter))
                else:
                    #Pre-process source and target
                    tgt_text = TMUtilsMatching.pre_process(
                        segment.target_text, self.tgt_lang, 'tokenizer',
                        {})  # Pre-process tgt
                    src_text = TMUtilsMatching.pre_process(
                        segment.source_text, self.src_lang, 'tokenizer',
                        {})  # Tokenize tm_src
                    if 'regex' in match_process:
                        if (query_dic['tokenizer'] == query_dic['query_re']):
                            ter = TMUtilsMatching._ter_score(
                                query_dic['tokenizer'],
                                src_text)  # Regex was not applied to the query
                        else:
                            self.timer.start("_regx_match")
                            tgt_text, src_text, ter = self._regx_match(
                                query_dic, src_text, tgt_text
                            )  #, segment.source_pos, segment.target_pos
                            self.timer.stop("_regx_match")
                            logging.info(
                                "Applied Regex Segment: {} {} {}".format(
                                    tgt_text, src_text, str(ter)))
                    else:
                        ter = TMUtilsMatching._ter_score(
                            query_dic['tokenizer'],
                            src_text)  # Regex was not passed as a parameter
                    if ter < threshold:
                        logging.info("TER less threshold: {} ".format(
                            str(ter)))
                        continue
                    if 'posTag' in match_process and ter != 100:  #Check segments with only one difference
                        if segment.source_pos is not None and segment.target_pos is not None:  #This part need the pos tagger annotation
                            self.timer.start("fuzzy_match")
                            #target_word (to D, R, or I), target_position, operation(R I or D),src_un_match(some time have source or query information)
                            tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(
                                query_dic, tgt_text, src_text,
                                segment.source_pos, segment.target_pos,
                                align_features)

                            logging.info("Un_match: {} {} ".format(
                                tgt_word, operation))

                            if src_un_match is not None:
                                src_text = self._create_target_expression(
                                    src_text, src_position, operation,
                                    src_un_match, 'source')  #src_un_match,
                                # src_text = src_text.split(' ')
                                # if operation == 'R':
                                #   src_text[int(src_position.split(' _ ')[1])] = tgt_word
                                # if operation == 'I':
                                #   new_src_text = src_text[:int(src_position)] + [src_un_match] + src_text[int(src_position):]
                                #   #new_src_text.append(src_un_match)
                                #   #new_src_text = new_src_text + src_text[int(src_position):]
                                #   src_text = new_src_text
                                # if operation == 'D':
                                #   src_text.pop(int(src_position))
                                # src_text = ' '.join(src_text)
                            if tgt_word is not None:
                                tgt_text = self._create_target_expression(
                                    tgt_text, tgt_position, operation,
                                    src_un_match, 'target')  #tgt_word,

                                self.timer.stop("fuzzy_match")
                    segment.source_text = TMUtilsMatching.pre_process(
                        src_text.split(' '), self.src_lang, 'untokenizer', {})
                    segment.target_text = TMUtilsMatching.pre_process(
                        tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
                    logging.info("Target segment: {}".format(
                        segment.target_text))
                    if self.query.isupper():
                        segment.source_text = segment.source_text.upper()
                        segment.target_text = segment.target_text.upper()
                    if self.query.islower():
                        segment.source_text = segment.source_text.lower()
                        segment.target_text = segment.target_text.lower()
                trans_segments.append((segment, ter))
            return trans_segments
Example #28
  def _tm_edit_distance(self, q_text, s_text, q_simplified, s_simplified):
    # Corner case - matching artificial empty segment -> giving minimal score
    if q_text and not s_text.strip():
      return 1
    #Always reduce the tags to count only one element
    '''
    print('**original**')
    print(q_text)
    print('**src**')
    print(s_text)
    print('**originalS**')
    print(q_simplified)
    print('**srcS**')
    print(s_simplified)
    '''
    # 1) ********** Obtain words and stop words sequences
    q_onlyW, q_st_word = TMMatching._only_word_sequence(q_text, self.src_lang)
    s_onlyW, s_st_word = TMMatching._only_word_sequence(s_text, self.src_lang)
    '''
    print(q_onlyW)
    print(s_onlyW)
    print(q_st_word)
    print(s_st_word)
    '''
    if not q_onlyW and not q_st_word:
    #print(self.src_lang)
    #if self.src_lang=='zh':
      editD = 100 - (TMUtilsMatching._edit_distance(q_text, s_text)) #* 100
    else:
      # Plain edit distance over words only, without punctuation marks or stop words
      nchar_diff = TMUtilsMatching._edit_distance(' '.join(q_onlyW), ' '.join(s_onlyW))  # Consider all the words, without any substitution
      #print(q_onlyW)
      #print(s_onlyW)
      nchar_len = len(' '.join(q_onlyW)) + len(' '.join(s_onlyW))
      if nchar_len == 0: nchar_len = 1
      #print(nchar_len)
      char_diff = (2*nchar_diff)/(nchar_len)  # normalized by the total number of characters

      # 2) ********* Simplified --> Convert words to a letter and keep only punctuation marks
      q_replaceW, q_onlyS = TMMatching._symbol_sequence(q_simplified)  # Original query

      # Ex. '- 3.67 housing units constructed under the $ #  home % ownership saving scheme in the Hanano/ and (Hamdaniya districts;' --> - N N N N N N $ #  N % N N N N N N/ N (N N;
      s_replaceW, s_onlyS = TMMatching._symbol_sequence(s_simplified) # Original tm_src

      if (len(s_onlyS) == 0 and len(q_onlyS) == 0):  # There are no symbols
        n_symbol_diff = 0
      else:
        n_symbol_diff = TMUtilsMatching._edit_distance(q_replaceW, s_replaceW) #(' '.join(q_onlyS), ' '.join(s_onlyS))/2#

      len_symbols = len(q_replaceW.split(' ')) + len(s_replaceW.split(' '))  # len(q_onlyS) + len(s_onlyS)
      if len_symbols == 0: len_symbols = 1

      symbol_diff = (2*n_symbol_diff)/len_symbols


      # 3) ********* Whether the query words exist in the source
      nword_diff = set(q_onlyW).difference(s_onlyW) # Replace regular expression by only one word
      onlyW_len = len(q_onlyW)
      if onlyW_len == 0: onlyW_len = 1
      word_diff = (len(nword_diff))/onlyW_len # only query words

      # 4) ********* Stop words
      stop_words = True
      if (len(q_st_word) == 0 and len(s_st_word) == 0):  # There are no stop words, or this language has no stopword list
        stop_words = False

      if stop_words:
        n_st_diff = TMUtilsMatching._edit_distance(' '.join(q_st_word), ' '.join(s_st_word))
        len_stop_word = len(' '.join(q_st_word)) + len(' '.join(s_st_word))
        stop_word_diff = (2 * n_st_diff)/len_stop_word

        editD = (1 - ((0.70 * (char_diff)) + (0.10 * (word_diff)) + (0.10 * (symbol_diff)) + (0.10 * (stop_word_diff)))) * 100
      else:
        editD = (1 - ((0.70 * (char_diff)) + (0.15 * (word_diff)) + (0.15 * (symbol_diff)))) * 100

    if editD < 0:
      editD = 10
    return int(math.floor(editD))
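
The final score is a weighted mix of the four normalized differences. A worked sketch of just that combination step (the inputs below are made-up feature values; only the weights and the 0-100 scaling mirror the method above):

import math

def combine_edit_features(char_diff, word_diff, symbol_diff, stop_word_diff=None):
    # Weighted mix of normalized differences, scaled to a 0-100 similarity score.
    if stop_word_diff is not None:
        editD = (1 - (0.70 * char_diff + 0.10 * word_diff
                      + 0.10 * symbol_diff + 0.10 * stop_word_diff)) * 100
    else:  # no stop-word information: redistribute its weight
        editD = (1 - (0.70 * char_diff + 0.15 * word_diff + 0.15 * symbol_diff)) * 100
    if editD < 0:
        editD = 10  # same floor for degenerate cases as in the method above
    return int(math.floor(editD))

# combine_edit_features(0.0, 0.0, 0.0)       # identical inputs -> 100
# combine_edit_features(1.5, 1.0, 1.0, 1.0)  # negative raw score is floored -> 10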
Example #29
        # Tokenizer
        tok = TMTokenizer(lang.upper()).tokenizer
        pos = TMPosTagger(lang.upper())

        for eline in file.readlines():
            tok_sentences = tok.process(eline)
            print(tok_sentences)
            pos_sentence = [
                element
                for word, element in pos.tag_segments([tok_sentences])[0]
            ]

            # Split several steps

            list_sentences = TMUtilsMatching.pre_process(
                tok_sentences, args.source, 'split_sentences', {})
            #print('+++++++++++++++++')
            #print(list_sentences)

            list_word_pos = []
            if list_sentences:
                i = 0
                for each_sent in list_sentences:
                    # Create word_pos
                    len_e = len(each_sent.split())
                    list_word_pos.append([
                        (w, p)
                        for w, p in zip(each_sent.split(), pos_sentence[i:i +
                                                                        len_e])
                    ])
                    i = i + len_e
Example #30
  def execute_segment(self, segment, src_re, src_re_reduce, ini_editD, align_features, equal):
    logging.info("Applied match PIPE")
    tgt_text = segment.target_text
    src_text = segment.source_text
    status = ''

    editD = ini_editD
    status_tokenizer = False
    if equal:
      if self.query == src_text:
        return segment, editD, 'find', equal, status_tokenizer
      else:
        equal = False
    if not equal:
      for op in self.pipe: #Indicate by parameters
        if op == 'regex':
          if self.query_dic['query'] != self.query_dic['query_re']: # If query has regex   #and not TMMatching.check_upper_equal(self.query_dic['query'], self.query_dic['query_re'])
              logging.info("Applied Regex")
              self.timer.start("_regx_match")
              # ************************** Compare query_re with src_re --> simplified
              match = ini_editD
              if src_re != src_text:
                if src_re_reduce.lower() == self.query_dic['query_re_reduce'].lower():  # With simplified regular expression and in lowercase
                  match = 100  # Perfect match
                tgt_text, src_text = self._regex_transform(segment.source_text, segment.target_text)
                ini_editD = self._tm_edit_distance(self.query_dic['query'],src_text, self.query_dic['query_re_reduce'], src_re_reduce) #match
                logging.info("After applied Regex Segment: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
              if match == 100:
                status = 'find'
              self.timer.stop("_regx_match")
        if op == 'tags':
          logging.info("Delete Tags")
          self.timer.start("_tags_match")
          src_text, tgt_text, status, reduce, ini_editD = self._match_tags(src_text, src_re_reduce, tgt_text, status, ini_editD)
          logging.info("After applied Tags: {} {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("_tags_match")

        if op == 'posTag':
          self.timer.start("fuzzy_match")
          upper = False
          if segment.source_pos is not None and segment.target_pos is not None:  # This part need the pos tagger annotation
            squery, tok_query, pos_query = self.check_query_parameters()
            logging.info("Apply posTag matching")
            self.timer.start("fuzzy_preprocess")
            if status_tokenizer == False:  # Tokenize source and target
              tgt_text = TMUtilsMatching.pre_process(tgt_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tgt
              src_text = TMUtilsMatching.pre_process(src_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
              self.query_dic['query_re_reduce_tok'] = TMUtilsMatching.pre_process(self.query_dic['query_re_reduce'], self.src_lang, 'tokenizer', {})  # Tokenize the simplified query
              status_tokenizer = True

            if 'universal' not in self.query_dic:
              self.query_dic['universal'] = TMUtilsMatching.segment_2_universal(tok_query.lower(), pos_query, self.src_lang)
            #print(self.query_dic['universal'])
            src_word_pos = TMUtilsMatching.segment_2_universal(src_text.lower(), segment.source_pos, self.src_lang)  # [word, pos] tm_src segment
            tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), segment.target_pos, self.tgt_lang)  # [word, pos] tm_tgt segment

            self.timer.stop("fuzzy_preprocess")
            if isinstance(self.query_dic['universal'], list) and isinstance(src_word_pos, list) and isinstance(tgt_word_pos, list):

              logging.info("Check unmatch word --> PosTag")
              if TMUtilsMatching.len_compare(pos_query.split(' '), segment.source_pos.split(' ')) is True and (tok_query != src_text):
                logging.info("Query and source have same length or only one difference")

                self.timer.start("search unmatch")
                tgt_un_match, tgt_position, operation, src_un_match, src_position, pos_tag = self._combine_feature_match(tok_query, src_word_pos, tgt_word_pos, align_features)
                self.timer.stop("search unmatch")
                logging.info("Unmatch word and operation: {} {}".format(safe_str(src_un_match), safe_str(operation), safe_str(ini_editD)))
                self.timer.start("create target unmatch")

                if src_un_match is not None:
                  # Create new src
                  src_text, upper = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source', upper, pos_tag)
                  # Improve edit distance
                  src_re = TMUtilsMatching.pre_process(src_text, self.src_lang, 'reg_exp', self.match['regex'].re_pp)
                  src_re_reduce = TMRegexMatch.simplified_name(src_re)
                  penalize_match = self._improve_match(src_un_match, operation)
                  ini_editD = self._tm_edit_distance(tok_query.lower(), src_text.lower(), self.query_dic['query_re_reduce_tok'].lower(), src_re_reduce.lower()) - penalize_match  # match
                  # Create new tgt
                if tgt_un_match is not None:
                  tgt_text, upper = self._create_target_expression(tgt_text, tgt_position, operation, tgt_un_match, 'target', upper, pos_tag)  # tgt_word,
                self.timer.stop("create target unmatch")
                logging.info("After applied posTag: {} {}".format(safe_str(src_text+ ' -- '), safe_str(tgt_text+ ' -- '), safe_str(ini_editD)))
          self.timer.stop("fuzzy_match")

        # Check if find or break some transformation
        if ini_editD > editD:
          editD = ini_editD
        if status == 'find' or status == 'break':
          segment.source_text = src_text
          segment.target_text = tgt_text
          return segment, editD, status, equal, status_tokenizer
      if editD >= self.min_match:
        segment.source_text = src_text
        segment.target_text = tgt_text
        status = 'find'
      else:
        #Call split rules
        if 'split' in self.pipe and not self.trans_segments:  # Apply split only if a POS tagger exists for the source language  and self.query_dic['pos']

          src_text = None
          tgt_text = None
          editSplit = 0

          # Split by sentences.
          list_sentences = TMUtilsMatching.pre_process(self.query_dic['tokenizer'], self.src_lang, 'split_sentences', {})
          logging.info("split by Sentences : {} ".format(list_sentences))

          # Check sentence first
          if len(list_sentences) > 1:

            split_match = TMSplitMatch([TMUtilsMatching.pre_process(q.split(' '), self.src_lang, 'untokenizer', {}) for q in list_sentences], [], self.src_lang, self.tgt_lang, 'sentence', self.machine_translation, self.domain)
            src_text, tgt_text, editSplit = split_match._match()
            #print('*****Only sentences *****')
            #print(src_text)
            #print(tgt_text)
            #print(editSplit)

          if editSplit >= self.min_match:  # Check whether the split method returned segments from ActivaTM
            segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

          else:  # Split into smaller phrases
            # Check whether split rules exist for this specific language pair
            lang_class = G_CONFIG.get_split_rules(self.src_lang, self.tgt_lang)

            if lang_class:
              logging.info("Split Query by Phrase")
              all_split, all_marks = self._splitByPhrase(lang_class, list_sentences)

              # Check if any split rule was applied
              if len(all_split) > 1:
                  # print(list_query_split)
                split_match = TMSplitMatch(all_split, all_marks, self.src_lang, self.tgt_lang, 'phrase', self.machine_translation, self.domain)
                src_text, tgt_text, editSplit = split_match._match()

                if editSplit >= self.min_match: #Check if split method return segments from ActivaTM
                  segment.source_text, segment.target_text, editD = src_text, tgt_text, editSplit

        if editD >= self.min_match:
          status = 'find'
          status_tokenizer = True
        else:
          if not self.trans_segments:  # If no match was found, prepare the segment for automatic translation; if there is no automatic translation, return []
            #logging.info("Prepare Automatic Translation : ")
            self.trans_segments.append((segment, editD))
          status = 'break'  # If a segment is already on the list, break the loop; there is no translation
    return segment, editD, status, equal, status_tokenizer