Пример #1
0
  def _improve_match(self, query_info, operation):
    query_word = query_info.split(' _ ')

    if operation == 'R': #Estimate editD between the words
      return (TMUtilsMatching._edit_distance(query_word[0], query_word[1]) / 2)
    else:
      return (len(query_word[0]) / 2) # EditD is equal ao total de characters add or delete from the string
Пример #2
0
 def _match_rank(self, best_segments, threshold):  #, output
     segments = []
     self.timer.start("ter")
     l_ter_score = [
         TMUtilsMatching._edit_distance(self.query, segment[0].source_text)
         for segment in best_segments
     ]
     self.timer.stop("ter")
     l_best_sort = sorted(zip(best_segments, l_ter_score),
                          key=operator.itemgetter(1),
                          reverse=True)
     for segment, ter in l_best_sort:  # TM output --> only show segments with ter > threshold
         if ter >= threshold - 10:
             segments.append((segment[0]))
         else:
             break
     return segments
Пример #3
0
  def _tm_edit_distance(self, q_text, s_text, q_simplified, s_simplified):
    # Corner case - matching artificial empty segment -> giving minimal score
    if q_text and not s_text.strip():
      return 1
    #Always reduce the tags to count only one element
    '''
    print('**original**')
    print(q_text)
    print('**src**')
    print(s_text)
    print('**originalS**')
    print(q_simplified)
    print('**srcS**')
    print(s_simplified)
    '''
    # 1) ********** Obtain words and stop words sequences
    q_onlyW, q_st_word = TMMatching._only_word_sequence(q_text, self.src_lang)
    s_onlyW, s_st_word = TMMatching._only_word_sequence(s_text, self.src_lang)
    '''
    print(q_onlyW)
    print(s_onlyW)
    print(q_st_word)
    print(s_st_word)
    '''
    if not q_onlyW and not q_st_word:
    #print(self.src_lang)
    #if self.src_lang=='zh':
      editD = 100 - (TMUtilsMatching._edit_distance(q_text, s_text)) #* 100
    else:
      # Normal editDistance, without puntuation marks and only word, without stop words
      nchar_diff = TMUtilsMatching._edit_distance(' '.join(q_onlyW), ' '.join(s_onlyW))  # Consider all the words, without any substitution
      #print(q_onlyW)
      #print(s_onlyW)
      nchar_len = len(' '.join(q_onlyW)) + len(' '.join(s_onlyW))
      if nchar_len == 0: nchar_len = 1
      #print(nchar_len)
      char_diff = (2*nchar_diff)/(nchar_len)  # total of charaters

      # 2) ********* Simplified --> Convert to letter and keep only puntuation marks
      q_replaceW, q_onlyS = TMMatching._symbol_sequence(q_simplified)  # Original query

      # Ex. '- 3.67 housing units constructed under the $ #  home % ownership saving scheme in the Hanano/ and (Hamdaniya districts;' --> - N N N N N N $ #  N % N N N N N N/ N (N N;
      s_replaceW, s_onlyS = TMMatching._symbol_sequence(s_simplified) # Original tm_src

      if (len(s_onlyS) == 0 and len(q_onlyS) == 0): # There are not symbol
        n_symbol_diff = 0
      else:
        n_symbol_diff = TMUtilsMatching._edit_distance(q_replaceW, s_replaceW) #(' '.join(q_onlyS), ' '.join(s_onlyS))/2#

      len_symbols = len(q_replaceW.split(' ')) + len(q_replaceW.split(' '))  # len(q_onlyS) + len(s_onlyS)
      if len_symbols == 0: len_symbols = 1

      symbol_diff = (2*n_symbol_diff)/len_symbols


      # 3) ********* Exist or not exist the query words on source
      nword_diff = set(q_onlyW).difference(s_onlyW) # Replace regular expression by only one word
      onlyW_len = len(q_onlyW)
      if onlyW_len == 0: onlyW_len = 1
      word_diff = (len(nword_diff))/onlyW_len # only query words

      # 4) ********* Stop words
      stop_words = True
      if (len(q_st_word) == 0 and len(s_st_word) == 0):  # There are not stop word or this language doesn't have stop words list
        stop_words = False

      if stop_words:
        n_st_diff = TMUtilsMatching._edit_distance(' '.join(q_st_word), ' '.join(s_st_word))
        len_stop_word = len(' '.join(q_st_word)) + len(' '.join(s_st_word))
        stop_word_diff = (2 * n_st_diff)/len_stop_word

        editD = (1 - ((0.70 * (char_diff)) + (0.10 * (word_diff)) + (0.10 * (symbol_diff)) + (0.10 * (stop_word_diff)))) * 100
      else:
        editD = (1 - ((0.70 * (char_diff)) + (0.15 * (word_diff)) + (0.15 * (symbol_diff)))) * 100

    if editD < 0:
      editD = 10
    return int(math.floor(editD))