Пример #1
0
 def link_lists_old(self,
                    search_max=200,
                    editCost=20,
                    offsetCost=1,
                    offsetInertia=5):
     """Link each HTML word to a PDF word, storing the mapping in self.links.

     For every (id, text) pair in self.html_word_list, a window of
     `search_max` candidate positions in self.pdf_word_list around a
     running offset estimate is scanned.  Exact text matches are taken
     first; when none is found, the candidate minimizing a combined
     edit-distance / window-rank cost is linked instead.

     Args:
         search_max: number of candidate PDF positions examined per word.
         editCost: weight applied to the string edit distance in the
             fallback cost.
         offsetCost: weight applied to the candidate's rank in the search
             order, preferring candidates near the current offset.
         offsetInertia: number of recent match positions whose median is
             used as the next search offset, smoothing out single bad
             matches.
     """
     # Flip to True locally to collect per-word search statistics.
     DEBUG = False
     if DEBUG:
         offsetHist = []   # offset estimate recorded after each match
         jHist = []        # rank in the search order where each match landed
         editDistHist = 0  # count of words that fell back to edit distance
     # Initial alignment guess between the two word lists.
     offset = self._calculate_offset(self.html_word_list,
                                     self.pdf_word_list,
                                     max((search_max // 10), 5), search_max)
     # Ring buffer of the last `offsetInertia` match positions; the median
     # of these is the running offset, giving it inertia against outliers.
     offsets = [offset] * offsetInertia
     # Relative offsets 0, 1, -1, 2, -2, ... : closest-first search order.
     searchOrder = np.array([(-1)**(i % 2) * (i // 2)
                             for i in range(1, search_max + 1)])
     links = OrderedDict()
     for i, a in enumerate(self.html_word_list):
         j = 0
         # Candidate PDF indices for this word, clamped to list bounds.
         searchIndices = np.clip(offset + searchOrder, 0,
                                 len(self.pdf_word_list) - 1)
         jMax = len(searchIndices)
         matched = False
         # Search first for exact matches
         while not matched and j < jMax:
             b = self.pdf_word_list[searchIndices[j]]
             if a[1] == b[1]:  # a and b are (id, text) pairs; compare text
                 links[a[0]] = b[0]
                 matched = True
                 # Record the position just past the match and re-estimate
                 # the offset as the median of recent match positions.
                 offsets[i % offsetInertia] = searchIndices[j] + 1
                 offset = int(np.median(offsets))
                 if DEBUG:
                     jHist.append(j)
                     offsetHist.append(offset)
             j += 1
         # If necessary, search for min edit distance
         if not matched:
             cost = [0] * search_max
             for k, m in enumerate(searchIndices):
                 cost[k] = (
                     editdist(a[1], self.pdf_word_list[m][1]) * editCost +
                     k * offsetCost)
             nearest = np.argmin(cost)
             links[a[0]] = self.pdf_word_list[searchIndices[nearest]][0]
             # NOTE(review): fuzzy matches do not update `offsets`/`offset`,
             # so the running estimate follows exact matches only.
             if DEBUG:
                 jHist.append(nearest)
                 offsetHist.append(searchIndices[nearest])
                 editDistHist += 1
     if DEBUG:
         self.logger.debug(offsetHist)
         self.logger.debug(jHist)
         self.logger.debug(editDistHist)
         self.offsetHist = offsetHist
     self.links = links
     if self.verbose:
         self.logger.debug(
             "Linked {:d} words to {:d} bounding boxes".format(
                 len(self.html_word_list), len(self.pdf_word_list)))
Пример #2
0
 def link_fuzzy(i):
     """Assign a PDF index to HTML word `i` into html_to_pdf.

     Interpolates a search offset between the anchor pairs bracketing
     `i`, then scans the candidate window: the first candidate sharing a
     prefix or suffix with the word wins outright; otherwise the
     candidate with the lowest edit-distance + window-rank cost is used.
     """
     _, word = self.html_word_list[i]
     # Anchors (lo, hi) bracket i in HTML space; (L, U) are their PDF
     # counterparts. Linearly interpolate i's expected PDF position.
     lo, hi, L, U = get_anchors(i, i)
     # NOTE(review): assumes hi > lo — confirm get_anchors guarantees
     # distinct anchors, otherwise this divides by zero.
     offset = int(L + float(i - lo) / (hi - lo) * (U - L))
     candidates = np.clip(offset + search_order, 0, M - 1)
     costs = [0] * search_max
     for rank, pdf_idx in enumerate(candidates):
         other = self.pdf_word_list[pdf_idx][1]
         is_affix = (word.startswith(other) or word.endswith(other)
                     or other.startswith(word) or other.endswith(word))
         if is_affix:
             # One word contains the other at an edge: accept immediately.
             html_to_pdf[i] = pdf_idx
             return
         costs[rank] = (int(editdist(word, other)) * edit_cost
                        + rank * offset_cost)
     # No affix match anywhere in the window: take the cheapest candidate.
     html_to_pdf[i] = candidates[np.argmin(costs)]
     return
Пример #3
0
def get_lattice_similarity(lattice1: List[List[str]],
                           lattice2: List[List[str]],
                           threshold: float = 0.8,
                           ignore_stress: bool = False) -> float:
    """
    Compare two lattices to find the similarity ratio of the closest phonetic renderings of them.

    "threshold" is the similarity we're trying to match to return a potential link; higher values help us avoid
    expensive computation for the actual similarity score, because candidate pairs whose orderless phoneme-overlap
    (multiset Dice coefficient) falls below it are skipped without computing an edit distance.

    The range of the output is [0, 1], 0 being the least similar, and 1 indicating an identical phonetic rendering
    in the two lattices.  The ratio is computed as 1 - editdist(p1, p2) / max(len(p1), len(p2)) over the phoneme
    sequences of each rendering pair.  Two empty renderings are considered identical (ratio 1).
    """
    max_ratio = 0
    for p1 in lattice_to_phonemes(
            lattice1):  # for each rendering from lattice1
        if ignore_stress:
            p1 = remove_stress(p1)
        # Split once; the phoneme list, its Counter, and its length are
        # reused across every rendering of lattice2.
        p1 = p1.split()
        c1 = Counter(p1)
        l1 = len(p1)
        for p2 in lattice_to_phonemes(
                lattice2):  # for each rendering from lattice 2
            if ignore_stress:
                p2 = remove_stress(p2)
            p2 = p2.split()
            c2 = Counter(p2)
            l2 = len(p2)
            if l1 + l2 == 0:
                # Both renderings empty: identical by definition.  Guard
                # prevents ZeroDivisionError in the filter and the ratio.
                return 1
            # Multiset Dice coefficient as an orderless filter: if the
            # candidates can't pass it, there's no use computing the
            # (more expensive) edit distance.
            if sum((c1 & c2).values()) * 2 / (l1 + l2) < threshold:
                continue

            ratio = 1 - editdist(p1, p2) / max(l1, l2)
            if ratio == 1:
                return 1

            if ratio > max_ratio:
                max_ratio = ratio
    return max_ratio
Пример #4
0
def _masked_edit_dist(src, tsf, pivots, id2word, output_path):
  """Measure edit distances between pivot-masked source/transfer sentences.

  Every word id in the pivot vocabulary (pivots[0] | pivots[1]) is replaced
  by 0 (the '_PAD' id) in both sentences before computing the edit distance
  between their space-joined string forms.  The formatted, unmasked sentence
  pairs are also written to `output_path` for inspection.

  Args:
    src: iterable of source sentences (lists of word ids).
    tsf: iterable of transferred sentences, iterated in lockstep with src.
    pivots: pair of word-id sets; their union is masked out.
    id2word: id -> word mapping passed to _format_sentence.
    output_path: path of the text file receiving the formatted pairs.

  Returns:
    (avg_dist, distances, dist_distribution): the mean edit distance, a
    Counter of the distances, and an 8-bucket distribution where bucket i
    (i < 7) holds the fraction of pairs with distance i and the last
    bucket absorbs the remaining mass.
  """
  distances = []
  print('output write to:\n%s' % output_path)
  # The masked vocabulary is loop-invariant; build the union once.
  pivot_set = pivots[0] | pivots[1]
  # `with` guarantees the handle is closed (the original leaked it).
  with open(output_path, 'w') as fd:
    for s, t in zip(src, tsf):
      s_masked = ' '.join(
          str(w if w not in pivot_set else 0) for w in s)  # 0 = '_PAD'
      t_masked = ' '.join(
          str(w if w not in pivot_set else 0) for w in t)  # 0 = '_PAD'
      distances.append(editdist(s_masked, t_masked))

      fd.write('s: %s\n' % _format_sentence(s, id2word, pivots))
      fd.write('t: %s\n' % _format_sentence(t, id2word, pivots))
  avg_dist = np.average(distances)
  distances = Counter(distances)

  # Buckets 0..6 are exact-distance frequencies; bucket 7 is the tail.
  dist_distribution = np.zeros(8)
  for d in range(8):
    if d < 7:
      dist_distribution[d] = float(distances[d]) / len(src)
    else:
      dist_distribution[d] = 1 - dist_distribution[:d].sum()
  return avg_dist, distances, dist_distribution