def find_clusters(para_blocks):
  """Cluster the blocks of several documents along their common paths.

  Args:
    para_blocks: a re-iterable collection (it is traversed twice) holding
      one sequence of blocks per document. Every block must expose a
      ``path`` attribute.

  Returns:
    A list of LayoutSectionCluster objects whose first constructor
    argument counts up from 0. A cluster is skipped when the entry
    belonging to the first document is falsy. An empty ``para_blocks``
    gives an empty list.
  """
  # Private marker for "no previous block yet". Using None for this would
  # prevent consecutive blocks whose path is None from being collapsed.
  no_prev = object()

  def uniq_path(blocks):
    # Paths of the blocks in order, with runs of equal paths collapsed.
    paths = []
    prev = no_prev
    for b in blocks:
      if prev is no_prev or b.path != prev:
        paths.append(b.path)
      prev = b.path
    return paths

  def find_common(seqs):
    # Fold find_lcs over all sequences; returns None when seqs is empty.
    common = None
    for seq in seqs:
      if common is None:
        common = seq
      else:
        # assumes find_lcs yields (index_in_first, index_in_second) pairs;
        # only the index into the running result is needed -- TODO confirm
        common = [ common[i0] for (i0, _) in find_lcs(common, seq) ]
    return common

  # obtain the common paths.
  common_paths = find_common([ uniq_path(blocks) for blocks in para_blocks ])

  # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
  #                ...
  #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
  clusters = zip(*[ retrieve_blocks(common_paths, blocks) for blocks in para_blocks ])

  # compare each cluster of text blocks.
  layout = []
  for blockgroups in clusters:
    # Only the first document's group decides whether the cluster is kept.
    if blockgroups[0]:
      layout.append(LayoutSectionCluster(len(layout), blockgroups))

  return layout
def find_clusters(para_blocks):
    """Group blocks from several documents into clusters by shared path.

    Args:
        para_blocks: a collection with one sequence of blocks per document.
            It is iterated twice, so it must not be a one-shot iterator.
            Each block needs a ``path`` attribute.

    Returns:
        A list of LayoutSectionCluster objects, each constructed with its
        position in the list and the tuple of per-document block groups.
        Clusters whose first-document group is falsy are left out. With no
        documents at all the result is an empty list.
    """
    # Unique marker meaning "nothing seen yet"; unlike None it can never be
    # mistaken for a real path value.
    _unset = object()

    def uniq_path(blocks):
        # Keep the path of each block unless it repeats the previous one.
        collapsed = []
        last_path = _unset
        for block in blocks:
            path = block.path
            if last_path is _unset or path != last_path:
                collapsed.append(path)
            last_path = path
        return collapsed

    def find_common(seqs):
        # Reduce all sequences to their common part via find_lcs; the first
        # sequence seeds the result. Returns None for an empty input.
        result = None
        for candidate in seqs:
            if result is None:
                result = candidate
                continue
            # presumably find_lcs returns index pairs (into result, into
            # candidate); verify against its definition
            result = [result[i0] for (i0, _i1) in find_lcs(result, candidate)]
        return result

    # obtain the common paths.
    common_paths = find_common([uniq_path(blocks) for blocks in para_blocks])

    # clusters = [ ( doc1_blocks1, doc2_blocks1, ..., docm_blocks1 ),
    #                ...
    #              ( doc1_blocksn, doc2_blocksn, ..., docm_blocksn ) ]
    clusters = zip(
        *[retrieve_blocks(common_paths, blocks) for blocks in para_blocks])

    # compare each cluster of text blocks.
    layout = []
    for blockgroups in clusters:
        # The first document's group alone decides whether to keep the cluster.
        if blockgroups[0]:
            layout.append(LayoutSectionCluster(len(layout), blockgroups))

    return layout
Пример #3
0
 def match_blocks(self, blocks0, strict=True):
   """Build one LayoutSection per pattern entry from the blocks in blocks0.

   self.pattern is read as a collection of (diffscore, mainscore, path)
   triples. All paths are handed to retrieve_blocks together with blocks0,
   and its results are paired positionally with the two score lists.

   Returns the list of LayoutSection objects, whose first constructor
   argument counts up from 0, or None as soon as a retrieved group is
   falsy while strict is true.
   """
   diffscores = [ diff for (diff, _main, _path) in self.pattern ]
   mainscores = [ main for (_diff, main, _path) in self.pattern ]
   paths = [ path for (_diff, _main, path) in self.pattern ]
   groups = retrieve_blocks(paths, blocks0)
   sections = []
   # Every iteration either appends or returns, so the enumerate index
   # always equals the current length of sections.
   for (index, (diffscore, mainscore, found)) in enumerate(zip(diffscores, mainscores, groups)):
     if strict and not found:
       return None
     sections.append(LayoutSection(index, diffscore, mainscore, found))
   return sections
 def match_blocks(self, blocks0, strict=True):
   """Match blocks0 against self.pattern and return the resulting sections.

   Each element of self.pattern is unpacked as a (diffscore, mainscore,
   path) triple. retrieve_blocks is called once with every path and
   blocks0; its results are zipped with the score lists in order.

   With strict true, None is returned at the first falsy retrieved group.
   Otherwise every group, falsy or not, becomes a LayoutSection in the
   returned list.
   """
   diffs = [ entry_diff for (entry_diff, _, _) in self.pattern ]
   mains = [ entry_main for (_, entry_main, _) in self.pattern ]
   paths = [ entry_path for (_, _, entry_path) in self.pattern ]
   result = []
   for scored in zip(diffs, mains, retrieve_blocks(paths, blocks0)):
     (diffscore, mainscore, blocks1) = scored
     if not strict or blocks1:
       # The section number is its position in the list built so far.
       result.append(LayoutSection(len(result), diffscore, mainscore, blocks1))
       continue
     # strict mode and nothing retrieved for this entry: no match at all.
     return None
   return result