Exemplo n.º 1
0
    def group_textboxes(self, laparams, boxes):
        """Merge text boxes pairwise into a single hierarchy of text groups.

        Every pair of boxes is scored with ``dist``.  The best-scoring pair
        is repeatedly removed from the plane and replaced by a group that
        contains both, until only one object is left.

        laparams is not read by this method; it is kept in the signature
        for callers.
        boxes must be a non-empty, indexable sequence of text boxes.  A
        single box is returned as-is (no group is created).

        Returns a list with the one object left in the plane.
        Raises AssertionError if boxes is empty or if the plane does not
        end up holding exactly one object.
        """
        assert boxes

        def dist(obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return (x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height

        def isany(obj1, obj2):
            """Check if there's any other object between obj1 and obj2.

            Returns the set of objects that plane.find() reports for the
            bounding rectangle of the pair, minus the pair itself; an empty
            set is falsy.
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            # `plane` is bound further down, before the first call.
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        def priority(entry):
            """Sort key: the (deferred flag, distance) prefix of an entry."""
            return entry[:2]

        # Queue entries are (deferred, distance, obj1, obj2).
        # XXX this still takes O(n^2)  :(
        dists = []
        count = len(boxes)
        for i, obj1 in enumerate(boxes):
            for j in range(i+1, count):
                obj2 = boxes[j]
                dists.append((0, dist(obj1, obj2), obj1, obj2))
        # Sort on (deferred, distance) only.  Sorting the raw tuples would
        # fall through to comparing the layout objects themselves whenever
        # two entries tie, which is arbitrary at best and a TypeError on
        # Python 3 if the objects define no ordering.  list.sort is stable,
        # so tied entries keep their insertion order.
        dists.sort(key=priority)
        plane = Plane(self.bbox)
        plane.extend(boxes)
        while dists:
            (c, d, obj1, obj2) = dists.pop(0)
            if c == 0 and isany(obj1, obj2):
                # Something lies inside this pair's bounding rectangle:
                # defer the pair once.  When it comes up again with c == 1
                # it is merged without re-checking.
                dists.append((1, d, obj1, obj2))
                continue
            # A vertical member on either side makes the whole group vertical.
            if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
                isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
                group = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            # Drop every queued pair that mentions one of the merged objects.
            # this line is optimized -- don't change without profiling
            dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
            # Score the new group against everything still in the plane
            # (the group itself is added only afterwards).
            for other in plane:
                dists.append((0, dist(group, other), group, other))
            dists.sort(key=priority)
            plane.add(group)
        assert len(plane) == 1
        return list(plane)
Exemplo n.º 2
0
 def group_textlines(self, laparams, lines):
     """Group neighbouring text lines into text boxes (generator).

     All lines are placed in a Plane built from self.bbox.  For each line,
     find_neighbors() is called with laparams.line_margin; a line that is
     not reported among its own neighbours is skipped.  Otherwise the
     neighbours, plus the contents of any box already formed around one
     of them, are collected into a new box.  The box is horizontal when
     the line is an LTTextLineHorizontal and vertical otherwise.

     Yields each resulting box once, in the order its lines appear in
     `lines`, leaving out boxes whose is_empty() is true.
     """
     plane = Plane(self.bbox)
     plane.extend(lines)
     # Maps every grouped line to the box that currently owns it.
     box_of = {}
     for line in lines:
         nearby = line.find_neighbors(plane, laparams.line_margin)
         if line not in nearby:
             continue
         gathered = []
         for neighbor in nearby:
             gathered.append(neighbor)
             # A neighbour that already belongs to a box brings that box's
             # contents along (presumably its lines -- the box is iterated).
             absorbed = box_of.pop(neighbor, None)
             if absorbed is not None:
                 gathered.extend(absorbed)
         box_cls = (LTTextBoxHorizontal
                    if isinstance(line, LTTextLineHorizontal)
                    else LTTextBoxVertical)
         box = box_cls()
         # uniq() presumably drops repeated members; defined elsewhere.
         for member in uniq(gathered):
             box.add(member)
             box_of[member] = box
     emitted = set()
     for line in lines:
         box = box_of.get(line)
         if box is None or box in emitted:
             continue
         emitted.add(box)
         if box.is_empty():
             continue
         yield box
 def group_textlines(self, laparams, lines):
     """Group neighbouring text lines into text boxes (generator).

     All lines are placed in a Plane built from self.bbox.  For each line,
     find_neighbors() is called with laparams.line_margin; a line that is
     not reported among its own neighbours is skipped.  Otherwise the
     neighbours, plus the contents of any box already formed around one
     of them, are collected into a new box.  The box is horizontal when
     the line is an LTTextLineHorizontal and vertical otherwise.

     Yields each resulting box once, in the order its lines appear in
     `lines`, leaving out boxes whose is_empty() is true.
     """
     plane = Plane(self.bbox)
     plane.extend(lines)
     # Maps every grouped line to the box that currently owns it.
     box_of = {}
     for line in lines:
         nearby = line.find_neighbors(plane, laparams.line_margin)
         if line not in nearby:
             continue
         gathered = []
         for neighbor in nearby:
             gathered.append(neighbor)
             # A neighbour that already belongs to a box brings that box's
             # contents along (presumably its lines -- the box is iterated).
             absorbed = box_of.pop(neighbor, None)
             if absorbed is not None:
                 gathered.extend(absorbed)
         box_cls = (LTTextBoxHorizontal
                    if isinstance(line, LTTextLineHorizontal)
                    else LTTextBoxVertical)
         box = box_cls()
         # uniq() presumably drops repeated members; defined elsewhere.
         for member in uniq(gathered):
             box.add(member)
             box_of[member] = box
     emitted = set()
     for line in lines:
         box = box_of.get(line)
         if box is None or box in emitted:
             continue
         emitted.add(box)
         if box.is_empty():
             continue
         yield box