Exemplo n.º 1
0
    def group_textboxes(self, laparams, boxes):
        """Merge text boxes pairwise until a single nested group remains.

        Every pair of boxes is queued as a candidate, ordered by the
        ``dist`` score below.  The best candidate is popped, wrapped in a
        two-member group, and the group is queued against every object
        still on the plane.  A candidate pair that has some other object
        inside its bounding rectangle is pushed back once (flag 1) so
        unobstructed pairs are merged first.

        ``boxes`` must be a non-empty, indexable sequence.  ``laparams``
        is not read here.  Returns a one-element list holding the
        top-level group (or the lone box when only one was given).
        """
        assert boxes

        def dist(obj1, obj2):
            """Score how far apart two text objects are.

            Take the rectangle that bounds both objects and subtract the
            area of each object from the rectangle's area; the remainder
            is the 'www' region below.  The result can be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            bounding_area = (x1-x0)*(y1-y0)
            return (bounding_area - obj1.width*obj1.height - obj2.width*obj2.height)

        def isany(obj1, obj2):
            """Return the objects on the plane, other than obj1 and obj2,
            found within the rectangle bounding the two (empty if none).
            """
            bounds = (min(obj1.x0, obj2.x0),
                      min(obj1.y0, obj2.y0),
                      max(obj1.x1, obj2.x1),
                      max(obj1.y1, obj2.y1))
            return set(plane.find(bounds)).difference((obj1, obj2))

        # Candidate queue of (deferred_flag, score, a, b) for every pair of
        # boxes; building it is quadratic in the number of boxes.
        dists = [(0, dist(boxes[i], boxes[j]), boxes[i], boxes[j])
                 for i in xrange(len(boxes))
                 for j in xrange(i+1, len(boxes))]
        dists.sort()
        plane = Plane(self.bbox)
        plane.extend(boxes)
        while dists:
            (deferred, score, first, second) = dists.pop(0)
            if deferred == 0 and isany(first, second):
                # Something sits between the two: retry after all the
                # not-yet-deferred candidates (flag 1 sorts after flag 0).
                dists.append((1, score, first, second))
                continue
            # A vertical member makes the whole group top-to-bottom,
            # right-to-left; otherwise left-to-right, top-to-bottom.
            group_class = (
                LTTextGroupTBRL
                if (isinstance(first, (LTTextBoxVertical, LTTextGroupTBRL)) or
                    isinstance(second, (LTTextBoxVertical, LTTextGroupTBRL)))
                else LTTextGroupLRTB)
            group = group_class([first, second])
            plane.remove(first)
            plane.remove(second)
            # Drop candidates that mention an object no longer on the plane.
            # this line is optimized -- don't change without profiling
            dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
            # Queue the new group against everything that is left.
            dists.extend((0, dist(group, other), group, other)
                         for other in plane)
            dists.sort()
            plane.add(group)
        assert len(plane) == 1
        return list(plane)
Exemplo n.º 2
0
 def group_textboxes(self, laparams, boxes):
     """Merge text boxes pairwise until a single nested group remains.

     Only consecutive boxes (boxes[k], boxes[k+1]) are queued as initial
     candidates, and the queue is never sorted, so candidates are handled
     in insertion order.  A candidate with another object between its two
     members (per ``isany``) is re-queued once with flag 1.

     ``laparams`` is not read here.  Returns a one-element list holding
     the top-level group.
     """
     # Candidate queue of (deferred_flag, score, a, b) for adjacent boxes.
     dists = [(0, dist(left, right), left, right)
              for (left, right) in zip(boxes[0:], boxes[1:])]
     plane = Plane(boxes)
     while dists:
         (deferred, score, first, second) = dists.pop(0)
         if deferred == 0 and isany(first, second, plane):
             # Obstructed pair: push to the back and try it again later.
             dists.append((1, score, first, second))
             continue
         # A vertical member makes the merged group vertical as well.
         if (isinstance(first, (LTTextBoxVertical, LTTextGroupTBRL)) or
                 isinstance(second, (LTTextBoxVertical, LTTextGroupTBRL))):
             group = LTTextGroupTBRL([first, second])
         else:
             group = LTTextGroupLRTB([first, second])
         plane.remove(first)
         plane.remove(second)
         # Discard every candidate that refers to either merged object.
         merged = (first, second)
         dists = [entry for entry in dists
                  if entry[2] not in merged and entry[3] not in merged]
         # Queue the new group against everything still on the plane.
         dists.extend((0, dist(group, other), group, other)
                      for other in plane)
         plane.add(group)
     assert len(plane) == 1
     return list(plane)
Exemplo n.º 3
0
 def group_textboxes(self, laparams, boxes):
     """Merge text boxes pairwise until a single nested group remains.

     When more than 100 boxes are given, grouping is skipped: a message is
     logged and ``boxes`` is returned unchanged.  Otherwise consecutive
     boxes are queued as candidates ordered by ``dist``; the best
     candidate is merged into a two-member group, which is then queued
     against every object still on the plane.  A candidate with another
     object between its members (per ``isany``) is re-queued once with
     flag 1.

     ``laparams`` is not read here.  Returns ``boxes`` itself on the
     skip path, else a one-element list holding the top-level group.
     """
     if len(boxes) > 100:
         # Nesting 2-sized subgroups over this many boxes would take too
         # long and would not produce a meaningful grouping anyway.
         logging.info("Too many boxes (%d) to group, skipping.", len(boxes))
         return boxes
     # Candidate queue of (deferred_flag, score, a, b) for adjacent boxes.
     dists = [(0, dist(left, right), left, right)
              for left, right in zip(boxes[0:], boxes[1:])]
     dists.sort()
     plane = Plane(boxes)
     while dists:
         (deferred, score, first, second) = dists.pop(0)
         if deferred == 0 and isany(first, second, plane):
             # Obstructed pair: retry after the not-yet-deferred candidates.
             dists.append((1, score, first, second))
             continue
         # A vertical member makes the merged group vertical as well.
         group_class = (
             LTTextGroupTBRL
             if (isinstance(first, (LTTextBoxVertical, LTTextGroupTBRL)) or
                 isinstance(second, (LTTextBoxVertical, LTTextGroupTBRL)))
             else LTTextGroupLRTB)
         group = group_class([first, second])
         plane.remove(first)
         plane.remove(second)
         # Discard every candidate that refers to either merged object.
         merged = (first, second)
         dists = [entry for entry in dists
                  if entry[2] not in merged and entry[3] not in merged]
         # Queue the new group against everything still on the plane.
         dists.extend((0, dist(group, other), group, other)
                      for other in plane)
         dists.sort()
         plane.add(group)
     assert len(plane) == 1
     return list(plane)
Exemplo n.º 4
0
 def get_textboxes(self, laparams, lines):
     """Cluster text lines into text boxes and yield each analyzed box.

     For every line, its neighbors (``line.find_neighbors`` with
     ``laparams.line_margin``) are collected together with the members of
     any box those neighbors were already assigned to; all of them are
     placed in a fresh box whose orientation follows the current line's
     type.  Finally, walking ``lines`` in order, each distinct box is
     yielded once as ``box.analyze(laparams)``.

     ``lines`` is iterated several times, so it must be re-iterable.
     """
     plane = Plane(lines)
     for line in lines:
         plane.add(line)
     plane.finish()
     # Maps each line to the box that currently owns it.
     boxes = {}
     for line in lines:
         neighbors = line.find_neighbors(plane, laparams.line_margin)
         assert line in neighbors, line
         members = []
         for neighbor in neighbors:
             members.append(neighbor)
             if neighbor in boxes:
                 # Absorb the neighbor's existing box into the new one.
                 members.extend(boxes.pop(neighbor))
         box = (LTTextBoxHorizontal()
                if isinstance(line, LTTextLineHorizontal)
                else LTTextBoxVertical())
         for member in uniq(members):
             box.add(member)
             boxes[member] = box
     # Several lines share one box; emit each box only the first time.
     emitted = set()
     for line in lines:
         box = boxes[line]
         if box not in emitted:
             emitted.add(box)
             yield box.analyze(laparams)