예제 #1
0
 def analyze(self, laparams):
     """Analyze the group, then reorder its members top-right to bottom-left.

     The sort key mixes the horizontal extent (x0+x1) and the top edge (y1)
     of each member, weighted by laparams.boxes_flow; both terms are negated
     so that larger values sort first.
     """
     LTTextGroup.analyze(self, laparams)

     def tbrl_key(member):
         horizontal = (1+laparams.boxes_flow) * (member.x0+member.x1)
         vertical = (1-laparams.boxes_flow) * member.y1
         return -horizontal - vertical

     self._objs = csort(self._objs, key=tbrl_key)
예제 #2
0
 def analyze(self, laparams):
     """Run the base group analysis and sort members top-right first.

     Members are ordered by a weighted sum of their horizontal extent
     (x0+x1) and their top edge (y1); laparams.boxes_flow sets the weights.
     Both parts enter the key negated, so larger values come earlier.
     """
     LTTextGroup.analyze(self, laparams)

     def ordering(item):
         x_part = (1+laparams.boxes_flow) * (item.x0+item.x1)
         y_part = (1-laparams.boxes_flow) * item.y1
         return -x_part - y_part

     self._objs = csort(self._objs, key=ordering)
예제 #3
0
 def analyze(self, laparams):
     """Analyze the box, then order its members by descending x1."""
     LTTextBox.analyze(self, laparams)

     def right_edge_first(member):
         # Negated so that the member with the largest x1 sorts first.
         return -member.x1

     self._objs = csort(self._objs, key=right_edge_first)
예제 #4
0
 def analyze(self, laparams):
     """Run the base box analysis and sort members from largest x1 down."""
     LTTextBox.analyze(self, laparams)

     def descending_x1(item):
         # A negated right edge makes an ascending sort yield descending x1.
         return -item.x1

     self._objs = csort(self._objs, key=descending_x1)
예제 #5
0
class LTLayoutContainer(LTContainer):
    """A container that arranges its characters into lines, boxes and groups.

    analyze() replaces the LTChar children with text boxes and stores the
    root of the resulting grouping tree in `layout`.
    """

    def __init__(self, bbox):
        LTContainer.__init__(self, bbox)
        # Root of the text-box grouping tree; filled in by analyze().
        self.layout = None
        return

    def analyze(self, laparams):
        """Group the contained characters into text boxes.

        On return self._objs holds the text boxes (sorted by their assigned
        index), followed by the non-character objects and the empty text
        lines, and self.layout holds the value returned by
        group_textboxes() (None when no non-empty text line was found).

        When the container holds no LTChar at all, nothing is modified.

        Returns self in every case.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self._objs)
        if not textobjs:
            # Return self here too, so that the result can be used the
            # same way as on the normal path.
            return self
        textlines = list(self.get_textlines(laparams, textobjs))
        assert len(textobjs) <= sum( len(line._objs) for line in textlines )
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        textboxes = list(self.get_textboxes(laparams, textlines))
        assert len(textlines) == sum( len(box._objs) for box in textboxes )
        # group_textboxes() requires at least one box; when every line was
        # empty there is nothing to group and the layout stays None.
        top = None
        if textboxes:
            top = self.group_textboxes(laparams, textboxes)
        def assign_index(obj, i):
            # Walk the grouping tree depth-first and number each text box
            # in the order it is reached; returns the next free index.
            if isinstance(obj, LTTextBox):
                obj.index = i
                i += 1
            elif isinstance(obj, LTTextGroup):
                for x in obj:
                    i = assign_index(x, i)
            return i
        assign_index(top, 0)
        textboxes.sort(key=lambda box:box.index)
        self._objs = textboxes + otherobjs + empties
        self.layout = top
        return self

    def get_textlines(self, laparams, objs):
        """Join consecutive objects into horizontal or vertical text lines.

        Each object is compared only with the one immediately before it.
        Uses laparams.line_overlap, laparams.char_margin,
        laparams.detect_vertical and laparams.word_margin.

        Yields the result of line.analyze(laparams) for every line built.
        Yields nothing when objs is empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # k is a bit set: 1 = horizontally aligned, 2 = vertically aligned.
                k = 0
                if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
                    min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
                    obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin):
                    # obj0 and obj1 are horizontally aligned:
                    #
                    #   +------+ - - -
                    #   | obj0 | - - +------+   -
                    #   |      |     | obj1 |   | (line_overlap)
                    #   +------+ - - |      |   -
                    #          - - - +------+
                    #
                    #          |<--->|
                    #        (char_margin)
                    k |= 1
                if (laparams.detect_vertical and
                    obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
                    min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
                    obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin):
                    # obj0 and obj1 are vertically aligned:
                    #
                    #   +------+
                    #   | obj0 |
                    #   |      |
                    #   +------+ - - -
                    #     |    |     | (char_margin)
                    #     +------+ - -
                    #     | obj1 |
                    #     |      |
                    #     +------+
                    #
                    #     |<-->|
                    #   (line_overlap)
                    k |= 2
                if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
                     (k & 2 and isinstance(line, LTTextLineVertical)) ):
                    # obj1 continues the current line in its own direction.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the current line: close it. obj1
                    # becomes obj0 and is placed on the next iteration
                    # (or after the loop if it is the last object).
                    yield line.analyze(laparams)
                    line = None
                else:
                    # No line is open; obj0 has not been placed yet.
                    if k == 2:
                        line = LTTextLineVertical(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    elif k == 1:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    else:
                        # Not aligned (k == 0) or aligned both ways (k == 3):
                        # obj0 forms a line of its own.
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        yield line.analyze(laparams)
                        line = None
            obj0 = obj1
        if obj0 is None:
            # objs was empty: there is no object to put on a line.
            return
        if line is None:
            # The last object has not been placed on any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line.analyze(laparams)
        return

    def get_textboxes(self, laparams, lines):
        """Merge neighboring text lines into text boxes.

        For each line, the neighbors reported by
        line.find_neighbors(plane, laparams.line_margin) are put into one
        box, together with the members of any box a neighbor already
        belonged to. `lines` is iterated more than once, so it must be a
        sequence rather than a one-shot iterator.

        Yields box.analyze(laparams) once per resulting box, ordered by the
        first line of `lines` that belongs to it.
        """
        plane = Plane(lines)
        # Maps each line to the box that currently contains it.
        boxes = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            assert line in neighbors, line
            members = []
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the box that obj1 was previously assigned to.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            box = boxes[line]
            if box in done: continue
            done.add(box)
            yield box.analyze(laparams)
        return

    def group_textboxes(self, laparams, boxes):
        """Merge text boxes pairwise until a single group remains.

        In every round the pair with the smallest dist() is replaced by a
        group containing both. A pair with a positive distance whose
        bounding rectangle also holds some other object is considered only
        after all other pairs. The group is an LTTextGroupTBRL when either
        member is an LTTextBoxVertical or an LTTextGroupTBRL, otherwise an
        LTTextGroupLRTB.

        `boxes` must not be empty (asserted); the caller's list is not
        modified. Returns the last remaining object, which is the box
        itself when only one box was given.
        """
        def dist(bbox, obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1,y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0,y0) +..........+------+
            """
            # Unpacked here rather than in the signature: tuple parameters
            # are not valid syntax on Python 3.
            (x0, y0, x1, y1) = bbox
            return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
        boxes = boxes[:]
        # XXX this is very slow when there're many textboxes.
        while 2 <= len(boxes):
            mindist = (INF,0)
            minpair = None
            plane = Plane(boxes)
            boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
            # Visit every unordered pair once, in index order.
            for (i, obj1) in enumerate(boxes):
                for obj2 in boxes[i+1:]:
                    b = (min(obj1.x0,obj2.x0), min(obj1.y0,obj2.y0),
                         max(obj1.x1,obj2.x1), max(obj1.y1,obj2.y1))
                    others = set(plane.find(b)).difference((obj1,obj2))
                    d = dist(b, obj1, obj2)
                    # disregard if there's any other object in between.
                    if 0 < d and others:
                        d = (1,d)
                    else:
                        d = (0,d)
                    if mindist <= d: continue
                    mindist = d
                    minpair = (obj1, obj2)
            assert minpair is not None, boxes
            (obj1, obj2) = minpair
            boxes.remove(obj1)
            boxes.remove(obj2)
            if (isinstance(obj1, LTTextBoxVertical) or
                isinstance(obj2, LTTextBoxVertical) or
                isinstance(obj1, LTTextGroupTBRL) or
                isinstance(obj2, LTTextGroupTBRL)):
                group = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            boxes.append(group.analyze(laparams))
        assert len(boxes) == 1
        return boxes.pop()
예제 #6
0
    def group_textboxes(self, laparams, boxes):
        """Merge text boxes pairwise until a single object is left.

        Candidate pairs are kept in a list ordered by (class, distance).
        The closest pair is merged into a group unless some other object
        lies inside the pair's bounding rectangle; such a pair is moved to
        class 1 and retried after all class-0 pairs. The group is an
        LTTextGroupTBRL when either member is an LTTextBoxVertical or an
        LTTextGroupTBRL, otherwise an LTTextGroupLRTB.

        `boxes` must not be empty (asserted). `laparams` is not used here.
        Returns a one-element list holding the remaining object.
        """
        assert boxes

        def bounds(obj1, obj2):
            # Bounding rectangle (x0, y0, x1, y1) enclosing both objects.
            return (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0),
                    max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))

        def dist(obj1, obj2):
            """A distance function between two TextBoxes.

            Take the rectangle that encloses obj1 and obj2 and subtract
            the areas of both objects from its area. What remains is the
            region marked 'www' below; the result may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            (x0, y0, x1, y1) = bounds(obj1, obj2)
            return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)

        def isany(obj1, obj2):
            """Return the objects other than obj1 and obj2 that the plane
            finds inside their bounding rectangle (empty set if none).
            """
            found = set(plane.find(bounds(obj1, obj2)))
            return found.difference((obj1, obj2))

        # XXX building every pair still takes O(n^2)  :(
        dists = []
        for (i, first) in enumerate(boxes):
            for second in boxes[i+1:]:
                dists.append((0, dist(first, second), first, second))
        # csort() rather than dists.sort(): the latter would randomize
        # the test result.
        dists = csort(dists)
        plane = Plane(self.bbox)
        plane.extend(boxes)
        while dists:
            (c, d, obj1, obj2) = dists.pop(0)
            if c == 0 and isany(obj1, obj2):
                # Something is in between: retry after all class-0 pairs.
                dists.append((1, d, obj1, obj2))
                continue
            vertical = (
                isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
                isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)))
            if vertical:
                group = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            # Drop every pair that refers to an object no longer in the plane.
            remaining = []
            for entry in dists:
                if entry[2] in plane and entry[3] in plane:
                    remaining.append(entry)
            dists = remaining
            # Pair the new group with everything still in the plane.
            dists.extend((0, dist(group, other), group, other) for other in plane)
            dists = csort(dists)
            plane.add(group)
        assert len(plane) == 1
        return list(plane)