def analyze(self, laparams):
    """Analyze the group, then reorder its children for vertical text.

    The children are re-sorted (via ``csort``) from top-right to
    bottom-left.  ``laparams.boxes_flow`` balances the two terms of the
    sort key: at +1 only the horizontal position counts, at -1 only the
    top edge counts.
    """
    LTTextGroup.analyze(self, laparams)

    def tbrl_order(obj):
        # Both terms are negated so that objects further right / higher
        # up receive the smaller key.
        return (-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
                - (1-laparams.boxes_flow)*(obj.y1))

    self._objs = csort(self._objs, key=tbrl_order)
    return
def analyze(self, laparams):
    """Run the base group analysis and sort children top-right first.

    The sort key is the negated, weighted sum of an object's horizontal
    extent (``x0 + x1``) and its top edge (``y1``); the weights are
    ``1 + boxes_flow`` and ``1 - boxes_flow`` respectively.
    """
    LTTextGroup.analyze(self, laparams)

    # Weights are fixed for the whole sort, so read them once.
    horizontal_weight = 1+laparams.boxes_flow
    vertical_weight = 1-laparams.boxes_flow

    def sort_key(obj):
        # reorder the objects from top-right to bottom-left.
        return -horizontal_weight*(obj.x0+obj.x1) - vertical_weight*(obj.y1)

    self._objs = csort(self._objs, key=sort_key)
    return
def analyze(self, laparams):
    """Run the base text-box analysis, then order children right to left.

    Children are re-sorted (via ``csort``) by descending right edge
    (``x1``).
    """
    LTTextBox.analyze(self, laparams)

    def right_edge_first(obj):
        # Negate so that the largest x1 gets the smallest key.
        return -obj.x1

    self._objs = csort(self._objs, key=right_edge_first)
    return
class LTLayoutContainer(LTContainer):
    """Container that arranges its characters into lines, boxes and groups.

    ``analyze`` replaces the flat list of children in ``self._objs`` with
    text boxes (followed by the non-character objects and the empty text
    lines) and stores the root of the box-grouping tree in ``self.layout``.
    """

    def __init__(self, bbox):
        LTContainer.__init__(self, bbox)
        # Root of the grouping tree built by analyze(); None when no
        # non-empty text box has been found.
        self.layout = None
        return

    def analyze(self, laparams):
        """Perform layout analysis on the contained objects.

        Characters are grouped into text lines, lines into text boxes, and
        boxes into a hierarchy of text groups.  Every text box receives an
        ``index`` attribute reflecting its position in a depth-first walk
        of that hierarchy, and ``self._objs`` is rebuilt as
        ``textboxes + otherobjs + empties``.

        laparams: layout parameters, passed through to the helper methods.

        Returns ``self`` in every case, including when there is no text.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar),
                                       self._objs)
        if not textobjs:
            # Nothing to lay out.  Return self (not None) so the return
            # value is the same on every path.
            return self
        textlines = list(self.get_textlines(laparams, textobjs))
        assert len(textobjs) <= sum(len(line._objs) for line in textlines)
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        textboxes = list(self.get_textboxes(laparams, textlines))
        assert len(textlines) == sum(len(box._objs) for box in textboxes)

        def assign_index(obj, i):
            # Depth-first numbering: text boxes consume one index each,
            # groups just pass the counter through their children.
            if isinstance(obj, LTTextBox):
                obj.index = i
                i += 1
            elif isinstance(obj, LTTextGroup):
                for x in obj:
                    i = assign_index(x, i)
            return i

        if textboxes:
            top = self.group_textboxes(laparams, textboxes)
            assign_index(top, 0)
            textboxes.sort(key=lambda box: box.index)
        else:
            # Every text line was empty, so there is nothing to group;
            # group_textboxes() cannot handle an empty list.
            top = None
        self._objs = textboxes + otherobjs + empties
        self.layout = top
        return self

    def get_textlines(self, laparams, objs):
        """Group consecutive characters into text lines.

        Walks ``objs`` in order, comparing each object with its
        predecessor, and yields analyzed LTTextLineHorizontal /
        LTTextLineVertical objects.  Yields nothing when ``objs`` is empty.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # Bit 1: horizontally aligned, bit 2: vertically aligned.
                k = 0
                if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
                    min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
                    obj0.hdistance(obj1) < max(obj0.width, obj1.width) * laparams.char_margin):
                    # obj0 and obj1 is horizontally aligned:
                    #
                    #   +------+ - - -
                    #   | obj0 | - - +------+   -
                    #   |      |     | obj1 |   | (line_overlap)
                    #   +------+ - - |      |   -
                    #          - - - +------+
                    #
                    #          |<--->|
                    #        (char_margin)
                    k |= 1
                if (laparams.detect_vertical and
                    obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
                    min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
                    obj0.vdistance(obj1) < max(obj0.height, obj1.height) * laparams.char_margin):
                    # obj0 and obj1 is vertically aligned:
                    #
                    #   +------+
                    #   | obj0 |
                    #   |      |
                    #   +------+ - - -
                    #     |    |     | (char_margin)
                    #     +------+ - -
                    #     | obj1 |
                    #     |      |
                    #     +------+
                    #
                    #     |<-->|
                    #   (line_overlap)
                    k |= 2
                if ((k & 1 and isinstance(line, LTTextLineHorizontal)) or
                    (k & 2 and isinstance(line, LTTextLineVertical))):
                    # obj1 continues the line currently being built.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the open line: flush it.  obj1
                    # becomes obj0 and is handled on the next iteration.
                    yield line.analyze(laparams)
                    line = None
                else:
                    # No open line: start one from obj0 (and obj1 when the
                    # pair is aligned), or emit obj0 alone.
                    if k == 2:
                        line = LTTextLineVertical(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    elif k == 1:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    else:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        yield line.analyze(laparams)
                        line = None
            obj0 = obj1
        if obj0 is None:
            # Empty input: there is no trailing object to flush.
            return
        if line is None:
            # The last object was not attached to any line yet.
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line.analyze(laparams)
        return

    def get_textboxes(self, laparams, lines):
        """Cluster neighboring text lines into text boxes.

        For every line, its neighbors (as reported by
        ``line.find_neighbors`` with ``laparams.line_margin``) and the
        members of any box those neighbors already belong to are merged
        into one new box.  Yields each resulting box once, analyzed, in
        the order its first line appears in ``lines``.
        """
        plane = Plane(lines)
        # Maps each line to the box that currently contains it.
        boxes = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            assert line in neighbors, line
            members = []
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    # Absorb the box this neighbor was previously put in.
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            yield box.analyze(laparams)
        return

    def group_textboxes(self, laparams, boxes):
        """Merge text boxes pairwise into a single tree of text groups.

        Repeatedly picks the closest pair (see ``dist``), preferring pairs
        with no other object inside their joint bounding rectangle, and
        replaces the pair with an analyzed LTTextGroupTBRL (if either
        member is vertical) or LTTextGroupLRTB.

        boxes: non-empty list of text boxes; the list itself is not
        modified.

        Returns the single remaining object: the root group, or the lone
        box when only one was given.
        """
        def dist(bbox, obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.

                    +------+..........+ (x1,y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0,y0) +..........+------+
            """
            # Unpacked here instead of in the signature: tuple parameters
            # were removed from the language by PEP 3113.
            (x0, y0, x1, y1) = bbox
            return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)

        boxes = boxes[:]
        # XXX this is very slow when there're many textboxes.
        while 2 <= len(boxes):
            mindist = (INF, 0)
            minpair = None
            plane = Plane(boxes)
            boxes = csort(boxes, key=lambda obj: obj.width*obj.height)
            for (i, obj1) in enumerate(boxes):
                for obj2 in boxes[i+1:]:
                    b = (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0),
                         max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))
                    d = dist(b, obj1, obj2)
                    cand = (0, d)
                    if mindist <= cand:
                        # Even unobstructed this pair cannot beat the best
                        # one, so skip the plane lookup entirely.
                        continue
                    # disregard if there's any other object in between.
                    if 0 < d and set(plane.find(b)).difference((obj1, obj2)):
                        cand = (1, d)
                        if mindist <= cand:
                            continue
                    mindist = cand
                    minpair = (obj1, obj2)
            assert minpair is not None, boxes
            (obj1, obj2) = minpair
            boxes.remove(obj1)
            boxes.remove(obj2)
            if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
                isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
                group = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            boxes.append(group.analyze(laparams))
        assert len(boxes) == 1
        return boxes.pop()
def group_textboxes(self, laparams, boxes):
    """Merge text boxes pairwise, closest first, until one object remains.

    All pairwise distances are computed up front and kept sorted.  The
    closest pair is merged into an LTTextGroupTBRL (when either member is
    an LTTextBoxVertical or LTTextGroupTBRL) or an LTTextGroupLRTB,
    unless some other object lies inside the pair's bounding rectangle;
    such a pair is deferred once (its tag goes from 0 to 1) and merged
    regardless when it comes up again.

    laparams is accepted but not used by this method.

    Returns a one-element list holding whatever is left in the plane.
    """
    assert boxes

    def bounds(obj1, obj2):
        """Return (x0, y0, x1, y1) of the rectangle enclosing both objects."""
        return (min(obj1.x0, obj2.x0), min(obj1.y0, obj2.y0),
                max(obj1.x1, obj2.x1), max(obj1.y1, obj2.y1))

    def dist(obj1, obj2):
        """A distance function between two TextBoxes.

        The area of the joint bounding rectangle minus the areas of
        obj1 and obj2 (the 'www' region). May be negative.

                +------+..........+ (x1, y1)
                | obj1 |wwwwwwwwww:
                +------+www+------+
                :wwwwwwwwww| obj2 |
        (x0, y0) +..........+------+
        """
        (x0, y0, x1, y1) = bounds(obj1, obj2)
        return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)

    def isany(obj1, obj2):
        """Return the other objects found in the rectangle around obj1 and obj2."""
        found = set(plane.find(bounds(obj1, obj2)))
        return found.difference((obj1, obj2))

    # XXX this still takes O(n^2) :(
    dists = [
        (0, dist(boxes[i], boxes[j]), boxes[i], boxes[j])
        for i in xrange(len(boxes))
        for j in xrange(i+1, len(boxes))
    ]
    # We could use dists.sort(), but it would randomize the test result.
    dists = csort(dists)
    plane = Plane(self.bbox)
    plane.extend(boxes)
    while dists:
        (tag, gap, first, second) = dists.pop(0)
        if tag == 0 and isany(first, second):
            # Something sits between the two: push the pair to the back
            # with tag 1 so it is not deferred a second time.
            dists.append((1, gap, first, second))
            continue
        vertical_kinds = (LTTextBoxVertical, LTTextGroupTBRL)
        if isinstance(first, vertical_kinds) or isinstance(second, vertical_kinds):
            group_cls = LTTextGroupTBRL
        else:
            group_cls = LTTextGroupLRTB
        group = group_cls([first, second])
        plane.remove(first)
        plane.remove(second)
        # Drop every pending pair that involves an object just merged.
        dists = [entry for entry in dists
                 if entry[2] in plane and entry[3] in plane]
        dists.extend((0, dist(group, other), group, other) for other in plane)
        dists = csort(dists)
        plane.add(group)
    assert len(plane) == 1
    return list(plane)