def group_textboxes(items):
    """Merge consecutive layout items into text boxes based on proximity.

    The items are walked in order while a running group ``prev`` is kept.
    Each following item is compared against ``prev`` using ``get_size(prev)``
    as the distance scale:

    * if the item sits close below/level with ``prev`` and starts within one
      size to the right of it (or overlaps by at most half a size), it is
      added to ``prev`` and the bounding box is grown to cover both;
    * otherwise, if it is vertically close and roughly left/right aligned
      with ``prev``, the children of ``prev`` plus the item are moved into a
      new ``LTTextBoxVertical``;
    * otherwise ``prev`` is emitted and the item starts a new group.

    Args:
        items: Sequence of layout objects exposing ``x0``, ``y0``, ``x1`` and
            ``y1``. Must support indexing and slicing.

    Returns:
        A list of the resulting groups. An empty input yields an empty list.
    """
    # Guard: indexing items[0] on an empty sequence would raise IndexError.
    if not items:
        return []

    new_items = []
    prev = items[0]
    for item in items[1:]:
        # A bare character cannot collect siblings, so wrap it in a text box
        # carrying the same bounding box before comparing it to the next item.
        # NOTE(review): the wrap only happens when a following item exists, so
        # a trailing (or sole) LTChar is returned unwrapped -- confirm callers
        # expect that.
        if isinstance(prev, LTChar):
            box = LTTextBox()
            box.add(prev)
            box.set_bbox((prev.x0, prev.y0, prev.x1, prev.y1))
            prev = box

        # Compute the distance scale once per iteration instead of per test.
        size = get_size(prev)
        half = size / 2
        y_diff = prev.y0 - item.y1
        x_diff = item.x0 - prev.x1
        near_vertically = y_diff < half

        if near_vertically and -half <= x_diff < size:
            # Horizontal continuation: extend the current group in place.
            xs = [item.x0, item.x1, prev.x0, prev.x1]
            ys = [item.y0, item.y1, prev.y0, prev.y1]
            prev.add(item)
            prev.set_bbox((min(xs), min(ys), max(xs), max(ys)))
        elif (near_vertically
              and (item.x0 - prev.x0) < half
              and (item.x1 - prev.x1) > -half):
            # Stacked continuation: rebuild the group as a vertical box.
            vert = LTTextBoxVertical()
            xs = [item.x0, item.x1, prev.x0, prev.x1]
            ys = [item.y0, item.y1, prev.y0, prev.y1]
            for child in prev:
                vert.add(child)
            vert.add(item)
            vert.set_bbox((min(xs), min(ys), max(xs), max(ys)))
            prev = vert
        else:
            # Not adjacent: close the current group and start a new one.
            new_items.append(prev)
            prev = item

    # The group still being built when the input ends must be emitted too.
    new_items.append(prev)
    return new_items
示例#2
0
def group_textlines(self, laparams, lines):
    """Group neighboring text lines into text boxes.

    Patched class method that fixes empty line aggregation and allows
    run-time line margin detection: lines whose text is only whitespace are
    never used as seeds or members, and the margin used to look up neighbors
    is tightened per line to the smallest gap observed among its neighbors.

    Args:
        laparams: Layout parameters; only ``line_margin`` is read here.
        lines: The text lines to group. Each must provide ``find_neighbors``,
            ``get_text``, ``height``, ``y0`` and ``y1``.

    Yields:
        Each distinct non-empty text box, in the order its first line appears
        in ``lines``.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    boxes = {}
    for line in lines:
        neighbors = line.find_neighbors(plane, laparams.line_margin)
        if line not in neighbors or not line.get_text().strip():
            continue

        # Correct margin to paragraph specific
        true_margin = laparams.line_margin
        # The gap is expressed relative to the line height; a zero-height
        # line would divide by zero, so keep the configured margin for it.
        if line.height != 0:
            for obj1 in neighbors:
                if obj1 is line:
                    continue
                margin = min(abs(obj1.y0 - line.y1), abs(obj1.y1 - line.y0))
                # 5% slack so the closest neighbor itself stays inside.
                margin = margin * 1.05 / line.height
                if margin < true_margin:
                    true_margin = margin

        # Look up neighbors again with the tightened margin.
        neighbors = line.find_neighbors(plane, true_margin)
        if line not in neighbors:
            continue

        members = []
        for obj1 in neighbors:
            if not obj1.get_text().strip():
                continue
            members.append(obj1)
            # Absorb a box that an earlier line already built around obj1.
            if obj1 in boxes:
                members.extend(boxes.pop(obj1))
        if isinstance(line, LTTextLineHorizontal):
            box = LTTextBoxHorizontal()
        else:
            box = LTTextBoxVertical()
        for obj in uniq(members):
            box.add(obj)
            boxes[obj] = box

    # Several lines map to the same box; emit each box only once.
    done = set()
    for line in lines:
        if line not in boxes:
            continue
        box = boxes[line]
        if box in done:
            continue
        done.add(box)
        if not box.is_empty():
            yield box
示例#3
0
 def group_textlines(self, laparams: LAParams,
                     lines: List[LTTextContainer]) -> Generator:
     """Group text lines into boxes, classifying lines when possible.

     For a ``LTTextLineHorizontalExtended`` line and a truthy
     ``self.rsrcmgr``, the box class comes from ``line.maybe_classify`` and
     the ``after_title`` / ``after_abstract`` / ``after_ref`` flags on the
     resource manager are updated from that classification. Title and
     section-header boxes are seeded with their own line only; every other
     box is seeded with the neighbors found within ``laparams.line_margin``.

     Yields each distinct non-empty box once, ordered by the first of its
     lines in ``lines``.
     """
     plane = Plane(self.bbox)
     plane.extend(lines)
     box_by_line: Dict[LTText, LTTextBox] = {}

     for line in lines:
         # Choose the container type for this line.
         if not isinstance(line, LTTextLineHorizontalExtended):
             box = LTTextBoxVertical()
         else:
             box = LTTextBoxHorizontal()
             if self.rsrcmgr:
                 klass = line.maybe_classify(self.rsrcmgr)
                 # Record document-structure milestones on the manager.
                 if klass == LTTitle:
                     self.rsrcmgr.after_title = True
                 elif not self.rsrcmgr.after_abstract:
                     if klass == LTSectionHeader:
                         self.rsrcmgr.after_abstract = True
                 elif (klass == LTSectionHeader
                       and 'references' in line.get_text().lower()):
                     self.rsrcmgr.after_ref = True
                 box = klass()

         # Titles and section headers do not pull in their neighbors.
         if isinstance(box, (LTTitle, LTSectionHeader)):
             neighbors = [line]
         else:
             neighbors = line.find_neighbors_with_rsrcmgr(
                 plane, laparams.line_margin, self.rsrcmgr)
             if line not in neighbors:
                 continue

         # Collect neighbors plus the contents of any box already built
         # around one of them, removing that older box from the mapping.
         members = []
         for neighbor in neighbors:
             members.append(neighbor)
             absorbed = box_by_line.pop(neighbor, None)
             if absorbed is not None:
                 members.extend(absorbed)

         for member in uniq(members):
             box.add(member)
             box_by_line[member] = box

     # Many lines share one box; emit each box a single time.
     emitted: Set[LTTextBox] = set()
     for line in lines:
         box = box_by_line.get(line)
         if box is None or box in emitted:
             continue
         emitted.add(box)
         if box.is_empty():
             continue
         yield box