def group_textboxes(items):
    """Merge consecutive layout items into text boxes based on proximity.

    The items are walked in order while a running group ``prev`` is kept.
    Each following item is either appended to ``prev`` (horizontal
    continuation), folded together with ``prev`` into a new
    ``LTTextBoxVertical`` (vertical stacking), or starts a new group, in
    which case ``prev`` is emitted.

    All thresholds are relative to ``get_size(prev)`` (presumably the
    glyph/font size of the group -- confirm against ``get_size``).

    Args:
        items: Indexable, sliceable sequence of layout objects exposing
            ``x0``, ``y0``, ``x1`` and ``y1``.

    Returns:
        A list of the resulting groups in encounter order. An empty input
        yields an empty list. Note that a group that never went through a
        loop iteration as ``prev`` (e.g. the last item when it starts a new
        group, or a single-item input) is returned as-is, without being
        wrapped in a text box.
    """
    # Guard: the original indexed items[0] unconditionally and crashed on
    # empty input.
    if not items:
        return []

    new_items = []
    prev = items[0]
    for item in items[1:]:
        if isinstance(prev, LTChar):
            # A bare character cannot absorb other items; wrap it in a box
            # that starts out with the character's own bounding box.
            box = LTTextBox()
            box.add(prev)
            box.set_bbox((prev.x0, prev.y0, prev.x1, prev.y1))
            prev = box

        # prev is not modified until a branch below is taken, so the
        # reference size can be computed once per iteration.
        size = get_size(prev)
        y_diff = prev.y0 - item.y1
        x_diff = item.x0 - prev.x1
        close_in_y = y_diff < size / 2

        if close_in_y and -size / 2 <= x_diff < size:
            # Horizontal continuation: item starts near the right edge of
            # prev. Compute the union bbox before mutating prev.
            xs = [item.x0, item.x1, prev.x0, prev.x1]
            ys = [item.y0, item.y1, prev.y0, prev.y1]
            prev.add(item)
            prev.set_bbox((min(xs), min(ys), max(xs), max(ys)))
        elif (close_in_y
              and (item.x0 - prev.x0) < size / 2
              and (item.x1 - prev.x1) > -size / 2):
            # Vertical stacking: left/right edges roughly line up, so the
            # children of prev plus item are moved into a vertical box.
            vert = LTTextBoxVertical()
            xs = [item.x0, item.x1, prev.x0, prev.x1]
            ys = [item.y0, item.y1, prev.y0, prev.y1]
            for child in prev:
                vert.add(child)
            vert.add(item)
            vert.set_bbox((min(xs), min(ys), max(xs), max(ys)))
            prev = vert
        else:
            # Too far away: close the current group and start a new one.
            new_items.append(prev)
            prev = item

    # Flush the group that was still open when the input ran out.
    new_items.append(prev)
    return new_items
def group_textlines(self, laparams, lines):
    """Group neighbouring text lines into boxes with a per-line margin.

    Patched class method that fixes empty line aggregation, and allows
    run-time line margin detection.

    For every line with non-blank text, the neighbours found with
    ``laparams.line_margin`` are used to derive a tighter margin: the
    smallest vertical gap to any neighbour, scaled by 1.05 and divided by
    the line height, capped at ``laparams.line_margin``. Neighbours are
    then looked up again with that margin, blank ones are dropped, and the
    rest (together with the lines of any box they already belong to) are
    placed in a new box.

    Args:
        laparams: Layout parameters; only ``line_margin`` is read here.
        lines: Text lines to group. They must be hashable, since they are
            used as dictionary keys.

    Yields:
        Each resulting non-empty box once, ordered by the first line in
        ``lines`` that belongs to it.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    box_of = {}

    for line in lines:
        candidates = line.find_neighbors(plane, laparams.line_margin)
        if not (line in candidates and line.get_text().strip()):
            continue

        # Correct margin to paragraph specific: take the smallest relative
        # gap to any other neighbour, never exceeding the configured margin.
        relative_gaps = (
            min(abs(other.y0 - line.y1), abs(other.y1 - line.y0))
            * 1.05 / line.height
            for other in candidates
            if other is not line
        )
        effective_margin = min([laparams.line_margin, *relative_gaps])

        candidates = line.find_neighbors(plane, effective_margin)
        if line not in candidates:
            continue

        members = []
        for other in candidates:
            if other.get_text().strip():
                members.append(other)
                if other in box_of:
                    # Absorb the lines of the box this neighbour was in.
                    members.extend(box_of.pop(other))

        box = (
            LTTextBoxHorizontal()
            if isinstance(line, LTTextLineHorizontal)
            else LTTextBoxVertical()
        )
        for member in uniq(members):
            box.add(member)
            box_of[member] = box

    emitted = set()
    for line in lines:
        if line in box_of:
            box = box_of[line]
            if box not in emitted:
                emitted.add(box)
                if not box.is_empty():
                    yield box
def group_textlines(self, laparams: LAParams, lines: List[LTTextContainer]) -> Generator:
    """Group text lines into boxes, classifying lines via the resource manager.

    Lines that are not ``LTTextLineHorizontalExtended`` go into an
    ``LTTextBoxVertical``. Extended horizontal lines go into an
    ``LTTextBoxHorizontal`` unless ``self.rsrcmgr`` is truthy, in which case
    the box is an instance of the class returned by
    ``line.maybe_classify(self.rsrcmgr)``. Classification also updates
    flags on the resource manager:

    * ``after_title`` is set when the class is ``LTTitle``;
    * ``after_abstract`` is set on the first ``LTSectionHeader``;
    * ``after_ref`` is set on a later ``LTSectionHeader`` whose text
      contains ``'references'`` (case-insensitive).

    Title and section-header boxes are seeded with the line alone; every
    other box is seeded with the line's neighbours, and the line is skipped
    if it is not among them.

    Args:
        laparams: Layout parameters; only ``line_margin`` is read here.
        lines: Text lines to group; they are used as dictionary keys.

    Yields:
        Each resulting non-empty box once, ordered by the first line in
        ``lines`` that belongs to it.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    box_of: Dict[LTText, LTTextBox] = {}

    for line in lines:
        if not isinstance(line, LTTextLineHorizontalExtended):
            box = LTTextBoxVertical()
        else:
            box = LTTextBoxHorizontal()
            manager = self.rsrcmgr
            if manager:
                klass = line.maybe_classify(manager)
                if klass == LTTitle:
                    manager.after_title = True
                elif klass == LTSectionHeader:
                    # The first header flips after_abstract; only later
                    # headers are checked for the references marker.
                    if not manager.after_abstract:
                        manager.after_abstract = True
                    elif 'references' in line.get_text().lower():
                        manager.after_ref = True
                box = klass()

        if isinstance(box, (LTTitle, LTSectionHeader)):
            neighbors = [line]
        else:
            neighbors = line.find_neighbors_with_rsrcmgr(
                plane, laparams.line_margin, self.rsrcmgr)
            if line not in neighbors:
                continue

        members = []
        for neighbor in neighbors:
            members.append(neighbor)
            if neighbor in box_of:
                # Absorb the lines of the box this neighbour was in.
                members.extend(box_of.pop(neighbor))

        for member in uniq(members):
            box.add(member)
            box_of[member] = box

    emitted: Set[LTTextBox] = set()
    for line in lines:
        if line in box_of:
            box = box_of[line]
            if box not in emitted:
                emitted.add(box)
                if not box.is_empty():
                    yield box