def __handle_lt_figure(self, element: LTFigure):
    """
    Recover text that was wrongly detected as an LTFigure.

    Sometimes pieces of text are reported as LTFigure, e.g. in slide-sets
    with border lines. As a workaround, the LTChar objects found directly
    inside the figure are regrouped line by line: a new line starts whenever
    a character's y0 differs by more than 0.05 from the previous reference y0.
    Every line is placed in its own LTTextBoxHorizontal, which is analyzed
    with self.la_params before it is yielded.

    @param element: figure whose direct children are scanned for LTChar objects
    @return: generator of LTTextBoxHorizontal, one per detected text line
    """
    first = next(iter(element), None)
    if first is None:
        # Empty figure: nothing to recover (indexing the first child would
        # raise IndexError here).
        return

    line = LTTextLineHorizontal(0)
    wrapper = LTTextBoxHorizontal()
    wrapper.add(line)
    line_has_chars = False

    # Reference baseline; seeded from the first child so that the first
    # character on the same baseline does not open a new line.
    y_prior = first.y0

    for letter in element:
        if not isinstance(letter, LTChar):
            continue
        if abs(letter.y0 - y_prior) > 0.05:
            # new line, yield wrapper
            wrapper.analyze(self.la_params)
            yield wrapper

            wrapper = LTTextBoxHorizontal()
            line = LTTextLineHorizontal(0)
            wrapper.add(line)
            line_has_chars = False
        # NOTE(review): the reference y0 is refreshed for every character
        # (comparison against the previous character) — confirm this matches
        # the intended nesting of the original statement.
        y_prior = letter.y0
        line.add(letter)
        line_has_chars = True

    if line_has_chars:
        # Flush the last collected line; without this the final line of text
        # inside the figure is silently dropped.
        wrapper.analyze(self.la_params)
        yield wrapper
def create_container(self, text):
    """Build an LTTextBoxHorizontal holding one text line made from ``text``.

    Each item of ``text`` is converted with ``self.create_char`` and appended,
    in order, to a single LTTextLineHorizontal (constructed with ``0``); that
    line is then added to the box.

    @param text: iterable of items accepted by ``self.create_char``
    @return: the populated LTTextBoxHorizontal
    """
    container = LTTextBoxHorizontal()
    text_line = LTTextLineHorizontal(0)
    # map() is lazy, so conversion and insertion stay interleaved per item.
    for char_obj in map(self.create_char, text):
        text_line.add(char_obj)
    container.add(text_line)
    return container
def test_bad_investor_info(self):
    """parse_investor_info([], 0, 0) must raise CASParseError with the
    "Unable to parse investor data" message."""
    from casparser.extract.pdfminer import parse_investor_info

    # Smoke-check on an empty text box. Kept outside the ``raises`` block so
    # that only the call under test can satisfy the expectation.
    box = LTTextBoxHorizontal()
    box.get_text()

    with pytest.raises(CASParseError) as exc_info:
        parse_investor_info([], 0, 0)

    # Inspect the raised exception itself; str() of the ExceptionInfo wrapper
    # is a pytest-version-dependent representation, not the error message.
    assert "Unable to parse investor data" in str(exc_info.value)
def split_boxes_by_style(
    self, container: LTTextContainer
) -> Generator[LTTextContainer, LTTextContainer, None]:
    """
    Split a paragraph into several boxes wherever the font size changes.

    pdfminer's paragraphs are sometimes too broad and contain lines that
    should be split into header and content. The size of a line is the
    largest ``size`` among the LTChar objects within its first 10 objects.
    A new box is started when the sizes of two consecutive lines differ and
    the larger one exceeds the smaller one by more than a factor of 1.15.
    Every yielded box carries the ``page`` attribute of the original
    container.

    @param container: the extracted original paragraph
    @return: generator of LTTextBoxHorizontal, in reading order; at least one
             box is always yielded, even for an empty container
    """
    line: LTTextLineHorizontal
    wrapper = LTTextBoxHorizontal()
    page = container.page
    wrapper.page = page
    prior = None  # size of the previous line that contained characters

    for line in container:
        size = max(
            (
                obj.size
                for obj in itertools.islice(line, 10)
                if isinstance(obj, LTChar)
            ),
            default=None,
        )
        if size is None:
            # No characters among the sampled objects: there is no style to
            # compare, so keep the line with the current paragraph instead of
            # failing on max() of an empty sequence.
            wrapper.add(line)
            continue

        if prior is not None and prior != size:
            larger, smaller = max(prior, size), min(prior, size)
            # A non-positive size cannot form a ratio; treat it as a break
            # rather than dividing by zero.
            if smaller <= 0 or larger / smaller > 1.15:
                # break paragraph
                yield wrapper
                wrapper = LTTextBoxHorizontal()
                # Boxes created after a break need the page reference too.
                wrapper.page = page

        wrapper.add(line)
        prior = size

    yield wrapper
def group_textlines(self, laparams, lines):
    """Group text lines into text boxes.

    Patched variant that ignores lines whose text is blank and narrows the
    line margin per line at run time: for every non-blank line the margin
    passed to ``find_neighbors`` is reduced to the smallest scaled vertical
    gap (gap * 1.05 / line height) to any of its initial neighbors, never
    exceeding ``laparams.line_margin``.

    Yields each non-empty box once, in the order its first line appears in
    ``lines``.
    """
    plane = Plane(self.bbox)
    plane.extend(lines)
    line_to_box = {}

    for current in lines:
        candidates = current.find_neighbors(plane, laparams.line_margin)
        if current not in candidates:
            continue
        if not current.get_text().strip():
            continue

        # Correct margin to paragraph specific
        scaled_gaps = [
            min(abs(other.y0 - current.y1), abs(other.y1 - current.y0))
            * 1.05
            / current.height
            for other in candidates
            if other is not current
        ]
        # The configured margin goes first so it wins ties, as a strict
        # "smaller than" update would.
        tightest = min([laparams.line_margin] + scaled_gaps)

        candidates = current.find_neighbors(plane, tightest)
        if current not in candidates:
            continue

        grouped = []
        for other in candidates:
            if other.get_text().strip():
                grouped.append(other)
                if other in line_to_box:
                    # Absorb the lines of the box this neighbor already
                    # belonged to.
                    grouped.extend(line_to_box.pop(other))

        box_cls = (
            LTTextBoxHorizontal
            if isinstance(current, LTTextLineHorizontal)
            else LTTextBoxVertical
        )
        box = box_cls()
        for member in uniq(grouped):
            box.add(member)
            line_to_box[member] = box

    emitted = set()
    for current in lines:
        box = line_to_box.get(current)
        if box is None or box in emitted:
            continue
        emitted.add(box)
        if not box.is_empty():
            yield box
def group_textlines(self, laparams: LAParams, lines: List[LTTextContainer]) -> Generator:
    """Group text lines into text boxes, optionally classifying them.

    Each line gets a box whose type depends on the line: instances of
    LTTextLineHorizontalExtended start with an LTTextBoxHorizontal and, when
    ``self.rsrcmgr`` is truthy, are re-typed to whatever class
    ``line.maybe_classify(self.rsrcmgr)`` returns; all other lines get an
    LTTextBoxVertical. While classifying, the flags ``after_title``,
    ``after_abstract`` and ``after_ref`` on ``self.rsrcmgr`` are set.

    Boxes that are LTTitle or LTSectionHeader instances contain only their
    own line; every other box collects the neighbors reported by
    ``line.find_neighbors_with_rsrcmgr`` and merges any boxes those neighbors
    already belonged to.

    Yields each non-empty box once, in the order its first line appears in
    ``lines``.
    """
    # Spatial index over all lines, bounded by this container's bbox.
    plane = Plane(self.bbox)
    plane.extend(lines)
    # Maps every grouped line to the box that currently owns it.
    boxes: Dict[LTText, LTTextBox] = {}
    for line in lines:
        if isinstance(line, LTTextLineHorizontalExtended):
            box = LTTextBoxHorizontal()
            if self.rsrcmgr:
                klass = line.maybe_classify(self.rsrcmgr)
                if klass == LTTitle:
                    self.rsrcmgr.after_title = True
                elif not self.rsrcmgr.after_abstract and klass == LTSectionHeader:
                    # First section header seen while after_abstract is unset.
                    self.rsrcmgr.after_abstract = True
                elif klass == LTSectionHeader and 'references' in line.get_text().lower():
                    # Only reachable once after_abstract is already truthy,
                    # because the branch above catches section headers first.
                    self.rsrcmgr.after_ref = True
                # NOTE(review): assumes maybe_classify always returns a
                # callable box class (never None) — confirm.
                box = klass()
        else:
            box = LTTextBoxVertical()
        if not isinstance(box, LTTitle) and not isinstance(box, LTSectionHeader):
            neighbors = line.find_neighbors_with_rsrcmgr(plane, laparams.line_margin, self.rsrcmgr)
            if line not in neighbors:
                continue
        else:
            # Titles and section headers are never merged with other lines.
            neighbors = [line]
        members = []
        for obj1 in neighbors:
            members.append(obj1)
            if obj1 in boxes:
                # Absorb the lines of the box this neighbor already belonged to.
                members.extend(boxes.pop(obj1))
        for obj in uniq(members):
            box.add(obj)
            boxes[obj] = box
    # Boxes already yielded; several lines can map to the same box.
    done: Set[LTTextBox] = set()
    for line in lines:
        if line not in boxes:
            continue
        box = boxes[line]
        if box in done:
            continue
        done.add(box)
        if not box.is_empty():
            yield box
    return