def __init__(self):
    """Set up the base text container plus row/column bounds and tree links.

    All four row/column bounds start at -1 (presumably a "not assigned yet"
    marker -- confirm against the code that fills them in), the list of
    children starts empty, and ``father`` starts as ``None``.
    """
    LTTextContainer.__init__(self)
    # Chained assignment binds the targets left to right, so the attributes
    # are created in the order start_row, end_row, start_col, end_col.
    self.start_row = self.end_row = self.start_col = self.end_col = -1
    self.children = []
    self.father = None
def callback(container: LTTextContainer):
    """Tell whether a container's text includes every string in ``strings``.

    The text is stripped of surrounding whitespace and has its newlines
    removed before the substring checks. ``strings`` is taken from the
    enclosing scope.

    Raises:
        TypeError: if ``container`` has no ``get_text`` attribute.
    """
    if hasattr(container, 'get_text'):
        flattened = container.get_text().strip().replace('\n', '')
        return all(needle in flattened for needle in strings)
    raise TypeError
def group(characters, delta=5, use_h_axis=True):
    """Cluster characters into text containers by proximity.

    Each character joins the first existing group that is close enough to
    it; when no group qualifies, a new ``LTTextContainer`` is started with
    that character.

    Args:
        characters: iterable of layout objects to cluster.
        delta: exclusive distance threshold; a group is "close" when its
            distance to the character is strictly below this value.
        use_h_axis: when true, a group must be close both vertically and
            horizontally. When false, only the vertical distance is
            compared, which groups text into lines.

    Returns:
        The list of containers, in the order they were created.
    """

    def is_close(container, char):
        # The vertical distance is always checked first; the horizontal
        # one is only consulted when use_h_axis is enabled.
        if not (container.vdistance(char) < delta):
            return False
        if not use_h_axis:
            return True
        return container.hdistance(char) < delta

    clusters = []
    for char in characters:
        target = next((g for g in clusters if is_close(g, char)), None)
        if target is None:
            target = LTTextContainer()
            target.add(char)
            clusters.append(target)
        else:
            target.add(char)
    return clusters
def merge(lines, delta=5):
    """Fuse horizontally close elements within each line.

    Text elements can end up in separate groups when they sit on separate
    lines whose starts have different x-positions. For every line, each
    pair of elements whose horizontal distance is below ``delta`` is
    combined: the later element is appended into the earlier one and then
    dropped. The surviving elements are packed into a fresh
    ``LTTextContainer``.

    Args:
        lines: iterable of iterables of text elements.
        delta: exclusive horizontal-distance threshold for fusing a pair.

    Returns:
        A list with one new container per input line.
    """
    result = []
    for line in lines:
        parts = list(line)
        # Visit index pairs (a, b) with a < b in lexicographic order.
        for a, left in enumerate(parts):
            if left is None:
                # Already absorbed by an earlier element.
                continue
            for b in range(a + 1, len(parts)):
                right = parts[b]
                if right is None:
                    continue
                if left.hdistance(right) < delta:
                    left.extend(right)
                    parts[b] = None
        combined = LTTextContainer()
        combined.extend([part for part in parts if part is not None])
        result.append(combined)
    return result
def process_paragraph(self, paragraph: LTTextContainer, index: int, page_containers: List[LTTextContainer]):
    """Handle one paragraph.

    This default implementation adds the paragraph's text, stripped of
    surrounding whitespace, to ``self._result`` together with the index
    converted to a string. ``page_containers`` is accepted but not used
    here.
    """
    # Resolve the target method first, then build its arguments.
    add_paragraph = self._result.add_paragraph
    cleaned_text = paragraph.get_text().strip()
    paragraph_id = str(index)
    add_paragraph(cleaned_text, paragraph_id)
def check_text_is_date(paragraph: LTTextContainer) -> bool:
    """Report whether a paragraph's text consists of nothing but a date.

    After stripping surrounding whitespace, the text must be one or two
    digits, a space, a run of word characters, a space and four digits
    (for example ``3 March 2021``). The middle word is not validated as a
    month name.
    """
    stripped = paragraph.get_text().strip()
    date_match = re.match(r'\d\d? \w+ \d\d\d\d\s*$', stripped)
    if date_match is None:
        return False
    return True
def callback(element: LTTextContainer):
    """Tell whether an element's text differs from ``text``.

    The element's text is stripped of surrounding whitespace and has its
    newlines removed before the comparison. ``text`` is taken from the
    enclosing scope.

    Raises:
        TypeError: if ``element`` has no ``get_text`` attribute.
    """
    if not hasattr(element, 'get_text'):
        raise TypeError
    flattened = element.get_text().strip().replace('\n', '')
    return text != flattened
import os

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTAnno


def label_func():
    """Return the prefix that marks a text line of interest."""
    return '1. Определение'


# Scan text1.pdf and print every text line that starts with the label.
#
# labels.txt is still created/truncated exactly as before, but it is now
# opened in a `with` block so the handle is closed even if parsing fails.
# NOTE(review): nothing is ever written to labels.txt here -- confirm
# whether the matching lines were meant to be saved to it.
with open('labels.txt', 'w') as labels_txt:
    label_prefix = label_func()
    for page_layout in extract_pages("text1.pdf"):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                # Check the child itself: the previous code re-tested
                # `element` here, so children that are not text containers
                # were never filtered out before get_text was called.
                if not isinstance(text_line, LTTextContainer):
                    continue
                line_text = text_line.get_text()
                if line_text.startswith(label_prefix):
                    print(line_text)