def pdfminer_parse(file_path, file_hash):
    """Extract the full text and document metadata from a PDF.

    :param file_path: path to the PDF file on disk.
    :param file_hash: identifier used only in the error message.
    :return: dict with 'text' (extracted text) and 'meta' (flattened
        document info), or None if the file is not a valid PDF.
    """
    output_string = StringIO()
    try:
        with open(file_path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            laparams = LAParams()
            rsrcmgr = PDFResourceManager(caching=True)
            txt_device = TextConverter(rsrcmgr, output_string, codec='utf-8',
                                       laparams=laparams)
            aggr_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            txt_interpreter = PDFPageInterpreter(rsrcmgr, txt_device)
            aggr_interpreter = PDFPageInterpreter(rsrcmgr, aggr_device)
            # BUG FIX: removed a stray debug call, extract_pages("test.pdf")
            # (plus its mid-function import), which re-parsed an unrelated
            # hard-coded file on every invocation and crashed when absent.
            for page in PDFPage.get_pages(fp, caching=True):
                txt_interpreter.process_page(page)
                aggr_interpreter.process_page(page)
                layout = aggr_device.get_result()
                # TODO: walk `layout` for LTImage / LTFigure elements.
    except pdfparser.PDFSyntaxError:
        print(f"{file_hash} is not a PDF")
        return None
    return {
        'text': output_string.getvalue(),
        'meta': flatten_and_unicode(doc.info)
    }
def test_get_common_font_from_pages(get_test_decision_standard_pdf_path, get_test_decision_v1_pdf_path):
    """Each test decision PDF reports its expected dominant font."""
    cases = (
        (get_test_decision_standard_pdf_path, 'TimesNewRomanPSMT'),
        (get_test_decision_v1_pdf_path, 'Arial'),
    )
    for pdf_path, expected_font in cases:
        pages = list(extract_pages(pdf_path.open('rb')))
        assert common.get_common_font_from_pages(pages) == expected_font
def test_line_margin(self):
    """Lines are spaced 0.2 of their height apart: a line_margin just
    below that splits them into three textboxes, just above merges them."""

    def containers_for(margin):
        pages = list(
            extract_pages(self._get_test_file_path(),
                          laparams=LAParams(line_margin=margin)))
        self.assertEqual(len(pages), 1)
        return [el for el in pages[0] if isinstance(el, LTTextContainer)]

    # 0.19 < 0.2: each line becomes its own textbox.
    boxes = containers_for(0.19)
    self.assertEqual(len(boxes), 3)
    self.assertEqual(boxes[0].get_text(), "Text1\n")
    self.assertEqual(boxes[1].get_text(), "Text2\n")
    self.assertEqual(boxes[2].get_text(), "Text3\n")

    # 0.21 > 0.2: all three lines merge into a single textbox.
    boxes = containers_for(0.21)
    self.assertEqual(len(boxes), 1)
    self.assertEqual(boxes[0].get_text(), "Text1\nText2\nText3\n")
def test_extract_text_containers(get_test_decision_standard_pdf_path):
    """Pages at index 1 and 2 yield the expected container counts."""
    for page_number, expected_count in ((1, 31), (2, 33)):
        page = extract_pages(get_test_decision_standard_pdf_path.open('rb'),
                             page_numbers=[page_number])
        containers = common.extract_text_containers(page)
        assert len(containers) == expected_count
def load(
        cls, fn, page_numbers=None, preload=False, reader_kwargs=None,
        **kwargs
):
    """Load PDF file.

    Parameters
    ----------
    fn : str
        Filename of PDF.
    page_numbers : list
        Pages to be loaded. (0-indexed)
    preload : bool
        Preload content from all pages.
    reader_kwargs : dict
        Other kwargs for `pdfminer.high_level.extract_pages()`.
        Any 'page_numbers' entry is discarded in favour of the explicit
        `page_numbers` argument.
    """
    # BUG FIX: copy before popping so the caller's dict is not mutated.
    reader_kwargs = dict(reader_kwargs) if reader_kwargs else {}
    reader_kwargs.pop('page_numbers', None)
    pages = pmhl.extract_pages(
        fn, page_numbers=page_numbers, **reader_kwargs
    )
    if preload:
        pages = list(pages)
    # NOTE(review): **kwargs is accepted but not forwarded anywhere —
    # confirm whether it should be passed to cls(...).
    return cls(pages)
def element_generator(
        file_path: str,
        page_numbers=None) -> Generator[LTTextContainer, None, None]:
    """Yield a flat sequence of text paragraphs from a document.

    Each yielded element is tagged with ``element.meta = {"page": n}``
    (0-indexed page number).

    :param file_path: path of the PDF to read.
    :param page_numbers: optional list of 0-indexed pages to restrict to.
    """
    # boxes_flow disabled: downstream style-based hierarchy detection
    # relies on a purely flat list of paragraphs.
    layout_params = LAParams(boxes_flow=None, detect_vertical=False)
    page_index = 0
    for page_layout in extract_pages(file_path, laparams=layout_params,
                                     page_numbers=page_numbers):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                element.meta = {"page": page_index}
                yield element
        page_index += 1
def parse():
    """Index every uploaded PDF, page by page, into an inverted index."""
    # Build {ordinal: filepath} for every file in the upload folder.
    files = os.listdir(bp.config['UPLOAD_FOLDER'])
    filepaths = [bp.config['UPLOAD_FOLDER'] + '/' + x for x in files]
    d_files = {k: v for k, v in enumerate(filepaths)}
    print(d_files)
    VocabularyIndex = InvertedIndex()
    for k, v in d_files.items():
        # iterate through document pages
        for page_layout in extract_pages(v):
            # compile all text on page
            page_text = ""
            for (count, element) in enumerate(page_layout, 1):
                if isinstance(element, LTTextContainer):
                    page_text += element.get_text()
            #print(page_text)
            # add page to inverted index, keyed by pdfminer's page id
            page_no = int(page_layout.pageid)
            VocabularyIndex.index_document(page_no, page_text)
            print(f"processed {page_no}")
    data = VocabularyIndex.get_index()
    print(data)
    '''
def findMaxFontSize(pdf):
    """Return the largest font size found on the first page of ``pdf``.

    Only the first character of the first line of each of the first LIMIT
    layout elements is inspected (the first character's size is taken as
    representative of the whole element).

    :param pdf: path of the PDF file to process.
    :return: the maximum character size seen, or 0 if none was found.
    """
    maxFontSize = 0
    LIMIT = 4  # inspect at most this many layout elements
    # BUG FIX: page_numbers must be a container; the former value 0 was
    # falsy, so pdfminer iterated *every* page instead of just the first.
    for page_layout in extract_pages(pdf, page_numbers=[0]):
        for count, element in enumerate(page_layout, 1):
            if isinstance(element, LTTextContainer):
                # Guarded next(): an empty container no longer raises
                # StopIteration out of the generator machinery.
                first_line = next(iter(element), None)
                first_ch = (next(iter(first_line), None)
                            if first_line is not None else None)
                if isinstance(first_ch, LTChar) and first_ch.size > maxFontSize:
                    maxFontSize = first_ch.size
            # BUG FIX: the LIMIT check is now reached for every element;
            # previously non-text elements `continue`d past it.
            if count == LIMIT:
                break
        break  # first page only
    return maxFontSize
def get_layout(file_path):
    """Append the string form of every layout element to the debug dump file.

    :param file_path: PDF file to analyse.
    """
    # PERF FIX: open the dump file once, instead of re-opening it in append
    # mode for every single element. Output is byte-identical.
    with open('../data/xml_files/adho_conferences/testlayout.txt', 'a',
              encoding="utf-8") as fd:
        for page_layout in extract_pages(file_path):
            for element in page_layout:
                fd.write(str(element))
def pdf_to_text(fp, parse_page=None):
    '''
    Convert a PDF into a single text string.

    Input parameters
    ------
    fp: file path of pdf
    parse_page: page numbers (starting at 0) to convert to text.
        All pages if None or empty.

    Output returns
    ------
    string_output: String output of contents in pdf
    '''
    # BUG FIX: default changed from a mutable list literal to None.
    if parse_page is None:
        parse_page = []
    output_string = StringIO()
    with open(fp, 'rb') as in_file:
        # PERF FIX: removed num_pages = len(list(extract_pages(in_file))),
        # which parsed the whole document a second time for an unused value.
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for i, page in enumerate(PDFPage.create_pages(doc)):
            # Skip pages outside the requested subset (empty = all pages).
            if len(parse_page) > 0 and i not in parse_page:
                continue
            interpreter.process_page(page)
        string_output = output_string.getvalue()
    # Close open handles
    device.close()
    output_string.close()
    return string_output
def get_aadhar_details(path, password):
    """Extract (name, mobile number, aadhar number) from an e-Aadhaar PDF.

    :param path: path to the password-protected Aadhaar PDF.
    :param password: PDF password.
    :return: tuple (name, mobile, aadhar_number), or False on any failure.
    """
    # 1-indexed positions of the horizontal text boxes holding the address
    # block and the aadhar-number block in the standard e-Aadhaar layout.
    ADDRESS_BOX = 2
    NUMBER_BOX = 24
    try:
        addr = aadhar_no = None
        box_index = 0
        for page_layout in extract_pages(path, password):
            for element in page_layout:
                if isinstance(element, LTTextBoxHorizontal):
                    box_index += 1
                    if box_index == ADDRESS_BOX:
                        addr = element.get_text()
                    elif box_index == NUMBER_BOX:
                        aadhar_no = element.get_text()
        # BUG FIX: previously a missing box left addr/aadhar_no unbound and
        # the resulting NameError was swallowed by a bare except.
        if addr is None or aadhar_no is None:
            return False
        addr_list = addr.splitlines()
        num_list = aadhar_no.splitlines()
        # addr_list[2] -> name, addr_list[-1] -> mobile number.
        # The aadhar number is printed with spaces; remove them.
        aadhar = "".join(num_list[0].split(' '))
        return (addr_list[2], addr_list[-1], aadhar)
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate; the False-on-failure contract is preserved.
        return False
def identify_features(self, pdf_path, doc2vec):
    """Rank this PDF's paragraphs by similarity to known feature examples.

    :param pdf_path: path of the PDF to analyse.
    :param doc2vec: wrapper exposing ``transform`` plus a gensim-style
        ``model.wv.cosine_similarities`` — TODO confirm exact interface.
    :return: (paragraphs, features_similarities) where the second item maps
        each feature name to [paragraph_index, similarity] pairs sorted by
        ascending similarity.
    """
    # we download text examples representing features
    features = pickle.load(open("features.p", "rb"))
    # we transform all the features to doc2vec embedings
    f_vecs = doc2vec.transform([v for k, v in features.items()])
    paragraphs = []
    # we go through the pages of the pdf_document
    for page_layout in extract_pages(pdf_path):
        # through all the elements in the page_layout
        for i, element in enumerate(page_layout):
            # only keep text containers of a minimum length that start
            # with an ASCII letter
            if isinstance(element, LTTextContainer) and len(element.get_text()) > 20 and re.match('[a-zA-Z]+', element.get_text()[0]) != None:
                paragraphs.append(element.get_text())
    # we transform all paragraphs in doc2vec embeddings
    paragraphs_d2v = doc2vec.transform(paragraphs)
    # ordered dict: the iteration order is what links each feature to its
    # row in the similarity results below
    features_similarities = OrderedDict({k: [] for k in features.keys()})
    for i, p in enumerate(paragraphs_d2v):
        # similarity of one paragraph against all the feature examples
        sims = doc2vec.model.wv.cosine_similarities(p.toarray().transpose(), f_vecs.toarray())
        # NOTE(review): sims[ix][ix] takes the diagonal — confirm this is
        # the intended paragraph-to-feature pairing and not sims[0][ix].
        for ix, k in enumerate(features_similarities.keys()):
            features_similarities[k].append([i, sims[ix][ix]])
    # sorting the similarities (ascending)
    for k in features_similarities.keys():
        features_similarities[k] = sorted(features_similarities[k], key=lambda x: x[1])
    return paragraphs, features_similarities
def read(
        self, override_la_params=None, override_page_numbers=None
) -> Generator[LTTextContainer, Any, None]:
    """Yield style-split text containers (and figure contents) per page.

    Every element is tagged with a ``page`` attribute (0-indexed) before
    being dispatched.

    :param override_la_params: LAParams to use instead of ``self.la_params``.
    :param override_page_numbers: pages to read instead of
        ``self.page_numbers``.
    """
    # boxes_flow disabled upstream: style-based hierarchy detection works
    # on a purely flat list of paragraphs.
    la_params = override_la_params if override_la_params else self.la_params
    page_numbers = (override_page_numbers if override_page_numbers
                    else self.page_numbers)
    page_index = 0
    for page_layout in extract_pages(self.uri, laparams=la_params,
                                     page_numbers=page_numbers):
        for element in page_layout:
            # Record origin page on every element, text or not.
            element.page = page_index
            if isinstance(element, LTTextContainer):
                yield from self.split_boxes_by_style(element)
            elif isinstance(element, LTFigure):
                yield from self.__handle_lt_figure(element)
        page_index += 1
def layout():
    """Print the UTF-8 encoded text of every text container in FILE_PATH."""
    writeText()
    # Negative boxes_flow biases the layout analysis toward horizontal order.
    params = LAParams(boxes_flow=-0.5)
    for page_layout in extract_pages(FILE_PATH, laparams=params):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                print(element.get_text().encode('utf8'))
def mock_download(self):
    """Parse the locally stored sections PDF instead of downloading it."""
    pdf_path = join(DOWNLOADING_PATH, f'{product}_sections_{language}.pdf')
    layout_params = LAParams(boxes_flow=BOXES_FLOW, char_margin=CHAR_MARGIN)
    return list(extract_pages(pdf_path, laparams=layout_params))
def _extract_pages_and_text_containers(self, pdf):
    """Cache all pages except the first, plus their non-blank containers."""
    self._pages = list(
        extract_pages(pdf, laparams=self.data.get('laparams', None)))[1:]
    for page in self._pages:
        containers = []
        for element in page:
            if not isinstance(element, LTTextContainer):
                continue
            text = element.get_text()
            # Drop empty and whitespace-only containers.
            if text != '' and not re.search(r'^\s+$', text):
                containers.append(element)
        self._text_containers.append(containers)
def extractAuthor(pdf, elem, line):
    """Extract the author string from the first page of ``pdf``.

    (Docstring translated from the original Chinese.)

    :param pdf: path of the PDF to process.
    :param elem: index of the text element that contains the authors.
    :param line: offset of the first author line inside that element
        (0 reads the whole element).
    :return: author string encoded as GBK bytes.
    """

    def collect(text_line):
        """Collect the characters of one line whose size matches the
        line's leading character (drops affiliation superscripts)."""
        size = next(iter(text_line)).size
        chars = []
        for character in text_line:
            ch = character.get_text()
            # Drop special superscript markers.
            if ch == ' ' or ch == '†' or ch == '*':
                chars.append(' ')
            elif not isinstance(character, LTChar):
                continue
            elif str(character.size) == str(size):
                chars.append(ch)
        return ''.join(chars)

    author_str = ''
    # BUG FIX: page_numbers must be a container; the former value 0 was
    # falsy, making pdfminer parse every page instead of only the first.
    for page_layout in extract_pages(pdf, page_numbers=[0]):
        elem_count = 0
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            if elem_count != elem:
                elem_count += 1
                continue
            # Read from line `line` onwards; line == 0 reads every line.
            # (Deduplicates the two near-identical branches of the original.)
            for line_count, text_line in enumerate(element):
                if line_count < line:
                    continue
                author_str += collect(text_line)
            break  # end for element
        break  # end for page_layout
    author_str = author_str.replace(u'\xa0', u'').replace('&', ',').replace('and ', ',')
    return author_str.encode('gbk', 'ignore')
def _extract_pages_and_text_containers(self, pdf):
    """Cache every page and its containers with visible text."""
    self._pages = list(
        extract_pages(pdf, laparams=self.data.get('laparams', None)))
    for page in self._pages:
        # Keep text containers whose text is not empty/whitespace-only
        # (the single-element all([...]) of the original, unwrapped).
        kept = [
            element for element in page
            if isinstance(element, LTTextContainer)
            and element.get_text().strip() != ''
        ]
        self._text_containers.append(kept)
def extractpages(i):
    """Append the text of every text container in document ``i`` to the
    module-level accumulator ``c`` and return it."""
    for layout in extract_pages(i):
        for element in layout:
            if isinstance(element, LTTextContainer):
                c.append(element.get_text())
    return c
def _extract_pages_and_text_containers(self, pdf):
    """Method called to prepare the pages and containers from pdf for
    processing during pre-process."""
    self._pages = list(
        extract_pages(pdf, laparams=self.data.get('laparams', None)))
    for page in self._pages:
        containers = []
        for element in page:
            if not isinstance(element, LTTextContainer):
                continue
            text = element.get_text()
            # Keep only containers with visible (non-whitespace) text.
            if text != '' and not re.search(r'^\s+$', text):
                containers.append(element)
        self._text_containers.append(containers)
def download(self): """Download EPAR pdf.""" path = urlretrieve(self.download_url_)[0] pages = list( extract_pages( path, laparams=LAParams(boxes_flow=self.BOXES_FLOW_, char_margin=self.CHAR_MARGIN_), )) return pages
def page():
    """Print text, font name and size for every character in global ``f``.

    BUG FIX: extract_pages() yields LTPage objects, which are never
    LTTextContainer instances, so the original isinstance test on the page
    itself never matched and nothing was ever printed. The containers live
    *inside* each page, so an inner element loop is required.
    """
    for page_layout in extract_pages(f):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                print(element.get_text())
                for text_line in element:
                    for character in text_line:
                        if isinstance(character, LTChar):
                            print(character.fontname)
                            print(character.size)
def check_decision(cls, item: Optional[PDPCDecisionItem] = None, options: Optional[Options] = None) -> bool:
    """Return True when the first page declares a summary decision."""
    with pdpc_decisions.classes.PDFFile(item, options) as pdf:
        first_page = extract_pages(pdf, page_numbers=[0])
        # any() replaces the explicit search loop of the original.
        return any(
            container.get_text().strip() == 'SUMMARY OF THE DECISION'
            for container in common.extract_text_containers(first_page))
def load(
    pdf_file: IO,
    pdf_file_path: Optional[str] = None,
    la_params: Optional[Dict] = None,
    **kwargs,
) -> PDFDocument:
    """
    Loads the pdf file into a PDFDocument.

    Args:
        pdf_file (io): The PDF file.
        la_params (dict): The layout parameters passed to PDF Miner for
            analysis. See the PDFMiner documentation here:
            https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
            Note that py_pdf_parser will re-order the elements it receives
            from PDFMiner so options relating to element ordering will have
            no effect.
        pdf_file_path (str, optional): Passed to `PDFDocument`. See the
            documentation for `PDFDocument`.
        kwargs: Passed to `PDFDocument`. See the documentation for
            `PDFDocument`.

    Returns:
        PDFDocument: A PDFDocument with the file loaded.
    """
    merged_la_params = {**DEFAULT_LA_PARAMS, **(la_params or {})}
    pages: Dict[int, Page] = {}
    for page in extract_pages(pdf_file, laparams=LAParams(**merged_la_params)):
        elements = [el for el in page if isinstance(el, LTTextBox)]
        # With all_texts=True, pdfminer also reports text found inside figures.
        if merged_la_params.get("all_texts"):
            for figure in (el for el in page if isinstance(el, LTFigure)):
                elements.extend(
                    el for el in figure if isinstance(el, LTTextBox))
        if not elements:
            logger.warning(
                f"No elements detected on page {page.pageid}, skipping this page."
            )
            continue
        pages[page.pageid] = Page(
            width=page.width, height=page.height, elements=elements)
    return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)
def from_pdf_to_txt(read_file):
    """Concatenate the text of all horizontal text boxes, stripping out
    colons and spaces."""
    pieces = []
    for page_layout in extract_pages(read_file):
        for element in page_layout:
            if isinstance(element, LTTextBoxHorizontal):
                text = element.get_text()
                # NOTE(review): the two identical-looking replaces suggest
                # one was originally a full-width '：' — confirm.
                text = text.replace(":", '')
                text = text.replace(":", '')
                pieces.append(text.replace(' ', ''))
    # join() replaces the quadratic string concatenation of the original.
    return ''.join(pieces)
def test_no_boxes_flow(self):
    """With boxes_flow disabled the three lines form one single textbox."""
    pages = list(
        extract_pages(self._get_test_file_path(),
                      laparams=LAParams(boxes_flow=None)))
    self.assertEqual(len(pages), 1)
    containers = [
        el for el in pages[0] if isinstance(el, LTTextContainer)
    ]
    self.assertEqual(len(containers), 1)
    self.assertEqual(containers[0].get_text(), "Text1\nText2\nText3\n")
def extract_PDF_textbox(pdf_name=PDF_NAME):
    """Fill the global RESULT dict with {group_index: [line_text, ...]}
    for every textbox in the PDF, and report how many were converted."""
    text_group_index = 0
    laparams = LAParams(line_margin=LINE_MARGIN)
    for page_layout in extract_pages(pdf_name, laparams=laparams):
        for element in page_layout:
            if isinstance(element, LTTextBox):
                text_group_index += 1
                RESULT[text_group_index] = [
                    text_line.get_text() for text_line in element
                ]
    return f"Converted {text_group_index} group of texts from PDF"
def _extract_pages_and_text_containers(self, pdf):
    """Cache pages (minus any cover page) and their reading-ordered,
    non-blank text containers."""
    self._pages = list(
        extract_pages(pdf, laparams=self.data.get('laparams', None)))
    if common.check_first_page_is_cover(pdf):
        self._pages = self._pages[1:]
    for page in self._pages:
        containers = [
            element for element in page
            if isinstance(element, LTTextContainer)
            and element.get_text() != ''
            and not re.search(r'^\s+$', element.get_text())
        ]
        containers = common.split_joined_text_containers(containers)
        # Single compound key replaces the original pair of stable sorts:
        # top-to-bottom (y0 descending), then left-to-right (x0 ascending).
        containers = sorted(containers,
                            key=lambda item: (-item.y0, item.x0))
        self._text_containers.append(containers)
def process_args(inputfile: str, outputfile: str):
    """Read the body text of ``inputfile`` (elements set in the document's
    dominant font size) and synthesise it as audio to ``outputfile``."""
    print('{0} is the input and {1} is output'.format(inputfile, outputfile))
    common_font_size = get_common_font_size(inputfile)
    parts = ["Created with Ardio by Bell Eapen at nuchange.com. "]
    for page_layout in extract_pages(inputfile):
        for element in page_layout:
            # Dominant font size == body text; skips headers/footnotes.
            if (isinstance(element, LTTextContainer)
                    and get_common_font_size_of_element(element)
                    == common_font_size):
                parts.append(remove_all_but_alpabets(element.get_text()))
    final_output = ''.join(parts)
    print(final_output)
    write_audio_file(final_output, outputfile)
def get_common_font_size(inputfile: str):
    """Return the character size that occurs most often in the document."""
    size_counts = {}
    for page_layout in extract_pages(inputfile):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            for text_line in element:
                for character in text_line:
                    if isinstance(character, LTChar):
                        size_counts[character.size] = (
                            size_counts.get(character.size, 0) + 1)
    # Same tie-breaking as the original: first-inserted size wins on ties.
    return max(size_counts.items(), key=operator.itemgetter(1))[0]