def pdf_to_text(pdf):
    """Return extracted text from PDF.

    Warning: This function can be slow... up to 300ms per page.
    This function does not perform optical character recognition.

    Args:
        pdf: bytestring of PDF file

    Returns:
        str of text extracted from `pdf` contents.
    """
    # make input and output buffers
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()
    try:
        # configure pdf parser: link parser and document to each other,
        # then initialise the document with an empty password
        parser = pdfparser.PDFParser(in_buffer)
        doc = pdfparser.PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password='')

        rsrcmgr = pdfinterp.PDFResourceManager()
        laparams = layout.LAParams()

        # convert pdf to text, page by page, into out_buffer
        device = converter.TextConverter(
            rsrcmgr, outfp=out_buffer, codec='utf-8', laparams=laparams)
        try:
            interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Release the converter even if a page fails to process.
            device.close()

        # Read the result before the buffers are closed below.
        return out_buffer.getvalue()
    finally:
        in_buffer.close()
        out_buffer.close()
def parsePDFPorto(fileStream, OutType):
    """Convert a PDF stream to HTML, plain text or XML.

    Args:
        fileStream: open file-like object holding the PDF; it is handed to
            ``PDFPage.get_pages`` and closed by this function.
        OutType: one of ``"HTML"``, ``"Text"`` or ``"XML"`` (case-sensitive),
            selecting the converter used.

    Returns:
        The converter output, decoded from UTF-8.

    Raises:
        ValueError: if ``OutType`` is not one of the supported values.
    """
    # Validate first so an unsupported type fails with a clear message
    # instead of an unbound ``device`` further down.
    if OutType not in ("HTML", "Text", "XML"):
        raise ValueError(
            "Unsupported OutType: %r (expected 'HTML', 'Text' or 'XML')"
            % (OutType,))

    if OutType == "HTML":
        converter_cls = converter.HTMLConverter
    elif OutType == "Text":
        converter_cls = converter.TextConverter
    else:
        converter_cls = converter.XMLConverter

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = converter_cls(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fileStream):
                interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer so anything the
            # converter emits on close is included in the result.
            device.close()
        converted = retstr.getvalue()
    finally:
        fileStream.close()
        retstr.close()

    # NOTE(review): .decode() assumes the buffer holds a byte string
    # (Python 2 style StringIO) — confirm against the file's imports.
    return converted.decode('utf-8')
def pdf_miner(from_file, to_txt):
    """Extract the text of a PDF file into a text file using pdfminer.

    Args:
        from_file: object whose ``path`` attribute names the PDF to read.
        to_txt: object whose ``path`` attribute names the file to write.

    Returns:
        None. Errors raised while converting are logged at critical level
        and swallowed; errors opening either file propagate to the caller.
    """
    log.debug('trying with pdfminer')
    # The context manager guarantees both files are closed even when the
    # conversion fails part-way through.
    with codecs.open(from_file.path, mode='rb') as pdf, \
            codecs.open(to_txt.path, mode='wb') as output:
        try:
            resourceman = pdfinterp.PDFResourceManager()
            device = converter.TextConverter(
                resourceman, output, laparams=layout.LAParams())
            try:
                interpreter = pdfinterp.PDFPageInterpreter(resourceman, device)
                for page in pdfpage.PDFPage.get_pages(pdf):
                    interpreter.process_page(page)
            finally:
                # Close the device while its output file is still open.
                device.close()
        except Exception as e:
            # Best effort: report the failure and let the caller carry on.
            log.critical(e)
    return
def scan(self, file_object, options):
    """Inspect a PDF's objects, flag notable keys and extract child files.

    Walks every object id in the document's xrefs. Dictionary objects are
    checked for auto-action and JavaScript keys and for annotated URIs;
    stream objects are appended to ``self.children`` until the extraction
    limit is reached. Optionally the page text is extracted as one more
    child file.

    Args:
        file_object: the file being scanned; reads ``data``, ``depth``,
            ``uid``, ``root_uid``, ``hash`` and ``root_hash`` and appends
            to ``flags``.
        options: mapping of scanner options. ``extract_text`` (default
            False) enables text extraction; ``limit`` (default 2000) caps
            the number of extracted stream objects.

    Returns:
        None. Results are recorded in ``self.metadata``, ``self.children``
        and ``file_object.flags``.
    """
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)

    self.metadata["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()

    try:
        with io.BytesIO(file_object.data) as pdf_object:
            parsed_pdf = pdfparser.PDFParser(pdf_object)
            pdf_document = pdfdocument.PDFDocument(parsed_pdf)

            self.metadata.setdefault("annotatedUris", [])
            for xref in pdf_document.xrefs:
                for object_id in xref.get_objids():
                    self.metadata["total"]["objects"] += 1

                    # NOTE(review): handler nesting was reconstructed from
                    # whitespace-collapsed source — confirm that TypeError and
                    # struct.error belong to the stream-extraction try and the
                    # remaining handlers to this per-object try.
                    try:
                        pdf_obj = pdf_document.getobj(object_id)
                        if isinstance(pdf_obj, dict):
                            for key, value in pdf_obj.items():
                                if key in ("AA", "OpenAction"):
                                    file_object.flags.append(f"{self.scanner_name}::auto_action")
                                if key in ("JS", "Javascript"):
                                    file_object.flags.append(f"{self.scanner_name}::javascript_embedded")

                                if key == "A":
                                    try:
                                        uri = value.get("URI")
                                    except AttributeError:
                                        # Value has no .get(); nothing to record.
                                        uri = None
                                    if (uri is not None
                                            and uri not in self.metadata["annotatedUris"]):
                                        self.metadata["annotatedUris"].append(uri)

                        # Keep counting objects but stop extracting at the limit.
                        if self.metadata["total"]["extracted"] >= file_limit:
                            continue

                        if isinstance(pdf_obj, pdftypes.PDFStream):
                            try:
                                # Check for duplicates first so a stream that was
                                # already extracted is not decoded a second time.
                                if object_id not in extracted_objects:
                                    child_filename = f"{self.scanner_name}::object_{object_id}"
                                    child_fo = objects.StrelkaFile(data=pdf_obj.get_data(),
                                                                   filename=child_filename,
                                                                   depth=file_object.depth + 1,
                                                                   parent_uid=file_object.uid,
                                                                   root_uid=file_object.root_uid,
                                                                   parent_hash=file_object.hash,
                                                                   root_hash=file_object.root_hash,
                                                                   source=self.scanner_name)
                                    self.children.append(child_fo)
                                    extracted_objects.add(object_id)
                                    self.metadata["total"]["extracted"] += 1

                            except TypeError:
                                file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                            except struct.error:
                                file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")

                    except ValueError:
                        file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                    except pdftypes.PDFObjectNotFound:
                        file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                    except pdftypes.PDFNotImplementedError:
                        file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                    # Same exception class the outermost handler below uses.
                    except psparser.PSSyntaxError:
                        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                la_params = layout.LAParams(detect_vertical=True,
                                            char_margin=1.0,
                                            line_margin=0.3,
                                            word_margin=0.3)
                with io.StringIO() as retstr:
                    device = converter.TextConverter(rsrcmgr,
                                                     retstr,
                                                     codec="utf-8",
                                                     laparams=la_params)
                    try:
                        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                        for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                            try:
                                interpreter.process_page(page)
                            except struct.error:
                                file_object.flags.append(f"{self.scanner_name}::text_struct_error")

                        pdf_object_text = retstr.getvalue()
                        child_filename = f"{self.scanner_name}::text"
                        child_fo = objects.StrelkaFile(data=pdf_object_text,
                                                       filename=child_filename,
                                                       depth=file_object.depth + 1,
                                                       parent_uid=file_object.uid,
                                                       root_uid=file_object.root_uid,
                                                       parent_hash=file_object.hash,
                                                       root_hash=file_object.root_hash,
                                                       source=self.scanner_name)
                        self.children.append(child_fo)
                        file_object.flags.append(f"{self.scanner_name}::extracted_text")
                    finally:
                        # Release the converter even if extraction fails.
                        device.close()

    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::index_error")
    except pdfdocument.PDFEncryptionError:
        file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
    except pdfparser.PDFSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
    except psparser.PSEOF:
        file_object.flags.append(f"{self.scanner_name}::ps_eof")
    except psparser.PSSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
def scan(self, data, file, options, expire_at):
    """Inspect a PDF's objects, flag notable keys and extract child files.

    Walks every object id in the document's xrefs. Dictionary objects are
    checked for auto-action and JavaScript keys and for annotated URIs;
    stream objects are chunked and uploaded to the cache until the
    extraction limit is reached. Optionally the page text is extracted and
    uploaded as one more file.

    Args:
        data: raw bytes of the PDF.
        file: not used by this method.
        options: mapping of scanner options. ``extract_text`` (default
            False) enables text extraction; ``limit`` (default 2000) caps
            the number of extracted stream objects.
        expire_at: passed through to ``self.upload_to_cache`` with every
            uploaded chunk.

    Returns:
        None. Results are recorded in ``self.event``, ``self.files`` and
        ``self.flags``.
    """
    extract_text = options.get('extract_text', False)
    file_limit = options.get('limit', 2000)

    self.event['total'] = {'objects': 0, 'extracted': 0}
    extracted_objects = set()

    try:
        with io.BytesIO(data) as pdf_io:
            parsed = pdfparser.PDFParser(pdf_io)
            pdf = pdfdocument.PDFDocument(parsed)

            self.event.setdefault('annotated_uris', [])
            for xref in pdf.xrefs:
                for object_id in xref.get_objids():
                    self.event['total']['objects'] += 1

                    # NOTE(review): handler nesting was reconstructed from
                    # whitespace-collapsed source — confirm that TypeError and
                    # struct.error belong to the stream-extraction try and the
                    # remaining handlers to this per-object try.
                    try:
                        pdf_obj = pdf.getobj(object_id)
                        if isinstance(pdf_obj, dict):
                            for key, value in pdf_obj.items():
                                if key in ('AA', 'OpenAction'):
                                    self.flags.append('auto_action')
                                if key in ('JS', 'Javascript'):
                                    self.flags.append('javascript_embedded')

                                if key == 'A':
                                    try:
                                        uri = value.get('URI')
                                    except AttributeError:
                                        # Value has no .get(); nothing to record.
                                        uri = None
                                    # Skip actions without a URI so None is
                                    # never recorded as an annotated URI.
                                    if (uri is not None
                                            and uri not in self.event['annotated_uris']):
                                        self.event['annotated_uris'].append(uri)

                        # Keep counting objects but stop extracting at the limit.
                        if self.event['total']['extracted'] >= file_limit:
                            continue

                        if isinstance(pdf_obj, pdftypes.PDFStream):
                            try:
                                if object_id not in extracted_objects:
                                    extract_file = strelka.File(
                                        name=f'object_{object_id}',
                                        source=self.name,
                                    )

                                    for c in strelka.chunk_string(pdf_obj.get_data()):
                                        self.upload_to_cache(
                                            extract_file.pointer,
                                            c,
                                            expire_at,
                                        )

                                    self.files.append(extract_file)
                                    self.event['total']['extracted'] += 1
                                    extracted_objects.add(object_id)

                            # The per-object flags are f-strings so the object
                            # id is actually interpolated into the flag.
                            except TypeError:
                                self.flags.append(f'type_error_{object_id}')
                            except struct.error:
                                self.flags.append(f'struct_error_{object_id}')

                    except ValueError:
                        self.flags.append(f'value_error_{object_id}')
                    except pdftypes.PDFObjectNotFound:
                        self.flags.append(f'object_not_found_{object_id}')
                    except pdftypes.PDFNotImplementedError:
                        self.flags.append(f'not_implemented_error_{object_id}')
                    except psparser.PSSyntaxError:
                        self.flags.append(f'ps_syntax_error_{object_id}')

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                la_params = layout.LAParams(
                    detect_vertical=True,
                    char_margin=1.0,
                    line_margin=0.3,
                    word_margin=0.3,
                )
                with io.StringIO() as retstr:
                    device = converter.TextConverter(
                        rsrcmgr,
                        retstr,
                        codec='utf-8',
                        laparams=la_params,
                    )
                    try:
                        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                        # Pages are read from the seekable stream, not from
                        # the raw bytes.
                        for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                            try:
                                interpreter.process_page(page)
                            except struct.error:
                                self.flags.append('text_struct_error')

                        extract_file = strelka.File(
                            name='text',
                            source=self.name,
                        )
                        for c in strelka.chunk_string(retstr.getvalue()):
                            self.upload_to_cache(
                                extract_file.pointer,
                                c,
                                expire_at,
                            )

                        self.files.append(extract_file)
                        self.flags.append('extracted_text')
                    finally:
                        # Release the converter even if extraction fails.
                        device.close()

    except IndexError:
        self.flags.append('index_error')
    except pdfdocument.PDFEncryptionError:
        self.flags.append('encrypted_pdf')
    except pdfparser.PDFSyntaxError:
        self.flags.append('pdf_syntax_error')
    except psparser.PSEOF:
        self.flags.append('ps_eof')
    except psparser.PSSyntaxError:
        self.flags.append('ps_syntax_error')