def pdf_to_text(pdf):
    """Return extracted text from PDF.

    Warning: this can be slow -- up to 300ms per page.  No optical
    character recognition is performed, so image-only PDFs yield
    empty text.

    Args:
        pdf: bytestring of PDF file contents.

    Returns:
        str of text extracted from `pdf` contents.
    """
    # Input and output buffers (py2-era StringIO, matching the rest of
    # this snippet's vintage pdfminer API).
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()

    # Old-style pdfminer wiring: parser and document reference each other.
    parser = pdfparser.PDFParser(in_buffer)
    doc = pdfparser.PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password='')

    rsrcmgr = pdfinterp.PDFResourceManager()
    laparams = layout.LAParams()
    device = converter.TextConverter(rsrcmgr, outfp=out_buffer,
                                     codec='utf-8', laparams=laparams)
    try:
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
        # getvalue() is evaluated before the finally block closes the buffer.
        return out_buffer.getvalue()
    finally:
        # The original leaked these handles; TextConverter in particular
        # should be closed to flush its state.
        device.close()
        out_buffer.close()
        in_buffer.close()
def _parse_pdf(self, stream):
    """Parse a single PDF and return the date and description.

    Args:
        stream: open file-like object for the PDF (must have a `name`).

    Returns:
        dict mapping field name -> extracted value (value may be falsy
        when nothing was found or parsing was aborted).
    """
    LOG.info("Parsing accident report data from %s", stream.name)
    fields = self._get_fields()

    def result():
        # Snapshot of whatever has been extracted so far.
        return {f.name: f.value for f in fields}

    try:
        # pdfminer boilerplate: document + layout aggregator + interpreter.
        document = pdfdocument.PDFDocument(pdfparser.PDFParser(stream))
        rsrcmgr = pdfinterp.PDFResourceManager()
        device = pdfconverter.PDFPageAggregator(
            rsrcmgr, laparams=pdflayout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    except psparser.PSException as err:
        LOG.warning("Parsing %s failed, skipping: %s", stream.name, err)
        return result()

    for page_num, page in enumerate(
            pdfpage.PDFPage.create_pages(document), start=1):
        LOG.debug("Parsing page %s", page_num)
        interpreter.process_page(page)
        page_layout = device.get_result()
        for field in fields:
            field.update(page_layout, page=page_num)
            # Short-circuit: a mandatory field still missing past its last
            # possible page means the document cannot be parsed usefully.
            if (not field.value and field.short_circuit
                    and page_num >= field.maxpage):
                LOG.warning("No %s found in %s, aborting parsing",
                            field.name, stream.name)
                return result()
    return result()
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    """Convert a PDF file to plain text via an intermediate HTML render.

    Args:
        nome_arquivo: path of the PDF file to convert.
        semaforo, profundidade: unused here; kept for interface
            compatibility with callers.
        pagina: 1-based page number to convert; when None the document is
            converted as a whole.

    Returns:
        (tipo, texto): `tipo` tags the outcome ('pdf' / 'pdf_parte',
        with a '_defeito' suffix when some page failed, or 'pdf_longo'
        for oversized documents); `texto` is the extracted text or None
        on failure.
    """
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()
    # open() instead of the py2-only file() builtin.
    arquivo_pdf = open(nome_arquivo, 'rb')
    dispositivo = pdf_converter.HTMLConverter(gerenciador, str_saida,
                                              codec='utf-8',
                                              laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)

    if pagina is None:
        tipo_aux = 'pdf'
        # NOTE(review): pg_fim = -1 makes the slice below drop the LAST
        # page of the document -- looks intentional (trailer page?) but
        # worth confirming; use None to include it.
        pg_inicio, pg_fim = 0, -1
    else:
        tipo_aux = 'pdf_parte'
        pg_inicio, pg_fim = pagina - 1, pagina + 1
    tipo = tipo_aux

    try:
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)
        paginas = list(documento.get_pages())
        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None
        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # A broken page does not abort the conversion; the result
                # is just flagged as defective.
                tipo = tipo_aux + '_defeito'
        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        return tipo, None
    finally:
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()

    removido_tags = _converte_html(dados_html, 'div')
    # Re-attach cedilla characters that pdfminer splits off with spaces.
    removido_espacos_desnecessarios = re.sub(r" +ç", "ç", removido_tags,
                                             flags=re.IGNORECASE)
    # Drop "Page N" page-number artifacts left by the HTML conversion.
    sem_numero_pagina = re.sub(r"\nPage [0-9]+ *\n[0-9]+ *\n", "\n",
                               removido_espacos_desnecessarios)
    # Collapse runs of horizontal whitespace and blank lines.
    texto_final = re.sub(r'( *\n)+', '\n',
                         re.sub(r'[ \t]+', ' ', sem_numero_pagina))
    return tipo, texto_final
def get_ltpages(infile, caching=True):
    """Run pdfminer layout analysis over *infile* and return the LTPage
    object for every page, in document order.

    Args:
        infile: open file-like object containing the PDF.
        caching: forwarded to pdfminer's resource manager and page reader.

    Returns:
        list of LTPage layout objects.
    """
    manager = pdfinterp.PDFResourceManager(caching=caching)
    params = layout.LAParams(detect_vertical=False)
    aggregator = converter.PDFPageAggregator(manager, laparams=params)
    engine = pdfinterp.PDFPageInterpreter(manager, aggregator)

    results = []
    for pdf_page in PDFPage.get_pages(infile, caching=caching):
        engine.process_page(pdf_page)
        results.append(aggregator.get_result())
    aggregator.close()
    return results
def _parse_pages(self, document):
    """Return the info extracted for the PDF BORME pages.

    Args:
        document: an open pdfminer PDFDocument.

    Returns:
        list of per-page act lists, in page order.
    """
    resource_manager = pdfinterp.PDFResourceManager()
    # LAParams margins are proportions of the character size, not absolute
    # lengths: chunks closer than char_margin merge into one line; words
    # farther apart than word_margin get a blank inserted (a gap might not
    # be an actual space character); lines closer than line_margin cluster
    # into one text box.  char_margin=8.0 was tried previously
    # ("6.0 --> all without one"); 14.0 keeps whole rows together.
    params = layout.LAParams(char_margin=14.0)
    device = converter.PDFPageAggregator(resource_manager, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

    # First pass: raw act extraction, page by page.
    raw_pages = []
    for index, page in enumerate(pdfpage.PDFPage.create_pages(document)):
        interpreter.process_page(page)
        page_layout = device.get_result()
        acts = self._parse_raw_acts(page_layout, index == 0)
        raw_pages.append(acts)
        self._log.debug("Page number: %i Acts: %i",
                        len(raw_pages), len(acts))

    # Second pass: resolve acts that may span page boundaries.  Each page
    # sees the pages that follow it and the last act of the previous page.
    pages = []
    for index, page_acts in enumerate(raw_pages):
        next_pages = raw_pages[index + 1:]
        last_page_act = pages[-1][-1] if pages and pages[-1] else None
        pages.append(self._parse_acts(page_acts, next_pages, last_page_act))
    return pages
def pdf_miner(from_file, to_txt):
    """Extract text from *from_file* (a PDF) into *to_txt* using pdfminer.

    Errors are logged at critical level rather than raised, preserving the
    original best-effort contract.

    Args:
        from_file: object with a `.path` attribute pointing to the PDF.
        to_txt: object with a `.path` attribute for the text output.
    """
    log.debug('trying with pdfminer')
    pdf = codecs.open(from_file.path, mode='rb')
    output = codecs.open(to_txt.path, mode='wb')
    device = None
    try:
        resourceman = pdfinterp.PDFResourceManager()
        device = converter.TextConverter(
            resourceman, output, laparams=layout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(resourceman, device)
        for page in pdfpage.PDFPage.get_pages(pdf):
            interpreter.process_page(page)
    # `except Exception, e` was py2-only syntax (a SyntaxError on py3).
    except Exception as e:
        log.critical(e)
    finally:
        # The original only closed handles on success, leaking them when
        # an exception fired.
        if device is not None:
            device.close()
        output.close()
        pdf.close()
    return
def doPDF(url):
    """Extract all text from the PDF at local path *url*.

    Args:
        url: filesystem path of the PDF (despite the name, this is opened
            as a local file, not fetched).

    Returns:
        Concatenated text of every LTText object on every page.
    """
    text_parts = []
    # `with` fixes the original's leaked file handle.
    with open(url, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        resource_manager = pdfinterp.PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            page_layout = device.get_result()
            for item in page_layout:
                if isinstance(item, LTText):
                    text_parts.append(item.get_text())
    # join avoids the original's quadratic `str +=` accumulation (which
    # also shadowed the builtin `str`).
    return ''.join(text_parts)
def scan(self, file_object, options):
    """Scan a PDF: flag auto-actions and embedded JavaScript, collect
    annotated URIs, extract embedded stream objects as child files and
    (optionally) the document text.

    Args:
        file_object: the file being scanned; flags are appended to it and
            extracted children to self.children.
        options: dict with "extract_text" (bool, default False) and
            "limit" (max number of extracted objects, default 2000).
    """
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)
    self.metadata["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()
    try:
        with io.BytesIO(file_object.data) as pdf_object:
            parsed_pdf = pdfparser.PDFParser(pdf_object)
            pdf_document = pdfdocument.PDFDocument(parsed_pdf)
            self.metadata.setdefault("annotatedUris", [])
            for xref in pdf_document.xrefs:
                for object_id in xref.get_objids():
                    self.metadata["total"]["objects"] += 1
                    try:
                        # renamed from `object` (shadowed the builtin)
                        obj = pdf_document.getobj(object_id)
                        if isinstance(obj, dict):
                            for key, value in obj.items():
                                if key in ("AA", "OpenAction"):
                                    file_object.flags.append(
                                        f"{self.scanner_name}::auto_action")
                                if key in ("JS", "Javascript"):
                                    file_object.flags.append(
                                        f"{self.scanner_name}::javascript_embedded")
                                try:
                                    if key == "A":
                                        uri = value.get("URI")
                                        if (uri is not None and
                                                uri not in self.metadata["annotatedUris"]):
                                            self.metadata["annotatedUris"].append(uri)
                                except AttributeError:
                                    # "A" value was not dict-like.
                                    pass
                        if self.metadata["total"]["extracted"] >= file_limit:
                            continue
                        if isinstance(obj, pdftypes.PDFStream):
                            try:
                                # Duplicate check hoisted before the child
                                # is built, so duplicate ids skip get_data().
                                if object_id not in extracted_objects:
                                    child_filename = (
                                        f"{self.scanner_name}::object_{object_id}")
                                    child_fo = objects.StrelkaFile(
                                        data=obj.get_data(),
                                        filename=child_filename,
                                        depth=file_object.depth + 1,
                                        parent_uid=file_object.uid,
                                        root_uid=file_object.root_uid,
                                        parent_hash=file_object.hash,
                                        root_hash=file_object.root_hash,
                                        source=self.scanner_name)
                                    self.children.append(child_fo)
                                    extracted_objects.add(object_id)
                                    self.metadata["total"]["extracted"] += 1
                            except TypeError:
                                file_object.flags.append(
                                    f"{self.scanner_name}::type_error_{object_id}")
                            except struct.error:
                                file_object.flags.append(
                                    f"{self.scanner_name}::struct_error_{object_id}")
                            except ValueError:
                                file_object.flags.append(
                                    f"{self.scanner_name}::value_error_{object_id}")
                    except pdftypes.PDFObjectNotFound:
                        file_object.flags.append(
                            f"{self.scanner_name}::object_not_found_{object_id}")
                    except pdftypes.PDFNotImplementedError:
                        file_object.flags.append(
                            f"{self.scanner_name}::not_implemented_error_{object_id}")
                    except psparser.PSSyntaxError:
                        # BUG FIX: was pdftypes.PSSyntaxError, which pdfminer
                        # does not define there (raised AttributeError at
                        # except-match time); the sibling scanner already
                        # uses psparser.PSSyntaxError.
                        file_object.flags.append(
                            f"{self.scanner_name}::ps_syntax_error_{object_id}")

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(detect_vertical=True,
                                            char_margin=1.0,
                                            line_margin=0.3,
                                            word_margin=0.3)
                device = converter.TextConverter(rsrcmgr, retstr,
                                                 codec="utf-8",
                                                 laparams=la_params)
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        file_object.flags.append(
                            f"{self.scanner_name}::text_struct_error")
                pdf_object_text = retstr.getvalue()
                child_fo = objects.StrelkaFile(
                    data=pdf_object_text,
                    filename=f"{self.scanner_name}::text",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                file_object.flags.append(
                    f"{self.scanner_name}::extracted_text")
                device.close()
                retstr.close()
    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::index_error")
    except pdfdocument.PDFEncryptionError:
        file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
    except pdfparser.PDFSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
    except psparser.PSEOF:
        file_object.flags.append(f"{self.scanner_name}::ps_eof")
    except psparser.PSSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
def scan(self, data, file, options, expire_at):
    """Scan a PDF payload: flag auto-actions and embedded JavaScript,
    record annotated URIs, and extract embedded stream objects and
    (optionally) the document text as child files uploaded to the cache.

    Args:
        data: raw PDF bytes.
        file: file being scanned (part of the scanner interface).
        options: dict with 'extract_text' (bool, default False) and
            'limit' (max number of extracted objects, default 2000).
        expire_at: cache expiry passed through to upload_to_cache.
    """
    extract_text = options.get('extract_text', False)
    file_limit = options.get('limit', 2000)
    self.event['total'] = {'objects': 0, 'extracted': 0}
    extracted_objects = set()
    try:
        with io.BytesIO(data) as pdf_io:
            parsed = pdfparser.PDFParser(pdf_io)
            pdf = pdfdocument.PDFDocument(parsed)
            self.event.setdefault('annotated_uris', [])
            for xref in pdf.xrefs:
                for object_id in xref.get_objids():
                    self.event['total']['objects'] += 1
                    try:
                        # renamed from `object` (shadowed the builtin)
                        obj = pdf.getobj(object_id)
                        if isinstance(obj, dict):
                            for key, value in obj.items():
                                if key in ('AA', 'OpenAction'):
                                    self.flags.append('auto_action')
                                if key in ('JS', 'Javascript'):
                                    self.flags.append('javascript_embedded')
                                try:
                                    if key == 'A':
                                        uri = value.get('URI')
                                        # BUG FIX: guard against None so a
                                        # missing URI is not recorded (the
                                        # sibling scanner has this check).
                                        if (uri is not None and
                                                uri not in self.event['annotated_uris']):
                                            self.event['annotated_uris'].append(uri)
                                except AttributeError:
                                    # 'A' value was not dict-like.
                                    pass
                        if self.event['total']['extracted'] >= file_limit:
                            continue
                        if isinstance(obj, pdftypes.PDFStream):
                            try:
                                if object_id not in extracted_objects:
                                    extract_file = strelka.File(
                                        name=f'object_{object_id}',
                                        source=self.name,
                                    )
                                    for c in strelka.chunk_string(
                                            obj.get_data()):
                                        self.upload_to_cache(
                                            extract_file.pointer,
                                            c,
                                            expire_at,
                                        )
                                    self.files.append(extract_file)
                                    self.event['total']['extracted'] += 1
                                    extracted_objects.add(object_id)
                            # BUG FIX: the flags below were plain strings
                            # missing the f prefix, so the literal text
                            # '{object_id}' was recorded instead of the id.
                            except TypeError:
                                self.flags.append(f'type_error_{object_id}')
                            except struct.error:
                                self.flags.append(
                                    f'struct_error_{object_id}')
                            except ValueError:
                                self.flags.append(f'value_error_{object_id}')
                    except pdftypes.PDFObjectNotFound:
                        self.flags.append(f'object_not_found_{object_id}')
                    except pdftypes.PDFNotImplementedError:
                        self.flags.append(
                            f'not_implemented_error_{object_id}')
                    except psparser.PSSyntaxError:
                        self.flags.append(f'ps_syntax_error_{object_id}')

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(
                    detect_vertical=True,
                    char_margin=1.0,
                    line_margin=0.3,
                    word_margin=0.3,
                )
                device = converter.TextConverter(
                    rsrcmgr, retstr, codec='utf-8', laparams=la_params)
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                # BUG FIX: get_pages() needs a file-like object; raw bytes
                # (`data`) were passed here.  The sibling scanner passes
                # its BytesIO wrapper, as done now.
                for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        self.flags.append('text_struct_error')
                extract_file = strelka.File(name='text', source=self.name)
                for c in strelka.chunk_string(retstr.getvalue()):
                    self.upload_to_cache(
                        extract_file.pointer, c, expire_at)
                self.files.append(extract_file)
                self.flags.append('extracted_text')
                device.close()
                retstr.close()
    except IndexError:
        self.flags.append('index_error')
    except pdfdocument.PDFEncryptionError:
        self.flags.append('encrypted_pdf')
    except pdfparser.PDFSyntaxError:
        self.flags.append('pdf_syntax_error')
    except psparser.PSEOF:
        self.flags.append('ps_eof')
    except psparser.PSSyntaxError:
        self.flags.append('ps_syntax_error')