def pdf_to_text(pdf):
    """Return extracted text from PDF.

    Warning: This function can be slow... up to 300ms per page
    This function does not perform optical character recognition.

    Args:
        pdf: bytestring of PDF file

    Returns:
        str of text extracted from `pdf` contents.
    """
    # Make input and output buffers.
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()

    # Configure pdf parser (old pdfminer API: parser and document are
    # cross-linked explicitly).
    parser = pdfparser.PDFParser(in_buffer)
    doc = pdfparser.PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password='')
    rsrcmgr = pdfinterp.PDFResourceManager()
    laparams = layout.LAParams()

    # Convert pdf to text.
    device = converter.TextConverter(rsrcmgr, outfp=out_buffer,
                                     codec='utf-8', laparams=laparams)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in doc.get_pages():
            interpreter.process_page(page)
        return out_buffer.getvalue()
    finally:
        # BUG FIX: the TextConverter device was never closed, leaking its
        # layout resources on every call.
        device.close()
def _parse_pdf(self, stream):
    """Parse a single PDF and return the date and description.

    Args:
        stream: open binary file-like object for the PDF; must expose a
            `name` attribute (used only for logging).

    Returns:
        dict mapping each field name to its extracted value; values keep
        their defaults when parsing fails or a field is never found.
    """
    LOG.info("Parsing accident report data from %s" % stream.name)
    fields = self._get_fields()

    def _results():
        # Single construction point for the field-name -> value mapping
        # (the original built this dict identically in three places).
        return {f.name: f.value for f in fields}

    try:
        # so much pdfminer boilerplate....
        document = pdfdocument.PDFDocument(pdfparser.PDFParser(stream))
        rsrcmgr = pdfinterp.PDFResourceManager()
        device = pdfconverter.PDFPageAggregator(
            rsrcmgr, laparams=pdflayout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    except psparser.PSException as err:
        LOG.warn("Parsing %s failed, skipping: %s" % (stream.name, err))
        return _results()

    for page_num, page in enumerate(
            pdfpage.PDFPage.create_pages(document), start=1):
        LOG.debug("Parsing page %s" % page_num)
        interpreter.process_page(page)
        layout = device.get_result()
        for field in fields:
            field.update(layout, page=page_num)
            # Abort early when a short-circuit field is still missing past
            # the last page it could appear on.
            if (not field.value and field.short_circuit
                    and page_num >= field.maxpage):
                LOG.warn("No %s found in %s, aborting parsing"
                         % (field.name, stream.name))
                return _results()
    return _results()
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    """Convert a PDF file to plain text via an intermediate HTML render.

    Args:
        nome_arquivo: path of the PDF file to convert.
        semaforo: unused (kept for interface compatibility).
        profundidade: unused (kept for interface compatibility).
        pagina: 1-based page number to convert; None converts the document.

    Returns:
        Tuple (tipo, texto): tipo is 'pdf', 'pdf_parte', 'pdf_longo', or the
        first two with a '_defeito' suffix when a page failed to render;
        texto is the cleaned text, or None on failure / oversized documents.
    """
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()
    # BUG FIX: use open() instead of the deprecated Py2-only file() builtin.
    arquivo_pdf = open(nome_arquivo, 'rb')
    dispositivo = pdf_converter.HTMLConverter(gerenciador, str_saida,
                                              codec='utf-8',
                                              laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)

    if pagina is None:
        tipo_aux = 'pdf'
        # NOTE(review): pg_fim = -1 slices off the final page of the
        # document; confirm this is intentional before changing it.
        pg_inicio, pg_fim = 0, -1
    else:
        # Convert only the requested page.
        tipo_aux = 'pdf_parte'
        pg_inicio, pg_fim = pagina - 1, pagina + 1
    tipo = tipo_aux

    try:
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)
        paginas = [p for p in documento.get_pages()]
        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None
        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # Best effort: keep converting, but mark the result as
                # having defective page(s).
                tipo = tipo_aux + '_defeito'
        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        return tipo, None
    finally:
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()

    # Strip HTML tags, then normalize whitespace and page-number artifacts.
    removido_tags = _converte_html(dados_html, 'div')
    removido_espacos_desnecessarios = re.sub(r"\ +ç", "ç", removido_tags,
                                             flags=re.IGNORECASE)
    sem_numero_pagina = re.sub(r"\nPage\ [0-9]+\ *\n[0-9]+\ *\n", "\n",
                               removido_espacos_desnecessarios)
    texto_final = re.sub(r'(\ *\n)+', '\n',
                         re.sub(r'[\ \t]+', r' ', sem_numero_pagina))
    return tipo, texto_final
def parseXFA(path):
    """Extract XFA form data from the PDF at *path* as sanitized text."""
    with open(path, "rb") as pdf_file:
        parser = pdfparser.PDFParser(pdf_file)
        document = pdfdocument.PDFDocument(parser)
        # Render every (key, raw stream data) pair of the XFA entry as a
        # JSON-formatted string.
        pairs = xfa_alist(xfa(acroform(document)))
        dumped = json.dumps(
            [str((k, stream_raw_data(v))) for (k, v) in pairs],
            indent=4,
        )
        # Drop everything outside a conservative character whitelist
        # before the final HTML strip.
        allowed = ascii_letters + digits + ' ' + '/'
        text = ''.join(ch for ch in dumped if ch in allowed)
    return stripHTML(text)
def pdf_metadata(fname):
    """Read a PDF's document-info metadata into a plain dict.

    Keys are lower-cased. Indirect object values are resolved; anything
    without a resolve() method is stringified instead.
    """
    metadata = {}
    with open(fname, "rb") as handle:
        document = pdfdocument.PDFDocument(pdfparser.PDFParser(handle))
        for info in document.info:
            for key, raw in info.items():
                try:
                    value = raw.resolve()
                except AttributeError:
                    # Plain (non-indirect) value: keep its string form.
                    value = str(raw)
                metadata[key.lower()] = value
    return metadata
def get_identifier(stream):
    """Return the document identifier found in a PDF stream.

    Returns:
        dict with a single entry identifying the document, e.g.
        {'arXiv': '1805.03977'}, {'doi': '10.1016/j.rser.2016.06.056'},
        or {'None': ''}; returns '' when page-level text extraction fails
        (kept for backward compatibility with existing callers).
    """
    identifier = {}
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        pdf_stream = pdfparser.PDFParser(stream)
        doc = pdfdocument.PDFDocument(pdf_stream, caching=True)
        if 'Metadata' in dict(doc.catalog):
            # Prefer the XMP metadata block: look for a DOI there.
            metadata = pdftypes.resolve1(
                doc.catalog['Metadata']).get_data().decode()
            doi = get_doi(metadata)  # hoisted: was called twice
            if doi:
                identifier['doi'] = doi
            else:
                identifier['None'] = ""
        else:
            try:
                # Fall back to scraping the rendered page text for a
                # viXra/arXiv-style id.
                stream = BufferedReader(stream._file)
                for page in PDFPage.get_pages(stream):
                    interpreter.process_page(page)
                line = sio.getvalue().replace(' ', '').replace('\n', '')
                res = re.findall(vixra_regex, line, re.IGNORECASE)
                if res:
                    # The regex matches the id reversed; undo the reversal
                    # and drop any version suffix (v1, v2, ...).
                    arxiv_id = list(set([r.strip(".") for r in res]))[0][::-1]
                    arxiv_id = re.sub(r'v([0-9])', '', arxiv_id)
                    identifier['arXiv'] = arxiv_id
                else:
                    identifier['None'] = ""
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # SystemExit and KeyboardInterrupt.
                return ""
    finally:
        # BUG FIX: the converter device and its output buffer were never
        # closed.
        device.close()
        sio.close()
    return identifier
def scan(self, file_object, options):
    """Scan a PDF: inventory its objects, flag risky entries, and extract
    object streams and (optionally) rendered page text as child files.

    Args:
        file_object: the file under scan; flags are appended to it and
            extracted children to self.children.
        options:
            extract_text: also render page text into a child file
                (defaults to False).
            limit: maximum number of object streams to extract
                (defaults to 2000).
    """
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)
    self.metadata["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()
    try:
        with io.BytesIO(file_object.data) as pdf_object:
            parsed_pdf = pdfparser.PDFParser(pdf_object)
            pdf_document = pdfdocument.PDFDocument(parsed_pdf)
            self.metadata.setdefault("annotatedUris", [])
            for xref in pdf_document.xrefs:
                for object_id in xref.get_objids():
                    self.metadata["total"]["objects"] += 1
                    try:
                        object = pdf_document.getobj(object_id)
                        if isinstance(object, dict):
                            for (key, value) in object.items():
                                # Automatic actions and embedded JavaScript
                                # are common malware markers.
                                if key in ["AA", "OpenAction"]:
                                    file_object.flags.append(f"{self.scanner_name}::auto_action")
                                if key in ["JS", "Javascript"]:
                                    file_object.flags.append(f"{self.scanner_name}::javascript_embedded")
                                try:
                                    if key == "A":
                                        uri = value.get("URI")
                                        if (uri is not None and
                                                uri not in self.metadata["annotatedUris"]):
                                            self.metadata["annotatedUris"].append(uri)
                                except AttributeError:
                                    pass
                        if self.metadata["total"]["extracted"] >= file_limit:
                            continue
                        if isinstance(object, pdftypes.PDFStream):
                            try:
                                child_filename = f"{self.scanner_name}::object_{object_id}"
                                child_fo = objects.StrelkaFile(
                                    data=object.get_data(),
                                    filename=child_filename,
                                    depth=file_object.depth + 1,
                                    parent_uid=file_object.uid,
                                    root_uid=file_object.root_uid,
                                    parent_hash=file_object.hash,
                                    root_hash=file_object.root_hash,
                                    source=self.scanner_name)
                                if object_id not in extracted_objects:
                                    self.children.append(child_fo)
                                    extracted_objects.add(object_id)
                                    self.metadata["total"]["extracted"] += 1
                            except TypeError:
                                file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                            except struct.error:
                                file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")
                    except ValueError:
                        file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                    except pdftypes.PDFObjectNotFound:
                        file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                    except pdftypes.PDFNotImplementedError:
                        file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                    except psparser.PSSyntaxError:
                        # BUG FIX: PSSyntaxError is defined in psparser, not
                        # pdftypes; the old `pdftypes.PSSyntaxError` reference
                        # itself raised AttributeError when reached.
                        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(detect_vertical=True,
                                            char_margin=1.0,
                                            line_margin=0.3,
                                            word_margin=0.3)
                device = converter.TextConverter(rsrcmgr, retstr,
                                                 codec="utf-8",
                                                 laparams=la_params)
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        file_object.flags.append(f"{self.scanner_name}::text_struct_error")
                pdf_object_text = retstr.getvalue()
                child_filename = f"{self.scanner_name}::text"
                child_fo = objects.StrelkaFile(
                    data=pdf_object_text,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                file_object.flags.append(f"{self.scanner_name}::extracted_text")
                device.close()
                retstr.close()

    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::index_error")
    except pdfdocument.PDFEncryptionError:
        file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
    except pdfparser.PDFSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
    except psparser.PSEOF:
        file_object.flags.append(f"{self.scanner_name}::ps_eof")
    except psparser.PSSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
def scan(self, data, file, options, expire_at):
    """Scan a PDF: inventory its objects, flag risky entries, and upload
    object streams and (optionally) rendered page text as child files.

    Args:
        data: raw bytes of the PDF.
        file: the file entry under scan.
        options:
            extract_text: also render page text into a child file
                (defaults to False).
            limit: maximum number of object streams to extract
                (defaults to 2000).
        expire_at: cache expiry passed through to upload_to_cache.
    """
    extract_text = options.get('extract_text', False)
    file_limit = options.get('limit', 2000)
    self.event['total'] = {'objects': 0, 'extracted': 0}
    extracted_objects = set()
    try:
        with io.BytesIO(data) as pdf_io:
            parsed = pdfparser.PDFParser(pdf_io)
            pdf = pdfdocument.PDFDocument(parsed)
            self.event.setdefault('annotated_uris', [])
            for xref in pdf.xrefs:
                for object_id in xref.get_objids():
                    self.event['total']['objects'] += 1
                    try:
                        object = pdf.getobj(object_id)
                        if isinstance(object, dict):
                            for (key, value) in object.items():
                                # Automatic actions and embedded JavaScript
                                # are common malware markers.
                                if key in ['AA', 'OpenAction']:
                                    self.flags.append('auto_action')
                                if key in ['JS', 'Javascript']:
                                    self.flags.append(
                                        'javascript_embedded')
                                try:
                                    if key == 'A':
                                        uri = value.get('URI')
                                        # BUG FIX: guard against appending
                                        # None (matches the older scanner's
                                        # behavior).
                                        if (uri is not None and
                                                uri not in self.event[
                                                    'annotated_uris']):
                                            self.event[
                                                'annotated_uris'].append(uri)
                                except AttributeError:
                                    pass
                        if self.event['total']['extracted'] >= file_limit:
                            continue
                        if isinstance(object, pdftypes.PDFStream):
                            try:
                                if object_id not in extracted_objects:
                                    extract_file = strelka.File(
                                        name=f'object_{object_id}',
                                        source=self.name,
                                    )
                                    for c in strelka.chunk_string(
                                            object.get_data()):
                                        self.upload_to_cache(
                                            extract_file.pointer,
                                            c,
                                            expire_at,
                                        )
                                    self.files.append(extract_file)
                                    self.event['total']['extracted'] += 1
                                    extracted_objects.add(object_id)
                            # BUG FIX: all of the per-object flag strings
                            # below were missing their f prefix, so the
                            # literal text '{object_id}' was recorded
                            # instead of the object id.
                            except TypeError:
                                self.flags.append(f'type_error_{object_id}')
                            except struct.error:
                                self.flags.append(
                                    f'struct_error_{object_id}')
                    except ValueError:
                        self.flags.append(f'value_error_{object_id}')
                    except pdftypes.PDFObjectNotFound:
                        self.flags.append(f'object_not_found_{object_id}')
                    except pdftypes.PDFNotImplementedError:
                        self.flags.append(
                            f'not_implemented_error_{object_id}')
                    except psparser.PSSyntaxError:
                        self.flags.append(f'ps_syntax_error_{object_id}')

            if extract_text:
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(
                    detect_vertical=True,
                    char_margin=1.0,
                    line_margin=0.3,
                    word_margin=0.3,
                )
                device = converter.TextConverter(
                    rsrcmgr,
                    retstr,
                    codec='utf-8',
                    laparams=la_params,
                )
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                # BUG FIX: get_pages() needs a seekable file object, not raw
                # bytes; pass the BytesIO wrapper (as the older scanner did).
                for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        self.flags.append('text_struct_error')
                extract_file = strelka.File(
                    name='text',
                    source=self.name,
                )
                for c in strelka.chunk_string(retstr.getvalue()):
                    self.upload_to_cache(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.flags.append('extracted_text')
                device.close()
                retstr.close()

    except IndexError:
        self.flags.append('index_error')
    except pdfdocument.PDFEncryptionError:
        self.flags.append('encrypted_pdf')
    except pdfparser.PDFSyntaxError:
        self.flags.append('pdf_syntax_error')
    except psparser.PSEOF:
        self.flags.append('ps_eof')
    except psparser.PSSyntaxError:
        self.flags.append('ps_syntax_error')
def _get_document(self, my_file):
    """Return a pdfminer PDFDocument for the file named *my_file*.

    NOTE: the file handle is opened here and intentionally not closed —
    the parser reads from it lazily for the document's lifetime.
    """
    handle = open(my_file, "rb")
    return pdfdocument.PDFDocument(pdfparser.PDFParser(handle))