Exemplo n.º 1
0
def pdf_to_text(pdf):
    """Return the text extracted from a PDF.

    Warning: this function can be slow (up to 300 ms per page), and it
    does not perform optical character recognition.

    Args:
        pdf: bytestring of the PDF file.

    Returns:
        str of text extracted from the `pdf` contents.
    """
    # In-memory buffers: one wrapping the raw PDF, one collecting the text.
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()
    try:
        # Wire the parser and the document to each other before initialising.
        parser = pdfparser.PDFParser(in_buffer)
        doc = pdfparser.PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password='')

        rsrcmgr = pdfinterp.PDFResourceManager()
        device = converter.TextConverter(rsrcmgr,
                                         outfp=out_buffer,
                                         codec='utf-8',
                                         laparams=layout.LAParams())
        try:
            interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to process.
            device.close()

        # Read the result before the buffers are closed below.
        return out_buffer.getvalue()
    finally:
        in_buffer.close()
        out_buffer.close()
Exemplo n.º 2
0
def parsePDFPorto(fileStream, OutType):
    """Convert a PDF stream to HTML, plain text or XML with pdfminer.

    Args:
        fileStream: open binary file-like object containing the PDF. It is
            closed once conversion has been attempted (on success or error).
        OutType: output format, one of "HTML", "Text" or "XML".

    Returns:
        The converted document, decoded from UTF-8.

    Raises:
        ValueError: if `OutType` is not one of the supported formats.
    """
    # Validate the requested format first so an unsupported value fails with
    # a clear error instead of an unbound `device` further down.
    if OutType == "HTML":
        converter_cls = converter.HTMLConverter
    elif OutType == "Text":
        converter_cls = converter.TextConverter
    elif OutType == "XML":
        converter_cls = converter.XMLConverter
    else:
        raise ValueError("unsupported OutType: %r" % (OutType,))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = converter_cls(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fileStream):
                interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer so any trailing
            # output the converter emits on close is included.
            device.close()
        output = retstr.getvalue()
    finally:
        fileStream.close()
        retstr.close()

    # NOTE(review): `.decode` assumes `getvalue()` returned a byte string
    # (Python 2 style StringIO) - confirm against the imported StringIO.
    return output.decode('utf-8')
Exemplo n.º 3
0
def pdf_miner(from_file, to_txt):
    """Extract the text of a PDF into a text file using pdfminer.

    Errors raised while converting are logged at critical level and
    swallowed; errors raised while opening either file propagate.

    Args:
        from_file: object whose `path` attribute names the PDF to read.
        to_txt: object whose `path` attribute names the text file to write.

    Returns:
        None.
    """
    log.debug('trying with pdfminer')
    # `with` guarantees both handles are closed even if conversion fails.
    with codecs.open(from_file.path, mode='rb') as pdf:
        with codecs.open(to_txt.path, mode='wb') as output:
            try:
                resourceman = pdfinterp.PDFResourceManager()
                device = converter.TextConverter(
                    resourceman, output, laparams=layout.LAParams())
                try:
                    interpreter = pdfinterp.PDFPageInterpreter(
                        resourceman, device)
                    for page in pdfpage.PDFPage.get_pages(pdf):
                        interpreter.process_page(page)
                finally:
                    # Close the device while the output file is still open.
                    device.close()
            except Exception as e:
                # Best-effort extraction: record the failure and give up.
                log.critical(e)
Exemplo n.º 4
0
    def scan(self, file_object, options):
        """Scan a PDF for suspicious keys, annotation URIs, streams and text.

        Every object id listed in the document's xref tables is loaded.
        Dictionary objects are checked for auto-action and JavaScript keys
        and for action URIs; stream objects are appended to `self.children`
        as child files (up to the configured limit). When requested, the
        document text is extracted and appended as one more child file.
        Parsing problems are recorded as flags on `file_object` rather than
        raised.

        Args:
            file_object: file being scanned. Its `data` is parsed as the PDF
                and its `flags` list receives the scanner's flags.
            options: dict of scanner options. `extract_text` (default False)
                enables text extraction; `limit` (default 2000) caps how many
                stream objects are extracted.
        """
        extract_text = options.get("extract_text", False)
        file_limit = options.get("limit", 2000)

        self.metadata["total"] = {"objects": 0, "extracted": 0}
        # Object ids already emitted as children, to avoid duplicates when an
        # id appears in more than one xref table.
        extracted_objects = set()

        try:
            with io.BytesIO(file_object.data) as pdf_object:
                parsed_pdf = pdfparser.PDFParser(pdf_object)
                pdf_document = pdfdocument.PDFDocument(parsed_pdf)

                self.metadata.setdefault("annotatedUris", [])
                for xref in pdf_document.xrefs:
                    for object_id in xref.get_objids():
                        self.metadata["total"]["objects"] += 1

                        try:
                            pdf_obj = pdf_document.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for key, value in pdf_obj.items():
                                    if key in ["AA", "OpenAction"]:
                                        file_object.flags.append(f"{self.scanner_name}::auto_action")
                                    if key in ["JS", "Javascript"]:
                                        file_object.flags.append(f"{self.scanner_name}::javascript_embedded")

                                    try:
                                        if key == "A":
                                            uri = value.get("URI")
                                            if (uri is not None and
                                                    uri not in self.metadata["annotatedUris"]):
                                                self.metadata["annotatedUris"].append(uri)

                                    except AttributeError:
                                        # The "A" value has no `.get`;
                                        # nothing to collect from it.
                                        pass

                            # Once the limit is reached keep counting and
                            # flagging objects, but stop extracting streams.
                            if self.metadata["total"]["extracted"] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    child_filename = f"{self.scanner_name}::object_{object_id}"
                                    child_fo = objects.StrelkaFile(data=pdf_obj.get_data(),
                                                                   filename=child_filename,
                                                                   depth=file_object.depth + 1,
                                                                   parent_uid=file_object.uid,
                                                                   root_uid=file_object.root_uid,
                                                                   parent_hash=file_object.hash,
                                                                   root_hash=file_object.root_hash,
                                                                   source=self.scanner_name)
                                    if object_id not in extracted_objects:
                                        self.children.append(child_fo)
                                        extracted_objects.add(object_id)
                                        self.metadata["total"]["extracted"] += 1

                                except TypeError:
                                    file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                                except struct.error:
                                    file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")

                        except ValueError:
                            file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                        except pdftypes.PDFObjectNotFound:
                            file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                        except pdftypes.PDFNotImplementedError:
                            file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                        # Reference PSSyntaxError through psparser, the same
                        # namespace the outer handler below uses.
                        except psparser.PSSyntaxError:
                            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(detect_vertical=True,
                                                char_margin=1.0,
                                                line_margin=0.3,
                                                word_margin=0.3)
                    device = converter.TextConverter(rsrcmgr, retstr,
                                                     codec="utf-8",
                                                     laparams=la_params)
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            file_object.flags.append(f"{self.scanner_name}::text_struct_error")

                    pdf_object_text = retstr.getvalue()
                    child_filename = f"{self.scanner_name}::text"
                    child_fo = objects.StrelkaFile(data=pdf_object_text,
                                                   filename=child_filename,
                                                   depth=file_object.depth + 1,
                                                   parent_uid=file_object.uid,
                                                   root_uid=file_object.root_uid,
                                                   parent_hash=file_object.hash,
                                                   root_hash=file_object.root_hash,
                                                   source=self.scanner_name)
                    self.children.append(child_fo)
                    file_object.flags.append(f"{self.scanner_name}::extracted_text")
                    device.close()
                    retstr.close()

        except IndexError:
            file_object.flags.append(f"{self.scanner_name}::index_error")
        except pdfdocument.PDFEncryptionError:
            file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
        except pdfparser.PDFSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
        except psparser.PSEOF:
            file_object.flags.append(f"{self.scanner_name}::ps_eof")
        except psparser.PSSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
Exemplo n.º 5
0
    def scan(self, data, file, options, expire_at):
        """Scan a PDF for suspicious keys, annotation URIs, streams and text.

        Every object id listed in the document's xref tables is loaded.
        Dictionary objects are checked for auto-action and JavaScript keys
        and for action URIs; stream objects are uploaded to the cache and
        appended to `self.files` (up to the configured limit). When
        requested, the document text is extracted and uploaded as one more
        file. Parsing problems are recorded in `self.flags` rather than
        raised.

        Args:
            data: bytes of the PDF to scan.
            file: not referenced by this method.
            options: dict of scanner options. `extract_text` (default False)
                enables text extraction; `limit` (default 2000) caps how many
                stream objects are extracted.
            expire_at: passed through to `self.upload_to_cache` for every
                uploaded chunk.
        """
        extract_text = options.get('extract_text', False)
        file_limit = options.get('limit', 2000)

        self.event['total'] = {'objects': 0, 'extracted': 0}
        # Object ids already emitted as files, to avoid duplicates when an
        # id appears in more than one xref table.
        extracted_objects = set()

        try:
            with io.BytesIO(data) as pdf_io:
                parsed = pdfparser.PDFParser(pdf_io)
                pdf = pdfdocument.PDFDocument(parsed)

                self.event.setdefault('annotated_uris', [])
                for xref in pdf.xrefs:
                    for object_id in xref.get_objids():
                        self.event['total']['objects'] += 1

                        try:
                            pdf_obj = pdf.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for key, value in pdf_obj.items():
                                    if key in ['AA', 'OpenAction']:
                                        self.flags.append('auto_action')
                                    if key in ['JS', 'Javascript']:
                                        self.flags.append(
                                            'javascript_embedded')

                                    try:
                                        if key == 'A':
                                            uri = value.get('URI')
                                            # Skip actions without a URI so
                                            # None is never recorded.
                                            if (uri is not None and
                                                    uri not in self.event[
                                                        'annotated_uris']):
                                                self.event[
                                                    'annotated_uris'].append(
                                                        uri)

                                    except AttributeError:
                                        # The 'A' value has no `.get`;
                                        # nothing to collect from it.
                                        pass

                            # Once the limit is reached keep counting and
                            # flagging objects, but stop extracting streams.
                            if self.event['total']['extracted'] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    if object_id not in extracted_objects:
                                        extract_file = strelka.File(
                                            name=f'object_{object_id}',
                                            source=self.name,
                                        )

                                        for c in strelka.chunk_string(
                                                pdf_obj.get_data()):
                                            self.upload_to_cache(
                                                extract_file.pointer,
                                                c,
                                                expire_at,
                                            )

                                        self.files.append(extract_file)
                                        self.event['total']['extracted'] += 1
                                        extracted_objects.add(object_id)

                                # The per-object flags below are f-strings so
                                # the object id is actually interpolated.
                                except TypeError:
                                    self.flags.append(
                                        f'type_error_{object_id}')
                                except struct.error:
                                    self.flags.append(
                                        f'struct_error_{object_id}')

                        except ValueError:
                            self.flags.append(f'value_error_{object_id}')
                        except pdftypes.PDFObjectNotFound:
                            self.flags.append(f'object_not_found_{object_id}')
                        except pdftypes.PDFNotImplementedError:
                            self.flags.append(
                                f'not_implemented_error_{object_id}')
                        except psparser.PSSyntaxError:
                            self.flags.append(f'ps_syntax_error_{object_id}')

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(
                        detect_vertical=True,
                        char_margin=1.0,
                        line_margin=0.3,
                        word_margin=0.3,
                    )
                    device = converter.TextConverter(
                        rsrcmgr,
                        retstr,
                        codec='utf-8',
                        laparams=la_params,
                    )
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    # Iterate pages from the BytesIO stream, not the raw
                    # bytes: the page reader needs a file-like object.
                    for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            self.flags.append('text_struct_error')

                    extract_file = strelka.File(
                        name='text',
                        source=self.name,
                    )
                    for c in strelka.chunk_string(retstr.getvalue()):
                        self.upload_to_cache(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)

                    self.flags.append('extracted_text')
                    device.close()
                    retstr.close()

        except IndexError:
            self.flags.append('index_error')
        except pdfdocument.PDFEncryptionError:
            self.flags.append('encrypted_pdf')
        except pdfparser.PDFSyntaxError:
            self.flags.append('pdf_syntax_error')
        except psparser.PSEOF:
            self.flags.append('ps_eof')
        except psparser.PSSyntaxError:
            self.flags.append('ps_syntax_error')