예제 #1
0
def pdf_to_text(pdf):
    """Return the text extracted from a PDF.

    Warning: this function can be slow (up to about 300 ms per page) and it
    does not perform optical character recognition.

    Args:
        pdf: bytestring holding the contents of a PDF file.

    Returns:
        str of text extracted from the `pdf` contents.
    """
    # In-memory buffers: the parser reads from one, the converter writes to
    # the other.
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()

    # Link parser and document to each other, then initialize the document
    # with an empty password.
    parser = pdfparser.PDFParser(in_buffer)
    doc = pdfparser.PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password='')
    rsrcmgr = pdfinterp.PDFResourceManager()
    laparams = layout.LAParams()

    # The text converter renders every interpreted page into out_buffer.
    device = converter.TextConverter(rsrcmgr,
                                     outfp=out_buffer,
                                     codec='utf-8',
                                     laparams=laparams)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)

    try:
        for page in doc.get_pages():
            interpreter.process_page(page)
        # The return value is computed before the finally clause runs.
        return out_buffer.getvalue()
    finally:
        # Release the converter even when a page fails to parse.
        device.close()
예제 #2
0
파일: jsonify.py 프로젝트: stpierre/crashes
    def _parse_pdf(self, stream):
        """Parse a single PDF and return the values of the configured fields.

        Each page is laid out with pdfminer and handed to every field's
        ``update``. Parsing stops early when a short-circuiting field is
        still empty once its ``maxpage`` has been reached.

        Args:
            stream: open PDF stream; its ``name`` is used in log messages.

        Returns:
            dict mapping each field's ``name`` to its ``value`` (whatever the
            fields held when parsing finished, was aborted, or failed).
        """
        LOG.info("Parsing accident report data from %s" % stream.name)
        fields = self._get_fields()

        def collect():
            # Snapshot of the current field values, keyed by field name.
            return dict((f.name, f.value) for f in fields)

        try:
            # so much pdfminer boilerplate....
            parser = pdfparser.PDFParser(stream)
            document = pdfdocument.PDFDocument(parser)
            rsrcmgr = pdfinterp.PDFResourceManager()
            aggregator = pdfconverter.PDFPageAggregator(
                rsrcmgr, laparams=pdflayout.LAParams())
            interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, aggregator)
        except psparser.PSException as err:
            LOG.warn("Parsing %s failed, skipping: %s" % (stream.name, err))
            return collect()

        all_pages = pdfpage.PDFPage.create_pages(document)
        for page_num, page in enumerate(all_pages, 1):
            LOG.debug("Parsing page %s" % page_num)

            interpreter.process_page(page)
            page_layout = aggregator.get_result()

            for field in fields:
                field.update(page_layout, page=page_num)
                # Nothing to abort on if the field has a value, does not
                # short-circuit, or may still be found on a later page.
                if (field.value or not field.short_circuit
                        or not page_num >= field.maxpage):
                    continue
                LOG.warn("No %s found in %s, aborting parsing" %
                         (field.name, stream.name))
                return collect()

        return collect()
예제 #3
0
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    """Convert a PDF file to cleaned-up plain text via pdfminer's HTML output.

    Args:
        nome_arquivo: path of the PDF file to read.
        semaforo: not used by this function; kept for interface compatibility.
        profundidade: not used by this function; kept for interface
            compatibility.
        pagina: optional page number. When given, only the slice
            ``paginas[pagina - 1:pagina + 1]`` is processed (at most two
            pages; presumably ``pagina`` is 1-based -- confirm with callers).
            When None, every page is processed.

    Returns:
        A ``(tipo, texto)`` tuple. ``tipo`` is ``'pdf'`` or ``'pdf_parte'``,
        with a ``'_defeito'`` suffix when at least one page failed to be
        interpreted. ``texto`` is the cleaned text, or None when the document
        has more than ``MAX_PAGINAS_PDF`` pages (``tipo`` is then
        ``'pdf_longo'``) or when parsing raised ``AssertionError`` or
        ``PDFSyntaxError``.
    """
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()

    dispositivo = pdf_converter.HTMLConverter(gerenciador, str_saida,
                                              codec='utf-8',
                                              laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)

    if pagina is None:
        tipo_aux = 'pdf'
        # An end of None keeps the final page; the previous end of -1
        # silently dropped it (and yielded nothing for one-page documents).
        pg_inicio, pg_fim = 0, None
    else:
        tipo_aux = 'pdf_parte'
        pg_inicio, pg_fim = pagina - 1, pagina + 1

    tipo = tipo_aux

    # Opened right before the try block whose finally clause closes it.
    arquivo_pdf = open(nome_arquivo, 'rb')
    try:
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)

        paginas = list(documento.get_pages())

        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None

        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # Best effort: keep the pages that did convert and report
                # the damage through the returned type.
                tipo = tipo_aux + '_defeito'

        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        return tipo, None
    finally:
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()

    removido_tags = _converte_html(dados_html, 'div')
    # Drop spaces that ended up right before a "ç".
    removido_espacos_desnecessarios = re.sub(r" +ç", "ç", removido_tags,
                                             flags=re.IGNORECASE)
    # Remove "Page N" markers together with the number line that follows.
    sem_numero_pagina = re.sub(r"\nPage [0-9]+ *\n[0-9]+ *\n", "\n",
                               removido_espacos_desnecessarios)
    # Collapse runs of spaces/tabs, then runs of (possibly padded) newlines.
    espacos_normalizados = re.sub(r'[ \t]+', ' ', sem_numero_pagina)
    texto_final = re.sub(r'( *\n)+', '\n', espacos_normalizados)

    return tipo, texto_final
예제 #4
0
def get_ltpages(infile, caching=True):
    """Run pdfminer layout analysis on every page of a PDF.

    Args:
        infile: passed as the first argument to ``PDFPage.get_pages``
            (presumably a binary file object -- confirm with callers).
        caching: forwarded to both ``PDFResourceManager`` and
            ``PDFPage.get_pages``.

    Returns:
        list with one ``device.get_result()`` value per page, in page order.
    """
    rm = pdfinterp.PDFResourceManager(caching=caching)
    laparams = layout.LAParams(detect_vertical=False)
    device = converter.PDFPageAggregator(rm, laparams=laparams)
    interpreter = pdfinterp.PDFPageInterpreter(rm, device)
    ltpages = []
    try:
        for page in PDFPage.get_pages(infile, caching=caching):
            interpreter.process_page(page)
            ltpages.append(device.get_result())
    finally:
        # Close the device even if a page raises while being processed.
        device.close()
    return ltpages
예제 #5
0
    def _parse_pages(self, document):
        """Return the acts extracted from each page of a BORME PDF.

        Works in two passes: every page is first laid out and reduced to raw
        acts, then each page's raw acts are parsed with access to the raw
        acts of all following pages and to the last act already parsed.

        Args:
            document: pdfminer document handed to ``PDFPage.create_pages``.

        Returns:
            list with one ``self._parse_acts`` result per page, in page order.
        """
        resource_manager = pdfinterp.PDFResourceManager()
        # Layout parameter notes:
        # - Margins are not absolute lengths; they are proportions relative
        #   to the size of the characters in question.
        # - char_margin: two text chunks closer than this are considered
        #   continuous and are grouped into one.
        # - word_margin: blanks may have to be inserted when two words are
        #   further apart than this, because a gap between words may be
        #   expressed only through positioning rather than a space character.
        # - line_margin: two lines closer than this are grouped into a text
        #   box, a rectangular area holding a "cluster" of text portions.
        # Values tried before: 6.0 --> all without one; 8.0.
        params = layout.LAParams(char_margin=14.0)
        device = converter.PDFPageAggregator(resource_manager, laparams=params)
        interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

        # Materialize the page list before any page is interpreted.
        pdf_pages = list(pdfpage.PDFPage.create_pages(document))

        # First pass: raw acts per page.
        raw_pages = []
        for index, pdf_page in enumerate(pdf_pages):
            interpreter.process_page(pdf_page)
            raw_acts = self._parse_raw_acts(device.get_result(), index == 0)
            raw_pages.append(raw_acts)
            self._log.debug("Page number: %i Acts: %i" % (index + 1,
                                                          len(raw_acts)))

        # Second pass: parse each page with look-ahead and look-behind.
        pages = []
        for index, raw_acts in enumerate(raw_pages):
            # Empty for the last page.
            following_pages = raw_pages[index + 1:]
            if pages and pages[-1]:
                previous_act = pages[-1][-1]
            else:
                previous_act = None
            pages.append(
                self._parse_acts(raw_acts, following_pages, previous_act))
        return pages
예제 #6
0
def pdf_miner(from_file, to_txt):
    """Extract the text of a PDF into a text file using pdfminer.

    Any exception raised while converting is logged at critical level and
    swallowed; errors opening either file still propagate to the caller.

    Args:
        from_file: object whose ``path`` attribute names the PDF to read.
        to_txt: object whose ``path`` attribute names the file to write.

    Returns:
        None.
    """
    log.debug('trying with pdfminer')
    # The context managers close both files on success and on failure.
    with codecs.open(from_file.path, mode='rb') as pdf:
        with codecs.open(to_txt.path, mode='wb') as output:
            try:
                resourceman = pdfinterp.PDFResourceManager()
                device = converter.TextConverter(
                    resourceman, output, laparams=layout.LAParams())
                try:
                    interpreter = pdfinterp.PDFPageInterpreter(
                        resourceman, device)
                    for page in pdfpage.PDFPage.get_pages(pdf):
                        interpreter.process_page(page)
                finally:
                    # Close the device while its output file is still open.
                    device.close()
            except Exception as e:
                log.critical(e)
예제 #7
0
def doPDF(url):
    """Return the concatenated text of a PDF's top-level text layout items.

    Args:
        url: passed directly to ``open()``, so despite its name this is a
            local file path.

    Returns:
        The ``get_text()`` results of every top-level ``LTText`` item of
        every page, joined in page order; an empty string if there are none.
    """
    # The context manager closes the file once all pages have been read.
    with open(url, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        resource_manager = pdfinterp.PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

        # Collect the pieces and join once instead of repeated string +=.
        chunks = []
        try:
            for page in doc.get_pages():
                interpreter.process_page(page)
                page_layout = device.get_result()
                chunks.extend(item.get_text() for item in page_layout
                              if isinstance(item, LTText))
        finally:
            device.close()
    return ''.join(chunks)
예제 #8
0
파일: scan_pdf.py 프로젝트: zachsis/strelka
    def scan(self, file_object, options):
        """Scan a PDF for risky keys, annotation URIs, streams and text.

        Walks every object id exposed by the document's xrefs. Dictionary
        objects are checked for auto-action (``AA``/``OpenAction``) and
        JavaScript (``JS``/``Javascript``) keys and for ``A`` entries that
        carry a ``URI``. ``PDFStream`` objects are emitted as child files
        until ``limit`` of them have been extracted. Optionally the document
        text is extracted and emitted as one more child file.

        Args:
            file_object: file being scanned. Its ``data``, ``depth``, ``uid``,
                ``root_uid``, ``hash`` and ``root_hash`` attributes are read
                and its ``flags`` list is appended to.
            options: scanner options. ``extract_text`` (default False)
                enables text extraction; ``limit`` (default 2000) caps the
                number of extracted stream objects.

        Returns:
            None. Results are recorded in ``self.metadata``,
            ``self.children`` and ``file_object.flags``.
        """
        extract_text = options.get("extract_text", False)
        file_limit = options.get("limit", 2000)

        self.metadata["total"] = {"objects": 0, "extracted": 0}
        extracted_objects = set()

        try:
            with io.BytesIO(file_object.data) as pdf_object:
                parsed_pdf = pdfparser.PDFParser(pdf_object)
                pdf_document = pdfdocument.PDFDocument(parsed_pdf)

                self.metadata.setdefault("annotatedUris", [])
                for xref in pdf_document.xrefs:
                    for object_id in xref.get_objids():
                        self.metadata["total"]["objects"] += 1

                        try:
                            pdf_obj = pdf_document.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for (key, value) in pdf_obj.items():
                                    if key in ["AA", "OpenAction"]:
                                        file_object.flags.append(f"{self.scanner_name}::auto_action")
                                    if key in ["JS", "Javascript"]:
                                        file_object.flags.append(f"{self.scanner_name}::javascript_embedded")

                                    try:
                                        if key == "A":
                                            uri = value.get("URI")
                                            if (uri is not None and
                                                    uri not in self.metadata["annotatedUris"]):
                                                self.metadata["annotatedUris"].append(uri)

                                    except AttributeError:
                                        # The "A" value has no .get(), so
                                        # there is no URI to record.
                                        pass

                            # Objects keep being counted after the limit is
                            # reached, but no further streams are extracted.
                            if self.metadata["total"]["extracted"] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    # Check for duplicates first so the
                                    # stream payload is only read when it
                                    # will actually be emitted.
                                    if object_id not in extracted_objects:
                                        child_filename = f"{self.scanner_name}::object_{object_id}"
                                        child_fo = objects.StrelkaFile(data=pdf_obj.get_data(),
                                                                       filename=child_filename,
                                                                       depth=file_object.depth + 1,
                                                                       parent_uid=file_object.uid,
                                                                       root_uid=file_object.root_uid,
                                                                       parent_hash=file_object.hash,
                                                                       root_hash=file_object.root_hash,
                                                                       source=self.scanner_name)
                                        self.children.append(child_fo)
                                        extracted_objects.add(object_id)
                                        self.metadata["total"]["extracted"] += 1

                                except TypeError:
                                    file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                                except struct.error:
                                    file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")

                        except ValueError:
                            file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                        except pdftypes.PDFObjectNotFound:
                            file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                        except pdftypes.PDFNotImplementedError:
                            file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                        # Use the same psparser name as the outer handler
                        # at the end of this method.
                        except psparser.PSSyntaxError:
                            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(detect_vertical=True,
                                                char_margin=1.0,
                                                line_margin=0.3,
                                                word_margin=0.3)
                    device = converter.TextConverter(rsrcmgr, retstr,
                                                     codec="utf-8",
                                                     laparams=la_params)
                    try:
                        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                        for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                            try:
                                interpreter.process_page(page)

                            except struct.error:
                                file_object.flags.append(f"{self.scanner_name}::text_struct_error")

                        pdf_object_text = retstr.getvalue()
                    finally:
                        # Release the converter and buffer on errors too.
                        device.close()
                        retstr.close()

                    child_filename = f"{self.scanner_name}::text"
                    child_fo = objects.StrelkaFile(data=pdf_object_text,
                                                   filename=child_filename,
                                                   depth=file_object.depth + 1,
                                                   parent_uid=file_object.uid,
                                                   root_uid=file_object.root_uid,
                                                   parent_hash=file_object.hash,
                                                   root_hash=file_object.root_hash,
                                                   source=self.scanner_name)
                    self.children.append(child_fo)
                    file_object.flags.append(f"{self.scanner_name}::extracted_text")

        except IndexError:
            file_object.flags.append(f"{self.scanner_name}::index_error")
        except pdfdocument.PDFEncryptionError:
            file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
        except pdfparser.PDFSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
        except psparser.PSEOF:
            file_object.flags.append(f"{self.scanner_name}::ps_eof")
        except psparser.PSSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
예제 #9
0
    def scan(self, data, file, options, expire_at):
        """Scan a PDF for risky keys, annotation URIs, streams and text.

        Walks every object id exposed by the document's xrefs. Dictionary
        objects are checked for auto-action (``AA``/``OpenAction``) and
        JavaScript (``JS``/``Javascript``) keys and for ``A`` entries that
        carry a ``URI``. ``PDFStream`` objects are uploaded as extracted
        files until ``limit`` of them have been extracted. Optionally the
        document text is extracted and uploaded as one more file.

        Args:
            data: raw PDF bytes.
            file: not used by this method.
            options: scanner options. ``extract_text`` (default False)
                enables text extraction; ``limit`` (default 2000) caps the
                number of extracted stream objects.
            expire_at: forwarded unchanged to ``self.upload_to_cache``.

        Returns:
            None. Results are recorded in ``self.event``, ``self.flags`` and
            ``self.files``.
        """
        extract_text = options.get('extract_text', False)
        file_limit = options.get('limit', 2000)

        self.event['total'] = {'objects': 0, 'extracted': 0}
        extracted_objects = set()

        try:
            with io.BytesIO(data) as pdf_io:
                parsed = pdfparser.PDFParser(pdf_io)
                pdf = pdfdocument.PDFDocument(parsed)

                self.event.setdefault('annotated_uris', [])
                for xref in pdf.xrefs:
                    for object_id in xref.get_objids():
                        self.event['total']['objects'] += 1

                        try:
                            pdf_obj = pdf.getobj(object_id)
                            if isinstance(pdf_obj, dict):
                                for (key, value) in pdf_obj.items():
                                    if key in ['AA', 'OpenAction']:
                                        self.flags.append('auto_action')
                                    if key in ['JS', 'Javascript']:
                                        self.flags.append(
                                            'javascript_embedded')

                                    try:
                                        if key == 'A':
                                            uri = value.get('URI')
                                            # Skip actions without a URI so
                                            # None is never recorded.
                                            if (uri is not None and
                                                    uri not in self.event[
                                                        'annotated_uris']):
                                                self.event[
                                                    'annotated_uris'].append(
                                                        uri)

                                    except AttributeError:
                                        # The 'A' value has no .get(), so
                                        # there is no URI to record.
                                        pass

                            # Objects keep being counted after the limit is
                            # reached, but no further streams are extracted.
                            if self.event['total']['extracted'] >= file_limit:
                                continue
                            if isinstance(pdf_obj, pdftypes.PDFStream):
                                try:
                                    if object_id not in extracted_objects:
                                        extract_file = strelka.File(
                                            name=f'object_{object_id}',
                                            source=self.name,
                                        )

                                        for c in strelka.chunk_string(
                                                pdf_obj.get_data()):
                                            self.upload_to_cache(
                                                extract_file.pointer,
                                                c,
                                                expire_at,
                                            )

                                        self.files.append(extract_file)
                                        self.event['total']['extracted'] += 1
                                        extracted_objects.add(object_id)

                                # The per-object flags below are f-strings so
                                # the object id is interpolated instead of
                                # emitting a literal "{object_id}".
                                except TypeError:
                                    self.flags.append(
                                        f'type_error_{object_id}')
                                except struct.error:
                                    self.flags.append(
                                        f'struct_error_{object_id}')

                        except ValueError:
                            self.flags.append(f'value_error_{object_id}')
                        except pdftypes.PDFObjectNotFound:
                            self.flags.append(f'object_not_found_{object_id}')
                        except pdftypes.PDFNotImplementedError:
                            self.flags.append(
                                f'not_implemented_error_{object_id}')
                        except psparser.PSSyntaxError:
                            self.flags.append(f'ps_syntax_error_{object_id}')

                if extract_text:
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(
                        detect_vertical=True,
                        char_margin=1.0,
                        line_margin=0.3,
                        word_margin=0.3,
                    )
                    device = converter.TextConverter(
                        rsrcmgr,
                        retstr,
                        codec='utf-8',
                        laparams=la_params,
                    )
                    try:
                        interpreter = pdfinterp.PDFPageInterpreter(
                            rsrcmgr, device)
                        # Pages are read from the stream opened above, not
                        # from the raw bytes.
                        for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                            try:
                                interpreter.process_page(page)

                            except struct.error:
                                self.flags.append('text_struct_error')

                        extracted_text = retstr.getvalue()
                    finally:
                        # Release the converter and buffer on errors too.
                        device.close()
                        retstr.close()

                    extract_file = strelka.File(
                        name='text',
                        source=self.name,
                    )
                    for c in strelka.chunk_string(extracted_text):
                        self.upload_to_cache(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)

                    self.flags.append('extracted_text')

        except IndexError:
            self.flags.append('index_error')
        except pdfdocument.PDFEncryptionError:
            self.flags.append('encrypted_pdf')
        except pdfparser.PDFSyntaxError:
            self.flags.append('pdf_syntax_error')
        except psparser.PSEOF:
            self.flags.append('ps_eof')
        except psparser.PSSyntaxError:
            self.flags.append('ps_syntax_error')