Пример #1
0
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract every embedded file of the PDF *fname* into *extractdir*.

    Every object listed in the document's xref tables is inspected; each
    dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is written out as a
    file named after the basename of its ``UF`` (or, failing that, ``F``)
    entry.

    ``outfp``, ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are
    accepted but not used by this function.

    Raises:
        PDFValueError: the file specification's ``EF/F`` reference is not
            a ``PDFStream`` or its ``Type`` is not ``LITERAL_EMBEDDEDFILE``.
        IOError: the destination path already exists.
        KeyError: the file specification lacks ``F`` (with no usable
            ``UF``) or ``EF``.
    """
    def extract1(obj):
        # Write the embedded file described by file specification `obj`.
        # 'UF' is optional here: fall back to 'F' when it is absent or
        # empty instead of failing with KeyError.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # Fetch the payload before creating the file so that a failure in
        # get_data() does not leave an empty file behind (which would make
        # the next run stop with "file exists").
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)

    # The context manager closes the input even when extraction raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
Пример #2
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    Args:
        outfp: writable object that receives the dump.
        fname: path of the PDF file to read.
        objids: object ids to dump with ``dumpxml``; may be empty.
        pagenos: zero-based page indices to dump (matched against
            ``enumerate`` over the document's pages); may be empty.
        password: password handed to ``PDFDocument``.
        dumpall: when true, dump every object via ``dumpallobjs``.
        codec: passed through to the dump helpers. For selected pages, a
            truthy codec dumps each content stream, otherwise the page
            attributes are dumped.
        extractdir: accepted but not used by this function.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    document trailers are dumped instead. A trailing newline is written
    unless ``codec`` is ``'raw'`` or ``'binary'``.
    """
    # The context manager closes the input even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
Пример #3
0
def extractembedded(fname, password='', extractdir=None, emailsDir=None):
    """Extract the embedded files of the PDF ``emailsDir/fname``.

    Each dictionary object whose ``Type`` is ``LITERAL_FILESPEC`` is
    written to *extractdir* as ``"<fname without extension> <name>"``,
    where ``<name>`` is the basename of the specification's ``F`` entry.
    When that path already exists, a random number between 1 and 100 is
    inserted into the name until an unused path is found.

    Raises:
        PDFValueError: the ``EF/F`` reference is not a ``PDFStream`` or
            its ``Type`` is not ``LITERAL_EMBEDDEDFILE``.
        KeyError: the file specification lacks ``F`` or ``EF``.
    """
    def extract1(obj):
        # Write the embedded file described by file specification `obj`.
        # Only the 'F' entry is consulted for the name; 'UF' is ignored.
        filename = os.path.basename(obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        file_name = os.path.splitext(fname)[0]
        path = os.path.join(extractdir, file_name + " " + filename)
        # NOTE: only 100 alternative names exist, so this loop cannot
        # terminate once all of them are taken.
        while os.path.exists(path):
            path = os.path.join(
                extractdir,
                file_name + " " + str(randint(1, 100)) + " " + filename)
            # sys.stderr.write behaves the same on Python 2 and 3, unlike
            # the `print >> sys.stderr` statement form.
            sys.stderr.write("file exists, create random name %s\n" % path)
        # Decode the stream before creating the file so that a failure
        # does not leave an empty file behind.
        data = fileobj.get_data()
        with open(path, 'wb') as out:
            out.write(data)

    # The context manager closes the input, which was previously leaked.
    with open(os.path.join(emailsDir, fname), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            # Exact type match on purpose: subclasses of PDFXRef are
            # skipped (the original author wanted to ignore the fallback
            # xref variant), so isinstance() must not be used here.
            if type(xref) is not PDFXRef:
                continue
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
Пример #4
0
    # Fragment of a larger function: `parser` and `tic` are defined before
    # the visible part of this snippet.
    # NOTE(review): the pdfminer3 imports are deferred until after
    # virtual_environment() returns -- presumably that call makes the
    # package importable; confirm against its definition.
    cmd_args = virtual_environment(parser)
    from pdfminer3.pdfdocument import PDFDocument
    from pdfminer3.pdftypes import PDFObjectNotFound
    from pdfminer3.pdfparser import PDFParser, PDFStream
    print(cmd_args.file_name)
    # NOTE(review): input_file is not closed anywhere in this fragment.
    input_file = open(cmd_args.file_name, "rb")
    parsed = PDFDocument(PDFParser(input_file))
    # Start from an empty '<file_name>.pdfminer_out' directory: remove a
    # previous one if present, then create it anew.
    try:
        shutil.rmtree('%s.pdfminer_out' % cmd_args.file_name)
    except FileNotFoundError:
        pass
    os.mkdir('%s.pdfminer_out' % cmd_args.file_name)
    # The set() ensures each object id is visited once even when it is
    # listed by more than one xref.
    for obj_id in set(obj_id for xref in parsed.xrefs
                      for obj_id in xref.get_objids()):
        try:
            obj = parsed.getobj(obj_id)
        except PDFObjectNotFound:
            # Ids that cannot be resolved are skipped.
            continue
        # Only stream objects are written out.
        if not isinstance(obj, PDFStream):
            continue
        print('%s' % obj)
        # presumably decode() populates obj.data, which is written below
        # -- TODO confirm
        obj.decode()
        # NOTE(review): `length` is never used in the visible code.
        length = obj.attrs.get('Length', '')
        # One output file per stream: pdf_<zero-padded object id>_0.dat
        output_file = open(
            '%s.pdfminer_out/pdf_%07d_0.dat' % (cmd_args.file_name, obj_id),
            'wb')
        output_file.write(obj.data)
        output_file.close()
    # Report elapsed time relative to `tic`, set before this fragment.
    toc = time()
    print('\nExecution time: %s sec.' % (toc - tic))