Python PDFDocument.getobj примеры использования

Язык программирования: Python

Пространство имен/Пакет: pdfminer3.pdfdocument

Класс/Тип: PDFDocument

Метод/Функция: getobj

Примеров на hotexamples.com: 4

Python PDFDocument.getobj - 4 примера найдено. Это лучшие примеры Python кода для pdfminer3.pdfdocument.PDFDocument.getobj, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

PDFDocument(15)

getobj(4)

get_outlines(2)

get_pages(1)

initialize(1)

set_parser(1)

Пример #1

Показать файл

def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract the embedded files of the PDF *fname* into *extractdir*.

    Every object id listed by the document's xref tables is fetched with
    ``doc.getobj``; each dictionary whose ``Type`` is ``LITERAL_FILESPEC``
    is treated as a file specification and its embedded stream is written
    to ``extractdir``.

    Args:
        outfp, objids, pagenos, dumpall, codec: accepted but not used by
            this function.
        fname: path of the PDF file to read.
        password: password handed to ``PDFDocument``.
        extractdir: directory that receives the extracted files.

    Raises:
        PDFValueError: the file reference is not a ``PDFStream`` or its
            ``Type`` is not ``LITERAL_EMBEDDEDFILE``.
        IOError: the destination file already exists.
    """
    def extract1(obj):
        """Write the stream referenced by one file-spec dictionary to disk."""
        # 'UF' is preferred; fall back to 'F' when 'UF' is missing or empty
        # (a plain obj['UF'] raised KeyError before the fallback could run).
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # open() instead of the Python 2-only file() builtin; the context
        # manager closes the handle even if get_data() or write() raises.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The document reads from fp while objects are fetched, so the file has
    # to stay open for the whole scan; ``with`` closes it on errors as well.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return

Пример #2

Показать файл

def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    Args:
        outfp: writable object handed to the dump helpers.
        fname: path of the PDF file to read.
        objids: object ids to dump; each is fetched with ``doc.getobj``
            and passed to ``dumpxml``.
        pagenos: container of zero-based page indexes to dump.
        password: password handed to ``PDFDocument``.
        dumpall: when true, ``dumpallobjs`` is called for the document.
        codec: forwarded to the dump helpers. When truthy, the content
            streams of the selected pages are dumped; otherwise the page
            attributes are dumped.
        extractdir: accepted but not used by this function.

    When none of *objids*, *pagenos* and *dumpall* is set, the trailers
    are dumped with ``dumptrailers``. A final newline is written to
    *outfp* unless *codec* is ``'raw'`` or ``'binary'``.
    """
    # ``with`` guarantees the input file is closed even when one of the
    # dump helpers raises; the original only closed it on success.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    # Written after the input file is closed, as in the original ordering.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return

Пример #3

Показать файл

def extractembedded(fname, password='', extractdir=None, emailsDir=None):
    """Extract the embedded files of the PDF ``emailsDir/fname``.

    Each extracted file is written to *extractdir* under the name
    ``"<fname without extension> <embedded file name>"``. If that path
    already exists, a random number between 1 and 100 is inserted into the
    name until an unused path is found.

    Args:
        fname: name of the PDF file, relative to *emailsDir*.
        password: password handed to ``PDFDocument``.
        extractdir: directory that receives the extracted files.
        emailsDir: directory that contains *fname*.

    Raises:
        PDFValueError: the file reference is not a ``PDFStream`` or its
            ``Type`` is not ``LITERAL_EMBEDDEDFILE``.
    """
    def extract1(obj):
        """Write the stream referenced by one file-spec dictionary to disk."""
        # Only the 'F' entry is consulted for the embedded file's name.
        filename = os.path.basename(obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        file_name = os.path.splitext(fname)[0]
        path = os.path.join(extractdir, file_name + " " + filename)
        while os.path.exists(path):
            path = os.path.join(
                extractdir,
                file_name + " " + str(randint(1, 100)) + " " + filename)
            # sys.stderr.write behaves the same on Python 2 and 3, unlike
            # the Python 2-only ``print >>`` statement form.
            sys.stderr.write("file exists, create random name %s\n" % path)
        # open() instead of the Python 2-only file() builtin; the context
        # manager closes the handle even if get_data() or write() raises.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # The original never closed the input file; ``with`` closes it on both
    # the success and the error path.
    with open(os.path.join(emailsDir, fname), 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            # Exact type match on purpose: the original skips
            # PDFXRefFallback tables. NOTE(review): isinstance() would also
            # accept subclasses of PDFXRef and could change which tables
            # are scanned - confirm before relaxing this check.
            if type(xref) is PDFXRef:
                for objid in xref.get_objids():
                    obj = doc.getobj(objid)
                    if isinstance(obj,
                                  dict) and obj.get('Type') is LITERAL_FILESPEC:
                        extract1(obj)
    return

Пример #4

Показать файл

    cmd_args = virtual_environment(parser)
    # NOTE(review): the pdfminer3 imports are deferred until after
    # virtual_environment() has run; presumably that call makes the package
    # importable - confirm before moving them to module level.
    from pdfminer3.pdfdocument import PDFDocument
    from pdfminer3.pdftypes import PDFObjectNotFound
    from pdfminer3.pdfparser import PDFParser, PDFStream
    print(cmd_args.file_name)
    # Directory that receives one .dat file per stream object.
    out_dir = '%s.pdfminer_out' % cmd_args.file_name
    # The document reads from the input file while objects are fetched, so
    # the file stays open for the whole loop; ``with`` closes it afterwards
    # (the original never closed it).
    with open(cmd_args.file_name, "rb") as input_file:
        parsed = PDFDocument(PDFParser(input_file))
        # Start from an empty output directory; a missing one is fine.
        try:
            shutil.rmtree(out_dir)
        except FileNotFoundError:
            pass
        os.mkdir(out_dir)
        # A set removes object ids listed by more than one xref table.
        obj_ids = {obj_id for xref in parsed.xrefs
                   for obj_id in xref.get_objids()}
        for obj_id in obj_ids:
            try:
                obj = parsed.getobj(obj_id)
            except PDFObjectNotFound:
                continue
            # Only stream objects are written out.
            if not isinstance(obj, PDFStream):
                continue
            print('%s' % obj)
            obj.decode()
            # NOTE(review): ``length`` is not used in the visible code; kept
            # because the rest of the function is not shown here.
            length = obj.attrs.get('Length', '')
            # Context manager closes the output file even if write() raises.
            with open('%s/pdf_%07d_0.dat' % (out_dir, obj_id),
                      'wb') as output_file:
                output_file.write(obj.data)
    toc = time()
    print('\nExecution time: %s sec.' % (toc - tic))