def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None):
    """Dump selected parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: Writable stream that receives the dump.
        fname: Path of the PDF file to read.
        objids: Iterable of object ids to dump with ``dumpxml``; may be empty.
        pagenos: Container of 0-based page indexes to dump; may be empty.
        password: Password handed to ``PDFDocument``.
        dumpall: When true, every object is dumped via ``dumpallobjs``.
        codec: Passed through to the dump helpers.  When set, page content
            streams are dumped; otherwise the page attributes are dumped.
        extractdir: Unused here; accepted so all dump functions share one
            signature.

    When neither ``objids``, ``pagenos`` nor ``dumpall`` is given, the
    document trailers are dumped instead.
    """
    # The context manager guarantees the input file is closed even when
    # parsing or dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        # Force the flag on so the dump is not refused by the document.
        doc.is_extractable = True
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    # Terminate the output with a newline unless a byte-oriented codec is used.
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmark tree) of the PDF ``fname`` to ``outfp``.

    Each outline entry is written as an ``<outline>`` element carrying its
    level and title, followed by a ``<dest>`` element when a destination is
    known and a ``<pageno>`` element (0-based page index) when the
    destination could be mapped to a page.  Nothing is written when the
    document raises ``PDFNoOutlines``.

    Args:
        outfp: Writable text stream that receives the output.
        fname: Path of the PDF file to read.
        password: Password handed to ``PDFDocument``.
        objids, pagenos, dumpall, codec, extractdir: Unused here; accepted so
            all dump functions share one signature.
    """
    # ``open`` replaces the Python 2 only ``file`` builtin, and the context
    # manager closes the handle even if parsing fails.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            doc.is_extractable = True
            # Map each page's object id to its 0-based position.
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc))
            }

            def resolve_dest(dest):
                """Resolve a named or literal destination to its target."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, _se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # Round-trip through UTF-8 so unencodable characters become
                    # XML character references while the result stays ``str``;
                    # formatting raw bytes would emit "b'...'" into the XML.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace').decode('utf-8')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                # A document without outlines is not an error.
                pass
        finally:
            parser.close()
def pdf_to_txt(path):
    """Append the text of every horizontal text box in a PDF to a ``.txt`` file.

    The output file sits next to the input and has the same name with the
    extension replaced by ``.txt``.  It is opened in append mode, so repeated
    runs add to existing content, and it is only created once a page actually
    yields text.

    Args:
        path: Path of the PDF file to convert.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Imported locally so the function does not rely on names the rest of the
    # module may not import.
    import os
    from pdfminer.layout import LAParams
    from pdfminer.pdfdocument import PDFTextExtractionNotAllowed

    # splitext copes with extensions that are not exactly four characters.
    txt_path = os.path.splitext(path)[0] + '.txt'

    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # is_extractable is a boolean attribute, not a method.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(
                'text extraction is not allowed for %r' % path)

        rsrcmgr = PDFResourceManager()
        # The aggregator (not the bare PDFDevice) is what provides
        # get_result(); layout parameters are needed for text boxes to be
        # assembled.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout object produced for this page.
            layout = device.get_result()
            chunks = [
                item.get_text() + '\n'
                for item in layout
                if isinstance(item, LTTextBoxHorizontal)
            ]
            if chunks:
                # One open per page instead of one per text box; the stream
                # does the UTF-8 encoding, so str is written directly.
                with open(txt_path, 'a', encoding='utf-8') as out:
                    out.writelines(chunks)
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None, extractdir=None):
    """Extract every embedded file of the PDF ``fname`` into ``extractdir``.

    All objects listed in the document's cross-reference tables are scanned;
    each dictionary whose ``Type`` is ``LITERAL_FILESPEC`` is written out
    under its base file name.

    Args:
        fname: Path of the PDF file to read.
        password: Password handed to ``PDFDocument``.
        extractdir: Directory the embedded files are written to; it is joined
            with each file name, so it must be a path.
        outfp, objids, pagenos, dumpall, codec: Unused here; accepted so all
            dump functions share one signature.

    Raises:
        PDFValueError: If a file specification does not reference an
            embedded-file stream.
        IOError: If the target file already exists.
    """
    def extract1(obj):
        """Write the file described by file-specification dict ``obj``."""
        # 'UF' may be absent; fall back to 'F' instead of raising KeyError.
        # basename() drops any directory components from the stored name.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Never overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        # ``open`` replaces the Python 2 only ``file`` builtin; the context
        # manager closes the output even if writing fails.
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())

    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        doc.is_extractable = True
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
def main():
    """Command-line entry point: collect and report data found in PDF files.

    Reads the options from ``parse_args()``, processes either the single file
    or every ``.pdf`` file directly inside the directory given by
    ``args.path``, and prints the requested metadata and result summary via
    ``printout`` / ``print_results``.  Exits with status -1 when the path is
    neither a file nor a directory.  An error while processing one file is
    reported and the remaining files are still processed.
    """
    global OUTFILE, VERBOSE, ENCODING
    printout(BANNER)
    args = parse_args()

    links = set()
    emails = set()
    usernames = set()
    ips = set()
    paths = set()
    softwares = set()
    locations = set()
    img_users = set()
    img_software = set()
    img_locations = set()
    img_serials = set()
    pdf_metadata = []
    img_metadata = []

    # get all input files
    if os.path.isfile(args.path):
        files = [args.path]
    elif os.path.isdir(args.path):
        files = [
            os.path.join(args.path, name)
            for name in os.listdir(args.path)
            if name.endswith('.pdf')
            and os.path.isfile(os.path.join(args.path, name))
        ]
        printout('Files to be processed:', False)
        for h in files:
            # Entries already include the directory; joining it again would
            # print "dir/dir/file.pdf" for relative directories.
            printout(' %s' % h, False)
    else:
        printout('[!] Error: provided path %s is not a valid file or folder' % args.path)
        sys.exit(-1)

    # extract data from all files
    for f in files:
        with open(f, 'rb') as fp:
            try:
                if VERBOSE:
                    printout('* Processing file %s...' % f)
                else:
                    # Blank the line first so a shorter file name does not
                    # leave remnants of the previous progress message.
                    print(' ' * 200, end='\r')
                    print('* Processing file %s...' % f, end='\r')

                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                if not doc.is_extractable:
                    printout('[!] Document %s is set not to be extractable. Trying anyway...' % f)
                    doc.is_extractable = True

                metadata = get_metadata(doc)
                metadata['_filename'] = f
                pdf_metadata.append(metadata)

                if (args.email or args.links or args.ips or args.paths
                        or args.usernames or args.software):
                    xml = get_xml(f)
                    decoded = html.unescape(xml)
                    if args.email:
                        emails.update(retrieve_all(decoded, rex.RE_EMAIL))
                    if args.links:
                        links.update(retrieve_all(decoded, rex.RE_WWW))
                        links.update(urls_in_tags(decoded.splitlines()))
                    if args.ips:
                        ips.update(retrieve_all(decoded, rex.RE_IP))
                    # NOTE(review): every other check uses ``args.paths``;
                    # confirm that ``extract_paths`` is really defined by
                    # parse_args().
                    if args.extract_paths:
                        paths.update(paths_in_tooltips(decoded.splitlines()))
                    if args.usernames or args.software:
                        [u, s] = get_users_sw_from_meta(metadata)
                        usernames.update(u)
                        softwares.update(s)

                if args.images:
                    image_meta = extract_images(doc, store_path=args.store_images, filename=f)
                    img_metadata.append(image_meta)
                    [img_u, img_sw, img_ser, img_loc] = get_users_sw_from_img_meta(image_meta)
                    img_users.update(img_u)
                    img_software.update(img_sw)
                    img_locations.update(img_loc)
                    img_serials.update(img_ser)
            except Exception as ex:
                # Best effort: report the failure and carry on with the next file.
                printout('[!] Error while processing file %s: %s' % (f, ex))
                printout()
                printout(ex, False)

    # now we also retrieve info from the paths structure found
    [u_linux, u_mac, u_windows] = get_info_from_paths(paths)
    usernames.update(u_linux)
    usernames.update(u_mac)
    usernames.update(u_windows)

    # if images were extracted and metadata to be shown, first show img metadata
    if args.metadata and args.images:
        printout('%s %s %s' % ('.' * 31, 'image metadata', '.' * 31))
        printout()
        print_image_metadata(img_metadata)

    # show pdf metadata
    if args.metadata:
        printout('%s %s %s' % ('.' * 32, 'PDF metadata', '.' * 32))
        printout()
        print_metadata(pdf_metadata)

    # print the summary of results
    if args.summary:
        printout('.' * 78 + '\n')
        if args.usernames:
            print_results('* Usernames found', usernames)
        if args.paths:
            print_results('* Paths found', paths)
        if args.ips:
            print_results('* IPs found', ips)
        if args.email:
            print_results('* Emails found', emails)
        if args.links:
            print_results('* Links found', links)
        if args.software:
            print_results('* Software found', softwares)
        if args.images:
            if img_users and args.usernames:
                print_results('* Users in images', img_users)
            if img_software and args.software:
                print_results('* Software in images', img_software)
            if img_locations:
                print_results('* GPS Locations', img_locations)
            if img_serials:
                print_results('* Serial # in images', img_serials)