def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
        dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        dumptrailers(outfp, doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def pdf_to_text(page_object):
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    # i is the page number; pages are the items in the document
    for i, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for obj in layout:
            if isinstance(obj, (LTTextBox, LTTextLine)):
                text_content.append(obj.get_text())
    return text_content
def main():
    # Open a PDF file.
    with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print(interpreter.process_page(page))
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print((level, title))
    return 0
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results"""
    result = None
    try:
        # open the pdf file
        fp = open(pdf_doc, "rb")
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument(parser)
        # connect the parser and document objects
        parser.set_document(doc)
        # supply the password for initialization
        doc.initialize(pdf_pwd)
        if doc.is_extractable:
            # apply the function and return the result
            result = fn(doc, *args)
        # close the pdf file
        fp.close()
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
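# A minimal usage sketch for with_pdf; `_count_pages` is a hypothetical
# callback, not part of the original code. Any function taking (doc, *args)
# would do.
def _count_pages(doc):
    from pdfminer.pdfpage import PDFPage
    return sum(1 for _ in PDFPage.create_pages(doc))

num_pages = with_pdf('example.pdf', _count_pages, '')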
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False,
                    codec=None, extractdir=None):
    def extract1(obj):
        filename = os.path.basename(obj['UF'] or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                filename)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                filename)
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        out = open(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return

    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                extract1(obj)
    return
def parse_paragraphs(self, text):
    # Will only work for markdown elements divided by '##' markers
    # or for pdf-like chapters, e.g. \n\n 2 Conclusion \n\n
    lines = text.split('\n')
    headlines = []
    if self.is_pdf:
        with open(self.paper_filename, 'rb') as pdf:
            parser = PDFParser(pdf)
            document = PDFDocument(parser)
            try:
                outlines = document.get_outlines()
                for (level, title, _, _, _) in outlines:
                    if level == 1:
                        headlines.append(title)
            except PDFNoOutlines:
                logging.info(
                    "No outline found -> skipping paragraph search..."
                )
    else:
        # check markdown headlines
        for index, line in enumerate(lines):
            if line.startswith('## '):
                headlines.append(line)
    if len(headlines) > 0:
        self.count_paragraphs(text, lines, headlines)
def extract_pdf(file):
    """ extract the string content of a pdf """
    parser = PDFParser(file)
    document = PDFDocument(parser)
    document.initialize("")
    if not document.is_extractable:
        return -1
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = 'utf-8'
    device = TextConverter(rsrcmgr, retstr, codec=codec, showpageno=False,
                           laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pagenos = set()
    for page in PDFPage.get_pages(file, pagenos, maxpages=0, password="",
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    content = retstr.getvalue()
    return content
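# extract_pdf expects an already-open binary file object; a hedged usage
# sketch assuming 'example.pdf' exists:
with open('example.pdf', 'rb') as fh:
    content = extract_pdf(fh)
    if content != -1:
        print(content[:200])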
def parse(self):
    fp = open(self.pdf, 'rb')
    parser = PDFParser(fp, dbg=self.debug)
    doc = PDFDocument(parser, dbg=self.debug)
    # extract blob of data after EOF (if it exists)
    if doc.found_eof and doc.eof_distance > 3:
        self.bin_blob = parser.read_from_end(doc.eof_distance)
    res = '<pdf>'
    visited = set()  # keep track of the objects already visited
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            if objid == 21 or objid == 67:
                # debug: trace two specific object ids
                print(objid)
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                res += '<object id="' + str(objid) + '">\n'
                res += self.dump(obj)
                res += '\n</object>\n\n'
            except PDFObjectNotFound as e:
                mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                mal_obj = mal_obj.replace('<', '0x3C')
                res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
                self.takenote(self.malformed, 'objects', objid)
            except Exception as e:
                res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, str(e))
    fp.close()
    res += self.dumptrailers(doc)
    res += '</pdf>'
    self.xml = res
    self.errors = doc.errors
    self.bytes_read = parser.BYTES
    return
def dumppdf(fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    res = ""
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            res += dumpxml(obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        res += dumpxml(obj, codec=codec)
                else:
                    res += dumpxml(page.attrs)
    if dumpall:
        res += dumpallobjs(doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        res += dumptrailers(doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        res += '\n'
    return res
def get_toc(pdf_path):
    infile = open(pdf_path, "rb")
    parser = PDFParser(infile)
    document = PDFDocument(parser)
    toc = list()
    for (level, title, dest, a, structelem) in document.get_outlines():
        toc.append((level, title))
    return toc
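# A usage sketch for get_toc, indenting each entry by its outline level
# (pdfminer outline levels start at 1); 'example.pdf' is a placeholder path.
for level, title in get_toc('example.pdf'):
    print('  ' * (level - 1) + title)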
def pdf2metadata(fp):
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    doc.initialize()
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        # metadata holds the raw XMP metadata
    return doc.info  # the "Info" metadata
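# pdf2metadata takes an open binary file object; a usage sketch with a
# placeholder path:
with open('example.pdf', 'rb') as fh:
    print(pdf2metadata(fh))  # prints the "Info" dictionaries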
def loadPDF(library, file_name):
    """adds a paper to the library"""
    fp = open(file_name, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    password = ""
    document.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print("CANT")
        # raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    authors = []    # list of authors
    citations = []  # list of authors that have been cited
    # pages_length = sum(1 for page in document.get_pages())
    for ii, page in enumerate(PDFPage.create_pages(document)):
        print('-' * 99)
        print("page number {}".format(ii))
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for jj, lt_obj in enumerate(layout._objs):
            if jj > 3:
                break
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                cur_line = lt_obj.get_text().encode('ascii', 'ignore')
                match = pattern_ignore.match(cur_line)
                if match is None and len(cur_line) < 200:
                    print(bcolors.OKGREEN + " " + cur_line + bcolors.ENDC)
                else:
                    print(bcolors.FAIL + " " + cur_line[0:150] + bcolors.ENDC)
            else:
                print("PICTURE")
                break
    paper_title = file_name
    paper = library.getPaper(paper_title)
    paper.addAuthorIds(authors)
    paper.addCitationIds(citations)
def print_all_obj(filename):
    with open(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                print(objid, get_obj_type(doc.getobj(objid)))
def proc(self, pdfFp):
    """Get meta-data as available from a PDF document"""
    parser = PDFParser(pdfFp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    doc.initialize()
    self.info = doc.info
    if 'Metadata' in doc.catalog:
        self.metadata = xmp_to_dict(
            resolve1(doc.catalog['Metadata']).get_data()
        )
    self.raw_doc = pdfFp.getvalue()
def getDocumentInfoAndAnnotations(pdfFile):
    logger.info("Parsing pdf file " + pdfFile)
    # Open PDF file.
    fp = open(pdfFile, 'rb')
    docInfo = None
    docAnnotations = []
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    document.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pageNum = 0
    for page in PDFPage.create_pages(document):
        pageNum += 1
        interpreter.process_page(page)
        if page.annots:
            try:
                if isinstance(page.annots, list):
                    annots = page.annots
                else:
                    annots = page.annots.resolve()
                for annot in annots:
                    if isinstance(annot, PDFObjRef):
                        annot = annot.resolve()
                    if 'Subj' in annot:
                        if annot['Subj'] == 'Sticky Note' and docInfo is None:
                            logger.debug('DOC INFO ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                            docInfo = annot['Contents']
                        elif annot['Subj'] == 'Comment on Text':
                            logger.debug('COMMENT ON TEXT ' + annot['Subj'] + ' Contents=' + annot['Contents'])
                            contents = annot['Contents']
                            docAnnotations.append(str(pageNum) + ':' + contents)
                        else:
                            logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] + ' Contents=' + annot['Contents'])
            except Exception as e:
                logger.error("error getting annotation")
                logger.exception(e)
                # move file to error
                os.rename(pdfFile, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(pdfFile))
def pdf_from_resource(resource):
    """
    Builds PDF mining objects from input data.
    This function attempts to open a PDF file for processing.
    """
    parser = PDFParser(resource)
    document = PDFDocument()
    parser.set_document(document)
    document.set_parser(parser)
    document.initialize()
    return document
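# pdf_from_resource uses the old pdfminer API (separate set_document /
# set_parser / initialize calls). A hedged equivalent for newer pdfminer.six
# releases, where the PDFDocument constructor wires the parser and document
# together:
def pdf_from_resource_new(resource):
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    parser = PDFParser(resource)
    return PDFDocument(parser)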
def parse(filename, maxlevel):
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    outlines = doc.get_outlines()
    for (level, title, dest, a, se) in outlines:
        if level <= maxlevel:
            title_words = title.encode('utf8') \
                               .replace('\n', '') \
                               .split()
            title = ' '.join(title_words)
            print('<h{level}>{title}</h{level}>'
                  .format(level=level, title=title))
def load_document(self, _file, password=""):
    """turn the file into a PDFMiner document"""
    log.info("loading document...")
    parser = module_parser(_file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)
    if not doc.is_extractable:
        raise ValueError("PDF text extraction not allowed")
    return doc
def Parse_PDF(self):
    def parse_lt_objs(lt_objs, page_number, text=[]):
        """Iterate through the list of LT* objects and capture the text or
        image data contained in each"""
        text_content = []
        page_text = {}
        for lt_obj in lt_objs:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # text, so arrange it logically based on its column width
                text_content.append(lt_obj.get_text())
            elif isinstance(lt_obj, LTFigure):
                # LTFigure objects are containers for other LT* objects,
                # so recurse through the children
                text_content.append(parse_lt_objs(lt_obj, page_number, text_content))
        for k, v in sorted([(key, value) for (key, value) in page_text.items()]):
            # sort the page_text hash by the keys (x0,x1 values of the bbox),
            # which produces a top-down, left-to-right sequence of related columns
            text_content.append(''.join(v))
        return '\n'.join(text_content)

    fp = open(self.filePath, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    try:
        document.initialize('')
    except:
        pass
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    i = 0
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        self.text_content.append(parse_lt_objs(layout, (i + 1)).strip())
        i += 1
    return self.text_content
def check_pdf_password(pdf, password):
    fp = open(pdf, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    try:
        doc.initialize(password)
        if doc.is_extractable:
            print('')
            print('The PDF Password Is:' + password)
            return True
        else:
            print('exception')
            return False
    except:
        print('\r', end='')
        return False
def convert_file(pdf_file, file_name):
    parser = PDFParser(pdf_file)
    pdf = PDFDocument(parser)
    pdf.initialize("")
    if not pdf.is_extractable:
        raise PDFPage.PDFTextExtractionNotAllowed(
            "Document does not allow text extraction: " + file_name)
    resource = PDFResourceManager()
    laparams = LAParams()
    output = StringIO.StringIO()
    device = TextConverter(resource, output, codec="utf-8", laparams=laparams)
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.create_pages(pdf):
        interpreter.process_page(page)
    return output.getvalue()
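# A hedged usage sketch for convert_file: it takes an open binary file object
# plus a name that is only used in the error message.
with open('report.pdf', 'rb') as fh:
    text = convert_file(fh, 'report.pdf')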
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None, extractdir=None):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    pages = dict((page.pageid, pageno)
                 for (pageno, page) in enumerate(PDFPage.create_pages(doc)))

    def resolve_dest(dest):
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        return dest

    try:
        outlines = doc.get_outlines()
        outfp.write('<outlines>\n')
        for (level, title, dest, a, se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest[0].objid]
            s = e(title).encode('utf-8', 'xmlcharrefreplace')
            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
            if dest is not None:
                outfp.write('<dest>')
                dumpxml(outfp, dest)
                outfp.write('</dest>\n')
            if pageno is not None:
                outfp.write('<pageno>%r</pageno>\n' % pageno)
            outfp.write('</outline>\n')
        outfp.write('</outlines>\n')
    except PDFNoOutlines:
        pass
    parser.close()
    fp.close()
    return
class PDF(list):
    def __init__(self, file, password='', just_text=1):
        self.parser = PDFParser(file)
        self.doc = PDFDocument(self.parser)
        self.parser.set_document(self.doc)
        self.doc.initialize(password)
        if self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO())
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
            for page in PDFPage.create_pages(self.doc):
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Frees lots of non-textual information, such as the fonts and images
        and the objects that were needed to parse the PDF.
        """
        del self.device
        del self.doc
        del self.parser
        del self.resmgr
        del self.interpreter

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:
          :clean: Removes misc cruft, like lots of whitespace.
        """
        if clean:
            return utils.normalise_whitespace(''.join(self))
        else:
            return ''.join(self)
def extractComments(fp):
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")
    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        result = None
        if isinstance(obj, dict):
            # 'Type' is PDFObjRef type
            if 'Type' in obj and obj['Type'].name == 'Page':
                pages.append(objid)
            elif 'C' in obj:
                try:
                    pr = obj['P']
                    pi = pages.index(pr.objid) + 1
                except:
                    pi = -1
                try:
                    result = (fp.name, objid, pi, obj['Subtype'].name,
                              obj['Subj'], obj['T'], obj['Contents'])
                except:
                    # if any of the listed entries do not exist, ignore
                    result = ()
        return result

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                r = extract(objid, obj)
                if r:
                    resultList.append(r)
            except PDFObjectNotFound as e:
                print('not found: %r' % e, file=sys.stderr)
def __init__(self, pdf):
    self.document = pdf
    # initialize parsing parameters
    self.file_pointer = open(self.document.name, 'rb')
    self.parser = PDFParser(self.file_pointer)
    self.pdf_document = PDFDocument(self.parser)
    self.pdf_document.initialize()
    # set resource management
    self.resource_manager = PDFResourceManager()
    self.pdf_device = PDFPageAggregator(self.resource_manager,
                                        laparams=LAParams())
    # set interpreter
    self.interpreter = PDFPageInterpreter(self.resource_manager,
                                          self.pdf_device)
def __init__(self, file, password="", just_text=1):
    self.parser = PDFParser(file)
    self.doc = PDFDocument(self.parser)
    self.parser.set_document(self.doc)
    self.doc.initialize(password)
    if self.doc.is_extractable:
        self.resmgr = PDFResourceManager()
        self.device = TextConverter(self.resmgr, outfp=StringIO())
        self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
        for page in PDFPage.create_pages(self.doc):
            self.append(self.interpreter.process_page(page))
        self.metadata = self.doc.info
    if just_text:
        self._cleanup()
def get_headings(filename):
    os.chdir('..')
    rd.open_location("/PDF", True)
    filename_ = filename[:-14]
    for compare_filename in os.listdir(os.getcwd()):
        if filename_ == compare_filename[:-4]:
            in_file = open(compare_filename, 'rb')
            parse_file = PDFParser(in_file)
            document = PDFDocument(parse_file)
            pages = 0
            for page in PDFPage.get_pages(in_file):
                pages += 1
            headings_list = []
            try:
                for (level, title, dest, a, structelem) in document.get_outlines():
                    headings_list.append((level, title))
                rd.open_location("/program", True)
                return headings_list, pages
            except:
                rd.open_location("/program", True)
                return None, pages
def valid_toc(self, toc):
    with open(str(self._doc), "rb") as pdffile:
        parser = PDFParser(pdffile)
        document = PDFDocument(parser)
        try:
            real_toc = list(document.get_outlines())
        except PDFNoOutlines:
            return len(toc) == 0
        print("TOC from PDF file:", real_toc)
        if len(real_toc) != len(toc):
            print("Incorrect TOC length")
            return False
        for ref, real in zip(toc, real_toc):
            print("Checking", ref)
            if not ref[0] + 1 == real[0]:  # level
                return False
            if not self._is_reference_to_ith_page(real[2][0], ref[1] - 1):  # destination
                return False
            if not ref[2] == real[1]:  # title
                return False
        return True
def extract_contents(self):
    parser = PDFParser(self.fd)
    doc = PDFDocument(parser)
    self.total_pages = self.get_pages_total()
    # materialize the zip so it can be iterated more than once
    self.pages = list(zip(PDFPage.get_pages(self.fd),
                          range(1, self.total_pages)))
    try:
        outlines = doc.get_outlines()
    except PDFNoOutlines:
        # No built-in outlines
        return None
    else:
        # built-in outlines exist
        def search_page_toc(objid):
            for page, pagenum in self.pages:
                if page.pageid == objid:
                    return pagenum
            return 0

        for (level, title, dest, a, se) in outlines:
            if dest is not None:
                pn = search_page_toc(dest[0].objid)
                if pn > 0:
                    self.outlines.append((title, pn))
def __init__(self, *args, **kwargs):
    super(AccountRIB, self).__init__(*args, **kwargs)
    self.parsed_text = b''
    try:
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
            newapi = True
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        self.logger.warning('Please install python-pdfminer to get IBANs')
    else:
        parser = PDFParser(BytesIO(self.doc))
        try:
            if newapi:
                doc = PDFDocument(parser)
            else:
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
        except PDFSyntaxError:
            return
        rsrcmgr = PDFResourceManager()
        out = BytesIO()
        device = TextConverter(rsrcmgr, out)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        if newapi:
            pages = PDFPage.create_pages(doc)
        else:
            doc.initialize()
            pages = doc.get_pages()
        for page in pages:
            interpreter.process_page(page)
        self.parsed_text = out.getvalue()
def get_pdf_totalpage(file):
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    page_count = resolve1(document.catalog['Pages'])['Count']
    return page_count
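# Usage sketch: the page count comes straight from the document catalog
# (resolve1(catalog['Pages'])['Count']), so no page needs to be rendered.
print(get_pdf_totalpage('example.pdf'))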
def main():
    global OUTFILE, VERBOSE, ENCODING
    printout(BANNER)
    args = parse_args()

    links = set()
    emails = set()
    usernames = set()
    ips = set()
    paths = set()
    softwares = set()
    locations = set()
    img_users = set()
    img_software = set()
    img_locations = set()
    img_serials = set()
    pdf_metadata = []
    img_metadata = []

    # get all input files
    if os.path.isfile(args.path):
        files = [args.path]
    elif os.path.isdir(args.path):
        files = [os.path.join(args.path, f) for f in os.listdir(args.path)
                 if os.path.isfile(os.path.join(args.path, f)) and f.endswith('.pdf')]
        printout('Files to be processed:', False)
        for h in files:
            printout(' %s' % os.path.join(args.path, h), False)
    else:
        printout('[!] Error: provided path %s is not a valid file or folder' % args.path)
        sys.exit(-1)

    # extract data from all files
    for f in files:
        with open(f, 'rb') as fp:
            try:
                if VERBOSE:
                    printout('* Processing file %s...' % f)
                else:
                    print(' ' * 200, end='\r')
                    print('* Processing file %s...' % f, end='\r')
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                if not doc.is_extractable:
                    printout('[!] Document %s is set not to be extractable. Trying anyway...' % f)
                    doc.is_extractable = True
                metadata = get_metadata(doc)
                metadata['_filename'] = f
                pdf_metadata.append(metadata)
                if args.email or args.links or args.ips or args.paths or args.usernames or args.software:
                    xml = get_xml(f)
                    decoded = html.unescape(xml)
                    if args.email:
                        emails |= set(retrieve_all(decoded, rex.RE_EMAIL))
                    if args.links:
                        links |= set(retrieve_all(decoded, rex.RE_WWW))
                        links |= set(urls_in_tags(decoded.splitlines()))
                    if args.ips:
                        ips |= set(retrieve_all(decoded, rex.RE_IP))
                    if args.extract_paths:
                        paths |= set(paths_in_tooltips(decoded.splitlines()))
                    if args.usernames or args.software:
                        [u, s] = get_users_sw_from_meta(metadata)
                        usernames |= set(u)
                        softwares |= set(s)
                if args.images:
                    image_meta = extract_images(doc, store_path=args.store_images, filename=f)
                    img_metadata.append(image_meta)
                    [img_u, img_sw, img_ser, img_loc] = get_users_sw_from_img_meta(image_meta)
                    img_users |= set(img_u)
                    img_software |= set(img_sw)
                    img_locations |= set(img_loc)
                    img_serials |= set(img_ser)
            except Exception as ex:
                printout('[!] Error while processing file %s: %s' % (f, ex))
                printout()
                printout(ex, False)

    # now we also retrieve info from the paths structure found
    [u_linux, u_mac, u_windows] = get_info_from_paths(paths)
    usernames |= set(u_linux)
    usernames |= set(u_mac)
    usernames |= set(u_windows)

    # if images were extracted and metadata is to be shown, first show img metadata
    if args.metadata and args.images:
        printout('%s %s %s' % ('.' * 31, 'image metadata', '.' * 31))
        printout()
        print_image_metadata(img_metadata)

    # show pdf metadata
    if args.metadata:
        printout('%s %s %s' % ('.' * 32, 'PDF metadata', '.' * 32))
        printout()
        print_metadata(pdf_metadata)

    # print the summary of results
    if args.summary:
        printout('.' * 78 + '\n')
        if args.usernames:
            print_results('* Usernames found', usernames)
        if args.paths:
            print_results('* Paths found', paths)
        if args.ips:
            print_results('* IPs found', ips)
        if args.email:
            print_results('* Emails found', emails)
        if args.links:
            print_results('* Links found', links)
        if args.software:
            print_results('* Software found', softwares)
        if args.images:
            if img_users and args.usernames:
                print_results('* Users in images', img_users)
            if img_software and args.software:
                print_results('* Software in images', img_software)
            if img_locations:
                print_results('* GPS Locations', img_locations)
            if img_serials:
                print_results('* Serial # in images', img_serials)
def main(argv):
    global Verbose_Flag
    global Use_local_time_for_output_flag
    global testing

    argp = argparse.ArgumentParser(description="extract_pseudo_JSON-from_PDF.py: Extract the pseudo JSON from the end of the thesis PDF file")
    argp.add_argument('-v', '--verbose', required=False, default=False, action="store_true",
                      help="Print lots of output to stdout")
    argp.add_argument('-t', '--testing', default=False, action="store_true",
                      help="execute test code")
    argp.add_argument('-p', '--pdf', type=str, default="test.pdf",
                      help="read PDF file")
    argp.add_argument('-j', '--json', type=str, default="calendar_event.json",
                      help="JSON file for extracted calendar event")
    argp.add_argument('-a', '--acronyms', type=str, default="acronyms.tex",
                      help="acronyms filename")
    argp.add_argument('-l', '--ligatures', default=False, action="store_true",
                      help="leave ligatures rather than replace them")
    args = vars(argp.parse_args(argv))

    Verbose_Flag = args["verbose"]
    filename = args["pdf"]
    if Verbose_Flag:
        print("filename={}".format(filename))

    # output_string = StringIO()
    output_string = BytesIO()
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        # device = HTMLConverter(rsrcmgr, output_string, laparams=LAParams(), layoutmode='normal', codec='utf-8')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    text = output_string.getvalue().decode('UTF-8')
    if Verbose_Flag:
        print("text: {}".format(text))

    # define the marker string
    quad__euro_marker = '€€€€'

    # look for the new start of the For DIVA information
    diva_start = text.find("{0} For DIVA {0}".format(quad__euro_marker))
    if diva_start < 0:
        # if not found, then try the older For DIVA string
        diva_start = text.find("For DIVA")
    if Verbose_Flag:
        print("For DIVA found at diva_start={}".format(diva_start))
    if diva_start >= 0:
        diva_data = text[:]
        diva_data = diva_data[diva_start:]
        diva_start = diva_data.find("{")
        if diva_start >= 0:
            diva_data = diva_data[diva_start:]
            end_block = diva_data.find('”Number of lang instances”:')  # note these are right double quote marks
            if end_block < 0:
                end_block = diva_data.find('"Number of lang instances":')  # note these are straight double quote marks
            if end_block > 0:
                end_block = diva_data.find(',', end_block)
            if end_block > 0:
                dict_string = diva_data[:]
                dict_string = dict_string[:end_block] + '}'
                dict_string = dict_string.replace('\x0c', '')  # remove any new page (form feed) characters
                dict_string = dict_string.replace('”', '"')
                dict_string = dict_string.replace('\n\n', '\n')
                dict_string = dict_string.replace(' \n', '')
                dict_string = dict_string.replace(',}', '}')
                dict_string = dict_string.replace(',\n\n}', '\n}')
                dict_string = dict_string.replace(',\n}', '\n}')
                # fix an error in the early template
                if dict_string.find(',Äddress": ') > 0:
                    print("fix an error in the early template")
                    dict_string = dict_string.replace(',Äddress": ', ',"Address": "')
                    dict_string = dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')
                if not args['ligatures']:
                    dict_string = replace_ligature(dict_string)
                    print("looking for and replacing ligatures")
                if Verbose_Flag:
                    print("dict_string={}".format(dict_string))
                d = json.loads(dict_string)
                if Verbose_Flag:
                    print("d={}".format(d))

                abs_keywords = diva_data[(end_block + 1):]
                abs_keywords = abs_keywords.replace('\x0c', '')
                if Verbose_Flag:
                    print("abs_keywords={}".format(abs_keywords))

                number_of_quad_euros = abs_keywords.count(quad__euro_marker)
                if Verbose_Flag:
                    print("number_of_quad_euros={}".format(number_of_quad_euros))
                abstracts = dict()
                keywords = dict()
                if (number_of_quad_euros % 2) == 1:
                    print("Odd number of markers")
                save_abs_keywords = abs_keywords[:]
                number_of_pairs_of_markers = int(number_of_quad_euros / 2)
                for i in range(0, number_of_pairs_of_markers):
                    abstract_key_prefix = '”Abstract['
                    key_offset = abs_keywords.find(abstract_key_prefix)
                    if key_offset > 0:
                        # found a key for an abstract; get the language code
                        lang_code_start = key_offset + len(abstract_key_prefix)
                        lang_code = abs_keywords[lang_code_start:lang_code_start + 3]
                        quad__euro_marker_start = abs_keywords.find(quad__euro_marker, lang_code_start)
                        if quad__euro_marker_start >= 0:
                            quad__euro_marker_end = abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                            abstracts[lang_code] = abs_keywords[quad__euro_marker_start + 5:quad__euro_marker_end]
                            abs_keywords = abs_keywords[quad__euro_marker_end + 1:]

                abs_keywords = save_abs_keywords[:]
                for i in range(0, number_of_pairs_of_markers):
                    abstract_key_prefix = '”Keywords['
                    key_offset = abs_keywords.find(abstract_key_prefix)
                    if key_offset > 0:
                        # found a key for keywords; get the language code
                        lang_code_start = key_offset + len(abstract_key_prefix)
                        lang_code = abs_keywords[lang_code_start:lang_code_start + 3]
                        quad__euro_marker_start = abs_keywords.find(quad__euro_marker, lang_code_start)
                        if quad__euro_marker_start > 0:
                            quad__euro_marker_end = abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                            keywords[lang_code] = abs_keywords[quad__euro_marker_start + 5:quad__euro_marker_end]
                            keywords[lang_code] = keywords[lang_code].replace('\n', '')  # remove newlines from keywords
                            keywords[lang_code] = keywords[lang_code].strip()  # remove starting and ending white space
                            br_offset = keywords[lang_code].find('<br>')
                            if br_offset >= 0:
                                keywords[lang_code] = keywords[lang_code][br_offset + 4:]
                            abs_keywords = abs_keywords[quad__euro_marker_end + 1:]

                for a in abstracts:
                    print("a={0}, abstract={1}".format(a, abstracts[a]))
                    abstracts[a] = clean_up_abstract(abstracts[a])

                any_acronyms_in_abstracts = False
                for a in abstracts:
                    acronyms_present = check_for_acronyms(abstracts[a])
                    if acronyms_present:
                        any_acronyms_in_abstracts = True
                if any_acronyms_in_abstracts:
                    acronyms_filename = args["acronyms"]
                    print("Acronyms found, getting acronyms from {}".format(acronyms_filename))
                    acronym_dict = get_acronyms(acronyms_filename)
                    if len(acronym_dict) == 0:
                        print("no acronyms found in {}".format(acronyms_filename))
                    else:
                        # entries of the form: acronym_dict[label]={'acronym': acronym, 'phrase': phrase}
                        for a in abstracts:
                            abstracts[a] = spellout_acronyms_in_abstract(acronym_dict, abstracts[a])

                print("abstracts={}".format(abstracts))
                print("keywords={}".format(keywords))
                d['abstracts'] = abstracts
                d['keywords'] = keywords

                output_filename = args["json"]
                if Verbose_Flag:
                    print("output_filename={}".format(output_filename))
                with open(output_filename, 'w', encoding='utf-8') as output_FH:
                    j_as_string = json.dumps(d, ensure_ascii=False)
                    print(j_as_string, file=output_FH)
            else:
                print('No "Number of lang instances" found')
                dict_string = diva_data[:]
                print("initial dict_string={}".format(dict_string))
                dict_string = dict_string.replace('\x0c', '')  # remove any new page (form feed) characters
                dict_string = dict_string.replace('”', '"')
                dict_string = dict_string.replace('\n\n', '\n')
                dict_string = dict_string.replace(' \n', '')
                dict_string = dict_string.replace(',}', '}')
                dict_string = dict_string.replace(',\n\n}', '\n}')
                dict_string = dict_string.replace(',\n}', '\n}')
                # fix an error in the early template
                if dict_string.find(',Äddress": ') > 0:
                    print("fix an error in the early template")
                    dict_string = dict_string.replace(',Äddress": ', ',"Address": "')
                    dict_string = dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')
                if not args['ligatures']:
                    dict_string = replace_ligature(dict_string)
                    print("looking for and replacing ligatures")
                print("dict_string={}".format(dict_string))
                d = json.loads(dict_string)
                print("d={}".format(d))
                output_filename = args["json"]
                if Verbose_Flag:
                    print("output_filename={}".format(output_filename))
                with open(output_filename, 'w', encoding='utf-8') as output_FH:
                    j_as_string = json.dumps(d, ensure_ascii=False)
                    print(j_as_string, file=output_FH)
def parse_assessment_to_excel(assessment_path, database_path):
    utc_now = datetime.utcnow()
    # Make a dictionary where all the parsed values are kept; record the time
    # when parsing was started
    data_dictionary = OrderedDict({"Processed_UTC": utc_now.isoformat()})
    # TODO also add the processed file name
    assessment_file = open(assessment_path, 'rb')
    parser = PDFParser(assessment_file)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        key, value = field.get('T'), field.get('V')
        if debug:
            print('{}: {} -> {}'.format(key, value, type(value)))  # DEBUG
        if isinstance(value, str):
            # Convert the string to unicode; the replaces remove some funny characters
            unicode_value = unicode(
                value.decode("iso-8859-1").replace(
                    u"\xfe\xff\x00", u"").replace(u"\x00", u"").replace(u'\xfe\xff', u"")
            )
            data_dictionary[key] = [unicode_value]
        elif value is None:
            data_dictionary[key] = [u"ei"]
        else:
            data_dictionary[key] = [value.name]
            if value.name == "Off":
                data_dictionary[key] = [u"ei"]
            if value.name == "Yes":
                data_dictionary[key] = [u"jah"]
    assessment_file.close()

    # Create pandas dataframe for exporting data
    data_frame = pandas.DataFrame(data_dictionary)
    if debug:
        print(list(data_frame.columns))  # DEBUG

    if os.path.exists(database_path):
        print("Info - Database file {} already exists, loading previous records".format(database_path))
        existing_data = pandas.read_excel(database_path, index_col=0)  # TODO set first column as index
        if debug:
            print(existing_data)
        # Add to existing data
        data_frame = existing_data.append(data_frame, sort=False)
        # Fix index numbering
        data_frame = data_frame.reset_index(drop=True)
        # Create a backup of the current database with a unique filename
        move_file(database_path, "database_backup",
                  "{:%Y%m%dT%H%M%S}_{}".format(utc_now, uuid.uuid4()))

    # Export to excel and add formatting
    sheet_name = "Hindamised"
    writer = pandas.ExcelWriter(database_path, engine='xlsxwriter')
    data_frame.to_excel(writer, sheet_name, encoding='utf8')
    # Get the sheet to do some formatting
    sheet = writer.sheets[sheet_name]
    # Set default column size; if this does not work you are missing the XlsxWriter module
    first_col = 1
    last_col = len(data_frame.columns)
    width = 25
    sheet.set_column(first_col, last_col, width)
    # Freeze column names and the ID column
    sheet.freeze_panes(1, 1)
    # Apply filter to excel
    first_row = 0
    last_row = len(data_frame)
    sheet.autofilter(first_row, first_col, last_row, last_col)
    # Save the file
    writer.save()
    return data_dictionary
def run_convert_code():
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    textJson = []
    entries = os.listdir('../files/pdf')
    source_file = ''
    text = ''
    date = ''
    # retrieve the path of each pdf document in the folder
    for pdf_path in entries:
        source_file = pdf_path
        try:
            images = convert_from_path('../files/pdf/' + pdf_path,
                                       poppler_path=r'C:\poppler-21.03.0\Library\bin')
            list_images = images[0]
            ocr_dict = pytesseract.image_to_data(list_images, lang='eng', output_type=Output.DICT)
            text1 = " ".join(ocr_dict['text'])
            file = open('../files/pdf/' + pdf_path, 'rb')
            parser = PDFParser(file)
            document = PDFDocument(parser)
            if resolve1(document.catalog['Pages'])['Count'] > 1:
                pil_im1 = images[1]
                ocr_dict1 = pytesseract.image_to_data(pil_im1, lang='eng', output_type=Output.DICT)
                text2 = " ".join(ocr_dict1['text'])
            else:
                text2 = ''
            text = text1 + text2
            jour = re.search("(lundi|mardi|Mercredi|mércredi|jeudi|vendredi|samedi|dimanche)",
                             text, re.IGNORECASE)
            pat = r"\W*([\w]+)"
            n = 3
            groups = re.search(r'{}\W*{}{}'.format(pat * n, str(jour.group(0)), pat * n),
                               text, re.IGNORECASE).groups()
            date_input = str(groups[n:][0]) + '-' + str(groups[n:][1]).lower() + '-' + str(groups[n:][2])
            if len(str(groups[n:][0])) < 2:
                journ = '0' + str(groups[n:][0])
            else:
                journ = str(groups[n:][0])
            # map French month names to their two-digit numbers
            mois_map = {'janvier': '01', 'février': '02', 'mars': '03',
                        'avril': '04', 'mai': '05', 'juin': '06',
                        'juillet': '07', 'août': '08', 'septembre': '09',
                        'octobre': '10', 'novembre': '11', 'décembre': '12'}
            mois = mois_map[str(groups[n:][1]).lower()]
            date = journ + '/' + mois + '/' + str(groups[n:][2])
        except Exception as e:
            print('small problem while extracting the date')
        cas_positifs = re.search(r'(\w+\s+){0,3}sont revenus positifs(\w+\s+){0,3}',
                                 text, re.IGNORECASE)
        if cas_positifs:
            pos_num = str(cas_positifs.group(0))
            cas_positifs_nums = [int(s) for s in pos_num.split() if s.isdigit()]
            if not cas_positifs_nums:
                cas_positifs_nums = [0]
        else:
            cas_positifs_nums = [0]
        cas_importes = re.search(r'(\w+\s+){0,3}cas importés(\w+\s+){0,3}',
                                 text, re.IGNORECASE)
        if cas_importes:
            imp_num = str(cas_importes.group(0))
            cas_importes_nums = [int(s) for s in imp_num.split() if s.isdigit()]
            if not cas_importes_nums:
                cas_importes_nums = [0]
        else:
            cas_importes_nums = [0]
        cas_contacts = re.search(r'(\w+\s+){0,3}cas contacts(\w+\s+){0,3}',
                                 text, re.IGNORECASE)
        if cas_contacts:
            cont_num = str(cas_contacts.group(0))
            cas_contacts_nums = [int(s) for s in cont_num.split() if s.isdigit()]
            if not cas_contacts_nums:
                cas_contacts_nums = [0]
        else:
            cas_contacts_nums = [0]
        tests_realises = re.search(r'(\w+\s+){0,3}tests réalisés(\w+\s+){0,3}',
                                   text, re.IGNORECASE)
        if tests_realises:
            test_num = str(tests_realises.group(0))
            cas_test_nums = [int(s) for s in test_num.split() if s.isdigit()]
            if not cas_test_nums:
                cas_test_nums = [0]
        else:
            cas_test_nums = [0]
        sous_traitement = re.search(r'(\w+\s+){0,3}sous traitement(\w+\s+){0,3}',
                                    text, re.IGNORECASE)
        if sous_traitement:
            trait_num = str(sous_traitement.group(0))
            cas_sous_traitement_nums = [int(s) for s in trait_num.split() if s.isdigit()]
            if not cas_sous_traitement_nums:
                cas_sous_traitement_nums = [0]
        else:
            cas_sous_traitement_nums = [0]
        contacts_suivis = re.search(r'(\w+\s+){0,3}contacts suivis(\w+\s+){0,3}',
                                    text, re.IGNORECASE)
        if contacts_suivis:
            suivi_num = str(contacts_suivis.group(0))
            cas_contacts_suivis_nums = [int(s) for s in suivi_num.split() if s.isdigit()]
            if not cas_contacts_suivis_nums:
                cas_contacts_suivis_nums = [0]
        else:
            cas_contacts_suivis_nums = [0]
        cas_communautaires = re.search(r'(\w+\s+){0,12} communautaire(\w+\s+){0,3}',
                                       text, re.IGNORECASE)
        if cas_communautaires:
            comm_num = str(cas_communautaires.group(0))
            cas_communautaires_nums = [int(s) for s in comm_num.split() if s.isdigit()]
            if not cas_communautaires_nums:
                cas_communautaires_nums = [0]
        else:
            cas_communautaires_nums = [0]
        cas_gueris = re.search(r'(\w+\s+){0,6}négatifs et déclarés guéris(\w+\s+){0,3}',
                               text, re.IGNORECASE)
        if cas_gueris:
            gueris_num = str(cas_gueris.group(0))
            cas_gueris_nums = [int(s) for s in gueris_num.split() if s.isdigit()]
        else:
            cas_gueris_nums = [0]
        cas_deces = re.search(r'(\w+\s+){0,10}décès(\w+\s+){0,10}',
                              text, re.IGNORECASE)
        if cas_deces:
            deces_num = str(cas_deces.group(0))
            cas_deces_nums = [int(s) for s in deces_num.split() if s.isdigit()]
            if not cas_deces_nums:
                cas_deces_nums = [0]
        else:
            cas_deces_nums = [0]

        # the different regions of Senegal: look for a number within 20
        # characters before or after each region name
        region_pattern = (r"(?i)(?:\b{0}\D{{0,20}})([0-9][0-9,]*)[^.,]"
                          r"|([0-9][0-9,]*)[^.,](?:\D{{0,20}}{0})")
        regions = {  # output key -> name as it appears in the text
            'Dakar': 'Dakar', 'Thies': 'Thiès', 'Louga': 'Louga',
            'Diourbel': 'Diourbel', 'Fatick': 'Fatick', 'Kaolack': 'Kaolack',
            'Kaffrine': 'Kaffrine', 'Kolda': 'Kolda', 'Tambacounda': 'Tamba',
            'Ziguinchor': 'Ziguinchor', 'Saint-Louis': 'Saint-Louis',
            'Matam': 'Matam', 'Sedhiou': 'Sédhiou', 'Kedougou': 'Kédougou'}
        localites = {}
        for key, name in regions.items():
            found = re.findall(region_pattern.format(name), text)
            if not found:
                localites[key] = 0
            else:
                localites[key] = int(re.findall(r'\d+', str(found))[0])
        annee_mois = ''
        json_data = {
            pdf_path: {
                'date': date,
                'nouveaux_cas': cas_positifs_nums[0],
                'cas_importes': cas_importes_nums[0],
                'cas_contacts': cas_contacts_nums[0],
                'test_realise': cas_test_nums[0],
                'personne_sous_traitement': cas_sous_traitement_nums[0],
                'cas_communautaires': cas_communautaires_nums[0],
                'nombre_gueris': cas_gueris_nums[0],
                'nombre_deces': cas_deces_nums[0],
                'date_heure_extraction': str(datetime.now()),
                'nom_fichier_source': source_file,
                'localites': localites
            }
        }
        textJson.append(json_data[pdf_path])
    valeurs = set(map(lambda x: x['date'][3:12], textJson))
    new_list = [[y for y in textJson if y['date'][3:12] == x] for x in valeurs]
    for i in new_list:
        doc_name = i[0]['date'][3:12]
        txtFile = doc_name.replace('/', '-')
        annee_mois = doc_name
        if txtFile:
            with open(str(txtFile) + '.json', 'w', encoding='utf-8') as f:
                json.dump(i, f, ensure_ascii=False, indent=4)
    for f in glob.glob('*.json'):
        shutil.move(f, '../files/jsons')
import os
import re
import string

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

filepath = 'C:/Users/lenovo/Desktop/ACL2020'
list1 = os.listdir(filepath)
list_words = []
corpus = []
for i in range(len(list1)):
    outs = ""
    fp = open(filepath + '/' + list1[i], 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            if hasattr(out, 'get_text'):
                outs = out.get_text() + outs
    outs = outs.lower().replace('\n', '')
    english_pu = ['’', '“', '”']
    punctuation_map = dict((ord(char), None) for char in string.punctuation)
    without_punctuation = outs.translate(punctuation_map)  # strip punctuation from the text
def PDF_to_TXT_regex(title):
    print("Title: {}".format(title))
    with open(title, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    import re
    # collect the incubation periods that we find
    for sentence in output_string.getvalue().split(". "):
        if "incubation" in sentence:
            # match integer or decimal day counts, e.g. "5 days" or "5.1 day"
            day = re.findall(r" ((\d{1,2}\.)?\d{1,2}) day[s]?", sentence)
            print(day)
            # record the numbers where we found them
            if len(day) > 0:
                incubations.append(float(day[0][0]))

    print(incubations)
    print(sorted(incubations))

    import matplotlib.pyplot as plt
    bins = [0., 2., 4., 6., 8., 10., 12., 14., 16., 18., 20., 22., 24., 26., 28.]
    fig = plt.figure(figsize=[14, 16])
    plt.rc('font', size=18)
    plt.hist(incubations, bins=bins)
    plt.xlabel('Incubation period')
    plt.ylabel('Frequency/Probability')
    plt.title('Histogram of Coronavirus incubation periods')
    plt.show()
    plt.close()
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yields table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing
    instructions and tries to find rectangles and arrange them in rows, then
    arrange text in the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([list(lttext_to_multilines(obj, page_layout))
                     for obj in page_layout._objs
                     if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout)
                 for obj in page_layout._objs
                 if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
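# A hedged usage sketch for get_pdf_rows: it is a generator over pages, and
# each page yields rows of cells, each cell being a list of strings.
with open('table.pdf', 'rb') as fh:
    data = fh.read()
for page_rows in get_pdf_rows(data):
    for row in page_rows:
        print([' '.join(cell) for cell in row])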
def get_pdf_details(self):
    # Save the pdf locally under a random name; it will be deleted after processing.
    random_string = str(uuid.uuid4())[0:10]
    file_path = os.path.join(BASE_DIR, 'pdf_files', "{}.pdf".format(random_string))
    html_file_path = os.path.join(BASE_DIR, 'pdf_files', "{}.html".format(random_string))
    with open(file_path, 'wb') as f:
        f.write(self.response.content)

    text = ""

    # Usage type 1:
    # Render the pdf as text. Best way to get PDF content, but there are
    # problems with jusText, which does not extract the article as expected.
    with open(file_path, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)
        for page in PDFPage.get_pages(f):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    # wrap each element in a <p> tag so the jusText library
                    # can find the related texts
                    text += "<p>{}</p>".format(element.get_text())
        # End of usage type 1

        # Usage type 2:
        # Render the pdf as html. Not a great way to get PDF content: font
        # sizes, html elements etc. do not render as expected. If fixed, it
        # would work with jusText as expected.
        with open(html_file_path, 'wb') as outf:
            extract_text_to_fp(f, outf, output_type='html')
        with open(html_file_path, 'rb') as f:
            text = " ".join([x.decode().replace('\n', '') for x in f.readlines()])
        # End of usage type 2

    if document.info:
        self.title = document.info[0].get('Title', None)
        if self.title:
            self.title = self.title.decode()

    # jusText raises an exception if the text variable is empty
    if text:
        paragraphs = justext.justext(text, justext.get_stoplist(language='English'))
        content = " ".join([
            paragraph.text for paragraph in paragraphs
            if not paragraph.is_boilerplate and not paragraph.is_heading
            and paragraph.class_type == 'good'
        ])
        self.content = content
        self.raw_content = content

    # Remove redundant files.
    os.unlink(file_path)
    os.unlink(html_file_path)
def get_input_fields(self, source_pdf: str = None, replace_none_value: bool = False) -> dict:
    """Get input fields in the PDF.

    Stores input fields internally so that they can be used without
    parsing the PDF again.

    Parameter `replace_none_value` is for convenience to visualize fields.

    :param source_pdf: source filepath, defaults to None
    :param replace_none_value: if value is None replace it with key name,
        defaults to False
    :return: dictionary of input key values or `None`
    """
    record_fields = {}
    if source_pdf is None and self.active_fields:
        return self.active_fields
    self.switch_to_pdf_document(source_pdf)
    source_parser = PDFParser(self.active_fileobject)
    source_document = PDFDocument(source_parser)
    try:
        fields = resolve1(source_document.catalog["AcroForm"])["Fields"]
    except KeyError:
        self.logger.info('PDF "%s" does not have any input fields.', self.active_pdf)
        return None
    for i in fields:
        field = resolve1(i)
        if field is None:
            continue
        name, value, rect, label = (
            field.get("T"),
            field.get("V"),
            field.get("Rect"),
            field.get("TU"),
        )
        if value is None and replace_none_value:
            record_fields[name.decode("iso-8859-1")] = {
                "value": name.decode("iso-8859-1"),
                "rect": iterable_items_to_int(rect),
                "label": label.decode("iso-8859-1") if label else None,
            }
        else:
            try:
                record_fields[name.decode("iso-8859-1")] = {
                    "value": value.decode("iso-8859-1") if value else "",
                    "rect": iterable_items_to_int(rect),
                    "label": label.decode("iso-8859-1") if label else None,
                }
            except AttributeError:
                self.logger.debug("Attribute error")
                record_fields[name.decode("iso-8859-1")] = {
                    "value": value,
                    "rect": iterable_items_to_int(rect),
                    "label": label.decode("iso-8859-1") if label else None,
                }

    self.active_fields = record_fields if record_fields else None
    return record_fields
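# A minimal usage sketch for get_input_fields, assuming `pdf` is an instance
# of the (unshown) class this method belongs to:
fields = pdf.get_input_fields("form.pdf", replace_none_value=True)
if fields:
    for name, info in fields.items():
        print(name, info["value"], info["rect"])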
def cate(load_path, keywords, mode, save_path, category, win):
    new_win = None
    canvas = None
    fill_line = None
    if win is not None:
        new_win = Toplevel(win)
        new_win.title(category + ' classification progress')
        new_win.geometry('300x20')
        canvas = Canvas(new_win, width=300, height=20, bg="white")
        canvas.place(x=0, y=0)
        fill_line = canvas.create_rectangle(0, 0, 300, 20, width=0, fill="green")
    pdfs = os.listdir(load_path)
    n = 300 / len(pdfs)
    for pdf in tqdm(pdfs):
        with open(load_path + pdf, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            else:
                # Create a PDF resource manager object to store shared resources.
                rsrcmgr = PDFResourceManager()
                # Set the parameters for layout analysis.
                laparams = LAParams()
                # Create a PDF device object.
                # device = PDFDevice(rsrcmgr)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                flag = False
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    # Receive the LTPage object for this page
                    # (it contains text, image, line, and curve objects).
                    layout = device.get_result()
                    for x in layout:
                        if isinstance(x, LTText):
                            text = x.get_text()
                            isin = is_in(keywords, text, mode)
                            if isin:
                                copy(load_path, pdf, save_path, category)
                                flag = True
                                break
                    if flag:
                        break
        if new_win is not None and canvas is not None and fill_line is not None:
            n = n + 300 / len(pdfs)
            canvas.coords(fill_line, (0, 0, n, 20))
            new_win.update()
    if new_win is not None:
        new_win.destroy()
def get_title_from_io(pdf_io, min_ch, min_wd):
    parser = PDFParser(pdf_io)
    # If the PDF is protected with a password, pass it as the second parameter here.
    doc = PDFDocument(parser)
    # The PDF may not allow extraction.
    # pylint: disable=no-else-return
    if doc.is_extractable:
        rm = PDFResourceManager()
        dev = TextOnlyDevice(rm)
        interpreter = TextOnlyInterpreter(rm, dev)
        first_page = StringIO()
        converter = TextConverter(rm, first_page, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(rm, converter)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            page_interpreter.process_page(page)
            break
        converter.close()
        first_page_text = first_page.getvalue()
        first_page.close()
        dev.recover_last_paragraph()
        verbose('all blocks')
        for b in dev.blocks:
            verbose(b)
        title = None
        max_tfs_cutoff = None
        tfs_tol = 1
        y_tol = 1
        # Number of times to lower max_tfs_cutoff if the title is too short
        # or has too few words.
        max_num_iter = 4
        tfs_iter = 0
        while tfs_iter < max_num_iter and (
                not title or (min_ch > 1 and len(title) < min_ch) or
                (min_wd > 1 and len(title.split(' ')) > 1
                 and len(title.split(' ')) < min_wd)):
            tfs_iter += 1
            # pylint: disable=W0603
            # No 'global ALGO' needed: it is only read here, never modified.
            if ALGO == "original":
                # Find the max font size.
                max_tfs = max([
                    b for b in dev.blocks
                    if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                ], key=lambda x: x[1])[1]
                verbose('max_tfs: ', max_tfs)
                # Find the blocks with the max font size.
                max_blocks = list(
                    filter(lambda x: abs(x[1] - max_tfs) < tfs_tol, dev.blocks))
                # Find the one with the highest y coordinate:
                # this is the one closest to the top.
                max_y = max(max_blocks, key=lambda x: x[3])[3]
                verbose('max_y: ', max_y)
                found_blocks = list(
                    filter(lambda x: abs(x[3] - max_y) < y_tol, max_blocks))
                verbose('found blocks')
                for b in found_blocks:
                    verbose(b)
                title = ''
                for b in found_blocks:
                    title += ''.join(b[4])
            elif ALGO == "max2":
                # Find the max font size.
                all_tfs = sorted(list(
                    map(lambda x: x[1], [
                        b for b in dev.blocks
                        if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                    ])), reverse=True)
                max_tfs = all_tfs[0]
                verbose('max_tfs: ', max_tfs)
                selected_blocks = []
                max2_tfs = -1
                for b in dev.blocks:
                    if max2_tfs == -1:
                        if abs(b[1] - max_tfs) < tfs_tol:
                            selected_blocks.append(b)
                        elif len(selected_blocks) > 0:  # max is added
                            selected_blocks.append(b)
                            max2_tfs = b[1]
                    else:
                        if abs(b[1] - max_tfs) < tfs_tol or abs(b[1] - max2_tfs) < tfs_tol:
                            selected_blocks.append(b)
                        else:
                            break
                for b in selected_blocks:
                    verbose(b)
                title = ''
                for b in selected_blocks:
                    title += ''.join(b[4])
            elif ALGO == "max_position":
                # Find the max font size.
                max_tfs = max([
                    b for b in dev.blocks
                    if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                ], key=lambda x: x[1])[1]
                verbose('max_tfs: ', max_tfs)
                # Find the blocks with the max font size.
                tfs_tol = 1
                max_blocks = [
                    b for b in dev.blocks if abs(b[1] - max_tfs) < tfs_tol
                ]
                for b in max_blocks:
                    verbose(b)
                # Now use the y-range of max_blocks as the check for all
                # blocks, with a much higher tolerance for tfs to account for
                # sub/superscript characters, which can vary by +/- 10pts.
                y_max = max(max_blocks, key=lambda x: x[3])[3]
                y_min = min(max_blocks, key=lambda x: x[3])[3]
                y_range = y_max - y_min
                y_mid = (y_max + y_min) * 0.5
                verbose(f"{y_range = }, {y_mid = }")
                # Find the one with the highest y coordinate:
                # this is the one closest to the top.
                y_tol = 2
                tfs_tol = 8
                found_blocks = [
                    b for b in dev.blocks if b in max_blocks or (
                        b[3] <= y_max + y_tol and b[3] >= y_min - y_tol
                        and abs(b[1] - max_tfs) < tfs_tol)
                ]
                verbose('found blocks')
                for b in found_blocks:
                    verbose(b)
                title = ''
                for b in found_blocks:
                    title += ''.join(b[4])
            else:
                raise Exception("unsupported ALGO")
            max_tfs_cutoff = max_tfs
        verbose(f"before retrieving spaces, {title = }")
        # Retrieve missing spaces if needed.
        # if " " not in title:
        #     title = retrieve_spaces(first_page_text, title)
        new_title = retrieve_spaces_word_based(first_page_text, title.replace(' ', ''))
        if len(new_title) > len(title):
            title = new_title
        # Remove duplicate spaces if any are present.
        if " " in title:
            title = " ".join(title.split())
        return title
    else:
        return None
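# Hypothetical usage sketch for get_title_from_io above: the function expects
# an open binary stream plus minimum character and word counts; "paper.pdf"
# and the thresholds are placeholder values.
with open("paper.pdf", "rb") as f:
    print(get_title_from_io(f, min_ch=10, min_wd=2))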
def makepdf(self, pdfdata1, udct, zeros):
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = 1
    # Calculate the last object id; size is only the xref size,
    # not the count of objects in the xref.
    for ref in document.xrefs:
        if isinstance(ref, PDFXRefStream):
            no = max(ref.ranges, key=operator.itemgetter(1))[1]
        else:
            if len(ref.offsets) == 0:
                no = 0
            else:
                no = max(ref.offsets.keys())
        size = max(size, no)
    pages = len(document.getobj(document.catalog['Pages'].objid)['Kids'])
    page = udct.get(b'sigpage', 0) if 0 <= udct.get(b'sigpage', 0) <= pages - 1 else 0
    page = document.getobj(document.catalog['Pages'].objid)['Kids'][page].objid

    nsig, fields = self.getfields(root, document)
    annots = self.getannots(page, document)

    infodata = self.getdata(pdfdata1, info, prev, document)
    rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
    pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

    no = size + 1
    visualization, nav = self.makevisualization(no, udct, nsig, page)
    objs = [
        self.makeobj(page, (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2,
            b'/Fields[%s%d 0 R]/SigFlags %d' % (fields, no + 3, udct[b'sigflags'])),
        visualization,
        self.makeobj(nav + 1, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
        # self.makeobj(nav + 1, (b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
        # /Filter/Adobe.PPKMS/SubFilter/ETSI.CAdES.detached/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
        # /Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]
    size = nav - no + 2

    pdfdata2 = b''.join(objs)
    startxref = len(pdfdata1)

    xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
        page, startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1, no, size)
    xref += b''.join([
        b'%010d 00000 n \n' %
        (startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + i)) + 1)
        for i in range(size)
    ])

    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>
startxref
%(startxref)d
%%%%EOF
'''
    trailer = trailer % {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': size,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }

    pdfdata2 = pdfdata2 + xref + trailer
    return pdfdata2
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

fp = open('./test.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization; if there is none, pass an empty string.
password = ''
document = PDFDocument(parser, password)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
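# Hedged continuation sketch: the interpreter loop the setup above is
# normally followed by. A bare PDFDevice produces no output by itself; swap
# in PDFPageAggregator or TextConverter when actual results are needed.
from pdfminer.pdfpage import PDFPage

interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)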
def __init__(self, pdf_stream, password="", pagenos=[], maxpages=0):  # noqa: C901
    ReaderBackend.__init__(self)
    self.pdf_stream = pdf_stream

    # Extract metadata
    parser = PDFParser(pdf_stream)
    doc = PDFDocument(parser, password=password, caching=True)
    if doc.info:
        for k in doc.info[0]:
            v = doc.info[0][k]
            if isinstance(v, (bytes, str, unicode)):
                self.metadata[k] = make_compat_str(v)
            elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                self.metadata[k] = make_compat_str(v.name)

    # Secret metadata: the raw XMP metadata, if present
    if "Metadata" in doc.catalog:
        metadata = resolve1(doc.catalog["Metadata"]).get_data()
        self.metadata.update(xmp_to_dict(metadata))

    # Extract content
    text_io = BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    converter = TextConverter(rsrcmgr, text_io, codec="utf-8",
                              laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, converter)

    self.metadata["Pages"] = 0
    self.curpage = 0
    for page in PDFPage.get_pages(
            self.pdf_stream,
            pagenos=pagenos,
            maxpages=maxpages,
            password=password,
            caching=True,
            check_extractable=False,
    ):
        # Read page contents
        interpreter.process_page(page)
        self.metadata["Pages"] += 1
        self.curpage += 1

        # Collect URL annotations
        if page.annots:
            refs = self.resolve_PDFObjRef(page.annots)
            if refs:
                if isinstance(refs, list):
                    for ref in refs:
                        if ref:
                            self.references.add(ref)
                elif isinstance(refs, Reference):
                    self.references.add(refs)

    # Remove empty metadata entries
    self.metadata_cleanup()

    # Get text from stream
    self.text = text_io.getvalue().decode("utf-8")
    text_io.close()
    converter.close()

    # Extract URL references from text
    for url in extractor.extract_urls(self.text):
        self.references.add(Reference(url, self.curpage))
    for ref in extractor.extract_arxiv(self.text):
        self.references.add(Reference(ref, self.curpage))
    for ref in extractor.extract_doi(self.text):
        self.references.add(Reference(ref, self.curpage))
def text_to_lda(self, fp=None):
    # Read the PDF document and save it as lone.
    lone = self.convert_pdf_to_text()

    # Get the input fields.
    fp = open(self.a, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    inps = []
    for i in fields:
        field = resolve1(i)
        name, value = field.get('T'), field.get('V')
        inps.append('{0}: {1}'.format(name, value))
    inf = []
    ini = []
    for i in fields:
        field = resolve1(i)
        name, value = field.get('T'), field.get('V')
        inf.append(name)
        ini.append(value)

    # Topic modeling:
    # fit a CountVectorizer on the document with English stop words.
    vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
    dtm = vect.fit_transform(inps)
    # Convert the document-term matrix from CountVectorizer into a pandas DataFrame.
    dfm = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
    # Fit the Latent Dirichlet Allocation model on the document-term matrix.
    lda = LatentDirichletAllocation(n_components=5)
    lda_dtf = lda.fit_transform(dtm)

    # Topic extraction:
    # extract 5 topics from the LDA model and the most common words in each topic.
    sorting = np.argsort(lda.components_)[:, ::-1]
    features = np.array(vect.get_feature_names())
    # mg.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=15)

    # Sentences within topic model 1
    topic_0 = np.argsort(lda_dtf[:, 0])[::-1]
    t0 = []
    for i in topic_0[:5]:
        t0.append(".".join(inps[i].split(".")[:2]) + ".\n")
    # Sentences within topic model 2
    topic_1 = np.argsort(lda_dtf[:, 1])[::-1]
    t1 = []
    for i in topic_1[:5]:
        t1.append(".".join(inps[i].split(".")[:2]) + ".\n")
    # Sentences within topic model 3
    topic_2 = np.argsort(lda_dtf[:, 2])[::-1]
    t2 = []
    for i in topic_2[:5]:
        t2.append(".".join(inps[i].split(".")[:2]) + ".\n")
    # Sentences within topic model 4
    topic_3 = np.argsort(lda_dtf[:, 3])[::-1]
    t3 = []
    for i in topic_3[:5]:
        t3.append(".".join(inps[i].split(".")[:2]) + ".\n")
    # Sentences within topic model 5
    topic_4 = np.argsort(lda_dtf[:, 4])[::-1]
    t4 = []
    for i in topic_4[:5]:
        t4.append(".".join(inps[i].split(".")[:2]) + ".\n")

    st0 = str(t0).replace("b'", " ").replace("\\n", " ").replace("\\", " ").replace("b\\", " ")
    st1 = str(t1).replace("b'", " ").replace("\\n", " ").replace("\\", " ").replace("b\\", " ").replace("'b", " ")
    st2 = str(t2).replace("b'", " ").replace("\\n", " ").replace("\\", " ").replace("b\\", " ").replace("'b", " ")
    st3 = str(t3).replace("b'", " ").replace("\\n", " ").replace("\\", " ").replace("b\\", " ").replace("'b", " ")
    st4 = str(t4).replace("b'", " ").replace("\\n", " ").replace("\\", " ").replace("b\\", " ").replace("'b", " ")

    print("Topic 0: \n" + st0 + "\n")
    print("Topic 1: \n" + st1 + "\n")
    print("Topic 2: \n" + st2 + "\n")
    print("Topic 3: \n" + st3 + "\n")
    print("Topic 4: \n" + st4 + "\n")
def pdf2txt(self):
    '''
    =============================
    return : str, text file path
    '''
    # input
    password = ''
    pagenos = set()
    maxpages = 0
    # output
    imagewriter = None
    rotation = 0
    codec = 'UTF-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()

    infp = open(self.input_path, "rb")
    if self.output_path is None:
        self.output_path = self.input_path[:-4] + '_trans.txt'
    outfp = open(self.output_path, "w", encoding='UTF8')

    # Total page count
    parser = PDFParser(infp)
    document = PDFDocument(parser)
    page_total_num = resolve1(document.catalog['Pages'])['Count']

    rsrcmgr = PDFResourceManager(caching=caching)
    # pdf -> text converter
    device = TextConverter(rsrcmgr, outfp, codec=codec,
                           laparams=laparams, imagewriter=imagewriter)
    # pdf -> text interpreter
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # pdf -> text conversion
    with tqdm(total=page_total_num) as pbar:
        for page in PDFPage.get_pages(infp, pagenos, maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
            pbar.update(1)
    print('[INFO] pdf -> text')

    outfp.close()
    infp.close()
    return self.output_path
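# Hedged alternative sketch: pdfminer.six ships a high-level helper that
# collapses the manager/converter/interpreter pipeline above into one call;
# "input.pdf" is a placeholder path.
from pdfminer.high_level import extract_text

text = extract_text("input.pdf")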
def parse(self, full_path: str):
    info = super().parse(full_path)

    if self.content_length > 0:
        with open(full_path, "rb") as f:
            try:
                parser = PDFParser(f)
                document = PDFDocument(parser)
            except PDFSyntaxError:
                print("couldn't parse PDF " + full_path)
                return info

            info["content"] = ""
            if len(document.info) > 0 and "Title" in document.info[0] \
                    and document.info[0]["Title"] != b"":
                if isinstance(document.info[0]["Title"], bytes):
                    info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
                else:
                    info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"

            try:
                if document.is_extractable:
                    resource_manager = PDFResourceManager()
                    la_params = LAParams()
                    device = PDFPageAggregator(resource_manager, laparams=la_params)
                    interpreter = PDFPageInterpreter(resource_manager, device)

                    for page in PDFPage.create_pages(document):
                        interpreter.process_page(page)
                        layout = device.get_result()

                        for lt_obj in layout:
                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                                text = lt_obj.get_text()
                                if len(info["content"]) + len(text) <= self.content_length:
                                    info["content"] += text
                                else:
                                    info["content"] += text[0:self.content_length - len(info["content"])]
                                    break
                        else:
                            continue
                        break
                else:
                    print("PDF is not extractable: " + full_path)
            except ValueError:
                print("Couldn't parse page for " + full_path)

    return info
def makepdf(self, pdfdata1, udct, zeros):
    parser = PDFParser(BytesIO(pdfdata1))
    document = PDFDocument(parser, fallback=False)

    prev = document.find_xref(parser)
    info = document.xrefs[0].trailer['Info'].objid
    root = document.xrefs[0].trailer['Root'].objid
    size = 1
    # Calculate the last object id; size is only the xref size,
    # not the count of objects in the xref.
    for ref in document.xrefs:
        if isinstance(ref, PDFXRefStream):
            no = max(ref.ranges, key=operator.itemgetter(1))[1]
        else:
            if len(ref.offsets) == 0:
                no = 0
            else:
                no = max(ref.offsets.keys())
        size = max(size, no)
    page = document.getobj(document.catalog['Pages'].objid)['Kids'][0].objid

    nsig, fields = self.getfields(root, document)
    annots = self.getannots(page, document)

    infodata = self.getdata(pdfdata1, info, prev, document)
    rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
    pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

    annotation = udct.get(b'signature', b'').decode('utf8')
    x1, y1, x2, y2 = udct.get(b'signaturebox', (0, 0, 0, 0))
    annotation = FreeText(
        Location(x1=x1, y1=y1, x2=x2, y2=y2, page=0),
        Appearance(
            fill=[0, 0, 0],
            stroke_width=1,
            wrap_text=True,
            font_size=12,
            content=annotation,
        ),
    )
    pdfa = annotation.as_pdf_object(identity(), page=None)
    pdfar = b'[%d %d %d %d]' % tuple(pdfa.Rect)
    pdfas = pdfa.AP.N.stream.encode('latin1')

    no = size + 1
    objs = [
        self.makeobj(page, (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
        self.makeobj(no + 0, infodata),
        self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
        self.makeobj(
            no + 2,
            b'/Fields[%s%d 0 R]/SigFlags %d' % (fields, no + 3, udct[b'sigflags'])),
        self.makeobj(
            no + 3, b'''
/Type /Annot
/Subtype /FreeText
/AP <</N %d 0 R>>
/BS <</S /S /Type /Border /W 0>>
/C []
/Contents (%s)
/DA (0 0 0 rg /%s 12 Tf)
/Rect %s
/F 704
/P %d 0 R
/FT /Sig
/T(Signature%d)
/V %d 0 R
''' % (no + 4, pdfa.Contents.encode('latin1'),
       pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'),
       pdfar, page, nsig, no + 5)),
        self.makeobj(
            no + 4, b'''
/BBox %s
/FormType 1
/Length %d
/Matrix [1 0 0 1 0 0]
/Resources <</Font <<%s <</BaseFont /Helvetica /Encoding /WinAnsiEncoding /Subtype /Type1 /Type /Font>>>> /ProcSet /PDF>>
/Subtype /Form
/Type /XObject
''' % (
                pdfar,
                len(pdfas),
                pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'),
            ), b'stream\n' + pdfas + b'\nendstream\n'),
        self.makeobj(no + 5, (
            b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'],
                udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
    ]

    pdfdata2 = b''.join(objs)
    xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
    startxref = len(pdfdata1)
    dct = {
        b'page': page,
        b'no': no,
        b'startxref': startxref + len(pdfdata2),
        b'prev': prev,
        b'info': no + 0,
        b'root': no + 1,
        b'size': 6,
        b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
        b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
        b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
        b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
        b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
        b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
        b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
        b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
        b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
    }
    trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>
startxref
%(startxref)d
%%%%EOF
'''
    xref = xref % dct
    trailer = trailer % dct
    pdfdata2 = pdfdata2 + xref + trailer
    return pdfdata2
        headers = '\t'.join([x.text for x in headerline])
        tbrows = [x for x in tls if x.y1 < hd1tl.y0]
    else:
        tbrows = [x for x in tls]
    ry0s = sorted([x.y0 for x in tbrows], reverse=True)
    rows = clustrows(ry0s)
    tbrows2 = [sorted([x for x in tbrows if x.y0 in ri], key=lambda x: x.x0)
               for ri in rows]
    tbrows3 = ['\t'.join([x.text for x in r]) for r in tbrows2]
    if header1 != '':
        return [headers] + tbrows3
    else:
        return tbrows3


infile = open(inpdf, 'rb')
document = PDFDocument(PDFParser(infile))
page_it = PageIterator(document, LAParams(char_margin=0.2))

tbs5 = []
tbs6 = []
tbs7 = []
tbs8 = []
pg = 1
while pg < 264:
    if pg == 153:
        tbs5 = tbs5 + parsetable(page_it, header1='SAMPLE')
    elif pg > 153 and pg < 224:
        tbs5 = tbs5 + parsetable(page_it)
    elif pg == 224:
        tbs6 = tbs6 + parsetable(page_it, header1='PDID')
    elif pg > 224 and pg < 248:
__author__: "Sushovan Mandal"
__license__: "GPLv2"
__email__: "*****@*****.**"
'''
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed

# Open a PDF file.
fp = open('extras/sample.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization.
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# parser.set_document(doc)
# doc.set_parser(parser)
# document.initialize()

metadata = document.info  # The "Info" metadata
# print(document.catalog)
for d in metadata:
    if isinstance(d, dict):
        for key, value in d.items():
            if isinstance(value, bytes):
                value = value.decode('utf-8', errors='ignore')
            print("%s, %s" % (key, value))
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

path = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'

# Open a PDF file.
fp = open(path, 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Supply the password for initialization.
# (If no password is set, give an empty string.)
password = ''
document.initialize(password)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
def mine_area(filename):
    """
    Use pdfminer to get the valid area of each page.
    All results are relative positions!
    """
    pageboxlist = []
    # Open a PDF file.
    with open(filename, 'rb') as fp:
        # Create a PDF parser object for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization; omit the argument if there is none.
        # document = PDFDocument(parser, password)
        document = PDFDocument(parser)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object to store shared resources
        # (caching=False disables caching).
        rsrcmgr = PDFResourceManager(caching=False)
        # Set the parameters for layout analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page in the document.
        # PDFPage.create_pages(document) is one way to get the page list;
        # enumerate(document.get_pages()) is another.
        count = 0
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page. The layout holds the
            # objects parsed from the page: typically LTTextBox, LTFigure,
            # LTImage, LTTextBoxHorizontal, and so on.
            layout = device.get_result()
            boxlist = []
            for item in layout:
                if count >= 3:
                    break
                box = item.bbox
                boxlist.append(box)
                if isinstance(item, LTTextBox) or isinstance(item, LTTextLine):
                    print('text:{}'.format(item))
                    print(item.height)
                    print(item.get_text())
                    count += 1
                elif isinstance(item, LTImage):
                    print('image:{}'.format(item))
                elif isinstance(item, LTFigure):
                    print('figure:{}'.format(item))
                elif isinstance(item, LTAnno):
                    print('anno:{}'.format(item))
                elif isinstance(item, LTChar):
                    print('char:{}'.format(item))
                elif isinstance(item, LTLine):
                    print('line:{}'.format(item))
                elif isinstance(item, LTRect):
                    print('rect:{}'.format(item))
                elif isinstance(item, LTCurve):
                    print('curve:{}'.format(item))
            pageboxlist.append(boxlist)
            break
    res = []
    for boxlist in pageboxlist:
        tmp = get_max_box(boxlist)
        res.append(tmp)
    return res
def parse_pdf(path=None, data=None, savePath=None, y_tolerance=1.5, char_tolerance=0.5):
    '''
    Parse a PDF into an Excel workbook.
    :param path: input path; savePath: output path;
        char_tolerance: max gap between words; y_tolerance: max gap between lines
    :return: None
    '''
    # Running row count across pages
    pdfRowNumber = 0
    theMaxColSize = []
    wb = Workbook()
    ws = wb.active
    if data is None:
        data = open(path, 'rb')
    parser = PDFParser(data)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        height = page.mediabox[3] - page.mediabox[1]
        layout = device.get_result()
        pageContainer, theMaxColNum = get_line_word(
            layout, height, y_tolerance=y_tolerance, char_tolerance=char_tolerance)
        # Sort by position.
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])
        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList = align_front_row(pageContainer[0:i], theMaxColNum)
                    del pageContainer[0:i]
                    pageContainer.insert(0, repairList)
                    break
        # Check the last row.
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:], theMaxColNum)
        # Write to Excel.
        alignment = Alignment(horizontal='center', vertical='center')
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber, column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] is None:
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber, start_column=1,
                                   end_row=idx + 1 + pdfRowNumber, end_column=theMaxColNum)
                    ws.cell(idx + 1 + pdfRowNumber, 1).alignment = alignment
                    break
                else:
                    if idx == 0 and len(line) == 2:
                        pass
                    else:
                        cellIndex.alignment = alignment
                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                        cellIndex.number_format = '0'
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']
        thePageMaxColSize = [0 for i in range(theMaxColNum)]
        for line in pageContainer:
            if len(line) == 2:
                continue
            for col, item in enumerate(line):
                if len(item['text']) > thePageMaxColSize[col]:
                    thePageMaxColSize[col] = len(item['text'])
        if theMaxColSize == []:
            theMaxColSize = thePageMaxColSize[:]
        else:
            for i in range(theMaxColNum):
                if theMaxColSize[i] < thePageMaxColSize[i]:
                    theMaxColSize[i] = thePageMaxColSize[i]
        # Add this page's row count so the Excel rows stay contiguous.
        pdfRowNumber += len(pageContainer)
    # Save the Excel file locally, sizing columns by their longest cell.
    letter = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for col, theSize in enumerate(theMaxColSize):
        rest = (col + 1) % 26
        cut = int((col + 1) / 26)
        colLetter = ''
        if cut == 0:
            colLetter = letter[rest - 1]
        else:
            colLetter = letter[cut] + letter[rest - 1]
        ws.column_dimensions[colLetter].width = theSize * 2
    if savePath is not None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
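# Hypothetical usage sketch for parse_pdf above: convert a tabular PDF into
# an Excel workbook; both paths are placeholders.
parse_pdf(path="table.pdf", savePath="table.xlsx")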
def get_input_fields(self, source_path: str = None, replace_none_value: bool = False) -> dict:
    """Get input fields in the PDF.

    Stores input fields internally so that they can be used without
    parsing the PDF again.

    Parameter ``replace_none_value`` is a convenience for visualizing fields.

    If no source path is given, assumes a PDF is already opened.

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            ${fields}=  Get Input Fields    /tmp/sample.pdf

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def example_keyword():
            fields = pdf.get_input_fields("/tmp/sample.pdf")

    :param source_path: source filepath, defaults to None.
    :param replace_none_value: if value is None replace it with key name,
        defaults to False.
    :return: dictionary of input key values or `None`.
    """
    record_fields = {}
    if not source_path and self.ctx.active_pdf_document.fields:
        return self.ctx.active_pdf_document.fields
    self.ctx.switch_to_pdf(source_path)
    source_parser = PDFParser(self.ctx.active_pdf_document.fileobject)
    source_document = PDFDocument(source_parser)
    try:
        fields = pdfminer.pdftypes.resolve1(
            source_document.catalog["AcroForm"])["Fields"]
    except KeyError as err:
        raise KeyError('PDF "%s" does not have any input fields.'
                       % self.ctx.active_pdf_document.path) from err
    for i in fields:
        field = pdfminer.pdftypes.resolve1(i)
        if field is None:
            continue
        name, value, rect, label = (
            field.get("T"),
            field.get("V"),
            field.get("Rect"),
            field.get("TU"),
        )
        if value is None and replace_none_value:
            record_fields[name.decode("iso-8859-1")] = {
                "value": name.decode("iso-8859-1"),
                "rect": iterable_items_to_int(rect),
                "label": label.decode("iso-8859-1") if label else None,
            }
        else:
            try:
                record_fields[name.decode("iso-8859-1")] = {
                    "value": value.decode("iso-8859-1") if value else "",
                    "rect": iterable_items_to_int(rect),
                    "label": label.decode("iso-8859-1") if label else None,
                }
            except AttributeError:
                self.logger.debug("Attribute error")
                record_fields[name.decode("iso-8859-1")] = {
                    "value": value,
                    "rect": iterable_items_to_int(rect),
                    "label": label.decode("iso-8859-1") if label else None,
                }
    self.ctx.active_pdf_document.fields = record_fields or None
    return record_fields
def get_pages_in_pdf(file):
    document = PDFDocument(PDFParser(file))
    return resolve1(document.catalog['Pages'])['Count']
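# Usage sketch for the helper above; "sample.pdf" is a placeholder path.
with open("sample.pdf", "rb") as fp:
    print(get_pages_in_pdf(fp))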
#coding=utf-8
'''
Created on 2017-01-12

@author: feifei
'''
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
path_book = path_project + os.sep + "input" + os.sep + "McCrackens Removable Partial Prosthodontics_nodrm.pdf"
path_out = path_project + os.sep + "output" + os.sep + "Contemporary Fixed Prosthodontics, 5ed index.txt"

# Open a PDF document.
fp = open(path_book, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)

# Get the outlines of the document.
outlines = document.get_outlines()
with open(path_out, "w", encoding="utf-8") as f:
    for (level, title, dest, a, se) in outlines:
        f.write(title + "\n")
print("over")
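# Hedged sketch: get_outlines() raises PDFNoOutlines when the document has no
# bookmarks, which the script above does not guard against.
from pdfminer.pdfdocument import PDFNoOutlines

try:
    for (level, title, dest, a, se) in document.get_outlines():
        print(level, title)
except PDFNoOutlines:
    print("no outlines")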
class PdfFileParser(object):
    def __init__(self, infile, outfile=None, password=None, selectedpages=None,
                 maxSplit=3, W=1440.0, H=1080.0, outputJson=False, trimbox=None,
                 trimboxes=None, exclude=False, debug=0):
        self.args = {
            a[0]: a[1]
            for a in locals().items() if a[0] not in ['self', 'outputJson']
        }
        self.outputJson = outputJson
        self.DEBUG = debug
        self.picklefile = infile + '.pickle'
        self.selectedpages = selectedpages
        self.pickleLoaded = False
        self.savedconfig = None
        self.coords = []
        self.pagesCoords = []
        self.trimbox = trimbox
        self.trimboxes = trimboxes
        self.exclude = exclude
        self.pageRanges = SelectedPages(selectedpages)
        if ENABLE_PICKLE and os.path.isfile(self.picklefile):
            try:
                with open(self.picklefile, 'rb') as f:
                    self.savedconfig = pickle.load(f)
                savedargs = self.savedconfig['args']
                equal = True
                for k, v in self.args.items():
                    if k == 'selectedpages':
                        if v not in SelectedPages(savedargs[k]):
                            equal = False
                    elif k not in savedargs:
                        equal = False
                    elif v != savedargs[k]:
                        equal = False
                    if not equal:
                        break
                if equal:
                    self.pickleLoaded = True
                    self.pagesCoords = self.savedconfig['pagesCoords']
            except Exception as e:
                print(e)
        self.fname = infile
        self.W = float(W)
        self.H = float(H)
        self.maxSplit = maxSplit
        self.outfile = outfile
        if self.outfile is None:
            outFilename, outExt = os.path.splitext(infile)
            self.outfile = outFilename + '-out' + outExt
        if not (self.selectedpages is None or self.selectedpages == ''):
            outFilename, outExt = os.path.splitext(self.outfile)
            self.outfile = '%s(%s)%s' % (outFilename, self.selectedpages, outExt)
        if os.path.isfile(self.outfile):
            i = 1
            outfile, outExt = os.path.splitext(self.outfile)
            while os.path.isfile("%s(%d)%s" % (outfile, i, outExt)):
                i += 1
            self.outfile = "%s%d%s" % (outfile, i, outExt)
        self.password = password
        self.endPage = self.pageRanges.getEndPage(30000) - 1  # 1-based vs 0-based
        self.inFile = open(self.fname, 'rb')
        self.parser = PDFParser(self.inFile)
        self.document = PDFDocument(self.parser)
        self.rsrcmgr = PDFResourceManager()
        self.laparams = LAParams()
        if not self.pickleLoaded:
            self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        self.pagesEnumerator = enumerate(PDFPage.create_pages(self.document))
def getText(self):
    """
    Algorithm:
    1) Transfer information from the PDF file to a PDF document object using a parser
    2) Open the PDF file
    3) Parse the file using a PDFParser object
    4) Assign the parsed content to a PDFDocument object
    5) Process the information in this PDFDocument object. For this we need a
       PDFPageInterpreter, a PDFDevice, and a PDFResourceManager
    6) Finally, process the file page by page
    """
    # Open and read the pdf file in binary mode
    with open(self.pdf_file_path, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in a PDFDocument object
        document = PDFDocument(parser, self.password)
        # Check if document is extractable; if not, abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create PDFResourceManager object that stores shared resources
        # such as fonts or images
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis
        laparams = LAParams()
        # A PDFDevice translates interpreted information into the desired
        # format and connects to the resource manager for shared resources
        # device = PDFDevice(rsrcmgr)
        # Use a page aggregator instead, to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create interpreter object to process content from the PDFDocument;
        # the interpreter needs the resource manager and the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Initialize the text
        extracted_text = ""
        # Now that we have everything needed to process a pdf document,
        # process it page by page
        numpages = 0
        for page in PDFPage.create_pages(document):
            # The interpreter processes the page stored in the PDFDocument object
            interpreter.process_page(page)
            # The device renders the layout from the interpreter
            layout = device.get_result()
            # Of the many LT objects within the layout, we are interested
            # in LTTextBox and LTTextLine
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
            numpages += 1
            if numpages <= 4:
                print(extracted_text.encode("utf-8"))
    return extracted_text.encode("utf-8")
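# Hedged alternative sketch: pdfminer.six's extract_pages yields the same
# LTPage layouts without wiring the manager, device, and interpreter by hand;
# "document.pdf" is a placeholder path.
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

text = ""
for layout in extract_pages("document.pdf"):
    for element in layout:
        if isinstance(element, LTTextContainer):
            text += element.get_text()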
def parse(DataIO, save_path, start=None, end=None):
    # Create a PDF document analyzer from the file object.
    parser = PDFParser(DataIO)
    # Create a PDF document.
    doc = PDFDocument(parser)
    # Connect the parser and the document to each other.
    parser.set_document(doc)
    # doc.set_parser(parser)
    # Supply the initial password (defaults to empty if there is none).
    # doc.initialize()
    # Check whether the document can be converted to text; skip it if not.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to manage shared resources.
        rsrcmagr = PDFResourceManager()
        # Set the layout parameters.
        laparams = LAParams()
        # Aggregate the resource manager and a device object.
        device = PDFPageAggregator(rsrcmagr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmagr, device)
        # Loop over the pages, handling one page at a time.
        # pages = PDFPage.get_pages(doc)  # doc.get_pages() also returns the page list
        # for page in pages:
        page_num = 0
        for page in PDFPage.create_pages(doc):
            page_num = page_num + 1
            if start is not None and end is not None:
                if page_num < start:
                    continue
                if page_num > end:
                    break
            interpreter.process_page(page)
            # Receive the LTPage object for this page. The layout holds the
            # objects parsed from the page (LTTextBox, LTFigure, LTImage,
            # LTTextBoxHorizontal, and so on); to get the text, read the
            # object's text attribute.
            layout = device.get_result()
            f = open('./text/' + str(page_num) + '.txt', 'w')
            # with open('%s' % (save_path), 'a') as f:
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Get the text.
                    result = x.get_text()
                    try:
                        print("***************** LTTextBoxHorizontal ************")
                        print(result)
                        # if len(result) >= 15:
                        # Write it to the file.
                        f.write(result + "\n")
                    except Exception:
                        print('error writing to file', result)
                if isinstance(x, LTTextBox):
                    print("***************** LTTextBox ************")
                    print(x.get_text())
                if isinstance(x, LTFigure):
                    print("***************** LTFigure ************")
                    parse_lt_figure(x, page_num, f)
                if isinstance(x, LTImage):
                    print("***************** LTImage ************")
                    saved_file = save_image(x, page_num)
                    print('save image ' + x.name)
                if isinstance(x, LTChar):
                    print("***************** LTChar ************")
                    print(x.get_text())
                    f.write(x.get_text())
                if isinstance(x, LTCurve):
                    print("***************** LTCurve ************")
            f.close()