def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
        dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        dumptrailers(outfp, doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
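# A minimal sketch of driving dumppdf above; 'sample.pdf' is a placeholder
# file name, and the helpers it calls (dumpxml, dumpallobjs, dumptrailers,
# stream_value) are assumed to come from the same module:
import sys

dumppdf(sys.stdout, 'sample.pdf', objids=[], pagenos=set(), dumpall=True)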
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False,
                    codec=None, extractdir=None):
    def extract1(obj):
        filename = os.path.basename(obj['UF'] or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream'
                % filename)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % filename)
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print('extracting: %r' % path, file=sys.stderr)
        out = open(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return

    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                extract1(obj)
    fp.close()
    return
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None, extractdir=None):
    fp = open(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    pages = dict((page.pageid, pageno) for (pageno, page)
                 in enumerate(PDFPage.create_pages(doc), 1))

    def resolve_dest(dest):
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        if isinstance(dest, PDFObjRef):
            dest = dest.resolve()
        return dest

    try:
        outlines = doc.get_outlines()
        outfp.write('<outlines>\n')
        for (level, title, dest, a, se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                action = a
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest[0].objid]
            s = e(title).encode('utf-8', 'xmlcharrefreplace')
            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
            if dest is not None:
                outfp.write('<dest>')
                dumpxml(outfp, dest)
                outfp.write('</dest>\n')
            if pageno is not None:
                outfp.write('<pageno>%r</pageno>\n' % pageno)
            outfp.write('</outline>\n')
        outfp.write('</outlines>\n')
    except PDFNoOutlines:
        pass
    parser.close()
    fp.close()
    return
def get_ToC(file):
    """Locate the table of contents (outline) of the PDF and print the
    level and title of each entry.

    Args:
        file: path to the PDF file that will be read
    """
    # Open a PDF document.
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Get the outlines of the document.
    outlines = document.get_outlines()
    for (level, title, dest, a, se) in outlines:
        print(level, title)
def createPDFDoc(fpath):
    fp = open(fpath, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser, password='')
    # Check if the document allows text extraction. If not, abort.
    assert document.is_extractable
    return document
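# A small usage sketch for createPDFDoc, assuming the pdfminer.six-style API
# used above; 'sample.pdf' is a placeholder path:
document = createPDFDoc('sample.pdf')
for page in PDFPage.create_pages(document):
    # each page would normally be handed to a PDFPageInterpreter here
    pass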
def __init__(self, pdf, codec='utf-8'):
    """
    Parameters:
    --------------
    codec: text encoding, default utf-8
    pdf: the PDF file, as a binary file object (see the commented-out line
        below for opening from a path)

    Attributes:
    ---------------
    records: list of lines from the pdf file
    text: string of joined records, default ""
    didascalies: list of captions (didascalies) found with regular expressions
    nimages: int, number of found images
    """
    self.pdf = pdf
    self.text = ""
    self.records = []
    self.didascalies = []
    self.nimages = 0
    self.images = []
    parser = PDFParser(pdf)
    # parser = PDFParser(open(pdf, 'rb'))
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a buffer for the parsed text.
    retstr = StringIO()
    # Spacing parameters for parsing.
    laparams = LAParams()
    self.codec = codec
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    # Images.
    img_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    img_interpreter = PDFPageInterpreter(rsrcmgr, img_device)
    for page in PDFPage.create_pages(document):
        img_interpreter.process_page(page)
        pdf_item = img_device.get_result()
        if pdf_item is not None:
            for thing in pdf_item:
                if isinstance(thing, LTImage):
                    self.save_image(thing)
                if isinstance(thing, LTFigure):
                    self.find_images_in_thing(thing)
    lines = retstr.getvalue().splitlines()
    for line in lines:
        self.records.append(line)
def _extract_pdf_scores(stream):
    # these laparams seem to work ok with the ILIAS default PDF
    # formatting as well as with UR custom styling.
    # see pdf/tests/default_style.pdf and pdf/tests/ur_style.pdf
    laparams = LAParams(line_overlap=0, char_margin=20, word_margin=0.1,
                        boxes_flow=0, detect_vertical=False)

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    parser = PDFParser(stream)
    document = PDFDocument(parser)

    page = next(PDFPage.create_pages(document))
    interpreter.process_page(page)
    layout = device.get_result()

    boxes = []
    table_head_y = None  # y position of result table header
    order_name = "Reihenfolge"  # FIXME localize

    for element in layout:
        if isinstance(element, LTTextBoxHorizontal):
            boxes.append(element)
            if order_name in element.get_text().strip():
                table_head_y = element.y0

    tboxes = list(filter(lambda box: box.y0 == table_head_y, boxes))

    # if LAParams is set correctly, head should extract the whole
    # results table's text now.
    table = tboxes[0].get_text().replace('\t', '')
    table = table[table.find(order_name):]

    # note: question titles might lack spaces; this is no problem
    # since we compare question names and scores only through
    # Result.normalize_question_title() later.
    scores = dict()
    cols = []
    for line in table.split("\n")[1:]:
        cols += re.split(r'\s+', line)
        if len(cols) >= 6:
            scores[cols[2]] = cols[4]
            cols = cols[6:]

    return scores
def _createPDFDoc(self, fpath, password):
    fp = open(fpath, 'rb')
    parser = PDFParser(fp)
    try:
        document = PDFDocument(parser, password)
    except PDFPasswordIncorrect:
        raise AssertionError(
            "Password '{}' is incorrect.".format(password))
    except TypeError:
        raise AssertionError(
            "Unable to extract the pdf. Please check the password.")
    return fp, document
def extractembedded(fname, password='', extractdir=None, emailsDir=None):
    def extract1(obj):
        filename = os.path.basename(obj['F'])
        # filename = os.path.basename(obj['UF'] or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream'
                % filename)
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % filename)
        file_name, extension = os.path.splitext(fname)
        path = os.path.join(extractdir, file_name + " " + filename)
        while os.path.exists(path):
            path = os.path.join(
                extractdir,
                file_name + " " + str(randint(1, 100)) + " " + filename)
            print("file exists, create random name %s" % path, file=sys.stderr)
        # print('extracting: %r' % path, file=sys.stderr)
        out = open(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return

    fp = open(os.path.join(emailsDir, fname), 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    for xref in doc.xrefs:
        if type(xref) == PDFXRef:  # Ignore PDFXRefFallback. Not sure what it is.
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
def convert(pdffile):
    my_file = pdffile
    extracted_text = ""

    # Open and read the pdf file in binary mode
    fp = open(my_file, "rb")

    # Create parser object to parse the pdf content
    parser = PDFParser(fp)

    # Store the parsed content in a PDFDocument object
    # (no password is assumed here; pass one if the PDF is encrypted)
    document = PDFDocument(parser, password='')

    # Check if document is extractable; if not, abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDFResourceManager object that stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis
    laparams = LAParams()

    # Create a PDFDevice object which translates interpreted information into the desired format.
    # The device needs to be connected to the resource manager to store shared resources.
    # device = PDFDevice(rsrcmgr)
    # Extend the device to a page aggregator to get LT object elements
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create an interpreter object to process page content from the PDFDocument.
    # The interpreter needs to be connected to the resource manager and the device.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Now that we have everything needed to process a pdf document, process it page by page
    for page in PDFPage.create_pages(document):
        # The interpreter processes the page stored in the PDFDocument object
        interpreter.process_page(page)
        # The device renders the layout from the interpreter
        layout = device.get_result()
        # Out of the many LT objects within the layout, we are interested in LTTextBox and LTTextLine
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                extracted_text += lt_obj.get_text()

    # Close the pdf file
    fp.close()

    # print(extracted_text.encode("utf-8"))
    # log_file is assumed to be defined at module level (path of the output file)
    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
def main(args):
    msg(SCRIPT, args)
    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1
    infilename, = args
    lt_types = collections.Counter()

    with open(infilename, 'rb') as pdf_file:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(infilename)
        # Make a page iterator
        pages = PDFPage.create_pages(document)
        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        # device = PDFDevice(rsrcmgr)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # oh so stateful
            interpreter.process_page(page)
            layout = device.get_result()
            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
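# main() above relies on a flat_iter helper that is not shown; a minimal
# sketch of such a recursive walk over the layout tree (the implementation
# here is an assumption, not the original helper):
from collections.abc import Iterable

def flat_iter(obj):
    # Yield the object itself, then recurse into any iterable LT container.
    yield obj
    if isinstance(obj, Iterable):
        for child in obj:
            yield from flat_iter(child)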
def convert_pdf_to_txt(path):
    fp = open(path, 'rb')
    txt = ''
    # Note: this uses the legacy pdfminer API, where the document and parser
    # are wired together explicitly and pages come from doc.get_pages().
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                txt += lt_obj.get_text()
    return txt
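# With current pdfminer.six the same extraction can be done through the
# high-level helper; a minimal equivalent sketch for comparison:
from pdfminer.high_level import extract_text

def convert_pdf_to_txt_six(path):
    # extract_text opens the file, runs layout analysis on every page and
    # returns the concatenated text.
    return extract_text(path)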
def other_causes(_row_array, filename, delimiter, num_causes):
    if num_causes < 1:
        num_causes = 1
    elif num_causes > 5:
        num_causes = 5
    causes_table = []
    causes_table.append(
        "State, Cause 1, Cause 1 Value, Cause 1 Value (Per Capita)")
    for index in range(0, num_causes - 1):
        causes_table[0] = causes_table[0] + delimiter + "Cause " + str(index + 2) \
            + delimiter + "Cause Value " + str(index + 2) \
            + delimiter + "Cause Value " + str(index + 2) + " (Per Capita)"
    causes_table[0] = causes_table[0] + '\r'
    count = 1
    output_string = StringIO()
    print("Processing other causes of death (2015)...")

    with open(filename + '.pdf', 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string,
                               laparams=LAParams(char_margin=20))
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    text = str(output_string.getvalue())
    writeblock(filename, text, '', '.txt')

    # Helpers: skip past the next newline / isolate the text up to it.
    skip_line = lambda s: -1 if s.find('\n') == -1 else s[s.find('\n') + 1:]
    isolate_line = lambda s: -1 if s.find('\n') == -1 else s[:s.find('\n')]

    entry_value_per_capita = 0
    entry_value = 0
    for row in _row_array:
        if row == _row_array[0]:
            continue
        state = parseop(row, ',', 1, 0, parse.RETRIEVE)
        pop = float(parseop(row, delimiter, 8, 0, parse.RETRIEVE))
        pattern = re.compile(state)
        search = pattern.search(text)
        if search is None:
            continue
        # print(entry1.end())
        placeholder_string = text[search.end():]
        # Find the first cause of death value
        if state == "Maryland":
            lines = 8
        else:
            lines = 4
        for i in range(0, lines):  # skip the header lines
            return_value = skip_line(placeholder_string)
            if return_value == -1:
                exit(-1)
            placeholder_string = return_value
        causes_line = state
        for index in range(0, num_causes):
            # Find the cause of death label
            # placeholder_string = skip_line(placeholder_string)  # skip another line
            entry_name = isolate_line(placeholder_string)  # isolate it
            pattern = re.compile(r"\D+(?= )")
            search = pattern.search(entry_name)
            if search is None:
                print("Name not found..")
            else:
                entry_name = search.group()  # apply regular expression
                entry_name = entry_name[1:]  # remove prefix space
            # Find the cause of death value
            placeholder_string = skip_line(placeholder_string)  # skip another line
            entry_value = placeholder_string[:placeholder_string.find(' ')]
            for i in range(0, 10):
                entry_value = entry_value.replace(',', '')  # remove up to 9 commas
            # Per capita
            if pop > 0:
                entry_value_per_capita = float(
                    float(float(entry_value) / pop) * 100)
            else:
                entry_value_per_capita = -1
            placeholder_string = skip_line(placeholder_string)  # skip another line
            # Add to table
            causes_line = causes_line + delimiter + entry_name + delimiter \
                + entry_value + delimiter + str(entry_value_per_capita)
        causes_table.append(causes_line + '\r')
        count += 1
    writeblock("us_cause_of_death_2015", causes_table, '', '.csv')
if __name__ == '__main__':
    #
    # Documents: https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf
    #
    tic = time()
    sys.path.append('../')
    from utils import virtual_environment

    parser = argparse.ArgumentParser(prog='pdfminer.poc')
    parser.add_argument('file_name', type=str)
    cmd_args = virtual_environment(parser)

    from pdfminer3.pdfdocument import PDFDocument
    from pdfminer3.pdftypes import PDFObjectNotFound
    from pdfminer3.pdfparser import PDFParser, PDFStream

    print(cmd_args.file_name)
    input_file = open(cmd_args.file_name, "rb")
    parsed = PDFDocument(PDFParser(input_file))

    try:
        shutil.rmtree('%s.pdfminer_out' % cmd_args.file_name)
    except FileNotFoundError:
        pass
    os.mkdir('%s.pdfminer_out' % cmd_args.file_name)

    for obj_id in set(obj_id for xref in parsed.xrefs
                      for obj_id in xref.get_objids()):
        try:
            obj = parsed.getobj(obj_id)
        except PDFObjectNotFound:
            continue
        if not isinstance(obj, PDFStream):
            continue
        print('%s' % obj)
        obj.decode()
def extract_pdf(self):
    assert self.extension in ['pdf']
    self.content = self.file.read()
    parser = PDFParser(self.file)
    doc = PDFDocument(parser)
    available_fields = list(doc.info[0].keys())
    self.properties['auteur'] = None
    self.properties['creation_date'] = None
    self.properties['modification_date'] = None
    self.properties['creator'] = None
    self.properties['producer'] = None
    if 'CreationDate' in available_fields:
        if isinstance(doc.info[0]["CreationDate"], PDFObjRef):
            doc.info[0]["CreationDate"] = resolve1(
                doc.info[0]["CreationDate"])
        try:
            pdf_creation_date = str(
                self.convertPdfDatetime(doc.info[0]["CreationDate"]))
            self.properties['creation_date'] = str(pdf_creation_date)
        except:
            pass
    if 'ModDate' in available_fields:
        if isinstance(doc.info[0]["ModDate"], PDFObjRef):
            doc.info[0]["ModDate"] = resolve1(doc.info[0]["ModDate"])
        try:
            pdf_modif_date = str(
                self.convertPdfDatetime(doc.info[0]["ModDate"]))
            self.properties['modification_date'] = str(pdf_modif_date)
        except:
            pass
    if 'Author' in available_fields:
        if isinstance(doc.info[0]["Author"], PDFObjRef):
            doc.info[0]["Author"] = resolve1(doc.info[0]["Author"])
        try:
            pdf_auteur = doc.info[0]["Author"].decode("utf-8")
            self.properties['auteur'] = pdf_auteur
        except:
            pass
    if 'Creator' in available_fields:
        if isinstance(doc.info[0]["Creator"], PDFObjRef):
            doc.info[0]["Creator"] = resolve1(doc.info[0]["Creator"])
        try:
            pdf_creator = doc.info[0]["Creator"].decode("utf-16")
            self.properties['creator'] = pdf_creator
        except:
            pass
    if 'Producer' in available_fields:
        if isinstance(doc.info[0]["Producer"], PDFObjRef):
            doc.info[0]["Producer"] = resolve1(doc.info[0]["Producer"])
        try:
            pdf_producer = doc.info[0]["Producer"].decode("utf-16")
            self.properties['producer'] = pdf_producer
        except:
            pass
    parser.set_document(doc)
    pages = resolve1(doc.catalog['Pages'])
    pages_count = pages.get('Count', 0)
    # Only the first 300 characters for clarity
    self.content = self.convert_pdf_to_txt()
    self.properties['content'] = self.content[:300] + '(...)'
    self.properties['page_count'] = pages_count
    return self.properties