def main(dirin='/Users/dariaulybina/Desktop/georgetown/global-economics/scrape_articles/pdfs_downloaded/',
         dirout='/Users/dariaulybina/Desktop/georgetown/global-economics/convert_pdfs/pdfminer_p2/tag_converted_docs/'):
    """Run every file listed for ``dirin`` through TagExtractor into ``dirout``.

    For each name returned by ``list_files(dirin)`` the input is opened in
    binary mode, all of its pages are processed with a ``TagExtractor``
    device (codec ``utf-8``), and the device output is written to a file in
    ``dirout`` with the same base name and a ``.tag`` extension.

    Args:
        dirin: directory holding the input files. Defaults to the path
            that was previously hard-coded.
        dirout: directory receiving the ``.tag`` files. Defaults to the
            path that was previously hard-coded.

    Progress (the file list, each input path, and completion messages) is
    printed to stdout. Nothing is returned.
    """
    codec = 'utf-8'
    caching = True

    pdf_list = list_files(dirin)
    print(pdf_list)
    for fn in pdf_list:
        fname = os.path.join(dirin, fn)
        print(fname)

        # Swap only the trailing extension; str.replace('.pdf', '.tag')
        # would also rewrite a '.pdf' occurring in the middle of the name.
        # NOTE(review): assumes list_files yields bare file names relative
        # to dirin -- confirm against its definition.
        file_out = os.path.splitext(fn)[0] + '.tag'
        outfile = os.path.join(dirout, file_out)

        # Context managers / finally guarantee both files and the device
        # are released even when a page fails to process.
        with open(fname, 'rb') as fp:
            with open(outfile, 'w') as outfp:
                rsrcmgr = PDFResourceManager(caching=caching)
                device = TagExtractor(rsrcmgr, outfp, codec=codec)
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp):
                        interpreter.process_page(page)
                finally:
                    device.close()
        print('Document done')
    print('Finished all documents')
def getTransContent(fp, resultfp):
    """Extract tagged content from the PDF stream ``fp`` and hand it to parseTrans.

    Every page obtained from ``PDFPage.get_pages`` is processed by a
    ``TagExtractor`` device that writes into an in-memory buffer. The
    buffer's full contents are then passed to ``parseTrans`` together with
    ``resultfp``.

    Args:
        fp: open PDF file object. It is closed by this function, whether
            or not page processing succeeds.
        resultfp: passed through unchanged as the second argument of
            ``parseTrans``.

    Nothing is returned. Exceptions from pdfminer or ``parseTrans``
    propagate to the caller after ``fp`` and the device have been closed.
    """
    # Fixed options for the pdfminer calls below.
    # NOTE(review): an empty pagenos set and maxpages=0 presumably mean
    # "all pages" in PDFPage.get_pages -- confirm against pdfminer docs.
    password = ''
    pagenos = set()
    maxpages = 0
    rotation = 0
    codec = 'utf-8'
    caching = True

    outfp = cStringIO.StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TagExtractor(rsrcmgr, outfp, codec=codec)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                # With rotation == 0 this only normalises the page's own
                # rotation into the range 0..359.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # The input is released even if a page fails to process.
            fp.close()
        parseTrans(outfp.getvalue(), resultfp)
    finally:
        # As before, the device is closed only after parseTrans has run.
        device.close()
def getTransContent(fp, resultfp):
    """Feed the tag dump of the PDF stream ``fp`` to ``parseTrans``.

    The pages yielded by ``PDFPage.get_pages(fp, ...)`` are interpreted
    with a ``TagExtractor`` device (codec ``utf-8``) whose output is
    collected in a ``cStringIO`` buffer; the collected string and
    ``resultfp`` are then given to ``parseTrans``.

    Args:
        fp: open PDF file object; always closed before this function
            returns or raises.
        resultfp: forwarded as-is to ``parseTrans``.

    Returns nothing. Errors raised while reading pages or inside
    ``parseTrans`` are not caught here; cleanup still runs first.
    """
    # pdfminer options, all constant for this function.
    # NOTE(review): pagenos=set() / maxpages=0 look like "no page
    # restriction" -- verify with the PDFPage.get_pages documentation.
    codec = 'utf-8'
    caching = True
    password = ''
    pagenos = set()
    maxpages = 0
    rotation = 0

    outfp = cStringIO.StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TagExtractor(rsrcmgr, outfp, codec=codec)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True)
            for page in pages:
                # rotation is 0, so this just wraps page.rotate into 0..359.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Close the input on both the success and the failure path.
            fp.close()
        parseTrans(outfp.getvalue(), resultfp)
    finally:
        # Original ordering kept: the device is closed after parseTrans.
        device.close()
def get_text_from_pdfdata(data):
    """Run in-memory PDF data through TagExtractor and return the result.

    ``data`` is wrapped in a ``StringIO`` and parsed with ``PDFParser`` /
    ``PDFDocument``; every page is then processed by a ``TagExtractor``
    device (codec ``latin-1``) writing into a second ``StringIO``.

    Args:
        data: PDF content to parse.
            NOTE(review): presumably a byte string as read from a file --
            confirm against callers.

    Returns:
        The device output decoded from ``latin-1``, or ``False`` when the
        parser/document setup fails or processing any page raises. In the
        page-processing case a cancellation message is printed first.
    """
    fp = StringIO(data)
    outfp = StringIO()
    rsrc = PDFResourceManager()
    device = TagExtractor(rsrc, outfp, codec="latin-1")
    try:
        doc = PDFDocument()
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
        except Exception:
            # Occurs for example if the document is encrypted.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being reported as a bad PDF.
            return False

        interpreter = PDFPageInterpreter(rsrc, device)
        try:
            # One handler covers failures both while iterating the pages
            # and while processing an individual page.
            for page in doc.get_pages():
                interpreter.process_page(page)
        except Exception:
            print("Cancelling PDF extraction due to Error")
            return False
    finally:
        # Release the device and the input buffer on every exit path,
        # including the early `return False` ones.
        try:
            device.close()
        finally:
            fp.close()
    return outfp.getvalue().decode("latin-1")