def toHtml(path_pdf, path_html, rotation=0, pagenos=None):
    """Convert the PDF at ``path_pdf`` into an HTML file at ``path_html``.

    The conversion settings ``rsrcmgr``, ``codec``, ``scale``,
    ``layoutmode``, ``laparams``, ``imagewriter``, ``maxpages``,
    ``password``, ``caching`` and ``check_extractable`` are not parameters;
    they are looked up in the enclosing scope.

    Args:
        path_pdf: Path of the PDF file to read.
        path_html: Path of the HTML file to write (opened in ``'w'`` mode).
        rotation: Degrees added to each page's own rotation (modulo 360).
        pagenos: Optional collection of page numbers handed to
            ``PDFPage.get_pages``; ``None`` is treated as an empty set.
    """
    # Avoid a shared mutable default argument.
    if pagenos is None:
        pagenos = set()

    with open(path_html, 'w') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path_pdf, 'rb') as fp:
                pages = PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=check_extractable)
                for page in pages:
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # The converter must be closed while the output file is still
            # open: pdfminer's HTML converter finishes its output in close().
            device.close()
def extract_pdf_page(filename):
    """Render ``filename`` to HTML, save the result and return it.

    The markup is written to ``HTML_PATH/<stem>/<stem>.html`` where
    ``<stem>`` is the input file name without its extension; the folder is
    created when missing.

    Returns:
        The generated HTML as a string.
    """
    stem = Path(filename).stem

    # Destination folder and file for the rendered document
    target_dir = Path(HTML_PATH, stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = Path(target_dir, stem + ".html")

    buffer = io.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    converter = HTMLConverter(resources, buffer, laparams=layout)

    # Feed every page of the PDF through the HTML converter
    with open(filename, 'rb') as pdf_handle:
        interpreter = PDFPageInterpreter(resources, converter)
        pdf_pages = PDFPage.get_pages(pdf_handle, caching=True,
                                      check_extractable=True)
        for pdf_page in pdf_pages:
            interpreter.process_page(pdf_page)

    converter.close()
    html = buffer.getvalue()

    with open(target_path, 'w', encoding="utf-8") as sink:
        sink.write(html)

    buffer.close()
    return html
def __init__(self):
    """Initialise default conversion options and build both converters.

    A ``TextConverter`` (``self.device``) and an ``HTMLConverter``
    (``self.htmldevice``) are created; both share ``self.rsrcmgr`` and write
    to ``self.outfp``.
    """
    # Debugging is switched off by default.
    self.setdebug(0)

    # Page selection: only the first page (index 0).
    self.pagenos = {0}
    self.pageno = 1

    # Output target and rendering options.
    self.outfp = stdmodel()
    self.codec = 'utf-8'
    self.showpageno = True
    self.scale = 1
    self.password = ''
    self.maxpages = 0
    self.rotation = 0
    self.imagewriter = None
    self.laparams = LAParams()
    self.layoutmode = 'normal'

    # The resource manager lets shared resources such as fonts and images
    # be reused so large objects are not allocated multiple times.
    # Caching is deliberately off: the original author noted that leaving
    # it at the default (True) caused problems.
    self.caching = False
    self.rsrcmgr = PDFResourceManager(caching=self.caching)

    # Main converters for the PDF file: plain text and HTML.
    shared_options = dict(codec=self.codec, laparams=self.laparams,
                          imagewriter=self.imagewriter)
    self.device = TextConverter(self.rsrcmgr, self.outfp, **shared_options)
    self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
                                    scale=self.scale,
                                    layoutmode=self.layoutmode,
                                    **shared_options)
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert the PDF at ``path`` and return the converter output as text.

    The output is also parsed with ``bs`` and its prettified form is
    printed to stdout.

    Args:
        path: Path of the PDF file to read.
        format: Output flavour: ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding handed to the converter and used to decode its
            output again.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    if format == 'text':
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'html':
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'xml':
        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    else:
        raise ValueError('provide format, either text, html or xml!')

    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager guarantees the PDF handle is released; the
        # original never closed it.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter was given instead of always
        # assuming UTF-8. The buffer is read before the converter is closed,
        # as in the original.
        text = retstr.getvalue().decode(codec or 'utf-8')
    finally:
        device.close()
        retstr.close()

    soup = bs(text)
    prettyHTML = soup.prettify()
    print(prettyHTML)
    return text
def convert_pdf(path, outp, format='txt', codec='utf-8', password=''):
    """Convert the PDF at ``path`` and write the result to ``outp.<format>``.

    Args:
        path: Path of the PDF file to read.
        outp: Output path without extension; ``'.' + format`` is appended.
        format: Output flavour: ``'txt'``, ``'html'`` or ``'xml'``.
        codec: Encoding handed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate before touching the file system so an unsupported format
    # neither leaks a handle nor leaves an empty output file behind.
    if format == 'txt':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either txt, html or xml!')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with open(outp + '.' + format, 'wb') as outf:
        device = converter_cls(rsrcmgr, outf, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password=password, caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter while the output file is still open.
            device.close()
def pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return its font table.

    Args:
        path: Path of the PDF file to read.

    Returns:
        Whatever ``extract_font_table`` returns for the generated HTML
        bytes.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, laparams=layout)
    try:
        interpreter = PDFPageInterpreter(manager, device)
        with open(path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before closing the converter, exactly as the
        # original did, so the analysed markup is unchanged.
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even when conversion fails.
        device.close()
        retstr.close()

    font_size = extract_font_table(text)
    return font_size
def convert(fp):
    """Render the PDF readable from ``fp`` as HTML and return the markup.

    Args:
        fp: Open file object positioned at the start of a PDF.

    Returns:
        The contents of the ``StringIO2`` buffer the converter wrote to.
    """
    layout = LAParams()
    resources = PDFResourceManager(caching=False)

    sink = StringIO2()
    sink.encoding = 'utf-8'

    html_device = HTMLConverter(
        resources,
        sink,
        scale=1,
        layoutmode='normal',
        laparams=layout,
        outdir=None,
        debug=False,
    )
    # An empty page set and maxpages=0 are passed, i.e. no page restriction
    # is requested from process_pdf.
    process_pdf(
        resources,
        html_device,
        fp,
        set(),
        maxpages=0,
        password='',
        caching=False,
        check_extractable=True,
    )
    html_device.close()
    return sink.getvalue()
def pdf_to_text(path):
    """Extract the text of the PDF at ``path`` via an HTML conversion.

    The PDF is rendered to HTML, the markup is parsed with BeautifulSoup
    and the parsed document's ``.text`` is passed through ``clean_string``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The result of ``clean_string`` applied to the extracted text.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, layoutmode='normal',
                           laparams=layout, imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(manager, device)
        with open(path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before closing the converter, exactly as the
        # original did, so the extracted text is unchanged.
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even when conversion fails.
        device.close()
        retstr.close()

    from BeautifulSoup import BeautifulSoup
    parsed_html = BeautifulSoup(text)
    return clean_string(parsed_html.text)
def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True):
    """Open the input/output files and build the matching converter.

    Args:
        pdf_file: Path of the PDF to read; opened as ``self.pdf_file``.
        txt_file: Path of the output file; opened as ``self.outfp``.
        file_format: ``'txt'``, ``'html'`` or ``'xml'``; selects the
            converter stored in ``self.device``.
        layout_analysis: When true a default ``LAParams`` is used,
            otherwise ``laparams`` is ``None``.

    Raises:
        ValueError: If ``file_format`` is not supported. Previously
            ``self.device`` was silently left unset in that case.
    """
    converters = {
        'txt': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }
    # Validate before opening any file so a bad format leaks no handles.
    if file_format not in converters:
        raise ValueError('provide file_format, either txt, html or xml!')

    laparams = LAParams() if layout_analysis else None

    # ``open`` replaces the Python 2-only ``file`` builtin.
    self.pdf_file = open(pdf_file, 'rb')
    # Binary mode: the converters are given a codec and therefore emit
    # encoded bytes.
    self.outfp = open(txt_file, 'wb')

    self.rsrcmgr = PDFResourceManager(caching=True)
    self.device = converters[file_format](self.rsrcmgr, self.outfp,
                                          codec='utf-8', laparams=laparams,
                                          imagewriter=None)
def pdf_to_text(path):
    """Convert the PDF at ``path`` to HTML, save it and return it.

    The converter output is written to ``Output.txt`` in the current
    working directory and also returned.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory byte buffer the converter wrote to.
    """
    manager = PDFResourceManager(caching=True)
    retstr = BytesIO()
    device = HTMLConverter(manager, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, device)
        with open(path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        text = retstr.getvalue()
    finally:
        retstr.close()

    # Write the bytes as-is. ``str(text)`` on Python 3 would store the
    # ``b'...'`` repr rather than the document itself.
    with open("Output.txt", "wb") as text_file:
        text_file.write(text)
    return text
def convertPDF(fname, pages=None):
    """Convert the PDF at ``fname`` to HTML and return the markup.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; a falsy value means an empty set.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the handle even if a page fails to convert.
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=0,
                                          password='', caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
        text = outfp.getvalue()
    finally:
        outfp.close()
    return text
def convert_pdf_to_html(self):
    """
    Convert the PDF stored in the repository's temporary file to HTML.

    Reads ``self.path + 'parliament/repository/temp.pdf'`` and stores the
    converter output (the contents of an in-memory byte buffer) in
    ``self.text``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager releases the PDF handle even when a page
        # fails to convert.
        with open(self.path + 'parliament/repository/temp.pdf', 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        self.text = retstr.getvalue()
    finally:
        retstr.close()
def readPDF(pdfFile):
    """Convert an open PDF file object to HTML and return the raw output.

    Progress markers ``stage1`` .. ``stage4`` are printed to stdout while
    the conversion runs.

    Args:
        pdfFile: Open file object passed to ``PDFPage.get_pages``.

    Returns:
        The contents of the in-memory byte buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    output = BytesIO()
    print("stage1")
    converter = HTMLConverter(rsrcmgr, output, codec='utf-8',
                              laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, converter)
        print("stage2")
        for page in PDFPage.get_pages(pdfFile, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        converter.close()
        print("stage3")
        convertedPDF = output.getvalue()
        print("stage4")
    finally:
        # Release the buffer even when conversion fails part-way.
        output.close()
    return convertedPDF
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Download the PDF at URL ``path`` and return the converter output.

    Args:
        path: URL fetched with ``requests.get``.
        format: Output flavour: ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding handed to the converter and used to decode its
            output again.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    # The format is validated before any network traffic happens.
    if format == 'text':
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'html':
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'xml':
        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    else:
        raise ValueError('provide format, either text, html or xml!')

    try:
        r = requests.get(path)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # One in-memory copy of the download is enough; the original
        # wrapped the payload in two consecutive BytesIO objects.
        with io.BytesIO(r.content) as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter was given instead of always
        # assuming UTF-8. The buffer is read before the converter is closed,
        # as in the original.
        text = retstr.getvalue().decode(codec or 'utf-8')
    finally:
        device.close()
        retstr.close()
    return text
def pdftohtml(page):
    """Render a single PDF page object to HTML and return the markup.

    Images are not written out; each one is resolved through
    ``db.get_imgbyhash`` using the MD5 hash of its raw stream data.

    Args:
        page: Page object handed to ``PDFPageInterpreter.process_page``.

    Returns:
        The converter output decoded as UTF-8.
    """
    output = BytesIO()
    manager = PDFResourceManager()

    class imagewriter(object):
        """Stand-in image writer that maps images to stored records."""

        @staticmethod
        def export_image(img):
            """Return ``"<tabname>.<id>"`` for a known image, else ``"undefined"``."""
            if not img.stream:
                return "undefined"
            imhash = md5(img.stream.get_rawdata()).hexdigest()
            imgobj = db.get_imgbyhash(imhash)
            # Compare by value: ``is not`` on a string literal tests object
            # identity and is not a reliable equality check.
            if imgobj != "undefined":
                return imgobj["tabname"] + "." + str(imgobj["id"])
            return "undefined"

    converter = HTMLConverter(manager, output, laparams=LAParams(),
                              imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(manager, converter)
    interpreter.process_page(page)
    converter.close()
    text = output.getvalue().decode("utf-8")
    output.close()
    return text
def to_html(self):
    """Build an HTML converter on a fresh buffer and run the conversion.

    A new ``StringIO`` is stored in ``self.retstr`` and used as the
    converter's output; the result of the private ``__convert`` helper is
    returned unchanged.
    """
    self.retstr = StringIO()
    converter_options = {'codec': self.codec, 'laparams': self.laparams}
    html_device = HTMLConverter(self.rsrcmgr, self.retstr,
                                **converter_options)
    return self.__convert(html_device)
def read_pages(self, path, html=False, laparams=None, maxpages=0,
               page_numbers=None, password="", scale=1.0, rotation=0,
               layoutmode='normal', output_dir=None, strip_control=False,
               debug=False, disable_caching=False, **kwargs):
    """Convert a PDF page by page and return one string per page.

    Args:
        path: Path of the PDF file to read.
        html: When true each page is rendered with ``HTMLConverter``,
            otherwise with ``TextConverter``.
        laparams: Layout parameters forwarded to the converter.
        maxpages: Forwarded to ``PDFPage.get_pages``.
        page_numbers: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages`` as a set (presumably zero-based, as in
            pdfminer; verify against callers).
        password: Forwarded to ``PDFPage.get_pages``.
        scale: HTML scale factor (HTML output only).
        rotation: Degrees added to each page's own rotation (modulo 360).
        layoutmode: HTML layout mode (HTML output only).
        output_dir, strip_control, debug, **kwargs: Accepted for interface
            compatibility; not used by this method.
        disable_caching: When true, caching is switched off for both the
            resource manager and ``PDFPage.get_pages``.

    Returns:
        A list with the converter output of every processed page.
    """
    # These options used to be accepted but silently ignored; the method
    # always read every page with caching enabled and an empty password.
    caching = not disable_caching
    pagenos = set(page_numbers) if page_numbers else None

    rsrcmgr = PDFResourceManager(caching=caching)
    pages = []
    with open(path, "rb") as f:
        for page in PDFPage.get_pages(f, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            # Every page gets its own buffer and converter so the outputs
            # stay separate.
            text = StringIO()
            if html:
                device = HTMLConverter(rsrcmgr, text, codec=None, scale=scale,
                                       layoutmode=layoutmode,
                                       laparams=laparams)
            else:
                device = TextConverter(rsrcmgr, text, codec=None,
                                       laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                # Read before close(), as the original did.
                pages.append(text.getvalue())
            finally:
                device.close()
    return pages
def convert_pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return the markup.

    Layout analysis uses ``char_margin=3.5`` and ``all_texts=True``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams(char_margin=3.5, all_texts=True)
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the handle even if a page fails to convert.
        with open(path, 'rb') as fp:
            # maxpages=0 and an empty page set are the values used by the
            # original to request all pages.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
        html = retstr.getvalue()
    finally:
        retstr.close()
    return html
def html(self):
    """Convert ``self.__filename`` to HTML via a cache file and return it.

    The markup is written to ``cache/html/<uuid4>.html`` (relative to the
    working directory) and then read back as UTF-8 text.

    Returns:
        The generated HTML, or ``None`` when ``self.__filename`` is not an
        existing file.
    """
    import logging

    if not os.path.isfile(self.__filename):
        return None

    output_file = 'cache/html/' + str(uuid.uuid4()) + '.html'
    # exist_ok avoids the check-then-create race of the original.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    with open(output_file, 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams, layoutmode='normal',
                               text_colors={})
        try:
            with open(self.__filename, 'rb') as fp:
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, None, maxpages=0):
                        interpreter.process_page(page)
                except Exception:
                    # Conversion stays best effort: whatever was rendered
                    # before the failure is kept. The failure is logged
                    # rather than silently dropped, and interrupts are no
                    # longer swallowed.
                    logging.getLogger(__name__).exception(
                        "PDF to HTML conversion failed for %s",
                        self.__filename)
        finally:
            # Close the converter while the output file is still open.
            device.close()

    # Read the cached markup back with a handle that is closed afterwards.
    with open(output_file, "r", encoding='utf-8') as cached:
        return cached.read()
def convert_to_html(self, msg):
    """Download the PDF at URL ``msg`` and replace it with an HTML rendering.

    The download is saved under the URL's last path component with its
    final four characters replaced by ``.html``; that same file is then
    overwritten with the HTML of the document's first page.

    Args:
        msg: URL of the PDF to fetch.

    Returns:
        The output file name on success. On a download failure
        ``self.format_error_flag`` is set and either the ``IOError``
        instance or the string ``"Unexpected error"`` is returned.
    """
    testfile = urllib.URLopener()
    filename = msg.rsplit('/', 1)[1]
    filename = filename[:-4] + ".html"
    try:
        testfile.retrieve(msg, filename)
    except IOError as e:
        self.format_error_flag = True
        print("hey")
        return e
    except Exception:
        self.format_error_flag = True
        return "Unexpected error"

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        # The input handle must be closed before the same path is reopened
        # for writing below.
        with open(filename, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
                # Only the first page is converted, as in the original.
                break
        # Read after the loop so a PDF without pages no longer raises
        # NameError on an unassigned ``data``.
        data = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    with open(filename, 'w') as upload_file:
        upload_file.write(data)
    return filename
def convert_pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return it as a string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The HTML markup as ``str``.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # maxpages=0 and an empty page set are the values used by the
            # original to request all pages.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        raw = retstr.getvalue()
    finally:
        retstr.close()

    # ``str(bytes)`` on Python 3 yields the ``b'...'`` repr, not the
    # document. Decode instead; where ``bytes`` is already ``str`` the
    # value is returned unchanged.
    if isinstance(raw, str):
        return raw
    return raw.decode(codec)
def pdf2html(pdfPath, htmlPath):
    '''Convert a PDF to an HTML file, modelled on the pdf2txt tool.

    Args:
        pdfPath: Path of the PDF file to read.
        htmlPath: Path of the HTML file to write (UTF-8, encoding errors
            ignored).
    '''
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # Was misspelled 'noraml' in the original.
    layoutmode = 'normal'
    laparams = LAParams()

    with io.open(htmlPath, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, scale=1,
                               layoutmode=layoutmode, laparams=laparams,
                               outdir=None, debug=False)
        try:
            with io.open(pdfPath, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
        finally:
            # The original never closed the converter; closing it while the
            # output file is still open lets it finish its output.
            device.close()
def convert_pdf_to_html(self, fname, pages=None, skip_first=True):
    """Convert the PDF at ``fname`` to HTML and return the raw output.

    The index of every converted page is printed to stdout as progress.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; a falsy value means an empty set.
        skip_first: When true the first TWO pages yielded by
            ``PDFPage.get_pages`` (enumeration indexes 0 and 1) are
            skipped.

    Returns:
        The contents of the in-memory byte buffer the converter wrote to,
        captured before the converter is closed.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    output = io.BytesIO()
    converter = HTMLConverter(manager, output, codec='utf-8',
                              laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager releases the PDF handle even when a page
        # fails to convert.
        with open(fname, 'rb') as infile:
            print('Processing Page # :', end=' ')
            selected = PDFPage.get_pages(infile, pagenums, caching=True,
                                         check_extractable=True)
            for i, page in enumerate(selected):
                if skip_first and i in (0, 1):
                    continue
                print(i, end=',')
                interpreter.process_page(page)
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    return convertedPDF
def convertPDFToHTMLPage(bookPath):
    """Convert the PDF at ``bookPath`` to an HTML file next to it.

    The output path is ``bookPath`` with its extension replaced by
    ``.html``; the path is printed once the file has been written.

    Args:
        bookPath: Path of the PDF file to read.
    """
    import os

    # Replace only the real extension. ``str.replace('.pdf', '.html')``
    # rewrote every occurrence in the path, and for names not containing
    # '.pdf' it left the path unchanged, so the input file was truncated
    # when opened for writing.
    outfile = os.path.splitext(bookPath)[0] + '.html'

    rsrcmgr = PDFResourceManager()
    rotation = 0
    # Binary mode: the converter is given a codec and therefore emits
    # encoded bytes. ``open`` replaces the Python 2-only ``file`` builtin.
    with open(outfile, 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                               layoutmode='normal', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(bookPath, 'rb') as fp:
                for page in PDFPage.get_pages(fp, password="",
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the converter while the output file is still open.
            device.close()

    # The two spaces reproduce the output of the original print statement.
    print("HTML output written to :  %s" % outfile)
def pdf_to_string(path, format='xml', password=''):
    """Convert the PDF at ``path`` and return the output as a string.

    Args:
        path: Path of the PDF file to read.
        format: Output flavour: ``'text'``, ``'html'`` or ``'xml'``.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The converter output decoded as UTF-8.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    resources = PDFResourceManager()
    buffer = BytesIO()
    layout = LAParams()

    # Pick the converter class first, then build it in one place.
    converter_cls = None
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    if converter_cls is None:
        raise ValueError('provide format, either text, html or xml!')
    converter = converter_cls(resources, buffer, laparams=layout)

    pdf_handle = open(path, 'rb')
    interpreter = PDFPageInterpreter(resources, converter)
    page_iter = PDFPage.get_pages(
        pdf_handle,
        set(),
        maxpages=0,
        password=password,
        caching=True,
        check_extractable=True,
    )
    for pdf_page in page_iter:
        interpreter.process_page(pdf_page)
    pdf_handle.close()
    converter.close()

    result = buffer.getvalue().decode("utf-8")
    buffer.close()
    return result
def pdfTotxt(filepath, outpath):
    """Convert the PDF at ``filepath`` to HTML written to ``outpath``.

    Any exception raised during the conversion is caught and reported on
    stdout; nothing is re-raised.

    Args:
        filepath: Path of the PDF file to read.
        outpath: Path of the output file.
    """
    try:
        # Context managers close both files even when conversion fails.
        # The output is binary because the converter is given a codec and
        # therefore emits encoded bytes.
        with open(filepath, 'rb') as fp, open(outpath, 'wb') as outfp:
            # Resource manager that stores shared resources;
            # caching=False disables caching.
            rsrcmgr = PDFResourceManager(caching=False)
            # Device object that renders the pages.
            laparams = LAParams()
            device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=laparams, imagewriter=None)
            try:
                # Interpreter that feeds pages to the device.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0,
                                              password='', caching=False,
                                              check_extractable=True):
                    page.rotate = page.rotate % 360
                    interpreter.process_page(page)
            finally:
                # Close the converter while the output file is still open.
                device.close()
    except Exception as e:
        # The original printed the literal "%s" followed by the exception;
        # interpolate it into the message instead.
        print("Exception:%s" % e)
def parse(self):
    """Convert ``self.filename`` to HTML and return it parsed.

    The open PDF handle, resource manager and converter are kept on the
    instance as ``self.fp``, ``self.rsrcmgr`` and ``self.device``. The
    options ``caching``, ``codec``, ``scale``, ``layoutmode``,
    ``laparams``, ``outdir``, ``pagenos``, ``maxpages`` and ``password``
    are looked up outside this method.

    Returns:
        A ``BeautifulSoup.BeautifulSoup`` built from everything read back
        from ``self.outfp``.
    """
    self.fp = open(self.filename, 'rb')
    self.rsrcmgr = PDFResourceManager(caching=caching)

    converter_options = dict(codec=codec, scale=scale, layoutmode=layoutmode,
                             laparams=laparams, outdir=outdir)
    self.device = HTMLConverter(self.rsrcmgr, self.outfp, **converter_options)

    process_pdf(
        self.rsrcmgr,
        self.device,
        self.fp,
        pagenos,
        maxpages=maxpages,
        password=password,
        caching=caching,
        check_extractable=True,
    )

    # Rewind the output so the generated markup can be read back in full.
    self.outfp.seek(0)
    markup = "".join(self.outfp.readlines())
    return BeautifulSoup.BeautifulSoup(markup)
def update_model(fp):
    """Render a PDF to HTML in ``cv_analyzer/resume.txt`` and reload the model.

    Args:
        fp: Open file object of the PDF, handed to ``PDFParser``.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Parser bound to the file object, and the document structure on top.
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Abort when the document forbids text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources and layout-analysis parameters. The PDFDevice,
    # PDFPageAggregator and StringIO objects of the original were discarded
    # before use and have been removed.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # NOTE(review): pdfminer's ImageWriter takes an output directory;
    # confirm that 'image.jpg' is the intended value.
    imagewriter = ImageWriter('image.jpg')

    with open('cv_analyzer/resume.txt', 'wb') as outfp:
        device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                               layoutmode='normal', laparams=laparams,
                               pagemargin=0, fontscale=1.0, debug=0,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Convert every page of the document to HTML.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            # Close the converter while the output file is still open.
            device.close()

    read_model()
def readText(self, path, outtype='text', opts=None):
    """Convert the PDF at ``path`` to HTML and print it to stdout.

    Args:
        path: Path of the PDF file to read.
        outtype: Default output type; only used to derive the default
            output file name, which is itself unused.
        opts: pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping of flag to value.
            Only ``-p``, ``-m``, ``-P``, ``-c`` and the layout flags
            (``-n``, ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``) affect
            the conversion. ``-d``, ``-o``, ``-Y``, ``-O``, ``-t`` and
            ``-s`` are parsed but have no further effect.

    Returns:
        None. The layout parameters and the generated markup are printed.
    """
    # Accept both option shapes. Iterating a dict directly yields only its
    # keys, which the ``(k, v)`` unpacking below would split per character.
    if opts is None:
        opts = ()
    elif isinstance(opts, dict):
        opts = opts.items()

    outfile = path[:-3] + outtype
    pagenos = set()
    maxpages = 0
    # ``debug`` and ``password`` were read before ever being assigned.
    debug = 0
    password = ''
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    print(laparams)

    # The debug level comes from the instance, not from the -d option.
    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        # ``open`` replaces the Python 2-only ``file`` builtin; the context
        # manager closes the handle even if conversion fails.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, check_extractable=True)
        device.close()
        print(outfp.getvalue())
    finally:
        outfp.close()
    return
def lerPDF(arquivo):
    """Convert the PDF readable from ``arquivo`` to HTML and return it.

    Args:
        arquivo: Open PDF file object handed to ``process_pdf``.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    resources = PDFResourceManager()
    sink = StringIO()
    layout = LAParams()
    converter = HTMLConverter(resources, sink, laparams=layout)

    process_pdf(resources, converter, arquivo)
    converter.close()

    # Copy the markup out before the buffer is released.
    html = sink.getvalue()
    sink.close()
    return html