def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
    """
    Return the plain text of ``self.filename`` decoded from UTF-8.

    Page selection (page numbers are handed to ``PDFPage.get_pages``
    unchanged):

    * both borders ``-1`` (default): every page is converted;
    * ``lowerBorder > -1`` and ``upperBorder == -1``: only the page
      ``lowerBorder`` is converted;
    * ``lowerBorder > -1`` and ``upperBorder == "max"``: every page from
      ``lowerBorder`` to the end of the document is converted;
    * both ``> -1`` and ``lowerBorder <= upperBorder``: the inclusive range
      ``lowerBorder..upperBorder`` is converted.

    Raises:
        ValueError: for any other combination of the two borders.
    """
    # Validate the arguments before any file is opened so that an illegal
    # call cannot leak a file handle.
    from_lower_to_end = lowerBorder > -1 and upperBorder == "max"
    if (lowerBorder == -1 and upperBorder == -1) or from_lower_to_end:
        pagenos = set()
    elif lowerBorder > -1 and upperBorder == -1:
        # extract only a single page
        pagenos = set([lowerBorder])
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(self.filename, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                  caching=True, check_extractable=True)
        for pageno, page in enumerate(pages):
            # In "max" mode no page filter is passed to pdfminer, so the
            # leading pages are skipped here instead.
            if from_lower_to_end and pageno < lowerBorder:
                continue
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
def convert(url, pages=None):
    """Download the PDF at *url* and return its text extracted by pdfminer.

    Args:
        url: address of the PDF document (a string).
        pages: optional list of page numbers passed to
            ``PDFPage.get_pages``; ``None`` passes an empty set.

    Returns:
        The contents of the converter's output buffer (codec 'utf-8').
    """
    # The asserts are kept so callers still see AssertionError on bad input.
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())

    # Read the whole response into memory and release the connection at once.
    web_page = urllib2.urlopen(urllib2.Request(url))
    try:
        fp = StringIO(web_page.read())
    finally:
        web_page.close()

    try:
        interpreter = PDFPageInterpreter(rscmng, device)
        pdf_pages = PDFPage.get_pages(
            fp,
            set(pages if pages is not None else []),
            maxpages=0,
            password='',
            caching=True,
            check_extractable=True
        )
        for page in pdf_pages:
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
def parse(self, path):
    """Extract the text of the PDF file at *path* and wrap it in a Sample.

    Args:
        path: path to a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where *text* is the converter output.

    Raises:
        NotImplementedError: if *path* is a directory.
    """
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()

    # File
    out = StringIO.StringIO()
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    rsrc = PDFResourceManager()
    device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)

    # PDFs are binary: open in 'rb' (the text-mode default corrupts the
    # stream on platforms that translate newlines) and always close it.
    fp = open(path, 'rb')
    try:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()

    sample = Sample(path, None, out.getvalue())
    out.close()
    return sample
def convert(fname, pages=None):
    """Return the text of the PDF file *fname* extracted with pdfminer.

    Args:
        fname: path of the PDF file.
        pages: optional iterable of page numbers passed to
            ``PDFPage.get_pages``; a falsy value passes an empty set.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        # Release the converter even when a page fails to parse.
        converter.close()
    text = output.getvalue()
    output.close()
    return text
def get_pdf_text(path):
    """
    Read the PDF file at *path* and return the ``pages`` dict that is handed
    to the TextConverter.

    Per the original note, the dict holds the text with the index
    representing the page number; it is filled by the converter, not here
    (presumably a TextConverter variant that accepts ``pages`` - verify).

    http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        # Close the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return pages
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function which returns text for parsing from
    PDF.

    path = The path to the file

    Returns the extracted text, or an empty string when anything goes wrong
    (the failure is reported through ``self.logger``).
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
        return text
    except Exception as e:
        # Log BEFORE returning; the original returned first, which made the
        # log call unreachable.
        self.logger.error(
            "Failed to PDF to text: " + str(e))
        return ""
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at *path*, write it to *output*, return it.

    Args:
        path: path of the PDF file to read.
        output: path of the file the text is written to (opened in 'wb').

    Returns:
        The extracted text (converter output, codec 'utf-8').
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # The context manager closes the output file even if the write fails.
    with open(output, 'wb') as f:
        f.write(text)
    return text
def convert_pdf_to_txt(path):
    """Return the text of the PDF file at *path* extracted with pdfminer.

    TAKEN FROM STACK OVERFLOW
    see... http://www.unixuser.org/~euske/python/pdfminer/programming.html for tutorial
    Also see... https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            # Read text from pages
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Named ``text`` rather than ``str`` so the builtin is not shadowed.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def getTexts(self):
    """Extract the text of ``self.fname`` into ``self.text``.

    The text is written to the scratch file 'temppdf.txt' in the current
    directory by ``process_pdf`` and read back from there.

    Returns:
        The string "ok" on success, otherwise the exception object that was
        raised (this method never raises an ``Exception`` itself).
    """
    tmp_name = 'temppdf.txt'
    try:
        try:
            rsrcmgr = PDFResourceManager(caching=True)
            with open(tmp_name, 'w') as outfp:
                device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                       laparams=LAParams())
                try:
                    with open(self.fname, 'rb') as fp:
                        process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                                    password='', caching=True,
                                    check_extractable=True)
                finally:
                    device.close()
            with open(tmp_name, 'rb') as infp:
                self.text = infp.read()
        finally:
            # Remove the scratch file on failure as well as on success.
            if os.path.exists(tmp_name):
                os.remove(tmp_name)
        return "ok"
    except Exception as e:
        return e
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path*; ``maxpages=120`` is passed to
    pdfminer to cap the number of pages processed.

    Closing the handles is best-effort: a failure while closing never
    prevents the extracted text from being returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=120, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        # Read the buffer before anything is closed so a close() failure
        # cannot lose the result.
        text = retstr.getvalue()
    finally:
        for handle in (fp, device, retstr):
            try:
                handle.close()
            except Exception:
                # Deliberately ignored: the original swallowed close errors.
                pass
    return text
def convert_pdf(path='provide path here', format='text', codec='utf-8'):
    """Extract the bullet points of a PDF and save them as JSON in a .docx.

    The text following each '•' character is collected, dumped as pretty
    JSON under the key 'bulletins', written to 'json_data.docx' in the
    current directory and, where the platform supports ``os.startfile``,
    opened.

    Args:
        path: path of the PDF file.
        format: only 'text' is supported.
        codec: codec given to the converter and used to decode its output.

    Returns:
        An empty string.

    Raises:
        ValueError: if *format* is not 'text'.
    """
    if format != 'text':
        raise ValueError('Please provide the format to extract')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    # mention the maximum pages here (Note: Large number of pages will
    # decrease the performance.)
    maxpages = 500
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter wrote with, not the default.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    # '•([^•]*)' yields the same matches as the original '•([^•]+)*'
    # without the nested quantifier.
    list_of_bullet_points = re.findall('•([^•]*)', text)
    # creates a pretty json with the data extracted
    final_data = json.dumps({'bulletins': list_of_bullet_points},
                            indent=4, sort_keys=True)

    document = Document()  # creates a new document
    document.add_heading('Bulletins data in the PDF')
    document.add_paragraph(final_data)
    document.save('json_data.docx')  # saves it to the filesystem
    # os.startfile only exists on Windows; skip it elsewhere.
    if hasattr(os, 'startfile'):
        os.startfile("json_data.docx")  # will open the file
    return ''
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf
    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(res_manager, device)
    try:
        # opening a pdf file with 'rb' mode for reading binary files
        with open(pdf, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        # finishing up, also when a page fails to parse
        device.close()
    text = strio.getvalue()
    strio.close()
    return text
def run(path):
    """Parse the PDF at *path* into a Book holding one Page per PDF page.

    Each ``Page.text`` is the text the converter emitted for that page with
    its final character dropped (presumably the page separator the
    converter appends - verify). Timing information is printed to stdout.
    """
    print("Calling parser :%s" % path)
    t0 = time.clock()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    book = Book()
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Offset in the shared buffer at which the current page starts.
        begin_page = 0
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            page_tmp = Page()
            interpreter.process_page(page)
            # Copy the buffer once per page and reuse its length as the
            # next page's start offset.
            so_far = retstr.getvalue()
            page_tmp.text = so_far[begin_page:-1]
            begin_page = len(so_far)
            book.pages.append(page_tmp)
    finally:
        fp.close()
        device.close()
        retstr.close()
    print("Parsing in: %s" % (time.clock() - t0))
    return book
def get_text(path):
    """Return the text for the document *path* (given without extension).

    If ``path + '.txt'`` exists its contents are returned. Otherwise
    ``path + '.pdf'`` is converted with pdfminer, the result is stored via
    ``write_text(path + '.txt', text)`` and returned.
    """
    txt_path = path + '.txt'
    if os.path.isfile(txt_path):
        with open(txt_path) as cached:
            return cached.read()

    pdf_path = path + '.pdf'
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(pdf_path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    # ``text`` rather than ``str`` so the builtin is not shadowed.
    text = retstr.getvalue()
    retstr.close()
    write_text(txt_path, text)
    return text
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library.

    *path* is the PDF file to read; the converter output (codec "utf-8") is
    returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8",
                           laparams=LAParams())
    try:
        with open(path, "rb") as file_handle:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                file_handle,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True
            ):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Runs after the return value is computed; releases the handles on
        # both the success and the failure path.
        device.close()
        retstr.close()
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at *url* and return its text.

    Args:
        url: address of the PDF document.
        maxpages: passed through to ``PDFPage.get_pages``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())

    # Open the url provided as an argument to the function, read the content
    # and close the connection straight away.
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        f = response.read()
    finally:
        response.close()

    # Cast to StringIO object
    fp = StringIO(f)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
def pdf_to_text(pdfname):
    """Return the text of the PDF file *pdfname* (converter codec 'ascii')."""
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # codec = 'utf-8'
    codec = 'ascii'
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # Extract text
        with open(pdfname, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Get text from StringIO
        text = sio.getvalue()
    finally:
        # Cleanup, also when a page fails to parse
        device.close()
        sio.close()
    return text
def edit_file(fname, pages=None):
    """Render an HTML edit form pre-filled with the contents of an upload.

    Redirects to 'index' when 'log_in' is not in the session. Files whose
    extension is 'pdf' are converted to text with pdfminer; any other file
    is inserted as read from disk.

    Args:
        fname: file name relative to ``app.config['UPLOAD_FOLDER']``.
        pages: optional iterable of page numbers for PDF files.

    Returns:
        The HTML page as a string (or a redirect response).
    """
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    if 'log_in' not in session:
        return redirect(url_for('index'))

    # NOTE(review): ``fname`` is joined to the upload folder unchecked;
    # confirm the route cannot be used for path traversal.
    filename = fname
    fname = os.path.join(app.config['UPLOAD_FOLDER'], fname)
    # Take the real extension: split('.')[1] picked the wrong component for
    # names with several dots and raised IndexError for names with none.
    exten = os.path.splitext(fname)[1][1:]
    print(exten)

    if exten != 'pdf':
        with open(fname, 'rb') as raw:
            text = raw.read()
    else:
        pagenums = set(pages) if pages else set()
        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)
        try:
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
            text = output.getvalue()
        finally:
            converter.close()
            # The original wrote ``output.close`` without calling it.
            output.close()

    print(filename)
    # File content and file name are user-controlled, so both are escaped
    # and the attribute value is quoted before they are put into the markup.
    return ('<!doctype html><title>Edit File</title><h1>Upload new File</h1>'
            '<form action="/save" method=post><p>'
            '<textarea name="contents" rows=30 cols = 150 autofocus>'
            + escape(text, True)
            + '</textarea><br /><input type=hidden name=filename value="'
            + escape(str(filename), True)
            + '"> <input type=submit value=Upload></form></html>')
def convert_pdf_to_txt(path):
    """Convert the PDF at *path* to text and write it next to the PDF.

    The output file has the same name as *path* with its extension replaced
    by ".txt"; that file name is printed. Nothing is returned.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8",
                           laparams=LAParams())
    try:
        with open(path, "rb") as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True
            ):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    outputFile = os.path.splitext(path)[0] + ".txt"
    print(outputFile)
    # The context manager closes the file even if the write fails.
    with open(outputFile, "w") as ff:
        ff.write(text)
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None if the conversion fails.
    '''
    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8',
                               laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcMgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        text = retStr.getvalue()
        retStr.close()
        return text
    except Exception:
        # ``Exception`` instead of a bare except: KeyboardInterrupt and
        # SystemExit must not be turned into a silent None.
        return None
def pdf_to_txt(path):
    """converts pdf into a string

    Form feed characters ('\\x0c') are removed from the result.

    @param path: path to the file
    @type path: string
    @return: pdf content
    @rtype: string"""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # Close the handles even when a page fails to parse.
        fp.close()
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.replace('\x0c', '')
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())
    fp = open(pdf_filename, 'rb')
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)
    finally:
        # Close the handles even when a page fails to parse.
        fp.close()
        device.close()
    extracted_text = return_string.getvalue()
    return_string.close()
    return extracted_text
def pdfconvert(infullpath, file, outfullpath, pages=None):
    """Write the text of a PDF to a .txt file and render it to a .jpg.

    Args:
        infullpath: path of the PDF to read.
        file: not used by this function (kept for interface compatibility).
        outfullpath: output path; its extension is replaced by '.txt' for
            the text and by '.jpg' for the image.
        pages: optional iterable of page numbers to extract.

    The image is produced by running the external ``convert`` command.
    """
    import subprocess

    # Handle PDF
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote ``output.close`` without calling it.
        output.close()

    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    txtfile = base + '.txt'
    # NOTE(review): the original called string.replace(txtfile, ...) three
    # times for ' ', '(' and ')' but discarded the results, so the name was
    # never changed. The dead calls are removed and the effective file name
    # kept; confirm whether the sanitising was actually wanted.
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list (no shell) keeps quotes or shell metacharacters in
    # the paths from being interpreted.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # Best-effort as before: os.system also ignored a missing or
        # failing ``convert`` executable.
        pass
def pdf_to_txt(fichero_pdf, fichero_txt):
    """Convert the PDF *fichero_pdf* to text and write it to *fichero_txt*.

    Returns 1 once the conversion has completed.
    """
    # Settings for reading the PDF
    password = ''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # Create the resource manager
    rsrcmgr = PDFResourceManager(caching=caching)

    # Create the output file and bind the converting device to it
    with open(fichero_txt, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
        try:
            # Interpret every page of the PDF through the device
            with open(fichero_pdf, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device even when a page fails to parse; the files
            # are closed by their context managers.
            device.close()
    return 1
def __convert(self, ifile, ofile=None):
    """Convert the PDF *ifile* to text using the options stored on ``self``.

    Args:
        ifile: path of the PDF to read.
        ofile: optional output path. When given, the text is written there
            (binary mode) and None is returned.

    Returns:
        The extracted text when *ofile* is None, otherwise None.

    ``PDFException`` and ``MemoryError`` raised while processing pages are
    reported on stdout and whatever was extracted so far is kept.
    """
    fp = open(ifile, 'rb')
    outfp = None
    device = None
    try:
        if ofile is None:
            outfp = StringIO.StringIO()
        else:
            outfp = open(ofile, 'wb')

        rsrcmgr = PDFResourceManager(caching=self.caching)
        device = TextConverter(rsrcmgr, outfp, codec=self.codec,
                               laparams=self.laparams,
                               imagewriter=self.imagewriter)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(fp, self.pagenos,
                                          maxpages=self.maxpages,
                                          password=self.password,
                                          caching=self.caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + self.rotation) % 360
                interpreter.process_page(page)
        except (PDFException, MemoryError) as e:
            print("Could not extract text {0}".format(e))

        return outfp.getvalue() if ofile is None else None
    finally:
        # Release every handle that was opened, whatever went wrong above.
        fp.close()
        if device is not None:
            device.close()
        if outfp is not None:
            outfp.close()
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """Return the text of the PDF at *path* decoded from UTF-8.

    With both borders left at -1 every page is converted. Otherwise both
    must be given and the inclusive page range ``lowerBorder..upperBorder``
    is passed to ``PDFPage.get_pages``.

    Raises:
        ValueError: if only one border is given or lowerBorder > upperBorder.
    """
    # Validate before opening the file so an illegal call cannot leak it.
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
def pdfconvert(infullpath, file, infolder, pages=None):
    """Write the text of a PDF into the corpus folder and render it to JPEG.

    Args:
        infullpath: path of the PDF to read.
        file: base name used for both output files.
        infolder: prefix the '.jpg' path is built from.
        pages: optional iterable of page numbers to extract.

    Returns:
        The path of the JPEG passed to the external ``convert`` command.

    The text file is written to
    ``corpuspath + corpusfolder + '/' + file + '.txt'`` (both names are
    module-level values defined outside this function).
    """
    import subprocess

    # Handle PDF
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote ``output.close`` without calling it.
        output.close()

    txtfilename = file
    jpgfile = infolder + str(txtfilename) + '.jpg'
    txtfile = corpuspath + corpusfolder + '/' + txtfilename + '.txt'
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list (no shell) keeps quotes or shell metacharacters in
    # the paths from being interpreted.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # Best-effort as before: os.system also ignored a missing or
        # failing ``convert`` executable.
        pass
    return jpgfile
def pdf_to_text(pdf):
    """Return the text of a PDF as bytes (converter codec 'utf-8').

    Args:
        pdf: either a path (``str``), which is opened in binary mode, or an
            already open binary file object.

    The file object is closed before returning, including one supplied by
    the caller. The result is also printed to stdout.
    """
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(pdf, 'rb') if isinstance(pdf, str) else pdf
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        result = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    print(result)
    return result
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text

    @path: file path to .pdf document

    from: http://stackoverflow.com/questions/26494211/
    extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return text
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path*.

    The document is initialised with an empty password and every page it
    reports is rendered through a TextConverter into an in-memory buffer.
    """
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_stream:
        pdf_parser = PDFParser(pdf_stream)
        document = PDFDocument(caching=True)
        # Parser and document have to be linked in both directions.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(manager, converter)
        # Process each page contained in the document.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = buffer.getvalue()

    converter.close()
    buffer.close()
    return extracted
def extract_text_from_pdf(pdf_path):
    '''
    Read the PDF file at pdf_path into a string.

    Returns the extracted text, or None when no text was extracted.
    '''
    buffer = io.StringIO()
    manager = PDFResourceManager()
    device = TextConverter(manager, buffer)
    interpreter = PDFPageInterpreter(manager, device)

    with open(pdf_path, 'rb') as pdf_stream:
        pdf_pages = PDFPage.get_pages(pdf_stream,
                                      caching=True,
                                      check_extractable=True)
        for pdf_page in pdf_pages:
            interpreter.process_page(pdf_page)
        extracted = buffer.getvalue()

    # close open handles
    device.close()
    buffer.close()

    # An empty result is reported as None rather than ''.
    return extracted if extracted else None
def convert_pdf_to_txt(r, max_pages=3):
    """Extract text from the PDF carried by the HTTP response *r*.

    Args:
        r: response object exposing ``status_code``, ``encoding`` and
            ``content_big()``.
        max_pages: passed to ``PDFPage.get_pages`` as ``maxpages``.

    Returns:
        The converter output read from a BytesIO buffer, or None when the
        response status is not 200.
    """
    # Bail out before allocating any pdfminer objects.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(
            r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = StringIO(r.content_big())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, set(), maxpages=max_pages, password="",
                                  caching=True, check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return text
def parse_pdf(fname, outfile):
    """Convert the PDF *fname* to text and write it to *outfile* (UTF-8).

    Nothing is returned.
    """
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    encoding = 'utf-8'
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers close both files even when a page fails to parse.
    with open(outfile, 'w', encoding=encoding) as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=LAParams(),
                               imagewriter=None)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return
def convert_pdf_to_txt(path):
    """Return the text of the PDF file at *path* extracted with pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        # ``pdf_file`` holds the open handle; the original called it
        # ``filepath``.
        with open(path, 'rb') as pdf_file:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return text
def convert_pdf_to_text(self, fp=None):
    """Return the text of the PDF whose path is stored in ``self.a``.

    NOTE(review): the ``fp`` argument is accepted but never used - the file
    named by ``self.a`` is always opened. That behaviour is kept unchanged;
    confirm whether callers expect a supplied ``fp`` to be honoured.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        with open(self.a, 'rb') as pdf_file:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        retstr.close()
    return text
def convert_pdf_to_txt(file):
    """Convert a PDF to text and save it as '<name>.txt' in the current
    directory.

    Args:
        file: path of the PDF file.

    Returns:
        The name of the text file that was written.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(file, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Derive the text file name from the PDF's base name. The original only
    # assigned it when the extension was exactly ".pdf" and crashed with an
    # unbound name otherwise (e.g. ".PDF").
    pdfFileName = os.path.basename(file)
    txtFileName = os.path.splitext(pdfFileName)[0] + ".txt"

    # write text into txtFileName and save to current directory
    with open(txtFileName, "w+") as f:
        f.write(text)
    return txtFileName
def PDF_TO_TEXT(path):
    """Return the text of the PDF file at *path* extracted with pdfminer."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    fp = open(path, 'rb')
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Read through pages in PDF
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Runs after the return value is computed; closes everything on the
        # success and the failure path alike.
        fp.close()
        device.close()
        retstr.close()
def extract_text_from_pdf(pdf_path):
    """
    Generator helper that extracts the plain text from a .pdf file.

    :param pdf_path: path to PDF file to be extracted
    :return: iterator yielding one string of extracted text per page
    """
    with open(pdf_path, 'rb') as pdf_stream:
        pdf_pages = PDFPage.get_pages(pdf_stream,
                                      caching=True,
                                      check_extractable=True)
        for pdf_page in pdf_pages:
            # Every page gets its own manager, buffer and converter.
            manager = PDFResourceManager()
            buffer = io.StringIO()
            device = TextConverter(manager, buffer)
            PDFPageInterpreter(manager, device).process_page(pdf_page)

            yield buffer.getvalue()

            # close open handles once the consumer asks for the next page
            device.close()
            buffer.close()
def convert(fname, pages=None):
    """Return the text of the PDF file *fname*.

    *pages* is an optional iterable of page numbers passed to
    ``PDFPage.get_pages``; a falsy value passes an empty set.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    try:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    finally:
        infile.close()
        converter.close()
    text = output.getvalue()
    # The original wrote ``output.close`` without calling it.
    output.close()
    return text
def pdf_to_text(fname, pages=None):
    """Return the text of the PDF file *fname* as a list of lines.

    Args:
        fname: path of the PDF file.
        pages: optional iterable of page numbers; a falsy value passes an
            empty set to ``PDFPage.get_pages``.

    Returns:
        The extracted text split on '\\n' (line endings removed).
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    output.close()

    # str.split already returns a new list; no copy loop is needed.
    return text.split('\n')
def pdfparser(filename):
    """Return the text of the PDF *filename* as read from a BytesIO buffer
    (converter codec 'utf-8')."""
    manager = PDFResourceManager()
    output = io.BytesIO()
    converter = TextConverter(manager, output, codec='utf-8',
                              laparams=LAParams())
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        with open(filename, 'rb') as pdf_file:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(pdf_file):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The buffer was never closed in the original.
        output.close()
    return text
def convert(fname, pages=None, output_path="Resume.txt"):
    """Extract the text of the PDF *fname*, save it to a file and return it.

    Args:
        fname: path of the PDF file.
        pages: optional iterable of page numbers; a falsy value passes an
            empty set to ``PDFPage.get_pages``.
        output_path: file the text is written to (UTF-8). Defaults to
            "Resume.txt", the location that used to be hard-coded.

    Returns:
        The extracted text.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote ``output.close`` without calling it.
        output.close()

    with open(output_path, "w", encoding="utf-8") as text_file:
        text_file.write(text)
    return text
def convert_pdf_to_txt(path):
    """Return the text of the PDF file at *path* extracted with pdfminer."""
    pdf_rsc_manager = PDFResourceManager()
    str_io = StringIO()
    device = TextConverter(pdf_rsc_manager, str_io, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as pdf_file:
            interpreter = PDFPageInterpreter(pdf_rsc_manager, device)
            for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = str_io.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        str_io.close()
    return text
def readPDF(pdffile):
    """Return the text of *pdffile* with its non-empty lines joined by '<p>'.

    The result is also printed to stdout.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        with open(pdffile, 'rb') as fd:
            process_pdf(rsrcmgr, device, fd)
        content = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Build a new list instead of removing items while iterating: the
    # original skipped the element after every removal, so runs of blank
    # lines survived.
    lines = [line for line in str(content).split('\n') if line != '']
    # Join with '<p>' directly. The original joined with '===' and then
    # substituted it, which also rewrote any '===' inside the text itself.
    strs = "<p>".join(lines)
    print(strs)
    return strs

# Example:
# pdffile='d:/33.pdf'
# readPDF(pdffile)
def convert_pdf_to_txt(path):
    """Return the text of the PDF at *path* as read from a BytesIO buffer
    (converter codec 'utf-8')."""
    resourceManager = PDFResourceManager()
    returnstream = BytesIO()
    device = TextConverter(resourceManager, returnstream, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = returnstream.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        device.close()
        returnstream.close()
    return text
def _convert_pdf_to_txt(self, pdf_path, page_list, codec='utf-8', password="",
                        maxpages=0, caching=True):
    """Extract text from a PDF file.

    Args:
        pdf_path (str): Path of the PDF to process.
        page_list: Indices into the list of pages returned by
            ``PDFPage.get_pages``; the selected pages are processed in the
            given order. ``None`` processes every page.
        codec (str): Forwarded to ``TextConverter``.
        password (str): Forwarded to ``PDFPage.get_pages``.
        maxpages (int): Forwarded to ``PDFPage.get_pages``.
        caching (bool): Forwarded to ``PDFPage.get_pages``.

    Returns:
        tuple: ``(orig_text, content)`` where ``orig_text`` is the raw
        extracted text and ``content`` is the list of its non-blank lines
        with surrounding whitespace stripped.

    Raises:
        IndexError: If ``page_list`` holds an index outside the page list.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(pdf_path, 'rb') as fp:
            # The pages are materialised so that ``page_list`` can index them.
            pages = list(PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                           password=password, caching=caching,
                                           check_extractable=True))
            if page_list is not None:
                pages = [pages[i] for i in page_list]
            for page in pages:
                interpreter.process_page(page)
        orig_text = retstr.getvalue()
    finally:
        # Runs on failure too, so the device and buffer are always released.
        device.close()
        retstr.close()

    stripped_lines = (line.strip() for line in orig_text.split('\n'))
    content = [line for line in stripped_lines if line]
    return orig_text, content
def getPages(self, infile):
    """Convert the PDF ``infile`` to a text file with a ``.txt`` extension.

    The output path is ``infile`` with its extension replaced by ``.txt``;
    it is printed before conversion starts.

    Args:
        infile: Path of the PDF file to convert.

    Returns:
        None.
    """
    import os

    # Swap only the extension. Replacing every "pdf" substring also rewrote
    # directory names, and left the path unchanged for e.g. ".PDF", in which
    # case opening the output for writing truncated the input file.
    outfile = os.path.splitext(infile)[0] + '.txt'
    print(outfile)

    debug = 0
    pagenos = set()
    password = ''
    maxpages = 0
    codec = 'utf-8'  # output encoding
    caching = True
    imagewriter = None
    laparams = LAParams()

    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    with open(outfile, 'w') as outfp:
        # PDF conversion
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(infile, 'rb') as fp:
                # Process the content of every page in the document.
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device before the output file it writes to.
            device.close()
    return
def pdf2txt():
    """Print the start of the text of the PDF named by ``filename``.

    ``filename`` is read from the enclosing scope. Nothing happens unless its
    extension (case-insensitively) is ``pdf``. Only the first 10 pages are
    processed, and only the first 2000 characters of the result are printed.
    """
    filextension = filename.split('.')[-1].lower()
    if filextension != "pdf":
        return

    pdfout = StringIO()
    pdfrm = PDFResourceManager()
    converter = TextConverter(pdfrm, pdfout, laparams=LAParams())
    interpreter = PDFPageInterpreter(pdfrm, converter)
    try:
        with open(filename, 'rb') as infile:
            # Only the first 10 pages are read.
            for page in PDFPage.get_pages(infile, maxpages=10):
                interpreter.process_page(page)
    finally:
        converter.close()

    text = pdfout.getvalue()
    # The original wrote ``pdfout.close`` without parentheses, so the buffer
    # was never actually closed.
    pdfout.close()
    print(text[0:2000])
def convert_pdf_to_txt(path, page_no=0):
    """Return the text of one page of the PDF at ``path``.

    Args:
        path: Path of the PDF file to read.
        page_no: Page number passed to ``PDFPage.get_pages`` as the single
            entry of ``pagenos``. Defaults to 0.

    Returns:
        The text written to the buffer; empty when no page was processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(path, 'rb') as fp:
            # The page selection is fixed when this list is built; the old
            # per-iteration ``page_no += 1`` had no effect and was removed.
            for page in PDFPage.get_pages(fp, pagenos=[page_no],
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the buffer the converter wrote into.
    """
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)

    with open(path, 'rb') as pdf_handle:
        pages = PDFPage.get_pages(pdf_handle, caching=True,
                                  check_extractable=True)
        for pdf_page in pages:
            page_interpreter.process_page(pdf_page)

    extracted = buffer.getvalue()
    converter.close()
    buffer.close()
    return extracted
def extractPDFText(pdfFilePath):
    """Return the text of the PDF, with a single space after each page's text.

    Every page is converted with its own resource manager, buffer and
    converter.
    """
    pageTexts = []
    with open(pdfFilePath, 'rb') as pdfHandle:
        pageIter = PDFPage.get_pages(pdfHandle, caching=True,
                                     check_extractable=True)
        for pdfPage in pageIter:
            manager = PDFResourceManager()
            pageBuffer = io.StringIO()
            device = TextConverter(manager, pageBuffer)
            PDFPageInterpreter(manager, device).process_page(pdfPage)

            # Each page's text is followed by one separating space.
            pageTexts.append(pageBuffer.getvalue() + " ")

            device.close()
            pageBuffer.close()
    return "".join(pageTexts)
def extract_text_by_page(pdf_path):
    '''
    Read a PDF document one page at a time using the pdfminer library.

    :param pdf_path: path of the PDF file to open
    :return: a generator yielding the extracted text of each page in turn
    '''
    with open(pdf_path, 'rb') as pdf_handle:
        page_iter = PDFPage.get_pages(pdf_handle, caching=True,
                                      check_extractable=True)
        for pdf_page in page_iter:
            # A fresh manager, buffer and converter are built for every page.
            manager = PDFResourceManager()
            page_buffer = io.StringIO()
            device = TextConverter(manager, page_buffer)
            PDFPageInterpreter(manager, device).process_page(pdf_page)

            yield page_buffer.getvalue()

            # Executed once the consumer resumes the generator.
            device.close()
            page_buffer.close()
def convert_pdf_to_text(path):
    """Extract the text of the PDF at ``path``, logging instead of raising.

    Failing to open the file still raises. Any exception raised while
    converting is logged and swallowed.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The extracted text, or ``None`` when conversion raised.
    """
    filename = os.path.basename(path)
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())

    # Opening stays outside the ``try`` so a missing or unreadable file is
    # still reported to the caller, as before.
    with open(path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resource_manager, device)
        try:
            log.info(f'Converting {filename}')
            for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=False):
                interpreter.process_page(page)
            return return_string.getvalue()
        except Exception as ex:
            # Deliberate best effort: report the failure and return None.
            log.error(f'Exception of type {type(ex).__name__} thrown on: {path}')
            return None
        finally:
            # Previously skipped on failure, leaking the device and buffer.
            device.close()
            return_string.close()
def processPDF(file_path):
    """Extract the text of a PDF, reporting failures instead of raising.

    Each page object and the final text are printed as they are produced.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A two-item list ``[error, file_string]``. On success ``error`` is
        ``None`` and ``file_string`` holds the text. On failure ``error`` is
        a non-empty description of the exception and ``file_string`` is
        ``None``.
    """
    # TODO: This method is generally too slow to be useful. Needs a rewrite
    # TODO: (Add PDF as an allowed format in the SQL query when done)
    error = None
    file_string = None
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(file_path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    print(page)
                    interpreter.process_page(page)
            file_string = retstr.getvalue()
            print(file_string)
        finally:
            device.close()
            retstr.close()
    except Exception as e:
        # ``e.message`` does not exist on Python 3 and is empty for IOError
        # on Python 2. Fall back to the type name so the error slot is never
        # an empty (falsy) string.
        error = str(e) or type(e).__name__
        file_string = None
    return [error, file_string]
def convert(fname, pages=None):
    """Extract and return the text of a PDF file.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. ``None`` or an empty iterable results in
            an empty set being passed.

    Returns:
        The extracted text.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin, and the
        # context manager closes the PDF even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()

    text = output.getvalue()
    # The original wrote ``output.close`` without parentheses, so the buffer
    # was never actually closed.
    output.close()
    return text


# print(convert("sample3.pdf"))
def pdf2doc(pdfname):
    """Return the text of a PDF as one block with all whitespace removed.

    Args:
        pdfname: Path of the PDF file to read.

    Returns:
        The extracted text stripped of whitespace, or ``''`` when no file
        name is given or the file cannot be opened.
    """
    # No PDF file name given: return an empty string and stop.
    if not pdfname:
        return ''

    # Open the PDF to process; give up quietly if it cannot be opened.
    # Only open-related errors are caught, so KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        fp = open(pdfname, 'rb')
    except (IOError, OSError, TypeError, ValueError):
        return ''

    try:
        # Resource manager instance
        rsrcmgr = PDFResourceManager()
        # Output destination instance
        outfp = StringIO()
        # Layout parameter instance
        laparams = LAParams()
        # Output vertically written characters laid out horizontally
        laparams.detect_vertical = True
        # Initialise the device
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=laparams)
        # Text-extraction interpreter instance
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Read the target pages and extract text (maxpages=0: all pages).
        for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                      password=None, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

        # Read back all of the extracted text.
        ret = outfp.getvalue()

        # Clean up.
        device.close()
        outfp.close()
    finally:
        # The file handle is now released even when extraction raises.
        fp.close()

    # Strip whitespace and newlines and return the text as a single block.
    return re.sub(r"\s| ", '', ret)
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the string buffer the converter wrote into.
    """
    manager = PDFResourceManager()
    buffer = io.StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    pdf_handle = open(path, 'rb')
    page_interpreter = PDFPageInterpreter(manager, converter)

    page_iter = PDFPage.get_pages(pdf_handle, set(), maxpages=0,
                                  caching=True, check_extractable=True)
    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)

    # The input file and converter are closed before the buffer is read.
    pdf_handle.close()
    converter.close()

    extracted = buffer.getvalue()
    buffer.close()
    return extracted
def main(fname, output_f):
    """Convert the PDF ``fname`` to text written to the file ``output_f``.

    Also sets the ``debug`` attribute of ``PDFDocument``, ``PDFParser``,
    ``CMapDB`` and ``PDFPageInterpreter`` to 0.

    Args:
        fname: Path of the PDF file to read.
        output_f: Path of the text file to create or overwrite.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers release both files even if a page fails to process.
    with open(output_f, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(fname, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device before the output file it writes to.
            device.close()
def getChapterContents(self, chapterLink):
    """Return the text of the requested pages of the PDF at ``self.filePath``.

    Args:
        chapterLink: ``'ENTIRE_BOOK'``, a single page number such as ``'4'``,
            or an inclusive range such as ``'4-9'``.

    Returns:
        The extracted text, or an empty string if reading the PDF failed
        (the failure is logged).

    Raises:
        ValueError: If ``chapterLink`` is not ``'ENTIRE_BOOK'`` and does not
            parse as integers.
    """
    log("PdfEBook: Getting chapter contents for %s" % chapterLink)

    # Create the set of pages that we want.
    # If we want the entire book, then use an empty set.
    pagesRequired = set()
    if chapterLink != 'ENTIRE_BOOK':
        # Check if the pages are a range of pages
        if '-' not in chapterLink:
            pagesRequired.add(int(chapterLink))
        else:
            pageRange = chapterLink.split('-')
            startPage = int(pageRange[0])
            endPage = int(pageRange[1])
            # Inclusive range; empty when startPage > endPage.
            pagesRequired.update(range(startPage, endPage + 1))

    chapterContent = ""
    try:
        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams(), showpageno=False)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``open`` replaces the Python-2-only ``file`` builtin.
            with open(self.filePath, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagesRequired):
                    interpreter.process_page(page)
        finally:
            converter.close()
        chapterContent = output.getvalue()
        # The original wrote ``output.close`` without parentheses, so the
        # buffer was never actually closed.
        output.close()
    except Exception:
        # Best effort: log the failure and return what we have. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        log("PdfEBook: Failed to read contents for %s in pdf %s with error: %s" % (chapterLink, self.filePath, traceback.format_exc()), xbmc.LOGERROR)

    return chapterContent
def pdf_to_text(pdfname):
    """Extract and return the text of the PDF file ``pdfname``.

    Args:
        pdfname: Path of the PDF file to read.

    Returns:
        The contents of the buffer the converter wrote into.
    """
    # Shared by the interpreter and the output device.
    manager = PDFResourceManager()
    # Receives the text produced while interpreting the pages.
    buffer = StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)

    # Feed every page through the interpreter.
    pdf_handle = open(pdfname, 'rb')
    for pdf_page in PDFPage.get_pages(pdf_handle):
        page_interpreter.process_page(pdf_page)
    pdf_handle.close()

    # Read the result back before tearing down the converter and buffer.
    extracted = buffer.getvalue()
    converter.close()
    buffer.close()
    return extracted