def run(path):
    """Parse the PDF at ``path`` into a ``Book`` with one ``Page`` per PDF page.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        A ``Book`` whose ``pages`` list holds one ``Page`` per processed page,
        with the extracted text stored in ``Page.text``.
    """
    print("Calling parser :%s" % path)
    # NOTE: time.clock() is kept from the original; it no longer exists on
    # Python 3.8+, so the timer must be swapped if this module is ported.
    t0 = time.clock()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    book = Book()
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # releases the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                page_tmp = Page()
                # All pages share one buffer, so remember where this page's
                # output starts.
                begin_page = len(retstr.getvalue())
                interpreter.process_page(page)
                # The slice drops the last character written for the page.
                page_tmp.text = retstr.getvalue()[begin_page:-1]
                book.pages.append(page_tmp)
    finally:
        device.close()
        retstr.close()
    print("Parsing in: %s" % (time.clock() - t0,))
    return book
def pdfconvert(infullpath, file, outfullpath, pages=None):
    """Write a PDF's text to a ``.txt`` file and render the PDF to ``.jpg``.

    Args:
        infullpath: Path of the source PDF.
        file: Unused; kept so existing callers keep working.
        outfullpath: Output path; its extension is replaced by ``.txt`` for
            the text and ``.jpg`` for the image.
        pages: Optional iterable of page numbers to extract; a falsy value
            extracts every page.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    # The original called string.replace() on this path and discarded the
    # results, so the path was never sanitised. The no-op calls are removed
    # and the path is left untouched so the file stays where callers expect.
    txtfile = base + '.txt'

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote ``output.close`` without parentheses, which
        # never closed the buffer.
        output.close()

    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list avoids the shell, so quotes or metacharacters in the
    # paths cannot break or hijack the ImageMagick command.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # os.system() only reported a missing binary through its ignored exit
        # status; stay best-effort rather than start raising here.
        pass
def parse(self, path):
    """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

    Args:
        path: Path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is the extracted content.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    rsrc = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
    try:
        # PDFs are binary: the original opened the file in text mode and
        # never closed it.
        with open(path, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            # The parser and document must reference each other before
            # initialize() is called.
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        sample = Sample(path, None, out.getvalue())
    finally:
        device.close()
        out.close()
    return sample
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at ``path``, save it to ``output``, return it.

    Args:
        path: Path of the source PDF.
        output: Path of the file the extracted text is written to (binary
            mode, overwritten).

    Returns:
        The extracted text as produced by the converter (UTF-8 encoded).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    with open(output, 'wb') as f:
        f.write(text)
    return text
def get_pdf_text(path):
    """
    Reads a pdf file and returns a dict of the text where the index represents
    the page number.
    http://stackoverflow.com/a/20905381

    Args:
        path: Path of the PDF to read.

    Returns:
        The ``pages`` dict handed to the converter.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # Change to 'utf-8' if the text comes out garbled.
    codec = 'ascii'
    pages = {}
    # NOTE(review): ``pages=`` is not a stock pdfminer TextConverter argument;
    # presumably this is the customised converter from the linked answer,
    # which fills the dict page by page. Confirm against the import.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with block closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
        retstr.close()
    return pages
def edit_file(fname, pages=None):
    """Render an HTML edit form pre-filled with the content of an uploaded file.

    PDF uploads are converted to text with pdfminer; any other file is read
    verbatim. Users without ``log_in`` in the session are redirected to the
    index page.

    Args:
        fname: Name of the file inside ``app.config['UPLOAD_FOLDER']``.
        pages: Optional iterable of PDF page numbers; falsy means all pages.

    Returns:
        A redirect response, or the HTML form as a string.
    """
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    if 'log_in' not in session.keys():
        return redirect(url_for('index'))

    filename = fname
    fname = os.path.join(app.config['UPLOAD_FOLDER'], fname)
    # splitext looks only at the final suffix; the original split('.')[1]
    # broke on names or folders containing extra dots and raised IndexError
    # when there was no dot at all.
    exten = os.path.splitext(fname)[1][1:]
    print(exten)

    if exten != 'pdf':
        with open(fname, 'rb') as f:
            text = f.read()
    else:
        pagenums = set(pages) if pages else set()
        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
            text = output.getvalue()
        finally:
            converter.close()
            # The original wrote ``output.close`` without parentheses.
            output.close()

    print(filename)
    # File content and name come from an upload, so escape both before
    # embedding them in markup. The browser un-escapes the textarea content,
    # so the submitted text is unchanged.
    return ('<!doctype html><title>Edit File</title><h1>Upload new File</h1>'
            '<form action="/save" method=post><p>'
            '<textarea name="contents" rows=30 cols = 150 autofocus>'
            + escape(text) +
            '</textarea><br /><input type=hidden name=filename value="'
            + escape(str(filename), True) +
            '"> <input type=submit value=Upload></form></html>')
def convert(fname, pages=None):
    """Return the text of the PDF at ``fname``.

    Args:
        fname: Path of the PDF to read.
        pages: Optional iterable of page numbers to extract; a falsy value
            extracts every page.

    Returns:
        The extracted text.
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        output.close()
    return text
def get_text(path):
    """Return the text of ``path + '.pdf'``, cached in ``path + '.txt'``.

    If the ``.txt`` file already exists its content is returned unchanged.
    Otherwise the PDF is converted, the result is stored through
    ``write_text`` and then returned.

    Args:
        path: Document path without extension.

    Returns:
        The cached or freshly extracted text.
    """
    txt_path = path + '.txt'
    if os.path.isfile(txt_path):
        # Close the cache file explicitly instead of leaking the handle.
        with open(txt_path) as cached:
            return cached.read()

    pdf_path = path + '.pdf'
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(pdf_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Named ``text`` so the builtin ``str`` is no longer shadowed.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    write_text(txt_path, text)
    return text
def pdfconvert(infullpath, file, infolder, pages=None):
    """Write a PDF's text into the corpus folder and render the PDF to JPEG.

    Args:
        infullpath: Path of the source PDF.
        file: Base name (no extension) used for both output files.
        infolder: Prefix the ``.jpg`` path is built from.
        pages: Optional iterable of page numbers to extract; a falsy value
            extracts every page.

    Returns:
        The path passed to ImageMagick as the JPEG target.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    jpgfile = infolder + str(file) + '.jpg'
    # corpuspath / corpusfolder are module-level settings.
    txtfile = corpuspath + corpusfolder + '/' + file + '.txt'

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(infullpath, 'rb') as pdffile:
            for page in PDFPage.get_pages(pdffile, pagenums):
                interpreter.process_page(page)
        text = output.getvalue()
    finally:
        converter.close()
        # The original wrote ``output.close`` without parentheses.
        output.close()

    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list avoids the shell, so quotes or metacharacters in the
    # paths cannot break or hijack the ImageMagick command.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # os.system() only reported a missing binary through its ignored exit
        # status; stay best-effort rather than start raising here.
        pass
    return jpgfile
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path``.

    Adapted from a Stack Overflow answer; see
    http://www.unixuser.org/~euske/python/pdfminer/programming.html for the
    pdfminer tutorial and
    https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    for a fuller example.

    Args:
        path: Path of the PDF to read.

    Returns:
        The extracted text (UTF-8 encoded by the converter).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Named ``text`` so the builtin ``str`` is no longer shadowed.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function which returns text for parsing from PDF.

    Args:
        path: The path to the file.

    Returns:
        The extracted text, or ``""`` when extraction fails for any reason
        (the failure is reported through ``self.logger``).
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()
    except Exception as e:
        # Log before returning: in the original the logger call sat after
        # ``return`` and could never run.
        self.logger.error("Failed to PDF to text: " + str(e))
        return ""
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at ``url`` and return its text.

    Args:
        url: Address of the PDF document.
        maxpages: Page limit passed to pdfminer; 0 processes every page.

    Returns:
        The extracted text (UTF-8 encoded by the converter).
    """
    # Fetch the whole document first and close the connection straight away;
    # the original never closed the response.
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        fp = StringIO(response.read())
    finally:
        response.close()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=maxpages, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return text
def pdf_to_text(pdf):
    """Extract the text of a PDF, print it and return it.

    Args:
        pdf: Either a path (``str``) or an already opened binary file object.
            The file object is closed before returning in both cases, as in
            the original implementation.

    Returns:
        The UTF-8 encoded text collected in the ``BytesIO`` buffer.
    """
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = open(pdf, 'rb') if isinstance(pdf, str) else pdf
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, caching=caching,
                                      check_extractable=True):
            # With rotation == 0 this only normalises the angle into 0..359.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        # Release the handle and the device even when a page fails to parse.
        fp.close()
        device.close()
    result = retstr.getvalue()
    print(result)
    return result
def convert_pdf_to_txt(path):
    """Convert the PDF at ``path`` to a ``.txt`` file next to it.

    The output file has the same name as the PDF with the extension replaced
    by ``.txt``; its path is printed before writing.

    Args:
        path: Path of the PDF to convert.
    """
    outputFile = os.path.splitext(path)[0] + ".txt"
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, "rb") as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    print(outputFile)
    with open(outputFile, "w") as ff:
        ff.write(text)
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(pdf_filename, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set()):
                interpreter.process_page(page)
        extracted_text = return_string.getvalue()
    finally:
        device.close()
        return_string.close()
    return extracted_text
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` using the legacy pdfminer API.

    The document is parsed through an explicit ``PDFParser`` /
    ``PDFDocument`` pair, initialised with an empty password.

    Args:
        path: Path of the PDF to read.

    Returns:
        The text gathered from every page of the document.
    """
    resources = PDFResourceManager()
    sink = StringIO()
    converter = TextConverter(resources, sink, laparams=LAParams())
    with open(path, 'rb') as handle:
        pdf_parser = PDFParser(handle)
        document = PDFDocument(caching=True)
        # The parser and document must reference each other before
        # initialize() is called.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        page_interpreter = PDFPageInterpreter(resources, converter)
        # Feed each page of the document through the converter.
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = sink.getvalue()
    converter.close()
    sink.close()
    return extracted
def __convert(self, ifile, ofile=None):
    """Convert the PDF ``ifile`` to text, in memory or into ``ofile``.

    Conversion settings (``caching``, ``codec``, ``laparams``,
    ``imagewriter``, ``pagenos``, ``maxpages``, ``password``, ``rotation``)
    are read from the instance.

    Args:
        ifile: Path of the PDF to read.
        ofile: Optional path of the text file to write. When omitted the text
            is collected in memory.

    Returns:
        The extracted text when ``ofile`` is ``None``, otherwise ``None``.
        A ``PDFException`` or ``MemoryError`` raised while reading pages is
        printed and whatever was extracted so far is kept.
    """
    # The input is opened first, as in the original, so a missing PDF never
    # creates or truncates the output file.
    with open(ifile, 'rb') as fp:
        if ofile is None:
            outfp = StringIO.StringIO()
        else:
            outfp = open(ofile, 'wb')
        try:
            rsrcmgr = PDFResourceManager(caching=self.caching)
            device = TextConverter(rsrcmgr, outfp, codec=self.codec,
                                   laparams=self.laparams,
                                   imagewriter=self.imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                try:
                    for page in PDFPage.get_pages(fp, self.pagenos,
                                                  maxpages=self.maxpages,
                                                  password=self.password,
                                                  caching=self.caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + self.rotation) % 360
                        interpreter.process_page(page)
                except (PDFException, MemoryError) as e:
                    print("Could not extract text {0}".format(e))
            finally:
                device.close()
            return outfp.getvalue() if ofile is None else None
        finally:
            # Runs on every exit path so neither handle leaks on an
            # unexpected error.
            outfp.close()
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from a pdf file.

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- the text extracted from the pdf
    """
    # Initialise the pdfminer objects.
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(res_manager, device)
        # 'rb' because PDFs are binary. open() replaces the Python 2-only
        # file() builtin and the with block closes the handle on errors too.
        with open(pdf, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = strio.getvalue()
    finally:
        device.close()
        strio.close()
    return text
def pdf_to_text(pdfname):
    """Return the text of the PDF at ``pdfname``, encoded with the ASCII codec.

    pdfminer is imported lazily, inside the function.

    Args:
        pdfname: Path of the PDF to read.

    Returns:
        The extracted text.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    # PDFMiner boilerplate.
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'ascii'  # alternative: 'utf-8'
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(pdfname, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        text = sio.getvalue()
    finally:
        device.close()
        sio.close()
    return text
def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
    """
    Returns the plain text of the document ``self.filename``.

    If lowerBorder is an int > -1 and upperBorder is left at -1, only the page
    referring to that number is returned. If both are > -1 and
    upperBorder >= lowerBorder, the pages in that range are returned. If
    upperBorder is the string "max", every page from lowerBorder onwards is
    returned.

    Raises:
        ValueError: For any other combination of borders.
    """
    # Validate before touching the file so an illegal range cannot leak an
    # open handle.
    if ((lowerBorder == -1 and upperBorder == -1)
            or (lowerBorder > -1 and upperBorder == "max")):
        pagenos = set()
    elif lowerBorder > -1 and upperBorder == -1:
        # Extract only a single page.
        pagenos = set([lowerBorder])
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(self.filename, 'rb') as fp:
            pages = PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for pageno, page in enumerate(pages):
                # The open-ended "max" range asks for every page, so the
                # leading pages are skipped here.
                if pageno < lowerBorder and upperBorder == "max":
                    continue
                interpreter.process_page(page)
        s = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return s.decode('utf-8')
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None when the conversion fails for any
    reason (best effort: errors are not reported to the caller).
    '''
    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcMgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            return retStr.getvalue()
        finally:
            device.close()
            retStr.close()
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None
def convert(url, pages=None):
    """Download the PDF at ``url`` and return its text.

    Args:
        url: Address of the PDF (a string).
        pages: Optional list of page numbers to extract; ``None`` extracts
            every page.

    Returns:
        The extracted text (UTF-8 encoded by the converter).
    """
    # The asserts are kept so callers still see AssertionError on bad input.
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    # Read the whole body up front and close the connection even if the read
    # fails.
    web_page = urllib2.urlopen(urllib2.Request(url))
    try:
        fp = StringIO(web_page.read())
    finally:
        web_page.close()

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rscmng, device)
        pdf_pages = PDFPage.get_pages(
            fp,
            set(pages if pages is not None else []),
            maxpages=0,
            password='',
            caching=True,
            check_extractable=True
        )
        for page in pdf_pages:
            interpreter.process_page(page)
        result = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return result
def convert_pdf(path='provide path here', format='text', codec='utf-8'):
    """Extract bullet points from a PDF and save them as JSON in a Word file.

    The text of at most the first 500 pages is searched for passages that
    start with a bullet character. The matches are written, as pretty-printed
    JSON, into ``json_data.docx`` in the current directory, which is then
    opened with ``os.startfile`` (only available on Windows).

    Args:
        path: Path of the PDF to read.
        format: Must be ``'text'``.
        codec: Codec used to encode the text in the converter and to decode
            it afterwards.

    Returns:
        An empty string.

    Raises:
        ValueError: If ``format`` is not ``'text'``.
    """
    # Reject unsupported formats before any resource is allocated.
    if format != 'text':
        raise ValueError('Please provide the format to extract')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # A large page count slows the extraction, hence the 500 page cap.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=500,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter encoded with; the original used
        # the interpreter default, which only matches for UTF-8.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    bulletins_data = re.findall('•([^•]+)*', text)
    json_dict = {'bulletins': list(bulletins_data)}
    # One dumps() call gives the same pretty JSON as the original
    # dumps -> loads -> dumps round trip.
    final_data = json.dumps(json_dict, indent=4, sort_keys=True)

    document = Document()
    document.add_heading('Bulletins data in the PDF')
    document.add_paragraph(str(final_data))
    document.save('json_data.docx')
    os.startfile("json_data.docx")
    return ''
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library.

    Args:
        path: Path of the PDF to read.

    Returns:
        The extracted text (UTF-8 encoded by the converter).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, "rb") as file_handle:
            for page in PDFPage.get_pages(file_handle, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def convert_pdf_to_txt(path):
    """Return the text of at most the first 120 pages of the PDF at ``path``.

    Args:
        path: Path of the PDF to read.

    Returns:
        The extracted text (UTF-8 encoded by the converter). A failure while
        closing the converter does not discard text that was already
        extracted.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # open() replaces the Python 2-only file() builtin; the with block closes
    # the handle even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=120, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    try:
        device.close()
        text = retstr.getvalue()
        retstr.close()
    except Exception:
        # Same best-effort fallback as the original, but no longer a bare
        # ``except`` that would also catch KeyboardInterrupt.
        text = retstr.getvalue()
    return text
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text.

    @path: file path to .pdf document

    from: http://stackoverflow.com/questions/26494211/
          extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057

    Returns the extracted text (UTF-8 encoded by the converter).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def pdf_to_txt(fichero_pdf, fichero_txt):
    """Convert the PDF ``fichero_pdf`` into the text file ``fichero_txt``.

    Args:
        fichero_pdf: Path of the PDF to read.
        fichero_txt: Path of the text file to create or overwrite.

    Returns:
        Always 1.
    """
    # Settings for the PDF being read.
    rotation = 0
    caching = True
    # Set up the resource manager.
    rsrcmgr = PDFResourceManager(caching=caching)
    # Create the output file and attach the device that converts into it.
    # It is opened before the input, as in the original, so the output is
    # created even when the PDF is missing.
    with open(fichero_txt, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Interpret every page of the PDF through the device.
            with open(fichero_pdf, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password='', caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device even when a page fails to parse; the with
            # blocks take care of both files.
            device.close()
    return 1
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """Return the text of the PDF at ``path``, optionally for a page range.

    Args:
        path: Path of the PDF to read.
        lowerBorder: First page number of the range, or -1 for no range.
        upperBorder: Last page number of the range (inclusive), or -1 for no
            range.

    Returns:
        The extracted text decoded from UTF-8.

    Raises:
        ValueError: If only one border is given or lowerBorder > upperBorder.
    """
    # Validate before touching the file so an illegal range cannot leak an
    # open handle.
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        s = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return s.decode('utf-8')
def getTexts(self):
    """Extract the text of ``self.fname`` into ``self.text``.

    The text is written to a temporary file by the converter and read back
    in binary mode.

    Returns:
        The string ``"ok"`` on success, otherwise the exception that was
        raised (it is returned, not re-raised).
    """
    import tempfile

    try:
        caching = True
        rsrcmgr = PDFResourceManager(caching=caching)
        # A unique temporary file replaces the fixed 'temppdf.txt' in the
        # working directory, so concurrent calls cannot overwrite each other
        # and the file is removed on failure too.
        fd, tmp_path = tempfile.mkstemp(suffix='.txt')
        try:
            with os.fdopen(fd, 'w') as outfp:
                device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                       laparams=LAParams())
                try:
                    with open(self.fname, 'rb') as fp:
                        process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                                    password='', caching=caching,
                                    check_extractable=True)
                finally:
                    device.close()
            with open(tmp_path, 'rb') as infp:
                self.text = infp.read()
        finally:
            os.remove(tmp_path)
        return "ok"
    except Exception as e:
        # ``as`` works on Python 2.6+ and 3; the original comma form is
        # Python 2 only.
        return e
def pdf_to_txt(path):
    """converts pdf into a string

    Form feed characters (``\\x0c``) are removed from the result.

    @param path: path to the file
    @type path: string
    @return: pdf content
    @rtype: string"""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin; the with block
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        s = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return s.replace('\x0c', '')
def scrape(filepath: str) -> Dict:
    """Read SOP header fields from the first page of a PDF.

    The first page is converted to text, newlines are removed, and the SOP
    number plus the effective, expiry/review and replaces dates are looked up
    with regular expressions. A field whose pattern does not match keeps the
    value ``SOP()`` gave it.

    Args:
        filepath: Path of the PDF; the extension must be exactly ``.pdf``.

    Returns:
        The ``__dict__`` of the populated ``SOP`` object.

    Raises:
        FileTypeException: If ``filepath`` does not end in ``.pdf``.
    """
    sop = SOP()
    _, file_extension = os.path.splitext(filepath)
    if file_extension != ".pdf":
        raise FileTypeException

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(filepath, "rb") as f:
            # A default for next() keeps a PDF without pages from raising
            # StopIteration; it simply yields no text.
            page = next(
                PDFPage.get_pages(f, set(), maxpages=1, caching=False,
                                  check_extractable=True),
                None)
            if page is not None:
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()

    text = text.replace("\n", "")

    # (attribute, pattern, capture group holding the value)
    fields = (
        ("number", r"SOP\s(\d{1}\-\d+)", 1),
        ("effective_date", r"Effective\:\s(\d{1,2}\/\d{2}\/\d{2})", 1),
        ("expires_date",
         r"(Review\sDue|Expires)\:\s(\d{1,2}\/\d{2}\/\d{2})", 2),
        ("replaces_date", r"Replaces\:\s(\d{1,2}\/\d{2}\/\d{2})", 1),
    )
    for attribute, pattern, group in fields:
        match = re.search(pattern, text)
        if match is not None:
            setattr(sop, attribute, match.group(group))
    return sop.__dict__
def pdf2text(path):
    """Convert a PDF to a text file in ``output_path``, using OCR if needed.

    pdfminer is tried first. If it fails or finds no text (for example a
    scanned document), every page is rendered to JPEG at 500 dpi and run
    through Tesseract. The text is appended to ``<pdf name>.txt`` in
    ``output_path``. The process working directory is changed to
    ``output_path``, as in the original.

    Args:
        path: Path of the PDF to convert.

    NOTE(review): the original handed the *output* text file to pdfminer, so
    extraction always failed and every document went through OCR. It also
    overwrote ``path`` with the extracted text and deleted every JPEG in
    ``output_path``. This version reads the PDF itself, writes the extracted
    text, and removes only the page images it created. Confirm this matches
    the intended workflow.
    """
    # Resolve the input before chdir so a relative path keeps working.
    pdf_path = os.path.abspath(path)
    # splitext copes with any extension case and never leaves the name
    # undefined (the original raised NameError for anything but .pdf/.PDF).
    new_f_txt = os.path.splitext(os.path.basename(pdf_path))[0] + '.txt'
    print(new_f_txt)
    os.chdir(output_path)

    try:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(pdf_path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file):
                interpreter.process_page(page)
        text = retstr.getvalue()
    except Exception:
        # Best effort: any extraction failure falls through to OCR.
        text = ''

    if text.strip():
        print(text)
        with open(new_f_txt, "a") as f:
            f.write(text)
        return

    print('This pdf contains images. Starting OCR....')
    image_files = []
    try:
        with open(new_f_txt, "a") as f:
            # First render every page to an image...
            for number, page in enumerate(convert_from_path(pdf_path, 500), 1):
                filename = "page_" + str(number) + ".jpg"
                page.save(filename, 'JPEG')
                image_files.append(filename)
            # ...then OCR them in order, joining words hyphenated across
            # line breaks.
            for filename in image_files:
                ocr_text = str(pytesseract.image_to_string(Image.open(filename)))
                f.write(ocr_text.replace('-\n', ''))
    finally:
        for filename in image_files:
            if os.path.exists(filename):
                os.remove(filename)
    print('OCRed file: %s is available' % (new_f_txt))
    print('Deleted all images to the above job')
def request_pdf(url, case_id, court_name):
    """Download a case PDF, store it, and return its text.

    The PDF is saved under ``Data_Files/PDF_Files`` and its extracted text
    under ``Data_Files/Text_Files`` (both relative to ``module_directory``),
    named ``<court_name>_<slugified case_id>``.

    Args:
        url: Address of the PDF.
        case_id: Case identifier used in file names and log messages.
        court_name: Court name used as file name prefix.

    Returns:
        The extracted text, or the string ``"NULL"`` when the download or the
        conversion fails (the failure is logged).
    """
    try:
        response = requests.request("GET", url, proxies=proxy_dict)
        if response.status_code != 200:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"
        # The original tested ``response.text is None``, which cannot happen
        # and decodes the whole binary body; check for an empty body instead.
        if not response.content:
            logging.error("No data for: " + str(case_id))
            return "NULL"

        base_name = court_name + "_" + slugify(case_id)
        pdf_path = (module_directory + "/../Data_Files/PDF_Files/"
                    + base_name + ".pdf")
        # Close (and so flush) the PDF before reading it back: the original
        # left the write handle open, so pdfminer could see a truncated file.
        with open(pdf_path, "wb") as fw:
            fw.write(response.content)

        pdf_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8',
                                    laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            with open(pdf_path, 'rb') as pdf_file:
                for page in PDFPage.get_pages(pdf_file):
                    interpreter.process_page(page)
            text_data = str(string_io.getvalue())
        finally:
            pdf_to_text.close()

        txt_path = (module_directory + "/../Data_Files/Text_Files/"
                    + base_name + ".txt")
        with open(txt_path, "w") as fw:
            fw.write(text_data)
        return text_data
    except Exception as e:
        logging.error(
            "Failed to get pdf file for: " + str(case_id) + ". Error: %s", e)
        return "NULL"
def extract_text(data):
    """Return the text of the PDF held in the byte string ``data``.

    Works with both pdfminer layouts: the one that provides
    ``pdfminer.pdfdocument`` / ``pdfminer.pdfpage`` and the older one where
    ``PDFDocument`` lives in ``pdfminer.pdfparser``.

    Returns:
        The extracted text, or ``None`` when the document cannot be parsed
        (``PDFSyntaxError``).

    Raises:
        ImportError: If pdfminer is not installed.
    """
    try:
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
            newapi = True
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python3-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            document = PDFDocument(pdf_parser)
        else:
            # The old API wires the parser and document together by hand.
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
    except PDFSyntaxError:
        return None

    resources = PDFResourceManager()
    # The output buffer type depends on the running Python major version.
    sink = BytesIO() if sys.version_info.major == 2 else StringIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if newapi:
        page_iter = PDFPage.create_pages(document)
    else:
        document.initialize()
        page_iter = document.get_pages()

    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)
    return sink.getvalue()
def uploaded_file():
    """Handle a PDF upload and return its text with punctuation blanked out.

    The uploaded file is stored in ``app.config['UPLOAD_FOLDER']``, converted
    to text, and every character that is not an ASCII letter, digit or
    newline is replaced by a space.

    Returns:
        The cleaned text on success; otherwise a redirect to the index page
        (a missing file part, an empty file name, a disallowed file type, or
        a request that is not a POST).
    """
    # A view must not return None: requests that are not a POST are sent
    # back to the index instead of falling off the end of the function.
    if request.method != 'POST':
        return redirect(url_for('index'))
    # Check that the post request has the file part.
    if 'file' not in request.files:
        return redirect(url_for('index'))

    upload = request.files['file']
    if upload.filename == '':
        flash('No file selected for uploading')
        return redirect(url_for('index'))
    if not (upload and allowed_file(upload.filename)):
        return redirect(url_for('index'))

    filename = secure_filename(upload.filename)
    # Read back exactly the file that was saved; the original saved to
    # UPLOAD_FOLDER but then opened "/tmp/" + filename.
    pdf_file = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    upload.save(pdf_file)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(pdf_file, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        extracted = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return re.sub('[^a-zA-Z0-9\n]', ' ', extracted)
def extractToText(self, fileName, oFileName=None):
    """
    Extracts all the text from the specified file to the output file using
    pdfminer.

    Args:
        fileName: Name of the file to be extracted. Names that do not contain
            ".pdf" are ignored.
        oFileName: Name of the output text file; if none is specified,
            ``fileName`` with ".pdf" replaced by ".txt" is used.

    Returns:
        None in every case.
    """
    # Only allow processing of pdfs here.
    if ".pdf" not in fileName:
        return None
    if oFileName is None:
        oFileName = fileName.replace(".pdf", ".txt")

    retStr = StringIO()
    device = TextConverter(self.rsrc_mgr_, retStr, codec=self.codec_,
                           laparams=LAParams(char_margin=20))
    try:
        interpreter = PDFPageInterpreter(self.rsrc_mgr_, device)
        # The with block closes the input even when a page fails to parse.
        with open(fileName, "rb") as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retStr.getvalue()
    finally:
        device.close()
        retStr.close()

    # write() instead of writelines(): the text is one string, not a list of
    # lines.
    with open(oFileName, "w") as oFile:
        oFile.write(text)
def pdfparser(data):
    """Write the text of a PDF, minus its last two pages, to ``textTA.txt``.

    Args:
        data: Path of the PDF to read.

    The output file is created in the current working directory and holds the
    extracted text encoded with the default codec of ``str.encode``.
    """
    # Count the pages with a handle that is closed again straight away.
    with open(data, "rb") as counting_fp:
        pages = PdfFileReader(counting_fp, strict=False).getNumPages()

    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page of the document except the final two.
        with open(data, 'rb') as fp:
            for number, page in enumerate(PDFPage.get_pages(fp), 1):
                if number > pages - 2:
                    break
                interpreter.process_page(page)
        # Read after the loop so a document of two pages or fewer yields an
        # empty text.
        text = retstr.getvalue()
    finally:
        device.close()

    with open("textTA.txt", "wb") as out_file:
        out_file.write(text.encode())
def first_page_str(og_filename):
    """
    takes a pdf-file and returns a string with the text on the first page

    Args:
        og_filename: Path of the PDF to read.

    Returns:
        The stripped text of the first page, or an empty string when the
        document has no pages.
    """
    output_string = StringIO()
    with open(og_filename, "rb") as in_file:
        doc = PDFDocument(PDFParser(in_file))
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Only the first page is needed, so stop there instead of
            # converting the whole document and discarding the rest.
            first_page = next(iter(PDFPage.create_pages(doc)), None)
            if first_page is not None:
                interpreter.process_page(first_page)
        finally:
            device.close()
    text = output_string.getvalue()
    # Pages are separated by a form feed; keep what precedes the first one.
    return text.split("\x0c")[0].strip()
def pdfparser(in_path, out_path):
    """Extract the text of ``in_path`` and write it to ``out_path`` as UTF-8.

    Args:
        in_path: Path of the PDF to read.
        out_path: Path of the text file to create or overwrite.

    Returns:
        The extracted text.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page of the document. The with block closes the input,
        # which the original left open.
        with open(in_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
        # Read after the loop so a document without pages yields '' instead
        # of leaving ``data`` undefined.
        data = retstr.getvalue()
    finally:
        device.close()

    with open(out_path, 'w', encoding=codec) as f:
        f.write(data)
    return data
def read(filepath):
    """
    Reads PDF saved at filepath and returns list of strings containing text
    from each page of the PDF.

    :param filepath: path of the PDF to read
    :return: list with one string per page, in page order
    """
    pages = []
    with open(filepath, 'rb') as fp:
        resource_manager = PDFResourceManager()
        buffer = io.StringIO()
        device = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
                pages.append(buffer.getvalue())
                # Empty the shared buffer so the next entry holds only its
                # own page rather than everything converted so far.
                buffer.seek(0)
                buffer.truncate(0)
        finally:
            device.close()
    return pages
def pdf_to_text(path):
    """Extract a PDF's text and label its named entities.

    The text is run through the module-level ``nlp`` pipeline; every entity
    is printed and collected.

    Args:
        path: Path of the PDF to read.

    Returns:
        A ``Response`` wrapping a dict with the text stripped of newlines
        under ``"pdf_to_text"`` and a list of ``{'Text', 'Label'}`` dicts
        under ``"text_label"``.
    """
    with open(path, 'rb') as pdf_handle:
        resources = PDFResourceManager()
        sink = io.StringIO()
        converter = TextConverter(resources, sink, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf_handle):
            page_interpreter.process_page(pdf_page)
        text = sink.getvalue()

    case_list = []
    for entity in nlp(text).ents:
        case_list.append({'Text': entity.text, 'Label': entity.label_})
        print(entity.text, entity.label_)

    return Response({
        "pdf_to_text": text.replace('\n', ''),
        "text_label": case_list,
    })
def pdfparser(data):
    """Extract the text of the PDF stored at path `data`.

    :param data: path of the PDF file to read.
    :return: the extracted text; when a page fails with `error`, the caught
        exception object is returned instead (kept for existing callers).
    """
    filename = data
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    with open(filename, 'rb') as fp:
        for i, page in enumerate(PDFPage.get_pages(fp)):
            try:
                interpreter.process_page(page)
            # NOTE(review): `error` must be defined or imported elsewhere in
            # this module - confirm which exception type is intended.
            except error as e:
                print(e)
                # Parenthesised: `"..." % i + 1` formatted first and then
                # tried to add 1 to the resulting string (TypeError).
                print(filename, "failed to read %dth file" % (i + 1))
                return e

    # Read the buffer after the loop so a PDF with zero pages yields an
    # empty string rather than echoing the input path back.
    return retstr.getvalue()
def parse(fname): # pylint: disable=too-many-branches fp = open(fname, 'rb') rsrcmgr = PDFResourceManager() retstr = StringIO() codec = 'utf-8' laparams = LAParams() device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # Create a PDF interpreter object. interpreter = PDFPageInterpreter(rsrcmgr, device) # Process each page contained in the document. for page in PDFPage.get_pages(fp): interpreter.process_page(page) data = retstr.getvalue() res = defaultdict(set) current_stage = 'title' for line in StringIO(data): print line,
def __init__(self, file_object, password=''):
    """Wire a parser and document around `file_object` and load its pages.

    The document and parser are linked to each other and the document is
    initialised with `password`. When the document reports itself as
    extractable, a resource manager, text converter and page interpreter
    are created, `self.pdf` is built from the document info, and the result
    of processing each page is passed to `self.pdf.append_page`. Otherwise
    none of those attributes are set.
    """
    document = PDFDocument()
    pdf_parser = PDFParser(file_object)
    self.pdf_document = document
    self.parser = pdf_parser

    # Parser and document must know about each other before initialising.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(password)

    if not document.is_extractable:
        return

    self.resource_manager = PDFResourceManager()
    self.text_converter = TextConverter(self.resource_manager,
                                        outfp=StringIO())
    self.interpreter = PDFPageInterpreter(self.resource_manager,
                                          self.text_converter)
    self.pdf = PDF(metadata=document.info)
    for page in document.get_pages():
        processed = self.interpreter.process_page(page)
        self.pdf.append_page(processed)
def _index_pdf(self, bin_data):
    '''Best-effort text extraction from raw PDF bytes.

    Returns None when PDFResourceManager is unavailable (None), an empty
    string when `bin_data` does not start with the PDF magic bytes or when
    extraction fails, and the extracted text otherwise.
    '''
    if PDFResourceManager is None:
        return
    if not bin_data.startswith(b'%PDF-'):
        return u""

    extracted = u""
    stream = io.BytesIO(bin_data)
    try:
        manager = PDFResourceManager()
        with io.StringIO() as content, TextConverter(manager, content) as device:
            # Raise the pdfminer logger threshold to CRITICAL while parsing.
            logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
            interpreter = PDFPageInterpreter(manager, device)
            for page in PDFPage.get_pages(stream):
                interpreter.process_page(page)
            extracted = content.getvalue()
    except Exception:
        # Any failure falls through and returns the empty default.
        pass
    return extracted
def pdf_parser(data):
    """
    Extract the text of a PDF file.

    :param data: Path of the PDF file (it is opened here in binary mode)
    :return: The converted text
    """
    rsrc_mgr = PDFResourceManager()
    ret_str = io.StringIO()
    la_params = LAParams()
    device = TextConverter(rsrc_mgr, ret_str, laparams=la_params)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrc_mgr, device)

    # Process each page contained in the document; the context manager
    # releases the file handle even when a page fails to render.
    with open(data, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Read the buffer after the loop so a PDF with zero pages yields an
    # empty string rather than echoing the input path back.
    return ret_str.getvalue()
def pdfparser(data):
    """Extract the text of the PDF at path `data` into a sibling .txt file.

    The output path is the input path with its extension replaced by
    ".txt". Characters that cp850 cannot represent are written as "?".
    Prints "done" when finished; nothing is returned.

    :param data: path of the PDF file to read.
    """
    import os  # local import: the module's import block is not visible here

    # Swap only the extension. Replacing every "pdf" substring also rewrote
    # directory names and, for paths without ".pdf", pointed the output at
    # the input file itself.
    out_path = os.path.splitext(data)[0] + '.txt'

    # Stores shared resources such as fonts or images.
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF device object.
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    with open(data, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Write the buffer once, after all pages: it accumulates across pages,
    # so writing it per page would repeat earlier pages in the output.
    text = retstr.getvalue()
    with open(out_path, 'w') as f:
        f.write(text.encode('cp850', 'replace').decode('cp850'))
    print("done")
def parse(file):
    """Return "<page count> <text>" for the PDF at path `file`.

    Args:
        file: path of the PDF file to read.

    Returns:
        The "Count" entry of the document's page tree and the extracted
        text, joined by a single space.
    """
    text_sink = StringIO()
    with open(file, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        layout = LAParams(detect_vertical=True)
        converter = TextConverter(resources, text_sink, laparams=layout)
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
        page_tree = resolve1(document.catalog["Pages"])
        page_count = str(page_tree["Count"])
    return "{} {}".format(page_count, text_sink.getvalue())
def pdf_to_text_pdfminer(path):
    """Return the text of the PDF at `path`, extracted with pdfminer.

    :param path: path of the PDF file to read.
    :return: the extracted text as a single string.
    """
    # Initialise / settings
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each PDF page; the context manager closes the file handle
    # even when a page fails to render.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)

    return retstr.getvalue()
def pdfparser(s): from StringIO import StringIO m_file = StringIO(s) parser = PDFParser(m_file) document = PDFDocument(parser) rsmgr = PDFResourceManager() rstr = StringIO() lpm = LAParams() cdc = 'utf-8' device = TextConverter(rsmgs, rstr, codec=cdc, laparams=lpm) interpreter = PDFInterpreter(rsmgr, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) data = rstr.getvalue() print data
def read_pdf_text(path, retured_value):
    """Extract a PDF's text into a StringIO while printing progress.

    When `path` is truthy, every page is processed, a "reading at N%" line
    is printed before each page, and the StringIO holding the text is
    returned. When `path` is falsy, `retured_value` is returned untouched.
    """
    text_sink = StringIO()
    if not path:
        return retured_value

    with open(path, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_sink, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        total_pages = resolve1(document.catalog['Pages'])['Count']
        for position, pdf_page in enumerate(PDFPage.create_pages(document), start=1):
            percent = int(round((position / total_pages) * 100))
            print(f"reading at {percent}%")
            page_interpreter.process_page(pdf_page)
    return text_sink
def convertWithPdfMiner(fname):
    """Return the text of each page of the PDF at `fname`.

    :param fname: path of the PDF file to read.
    :return: list of strings, one per page, in page order.
    """
    pages_text = []
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF handle even when a page fails.
    with open(fname, "rb") as pdf:
        for page in PDFPage.get_pages(pdf):
            # Remember where this page's text starts in the shared buffer
            # (zero on the first page).
            read_position = sio.tell()
            interpreter.process_page(page)
            # Rewind to that mark and read only what this page appended;
            # the read leaves the cursor at the end for the next page.
            sio.seek(read_position, 0)
            pages_text.append(sio.read())
    return pages_text
def pdfparser(input_path, fname):
    """Extract a PDF's text into a one-column DataFrame, one row per line.

    :param input_path: directory containing the PDF.
    :param fname: file name of the PDF inside `input_path`.
    :return: DataFrame with a single "value" column holding the extracted
        text split on newline characters.
    """
    filename = os.path.join(input_path, fname)
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document; the context manager
    # closes the file handle even when a page fails to render.
    with open(filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    txt_string = retstr.getvalue()
    # A literal separator needs no regex: str.split gives the same pieces.
    ori_df = DataFrame(txt_string.split("\n"), columns=["value"])
    return ori_df
def get_text_lines(location):
    """
    Return the list of text lines extracted from the pdf file at `location`.
    The text is collected in a BytesIO buffer and split with line endings
    kept. May raise exceptions.
    """
    layout_params = LAParams()
    raw_output = BytesIO()
    lines = []
    with open(location, 'rb') as pdf_file, \
            contextlib.closing(PDFParser(pdf_file)) as parser:
        document = PDFDocument(parser)
        manager = PDFResourceManager()
        extractor = TextConverter(manager, raw_output, laparams=layout_params)
        with contextlib.closing(extractor):
            interpreter = PDFPageInterpreter(manager, extractor)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
            lines = raw_output.getvalue().splitlines(True)
    return lines
def skip_test_pdfminer(self):
    """Print the pdfminer text extraction of `TestHierarchy.straight_forward_doc`.

    All pdfminer names are imported locally, so the module itself does not
    import pdfminer at load time.
    """
    from io import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFParser

    text_sink = StringIO()
    with open(TestHierarchy.straight_forward_doc, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_sink, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    print(text_sink.getvalue())
def read_pdf(list_file):
    """Return the paths from `list_file` whose PDF text contains a marker.

    A file is kept when its extracted text contains at least one of
    'VENCIMENTO', 'Fatura de Pagamento' or 'NOTA FISCAL DE SERVIÇOS'.
    Input order is preserved.
    """
    markers = ('VENCIMENTO', 'Fatura de Pagamento', 'NOTA FISCAL DE SERVIÇOS')
    matched = []
    for pdf_path in list_file:
        with open(pdf_path, 'rb') as pdf_stream:
            resources = PDFResourceManager()
            text_sink = io.StringIO()
            converter = TextConverter(resources, text_sink, codec='utf-8',
                                      laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resources, converter)
            for pdf_page in PDFPage.get_pages(pdf_stream):
                page_interpreter.process_page(pdf_page)
            text = text_sink.getvalue()
        if any(marker in text for marker in markers):
            matched.append(pdf_path)
    return matched
def pdf2string(path):
    """
    From a given pdf path, it creates a string of the pdf.

    :param path: Path to the pdf file
    :return: string of the pdf file
    """
    # Create a PDF interpreter object. (pdfminer)
    retstr = io.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document; the context manager
    # closes the file handle even when a page fails to render.
    with open(path, 'rb') as file_in:
        for page in PDFPage.get_pages(file_in):
            interpreter.process_page(page)

    return retstr.getvalue()
def textPDF_to_text(self):
    """Convert every per-page PDF in the temp folder to a text file.

    For each index below `self.number_of_pages`, the file
    "<temp_folder_name>//<file_name>_<i>.pdf" is read and its text is
    written to the matching ".txt" path. Nothing is returned.
    """
    for i in range(self.number_of_pages):
        pdf_path = '{}//{}_{}.pdf'.format(self.temp_folder_name, self.file_name, i)
        txt_path = '{}//{}_{}.txt'.format(self.temp_folder_name, self.file_name, i)

        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document. Both handles are
        # managed by `with`, so neither leaks when a page fails to render
        # or the write fails.
        with open(pdf_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)

        with open(txt_path, 'w') as textfile:
            textfile.write(retstr.getvalue())
def convertPDFToText(path):
    """
    This function converts pdf to text

    path: path of the file to be converted
    Returns the extracted text as a single string.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document; the context manager
    # closes the file handle even when a page fails to render.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    return retstr.getvalue()
def extract_additional_content(
        cls, content_arg: str) -> t.List[t.Tuple[t.Type[ContentType], str]]:
    """Extract the text of the .pdf file named by `content_arg`.

    Returns a one-element list pairing `TextContent` with the extracted
    text. Raises `Exception` when `content_arg` is not an existing file or
    its suffix is not ".pdf".
    """
    pdf_path = Path(content_arg)
    if not pdf_path.is_file():
        raise Exception(f"Not a file: {content_arg}")
    if pdf_path.suffix != ".pdf":
        raise Exception(f"Not a .pdf: {content_arg}")

    text_sink = StringIO()
    with pdf_path.open("rb") as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        resources = PDFResourceManager()
        converter = TextConverter(resources, text_sink, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    extracted = text_sink.getvalue()
    return [(TextContent, extracted)]