def pdf_merge(pdf1, pdf2):
    """Concatenate two PDF documents given as raw byte strings.

    Args:
        pdf1: Content of the first PDF.
        pdf2: Content of the second PDF; its pages follow those of ``pdf1``.

    Returns:
        The content of the merged PDF.

    Raises:
        Exception: With the message "Failed to merge PDF files" when the
            inputs cannot be parsed or the result cannot be written.
    """
    # Local import so the block does not depend on a file-level import.
    import tempfile

    # Anonymous temporary files replace os.tempnam(): they are created
    # securely, opened in binary mode and deleted automatically on close.
    streams = []
    try:
        output = pyPdf.PdfFileWriter()
        for data in (pdf1, pdf2):
            stream = tempfile.TemporaryFile()
            streams.append(stream)
            stream.write(data)
            stream.seek(0)
            reader = pyPdf.PdfFileReader(stream)
            for page in range(reader.getNumPages()):
                output.addPage(reader.getPage(page))

        merged = tempfile.TemporaryFile()
        streams.append(merged)
        output.write(merged)
        merged.seek(0)
        return merged.read()
    except Exception:
        raise Exception("Failed to merge PDF files")
    finally:
        # The inputs stay open until the output has been written, then
        # everything is released whether or not the merge succeeded.
        for stream in streams:
            stream.close()
def test_sealed_document(self):
    """Check that the sealed document only appears after signing.

    The original file is expected to have one page and the sealed
    document two. The test also checks the read-only and invalid flags
    of the sealed document.
    """

    def page_count(raw_bytes):
        # Parse the bytes as an in-memory PDF and count its pages.
        with contextlib.closing(cStringIO.StringIO(raw_bytes)) as stream:
            return pyPdf.PdfFileReader(stream).getNumPages()

    with self.new_document_from_file() as d:
        self.assertEqual(1, page_count(d.original_file.get_bytes()))
        self.assertIsNone(d.sealed_document)

        d = self.api.ready(d)
        self.assertIsNone(d.sealed_document)

        d = self.api._sign(d, d.author)
        self.assertIsNone(d.sealed_document)

        # wait for sealing to be completed, and refresh
        time.sleep(10)
        d = self.api.get_document(d.id)

        self.assertIsNotNone(d.sealed_document)
        self.assertEqual(2, page_count(d.sealed_document.get_bytes()))

        d._set_read_only()
        self.assertTrue(d.sealed_document._read_only)

        d._set_invalid()
        with self.assertRaises(InvalidScriveObject, None):
            d.sealed_document.get_bytes()
def generate_tests(settings):
    """Create the parameterized test suite.

    One test is added for every combination of selected case class, test
    method of that class, page of ``pdf_a`` and page of ``pdf_b``.

    Args:
        settings: Mapping with the keys ``'cases'`` (name of the module
            holding the case classes), ``'pdf_a'`` and ``'pdf_b'`` (paths
            of the PDFs), ``'include'`` and ``'exclude'`` (case class
            names). ``settings['include']`` is replaced by the included
            names minus the excluded ones, and the mapping is attached to
            every selected class as ``_settings``.

    Returns:
        A ``unittest.TestSuite`` holding the generated tests.

    Raises:
        RuntimeError: If the cases module cannot be imported either as
            ``inspection.<name>`` or as ``<name>``.
    """

    def count_pages(path):
        # Only the page count is needed, so the file is closed right away.
        with open(path, "rb") as stream:
            return pyPdf.PdfFileReader(stream).getNumPages()

    test_cases = unittest.TestSuite()
    module_name = settings['cases']
    try:
        cases = import_module("inspection." + module_name)
    except ImportError:
        # The fallback needs its own try block: a second ``except`` clause
        # on the outer ``try`` would never see an error raised here.
        try:
            cases = import_module(module_name)
        except ImportError:
            raise RuntimeError("module {} not found".format(module_name))

    total_a_pages = count_pages(settings['pdf_a'])
    total_b_pages = count_pages(settings['pdf_b'])

    settings['include'] = list(
        set(settings['include']) - set(settings['exclude']))
    skipped_names = ('tearDownClass', 'setUpClass')

    for case_name in settings['include']:
        TestClass = getattr(cases, case_name)
        setattr(TestClass, '_settings', settings)
        SuperClass = inspect.getmro(TestClass)[1]
        method_list = inspect.getmembers(TestClass, predicate=inspect.ismethod)
        super_method_list = inspect.getmembers(
            SuperClass, predicate=inspect.ismethod)
        # Keep only members reported for the class but not for its first
        # base class.
        test_method_list = list(set(method_list) - set(super_method_list))
        test_name_list = [method[0] for method in test_method_list
                          if method[0] not in skipped_names]
        for test_name in test_name_list:
            for pi in range(1, total_a_pages + 1):
                for pj in range(1, total_b_pages + 1):
                    test_cases.addTest(TestClass(test_name, pi, pj))
    return test_cases
def write(self):
    """Assembles the final PDF and writes to disk.

    The output consists of the optional front matter PDF, the body built
    from ``self.sections`` and the optional end matter PDF, in that order,
    written to ``self.output_filename``.
    """
    pdf_writer = pyPdf.PdfFileWriter()
    open_streams = []
    working_file = None

    def append_pages(reader):
        # Queue every page of ``reader`` on the output writer.
        for page in range(reader.getNumPages()):
            pdf_writer.addPage(reader.getPage(page))

    def append_pdf_file(path):
        # The stream is remembered so it can stay open until the output
        # has been written and is reliably closed afterwards.
        stream = open(path, "rb")
        open_streams.append(stream)
        append_pages(pyPdf.PdfFileReader(stream))

    def append_caption(story, content, styles):
        # Add the optional caption of a figure or table to the story.
        if content.get('caption', None) is not None:
            caption_text = '<font size=10>{0}</font>'.format(
                content['caption'].strip())
            story.append(Paragraph(caption_text, styles['Italic']))

    try:
        if self.front_matter is not None:
            append_pdf_file(self.front_matter)

        # delete=False: the file is removed explicitly in ``finally``.
        working_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        open_streams.append(working_file)
        doc = SimpleDocTemplate(working_file)
        doc.pagesize = portrait(letter)
        story = []
        styles = getSampleStyleSheet()
        for section in self.sections:
            story.append(Paragraph(section.heading, styles['Heading1']))
            for content in section.contents:
                if 'figure' in content:
                    figure = content['figure']
                    if os.path.exists(figure):
                        im = utils.ImageReader(figure)
                        img_width, img_height = im.getSize()
                        aspect = img_height / float(img_width)
                        story.append(
                            Image(figure, width=img_width,
                                  height=(img_width * aspect)))
                        append_caption(story, content, styles)
                        story.append(Spacer(1, 10))
                if 'table' in content:
                    story.append(self.build_table(content['table']))
                    append_caption(story, content, styles)
                    story.append(Spacer(1, 10))
                if 'text' in content:
                    for para in content['text']:
                        story.append(Paragraph(para, styles['Normal']))
                        story.append(Spacer(1, 12))
        doc.build(story)
        append_pages(pyPdf.PdfFileReader(working_file))

        if self.end_matter is not None:
            append_pdf_file(self.end_matter)

        # Closing the output explicitly guarantees it is flushed to disk.
        with open(self.output_filename, "wb") as output_stream:
            pdf_writer.write(output_stream)
    finally:
        for stream in open_streams:
            stream.close()
        if working_file is not None:
            try:
                os.remove(working_file.name)
            except OSError:
                # Windows reports file in use, other OS errors, etc.
                pass
def process(self, inputFile1):
    """Apply the configured page merges and return the resulting PDF.

    For every page of ``inputFile1`` whose index is present in
    ``self.operations``, each listed ``(file, page number)`` pair is
    loaded in turn, the current page is merged onto the loaded page, and
    the loaded page takes its place.

    Returns:
        A ``cStringIO.StringIO`` that the PDF has been written to. The
        buffer is not rewound before it is returned.
    """
    source = pyPdf.PdfFileReader(inputFile1)
    writer = pyPdf.PdfFileWriter()

    for index, current in enumerate(source.pages):
        if index in self.operations:
            for overlay_file, overlay_index in self.operations[index]:
                base = pyPdf.PdfFileReader(overlay_file).getPage(overlay_index)
                base.mergePage(current)
                current = base
        writer.addPage(current)

    result = cStringIO.StringIO()
    writer.write(result)
    return result
def getPageExtend(self, page=0):
    """Store the scaled extent of a page in ``self.pagesize``.

    Entries 2 and 3 of the page's media box are multiplied by
    ``self._papertype_conversion_factor`` and rounded to integers, then
    ``self.getPageSize()`` is called.

    Args:
        page: Zero-based index of the page to measure.
    """
    # The file is opened only for the duration of the read; previously the
    # handle was never closed.
    with open(self.fname, 'rb') as stream:
        extent = pyPdf.PdfFileReader(stream).getPage(page).mediaBox[2:4]
        self.pagesize = [
            int(round(float(row) * self._papertype_conversion_factor))
            for row in extent
        ]
    self.getPageSize()
def getPDFContent(path):
    """Return the extracted text of every page of the PDF at ``path``.

    The text of each page is followed by a newline. The page count is
    printed to standard output, as before.
    """
    with open(path, "rb") as stream:
        pdf = pyPdf.PdfFileReader(stream)
        page_count = pdf.getNumPages()
        print(page_count)
        # Extraction happens while the file is still open; the pieces are
        # joined once instead of growing a string page by page.
        return "".join(
            pdf.getPage(i).extractText() + "\n" for i in range(page_count))
def merge_pdf_on_disk(self, docs):
    """Merge PDF documents using temporary files instead of memory buffers.

    Args:
        docs: Iterable of PDF contents, one entry per document.

    Returns:
        The content of the merged PDF.
    """
    streams = []
    writer = pyPdf.PdfFileWriter()
    try:
        for doc in docs:
            # TemporaryFile removes the file from disk when it is closed;
            # mkstemp left every slip behind in the temp directory.
            current_buff = tempfile.TemporaryFile(
                suffix='.pdf', prefix='credit_control_slip')
            streams.append(current_buff)
            current_buff.write(doc)
            current_buff.seek(0)
            reader = pyPdf.PdfFileReader(current_buff)
            for page in range(reader.getNumPages()):
                writer.addPage(reader.getPage(page))

        buff = tempfile.TemporaryFile(
            suffix='.pdf', prefix='credit_control_slip_merged')
        streams.append(buff)
        writer.write(buff)
        buff.seek(0)
        return buff.read()
    finally:
        # The inputs stay open until the merged file has been written.
        for stream in streams:
            stream.close()
def get_no_pages(self, pdf_file, preserve):
    """Return the number of pages of the PDF at ``pdf_file``.

    Args:
        pdf_file: Path of the PDF file.
        preserve: When false, the file is deleted after it has been read.
    """
    import pyPdf

    # Binary mode, and the handle is closed before the file is removed.
    with open(pdf_file, 'rb') as stream:
        page_count = pyPdf.PdfFileReader(stream).getNumPages()
    if not preserve:
        os.remove(pdf_file)
    return page_count
def speack_from_pdf(path, language):
    """Read a PDF aloud, one page at a time.

    The text of each page is extracted, decoded as UTF-8 (undecodable
    bytes are ignored), announced on standard output as ``page <index>``
    and passed to ``speack`` together with ``language``.
    """
    # The first handle is needed only for the page count.
    with open(path, "rb") as stream:
        num_of_pages = pyPdf.PdfFileReader(stream).getNumPages()

    with open(path, 'rb') as fp:
        for i in range(num_of_pages):
            # A fresh converter per page keeps the text of pages separate.
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            text = ""
            for page in PDFPage.get_pages(fp, set([i]), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                text = retstr.getvalue()
            text = text.decode("utf-8", "ignore")
            print('page ' + str(i))
            speack(text, language)
def _merge_PDFs(self, pdfslides):
    """Merge the given PDFs into one.

    The first page of every file in ``pdfslides`` is written to a PDF in
    the current directory. Its name is the part of ``self._get_filename()``
    before ".svg", followed by ".pdf". A warning is logged and nothing is
    written when pyPdf cannot be imported.
    """
    output_filename = "%s.pdf" % self._get_filename().split(".svg")[0]
    output_filepath = abspath(join(os.curdir, output_filename))

    try:
        import pyPdf
    except ImportError:
        logging.warning("PyPDF not installed, cannot merge PDF slides")
        return

    logging.info("Using 'pyPdf' to join PDFs to %s", output_filepath)
    output = pyPdf.PdfFileWriter()
    inputfiles = []
    try:
        for slide in pdfslides:
            inputstream = open(slide, "rb")
            inputfiles.append(inputstream)
            output.addPage(pyPdf.PdfFileReader(inputstream).getPage(0))
        with open(output_filepath, "wb") as outputStream:
            output.write(outputStream)
    finally:
        # The inputs stay open until the output is written and are closed
        # even when reading or writing fails.
        for f in inputfiles:
            f.close()
def _fill_body_place_and_odg(self):
    """Fill body, place and ODG list from the first page of the PDF.

    The text of the first page of ``self.pdf.path`` is stored in
    ``self.body``. The place and the ODG entries derived from it are
    stored through ``self.place`` and ``self.set_odg_list``.
    """
    # Binary mode, and the handle is closed once the text is extracted.
    with open(self.pdf.path, 'rb') as pdf:
        txt = pyPdf.PdfFileReader(pdf).getPage(0).extractText()
    self.body = txt
    self.place = self.__get_place(txt)
    self.set_odg_list(self.__get_odg(txt))
def page_extractor_pyPdf(file_name):
    """Extracts the text of the first page of a pdf with pyPdf"""
    import pyPdf

    # PDFs are binary files; the handle is closed as soon as the text has
    # been extracted.
    with open(file_name, 'rb') as stream:
        return pyPdf.PdfFileReader(stream).getPage(0).extractText()
def merge_pdfs(pdf_path_list, to_file):
    """Merge the PDFs at ``pdf_path_list`` into one file at ``to_file``.

    Progress is reported on standard output: one line per input file and
    one line for the merged result.
    """
    pdf_out = pyPdf.PdfFileWriter()
    f_in_handler_list = []
    try:
        for path in pdf_path_list:
            # ``with open()`` cannot be used here: it would close the file
            # at the end of the block and pdf_out.write(f_out) would fail.
            f = open(path, "rb")
            f_in_handler_list.append(f)
            pdf_in = pyPdf.PdfFileReader(f)
            page_size = pdf_in.getNumPages()
            # Add every page of this input to the output writer.
            for i in range(0, page_size):
                pdf_out.addPage(pdf_in.getPage(i))
            print("Processed file path: %s, page_size: %d" % (path, page_size))

        with open(to_file, "wb") as f_out:
            pdf_out.write(f_out)
        print("Merged file path: %s, page_size: %d"
              % (to_file, pdf_out.getNumPages()))
    finally:
        # Close the input handles, also when a file fails to parse.
        for f in f_in_handler_list:
            f.close()
def cutpdf(name_input, name_output):
    """Split every page of a PDF into a left and a right half.

    Each input page produces two output pages: the first has the upper
    right corner of its media box moved to the horizontal middle, the
    second has its upper left corner moved there.

    Args:
        name_input: Path of the PDF to read.
        name_output: Path of the PDF to write.

    Returns:
        True.
    """
    output = pyPdf.PdfFileWriter()
    with open(name_input, "rb") as pdf_input:
        input1 = pyPdf.PdfFileReader(pdf_input)
        for i in range(input1.getNumPages()):
            page1 = input1.getPage(i)
            # The copy is taken before the media box is first read.
            # NOTE(review): presumably this is what gives each half its
            # own media box - confirm before reordering.
            page2 = copy.copy(page1)
            cutline = (page1.mediaBox.getUpperRight_x() / 2,
                       page1.mediaBox.getUpperRight_y())
            page1.mediaBox.upperRight = cutline
            output.addPage(page1)
            page2.mediaBox.upperLeft = cutline
            output.addPage(page2)
        # The output is now closed for real; the previous
        # ``pdf_output.close`` lacked the call parentheses.
        with open(name_output, "wb") as pdf_output:
            output.write(pdf_output)
    return True
def tokenize(path):
    """Return the lower-cased words of a PDF without English stop words.

    The text of every page is split on spaces. Trailing ``,:|.-``
    characters are stripped from each token and the token is lower-cased.
    Empty tokens and NLTK English stop words are dropped.

    Args:
        path: Path of the PDF file.

    Returns:
        List of the remaining words in document order.
    """
    with open(str(path), "rb") as stream:
        pdf = pyPdf.PdfFileReader(stream)
        pdf_content = [page.extractText() for page in pdf.pages]

    # A set makes the membership test below constant-time.
    stopword_set = set(stopwords.words("english"))

    # Tokens of all pages are accumulated; previously each page replaced
    # the tokens of the page before it, so only the last page survived.
    tokens = []
    for page_text in pdf_content:
        tokens.extend(filter(None, page_text.split(" ")))

    without_stop_words = []
    for token in tokens:
        # remove punctuations and case-fold
        word = token.rstrip(",:|.-").lower()
        if word and word not in stopword_set:
            without_stop_words.append(word)
    return without_stop_words
def get_url_title(url):
    """Fetch ``url`` and return ``(title, notes)``.

    ``title`` falls back to 'Source' when the page cannot be loaded or
    parsed, or when its title is blank. ``notes`` holds the error text in
    the first two cases and is empty otherwise. HTML pages and PDFs are
    handled (but not, e.g., Word docs).
    """
    try:
        page = requests.get(url)
    except Exception as ex:
        return 'Source', 'Error loading url: "{}".'.format(str(ex))

    notes = ''
    # The final URL is inspected rather than the argument, because a
    # redirect may lead somewhere other than the requested address.
    if page.url.endswith('pdf'):
        try:
            reader = pyPdf.PdfFileReader(StringIO(page.content))
            title = reader.documentInfo['/Title']
        except Exception as ex:
            title = 'Source'
            notes = 'Error parsing PDF: "{}".'.format(str(ex))
    else:
        try:
            soup = BeautifulSoup(
                page.text, convertEntities=BeautifulSoup.HTML_ENTITIES)
            title = soup.find('head').find('title').text
        except Exception as ex:
            title = 'Source'
            notes = 'Error parsing page: "{}".'.format(str(ex))

    if not title.strip():
        title = 'Source'
    return title, notes
def ActualizarCantPaginas(data):
    """Record the page count of every PDF found in the zone directories.

    Args:
        data: Iterable of rows whose first two items name the
            sub-directories below ``path_urbano_croquis_listado``.

    For each ``*.pdf`` file, ``conx.ActualizarCantidadPaginas`` is called
    with ``fase``, the file name without extension, the page count, the
    backslash-separated file path and a web path. The web path is made of
    the path components from the fourth one on, each prefixed with "/".
    """
    list_dir_zonas = [
        '{}\\{}\\{}'.format(path_urbano_croquis_listado, x[0], x[1])
        for x in data
    ]
    for dir_zona in list_dir_zonas:
        for archivo in os.listdir(dir_zona):
            if not archivo.endswith('.pdf'):
                continue
            cod = archivo[:-4]
            nom_pdf = "{}\\{}".format(dir_zona, archivo)
            # Close each file right after counting; the loop may visit
            # many PDFs and previously leaked one handle per file.
            with open(nom_pdf, "rb") as stream:
                cant_pag = pyPdf.PdfFileReader(stream).getNumPages()
            nom_web = "".join(
                '/' + parte for parte in nom_pdf.split("\\")[3:])
            conx.ActualizarCantidadPaginas(fase, cod, cant_pag, nom_pdf,
                                           nom_web)
def combine_multiple_canvas(self):
    """Combine multiple PDF files at once when is working with multiple canvas

    Does nothing unless ``self.multiple_canvas`` is set, pyPdf is
    available and ``self.temp_files`` is not empty. ``self.filename`` may
    be a path or a writable file object; either way it is closed at the
    end.
    """
    if not self.multiple_canvas or not pyPdf or not self.temp_files:
        return

    output = pyPdf.PdfFileWriter()
    input_files = []
    try:
        for f_name in self.temp_files:
            stream = open(f_name, 'rb')
            input_files.append(stream)
            reader = pyPdf.PdfFileReader(stream)
            for page_num in range(reader.numPages):
                output.addPage(reader.getPage(page_num))

        if isinstance(self.filename, basestring):
            fp = open(self.filename, 'wb')
        else:
            fp = self.filename
        try:
            output.write(fp)
        finally:
            fp.close()
    finally:
        # Deleting the reader objects never closed the underlying files,
        # so they are closed explicitly once the output is written.
        for stream in input_files:
            stream.close()
def pdf_pages(file_info, width):
    """Return the markup that includes every page of a document as an image.

    Args:
        file_info: Mapping with at least ``'path'``. ``'extension'`` is
            read when no PDF exists yet and ``'pages'`` is read when
            present. ``'pages'`` is filled in from the PDF when missing,
            or set to 1 when the PDF cannot be read.
        width: Image width; ``DEFAULT_PAGE_WIDTH`` is used when None.

    Returns:
        One entry per page, each followed by a space. The entry is the
        text "[Error including page image]" for a page whose PNG could not
        be produced.
    """
    output = ''
    if width is None:
        width = DEFAULT_PAGE_WIDTH

    pdf_path = file_info['path'] + '.pdf'
    if (not os.path.isfile(pdf_path)
            and file_info['extension'] in ('rtf', 'doc', 'odt')):
        server.fg_make_pdf_for_word_path(file_info['path'],
                                         file_info['extension'])

    if 'pages' not in file_info:
        try:
            # Binary mode, and the handle is closed after counting.
            with open(pdf_path, 'rb') as stream:
                file_info['pages'] = pyPdf.PdfFileReader(stream).getNumPages()
        except Exception:
            # Best effort: an unreadable PDF counts as a single page.
            file_info['pages'] = 1

    max_pages = 1 + int(file_info['pages'])
    formatter = '%0' + unicode(len(unicode(max_pages))) + 'd'
    test_path = file_info['path'] + 'page-in-progress'

    for page in range(1, max_pages):
        # Wait while the marker file exists and was modified less than
        # 30 seconds ago.
        while (os.path.isfile(test_path)
               and time.time() - os.stat(test_path)[stat.ST_MTIME] < 30):
            time.sleep(1)

        png_path = file_info['path'] + 'page-' + formatter % page + '.png'
        if not os.path.isfile(png_path):
            server.fg_make_png_for_pdf_path(pdf_path, 'page')
        if os.path.isfile(png_path):
            output += unicode(image_for_docx(
                docassemble.base.functions.DALocalFile(png_path),
                docassemble.base.functions.this_thread.current_question,
                docassemble.base.functions.this_thread.misc.get(
                    'docx_template', None),
                width=width))
        else:
            output += "[Error including page image]"
        output += ' '
    return(output)
def textoPdfCoef(nombrePdf):
    """Extract the C1, X0 and C2 coefficients from the text of a PDF.

    For each page, the text between the markers "TR=100" and "Datos" is
    split on "." and the numbers are rebuilt from adjacent fragments: the
    tail of one fragment is the integer part and the first three
    characters of the next fragment are the decimals.

    Returns:
        A list of seven ``[C1, X0, C2]`` float triples, one per entry of
        2, 3, 5, 10, 25, 50 and 100 (presumably return periods "TR" -
        confirm). It is taken from the last page processed.
    """
    # NOTE(review): the file opened here is never closed.
    pdf = pyPdf.PdfFileReader(open(nombrePdf, "rb"))
    # NOTE(review): ``data`` is reset for every page, so only the last
    # page contributes to the result. The original indentation is not
    # recoverable from this view - confirm whether the return belongs
    # inside the page loop.
    for page in pdf.pages:
        data = []
        a = page.extractText().split("TR=100")[1].split("Datos")[0].split(".")
        # Number of fragments between two consecutive entries.
        r = 10
        for i, d in enumerate([2, 3, 5, 10, 25, 50, 100]):
            if i < 6 and i >= 1:
                # The fragment starts with one character plus the digits
                # of ``d``, which are skipped before the value.
                C1 = float(a[r * i][1 + len(str(d)):] + "." + a[r * i + 1][:3])
            elif i == 0:
                # For the first entry only one leading character is skipped.
                C1 = float(a[r * i][1:] + "." + a[r * i + 1][:3])
            else:
                None
                # The last entry sits three fragments after the start of
                # the previous group instead of a full stride later.
                C1 = float(a[r * (i - 1) + 3][1 + len(str(d)):] + "." + a[r * (i - 1) + 4][:3])
            if i < 6:
                X0 = float(a[r * i + 1][3:] + "." + a[r * i + 2][:3])
                C2 = float(a[r * i + 2][3:] + "." + a[r * i + 3][:3])
            else:
                None
                # Same shifted layout as for C1 of the last entry.
                X0 = float(a[r * (i - 1) + 4][3:] + "." + a[r * (i - 1) + 5][:3])
                C2 = float(a[r * (i - 1) + 5][3:] + "." + a[r * (i - 1) + 6][:3])
            data.append([C1, X0, C2])
    return data
def getPDFContent(doc, search_terms):
    """Report where the search terms occur in the gazette PDF of ``doc``.

    The file ``iomat_<date>.pdf`` is read, with "/" in ``doc.date``
    replaced by "-". Each page is lower-cased and searched for each term.
    For every page/term hit the report contains a line naming the term,
    file and page, then up to 120 characters of context, then a download
    link built from ``doc.value``.

    Args:
        doc: Object providing ``date`` and ``value``.
        search_terms: Iterable of terms to look for.

    Returns:
        The report text; empty when nothing was found.
    """
    filename = "iomat_%s.pdf" % (doc.date.replace("/", "-"))
    report = []
    with open(filename, 'rb') as stream:
        pdf = pyPdf.PdfFileReader(stream)
        for i in range(0, pdf.getNumPages()):
            page_content = pdf.getPage(i).extractText().lower()
            for term in search_terms:
                where = page_content.find(term)
                if where == -1:
                    continue
                report.append(
                    "O termo %s foi encontrado no diario %s na pagina %d \n"
                    % (term, filename, i + 1))
                # Context is only included when 60 characters are
                # available on both sides of the hit.
                other = ""
                if where > 60 and (where + 60 < len(page_content)):
                    other = page_content[where - 60:where + 60]
                report.append(other + "\r\n\r\n")
                # NOTE(review): the link contains literal backslashes
                # before "&"; kept exactly as before - confirm they are
                # intended.
                docLink = "http://www.iomat.mt.gov.br/ler_pdf.php?download=ok\\&edi_id=%d\\&page=0" % (doc.value)
                report.append(docLink + "\r\n\r\n\r\n")
    return "".join(report)
def get_metadata(self, filing):
    """Download the PDF of a filing and parse ``key=value`` lines.

    Only the first page is parsed. Keys are title-cased and the value
    'N/A' is stored as an empty string. The extracted text is printed to
    standard output between two blank lines.

    Args:
        filing: Mapping with the key ``'pdf_url'``.

    Returns:
        Dict of the parsed metadata, or None when the download fails with
        an HTTP error.
    """
    url = filing['pdf_url']
    try:
        pdf = urllib2.urlopen(url).read()
    except urllib2.HTTPError:
        return None

    # We can save the PDF here and then upload it to Document Cloud.
    pdf = pyPdf.PdfFileReader(StringIO(pdf))
    text = pdf.getPage(0).extractText()
    print("")
    print(text)
    print("")

    metadata = {}
    for line in text.split('\n'):
        if '=' not in line:
            continue
        # Split on the first "=" only, so a value that itself contains
        # "=" no longer raises ValueError.
        k, v = line.split('=', 1)
        k = k.title()
        if v == 'N/A':
            v = ''
        metadata[k] = v
    return metadata
def scrape_text(src):
    """Return a list with the extracted text of each page of ``src``."""
    # Extraction runs while the file is open; the handle is closed on exit.
    with open(src, "rb") as stream:
        pdf = pyPdf.PdfFileReader(stream)
        return [extract_text(page) for page in pdf.pages]
def pdf_opener(filename):
    """Open ``filename`` and return a ``pyPdf.PdfFileReader`` for it.

    The file handle is deliberately not closed here, because it is handed
    to the returned reader for the caller to use.
    """
    # The page-printing loop that used to follow the return statement was
    # unreachable and has been removed.
    return pyPdf.PdfFileReader(open(filename, "rb"))
def GetPageCount():
    """Store the page count of the PDF at global ``path`` in ``pageCount``."""
    global path
    global pageCount
    # The context manager closes the file even when the PDF is unreadable.
    with open(path, 'rb') as fh:
        pageCount = pyPdf.PdfFileReader(fh).getNumPages()
def AutoSplit(event):
    """Export every page of the selected PDF to its own file.

    After confirmation the user is asked for a base name. Page ``n`` is
    written next to the source file as ``<base name> - <n>.pdf``. An error
    dialog is shown when no file is selected.

    Args:
        event: The triggering wx event (unused).
    """
    global pageCount
    global path

    if not path:
        dlg = wx.MessageDialog(top, "No file selected!", "Error...",
                               wx.OK | wx.ICON_QUESTION)
        dlg.ShowModal()
        # Destroy the dialog like the confirmation dialog below.
        dlg.Destroy()
        top.SetStatusText("No file selected. Please use \"Select File...\"")
        return

    dlg = wx.MessageDialog(
        top, "This will automatically export individual pages.",
        "Confirm AutoSplit", wx.OK | wx.CANCEL | wx.ICON_QUESTION)
    result = dlg.ShowModal()
    dlg.Destroy()
    if result != wx.ID_OK:
        return

    # Context managers close the source and each output file even when
    # reading or writing fails part-way through.
    with open(path, 'rb') as fh:
        source = pyPdf.PdfFileReader(fh)
        DestFileName = wx.GetTextFromUser(
            "Please provide output file name:", "Export File Name?",
            "Pages from " + os.path.basename(path).split('.')[0], top)
        for page in range(0, pageCount):
            outputPage = pyPdf.PdfFileWriter()
            outputPage.addPage(source.getPage(page))
            outputFileName = os.path.dirname(
                path) + os.sep + DestFileName + " - " + str(page + 1) + ".pdf"
            with open(outputFileName, "wb") as outputStream:
                outputPage.write(outputStream)
def ExportSelected(event):
    """Export the pages chosen in the selection panel to a single PDF.

    The user is asked for a base name and the file is written next to the
    source file as ``<base name>.pdf``. An error dialog is shown when no
    file or no page is selected.

    Args:
        event: The triggering wx event (unused).
    """
    if not path:
        dlg = wx.MessageDialog(top, "No file selected!", "Error...",
                               wx.OK | wx.ICON_QUESTION)
        dlg.ShowModal()
        # Destroy the dialog like the "no pages" dialog below.
        dlg.Destroy()
        top.SetStatusText("No file selected. Please use \"Select File...\"")
        return

    PagesToExport = SelectionPanel.GetSelections()
    if len(PagesToExport) < 1:
        dlg = wx.MessageDialog(top, "No pages selected!", "Error...",
                               wx.OK | wx.ICON_QUESTION)
        dlg.ShowModal()
        dlg.Destroy()
        top.SetStatusText(
            "No pages selected. Please use the selection panel.")
        return

    # Context managers close both files even when reading or writing fails.
    with open(path, 'rb') as fh:
        DestFileName = wx.GetTextFromUser(
            "Please provide output file name:", "Export File Name?",
            "Pages from " + os.path.basename(path).split('.')[0], top)
        source = pyPdf.PdfFileReader(fh)
        outputPage = pyPdf.PdfFileWriter()
        for PageToExport in PagesToExport:
            outputPage.addPage(source.getPage(PageToExport))
        outputFileName = os.path.dirname(
            path) + os.sep + DestFileName + ".pdf"
        with open(outputFileName, "wb") as outputStream:
            outputPage.write(outputStream)
def pages(F):
    """Return the number of pages of the PDF file at path ``F``."""
    # The context manager closes the file even when the PDF is unreadable.
    with open(F, "rb") as stream:
        return pyPdf.PdfFileReader(stream).getNumPages()
def pdfsearch(request):
    """Return the text of the PDF attached to a node as JSON.

    The POST body is expected to carry, as a key, a JSON list of
    ``{"name": ..., "value": ...}`` items providing ``tier``, ``node`` and
    ``parent``. The tier selects the node model. The node is looked up by
    the first four characters of its name, and the PDF at its ``pdf_path``
    is read.

    Returns:
        ``HttpResponse`` with the JSON document ``{"pdf": <text>}``. A
        plain-text message is returned when the node has no PDF path. The
        status is 400 when a field is missing or the tier is unknown, and
        405 for requests other than POST.
    """
    if request.method != 'POST':
        # A view has to return a response object, not a bare string.
        return HttpResponse('Only POST requests are supported', status=405)

    # unpack raw json into data variable
    data = []
    for key in request.POST:
        data = simplejson.loads(key)

    # unpack data to assign tier, node, and parent (the last value wins)
    wanted = ('tier', 'node', 'parent')
    fields = {}
    for item in data:
        if item['name'] in wanted:
            fields[item['name']] = item['value']
    missing = [name for name in wanted if name not in fields]
    if missing:
        return HttpResponse('Missing field(s): ' + ', '.join(missing),
                            status=400)

    # identify the model from the selected tier
    models_by_tier = {
        'node_1': Node_1,
        'node_2': Node_2,
        'node_3': Node_3,
        'node_4': Node_4,
        'node_5': Node_5,
    }
    model = models_by_tier.get(fields['tier'])
    if model is None:
        return HttpResponse('Unknown tier', status=400)

    # The result is not used, but the lookup still fails for an unknown
    # parent, as before.
    Parent.objects.get(id=fields['parent'])

    node = model.objects.get(node_name__contains=fields['node'][:4])
    try:
        # pdf path in model should be the page(s) pertinent to that node
        path = node.pdf_path
    except Exception:
        return HttpResponse('No PDF content to display',
                            content_type='application/text')

    # get the actual content of the PDF
    with open(path, "rb") as stream:
        pdf = pyPdf.PdfFileReader(stream)
        text = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages()))

    # Serialize explicitly: a dict passed as response content is iterated,
    # which would send only its key.
    return HttpResponse(simplejson.dumps({"pdf": text}),
                        content_type='application/json')