def merge(self, pdf_one, pdf_two, filename='my.pdf', output_dir='D:/pdf/'):
    """Interleave a front-side scan and a back-side scan into one PDF.

    Pages of ``pdf_one`` are taken first-to-last while pages of ``pdf_two``
    are taken last-to-first, alternating front, back, front, back, ...

    :param pdf_one: path of the PDF holding the scanned front sides.
    :param pdf_two: path of the PDF holding the scanned back sides.
    :param filename: name of the merged output file.
    :param output_dir: prefix prepended verbatim to ``filename``.
    :return: None; the result is written to ``output_dir + filename``.
    """
    with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
        pdf_input_one = PdfFileReader(input_one)
        pdf_input_two = PdfFileReader(input_two)
        numOne = pdf_input_one.getNumPages()
        numTwo = pdf_input_two.getNumPages()
        print(numOne, numTwo)

        pdf_output = PdfFileWriter()
        index_two = numTwo - 1
        for index_one in range(numOne):
            print(index_one, index_two)
            pdf_output.addPage(pdf_input_one.getPage(index_one))
            # When there are fewer back pages than front pages the index
            # runs below zero; skip instead of addressing a negative page.
            if index_two >= 0:
                pdf_output.addPage(pdf_input_two.getPage(index_two))
            index_two -= 1

        # Written while both inputs are still open.
        with open(output_dir + filename, 'wb') as output_stream:
            pdf_output.write(output_stream)
    print('Done!')
def write_with_template(self, out_file="", template_filename=""):
    """Merge the buffered PDF onto the template's first page and write it out.

    :param out_file: path of the PDF file to create.
    :param template_filename: path of the template PDF; when empty,
        ``self.template_filename`` is used.
    """
    # Read the template and take its first page as the base.
    template_filename = template_filename or self.template_filename
    reader = PdfFileReader(template_filename)
    page = reader.getPage(0)

    # Rewind the buffer so it can be parsed from the start.
    self.buffer.seek(0)
    new_pdf = PdfFileReader(self.buffer)

    # Merge the generated content into the template page.
    page.mergePage(new_pdf.getPage(0))

    writer = PdfFileWriter()
    writer.addPage(page)
    with open(out_file, 'wb') as f:
        writer.write(f)
def PdfPrettyPrint(inputname, outputname):
    """Rearrange a PDF in groups of four pages and write the result.

    For each group starting at page ``i`` the output order is ``i``,
    ``i + 2``, ``i + 1`` and ``i + 3``; the last two are rotated by 180
    degrees.  Positions past the end of the document become blank pages.

    :param inputname: path of the PDF to read.
    :param outputname: path of the PDF to write.
    :return: always ``True``.
    """
    # (offset inside the group, rotate by 180 degrees?) in output order.
    layout = ((0, False), (2, False), (1, True), (3, True))

    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        wrt = PdfFileWriter()
        pdfnums = ipt.getNumPages()

        for base in range(0, pdfnums, 4):
            for offset, rotate in layout:
                index = base + offset
                if index >= pdfnums:
                    wrt.addBlankPage()
                    continue
                page = ipt.getPage(index)
                if rotate:
                    page.rotateClockwise(180)
                wrt.addPage(page)

        # Written while the input is still open.
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return True
def process_pdf_automatically(self):
    """Join the first page of every PDF in a chosen folder, ordered by date.

    Asks for a directory, reads each ``*.pdf`` in it except ``join.pdf``,
    takes the cut-off date from the first page and the bank name from the
    last page, then writes the first pages sorted by date to
    ``<dir>/<self.name>.pdf``.  Does nothing but show a dialog when
    ``self.name`` is empty.
    """
    self.statusBar().showMessage('Procesando...')
    if self.name != "":
        self.dir = QFileDialog.getExistingDirectory()
        files = [x for x in os.listdir(self.dir + '/')
                 if x.endswith('.pdf') and x != "join.pdf"]
        outfile = PdfFileWriter()
        ls = []
        handles = []
        try:
            for i in files:
                # Kept open until the output is written; closed in `finally`.
                handle = open(self.dir + '/' + str(i), 'rb')
                handles.append(handle)
                pdf = PdfFileReader(handle)
                page = pdf.getPage(0)
                last = pdf.getPage(pdf.getNumPages() - 1)
                banco = re.findall("(bbva|santander)",
                                   last.extractText().lower())
                text = page.extractText()
                fecha = re.findall(
                    "(corte.*[0-9]{1,2}[/][0-9]{1,2}[/][0-9]{2,4})",
                    text.lower())[0]
                fecha = re.findall(
                    "([0-9]{1,2}[/][0-9]{1,2}[/][0-9]{2,4})", fecha)[0]
                ls.append({'page': page,
                           'bank': Counter(banco).most_common()[0][0].upper(),
                           'date': fecha})

            # Sort the entries themselves so that two files with the same
            # date each contribute their page exactly once.
            ls.sort(key=lambda entry: datetime.strptime(entry['date'],
                                                        '%d/%m/%Y'))
            for entry in ls:
                outfile.addPage(entry['page'])

            self.statusBar().showMessage('Creando PDF...')
            save_in = self.dir + '/' + self.name + '.pdf'
            with open(save_in, 'wb') as f:
                outfile.write(f)
        finally:
            for handle in handles:
                handle.close()
        self.statusBar().showMessage('Creación del PDF Exitosa')
        self.show_dialog("Acción realizada con éxito")
    else:
        self.show_dialog("No fue posible crear el archivo PDF")
    self.statusBar().showMessage('')
def run(self):
    """Download the attachment, drop its last page and report the result.

    Returns immediately when ``self.beforeHandler`` returns a truthy value.
    Otherwise ``self.attachUrl`` is downloaded to a temporary file, every
    page except the last is copied to a second temporary PDF, and
    ``self.success`` (if set) is called with that path.  Any exception is
    passed to ``self.error`` (if set).  Both temporary files are removed
    afterwards.
    """
    if self.beforeHandler(self._id, self.attachUrl):
        return
    filename = self.tempDir + str(random.random())
    filename1 = self.tempDir + str(random.random()) + '.pdf'
    try:
        urllib.request.urlretrieve(self.attachUrl, filename)
        with open(filename, 'rb') as input_stream:
            pdf_input = PdfFileReader(input_stream)
            pdf_output = PdfFileWriter()
            # Stop one page short: the last page is removed.
            for page in range(pdf_input.getNumPages() - 1):
                pdf_output.addPage(pdf_input.getPage(page))
            with open(filename1, 'wb') as output_stream:
                pdf_output.write(output_stream)
        if self.success is not None:
            self.success(self._id, filename1)
    except Exception as e:
        if self.error is not None:
            self.error(e, self.attachUrl)
    finally:
        # Both streams are closed by now, so removal cannot hit an open file.
        for path in (filename, filename1):
            if os.path.exists(path):
                os.remove(path)
def convert_to_text(self):
    """Return the extracted text of the first page of ``self.input_file``."""
    # The context manager closes the file even when parsing fails.
    with open(self.input_file, 'rb') as f:
        pdf = PdfFileReader(f)
        return pdf.getPage(0).extractText()
def _create_pdf_from_rtf_files(self):
    """Convert ``self.files`` to PDF and merge them into one document.

    Emits ``self.progress`` after each conversion, records the page count
    and chapter title of every converted file in ``self.pages`` and
    ``self.chapters``, and writes the merged PDF to
    ``self.master_file_name`` (or to ``tmp.pdf`` when ``self.create_toc``
    is set).  The converted files are appended to ``self.trash``.
    """
    pdfs = []
    self.progress.emit(0)
    for count, source in enumerate(self.files):
        pdfs.append(change_filetype(source, "pdf", self.engine))
        self.progress.emit(count + 1)

    merger = PdfFileMerger()
    pages = []
    chapters = []
    try:
        for pdf_path in pdfs:
            read_pdf = PdfFileReader(pdf_path)
            page_content = read_pdf.getPage(0).extractText()
            try:
                chapter = helper_functions.get_chapter_from_pdf_txt(
                    page_content)
            except Exception:
                # Fall back to a title derived from the file name.
                chapter = os.path.basename(pdf_path)
                chapter = chapter.split(".")[0]
                chapter = chapter.replace("_", " ")
            chapters.append(chapter)
            pages.append(read_pdf.getNumPages())
            merger.append(fileobj=pdf_path)
        self.pages = pages
        self.chapters = chapters
        if not self.create_toc:
            merger.write(self.master_file_name)
        else:
            merger.write("tmp.pdf")
    finally:
        merger.close()
    self.trash += pdfs
def extract_pdf_pypdf2(pdf_path, page_index=2):
    """Return the extracted text of one page of a PDF.

    :param pdf_path: path of the PDF to read.
    :param page_index: zero-based index of the page to extract; defaults
        to 2 (the third page).
    :return: the text of that page.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            # Try the empty password.
            pdf.decrypt('')
        # Extract while the file is still open.
        return pdf.getPage(page_index).extractText()
def RemovePdfOwnerPassword(inputname, outputname):
    """Copy an encrypted PDF to a new file after decrypting it with "".

    :param inputname: path of the PDF to read.
    :param outputname: path of the PDF to write.
    :return: 0 on success, -1 when the input is not encrypted.
    :raises KeyError: re-raised when decryption fails with a key other
        than ``/Encrypt``.
    """
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # Python 3 exceptions have no `.message`; the missing key is
            # the first element of `args`.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())

        wrt = PdfFileWriter()
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
def translate(self):
    """Read the PDF, translate it line by line and write the result.

    Every non-blank line produced by ``self.enter_symbol`` is passed to
    ``translate_func``; the original line and its translation (or a
    failure marker when the translation is empty) are handed to
    ``self.write``.  A running counter is printed as progress.
    """
    index = 0
    with open(self.fullPath, 'rb') as f:
        pdf = PdfFileReader(f)
        for i in range(pdf.getNumPages()):
            content = pdf.getPage(i).extractText().split('\n')
            content = self.removeBlankFromList(content)
            # In the joined text, a gap of more than one space between
            # words is treated as a place that needs a line break.
            content_list = self.enter_symbol(content)
            for line in content_list:
                line = line.strip()
                if not line:
                    continue
                ret = translate_func(line)
                trans = ret if ret else '翻译失败'
                self.write(line + '\n')
                self.write(trans)
                index += 1
                print(index, end=' ', flush=True)
    Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
def merge_pdfs(origin, num_pages, aux, verso=None, above=False, allPage=False):
    """Merge ``aux`` onto the pages of ``origin`` and return the new PDF.

    General-purpose merging helper shared by several plugins.  ``origin``
    is the background and the first page of ``aux`` is merged with it.

    :param origin: bytes of the base PDF.
    :param num_pages: number of pages of ``origin`` to process.
    :param aux: bytes of the PDF whose first page is merged in.
    :param verso: passed to ``_merge_verso`` for odd-indexed pages.
    :param above: forwarded to the merge task and to ``_merge_verso``.
    :param allPage: when true, ``aux`` is merged on every page instead of
        only on even-indexed pages.
    :return: bytes of the merged PDF.
    :raises MergePDFException: on any failure (the traceback is logged).
    """
    try:
        output = PdfFileWriter()
        input_result = PdfFileReader(io.BytesIO(origin))
        pages = []
        for i in range(num_pages):
            page_origin = input_result.getPage(i)
            if allPage or i % 2 == 0:
                page_aux = PdfFileReader(io.BytesIO(aux)).getPage(0)
                pages.append(tasks.merge.delay(page_origin, page_aux, above))
            else:
                # NOTE(review): presumably appends its result to `pages`
                # -- confirm against _merge_verso.
                _merge_verso(verso, page_origin, above, pages)

        for page in pages:
            if isinstance(page, PyPDF2.pdf.PageObject):
                output.addPage(page)
            else:
                # Anything else is a pending task result: fetch it.
                output.addPage(page.get())

        out_io = io.BytesIO()
        output.write(out_io)
        return out_io.getvalue()
    except Exception:
        labresult.app.logger.error(traceback.format_exc())
        raise MergePDFException('Error while merging PDFs')
def getDataUsingPyPdf2(filename):
    """Return the text of every page of a PDF, each followed by a newline.

    :param filename: path of the PDF to read.
    :return: the concatenated page texts.
    """
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
def readPDFfile(infile):
    """Return the text of every page of a PDF, each followed by a newline.

    :param infile: path of the PDF to read.
    :return: the concatenated page texts.
    """
    # The original call had an unbalanced parenthesis and no `open`.
    with open(infile, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
class PdfInput:
    """A PDF opened for text extraction and page rendering.

    Usable as a context manager; leaving the ``with`` block closes the
    underlying file.  Page numbers passed to the methods are 1-based.
    """

    def __init__(self, filename):
        self.filename = filename
        self.stream = open(filename, "rb")
        try:
            self.pdf_reader = PdfFileReader(self.stream)
        except Exception:
            # Do not leak the handle when the file cannot be parsed.
            self.stream.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file."""
        self.stream.close()

    def get_number_of_pages(self):
        """Return the number of pages in the document."""
        return self.pdf_reader.getNumPages()

    def get_page_text(self, pageno):
        """Return the extracted text of page ``pageno`` (1-based)."""
        page = self.pdf_reader.getPage(pageno - 1)
        return page.extractText()

    def get_page_image(self, pageno):
        """Render page ``pageno`` and return it as a loaded image."""
        with tempfile.NamedTemporaryFile() as tmp_file:
            with self._open_page_image(pageno, tmp_file.name) as image:
                # Load before the temporary file is removed.
                image.load()
                return image

    def get_page_png_file(self, pageno):
        """Render page ``pageno`` into a temporary ``.png`` file.

        :return: the open ``NamedTemporaryFile``.
        """
        tmp_file = tempfile.NamedTemporaryFile(suffix=".png")
        with self._open_page_image(pageno, tmp_file.name) as image:
            image.save(tmp_file)
        return tmp_file

    def _open_page_image(self, pageno, image_file):
        """Run Ghostscript to render one page to ``image_file`` and load it."""
        print("pdf -> png...", file=sys.stderr)
        subprocess.run([
            "gs",
            "-sDEVICE=png16m",
            "-dNOPAUSE",
            "-dFirstPage={}".format(pageno),
            "-dLastPage={}".format(pageno),
            "-sOutputFile={}".format(image_file),
            "-r300",
            "-q",
            self.filename,
            "-c",
            "quit"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL)
        return load(image_file)
def getPdffileBookmark(filename, bookmark_file_savepath):
    """Dump the outline (bookmarks) of a PDF to a tab-indented text file.

    Each bookmark becomes one line: one tab per nesting level, the title,
    a tab, and the 1-based page number, terminated by ``\\r\\n``.

    :param filename: path of the PDF to read.
    :param bookmark_file_savepath: path of the UTF-8 text file to write.
    """
    with open(filename, "rb") as pdf_stream:
        pdf = PdfFileReader(pdf_stream)
        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)

        # Map each page's indirect-object id to its 1-based page number.
        pageLabels = {}
        for i in range(pagecount):
            pageLabels[pdf.getPage(i).indirectRef.idnum] = i + 1

        outlines = pdf.getOutlines()
        print(outlines)

        with codecs.open(bookmark_file_savepath, 'w',
                         encoding='utf-8') as bookmark_file:

            def write_entries(entries, jibie):
                # Write `entries` at nesting level `jibie`; nested lists
                # are one level deeper.
                for entry in entries:
                    if isinstance(entry, PyPDF2.generic.Destination):
                        bookmark_file.write(
                            '\t' * jibie + entry['/Title'] + '\t'
                            + str(pageLabels[entry.page.idnum]) + '\r\n')
                    elif isinstance(entry, list):
                        write_entries(entry, jibie + 1)

            for outline in outlines:
                print(len(outline), outline)
                write_entries([outline], 0)
def searchPDF(filename, search_term):
    """Return the 1-based numbers of the pages containing ``search_term``.

    The comparison is case-insensitive.

    :param filename: path of the PDF to search.
    :param search_term: text to look for.
    :return: list of matching page numbers in ascending order.
    """
    search_term = search_term.lower()
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return [
            i + 1
            for i in range(pdf.getNumPages())
            if search_term in pdf.getPage(i).extractText().lower()
        ]
def convertPDFAlternative(self, path):
    """Append the text of every page of ``path`` to ``self.pages``.

    :param path: path of the PDF to read.
    :return: ``False`` when ``path`` does not exist, otherwise ``True``.
    """
    from PyPDF2.pdf import PdfFileReader
    if not os.path.exists(path):
        return False
    with open(path, "rb") as stream:
        pdf = PdfFileReader(stream)
        for i in range(pdf.getNumPages()):
            print(i)
            self.pages.append(pdf.getPage(i).extractText())
    return True
def get(self, request, *args, **kwargs):
    """Serve ``riyuu-format4.pdf`` with generated marks merged onto page 1.

    An overlay PDF is drawn in memory through ``self.motion_purpose_draw``,
    merged onto the first page of the template and returned as the
    attachment ``hello.pdf``.
    """
    font_name = "HeiseiMin-W3"
    pdfmetrics.registerFont(UnicodeCIDFont(font_name))
    template_reader = PdfFileReader('media/pdf/riyuu-format4.pdf')

    # Canvas the overlay is drawn on.
    overlay_buffer = io.BytesIO()
    overlay_canvas = canvas.Canvas(overlay_buffer)
    overlay_canvas.setFont(font_name, 11)

    # Layout values handed to motion_purpose_draw.
    initial = 295
    before_rect_x = 748
    after_rect_x = 776.5
    line_height = 11.9

    rows = (
        ('便器からの立ち座り', True, False),
        ('トイレまでの移動', False, True),
        ('トイレ出入口の出入(扉の開閉含む)', True, False),
    )
    input_list = [
        {'label': label, 'before_flag': before, 'after_flag': after}
        for label, before, after in rows
    ]

    material = PdfMaterial.objects.get(key="welfare_equipment")
    overlay_canvas = self.motion_purpose_draw(
        overlay_canvas, before_rect_x, after_rect_x, material.materials,
        input_list, initial, line_height)
    overlay_canvas.showPage()
    overlay_canvas.save()

    # Parse the overlay and merge it onto the template page.
    overlay_buffer.seek(0)
    overlay_reader = PdfFileReader(overlay_buffer)
    merged_page = template_reader.getPage(0)
    merged_page.mergePage(overlay_reader.getPage(0))

    writer = PdfFileWriter()
    writer.addPage(merged_page)
    response_buffer = io.BytesIO()
    writer.write(response_buffer)
    response_buffer.seek(0)
    print('finish')
    return FileResponse(response_buffer, as_attachment=True,
                        filename='hello.pdf')
def getDataUsingPyPdf2(filename):
    """Return the whitespace-normalised ASCII text of a whole PDF.

    The text of all pages is joined, non-breaking spaces are replaced,
    every run of whitespace is collapsed to a single space and non-ASCII
    characters are dropped.

    :param filename: path of the PDF to read.
    :return: the text as ``bytes``.
    """
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        content = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
    content = " ".join(content.replace("\xa0", " ").strip().split())
    return content.encode("ascii", "ignore")
def add_filler(options):
    """Collect the pages of all configured filler PDFs.

    Reads ``options['filler']``: returns ``[]`` when ``include`` is falsy
    or ``order`` is empty (the latter with a warning).  Otherwise every
    page of ``<directory>/<name>.pdf`` is collected for each name in
    ``order``; pages with unexpected dimensions only trigger a warning and
    files that cannot be opened are skipped.

    :param options: configuration dictionary.
    :return: list of page objects.
    """
    settings = options['filler']
    if not filler_enabled(settings):
        return []
    if len(settings["order"]) == 0:
        thread_print("WARNING: No filler ordering was specified, filler will not be added")
        return []

    collected = []
    for filename in settings["order"]:
        pdf_path = os.path.join(settings["directory"], f'{filename}.pdf')
        try:
            # NOTE(review): the stream is left open, presumably because
            # the returned pages still read from it -- confirm.
            filler = PdfFileReader(open(pdf_path, 'rb'))
            page_total = filler.getNumPages()
            for i in range(page_total):
                page: PageObject = filler.getPage(i)
                if not validate_mediabox(page.mediaBox, options):
                    thread_print(f'WARNING: Page {i + 1} in "{filename}" has incorrect dimensions\nExpected {options["page-size"]["width"]} x {options["page-size"]["height"]}, received {float(page.mediaBox.getWidth()) / inch} x {float(page.mediaBox.getHeight()) / inch}.')
                collected.append(filler.getPage(i))
        except OSError:
            thread_print(f'WARNING: Unable to open file "{filename}.pdf", this item will be skipped.')
            continue
    return collected


def filler_enabled(settings):
    """Return whether the filler settings ask for filler to be included."""
    return bool(settings["include"])
def get(self, request, *args, **kwargs):
    """Serve ``sample.pdf`` with a test string merged onto its first page.

    The string is drawn on an in-memory canvas, merged onto page 1 of the
    template and returned as the attachment ``hello.pdf``.
    """
    font_name = "HeiseiKakuGo-W5"
    pdfmetrics.registerFont(UnicodeCIDFont(font_name))

    overlay_buffer = io.BytesIO()
    overlay_canvas = canvas.Canvas(overlay_buffer)
    template_reader = PdfFileReader('media/pdf/sample.pdf')
    base_page = template_reader.getPage(0)

    # Draw the overlay text and finish the overlay document.
    overlay_canvas.setFont(font_name, 24)
    overlay_canvas.drawString(0, 820, "テスト")
    overlay_canvas.showPage()
    overlay_canvas.save()

    # Parse the overlay from its start and merge it onto the template page.
    overlay_buffer.seek(0)
    overlay_page = PdfFileReader(overlay_buffer).getPage(0)
    base_page.mergePage(overlay_page)

    writer = PdfFileWriter()
    writer.addPage(base_page)
    response_buffer = io.BytesIO()
    writer.write(response_buffer)
    response_buffer.seek(0)
    return FileResponse(response_buffer, as_attachment=True,
                        filename='hello.pdf')
def pdfSplit(pdf_main, pdf_part):
    """Write the last page of a PDF, rotated 90 degrees, to a new file.

    :param pdf_main: the source PDF, passed straight to ``PdfFileReader``.
    :param pdf_part: path of the single-page PDF to write.
    :return: the page count minus one, or ``False`` when anything fails.
        Note that a one-page source yields ``0``, which is also falsy.
    """
    try:
        pdf_read_obj = PdfFileReader(pdf_main)
        page_num = pdf_read_obj.getNumPages()
        page_last_obj = pdf_read_obj.getPage(page_num - 1)
        page_last_obj.rotateClockwise(90)

        pdf_write_obj = PdfFileWriter()
        pdf_write_obj.addPage(page_last_obj)
        # Close the output so the data is flushed before returning.
        with open(pdf_part, 'wb') as out_stream:
            pdf_write_obj.write(out_stream)
        return page_num - 1
    except Exception:
        return False
def clickOK():
    """Print the page count of the selected PDF and the text of one page.

    The file name comes from ``selectPDF`` (with ``.pdf`` appended) and the
    zero-based page index from ``entry1``.  Any failure while reading the
    index or extracting the page prints ``except``.
    """
    with open(selectPDF.get() + '.pdf', 'rb') as stream:
        File = PdfFileReader(stream)
        page_cound = File.getNumPages()
        pprint.pprint(page_cound)
        try:
            # NOTE(review): entry1.get() is assumed to return text (e.g. a
            # Tk entry), which never equals an int loop index -- confirm.
            wanted = int(entry1.get())
            if 0 <= wanted < page_cound:
                ageList = [File.getPage(wanted).extractText()]
                pprint.pprint(ageList[0])
        except Exception:
            print("except")
def PdfPassword(filepath, password):
    """Write a password-protected copy of a PDF next to the original.

    The copy is named ``temp_<ts>_<original name>`` and encrypted with
    ``password``.  Prints a message and exits the interpreter when
    ``filepath`` is not a file or does not end in ``.pdf``.

    :param filepath: path of the PDF to protect.
    :param password: password to encrypt the copy with.
    """
    # Guard: the file has to exist.
    if not os.path.isfile(filepath):
        print("Check The File Path")
        sys.exit()

    # Guard: only files with a .pdf extension are accepted.
    directory, filename = os.path.split(filepath)
    file_extension = os.path.splitext(filepath)[1]
    if file_extension != ".pdf":
        print(
            f"Not A PDF File Given, File Has Extension: {file_extension}")
        sys.exit()

    output_file = os.path.join(directory, f"temp_{ts}_{filename}")

    # Copy every page of the original into a fresh writer.
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(filepath)
    for idx in range(pdf_reader.numPages):
        pdf_writer.addPage(pdf_reader.getPage(idx))

    pdf_writer.encrypt(password, use_128bit=True)
    with open(output_file, "wb") as out_stream:
        pdf_writer.write(out_stream)
    print('File Written To Path:', output_file)
def PdfMultiplePassword(filepaths, password):
    """Write a password-protected copy of each given PDF.

    Copies are written to the working directory as
    ``merge_enc_<n>_<ts>.pdf`` with ``n`` counting from 1.  Prints a
    message and exits the interpreter when a path is not a file or when
    any path does not end in ``.pdf``.

    :param filepaths: paths of the PDFs to protect.
    :param password: password to encrypt every copy with.
    """
    # Guard: report the first path that is not an existing file.
    missing = [candidate for candidate in filepaths
               if not os.path.isfile(candidate)]
    if missing:
        print(f"File Doesn't Exists: {missing[0]}")
        sys.exit()

    # Guard: every path must carry the .pdf extension.
    if any(os.path.splitext(candidate)[1] != ".pdf"
           for candidate in filepaths):
        print("Submit Only PDF Files")
        sys.exit()

    for count, path in enumerate(filepaths, start=1):
        pdf_writer = PdfFileWriter()
        pdf_reader = PdfFileReader(path)
        for page_index in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page_index))

        output_file = f"merge_enc_{count}_{ts}.pdf"
        pdf_writer.encrypt(password, use_128bit=True)
        with open(output_file, 'wb') as out_stream:
            pdf_writer.write(out_stream)
        print('File Written To Path:', output_file)
def calculate_locations(filename, keywords):
    """Find where each keyword occurs on every page of a PDF.

    Each page is laid out with pdfminer; ``get_location`` is asked for the
    positions of every keyword, offset by the lower-left corner of the
    page's trim box as reported by PyPDF2.

    :param filename: path of the PDF to analyse.
    :param keywords: iterable of keywords to look for.
    :return: list of ``LocationKeeper`` objects, one per hit.
    :raises PDFTextExtractionNotAllowed: when the document forbids text
        extraction.
    """
    locations = []
    with open(filename, 'rb') as fp, open(filename, 'rb') as reader_fp:
        parser = PDFParser(fp)
        # Document object that stores the document structure.
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, layout parameters and the aggregating device.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        reader = PdfFileReader(reader_fp)
        for pagenum, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            # The layout object for this page.
            layout = device.get_result()

            trim_box = reader.getPage(pagenum).trimBox
            x = trim_box[0].as_numeric()
            y = trim_box[1].as_numeric()
            # Special case: a positive x with a negative y resets x.
            if x > 0 and y < 0:
                x = 0

            for keyword in keywords:
                print('********************************')
                co_ordinates = get_location(keyword, layout, x, y)
                print('Keyword %s , location %s' % (keyword, co_ordinates))
                print('********************************')
                if co_ordinates is not None:
                    for location in co_ordinates:
                        print("PageNum-->%s" % pagenum)
                        locations.append(
                            LocationKeeper(keyword, location, pagenum))
    return locations
def getTitlePDFfromBookmarkfile(pdf_filepath, bookmark_filepath, pdf_filepath_output):
    """Extract a page range chosen from a bookmark file into a new PDF.

    The bookmark file holds ``title<TAB>page`` lines.  The start page is
    taken from the line whose text contains the contents marker; the end
    page is the first listed page greater than the start index.  Both are
    reduced by one before being used as zero-based page indexes.

    :param pdf_filepath: path of the PDF to read.
    :param bookmark_filepath: path of the UTF-8 bookmark file.
    :param pdf_filepath_output: path of the PDF to write.
    :return: None.
    """
    with codecs.open(bookmark_filepath, 'r',
                     encoding='utf-8') as bookmark_file:
        lines = bookmark_file.readlines()

    page_start = 0
    for line in lines:
        if line.find(u'目录') >= 0:
            line = line.strip()
            print(line)
            print(line.split('\t'))
            page_start = int(line.split('\t')[1])
    page_start -= 1
    print(page_start)

    # Page numbers of every bookmark line (the text after the last tab).
    page_list = []
    for line in lines:
        line = line.strip()
        if line.find('\t') >= 0:
            page_list.append(int(line.rsplit('\t', 1)[1]))

    page_end = 0
    for page_number in page_list:
        if page_number > page_start:
            page_end = page_number
            break
    page_end -= 1
    print(page_end)

    if page_end <= page_start and page_start >= 0 and page_end > 0:
        print('not find title page')
        return

    with open(pdf_filepath, "rb") as pdf_stream:
        pdf = PdfFileReader(pdf_stream)
        output = PdfFileWriter()
        for i in range(page_start, page_end + 1):
            output.addPage(pdf.getPage(i))
        # Written while the source is still open.
        with open(pdf_filepath_output, 'wb') as stream:
            output.write(stream)
def split_pdf(inFile, outFile, start_page=2):
    """Copy the tail of a PDF into a new file.

    :param inFile: path of the PDF to read.
    :param outFile: path of the PDF to write.
    :param start_page: zero-based index of the first page to keep;
        defaults to 2, i.e. the first two pages are dropped.
    :return: None.
    """
    with open(inFile, 'rb') as source:
        pdfFileReader = PdfFileReader(source)
        page_count = pdfFileReader.getNumPages()
        print(page_count)

        pdfFileWriter = PdfFileWriter()
        for i in range(start_page, page_count):
            pdfFileWriter.addPage(pdfFileReader.getPage(i))

        # Written while the source is still open.
        with open(outFile, 'wb') as target:
            pdfFileWriter.write(target)
def enumerate_pages(files, options, style=0, start=None, page_map=None, write_pages=False, no_filler_before=None, verbose=False):
    """Assign a running label to each input file and collect its pages.

    Labels are integers when ``style`` is 0 and single characters starting
    at ``'A'`` otherwise.  Pages are collected only when ``write_pages``
    is true; pages with unexpected dimensions are skipped with a warning.

    :param files: paths of the PDFs to read, in order.
    :param options: configuration dictionary.
    :param style: 0 for numeric labels, anything else for letters.
    :param start: first label; defaults to 1 or ``'A'`` depending on
        ``style``.
    :param page_map: optional dict that receives ``label -> file``.
    :param write_pages: when false, no pages are added to the result.
    :param no_filler_before: files that must not be preceded by filler;
        when truthy, filler pages are loaded and interlaced.
    :param verbose: unused.
    :return: list of page objects.
    """
    pages = []
    if start is None:
        start = 1 if style == 0 else 'A'
    counter = start

    # Read filler pages only if filler needs to be interlaced.
    filler = None
    if no_filler_before:
        filler = add_filler(options)
    valid_filler_indeces = []

    for path in files:
        check_stop_script()
        try:
            # Save the label assignment to the map.
            if page_map is not None:
                page_map[counter] = path
            if filler and path not in no_filler_before:
                valid_filler_indeces.append(len(pages))

            # NOTE(review): the stream is left open, presumably because
            # the collected pages still read from it -- confirm.
            reader = PdfFileReader(open(path, 'rb'))
            num_pages = reader.getNumPages()
            page_num = f'{counter}'

            for i in range(num_pages):
                input_page: PageObject = reader.getPage(i)
                # Verify that the page has the expected dimensions.
                mediabox = input_page.mediaBox
                if not validate_mediabox(mediabox, options):
                    thread_print(f'WARNING: Page {i + 1} in "{path}" has incorrect dimensions\nExpected {options["page-size"]["width"]} x {options["page-size"]["height"]}, received {float(mediabox.getWidth()) / inch} x {float(mediabox.getHeight()) / inch}.')
                    continue
                if not write_pages:
                    continue
                # Multi-page files get a "<label>.<page>" sub-number.
                if num_pages > 1:
                    page_num = f'{counter}.{i + 1}'
                pages.append(add_page_num(input_page, page_num, options) if options["enumerate-pages"] else input_page)
        except OSError:
            thread_print(f'Error when parsing "{path}"')

        # Advance to the next number or letter.
        counter = (counter + 1) if style == 0 else (chr(ord(counter) + 1))

    # Interlace filler (if applicable).
    if filler:
        interlace_filler(pages, filler, valid_filler_indeces)
    return pages
def _removePropertyEndPage(self, file_pdf):
    """Remove the unneeded last page of the asset-detail PDF in place.

    All pages except the last are written to ``<file_pdf>tmp.pdf``, which
    then replaces ``file_pdf``.

    :param file_pdf: path of the PDF to shorten.
    """
    tmp_path = file_pdf + 'tmp.pdf'
    with open(file_pdf, "rb") as fd_in:
        pdf_in = PdfFileReader(fd_in)
        pdf_out = PdfFileWriter()
        for num in range(pdf_in.getNumPages() - 1):
            pdf_out.addPage(pdf_in.getPage(num))
        with open(tmp_path, "wb") as fd_out:
            pdf_out.write(fd_out)
    # One replace instead of remove + rename: the original is never gone
    # while the new file is not yet in place.
    os.replace(tmp_path, file_pdf)
    print(' > 已把最后一页删除')
class PdfLoader:
    """A PDF opened for text extraction and page rendering.

    Usable as a context manager; ``close()`` releases the underlying
    file.  Page numbers passed to the methods are 1-based.
    """

    def __init__(self, filename):
        self.filename = filename
        self.stream = open(filename, "rb")
        try:
            self.pdf_reader = PdfFileReader(self.stream)
        except Exception:
            # Do not leak the handle when the file cannot be parsed.
            self.stream.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file."""
        self.stream.close()

    def get_number_of_pages(self):
        """Return the number of pages in the document."""
        return self.pdf_reader.getNumPages()

    def get_page_text(self, pageno):
        """Return the extracted text of page ``pageno`` (1-based)."""
        page = self.pdf_reader.getPage(pageno - 1)
        return page.extractText()

    def get_page_image(self, pageno):
        """Render page ``pageno`` with Ghostscript and return the image.

        :raises LoadError: when Ghostscript exits with a non-zero status.
        """
        # Only the unique name is needed; Ghostscript creates the file.
        tmp_file = tempfile.NamedTemporaryFile("wb")
        path = tmp_file.name
        tmp_file.close()

        with open(os.devnull, 'wb') as stdio:
            return_value = call(["gs", "-sDEVICE=png16m", "-dNOPAUSE",
                                 "-dFirstPage=%d" % pageno,
                                 "-dLastPage=%d" % pageno,
                                 "-sOutputFile=%s" % path,
                                 "-r300", "-q", self.filename,
                                 "-c", "quit"],
                                stdout=stdio, stderr=stdio)
        if return_value != 0:
            try:
                os.unlink(path)
            except OSError:
                pass
            raise LoadError()

        img = Image.open(path)
        try:
            # Read the pixel data before the backing file is deleted.
            img.load()
        finally:
            os.unlink(path)
        return img
def merge_page_nums(pages: List[PageObject], options, filename='page_nums.pdf'):
    """Merge each given page onto the matching page of the page-number PDF.

    The page-number PDF is read from ``<folder-dir>/tmp/<filename>``.  For
    every input page the page with the same index is taken from it, the
    input page is merged into it, and the result is collected.

    :param pages: pages to be merged, in order.
    :param options: configuration dictionary; ``options["folder-dir"]``
        is read here.
    :param filename: name of the page-number PDF inside the ``tmp`` folder.
    :return: list of the merged page objects.
    """
    output = []
    path = os.path.join(options["folder-dir"], "tmp", filename)
    with open(path, 'rb') as f:
        page_num_pdf = PdfFileReader(f)
        for i, page in enumerate(pages):
            target: PageObject = page_num_pdf.getPage(i)
            target.mergePage(page)
            # For some reason the text doesn't appear properly if we don't write first
            thread_print("Writing extra output file because this is somehow necessary")
            tmp_out = PdfFileWriter()
            tmp_out.addPage(target)
            # NOTE(review): the actual write is commented out, so this only
            # creates/truncates page_num_overlap.pdf; the inner `f` also
            # rebinds the name of the outer input handle -- confirm intent.
            with open(os.path.join(options["folder-dir"], "tmp", "page_num_overlap.pdf"), 'wb') as f:
                pass#tmp_out.write(f)
            output.append(target)
    return output
def addBlankpage(inFile, outFile):
    """Copy a PDF and append one blank page at the end.

    :param inFile: the source PDF, passed straight to ``PdfFileReader``.
    :param outFile: path of the PDF to write.
    :return: None.
    """
    pdfFileWriter = PdfFileWriter()
    pdfFileReader = PdfFileReader(inFile)
    for index in range(pdfFileReader.getNumPages()):
        pdfFileWriter.addPage(pdfFileReader.getPage(index))

    # Append the blank page, then write once through a handle that is
    # closed afterwards.
    pdfFileWriter.addBlankPage()
    with open(outFile, 'wb') as target:
        pdfFileWriter.write(target)
def pdf_read(file_name):
    """Print the page count and the text of every page of a PDF.

    :param file_name: path of the PDF to read.
    :return: None.
    """
    # `file()` and the `print` statement are Python 2 only.
    with open(file_name, "rb") as stream:
        pdf = PdfFileReader(stream)
        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)
        content = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
    print(content)
def _create_hyperlinks(self, link_locations, page_locations):
    """Copy ``tmp2.pdf`` to ``self.filename`` and add one link per entry.

    Entry ``i`` gets a link with rectangle ``link_locations[i]`` that
    points to page ``page_locations[i] - 1``.  The page holding the link
    is derived from ``self.toc_orientation`` and the matching
    items-per-page setting; it is 1 for any other orientation.

    :param link_locations: link rectangles, one per entry.
    :param page_locations: 1-based destination pages, one per entry.
    """
    source = PdfFileReader("tmp2.pdf")
    writer = PdfFileWriter()
    for page_index in range(source.getNumPages()):
        writer.addPage(source.getPage(page_index))

    for item_index, rect in enumerate(link_locations):
        toc_page = 1
        if self.toc_orientation == "P":
            per_page = settings["Items on vertical toc"]
            toc_page = math.floor(item_index / per_page)
        if self.toc_orientation == "L":
            per_page = settings["Items on horizontal toc"]
            toc_page = math.floor(item_index / per_page)
        destination = page_locations[item_index] - 1
        writer.addLink(pagenum=toc_page,
                       pagedest=destination,
                       rect=rect,
                       fit="/Fit",
                       border=[0, 0, 0])

    with open(self.filename, 'wb') as out:
        writer.write(out)
def crop(pdf_in, pdf_out):
    """Crop every page of a PDF to the paper size detected for it.

    The second parameter is temporary: in production the result is meant
    to be saved back into the input file.

    :param pdf_in: absolute path of the PDF to crop.
    :param pdf_out: absolute path of the cropped PDF to write.
    :return: status, always ``True``.
    :raises ValueError: when a page names a printing press that is not in
        ``PrintingPress._registry``.
    """
    status = True

    # Paper size for each page, keyed by 1-based page number,
    # like {1: ('Speedmaster', 900, 640), 2: ('Dominant', 640, 450)}
    papers = analyze_papersize(pdf_in)

    # TODO: refine this temporary fallback for the case where no paper
    # size information is available.
    if papers == {}:
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted; confirm they can never come from untrusted input.
        perl_crop = "perl pdfcrop.pl {} {}".format(pdf_in, pdf_out)
        os.system(perl_crop)
        return status

    # `file()` and the `print` statement are Python 2 only.
    with open(pdf_in, "rb") as source:
        reader = PdfFileReader(source)
        output = PdfFileWriter()

        for index in range(reader.getNumPages()):
            paper_machine = papers[index + 1][0]
            paper_w = papers[index + 1][1]
            paper_h = papers[index + 1][2]

            # Look the press up for every page so that a stale match from
            # the previous page can never be reused.
            machine = None
            for press in PrintingPress._registry:
                if paper_machine == press.name:
                    machine = press
            if machine is None:
                raise ValueError(
                    'No printing press named {!r} is registered '
                    '(page {})'.format(paper_machine, index + 1))
            plate_w = machine.plate_w

            page = reader.getPage(index)
            print('Crop page {} to paper {}x{}'.format(index + 1, paper_w, paper_h))
            # Lower-left corner: left offset, bottom offset.
            page.mediaBox.lowerLeft = ((pt(plate_w - paper_w) / 2),
                                       pt(machine.klapan))
            # Upper-right corner: width + offset, height + offset.
            page.mediaBox.upperRight = (pt(paper_w + (plate_w - paper_w) / 2),
                                        pt(paper_h + machine.klapan))
            output.addPage(page)

        # Written while the source is still open.
        with open(pdf_out, "wb") as outputstream:
            output.write(outputstream)
    return status