def diff_pdf_pages(pdf1_path, pdf2_path):
    """Return the indices of pages of the first PDF that differ from the second.

    Both arguments are wrapped in ``io.BytesIO``, so despite their names they
    carry raw PDF bytes rather than filesystem paths.

    If ``pdf1_path`` is falsy, every page index of the second PDF is returned.
    Otherwise each page of the first PDF is re-serialised on its own and
    compared byte-for-byte with the same-numbered page of the second PDF.
    Indices past the last page of the second PDF are always reported.
    """
    second = PdfFileReader(io.BytesIO(pdf2_path))
    second_count = second.getNumPages()
    if not pdf1_path:
        return list(range(second_count))

    first = PdfFileReader(io.BytesIO(pdf1_path))

    def _single_page_bytes(reader, index):
        # Serialise one page as a stand-alone PDF and return the raw bytes.
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(index))
        buf = io.BytesIO()
        writer.write(buf)
        return buf.getvalue()

    changed = []
    for index in range(first.getNumPages()):
        if index >= second_count:
            changed.append(index)
        elif _single_page_bytes(first, index) != _single_page_bytes(second, index):
            changed.append(index)
    return changed
def main():
    """Command-line entry point: process every page and assemble a new PDF.

    Each page of the input PDF is handed to ``process_page`` (the third
    argument is True for the first ``--skip`` pages). The per-page images
    expected at ``./temp/<index>.jpg`` are then combined with ``img2pdf`` and
    written to the ``--output`` file.
    NOTE(review): presumably ``process_page`` is what writes those JPEGs;
    it is defined outside this block, so confirm.

    The ``./temp/`` directory is removed on exit, including on failure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_pdf_path', metavar='PATH')
    # The output is written unconditionally, so require it up front instead of
    # failing with an AttributeError after every page has been processed.
    parser.add_argument('-o', '--output', metavar='out',
                        type=argparse.FileType('wb'), required=True,
                        help='Output PDF file')
    parser.add_argument('-s', '--skip', type=int, default=0,
                        help='Skip over the first n page(s).')
    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logging.basicConfig(level='INFO',
                        format='%(asctime)s - %(levelname)s - %(message)s')

    directory = './temp/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    try:
        images_path = []
        with open(args.input_pdf_path, "rb") as source:
            pdf = PdfFileReader(source)
            total = pdf.getNumPages()
            for i in range(total):
                logger.info("Processing page {}/{}".format(i + 1, total))
                images_path.append("./temp/{}.jpg".format(i))
                process_page(pdf, i, i < args.skip)

        logger.info('Writing to output PDF file')
        args.output.write(
            img2pdf.convert(*list(map(img2pdf.input_images, images_path))))
        logger.info('Done')
    finally:
        # Always release the output handle and the scratch directory.
        args.output.close()
        shutil.rmtree(directory, True)
def write_pdf(self, output):
    """Render this object's RML template to PDF and write it to ``output``.

    The template is the first match among the subject-type slug, the
    subject-type kind and the generic ``subject.rml`` under
    ``leprikon/<pdf_export>/``. When ``self.print_setup.background`` is set,
    each rendered page is merged onto the background page with the same index.
    Rendered pages beyond the background's length are used unchanged.

    Returns ``output``.
    """
    export = self.pdf_export
    candidates = [
        'leprikon/{}/{}.rml'.format(export, self.subject.subject_type.slug),
        'leprikon/{}/{}.rml'.format(export, self.subject.subject_type.subject_type),
        'leprikon/{}/subject.rml'.format(export),
    ]
    template = select_template(candidates)
    rml_content = template.render({
        'object': self,
        'site': LeprikonSite.objects.get_current(),
    })
    pdf_content = trml2pdf.parseString(rml_content.encode('utf-8'))

    # No background configured: emit the plain rendered PDF.
    if not self.print_setup.background:
        output.write(pdf_content)
        return output

    background_pdf = PdfFileReader(self.print_setup.background.file)
    rendered_pdf = PdfFileReader(BytesIO(pdf_content))
    writer = PdfFileWriter()
    for index in range(rendered_pdf.getNumPages()):
        rendered_page = rendered_pdf.getPage(index)
        if index < background_pdf.getNumPages():
            # Draw the rendered content on top of the background page.
            merged = background_pdf.getPage(index)
            merged.mergePage(rendered_page)
        else:
            merged = rendered_page
        writer.addPage(merged)
    writer.write(output)
    return output
def iter_pdf_page_text(self, filename):
    """Yield the text of each page of ``filename`` as ASCII-encoded bytes.

    Records ``filename`` on ``self``, logs the page count and the document
    metadata, then yields one value per page with newlines replaced by spaces
    and non-ASCII characters dropped. The file is closed once the generator is
    exhausted or closed.
    """
    self.filename = filename
    with open(filename, "rb") as stream:
        reader = PdfFileReader(stream)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s" % (page_count, filename))
        metadata = reader.getDocumentInfo()
        logging.info("METADATA: " + str(metadata))
        try:
            if metadata and '/CreationDate' in metadata:
                # PDF dates look like "D:YYYYMMDDHHmmSS...": skip the "D:"
                # prefix, then take 4 digits of year, 2 of month and 2 of day.
                created = metadata['/CreationDate']
                mydate = created[2:6] + "-" + created[6:8] + "-" + created[8:10]
            else:
                mydate = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S")
        except Exception:
            # hack ... but sometimes /creationdate is bunged
            mydate = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
        # NOTE(review): mydate is derived but never used in this block;
        # confirm whether it was meant to be stored or yielded.

        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: on Python 3 the encoded value
            # is bytes, and bytes.replace() rejects str arguments.
            yield text.replace('\n', ' ').encode('ascii', 'ignore')
def imp_exp_pdf(inputfile, outputfile, size, margin, padding):
    """Import a PDF, lay its pages out in batches of ten, and export the result.

    Pages are read from ``inputfile`` and handed to ``output_one_page`` ten at
    a time, with a final shorter batch for any remainder. The accumulated
    output is saved to ``outputfile``. ``size``, ``margin`` and ``padding``
    are forwarded unchanged to ``output_one_page``.
    """
    output = PdfFileWriter()
    # The source must stay open until the writer has been flushed, because
    # page content is read from the stream when the output is written.
    with open(inputfile, 'rb') as source:
        reader = PdfFileReader(source, strict=False)
        total_pages = reader.getNumPages()
        batch = []
        for i in range(total_pages):
            batch.append(reader.getPage(i))
            if len(batch) == 10:
                output_one_page(batch, size, margin, padding, output)
                batch = []
                print("Printed {} of {} [{:.2f}%]".format(
                    i + 1, total_pages, (i + 1) / float(total_pages) * 100))

        if batch:
            # Remainder batch. The old code also opened 'BlankA4.pdf' here
            # only to read dimensions it never used; that needless file
            # dependency has been removed.
            output_one_page(batch, size, margin, padding, output)
            print('')

        print('Completed converting.')
        print('Saving...')
        with open(outputfile, "wb") as output_stream:
            output.write(output_stream)
    print('END OF PROGRAM')
def union(input_files, output_file):
    """Concatenate PDFs and images into a single PDF at ``output_file``.

    Inputs ending in ``.pdf`` (case-insensitive) are appended page by page.
    Any other input is opened with PIL and converted to a one-off PDF held in
    memory, so no temporary ``<name>.pdf`` is created next to the image and an
    existing PDF with that name can no longer be overwritten and deleted.
    """
    import io

    output = PdfFileWriter()
    streams = []  # kept open until the writer has been flushed
    try:
        for input_file in input_files:
            if input_file.lower().endswith('.pdf'):
                stream = open(input_file, 'rb')
            else:
                # input_file isn't pdf ex. jpeg, png
                stream = io.BytesIO()
                im = PIL.Image.open(input_file)
                # The keyword was previously misspelled, so it was ignored.
                im.save(stream, 'PDF', resolution=100.0)
                stream.seek(0)
            streams.append(stream)

            reader = PdfFileReader(stream)
            for i in range(reader.getNumPages()):
                output.addPage(reader.getPage(i))

        with open(output_file, 'wb') as output_stream:
            output.write(output_stream)
    finally:
        for stream in streams:
            stream.close()

    print('completed.')
    print('Union of some file is ' + output_file)
def _iter_pdf(self, filename):
    """Yield the text of each page of ``filename`` as ASCII-encoded bytes.

    Newlines are replaced by spaces and non-ASCII characters are dropped.
    The file stays open while the generator is being consumed.
    """
    with open(filename, 'rb') as f:
        reader = PdfFileReader(f)
        page_count = reader.getNumPages()
        logging.debug("pdf scanner found %d pages in %s", page_count, filename)
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: on Python 3 the encoded value
            # is bytes, and bytes.replace() rejects str arguments.
            yield text.replace('\n', ' ').encode('ascii', 'ignore')
def iter_pdf_page_text(self, filename):
    """Yield the text of each page of ``filename`` as ASCII-encoded bytes.

    Records ``filename`` on ``self`` and logs the page count. Each yielded
    value has newlines replaced by spaces and non-ASCII characters dropped.
    The file is opened here so that it is closed deterministically when the
    generator finishes.
    """
    self.filename = filename
    with open(filename, 'rb') as stream:
        reader = PdfFileReader(stream)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s" % (page_count, filename))
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: on Python 3 the encoded value
            # is bytes, and bytes.replace() rejects str arguments.
            yield text.replace('\n', ' ').encode('ascii', 'ignore')
def add_page_numbers(inputfile, outputfile, startno=None, endno=None, fontname="Helvetica", fontsize=12,
                     pagenoformat="- %i -", pagesize=A4, posx=280, posy=800):
    """
    Adds page numbers to the input PDF file and stores the modified PDF in output.
    Optionally, the page range can be limited.

    :param inputfile: the input PDF
    :type inputfile: str
    :param outputfile: the output PDF
    :type outputfile: str
    :param startno: the first page to number, 1-based, use None to start from first page
    :type startno: int
    :param endno: the last page to number, 1-based, use None to end with last page
    :type endno: int
    :param fontname: the name of the font to use, eg 'Helvetica'
    :type fontname: str
    :param fontsize: the size of the font, eg 12
    :type fontsize: int
    :param pagenoformat: the format string for the page number, eg '- %i -'
    :type pagenoformat: str
    :param pagesize: the page size, eg A4
    :type pagesize: object
    :param posx: the X position for the page number
    :type posx: int
    :param posy: the Y position for the page number
    :type posy: int
    """
    import io

    if startno is None:
        startno = 1

    # Keep the input open until the output has been written; page content is
    # pulled from the source stream at write time.
    with open(inputfile, "rb") as source:
        inputpdf = PdfFileReader(source)
        total = inputpdf.getNumPages()
        if endno is None:
            endno = total

        outputpdf = PdfFileWriter()
        for i in range(total):
            page = i + 1
            current = inputpdf.getPage(i)
            # add page number?
            # taken from here: http://stackoverflow.com/a/17538003
            if startno <= page <= endno:
                # The overlay is binary PDF data, so it needs a bytes buffer.
                packet = io.BytesIO()
                can = canvas.Canvas(packet, pagesize=pagesize)
                can.setFont(fontname, fontsize)
                can.drawString(posx, posy, pagenoformat % page)
                can.save()
                packet.seek(0)
                pagenopdf = PdfFileReader(packet)
                logger.info("Page " + str(page) + " added")
                current.mergePage(pagenopdf.getPage(0))
            else:
                logger.info("Page " + str(page))
            outputpdf.addPage(current)

        with open(outputfile, "wb") as outputstream:
            outputpdf.write(outputstream)
def process(self, content, mimetype='application/pdf'):
    """Process a PDF document.

    Args:
        content: Binary content of the document.
        mimetype: Id of MIME type (content ignored if it isn't
            `application/pdf`).

    Returns:
        Tuple:
            Relevancy of the document (based on keywords)
            Metadata extracted from the document (dictionary). It always
            contains `_relevancy`, plus `_num_pages` and the document info
            entries for PDFs.
    """
    relevancy = 0
    metadata = {}
    if mimetype == 'application/pdf':
        # Obtain metadata
        doc = PdfFileReader(BytesIO(content))
        info = doc.getDocumentInfo()
        if info:
            for k in info:
                metadata[k] = info.getText(k)
        # Extra metadata
        num_pages = doc.getNumPages()
        metadata['_num_pages'] = num_pages

        # Lower-case the keywords once instead of on every comparison.
        keywords = [word.lower() for word in self.keywords or ()]
        if keywords:
            # Process title, subject and metadata keywords
            # TODO guess title from page text when not provided
            # getText() yields None for non-text entries, so coerce to ''.
            relevant = ' '.join(
                metadata.get(key) or ''
                for key in ('/Title', '/Subject', '/Keywords')
            ).lower()
            for word in keywords:
                if word in relevant:
                    # Each relevant keyword increases relevancy in 10 points
                    relevancy += 10

            # Process pages.
            distance_factor = 1
            for p in range(num_pages):
                # Break if factor is too low
                if distance_factor < 0.01:
                    break
                try:
                    text = doc.getPage(p).extractText().lower()
                except Exception:
                    # Some bad formed PDFs raise decoding errors. Skip page.
                    pass
                else:
                    for word in keywords:
                        relevancy += distance_factor * text.count(word)
                # Each new page reduces relevancy factor in a half.
                # Float divisor: integer division would zero the factor on
                # Python 2 after the first page.
                distance_factor /= 2.0

            # Relevancy is significant by the nearest tenth
            relevancy = round(relevancy, 1)

    metadata['_relevancy'] = relevancy
    return relevancy, metadata
def manipulatePDF():
    """Insert every test-table PDF into the BDSD PDF and write the result.

    Reads the module globals ``BDSDFullName``, ``contentTXT``, ``totalInTxt``
    and ``testTableDict``. Each test table is merged at the page recorded for
    its test item, and the page map is shifted after every insertion. The
    result is written to ``'merger output.pdf'``.

    A test table with no matching page entry is reported and skipped.
    The process exits if the BDSD page count does not match "content.txt".
    """
    global BDSDFullName, contentTXT, totalInTxt
    global testTableName, testTableFullName, testTableDict

    opened = []  # source streams must stay open until the merge is written
    try:
        bdsd_stream = open(BDSDFullName[0], 'rb')
        opened.append(bdsd_stream)
        input0 = PdfFileReader(bdsd_stream)
        merger1 = PdfFileMerger()
        numBDSD = input0.getNumPages()
        merger1.append(fileobj=input0, pages=(0, numBDSD))

        # count how many test tables are inserted to BDSD file.
        tableCount = 0
        testItemsPagesInitial = BDSDContentFillter(contentTXT)
        if numBDSD != totalInTxt + 1:
            print('''\nError!\nNumber of pages in "content.txt" are different from the "BDSD file". This process is forced to stop. Please check both files and then start over again.''')
            raw_input('Press any key to quit...')
            sys.exit(0)

        exceptCount = False
        for testTable in testTableDict:
            try:
                startPage = int(testItemsPagesInitial[getTestItem(testTableDict[testTable])])
            except KeyError:
                exceptCount = True
                errorMessage1_1 = "\nError: '%s'" % testTable
                errorMessage1_2 = "Above file is failed to merge into BDSD. You may want to abort this process and check both:\n 1. file name of test table, or\n 2. BDSD page number."
                print(errorMessage1_1)
                print(errorMessage1_2)
                message(errorMessage1_1)
                message(errorMessage1_2)
                # Skip this table: there is no valid insertion point, and
                # falling through would use an undefined or stale startPage.
                continue

            table_stream = open(testTable, 'rb')
            opened.append(table_stream)
            fileObj = PdfFileReader(table_stream)
            tableCount += 1
            pageIncrement = fileObj.getNumPages()
            # Pass the range by keyword: the third positional parameter of
            # merge() is the bookmark title, not the page selection.
            merger1.merge(startPage, fileObj, pages=(0, pageIncrement))
            testItemsPagesInitial = dictIncrement(testItemsPagesInitial, startPage, pageIncrement)

        try:
            with open('merger output.pdf', 'wb') as merged_stream:
                merger1.write(merged_stream)
        except Exception:
            # A failed write must not be reported as a successful merge.
            exceptCount = True
            errorMessage = "\nError: There's an error occured during generate the final output PDF file, please feedback this issue to ChuRui, thanks a lot.\n"
            print(errorMessage)
            message(errorMessage)
    finally:
        for stream in opened:
            stream.close()

    if exceptCount:
        warningMessage = "Warning: output PDF file couldn't be used in case there is an Error.\n"
        print(warningMessage)
        message(warningMessage)
    else:
        print("\n%d Test Tables successfully merged to \"%s\", please check the output file." % (tableCount, BDSDFullName[0]))
def merge_pdf(infnList, outfn):
    """Merge chapter PDFs into one bookmarked PDF.

    Each entry of ``infnList`` provides a ``'title'`` and ``'child_chapters'``.
    The PDFs are read from ``gen/<title>/`` next to this module: first
    ``<title>.pdf``, then one ``<child title>.pdf`` per child chapter. Every
    chapter gets a bookmark on its first page, with child bookmarks nested
    under their parent. After a successful write the whole ``gen`` directory
    is deleted.

    :param infnList: list of chapter descriptors to merge
    :param outfn: file name of the merged PDF to save
    :return: None
    """
    gen_dir = os.path.join(os.path.dirname(__file__), 'gen')
    pdf_output = PdfFileWriter()
    streams = []  # sources must stay open until the output is written

    def _append_chapter(directory, title, pagenum, parent=None):
        # Append '<directory>/<title>.pdf', bookmark its first page at
        # `pagenum`, and return (page count, bookmark).
        stream = open(os.path.join(directory, title + '.pdf'), 'rb')
        streams.append(stream)
        pdf_input = PdfFileReader(stream)
        page_count = pdf_input.getNumPages()
        for i in range(page_count):
            pdf_output.addPage(pdf_input.getPage(i))
        bookmark = pdf_output.addBookmark(title, pagenum=pagenum, parent=parent)
        return page_count, bookmark

    try:
        pagenum = 0
        for pdf in infnList:
            # Merge the first-level chapter content first.
            first_level_title = pdf['title']
            dir_name = os.path.join(gen_dir, first_level_title)
            page_count, parent_bookmark = _append_chapter(
                dir_name, first_level_title, pagenum)
            pagenum += page_count

            # Then any child chapters, nested under the parent bookmark.
            for child in pdf['child_chapters'] or ():
                page_count, _ = _append_chapter(
                    dir_name, child['title'], pagenum, parent=parent_bookmark)
                pagenum += page_count

        with open(outfn, 'wb') as merged_stream:
            pdf_output.write(merged_stream)
    finally:
        for stream in streams:
            stream.close()

    # Delete all the per-chapter files.
    shutil.rmtree(gen_dir)
def get_pdf_dimensions(self, path):
    """Get pdf dimensions using PyPDF2.

    Returns a list with one ``{'page', 'width', 'height'}`` dict per page,
    where ``page`` is the 0-based index and the sizes come from the page's
    media box. Returns None if the file cannot be opened or parsed, or if
    it has no pages.
    """
    try:
        stream = open(path, "rb")
    except Exception:
        return None
    try:
        try:
            pdf = PdfFileReader(stream)
        except Exception:
            # Best effort: an unreadable PDF is reported as "no dimensions".
            return None

        page_total = pdf.getNumPages()
        if page_total <= 0:
            return None

        page_list = []
        for page_number in range(page_total):
            page = pdf.getPage(page_number)
            page_list.append({'page': page_number,
                              'width': page.mediaBox.getLowerRight_x(),
                              'height': page.mediaBox.getUpperLeft_y()})
        return page_list
    finally:
        stream.close()
def merge_pdf_stack(request):
    """Merge ``pdf1.pdf`` and ``pdf2.pdf`` plus a generated contents page.

    Builds a "Table of contents" document listing each file with a page
    range, appends both PDFs (bookmarked by file name) followed by the
    contents document, writes everything to ``output.pdf`` and renders
    ``test.html``.

    NOTE(review): the contents page is appended after the PDFs although
    numbering starts at 2, and each listed range ends at start + page count.
    Both are kept as they were; confirm they are intended.
    """
    pdfs = ["pdf1.pdf", "pdf2.pdf"]

    buffer = BytesIO()
    doc = MyDocTemplateMerge(buffer, pagesize=PAGE_SIZE, rightMargin=MARGIN_SIZE,
                             leftMargin=MARGIN_SIZE, topMargin=85, bottomMargin=18)

    streams = []  # each source is opened once and closed after the merge
    try:
        readers = []
        for fname in pdfs:
            stream = open(fname, 'rb')
            streams.append(stream)
            readers.append((fname, PdfFileReader(stream)))

        content = [Paragraph('Table of contents', ParagraphStyle('normal'))]
        no_page = 2
        for fname, reader in readers:
            number_of_page = reader.getNumPages()
            content.append(Paragraph('%s %s-%s' % (fname, no_page, no_page + number_of_page),
                                     ParagraphStyle('normal')))
            no_page = no_page + number_of_page
        doc.build(content)

        merger = PdfFileMerger()
        merger.setPageMode('/UseOC')
        for fname, reader in readers:
            merger.append(reader, bookmark=fname, import_bookmarks=False)
        merger.append(buffer)

        with open("output.pdf", "wb") as output:
            merger.write(output)
    finally:
        for stream in streams:
            stream.close()

    return render(request, "test.html")
def AddPrint(infile, outpath):
    """Copy PDFs matching ``infile`` and make them open the print dialog.

    ``infile`` is a glob pattern. ``outpath`` is the output file, or a
    directory, which is required when several files match. A falsy
    ``outpath`` defaults to an ``autoprint`` directory next to the input.
    Output is never written into the input directory itself; an
    ``autoprint`` subdirectory is used instead.
    """
    files = glob.glob(infile)
    multiple = len(files) > 1

    if not outpath:
        outpath = os.path.join(os.path.dirname(infile), "autoprint")
        print("Defaulting output to " + outpath)
    outisdir = os.path.isdir(outpath)
    outexists = os.path.exists(outpath)

    if multiple and not outisdir:
        outpath = os.path.dirname(outpath)
        outisdir = os.path.isdir(outpath)
        outexists = os.path.exists(outpath)

    # A bare pattern such as "*.pdf" has an empty dirname, which samefile()
    # cannot stat, so fall back to the current directory.
    if outisdir and os.path.samefile(os.path.dirname(infile) or os.curdir, outpath):
        outpath = os.path.join(outpath, "autoprint")
        outisdir = os.path.isdir(outpath)
        outexists = os.path.exists(outpath)

    if not outexists and multiple:
        os.makedirs(outpath)
        outisdir = os.path.isdir(outpath)
        outexists = os.path.exists(outpath)

    # We have multiple files check if the output is a directory.
    if multiple and not outisdir:
        print('Out path must be a directory if infile is multiple files')
        return

    for f in files:
        if outisdir:
            outfile = os.path.join(outpath, os.path.basename(f))
        else:
            outfile = outpath

        output = PdfFileWriter()
        # The source must stay open until the copy has been written.
        with open(f, "rb") as source:
            reader = PdfFileReader(source)
            page_total = reader.getNumPages()
            # print how many pages input has:
            print("Processing: '%s', %d pages" % (f, page_total))
            for x in range(page_total):
                output.addPage(reader.getPage(x))

            # add some Javascript to launch the print window on opening this PDF.
            output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

            # write output to disk
            with open(outfile, "wb") as outputStream:
                output.write(outputStream)
        print("Written: %s" % outfile)
def page_count(obj):
    """Print page counts for one PDF path or a list of PDF paths.

    For a ``str`` the page count of that file is printed. For a ``list`` a
    histogram is printed instead: one ``<pages>\\t<number of files>`` line per
    distinct page count. Any other type is ignored.
    """
    def _num_pages(path):
        # Open, count and close, so no handle outlives the call.
        with open(path, 'rb') as stream:
            return PdfFileReader(stream).getNumPages()

    if isinstance(obj, str):
        print(_num_pages(obj))
    elif isinstance(obj, list):
        page_map = {}
        for filename in obj:
            pages = _num_pages(filename)
            page_map[pages] = page_map.get(pages, 0) + 1
        print("page\tcount:")
        for pages in page_map:
            print(str(pages) + '\t' + str(page_map[pages]))
def main():
    """Merge the PDFs in ``./PDFs`` into two files and index their offsets.

    Files whose names start with "Single Round Match" go to
    ``./Merged/srmmerged.pdf`` and all others to ``./Merged/othermerged.pdf``.
    For each merged file a companion text file lists
    ``<first page offset>,, <file name>`` for every source PDF.
    """
    files = os.listdir("./PDFs")
    sort_intuitive(files)
    # NOTE(review): the last entry after sorting is dropped on purpose by the
    # original code; confirm which file this is meant to exclude.
    files = files[:-1]

    srms = [f for f in files if f.startswith("Single Round Match")]
    others = [f for f in files if not f.startswith("Single Round Match")]

    def _merge_group(names, merged_path, index_path):
        # Merge `names` into `merged_path`, recording the starting page
        # offset of each source in `index_path`.
        merger = PdfFileMerger()
        streams = []  # sources must stay open until the merge is written
        try:
            with open(index_path, "w") as index:
                curr = 0
                for name in names:
                    stream = open("./PDFs/" + name, "rb")
                    streams.append(stream)
                    inp = PdfFileReader(stream)
                    index.write(str(curr) + ",, " + name + "\n")
                    curr += inp.getNumPages()
                    merger.append(inp)
            merger.write(merged_path)
        finally:
            for stream in streams:
                stream.close()

    _merge_group(srms, "./Merged/srmmerged.pdf", "./Merged/srmpagenumbers.txt")
    _merge_group(others, "./Merged/othermerged.pdf", "./Merged/otherpagenumbers.txt")
def add_files(category, filenames_, input_abs_dir):
    """
    Append the pdf files of *category* (str) to the proceedings.

    *filenames_* lists file names relative to *input_abs_dir* (str). Each
    file is appended to the global ``proceedings_pdf``. Whenever the running
    page total becomes odd, the global blank page is appended as well so the
    total is even again.
    """
    global proceedings_pdf
    global cumulative_page_count
    global blank_page_pdf

    mprint('(For {})'.format(category))
    for name in filenames_:
        pdf_path = os.path.join(input_abs_dir, name)
        mprint('\t' + os.path.relpath(pdf_path, working_dir))

        # The stream is left open: the reader is consumed later, when the
        # proceedings are written.
        reader = PdfFileReader(open(pdf_path, 'rb'))
        page_total = reader.getNumPages()
        proceedings_pdf.appendPagesFromReader(reader)
        cumulative_page_count += page_total

        # Even total: no padding needed for this file.
        if not cumulative_page_count % 2:
            continue
        cumulative_page_count += 1
        proceedings_pdf.appendPagesFromReader(blank_page_pdf)
def pdf_to_csv_with_PyPDF():
    """
    Iterates through all the pdf stored in ./data/pdf/ folder and exports
    their content to the file data.csv.

    The csv file has two columns: ``document_id`` (the file name) and
    ``document_text`` (the text of all pages, each followed by a space).
    PDFs that cannot be read are reported and skipped.
    """
    bar = progressbar.ProgressBar()
    csv_data_file = _DATA_PATH + "data.csv"
    with open(csv_data_file, "w", newline='') as csvfile:
        data_writer = csv.writer(csvfile)
        data_writer.writerow(["document_id", "document_text"])
        for fn in bar(os.listdir(_PDF_PATH)):
            file_path = os.path.join(_PDF_PATH, fn)
            if not file_path.endswith(".pdf"):
                continue
            try:
                # Close each PDF as soon as its text has been extracted, so a
                # large folder does not exhaust file descriptors.
                with open(file_path, 'rb') as pdf_stream:
                    input_file = PdfFileReader(pdf_stream)
                    text = "".join(
                        input_file.getPage(p).extractText() + " "
                        for p in range(input_file.getNumPages())
                    )
            except utils.PdfReadError:
                print("Error al leer el PDF: {0}".format(fn))
            except Exception as e:
                print("Error desconocido en el PDF: {0}".format(fn))
                print("Error: {0}".format(e))
            else:
                # TODO: Check if text is not empty
                data_writer.writerow([fn, text])
def pdf_get_no_pages(self, input_file):
    """Return number of pages in a pdf using PyPDF2.

    Returns None if the file cannot be opened or parsed.
    """
    try:
        with open(input_file, "rb") as stream:
            return PdfFileReader(stream).getNumPages()
    except Exception:
        # Best effort: any failure is reported as "unknown".
        return None
def tearpage(filename, startpage=1):
    """
    Copy filename to a tempfile, write pages startpage..N to filename.

    :param filename: PDF filepath
    :param startpage: 0-based index of the page that becomes the new first
        page (the default of 1 drops the first page)
    """
    # Copy the pdf to a tmp file. It is kept until the output has been
    # written, because page content is read from it at write time.
    with tempfile.NamedTemporaryFile() as tmp:
        shutil.copy(filename, tmp.name)

        # Read the copied pdf
        source = open(tmp.name, 'rb')
        try:
            try:
                input_file = PdfFileReader(source)
            except PdfReadError:
                # Repair the copy and retry once.
                source.close()
                _fixPdf(filename, tmp.name)
                source = open(tmp.name, 'rb')
                input_file = PdfFileReader(source)

            # Seek for the number of pages
            num_pages = input_file.getNumPages()

            # Write pages from startpage onwards
            output_file = PdfFileWriter()
            for i in range(startpage, num_pages):
                output_file.addPage(input_file.getPage(i))

            with open(filename, "wb") as outputStream:
                output_file.write(outputStream)
        finally:
            source.close()
def _merge_pdf(documents):
    '''Merge PDF files into one.

    :param documents: list of path of pdf files
    :returns: path of the merged pdf
    '''
    writer = PdfFileWriter()
    streams = []  # We have to close the streams *after* PdfFilWriter's call to write()
    try:
        for document in documents:
            pdfreport = open(document, 'rb')
            streams.append(pdfreport)
            reader = PdfFileReader(pdfreport, overwriteWarnings=False)
            for page in range(0, reader.getNumPages()):
                writer.addPage(reader.getPage(page))

        # NOTE(review): the '.html' suffix is kept as-is although the file
        # holds a PDF; confirm no caller depends on it before renaming.
        merged_file_fd, merged_file_path = tempfile.mkstemp(suffix='.html', prefix='report.merged.tmp.')
        # Binary mode: the writer emits bytes, which a text-mode handle
        # rejects on Python 3 and may corrupt on Windows.
        with closing(os.fdopen(merged_file_fd, 'wb')) as merged_file:
            writer.write(merged_file)
    finally:
        for stream in streams:
            try:
                stream.close()
            except Exception:
                # Best effort: a failed close must not mask the real error.
                pass

    return merged_file_path
def mergePDFList(self, pdf_data_list, start_on_recto=False):
    """Merge multiple PDFs in a new PDF.

    Both input and output are raw PDF data as byte strings, so pdf_data_list
    must be a list of byte strings, and the output is the merged pdf as a
    byte string. Empty entries are skipped.

    If "start_on_recto" is set to true, a blank page is added after every
    PDF with an odd page count, so that each PDF starts on a recto page.
    This is useful if you have to print the merged pdf in recto/verso mode.
    """
    # io.BytesIO works on Python 2 and 3 and is the right buffer for binary
    # PDF data; the StringIO module no longer exists on Python 3.
    from io import BytesIO
    from PyPDF2 import PdfFileWriter, PdfFileReader

    output = PdfFileWriter()
    for pdf_data in pdf_data_list:
        if not pdf_data:
            continue
        pdf_reader = PdfFileReader(BytesIO(pdf_data))
        page_count = pdf_reader.getNumPages()
        for page in range(page_count):
            output.addPage(pdf_reader.getPage(page))
        if start_on_recto and page_count % 2:
            output.addBlankPage()

    outputStream = BytesIO()
    output.write(outputStream)
    return outputStream.getvalue()
def add_update_pdf_metadata(filename, update_dictionary):
    """Rewrite ``filename`` with its document info updated.

    Keys of ``update_dictionary`` are given without the leading ``/`` and
    override existing document-info entries of the same name. All pages are
    copied to a temporary PDF, which then replaces the original file.

    Raises AssertionError if any resulting metadata value is None.
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member
    import os

    full_update_dictionary = {'/' + k: v for k, v in update_dictionary.items()}

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()

        # A PDF without an info dictionary yields None here.
        info = pdf_input.documentInfo or {}

        full_update_dictionary = dict(chain(info.items(), full_update_dictionary.items()))

        for key, value in full_update_dictionary.items():
            assert value is not None
            info_dict.update({NameObject(key): createStringObject(value)})

        temp_fd, temp_file_name = tempfile.mkstemp(prefix="email2pdf_add_update_pdf_metadata", suffix=".pdf")

        # Wrap the descriptor returned by mkstemp so it is closed rather than
        # leaked by opening the path a second time.
        with os.fdopen(temp_fd, 'wb') as file_out:
            pdf_output.write(file_out)

    shutil.move(temp_file_name, filename)
def get_png_image_frompdf( input_pdf_file, newWidth = None, verify = True ):
    """Convert a single-page PDF to a PNG image through CloudConvert.

    ``newWidth``, when given, must be an int greater than 10; the image is
    resized to that width with the height scaled from the page's media box.
    ``verify`` is passed to ``requests.post``.

    Returns the image opened from the response body. Raises ValueError if the
    upload/convert request does not return HTTP 200, and AssertionError if
    the input is not an existing single-page ``.pdf`` file.
    """
    import io

    assert( os.path.basename( input_pdf_file ).endswith( '.pdf' ) )
    assert( os.path.isfile( input_pdf_file ) )

    # One handle serves both the page inspection and the upload, and is
    # closed when done.
    with open( input_pdf_file, 'rb' ) as pdf_stream:
        ipdf = PdfFileReader( pdf_stream )
        assert( ipdf.getNumPages() == 1 )
        mbox = ipdf.getPage( 0 ).mediaBox
        width = int( mbox.getWidth( ) )
        height = int( mbox.getHeight( ) )

        apiKey = get_cloudconvert_api_key( )
        params = { 'apikey' : apiKey,
                   'input' : 'upload',
                   'inputformat' : 'pdf',
                   'outputformat' : 'png',
        }
        if newWidth is not None:
            assert( isinstance( newWidth, int ) )
            assert( newWidth > 10 )
            newHeight = int( height * 1.0 * newWidth / width )
            params['converteroptions[resize]'] = '%dx%d' % ( newWidth, newHeight )

        # Rewind so the upload sends the whole file.
        pdf_stream.seek( 0 )
        response = requests.post( "https://api.cloudconvert.com/convert", params = params,
                                  files = { 'file' : pdf_stream }, verify = verify )

    if response.status_code != 200:
        raise ValueError("Error, could not upload and convert PDF file %s." % input_pdf_file )
    # The response body is binary image data, so wrap it in a bytes buffer.
    img = Image.open( io.BytesIO( response.content ) )
    return img
def toStringFormatParalell(path, rank, size, comm):
    """Extract normalised page text from a PDF using MPI worker ranks.

    Rank 0 splits the 0-based page range evenly among ranks ``1..size-1``,
    sends each its half-open ``[inicio, fin)`` slice with tag 1, then
    collects the per-page strings with tag 2 and returns them in rank order.
    Worker ranks convert each assigned page with ``pdftotext``, lower-case
    it, replace a few accented letters and drop everything outside ``a-z``.

    Returns the list of page strings on rank 0 and None on every other rank.
    With ``size == 1`` there are no workers and rank 0 returns ``[]``.
    """
    with open(path, "rb") as stream:
        numero_paginas = PdfFileReader(stream).getNumPages()
    print("******************************************", numero_paginas)

    if rank == 0:
        workers = size - 1
        if workers > 0:
            # Spread the remainder over the first ranks so no page is lost.
            base, extra = divmod(numero_paginas, workers)
        inicio = 0
        for i in range(1, size):
            fin = inicio + base + (1 if i <= extra else 0)
            data = {'inicio': inicio, 'fin': fin, 'path': path}
            comm.send(data, dest=i, tag=1)
            inicio = fin

        book = []
        for i in range(1, size):
            book += comm.recv(source=i, tag=2)
        return book

    data = comm.recv(source=0, tag=1)
    pdf_path = data['path']
    # Per-rank output file, so concurrent workers never share a file.
    stem = pdf_path[:-4] if pdf_path.lower().endswith(".pdf") else pdf_path
    txt = stem + str(rank) + ".txt"

    lista = list()
    for i in range(data['inicio'], data['fin']):
        pagina = str(i + 1)  # pdftotext page numbers are 1-based
        # Argument list instead of a shell string: safe for paths with spaces
        # or shell metacharacters. The output file is named explicitly so it
        # matches the file read below.
        subprocess.call(["pdftotext", "-f", pagina, "-l", pagina, pdf_path, txt])
        with open(txt) as handle:
            contenido_pagina = handle.read().lower()
        for acentuada, simple in (('á', 'a'), ('é', 'e'), ('í', 'i'),
                                  ('ó', 'o'), ('ú', 'u'), ('ñ', 'n')):
            contenido_pagina = contenido_pagina.replace(acentuada, simple)
        contenido_pagina = re.sub('[^a-z]', '', contenido_pagina)
        lista.append(contenido_pagina)
    comm.send(lista, dest=0, tag=2)
def test_cat(self):
    """Make sure files are properly concatenated."""
    check_call([STAPLER, 'cat', ONEPAGE_PDF, FIVEPAGE_PDF,
                self.outputfile])
    # assertTrue replaces the removed assert_ alias.
    self.assertTrue(os.path.isfile(self.outputfile))
    with open(self.outputfile, 'rb') as stream:
        pdf = PdfFileReader(stream)
        self.assertEqual(pdf.getNumPages(), 6)
def split(paperpdf, splitpdf):
    """Split every page of ``paperpdf`` into two pages and save to ``splitpdf``.

    Each source page is emitted twice: once with its media box cut off at
    half of the upper-right x coordinate, and once starting from that point.
    """
    writer = PdfFileWriter()
    # The same file is opened twice on purpose: each reader hands out its own
    # page objects, so the two halves can get different media boxes.
    with open(paperpdf, "rb") as left_stream, open(paperpdf, "rb") as right_stream:
        left = PdfFileReader(left_stream)
        right = PdfFileReader(right_stream)

        pagecount = left.getNumPages()
        print("%s has %s pages to split." % (paperpdf, pagecount))

        for num in range(pagecount):
            first_half = left.getPage(num)
            second_half = right.getPage(num)

            box = first_half.mediaBox
            midpoint = (box.getUpperRight_x() / 2, box.getUpperRight_y())

            first_half.mediaBox.upperRight = midpoint
            writer.addPage(first_half)

            second_half.mediaBox.upperLeft = midpoint
            writer.addPage(second_half)

        print("Writing %s pages to %s" % (writer.getNumPages(), splitpdf))
        with open(splitpdf, "wb") as s:
            writer.write(s)
def extract_text(link):
    """Return the concatenated text of all pages of the PDF behind ``link``.

    The PDF is looked up in default storage under ``pdfs/`` plus the part of
    ``link`` after its first 25 characters, and is fetched with ``add_file``
    if it is not stored yet. Returns an empty string if the file cannot be
    fetched or is not a readable PDF.
    """
    amazon_file_name = "pdfs/" + link[25:]
    if not default_storage.exists(amazon_file_name):
        try:
            add_file(link)
        except Exception:
            return ''

    pdf = default_storage.open(amazon_file_name, 'rb')
    try:
        try:
            pdf_file = PdfFileReader(pdf)
        except Exception:
            print("BAD FILE-- %s " % (link))
            # Without a reader there is nothing to extract; continuing would
            # fail with a NameError on the undefined reader.
            return ''

        return ''.join(
            pdf_file.getPage(count).extractText()
            for count in range(pdf_file.getNumPages())
        )
    finally:
        pdf.close()
def toStringFormat(path):
    """Return one normalised text string per page of the PDF at ``path``.

    Each page is converted with ``pdftotext``, lower-cased, has a few
    accented letters replaced by their plain form, and is stripped of every
    character outside ``a-z``. The intermediate text file is removed after
    each page.
    """
    import os

    # PyPDF2 is only used to learn how many pages the PDF has.
    with codecs.open(path, "rb") as stream:
        numero_paginas = PdfFileReader(stream).getNumPages()

    # Text file written next to the PDF: same name with a .txt extension.
    stem = path[:-4] if path.lower().endswith(".pdf") else path
    txt = stem + ".txt"

    lista = list()
    for i in range(numero_paginas):
        pagina = str(i + 1)  # pdftotext page numbers are 1-based
        # Convert page i to text. An argument list avoids shell
        # interpretation of the path.
        subprocess.call(["pdftotext", "-f", pagina, "-l", pagina, path, txt])

        # Read the page text and clean the string.
        # NOTE(review): the file is decoded as ISO-8859-1; confirm this
        # matches the encoding pdftotext is configured to emit.
        with codecs.open(txt, encoding='ISO-8859-1') as handle:
            contenido_pagina = handle.read().lower()
        for acentuada, simple in (('á', 'a'), ('é', 'e'), ('í', 'i'),
                                  ('ó', 'o'), ('ú', 'u'), ('ñ', 'n')):
            contenido_pagina = contenido_pagina.replace(acentuada, simple)
        contenido_pagina = re.sub('[^a-z]', '', contenido_pagina)
        lista.append(contenido_pagina)

        os.remove(txt)
    return lista
def extract_data_from_pdf(cwd=os.getcwd()):
    """Collect monthly cash-flow figures from FRANK OCBC e-statements in ``cwd``.

    Every ``FRANK*.pdf`` file is decrypted with the entered password into a
    temporary ``temp.pdf`` inside ``cwd`` and parsed from there.

    Returns a dict mapping year to a list of
    ``(month, inflow, outflow, netflow)`` tuples. The script exits on a wrong
    password or on any parsing problem. The decrypted temporary copy is
    removed in every case.
    """
    # when typing the password, it will not be shown in the terminal
    password = getpass('Enter password: ')
    # NOTE(review): this assignment was garbled in the source and has been
    # reconstructed from how temp_dir is used below; confirm the location.
    temp_dir = os.path.join(cwd, 'temp.pdf')

    # to store the data based on the year (key = year, value = [month, inflow, outflow, netflow, interest, avg_balance])
    data_dict = dict()

    try:
        for file_name in os.listdir(cwd):
            # to search for FRANK OCBC e-Statements in the same folder
            if not (file_name.startswith("FRANK") and file_name.endswith(".pdf")):
                continue
            file_dir = os.path.join(cwd, file_name)

            try:
                # since PyPDF2 cannot open the encrypted file, we use pikepdf
                # to open the file and create a copy in a temporary pdf file
                # that is decrypted for extraction
                temp_file = Pdf.open(file_dir, password=password)
                temp_file.save(temp_dir)
                pdf_obj = PdfFileReader(temp_dir)
            except PasswordError:
                print(
                    'Wrong password! Please rerun the script again to create the summary.'
                )
                exit()
            except Exception:
                print(
                    'Whoops. Something went wrong, please rerun the script again. In case the error still happens, \
                    please report this issue in https://github.com/nicklimmm/banking-statement-summarizer/issues!'
                )
                exit()

            try:
                date_created = pdf_obj.getDocumentInfo()['/CreationDate']
                year = int(date_created[2:6])

                # the statement is created 1 month later, so we decrement by 1 to get the actual data
                month = int(date_created[6:8]) - 1

                # error handling when the e-statement is received on January (which is e-statement for December)
                if month == 0:
                    year -= 1
                    month = 12

                # to handle different number of pages in each file, and the summary lies in the back pages of the file
                num_pages = pdf_obj.getNumPages()
                if num_pages == 3:
                    page_obj = pdf_obj.getPage(num_pages - 3)
                else:
                    page_obj = pdf_obj.getPage(num_pages - 2)

                # using regex to find the necessary details and extract those to variables
                text = page_obj.extractText().encode('ascii').decode('ascii')
                pattern = r'[0-9,\.]+\s+[0-9,\.]+\s+[0-9,\.]+\s+[0-9,\.]+[0-9]'
                result = re.findall(pattern, text)
                inflow, outflow, _, _ = list(
                    map(float, result[-1].replace(',', '').split()))
                netflow = round(float(inflow) - float(outflow), 2)

                data_dict.setdefault(year, []).append(
                    (month, inflow, outflow, netflow))
            except Exception:
                print(
                    'Something went wrong... Please rerun the script or report the issue in GitHub.'
                )
                exit()
    finally:
        # to prevent unauthorized access the decrypted pdf. This also runs
        # when exiting early; the file only exists once a statement has been
        # decrypted, so check first.
        if os.path.exists(temp_dir):
            os.remove(temp_dir)

    return data_dict
from PyPDF2 import PdfFileWriter, PdfFileReader

if __name__ == '__main__':
    # Keep only the last page of every page-label group of 'in.pdf' (the
    # final overlay of each frame), crop it and write the result to 'out.pdf'.
    reader = PdfFileReader('in.pdf')
    last_index = reader.getNumPages() - 1

    # /Nums alternates (starting page index, label dict); the even slots are
    # the indices at which each group starts.
    label_numbers = reader.trailer["/Root"]["/PageLabels"]["/Nums"]
    group_starts = label_numbers[0::2]

    # The page just before the next group's start is the last page of a
    # group; the document's final page closes the last group.
    keep = [start - 1 for start in group_starts[1:]]
    keep.append(last_index)

    writer = PdfFileWriter()
    for index in keep:
        page = reader.getPage(index)
        page.cropBox.lowerLeft = (0, 10)
        writer.addPage(page)

    with open('out.pdf', 'wb') as f:
        writer.write(f)
import progressbar from PyPDF2 import PdfFileReader, PdfFileWriter input = json.load(open(sys.argv[1])) origfile = sys.argv[2] outfile = sys.argv[3] pages_sub_segments = input['segments'] #infiles = input['files'] shape = input['shape'] j = 0 parts = [] print 'reading file ...' pdf = PdfFileReader(open(origfile, 'rb')) pages = [pdf.getPage(i) for i in range(0, pdf.getNumPages())] colwidths = [] for segments in pages_sub_segments: colwidths += [ seg_info['segment']['right'] - seg_info['segment']['left'] for seg_info in segments if seg_info['iscolumn'] ] if len(colwidths) == 0: colwidth = shape[0] / 2 else: colwidth = max(colwidths) def crop(page, section): #print 'page dimensions', page.mediaBox, page.mediaBox.upperLeft, page.mediaBox.upperRight
def pages(self):
    '''int: Number of pages contained in PDF file'''
    # The file is opened only for the duration of the count.
    in_pdf = open(self.__filepath, 'rb')
    try:
        return PdfFileReader(in_pdf).getNumPages()
    finally:
        in_pdf.close()
def techs_selected(request, model=None, id=None):
    """List the technologies chosen in this session and, on a valid POST, build the PDF.

    For every ``TechChoice`` of the session the technology is paired with the
    objects explaining why it is only "maybe" or "not" applicable (an empty
    list when there is nothing to report).

    On a valid POST a temporary PDF is produced by
    ``create_PDF_selected_techs`` and its pages are copied into the final,
    time-stamped PDF under ``settings.STATIC_ROOT + 'pdf_tmp/'``.

    Args:
        request: The current HTTP request.
        model: Unused in this function body.
        id: Unused in this function body.

    Returns:
        dict: Template context. ``pdf_file`` is the URL path of the generated
        PDF after a valid POST and ``''`` otherwise.
    """
    groups = TechGroup.objects.all()
    choices = TechChoice.objects.filter(session=get_session(request)).order_by('order')

    all_techs = []
    relevance = []
    empty = []

    for tc in choices:
        tech = Technology.objects.get(pk=tc.technology.id)
        all_techs.append(tech)

        applicable = tech.applicable(get_session(request))
        relevance_added = False

        if applicable == tech.TECH_USE_MAYBE:
            relevancy_objects = list(tech.maybe_relevant(get_session(request)))
            if relevancy_objects:
                relevance.append(relevancy_objects)
                relevance_added = True

        if applicable == tech.TECH_USE_NO:
            relevancy_objects = list(tech.not_relevant(get_session(request)))
            if relevancy_objects:
                relevance.append(relevancy_objects)
                relevance_added = True

        # Keep ``relevance`` aligned one-to-one with ``all_techs`` for the zip.
        if not relevance_added:
            relevance.append(empty)

    all_chosen_techs = list(zip(all_techs, relevance))

    if request.method == 'POST':  # If the form has been submitted...
        form = PDF_prefs(request.POST)  # A form bound to the POST data
        if form.is_valid():  # All validation rules pass
            # NOTE: the form's cleaned data is not consulted; appending
            # Akvopedia articles selected through the form is not implemented.

            # Create the list of factors and criteria. ``change_list`` marks
            # the rows at which a new factor starts.
            answers = get_or_create_answers(get_session(request))
            criterion_list = []
            applicable_list = []
            change_list = []
            factor_list = []
            old_factor = ''
            for answer in answers:
                criterion_list.append(answer.criterion)
                applicable_list.append(answer.applicable)
                new_factor = answer.criterion.factor
                factor_list.append(new_factor)
                change_list.append(new_factor != old_factor)
                old_factor = new_factor

            zipped_answerlist = list(
                zip(factor_list, change_list, criterion_list, applicable_list))

            # Both file names share one timestamp.
            today = datetime.datetime.today()
            format_temp = "watercompass-%a-%b-%d-%Y_%H-%M-%S.temp.pdf"
            format_final = "watercompass-%a-%b-%d-%Y_%H-%M-%S.pdf"
            s_name_temp = today.strftime(format_temp)
            s_name_final = today.strftime(format_final)

            # First create the basic PDF (written under the temporary name).
            create_PDF_selected_techs(all_chosen_techs, zipped_answerlist,
                                      True, True, s_name_temp)

            output_dir = settings.STATIC_ROOT + 'pdf_tmp/'
            output = PdfFileWriter()

            # The source must stay open until ``output.write`` has run, so
            # both handles live in one ``with`` and are closed afterwards.
            with open(output_dir + s_name_temp, "rb") as temp_stream, \
                    open(output_dir + s_name_final, "wb") as output_stream:
                temp_reader = PdfFileReader(temp_stream)
                for i in range(temp_reader.getNumPages()):
                    output.addPage(temp_reader.getPage(i))
                output.write(output_stream)

            return {
                'techgroups': groups,
                'all_chosen_techs': all_chosen_techs,
                'session': request.session,
                'form': form,
                'pdf_file': '/technologies/pdf/' + s_name_final,
                'chosen_techs': choices,
            }
    else:
        form = PDF_prefs()  # An unbound form

    # Reached for GET requests and for POSTs whose form did not validate.
    return {
        'techgroups': groups,
        'session': request.session,
        'form': form,
        'pdf_file': '',
        'chosen_techs': choices,
    }
os.mkdir(outputDir) # else: # if os.listdir(outputDir): # shutil.rmtree(outputDir) # os.mkdir(outputDir) # 得到目录起始页和终止页 prompt_start, prompt_end = '目录起始页:', '目录终止页:' prelog_startIdx, prelog_endIdx = int(input(prompt_start)), int( input(prompt_end)) # 取出这几页目录, 生成新文件 rd = PdfFileReader(open(pdf_path, 'rb')) # pdfilereader -> rd page_cnt = rd.getNumPages() wt = PdfFileWriter() for prelogue_pageIdx in range(prelog_startIdx, prelog_endIdx + 1): # prelog_idx 是和pdf上面显示的页码对应的 page_obj = rd.getPage(prelogue_pageIdx - 1) # getpage时候, 从零算起, 故真实页码-1 wt.addPage(page_obj) prelog_path = outputDir + '\\' + '目录集合.pdf' output_fd = open(prelog_path, 'wb') # fd 指的是一个句柄, 传给wt去写入 wt.write(output_fd) output_fd.close() ## 这一行值1亿美元!! print('生成所有目录!') print('目录地址:{}; 目标地址:{}'.format(prelog_path, outputDir)) # save pics
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import copy

# Split every page of the input PDF into a left half and a right half and
# write the halves, in reading order, to a new PDF.
path = r"practice_files\half and half.pdf"
input_pdf = PdfFileReader(path)
output_pdf = PdfFileWriter()

total = input_pdf.getNumPages()
for page_num in range(total):
    # Take the copy before the media box is touched, so that page_left and
    # page_right each start out as a copy of the same page.
    page_left = input_pdf.getPage(page_num)
    page_right = copy.copy(page_left)

    # Find the middle of the page.
    upper_right = page_left.mediaBox.upperRight
    middle_x = upper_right[0] / 2
    new_coords = (middle_x, upper_right[1])

    # The left half ends at the middle: move its upper-right corner there.
    page_left.mediaBox.upperRight = new_coords
    output_pdf.addPage(page_left)

    # The right half begins at the middle: move its upper-left corner there.
    page_right.mediaBox.upperLeft = new_coords
    output_pdf.addPage(page_right)

output_path = "The Little Mermaid.pdf"
with open(output_path, "wb") as output_file:
    output_pdf.write(output_file)
def book_pdf(from_filename, output_filename):
    """Turn a scanned booklet PDF into a PDF with one book page per PDF page.

    The input pages are rotated (even indices counter-clockwise, odd indices
    clockwise), each rotated page is cut into two halves by shrinking its
    media box, and the halves are emitted in the order given by ``reorder``.

    Args:
        from_filename: Path of the input PDF (converted with ``str``).
        output_filename: Path of the PDF to write (wrapped in ``Path``).

    Raises:
        Exception: If the page-order list runs out while halves are assigned.
        AssertionError: If ``reorder`` produces duplicates or a wrong length.
    """
    from PyPDF2 import PdfFileReader, PdfFileWriter
    from io import BytesIO
    pdf_reader = PdfFileReader(str(from_filename))
    pdf_writer = PdfFileWriter()
    # In-memory destination for the rotated intermediate PDF.
    buffer = BytesIO()
    #######################################
    # Rotation
    #######################################
    n = pdf_reader.getNumPages()
    print(f'File: {from_filename} pre rotate constains {n} pages')
    # NOTE: the loop reuses ``n`` as its variable. After the loop ``n`` holds
    # the index of the last page (page count - 1), and the code below relies
    # on exactly that value.
    for n in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(n)
        if n % 2 == 0:
            page.rotateCounterClockwise(90)
        else:
            page.rotateClockwise(90)
        pdf_writer.addPage(page)
    pdf_writer.write(buffer)
    #######################################
    # Cropping
    #######################################
    import numpy as np
    import math
    from collections import Counter, OrderedDict

    def reorder(n):
        '''Return the book page number for every half page, in scan order.

        ``n`` is the zero-based index of the last scanned page, so the
        result has ``(n + 1) * 2`` entries.

        Example: for 4 double sided pages (n = 7) you actually get 16 pages,
        ordered as:
            8, 9, 10, 7
            6, 11, 12, 5
            4, 13, 14, 3
            2, 15, 16, 1
        '''
        n_all = (n + 1) * 2
        # total_range = int(math.ceil(n_all//4.0)*2)
        total_range = int(math.ceil(n_all / 2.0))
        out = []
        # Each step contributes one group of four page numbers.
        for a in range(1, total_range, 2):
            tmp = []
            tmp.append(a + 1)
            tmp.append(n_all - a)
            tmp.append(n_all - a + 1)
            tmp.append(a)
            out.append(tmp)
        # Reverse the order of the groups, then flatten them into one array.
        arr = np.concatenate(np.array(out)[::-1])
        duplicates = [
            item for item, count in Counter(arr).items() if count > 1
        ]
        assert len(
            duplicates
        ) == 0, f"Page ordering - Duplicate error: {duplicates}\n{arr}"
        assert float(n + 1) == len(
            arr
        ) / 2.0, f"Page ordering - length error: golden={float(n+1)}, revised={len(arr)/2.0}"
        return list(arr)

    # Crop
    # Two readers over the same rotated buffer give two separate page objects
    # per scanned page, one for each half.
    pdf_reader0 = PdfFileReader(buffer)
    pdf_reader1 = PdfFileReader(buffer)
    pdf_writer = PdfFileWriter()
    # ``n`` is still the last page index left behind by the rotation loop.
    pages = reorder(n)
    print(
        f'File: {from_filename} after booking constains {n+1}*2={n*2+2} pages')
    print(f'Page ordering: {pages}')
    # Fraction of the box extent added or subtracted around the middle when
    # the two halves are cut.
    threshold = 0.02
    # Maps book page number -> cropped half page.
    out = {}
    for n in range(pdf_reader.getNumPages()):
        page1 = pdf_reader0.getPage(n)
        page2 = pdf_reader1.getPage(n)
        # get dimensions
        # The first two: x,y coordinates of the lower-left corner
        # The second two: x,y coordinates of the upper-right corner
        (ll_height, ll_width, ur_height, ur_width) = page1.mediaBox
        # print(page1["/Rotate"], page2["/Rotate"])
        # Right (first) page
        # When rotation comes into play, x,y dimensions are still from original image
        if page1["/Rotate"] == 90:
            page1.mediaBox.lowerLeft = (ll_height,
                                        int(ur_width / 2.0 -
                                            threshold * ur_width))
        else:  # == -90
            page1.mediaBox.upperRight = (ur_height,
                                         int(ur_width / 2.0 +
                                             threshold * ur_width))
        # The second half page gets the opposite part of the media box.
        if page2["/Rotate"] == 90:
            page2.mediaBox.upperRight = (ur_height,
                                         int(ur_width / 2.0 +
                                             threshold * ur_width))
        else:
            page2.mediaBox.lowerLeft = (ll_height,
                                        int(ur_width / 2.0 -
                                            threshold * ur_width))
        # Force page ordering as book enforces
        try:
            n1 = pages.pop(0)
            n2 = pages.pop(0)
            # print(f'id: {n} right:{n1}, left:{n2}')
            out[n1] = page1
            out[n2] = page2
        except:
            raise Exception(
                f'Page ordering - During crop reordered pages does not match')
    print("Done cropping")
    # Reorder pages
    # Sorting by book page number yields the final reading order.
    od = OrderedDict(sorted(out.items()))
    for k, v in od.items():
        # print(k)
        pdf_writer.addPage(v)
    print("Done reordering")
    # Writing output
    # ``Path`` is not imported inside this function; it must already be
    # available in the enclosing module.
    with Path(output_filename).open(mode="wb") as output_file:
        pdf_writer.write(output_file)
# 17.2 - Challenge: Use GUI Elements to Help a User Modify Files
# Solution to challenge

# save part of a PDF based on a user-supplied page range using a GUI

import easygui as gui
from PyPDF2 import PdfFileReader, PdfFileWriter

# let the user choose an input file
input_file_path = gui.fileopenbox("", "Select a PDF to trim...", "*.pdf")
if input_file_path is None:  # exit on "Cancel"
    exit()

# get the page length of the input file
input_file = PdfFileReader(input_file_path)
total_pages = input_file.getNumPages()

# let the user choose a beginning page
page_start = gui.enterbox("Enter the number of the first page to use:",
                          "Where to begin?")
if page_start is None:  # exit on "Cancel"
    exit()

# check for possible problems and try again:
#    1) input page number isn't a (non-negative) digit
# or 2) input page number is 0
# or 3) page number is greater than total number of pages
while (not page_start.isdigit() or page_start == "0"
       or int(page_start) > total_pages):
    gui.msgbox("Please provide a valid page number.", "Whoops!")
    page_start = gui.enterbox("Enter the number of the first page to use:",
                              "Where to begin?")
    # "Cancel" on the re-prompt also returns None; exit here, otherwise the
    # loop condition would call .isdigit() on None and crash.
    if page_start is None:  # exit on "Cancel"
        exit()
def update_file_info(self, file):
    """Fill the extra column attributes of *file* from its embedded metadata.

    Every attribute is first reset to ``''``. For local files (URI scheme
    ``file``) the attributes are then filled in depending on the MIME type:
    ID3 tags and stream info for MP3, EXIF/XMP tags and pixel size for images,
    MediaInfo values for the listed video/audio types, and document info,
    page count and first-page size for PDF. ``self.get_columns()`` is called
    at the end.

    Args:
        file: Object exposing ``add_string_attribute``, ``get_uri_scheme``,
            ``get_uri``, ``is_mime_type`` and ``get_mime_type``.

    Returns:
        None.
    """

    def put(name, value=''):
        """Set one attribute (blank by default)."""
        file.add_string_attribute(name, value)

    def guarded(name, getter, error_label=False):
        """Set *name* from ``getter()``; on any failure store '' or the
        localized 'Error' label instead."""
        try:
            file.add_string_attribute(name, getter())
        except Exception:
            file.add_string_attribute(name,
                                      _('Error') if error_label else '')

    # set defaults to blank
    # NOTE(review): the default is registered as 'get_orientation' while the
    # image branch below fills 'orientation' -- confirm which name the
    # columns expect.
    for name in ('title', 'description', 'album', 'creator', 'tracknumber',
                 'genre', 'date', 'bitrate', 'samplerate', 'length',
                 'datetime_original', 'exposure_time', 'fnumber',
                 'focal_length', 'gps_altitude', 'gps_latitude',
                 'gps_longitude', 'iso_speed', 'get_orientation', 'model',
                 'resolution_unit', 'xresolution', 'yresolution',
                 'shutter_speed_value', 'aperture_value', 'brightness_value',
                 'exposure_bias_value', 'max_aperture_value', 'metering_mode',
                 'light_source', 'flash', 'exposure_mode', 'gain_control',
                 'width', 'height', 'pages'):
        put(name)

    if file.get_uri_scheme() != 'file':
        return

    # strip file:// to get absolute path
    filename = urllib.parse.unquote_plus(file.get_uri()[7:])

    # mp3 handling
    if file.is_mime_type('audio/mpeg'):
        # attempt to read ID3 tag
        try:
            audio = EasyID3(filename)
            # sometimes the audio variable will not have one of these items
            # defined, that's why every tag is read under its own guard
            guarded('title',
                    lambda: audio['title'][0]
                    if 'title' in audio.keys() else '',
                    error_label=True)
            for name, tag in (('album', 'album'), ('creator', 'artist'),
                              ('tracknumber', 'tracknumber'),
                              ('genre', 'genre'), ('date', 'date')):
                guarded(name, lambda tag=tag: audio[tag][0],
                        error_label=True)
        except Exception:
            # [SabreWolfy] some files have no ID3 tag and will throw this
            # exception:
            for name in ('title', 'description', 'album', 'creator',
                         'tracknumber', 'genre', 'date'):
                put(name)

        # try to read MP3 information (bitrate, length, samplerate)
        try:
            # Binary mode: the stream holds MPEG audio data, not text. The
            # ``with`` closes the handle on success and on failure.
            with open(filename, 'rb') as mpfile:
                mpinfo = MPEGInfo(mpfile)
                bitrate = str(mpinfo.bitrate / 1000) + ' Kbps'
                samplerate = str(mpinfo.sample_rate) + ' Hz'
                # [SabreWolfy] consistent hh:mm:ss formatting so the column
                # sorts correctly by length
                mp3length = '%02i:%02i:%02i' % ((int(mpinfo.length / 3600)),
                                                (int(mpinfo.length / 60 % 60)),
                                                (int(mpinfo.length % 60)))
            put('bitrate', bitrate)
            put('samplerate', samplerate)
            put('length', mp3length)
        except Exception:
            put('bitrate', _('Error'))
            put('length', _('Error'))
            put('samplerate', _('Error'))

    # image handling
    # Compare the top-level MIME type for equality; ``in ('image')`` would be
    # a substring test against the string 'image'.
    if file.get_mime_type().split('/')[0] == 'image':
        # EXIF handling routines
        try:
            metadata = GExiv2.Metadata(filename)

            def tag(key):
                """Build a getter for one metadata tag string."""
                return lambda: metadata.get_tag_string(key)

            guarded('datetime_original', tag('Exif.Image.DateTime'))
            guarded('creator', tag('Xmp.dc.creator'))
            guarded('description', tag('Exif.Image.ImageDescription'))
            # The first 17 characters of the XMP title are dropped --
            # presumably a language-qualifier prefix; confirm against real
            # files.
            guarded('title',
                    lambda: str(metadata.get_tag_string('Xmp.dc.title'))[17:])
            guarded('exposure_time', lambda: metadata.get_exposure_time())
            guarded('fnumber', lambda: metadata.get_fnumber())
            guarded('focal_length', lambda: metadata.get_focal_length())
            guarded('gps_altitude', lambda: metadata.get_gps_altitude())
            guarded('gps_latitude', lambda: metadata.get_gps_latitude())
            guarded('gps_longitude', lambda: metadata.get_gps_longitude())
            guarded('iso_speed', lambda: metadata.get_iso_speed())
            # Not individually guarded: a failure here lands in the outer
            # handler, which blanks every EXIF attribute.
            put('orientation', get_orientation(metadata))
            guarded('model', tag('Exif.Image.Model'))
            put('resolution_unit', get_resolution_unit(metadata))
            guarded('xresolution', tag('Exif.Image.XResolution'))
            guarded('yresolution', tag('Exif.Image.YResolution'))
            guarded('shutter_speed_value',
                    tag('Exif.Photo.ShutterSpeedValue'))
            guarded('aperture_value', tag('Exif.Photo.ApertureValue'))
            guarded('brightness_value', tag('Exif.Photo.BrightnessValue'))
            guarded('exposure_bias_value',
                    tag('Exif.Photo.ExposureBiasValue'))
            guarded('max_aperture_value', tag('Exif.Photo.MaxApertureValue'))
            put('metering_mode', get_metering_mode(metadata))
            put('light_source', get_light_source(metadata))
            put('flash', get_flash(metadata))
            put('exposure_mode', get_exposure_mode(metadata))
            put('gain_control', get_gain_control(metadata))
        except Exception:
            for name in ('datetime_original', 'creator', 'title',
                         'description', 'exposure_time', 'fnumber',
                         'focal_length', 'gps_altitude', 'gps_latitude',
                         'gps_longitude', 'iso_speed', 'get_orientation',
                         'model', 'resolution_unit', 'xresolution',
                         'yresolution', 'shutter_speed_value',
                         'aperture_value', 'brightness_value',
                         'exposure_bias_value', 'max_aperture_value',
                         'metering_mode', 'light_source', 'flash',
                         'exposure_mode', 'gain_control'):
                put(name)

        # pixel dimensions
        try:
            im = Image.open(filename)
            guarded('width', lambda: str(im.size[0]), error_label=True)
            guarded('height', lambda: str(im.size[1]), error_label=True)
        except Exception:
            put('width')
            put('height')

    # video/flac handling
    if file.is_mime_type('video/x-msvideo') or\
            file.is_mime_type('video/mpeg') or\
            file.is_mime_type('video/x-ms-wmv') or\
            file.is_mime_type('video/mp4') or\
            file.is_mime_type('audio/x-flac') or\
            file.is_mime_type('video/x-flv') or\
            file.is_mime_type('video/x-matroska') or\
            file.is_mime_type('audio/x-wav'):
        metadata = MediaInfo(filename)
        put('format', metadata.get_format())
        put('duration', metadata.get_duration_string())
        put('overall_bitrate', metadata.get_overallbitrate())
        put('frame_count', metadata.get_framecount())
        put('video_format', metadata.get_videoformat())
        put('width', metadata.get_width())
        put('height', metadata.get_height())
        put('bit_depth', metadata.get_bitdepth())
        put('audio_format', metadata.get_audioformat())
        put('title', metadata.get_title())

    # pdf handling
    if file.is_mime_type('application/pdf'):
        try:
            # ``with`` closes the handle even when parsing fails.
            with open(filename, 'rb') as f:
                pdf = PdfFileReader(f)
                info = pdf.getDocumentInfo()

                def extent(axis):
                    """First-page media box extent along *axis*, scaled by
                    sqrt(2)/4 (about 0.354; presumably points to mm)."""
                    box = pdf.getPage(0).mediaBox
                    span = abs(box.upperRight[axis] - box.lowerLeft[axis])
                    return str(int(float(span) * math.sqrt(2.0) / 4.0))

                guarded('title',
                        lambda: info.title if info.title is not None else '',
                        error_label=True)
                guarded('description',
                        lambda: info.subject
                        if info.subject is not None else '',
                        error_label=True)
                guarded('creator',
                        lambda: info.author
                        if info.author is not None else '',
                        error_label=True)
                guarded('pages', lambda: str(pdf.getNumPages()),
                        error_label=True)
                if pdf.getNumPages() > 0:
                    guarded('width', lambda: extent(0))
                    guarded('height', lambda: extent(1))
                else:
                    put('width')
                    put('height')
        except Exception:
            for name in ('title', 'description', 'creator', 'pages', 'width',
                         'height'):
                put(name, _('Error'))

    self.get_columns()
#Allen Higgins C00197373
#Zoltan Fuzesi C00197361
#Robert Scully C00196960

# Convert a user-named PDF to CSV with tabula.
import os

from tabula import convert_into
from PyPDF2 import PdfFileReader

filename = raw_input('Enter PDF file Name to convert to CSV:--> ')

try:
    # The second positional argument of PdfFileReader is ``strict``, not a
    # file mode, so no mode string is passed (the default is equivalent to
    # the truthy value that used to be passed).
    reader = PdfFileReader(filename)
    totalPages = reader.getNumPages()
    # Swap the extension instead of blindly cutting the last four characters.
    outputFileName = os.path.splitext(filename)[0] + '.csv'
    # NOTE(review): the page range starts at 2 and excludes ``totalPages``
    # itself -- confirm this is the intended selection.
    convert_into(filename,
                 outputFileName,
                 output_format="csv",
                 pages=range(2, totalPages))
except IOError:
    print(
        'File not found. Please check name of file or if the file has been created'
    )
except Exception as e:
    # Anything else is a parsing/conversion failure, not a missing file.
    print('Could not convert %s: %s' % (filename, e))
def write_files_to_output(directory, output_path):
    """Merge all PDFs found two folder levels below *directory* into one PDF.

    Layout walked: ``directory/<version>/<folder>/<name>.pdf``. Each page is
    stamped with ``"<version>_<student number>"``. The student number is the
    first ``s``/``S`` + 7 digits match in the file name, otherwise in the text
    of the first page, otherwise ``'0'``. A blank A4 page is appended after
    every PDF with an odd page count, so each PDF contributes an even number
    of pages.

    Args:
        directory: Root folder to walk.
        output_path: Path of the merged PDF to write.
    """
    output = PdfFileWriter()
    # Compiled once; raw string so ``\d`` is not treated as a string escape.
    pattern = re.compile(r'[sS]\d{7}')
    # Source handles must stay open until ``output.write`` has run; they are
    # collected here and closed in the ``finally`` below.
    opened = []
    try:
        for version in os.listdir(directory):
            # build the path
            path = os.path.join(directory, version)
            # only folders are of interest at this level
            if not os.path.isdir(path):
                continue
            print(version)
            # iterate over the folders in this folder
            for inner_file in os.listdir(path):
                path_1 = os.path.join(path, inner_file)
                if not os.path.isdir(path_1):
                    continue
                for pdf_file in os.listdir(path_1):
                    # pick out the PDFs
                    if not pdf_file.endswith(".pdf"):
                        continue
                    path_inner = os.path.join(path_1, pdf_file)
                    file_to_pdf = open(path_inner, 'rb')
                    opened.append(file_to_pdf)
                    existing_pdf = PdfFileReader(file_to_pdf)
                    nPages = existing_pdf.getNumPages()

                    # search for a student number in the file name ...
                    result = pattern.search(pdf_file)
                    if result is None and nPages > 0:
                        # ... and fall back to the text of the first page
                        page_content = existing_pdf.getPage(0).extractText()
                        result = pattern.search(page_content)
                    number = result.group(0) if result is not None else '0'
                    version_student = str(version) + '_' + str(number)

                    # create a one-page watermark PDF with Reportlab
                    packet = BytesIO()
                    can = canvas.Canvas(packet, pagesize=A4)
                    can.setFillColor(Color(0, 50, 100, alpha=0.5))
                    can.setFont("Helvetica", 30)
                    x, y = can._pagesize
                    can.drawCentredString(x / 2, 10, str(version_student))
                    can.save()
                    # move to the beginning of the buffer before reading it
                    packet.seek(0)
                    new_pdf = PdfFileReader(packet)

                    for i in range(nPages):
                        page = existing_pdf.getPage(i)
                        # this is where the watermark is added
                        page.mergePage(new_pdf.getPage(0))
                        output.addPage(page)
                    if nPages % 2 != 0:
                        # Pad with a blank A4 page. ``addBlankPage`` takes
                        # its size in the same units as the canvas page size,
                        # so ``A4`` is passed instead of 210 x 297 (those are
                        # the A4 dimensions in millimetres).
                        output.addBlankPage(*A4)
                    print(path_inner)

        with open(output_path, "wb") as file_output:
            output.write(file_output)
    finally:
        for handle in opened:
            handle.close()
class PDFProcessor(object):
    """Read metadata, page images and text from one PDF file.

    Args:
        filename: Path of the PDF file.
        language: Key into ``TESSERACT_LANGUAGE`` used for OCR.
        config: Optional settings dict; ``TESSERACT_DATA_PATH`` is read from
            it by ``ocr_page``.
    """

    def __init__(self, filename, language=None, config=None):
        self.filename = filename
        self.pdf_reader = PdfFileReader(filename)
        self.num_pages = self.pdf_reader.getNumPages()
        self.language = language
        self.config = config or {}

    def get_meta(self):
        """Return ``{'title': ...}`` from the document info.

        The title is ``None`` when the PDF carries no document information
        (``getDocumentInfo()`` returns ``None`` in that case).
        """
        doc_info = self.pdf_reader.getDocumentInfo()
        if doc_info is None:
            return {'title': None}
        return {'title': doc_info.title}

    def get_images(self, pages=None, resolution=300):
        """Yield one rendered image per page number in *pages* (default: all).

        Each image is only valid until the generator is advanced, because it
        is yielded from inside its own context manager.
        """
        if pages is None:
            pages = range(self.num_pages)
        for page_no in pages:
            with self.get_image(page_no, resolution=resolution) as img:
                yield img

    @contextlib.contextmanager
    def get_image(self, page_no, resolution=300):
        """Context manager yielding page *page_no* rendered on white."""
        # "<file>[<n>]" selects a single page of the file for rendering.
        filename = "{}[{}]".format(self.filename, page_no)
        with Image(filename=filename,
                   resolution=resolution,
                   background=wand.color.Color('#fff')) as img:
            img.alpha_channel = False
            yield img

    def get_text(self, pages=None):
        """Yield the stripped text of each page in *pages* (default: all).

        ``pdflib`` is used when available, otherwise PyPDF2's text
        extraction; pages that yield no text are passed to ``ocr_page``.
        """
        if pages is None:
            pages = range(self.num_pages)
        pdflib_pages = None
        if pdflib is not None:
            pdflib_doc = pdflib.Document(self.filename)
            pdflib_pages = list(pdflib_doc)
        for page_no in pages:
            if pdflib_pages is not None:
                page = pdflib_pages[page_no]
                text = ' '.join(page.lines).strip()
            else:
                page = self.pdf_reader.getPage(page_no)
                text = page.extractText()
            if not text.strip():
                text = self.ocr_page(page_no)
            yield text.strip()

    def ocr_page(self, page_no):
        """Return the OCR text of page *page_no*, or ``''`` without tesserocr."""
        if tesserocr is None:
            return ''
        with self.get_image(page_no, resolution=300) as img:
            pil_image = PILImage.frombytes('RGB', img.size,
                                           img.make_blob('RGB'))
            return tesserocr.image_to_text(
                pil_image,
                lang=TESSERACT_LANGUAGE[self.language],
                path=self.config.get('TESSERACT_DATA_PATH', ''))

    def run_ocr(self, timeout=180):
        """Run the project's ``run_ocr`` helper on the file and return its bytes."""
        from froide.helper.document import run_ocr
        output_bytes = run_ocr(self.filename,
                               language=TESSERACT_LANGUAGE[self.language],
                               timeout=timeout)
        return output_bytes

    def save_pages(self, path, **kwargs):
        """Save every page image to ``path.format(page=N)`` (N starts at 1).

        Yields each written file name; ``kwargs`` go to ``get_images``.
        """
        for page, img in enumerate(self.get_images(**kwargs), 1):
            filename = path.format(page=page)
            img.save(filename=filename)
            yield filename
def main():
    """Render every PDF page to a PNG and write an XML movie description.

    Each page is rendered to ``pageNNN.png``. A ``/Text`` annotation titled
    ``Wait``, ``Audio`` or ``Video`` selects the kind of ``<video>`` element
    written for that page, with its ``/Contents`` as value. Pages without
    such an annotation get a 5 second wait.

    Raises:
        Exception: If a page carries more than one ``/Text`` annotation.
    """
    parser = argparse.ArgumentParser(
        description='make movies from beamer slides')
    parser.add_argument('input', default='texput.pdf', nargs='?')
    parser.add_argument('output', default='input.xml', nargs='?')
    args = parser.parse_args()

    doc = fitz.open(args.input)
    kinds_by_title = {'Wait': 'wait', 'Audio': 'audio', 'Video': 'video'}

    # ``with`` guarantees the XML file is closed even when a page fails.
    with open(args.output, 'w') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>' + "\n")
        output.write('<movie>' + "\n")
        with open(args.input, 'rb') as f:
            pdf = PdfFileReader(f)
            number_of_pages = pdf.getNumPages()
            print("Reading through {} pages...".format(number_of_pages))
            for i in range(number_of_pages):
                print("Page {}".format(i))
                page = pdf.getPage(i)
                fitzpage = doc[i]

                # First render only to learn the pixel height at 4x, then
                # re-render with a zoom computed from that height.
                mat = fitz.Matrix(4, 4)
                pix = fitzpage.getPixmap(matrix=mat, alpha=True)
                zoom = 4.0 * float(1080) / pix.height
                mat = fitz.Matrix(zoom, zoom)
                pix = fitzpage.getPixmap(matrix=mat, alpha=True)
                png = 'page{:03d}.png'.format(i)
                pix.writePNG(png)

                kind = 'wait'
                contents = '5'
                count = 0
                # Pages without any annotation have no '/Annots' key at all;
                # indexing it unconditionally would raise KeyError.
                annotations = page['/Annots'] if '/Annots' in page else []
                for annot in annotations:
                    annotation = annot.getObject()
                    # Other subtypes, such as /Link, cause errors
                    if annotation['/Subtype'] != "/Text":
                        continue
                    title = annotation['/T']
                    if title in kinds_by_title:
                        kind = kinds_by_title[title]
                        contents = annotation['/Contents']
                    count = count + 1
                    if count > 1:
                        raise Exception("Too many annotations on page " +
                                        str(i + 1))

                if kind == 'audio':
                    output.write(
                        ' <video src="{}" slide="{}"/>'.format(contents, png) +
                        "\n")
                elif kind == 'video':
                    output.write(
                        ' <video src="{}" overlay="{}"/>'.format(
                            contents, png) + "\n")
                elif kind == 'wait':
                    output.write(' <video src="{}" in="0" out="{}"/>'.format(
                        png, contents) + "\n")
        output.write('</movie>' + "\n")
    print("Wrote " + args.output)
""" This file is for diabetes data pre-processing """
import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter

# Write every page of the document to its own single-page PDF.
pdf_document = "Cover letter Deepmind.pdf"
pdf = PdfFileReader(pdf_document)

for page in range(pdf.getNumPages()):
    # A fresh writer *instance* per page; the bare class (without the call)
    # cannot collect pages.
    pdf_writer = PdfFileWriter()
    current_page = pdf.getPage(page)
    pdf_writer.addPage(current_page)

    outputFilename = "example-page-{}.pdf".format(page + 1)
    with open(outputFilename, "wb") as out:
        pdf_writer.write(out)

    print("created", outputFilename)

#with open('diabetes_file.txt', 'w') as f:
#    f.write("\n\n".join(pdfFileobj))
# Build a list of tuples for each file type the file dialog should display my_filetypes = [('PDF files','*.pdf'), ("All files", "*.*")] application_window = tkinter.Tk() # Ask the user to select a single file name. answer = filedialog.askopenfilename(parent=application_window, initialdir=os.getcwd(), title="Please select a file:", filetypes=my_filetypes) print(answer) print(answer) FILE_PATH = answer input1 = PdfFileReader(open(FILE_PATH, mode='rb')) n_pages=input1.getNumPages() print("document1.pdf has %d pages." % n_pages) end_of_document=False rt = RichText("") page_index=0; inc_page=True #n_pages=6 while (end_of_document== False): print("Página %d ." % page_index) if (inc_page == True): page=input1.getPage(page_index) txt=page.extractText() end_of_page=txt.find("\n\n") if (end_of_page >0):
def getPdfPageNum(path):
    """Return the number of pages of the PDF file at *path*."""
    with open(path, "rb") as stream:
        # The count is taken while the handle is still open.
        total = PdfFileReader(stream).getNumPages()
    return total
import os
from PyPDF2 import PdfFileReader

# Dump the title, the page count and the text of every page of Pride.pdf
# into Pride.txt.
path = "C:/Users/Catrell Washington/Pride"
input_file_name = os.path.join(path, "Pride.pdf")
output_file_name = os.path.join(path, "Pride.txt")

# Both files are closed even when reading or extraction fails.
with open(input_file_name, "rb") as pdf_stream:
    input_file = PdfFileReader(pdf_stream)

    with open(output_file_name, "w") as output_file:
        # A PDF may carry no document info, or info without a title; write
        # an empty title line then instead of failing on ``None + "\n"``.
        info = input_file.getDocumentInfo()
        title = info.title if info is not None else None  # get the file title
        if title is None:
            title = ""
        total_pages = input_file.getNumPages()  # get the total page count

        output_file.write(title + "\n")
        output_file.write("Number of Pages: {}\n\n".format(total_pages))

        for page_num in range(0, total_pages):
            text = input_file.getPage(page_num).extractText()
            text = text.replace(" ", "\n")
            output_file.write(text)
del_name.unlink() file_names.remove(del_name) convert_pt_to_mm = 25.4 / 72.0 output_data = {} output_data[non_standard] = PdfFileWriter() for key in sheet_size_height.keys(): output_data[key] = PdfFileWriter() output_data[key + '_page_1'] = PdfFileWriter() # Считывание листов из файла PDF и сортировка их по форматам read_streams = [] for name in file_names: read_streams.append(open(name, 'rb')) pdf_document = PdfFileReader(read_streams[-1]) pages_count = pdf_document.getNumPages() for index in range(0, pages_count): current_page = pdf_document.getPage(index) page_added = False # Размеры страницы необходимо перевести из пунктов в мм page_height = round( float(current_page.mediaBox.getHeight()) * convert_pt_to_mm) page_width = round( float(current_page.mediaBox.getWidth()) * convert_pt_to_mm) for key in sheet_size_height.keys(): standard_height = sheet_size_height[key] standard_width = sheet_size_width[key] # Проверяются альбомные и портретные ориентации листа if (check_size(standard_height, page_height) and check_size(standard_width, page_width)) or ( check_size(standard_width, page_height)
def getArchiveFilenameList(self):
    """Return one synthetic JPEG member name per page of the PDF at ``self.path``.

    Names are 1-based and zero-padded to four digits: ``/0001.jpg``,
    ``/0002.jpg`` and so on.

    The PDF is opened only long enough to count its pages; the handle is
    closed before returning instead of being left for the garbage collector.
    """
    with open(self.path, 'rb') as pdf_file:
        page_count = PdfFileReader(pdf_file).getNumPages()

    out = []
    for page in range(1, page_count + 1):
        out.append("/%04d.jpg" % page)
    return out
def transform_comment_data_set(comment_data_set):
    """Extract text, keywords and addresses for every comment marked for analysis.

    For each entry whose ``'analyze'`` value is truthy:

    1. The comment PDF is converted to a list of sentences.
    2. If that text is non-empty and not flagged as bogus it is written to
       ``public_comment_path``; otherwise the PDF is handed to one of two
       OCR helpers, chosen by page count (at most 50 pages vs. more).
    3. If the text file exists, keywords and addresses are extracted from it
       and addresses found in the summary are appended.

    The entries are updated in place (``'keyword_list'`` and ``'address'``).

    Args:
        comment_data_set: Mapping whose values are per-comment dicts with the
            keys read below (``analyze``, ``public_comment_path_pdf``,
            ``public_comment_path``, ``summary``, ``comment_number``,
            ``date``).

    Returns:
        A dict keyed by ``comment_number`` with one summary record for each
        analyzed comment.
    """
    for comment_data in comment_data_set.values():
        if not comment_data['analyze']:
            continue

        pdf_path = comment_data['public_comment_path_pdf']
        print(f"Analyze {pdf_path}")
        text_comments = get_sentence_list.get_sentence_list(
            process.get_word_block(tdfp.convert_pdf_to_xml(pdf_path)))
        comment_data['keyword_list'] = {}
        comment_data['address'] = []
        text_path = f"{comment_data['public_comment_path']}"

        if text_comments and not bogus.is_bogus_text(text_comments):
            with open(text_path, 'w') as w:
                w.writelines(text_comments)
        else:
            # No usable embedded text: fall back to OCR, picking the helper
            # by document size.
            with open(pdf_path, 'rb') as infile:
                page_num = PdfFileReader(infile).getNumPages()
            if page_num <= 50:
                orc.extract_text_by_orc(pdf_path, text_path)
            else:
                separate.separate_pdf_and_ocr(pdf_path, text_path)

        print("Extract keywords")
        if os.path.isfile(text_path):
            with open(text_path) as r:
                comment_data['keyword_list'] = (
                    get_keyword_list.get_keyword_list(r.read()))
                # read() leaves the handle at end-of-file; rewind so that
                # readlines() actually returns the lines instead of [].
                r.seek(0)
                comment_data['address'].extend(
                    get_address.get_address(r.readlines()))
            comment_data['address'].extend(
                get_spe_data.get_address(comment_data['summary']))
        else:
            print(f"System cannot find {pdf_path}")

    return {
        comment_data['comment_number']: {
            'summary': comment_data['summary'],
            'address': comment_data['address'],
            'keyword_list': comment_data['keyword_list'],
            'name': get_spe_data.get_name(comment_data['summary']),
            'topic': get_spe_data.get_topic(comment_data['summary']),
            'date': comment_data['date'],
            'category': get_category(comment_data["keyword_list"],
                                     comment_data["summary"]),
        }
        for comment_data in comment_data_set.values()
        if comment_data['analyze']
    }
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2Highlight import createHighlight, addHighlightToPage
import textract

# Add a highlight annotation to the first page of the paper and save that
# page as output.pdf.
#
# The input stream must stay open until the writer has finished, because
# page contents are read from it when the output is written; ``with`` also
# guarantees both files are closed (and the output flushed) on every path.
with open("early-stopping-1703.09580.pdf", "rb") as inputStream:
    pdfInput = PdfFileReader(inputStream)
    pdfOutput = PdfFileWriter()

    page1 = pdfInput.getPage(0)
    number_of_pages = pdfInput.getNumPages()
    page_content = page1.extractText()

    # Second, independent text extraction of the whole file via textract.
    text = textract.process("early-stopping-1703.09580.pdf")

    # Single-argument call form: prints the same under the Python 2 print
    # statement and is also valid Python 3 syntax.
    print(page_content.encode('utf-8'))

    # NOTE(review): the four numbers are presumably the highlight rectangle
    # (x1, y1, x2, y2) in page coordinates -- confirm against PyPDF2Highlight.
    highlight = createHighlight(488.725021, 202.392357, 523.153376, 211.298922, {
        "author": "",
        "contents": "Bla-bla-bla"
    })

    addHighlightToPage(highlight, page1, pdfOutput)
    pdfOutput.addPage(page1)

    with open("output.pdf", "wb") as outputStream:
        pdfOutput.write(outputStream)
__author__ = 'Chetan'

import PyPDF2
from PyPDF2 import PdfFileReader

# Print basic metadata, the top-level outline and the text of one page of
# diveintopython.pdf.
#
# Every print below passes a single pre-formatted string, so the output is
# the same whether ``print`` is the Python 2 statement or the Python 3
# function. ``with`` closes the file even if the reader raises.
with open("diveintopython.pdf", 'rb') as pdf:
    readerObj = PdfFileReader(pdf)
    print("PDF Reader Object is:\n%s" % (readerObj,))

    # Details of diveintopython book
    print("Details of diveintopython book")
    print("Number of pages %s" % (readerObj.getNumPages(),))
    info = readerObj.getDocumentInfo()
    print("Title: %s" % (info.title,))
    print("Author: %s" % (info.author,))

    print("Book Outline")
    for heading in readerObj.getOutlines():
        # Nested lists hold sub-headings; only top-level entries are shown.
        if not isinstance(heading, list):
            print(dict(heading).get('/Title'))

    print("Reading Page 1")
    # NOTE(review): getPage() takes a zero-based index, so this reads the
    # second page of the file -- confirm that is the page intended.
    page = readerObj.getPage(1)
    print(page.extractText())
import os  # required by the os.path.join calls below

from PyPDF2 import PdfFileReader, PdfFileWriter

# Decrypt Walrus.pdf, rotate every page and save the result as a new file.
path = (
    "C:/Users/cmello/Documents/"
    "Python Basics Book Dan Bader/"
    "exercises_chapter_13/"
)

input_file_path = os.path.join(path, "output/Walrus.pdf")
input_pdf = PdfFileReader(input_file_path)
output_pdf = PdfFileWriter()

# Decrypt the PDF file
input_pdf.decrypt("IamtheWalrus")  # without this, I get an error message.

# 2
num_pages = input_pdf.getNumPages()

for n in range(num_pages):
    page = input_pdf.getPage(n)
    # 270 degrees clockwise is a quarter turn counter-clockwise.
    page.rotateClockwise(270)
    output_pdf.addPage(page)

output_file_path = os.path.join(path, "output/Walrus Rotated2.pdf")
with open(output_file_path, "wb") as output_file:
    output_pdf.write(output_file)

# CIRO START FROM STEP # 3
# import copy
from PyPDF2 import PdfFileWriter, PdfFileReader

# Copy dummy.pdf to newfile.pdf, inserting the first page of dummy2.pdf
# right after the fourth page (index 3).
#
# The second positional parameter of PdfFileReader is ``strict``, not a file
# mode. The files are therefore opened explicitly in binary mode, and
# ``strict=True`` is passed by name to keep the behaviour the truthy 'rb'
# argument used to select.
with open('dummy.pdf', 'rb') as main_stream, \
        open('dummy2.pdf', 'rb') as extra_stream:
    infile = PdfFileReader(main_stream, strict=True)
    infile2 = PdfFileReader(extra_stream, strict=True)
    output = PdfFileWriter()

    p2 = infile2.getPage(0)

    # ``range`` works on both Python 2 and 3 (``xrange`` is Python 2 only).
    for i in range(infile.getNumPages()):
        p = infile.getPage(i)
        output.addPage(p)
        if i == 3:
            output.addPage(p2)

    # Written while the source streams are still open, since the writer
    # reads page data from them.
    with open('newfile.pdf', 'wb') as f:
        output.write(f)
from PyPDF2 import PdfFileReader
from pathlib import Path

# Export the title, page count and full text of the book to a text file.
pdf_path = r"chapter 14 working with PDF\Pride_and_Prejudice.pdf"
output_file_path = r"chapter 14 working with PDF\Pride_and_Prejudice.txt"

pdf = PdfFileReader(pdf_path)

with open(output_file_path, 'w') as output_file:
    # Header: title line, page count, then a blank line.
    title = pdf.documentInfo.title
    num_pages = pdf.getNumPages()
    header = f"{title}\nNumber of pages: {num_pages}\n\n"
    output_file.write(header)

    # Body: the extracted text of every page, in document order.
    output_file.writelines(page.extractText() for page in pdf.pages)

'''
first_page = pdf.getPage(0)
print(first_page.extractText())

for page in pdf.pages:
    print(page.extractText()) # prints the whole book
'''
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter

# Stamp the first page of the watermark PDF onto every page of the input PDF
# and save the result as a new file.
pdf_file = "super.pdf"
watermark = "wtr.pdf"
merged_file = "merged.pdf"

with open(pdf_file, "rb") as input_file:
    with open(watermark, "rb") as watermark_file:
        input_pdf = PdfFileReader(input_file)
        watermark_pdf = PdfFileReader(watermark_file)
        watermark_page = watermark_pdf.getPage(0)

        output = PdfFileWriter()
        page_total = input_pdf.getNumPages()
        for page_index in range(page_total):
            stamped_page = input_pdf.getPage(page_index)
            stamped_page.mergePage(watermark_page)
            output.addPage(stamped_page)

        # Written while both source files are still open.
        with open(merged_file, "wb") as merged_file:
            output.write(merged_file)
import os
import shlex
import sys


def build_print_command(filename, pages):
    """Return the shell pipeline that sends ``pages`` of ``filename`` to ``lpr``.

    ``filename`` is shell-quoted so that paths containing spaces or shell
    metacharacters are passed to ``pdftk`` as a single argument.

    Args:
        filename: Path of the PDF to print.
        pages: Iterable of 1-based page numbers, in printing order.

    Returns:
        The command line as a string, e.g.
        ``"pdftk doc.pdf cat 1 3 output - | lpr"``.
    """
    page_list = "".join("%d " % page for page in pages)
    return "pdftk %s cat %soutput - | lpr" % (shlex.quote(filename), page_list)


def run_command(cmd):
    """Echo ``cmd`` and run it through the shell."""
    print(cmd)
    os.system(cmd)


def main():
    """Print a PDF double-sided on a single-sided printer.

    The odd pages are printed first. After the user has turned the stack
    over, the even pages are printed in descending order, preceded by a
    blank sheet when the document has an odd number of pages.
    """
    # Imported here so build_print_command stays importable without PyPDF2.
    from PyPDF2 import PdfFileReader

    nombreArchivo = sys.argv[1]
    # Close the file as soon as the page count is known.
    with open(nombreArchivo, 'rb') as pdf_stream:
        pags = PdfFileReader(pdf_stream).getNumPages()

    # First pass: pages 1, 3, 5, ...
    run_command(build_print_command(nombreArchivo, range(1, pags + 1, 2)))

    if input("De vuelta las hojas (ingrese x para abortar) ") != "x":
        n = pags
        if pags % 2 == 1:
            # The last sheet has no back side: start from the previous even
            # page and feed a blank page first.
            n -= 1
            os.system("lpr blank.pdf")
        # Second pass: even pages from n down to 2.
        run_command(build_print_command(nombreArchivo, range(n, 0, -2)))


if __name__ == "__main__":
    main()
c.save()

## 2. Add Watermark to PDF
### Open the watermark file
with io.open('./watermark.pdf', mode='rb') as watermark_file:
    watermark = PdfFileReader(watermark_file)

    ### Start the PDF writer buffer
    output = PdfFileWriter()

    ### Open the input file
    with io.open(input_file_path, mode='rb') as input_file:
        input_pdf = PdfFileReader(input_file)
        page_count = input_pdf.getNumPages()

        ### Add watermark to every page
        for page_number in range(page_count):
            input_page = input_pdf.getPage(page_number)
            input_page.mergePage(watermark.getPage(0))
            output.addPage(input_page)

        ### Open the file stream to the output file and save the result.
        ### The output is opened exactly once, after every page has been
        ### merged, so the target is not truncated before the input is read
        ### and no second, unused write handle is held on the same path.
        with io.open(merged_file_path, mode='wb') as merged_file:
            #tmp_out = io.BytesIO()
            #output.write(tmp_out)
            #print(tmp_out.getvalue())
            output.write(merged_file)

## 3. Clean files
def getArchiveFilenameList(self):
    """Return one synthetic JPEG member name per page of the PDF at ``self.path``.

    Names are 1-based and zero-padded to four digits: ``/0001.jpg``,
    ``/0002.jpg`` and so on.

    The PDF is opened only to count its pages, and the handle is closed
    before the list is built rather than being left open.
    """
    with open(self.path, 'rb') as pdf_file:
        page_count = PdfFileReader(pdf_file).getNumPages()
    return ["/%04d.jpg" % page for page in range(1, page_count + 1)]