def save_with_even_pages(exam_id, exam_pdf_file):
    """Save a finalized exam PDF with an even number of pages.

    The directory returned by `exam_dir(exam_id)` is created when missing
    and the PDF is stored at the path returned by `exam_pdf_path(exam_id)`.
    If the PDF has an odd number of pages, an extra blank page (using the
    media box of the first page) is added at the end. This is especially
    useful for printing and concatenating multiple copies at once.

    Parameters
    ----------
    exam_id : int
        The exam identifier
    exam_pdf_file : str or file-like object
        The exam PDF to be saved in the data directory. Note that when the
        page count is already even, the object's `seek` and `save` methods
        are called to store it unchanged.
    """
    os.makedirs(exam_dir(exam_id), exist_ok=True)
    destination = exam_pdf_path(exam_id)

    source_pages = PdfReader(exam_pdf_file).pages

    if len(source_pages) % 2:
        # Odd page count: copy everything and pad with one empty page.
        padded = PdfWriter()
        padded.addpages(source_pages)

        filler = PageMerge()
        filler.mbox = source_pages[0].MediaBox
        padded.addpage(filler.render())

        padded.write(destination)
    else:
        # Already even: rewind the upload and store it untouched.
        exam_pdf_file.seek(0)
        exam_pdf_file.save(destination)
def re_arrange(file_path, output_file_name, dic):
    """Write a copy of a PDF with its pages in a new order.

    Parameters:
        file_path : Path of the source PDF whose pages are in the wrong order.
        output_file_name : File name of the reordered PDF. It is written to
            the directory that contains ``file_path``.
        dic : A dictionary mapping the new page position to the 1-based
            number of the source page. Pages are emitted in the
            dictionary's iteration order.

    Returns:
        None
    """
    file_path = Path(file_path)

    pdf_obj = PdfReader(file_path)
    total_pages = len(pdf_obj.pages)
    print("Total Pages in PDF are:", total_pages)

    writer = PdfWriter()
    # "new" is the position in the output, "old" the page number in the source.
    for new, old in dic.items():
        writer.addpage(pdf_obj.pages[old - 1])
        print(f"page{new} added from {old}")

    # Join with pathlib instead of a literal "\\": the backslash is only a
    # separator on Windows, and with a bare file name it pointed at the root.
    writer.write(file_path.parent / output_file_name)
def ocr(tar_gz_filename, empty_page_threshold, language='eng'):
    """Run tesseract over every member of a tar archive and merge the result.

    The archive is unpacked into ``TMP_DIR``; each member is converted to a
    PDF by ``./tesseract`` (run from ``SCRIPT_DIR``) and the resulting pages
    are collected into ``TMP_DIR/output.pdf``. The intermediate files are
    removed afterwards.

    Parameters:
        tar_gz_filename : Path of the archive with the scanned images.
        empty_page_threshold : Pages whose content stream ``/Length`` is
            below this value are treated as empty and skipped.
        language : Language code passed to tesseract's ``-l`` option.

    Returns:
        The path of the merged PDF (``TMP_DIR/output.pdf``).

    Note:
        The process exits with status 1 when a tesseract call fails.
    """
    # NOTE(review): extractall() trusts the member paths inside the archive;
    # only feed it archives from a trusted source.
    with tarfile.open(tar_gz_filename) as tar:
        tar.extractall(path=TMP_DIR)
        member_names = tar.getnames()

    env = os.environ.copy()
    env.update(dict(LD_LIBRARY_PATH=LIB_DIR,
                    TESSDATA_PREFIX="{}/tessdata".format(SCRIPT_DIR)))

    output = PdfWriter()
    for filename in member_names:
        cmd = ['./tesseract',
               '-l', language,
               '-c', 'min_orientation_margin=0',  # don't leave out characters close to border
               '{}/{}'.format(TMP_DIR, filename),
               '{}/partial'.format(TMP_DIR),
               'pdf']
        try:
            out = subprocess.check_output(cmd, cwd=SCRIPT_DIR, env=env,
                                          stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            print('tesseract call failed, here\'s the output so far:')
            print(e.output)
            sys.exit(1)
        print(out)

        for p in PdfReader("{}/{}".format(TMP_DIR, "partial.pdf")).pages:
            try:
                if int(p.Contents['/Length']) < empty_page_threshold:
                    continue
            except Exception:
                # if in doubt add the page
                pass
            output.addpage(p)

    output.write('{}/output.pdf'.format(TMP_DIR))

    for f in ['partial.pdf', DOWNLOAD_FILE] + member_names:
        os.remove("{}/{}".format(TMP_DIR, f))

    return '{}/output.pdf'.format(TMP_DIR)
def post(self, request):
    """Extract a page range from the uploaded PDF and save the book record.

    The ``page`` field is either a single page number (``"5"``) or an
    inclusive range (``"3-7"``), both 1-based. The selected pages are
    written to ``media/extracted_page_<start>-<end>.pdf`` and that file name
    replaces the uploaded PDF in the validated data before saving.

    Returns a 201 response with the serialized data, or a 400 response with
    the serializer errors when validation fails.
    """
    serializer = BookSerializer(data=request.data)
    if not serializer.is_valid():
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

    inpfn = serializer.validated_data['pdf']
    page_range = [int(y) for y in serializer.validated_data['page'].split('-')]
    # Normalise before unpacking: a single page "N" becomes the range N-N.
    # Reading page_range[1] first raised IndexError for single pages.
    page_start, page_end = (page_range + page_range[-1:])[:2]

    extracted_name = 'extracted_page_{}-{}.pdf'.format(page_start, page_end)
    outfn = os.path.join('media', extracted_name)

    pages = PdfReader(inpfn).pages
    outdata = PdfWriter(outfn)
    for pagenum in range(page_start, page_end + 1):
        outdata.addpage(pages[pagenum - 1])
    outdata.write()

    serializer.validated_data['pdf'] = extracted_name
    serializer.save()
    return Response(serializer.data, status=status.HTTP_201_CREATED)
def subset_pdf(inp_file, ranges):
    """Overwrite a PDF with a subset of its own pages.

    Parameters:
        inp_file : Path of the PDF; it is read and then written back to.
        ranges : Space-separated 1-based pages or inclusive ranges,
            e.g. ``"1-3 5 7-9"``.

    Returns:
        The number of pages written, or -1 when ``ranges`` is malformed or
        refers to a page that does not exist (the file is left untouched
        in that case).
    """
    parsed_ranges = []
    for token in ranges.split(' '):
        # Words, empty tokens and negative numbers ("-3" splits into "" and
        # "3") all fail the int() conversion.
        try:
            numbers = [int(part) for part in token.split('-')]
        except ValueError:
            return -1
        first, last = (numbers + numbers[-1:])[:2]
        # Page numbers are 1-based; 0 would index pages[-1] and silently
        # select the last page instead of being rejected.
        if first < 1:
            return -1
        parsed_ranges.append((first, last))

    pages = PdfReader(inp_file).pages
    out_data = PdfWriter(inp_file)
    num_pages = 0
    try:
        for first, last in parsed_ranges:
            for page_num in range(first, last + 1):
                out_data.addpage(pages[page_num - 1])
                num_pages += 1
    except IndexError:
        # The user asked for a page beyond the end of the document.
        return -1

    out_data.write()
    return num_pages
def _generate_thumbnail_image_content_file(document):
    """Render the first page of a document's PDF as a JPEG ``ContentFile``.

    The PDF bytes come from the stored file when ``document.file_on_server``
    is set, otherwise they are downloaded from ``document.external_url``.
    They are staged in ``MEDIA_ROOT/document_thumbnails/temp.pdf``, reduced
    to the first page when there are several, and rendered at resolution 38
    on a white background. The staging file is always removed.
    """
    if document.file_on_server:
        content = document.unique_file.file_field.read()
    else:
        with requests.request('get', document.external_url, stream=True) as response:
            content = response.content

    temp_pdf_path = os.path.join(settings.MEDIA_ROOT, 'document_thumbnails', 'temp.pdf')
    try:
        # PDF data is binary: a text-mode handle rejects bytes on Python 3.
        with open(temp_pdf_path, 'wb') as f:
            f.write(content)

        reader = PdfReader(temp_pdf_path)
        if len(reader.pages) > 1:
            # Only the first page is needed for the thumbnail.
            page = reader.pages[0]
            writer = PdfWriter()
            writer.addpage(page)
            writer.write(temp_pdf_path)

        images = Image(filename=temp_pdf_path, resolution=38)
        images.background_color = Color('white')
        images.alpha_channel = 'flatten'
    finally:
        # Clean up even when reading or rendering fails.
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

    return ContentFile(images.make_blob('jpg'))
def delete(path, del_page):
    """Remove pages from a PDF file in place.

    Parameters:
        path : Path of the PDF file.
        del_page : A list of 1-based page numbers to be deleted.

    Returns:
        None
    """
    pdf_obj = PdfReader(path)
    total_pages = len(pdf_obj.pages)
    print("Total Pages in PDF are:", total_pages)

    writer = PdfWriter()
    # A set gives constant-time membership tests for the page filter.
    excluded = set(del_page)
    for number, page in enumerate(pdf_obj.pages, start=1):
        if number not in excluded:
            writer.addpage(page)

    # Write next to the original and swap afterwards, so a failure while
    # writing does not destroy the source document.
    temp_path = "{}.tmp".format(path)
    try:
        writer.write(temp_path)
    except Exception:
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    os.replace(temp_path, path)
def pdfrw(self):
    """Copy ``self.file_name`` to ``self.output``, adjusting every page.

    Each page is passed through ``self._pdfrw_adjust`` and the source
    document's Info dictionary (or an empty one) is carried over.
    """
    source = PdfReader(self.file_name)
    target = PdfWriter(self.output)

    for original_page in source.pages:
        target.addpage(self._pdfrw_adjust(original_page))

    metadata = source.Info or {}
    target.trailer.Info = IndirectPdfDict(metadata)
    target.write()
def split(fname, usernames, folder, pages=1):
    """Split a big PDF into individual ones named after the given usernames.

    This function relies on the pdfrw library.

    Args:
        fname (str): Path to the large PDF to split.
        usernames (list): Usernames, in order, to be used as file names.
            Names are consumed from the front of this list.
        folder (str): Prefix for the new PDFs; it is concatenated directly
            with the file name, so include a trailing separator.
        pages (int, optional): Number of pages to include in each smaller PDF.

    Returns:
        This function does not return anything. Trailing pages that do not
        fill a complete group of ``pages`` are not written.
    """
    infile = PdfReader(fname)

    # The writer must outlive single iterations: creating it per page dropped
    # every page but the last one of each group when pages > 1.
    out = PdfWriter()
    page_num = 1
    for page in infile.pages:
        out.addpage(page)
        if page_num < pages:
            page_num += 1
            continue
        out.write("%s%s.pdf" % (folder, usernames.pop(0)))
        out = PdfWriter()
        page_num = 1
def upload_book(request):
    """Handle the book upload form and store only the requested page range.

    On a valid POST the ``page`` field (a single 1-based page such as
    ``"5"`` or an inclusive range such as ``"3-7"``) selects the pages that
    are written to ``media/extracted_page_<start>-<end>.pdf``; the book is
    saved pointing at that file and the user is redirected to ``book_list``.
    Otherwise the upload form is rendered.
    """
    if request.method == 'POST':
        form = BookForm(request.POST, request.FILES)
        if form.is_valid():
            form2 = form.save(commit=False)
            inpfn = form.cleaned_data['pdf']
            print(inpfn)

            page_range = [int(y) for y in form.cleaned_data['page'].split('-')]
            # Normalise before unpacking: a single page "N" becomes N-N.
            # Reading page_range[1] first raised IndexError for single pages.
            page_start, page_end = (page_range + page_range[-1:])[:2]

            extracted_name = 'extracted_page_{}-{}.pdf'.format(page_start, page_end)
            outfn = os.path.join('media', extracted_name)

            pages = PdfReader(inpfn).pages
            outdata = PdfWriter(outfn)
            for pagenum in range(page_start, page_end + 1):
                outdata.addpage(pages[pagenum - 1])
            outdata.write()

            form2.pdf = extracted_name
            form2.save()
            return redirect('book_list')
    else:
        form = BookForm()
    return render(request, 'upload_book.html', {
        'form': form
    })
def upscale(file_name, scale=1.5, margin_x=0, margin_y=0, suffix='scaled', tempdir=None):
    """Upscale a PDF to a large size.

    Every page is cropped by the given margins and scaled by ``scale``.
    The result goes to a temporary file inside ``tempdir`` when given,
    otherwise next to the input using ``add_suffix(file_name, suffix)``,
    otherwise to a temporary file name. The output path is returned.
    """

    def adjust(page):
        # Measure the page, then re-add it restricted to the inner view box.
        left, bottom, right, top = PageMerge().add(page).xobj_box
        inner_view = (margin_x,
                      margin_y,
                      right - left - 2 * margin_x,
                      top - bottom - 2 * margin_y)
        merged = PageMerge().add(page, viewrect=inner_view)
        merged[0].scale(scale)
        return merged.render()

    # Set output file name
    if tempdir:
        output = NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False).name
    elif suffix:
        output = os.path.join(os.path.dirname(file_name), add_suffix(file_name, suffix))
    else:
        output = NamedTemporaryFile(suffix='.pdf').name

    source = PdfReader(file_name)
    target = PdfWriter(output)
    for original_page in source.pages:
        target.addpage(adjust(original_page))
    target.trailer.Info = IndirectPdfDict(source.Info or {})
    target.write()
    return output
def extract_crossword(file_path: Path, output_path: Path, overwrite: bool = True) -> Path:
    """Save page with crossword.

    Open a PDF document, search for a string identifying Le Monde crossword,
    and save the corresponding page to a file. The last pages are searched
    first, then the remaining ones.

    Args:
        file_path: Path of the PDF document to process
        output_path: Path of the output directory
        overwrite: Whether to overwrite existing files (default to True)

    Returns:
        Path of the saved file

    Raises:
        CrosswordNotFoundError: No page of the document matched.
        FileAlreadyExistError: The target exists and ``overwrite`` is False.
    """
    LOGGER.debug(f'Processing {file_path}')
    max_extracted_pages = 15
    rsrcmgr = PDFResourceManager(caching=True)
    crossword_page = None
    m = None

    with open(file_path, 'rb') as f:
        pages = list(PDFPage.get_pages(f))
        LOGGER.debug(f'Found {len(pages)} pages')
        first_checked_pageno = max(0, len(pages) - max_extracted_pages)

        LOGGER.debug(f'Searching last {max_extracted_pages} pages first')
        for i, page in enumerate(pages[first_checked_pageno:]):
            m = _search_in_page(page, rsrcmgr)
            if m:
                crossword_page = first_checked_pageno + i
                break

        # Compare against None: page index 0 is a valid hit but is falsy, so
        # a truthiness test reported "not found" for the first page.
        if crossword_page is None:
            LOGGER.debug(f'Extending search to all pages')
            for i, page in enumerate(pages[:first_checked_pageno]):
                m = _search_in_page(page, rsrcmgr)
                if m:
                    crossword_page = i
                    break

    if crossword_page is None or not m:
        raise CrosswordNotFoundError

    LOGGER.debug(f'Crossword found on page {crossword_page}')
    path = output_path / '{}.pdf'.format(m.group(1))
    if path.exists() and not overwrite:
        LOGGER.debug(f'File already exist ${path}')
        raise FileAlreadyExistError

    x = PdfReader(file_path)
    page = x.pages[crossword_page]
    y = PdfWriter()
    y.addpage(page)
    y.write(path)
    return path
def go(inpfn, outfn):
    """Adjust the only page of a single-page PDF and write it to ``outfn``.

    The input must contain exactly one page (the unpacking below raises
    ValueError otherwise). The source Info dictionary is copied to the
    output.
    """
    reader = PdfReader(inpfn, decompress=False)
    page, = reader.pages
    writer = PdfWriter()
    writer.addpage(adjust(page))
    # Documents without an Info dictionary yield None here; fall back to an
    # empty dictionary as the other writers in this file do.
    writer.trailer.Info = IndirectPdfDict(reader.Info or {})
    writer.write(outfn)
def strip_pages_pdf(indir, infile, outdir=None, outfile=None, numpages=1, keep=False):
    '''
    Drop (or keep) the leading pages of a PDF.

    By default the first ``numpages`` pages (one page) are removed. With
    ``keep`` set, only those leading pages are retained and the rest is
    dropped. Omit ``outdir``/``outfile`` to reuse the input directory/name,
    which replaces the input file.
    '''
    target_dir = indir if outdir is None else outdir
    target_name = infile if outfile is None else outfile

    inpath = os.path.join(indir, infile)
    outpath = os.path.join(target_dir, target_name)

    output = PdfWriter()
    for index, page in enumerate(PdfReader(inpath).pages):
        is_leading = index <= (numpages - 1)
        # Leading pages survive only in keep mode, the others only otherwise.
        if (keep and is_leading) or (not keep and not is_leading):
            output.addpage(page)
    output.write(outpath)
def two_up(data):
    """Place the two pages of a PDF side by side on one sheet.

    ``data`` is the raw PDF content; it must hold exactly two pages. Both
    pages are rotated by 270 degrees and scaled by sqrt(0.5); the second one
    is shifted right by the scaled width of the combined box. The result is
    written to standard output after a ``Content-Type`` header.
    """
    pdf = PdfReader(fdata=data)
    pages = PageMerge() + pdf.pages
    assert len(pages) == 2

    rotation = 270
    scale = 0.7071067811865476  # sqrt(0.5)
    # Measure before any page is transformed.
    x_increment = scale * pages.xobj_box[2]

    for merged_page in pages:
        merged_page.Rotate = rotation
        merged_page.scale(scale)
    pages[1].x = x_increment

    writer = PdfWriter()
    writer.addpage(pages.render())

    # retain and update metadata
    pdf.Info.Creator = 'modulo-nic.py %s' % __version__
    writer.trailer.Info = pdf.Info

    sys.stdout.write('Content-Type: application/x-pdf\n\n')
    writer.write(sys.stdout)
def addAction():
    """Attach a JavaScript mouse-down action to the submit field of report.pdf.

    Reads ``report.pdf``, looks on its second page for the annotation whose
    ``/T`` entry is ``(buttonSubmit)``, gives it an additional-actions
    dictionary built with ``make_js_action`` and writes every page to
    ``reportChanged.pdf``.
    """
    fileName = 'report.pdf'
    pdf_writer = PdfWriter()
    pdf_reader = PdfReader(fileName)

    # JavaScript to be appended to PDF document.
    # To learn more please check:
    # "Developing Acrobat Applications Using JavaScript"
    # and "JavaScript for Acrobat API Reference"
    # Each statement sits on its own line: a "//" comment runs to the end of
    # the line and would otherwise swallow the code that follows it.
    js = """
    // genIDForm
    var name = this.getField("nameText").value;
    var age = this.getField("ageText").value;
    var gender = this.getField("genderChoice").value;

    // genQuestion1
    var radioGroupSelectedVal = this.getField("q1").value;

    // genQuestion2
    var q2op1 = this.getField("q2op1").value;
    var q2op2 = this.getField("q2op2").value;
    var q2op3 = this.getField("q2op3").value;
    var q2op4 = this.getField("q2op4").value;

    var fieldsData = '';
    fieldsData += "Name: " + name + " Age: " + age + " Gender: " + gender + "\\n";
    fieldsData += "Feeling about JC: " + radioGroupSelectedVal + "\\n";
    fieldsData += "1) " + q2op1 + ", 2) " + q2op2 + ", 3) " + q2op3 + ", 4) " + q2op4;

    app.alert(fieldsData);
    """

    # PDF document - Page 2
    last = pdf_reader.pages[1]

    # Note: We have just one form in the entire pdf!
    # Annots are form fields; a page without annotations has no Annots entry,
    # so fall back to an empty list instead of iterating None.
    for field in last.Annots or []:
        # Each field is a dictionary nested inside further dictionaries.
        # buttonSubmit is the name we gave to the last text box - see survey.py line 64
        if field.get('/T') == '(buttonSubmit)':
            # AA - (Additional-Actions dictionary)
            # Acrobat js api reference suggests this is where we should insert
            # hidden actions - like javascript.
            # For more please check: JavaScript for Acrobat API Reference
            # Site: https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/js_api_reference.pdf
            # Page: 303 or search for AA
            # D - An action that shall be performed when the mouse button is
            # pressed inside the annotation's (field) active area.
            # For more please check: Document management - Portable document format - part 1: PDF 1.7
            # Site: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
            # Page: 423 or search for "Entries in an annotation's additional-actions dictionary"
            field.update(PdfDict(AA=PdfDict(D=make_js_action(js))))
            break

    # Make a copy of the original file
    for page in pdf_reader.pages:
        pdf_writer.addpage(page)
    pdf_writer.write('reportChanged.pdf')
def duplicate_pages(original_filepath, output_filename, num_of_duplicates):
    """Write the source PDF followed by ``num_of_duplicates`` copies of itself.

    The output is stored under ``app.config['DOWNLOAD_FOLDER']``; the folder
    value and ``output_filename`` are concatenated directly.
    """
    original = PdfReader(original_filepath)
    output = PdfWriter()
    # One pass for the original content plus one per requested duplicate.
    for _ in range(num_of_duplicates + 1):
        for page in original.pages:
            output.addpage(page)

    # The writer does not close a stream it is handed, so close it here to
    # make sure the data is flushed and the handle is released.
    with open(app.config['DOWNLOAD_FOLDER'] + output_filename, 'wb') as output_stream:
        output.write(output_stream)
def split(number_of_pages, output,
          source=r'C:\Users\DELL\Python stuffs\pdf pypy\reportlab-sample.pdf'):
    """Write the first ``number_of_pages`` pages of a PDF to ``output``.

    Parameters:
        number_of_pages : How many leading pages to copy. Values beyond the
            length of the document copy the whole document.
        output : Path of the PDF to create.
        source : Path of the PDF to read; defaults to the sample document
            that was previously hard-coded.

    Returns:
        None
    """
    pdf_obj = PdfReader(source)
    total_pages = len(pdf_obj.pages)
    writer = PdfWriter()
    # Valid indexes end at total_pages - 1; the former "<=" check let the
    # index total_pages through and raised IndexError.
    for page in range(min(number_of_pages, total_pages)):
        writer.addpage(pdf_obj.pages[page])
    writer.write(output)
def post(self, request):
    """Draw the request's ``test_list`` items onto a template and save a PDF.

    A ReportLab canvas backed by an in-memory buffer is prepared with the
    ``HeiseiKakuGo-W5`` CID font. For every entry of
    ``request.data['test_list']`` the entry's ``contents`` is handed to
    ``self.test`` together with the canvas and a fixed sample dictionary.
    The first page of ``media/pdf/sample.pdf`` is then placed on the canvas
    as a form XObject, and the first page of the rendered result is written
    to ``media/pdf/sample2.pdf``.

    Returns a 201 response with a fixed detail message.
    """
    fontname_g = "HeiseiKakuGo-W5"
    pdfmetrics.registerFont(UnicodeCIDFont(fontname_g))
    buffer = io.BytesIO()
    cc = canvas.Canvas(buffer)
    cc.setFont(fontname_g, 24)
    # Template page, converted so that ReportLab can draw it as a form.
    page = PdfReader('media/pdf/sample.pdf', decompress=False).pages
    pp = pagexobj(page[0])
    # reader = PdfFileReader('media/pdf/sample.pdf')
    # writer = PdfFileWriter()
    # Fixed sample data passed to self.test for every list entry.
    test = {
        "test1": "test1",
        "test2": "S2",
        "test3": "テスト",
        "test4": [
            {
                "key_label": "テスト1",
                "flag": True
            },
            {
                "key_label": "テスト2",
                "flag": True
            },
            {
                "key_label": "テスト3",
                "flag": True
            },
        ]
    }
    a = request.data['test_list']
    for i in a:
        self.test(cc, i.get('contents'), test)
    cc.doForm(makerl(cc, pp))
    cc.showPage()
    cc.save()
    # Rewind so the rendered PDF can be read back from the buffer.
    buffer.seek(0)
    # new_pdf = PdfFileReader(buffer)
    # existing_page = reader.getPage(0)
    # existing_page.mergePage(new_pdf.getPage(0))
    # writer.addPage(existing_page)
    # new = io.BytesIO()
    # writer.write(new)
    # new.seek(0)
    # output_pdf = open('media/pdf/sample2.pdf', 'wb')
    r = PdfReader(buffer)
    y = PdfWriter()
    y.addpage(r.pages[0])
    with open('media/pdf/sample2.pdf', 'wb') as f:
        y.write(f)
    # writer.write(output_pdf)
    # output_pdf.close()
    return Response({'detail': _('Successfully confirmed email.')}, status=status.HTTP_201_CREATED)
def main():
    """Strip ResearchGate additions from a PDF given on the command line.

    Takes an input and an output file name. The first page (the ResearchGate
    cover sheet) is dropped, all annotations are removed from every page and
    the blue underlinings in the content streams are made invisible.
    """
    parser = argparse.ArgumentParser(description="Strip ResearchGate additions from a PDF")
    parser.add_argument("infile", metavar="input-filename", type=str, nargs=1,
                        help="PDF file to process")
    parser.add_argument("outfile", metavar="output-filename", type=str, nargs=1,
                        help="name for processed output file")
    args = parser.parse_args()

    # This regular expression matches the form of the ResearchGate
    # underlinings in the content streams. We match against a truncated form
    # of the distinctive RGB triplet because it's not always given with
    # the same accuracy.
    # "0.3333333333 0.6941176471 0.9607843137"
    # Operators may be separated by spaces or newlines, so every gap is
    # matched with \s+ and the gap before "S" is captured to be written back
    # unchanged.
    regex = re.compile(
        r"(0\.33333[0-9]+\s+0\.694117[0-9]+\s+0\.960784[0-9]+\s+RG\s+"
        r"\d+\.?\d*\s+w\s+"
        r"\d+\.?\d*\s+\d+\.?\d*\s+m\s+"
        r"\d+\.?\d*\s+\d+\.?\d*\s+)l(\s+)S")

    dict_pages = PdfReader(args.infile[0]).pages

    def fix_stream(contents):
        # Look for underlinings and make them invisible.
        if not hasattr(contents, "stream"):
            return
        s = contents.stream
        # We identify RG underlinings by their (hopefully unique)
        # RGB colour triplet.
        if s is not None and regex.search(s):
            # Minimal change: change the line draw commands to
            # moves, so no line is drawn. It would be more
            # satisfying to remove the stream entirely, but it's
            # simpler and safer to preserve the file structure
            # (in particular, the stream length) wherever possible.
            contents.stream = regex.sub(r"\1m\2S", s)

    for page in dict_pages:
        if "/Annots" in page:
            # Remove all annotations. This may of course cause some
            # collateral damage, but PDFs of articles don't usually have
            # annotations so probably this will just strip ResearchGate
            # links. If this becomes a problem, it should be easy to
            # identify RG annotations and remove only them.
            page.pop("/Annots")

        # There may be a stream in the Contents object and/or in its
        # children, so we check for both. A page without Contents yields
        # None, which must not be iterated.
        fix_stream(page.Contents)
        for contents in page.Contents or ():
            fix_stream(contents)

    writer = PdfWriter()
    # Start at the second page to remove the ResearchGate cover sheet.
    for page in dict_pages[1:]:
        writer.addpage(page)
    writer.write(args.outfile[0])
def run_stage(src, out):
    """Write selected pages of ``src`` to ``out`` based on its page labels.

    The document's ``PageLabels.Nums`` array is read as pairs. For every
    pair after the first, the page just before the pair's first value is
    copied; the last page of the document is appended at the end.
    """
    source = PdfReader(src)
    target = PdfWriter()

    page_total = len(source.pages)
    label_nums = source.Root.PageLabels.Nums
    pair_count = len(label_nums) // 2

    for pair_index in range(1, pair_count):
        range_start = int(label_nums[pair_index * 2])
        target.addpage(source.pages[range_start - 1])

    target.addpage(source.pages[page_total - 1])
    target.write(out)
def merge_attachment(self):
    """Merge the packing-list attachments of the active pickings into one PDF.

    For every active picking flagged with ``packing_list_bool`` the print
    flag is set and its attachments named ``Packing List<picking name>`` are
    collected. Each attachment page is scaled to 88% and appended to a
    single document, which is stored base64-encoded in a new ``view.report``
    record.

    Returns the window action that opens that record.
    """
    filename = 'Print Packing List.pdf'
    picking_obj = self.env['stock.picking']
    picking = picking_obj.browse(self._context.get('active_ids'))
    lst = []
    writer = PdfWriter()

    for pick in picking:
        if pick.packing_list_bool:
            pick.packing_list_print_bool = True
            ship_name = 'Packing List' "%s" % pick.name
            attachments = self.env['ir.attachment'].search([
                ('res_id', '=', pick.id), ('name', '=', ship_name)
            ])
            lst.extend(attachments)

    def get4_fedex(srcpages):
        # Shrink every page slightly and render the group as one page.
        scale = 0.88
        srcpages = PageMerge() + srcpages
        for page in srcpages:
            page.scale(scale)
        return srcpages.render()

    for pdf in lst:
        # b64decode replaces base64.decodestring, which was removed in
        # Python 3.9.
        pages = PdfReader(BytesIO(base64.b64decode(pdf.datas))).pages
        for page in pages:
            writer.addpage(get4_fedex([page]))

    s = BytesIO()
    writer.write(s)

    # Pass the result through the second PDF library before exporting it.
    reader = PdfFileReader(s)
    writer = PdfFileWriter()
    for page in range(0, reader.getNumPages()):
        p = reader.getPage(page)
        writer.addPage(p)
    s = BytesIO()
    writer.write(s)
    out = base64.b64encode(s.getvalue())

    view_report_status_id = self.env['view.report'].create({
        'file_name': out,
        'datas_fname': filename
    })
    return {
        'res_id': view_report_status_id.id,
        'name': 'Print Packing List',
        'view_type': 'form',
        'view_mode': 'form',
        'res_model': 'view.report',
        'view_id': False,
        'type': 'ir.actions.act_window',
    }
def resize_2_a4(infn):
    """Write an adjusted copy of ``infn`` next to it with an ``-A4.pdf`` suffix.

    The last four characters of ``infn`` are replaced by ``-A4.pdf``. The
    size of page 0 of ``A4.pdf`` and the parameters returned by
    ``get_scale_margin`` drive ``adjust`` for every page; the source Info
    dictionary (or an empty one) is copied to the output.
    """
    target_name = infn[:-4] + '-A4.pdf'
    source = PdfReader(infn)
    target = PdfWriter(target_name)

    reference_size = get_size('A4.pdf', 0)
    params = get_scale_margin(infn, reference_size, 0)

    for original_page in source.pages:
        target.addpage(adjust(original_page, params))

    target.trailer.Info = IndirectPdfDict(source.Info or {})
    target.write()
def test_cut(start, end):
    """Copy pages ``start`` to ``end - 1`` (0-based) of book.pdf and send them.

    The pages are written to ``pdfs/result.pdf`` and the file at
    ``'../pdfs/result.pdf'`` is returned through ``send_file``.
    """
    ipdf = PdfReader('book.pdf')
    opdf = PdfWriter()
    for i in range(start, end):
        opdf.addpage(ipdf.pages[i])
    opdf.write('pdfs/result.pdf')
    # The result used to be parsed again here through a different relative
    # path and then discarded; that read served no purpose and was removed.
    return send_file('../pdfs/result.pdf')
def rotate(path, bad_page):
    """Rotate one page of a PDF and write it out.

    The page at 0-based index ``bad_page`` gets its ``Rotate`` entry set to
    90 and is added to the output, which is written to ``path`` without its
    last four characters.
    """
    destination = path[:-4]
    source_pages = PdfReader(path).pages
    rotated = PdfWriter()

    for index, _ in enumerate(source_pages):
        if index != bad_page:
            continue
        source_pages[bad_page].Rotate = 90
        rotated.addpage(source_pages[bad_page])

    rotated.write(destination)
def appendPdfs(self, src, dst):
    """Combine the first pages of ``src`` and ``dst`` into ``result.pdf``.

    The first page of ``src`` comes first, followed by the first page of
    ``dst``. A progress marker is printed after each page is added.
    """
    from pdfrw import PdfReader, PdfWriter

    combined = PdfWriter()
    first_source = PdfReader(src)
    second_source = PdfReader(dst)

    for source, marker in ((first_source, "Janraj"), (second_source, "CJ")):
        combined.addpage(source.pages[0])
        print(marker)

    combined.write("result.pdf")
def rotate_odd(path, output):
    """Copy a PDF to ``output`` with every odd-indexed page rotated.

    Pages are counted from 0; those with an odd index get their ``Rotate``
    entry set to 90. All pages are written.
    """
    source_pages = PdfReader(path).pages
    result = PdfWriter()

    for index, current in enumerate(source_pages):
        if index % 2:
            current.Rotate = 90
        result.addpage(current)

    result.write(output)
def combine(inpfn, outfn, x, y, gap):
    """Build ``outfn`` from sheets that ``getPages`` assembles out of ``inpfn``.

    ``getPages`` is called with the list of remaining input pages plus
    ``x``, ``y`` and ``gap`` until that list is empty.
    NOTE(review): this presumes getPages removes the pages it consumes from
    the list; confirm against its definition.
    """
    # Read all pages from input file
    remaining_pages = PdfReader(inpfn).pages

    # Object to write output PDF
    sheet_writer = PdfWriter()

    while remaining_pages:
        sheet = getPages(remaining_pages, x, y, gap)
        sheet_writer.addpage(sheet)

    sheet_writer.write(outfn)
def rotate_all_page(path):
    """Write a copy of the PDF with every page's ``Rotate`` entry set to 90.

    The copy is stored next to the source: the last four characters of
    ``path`` are replaced by ``_converted.pdf``.

    Returns:
        The path of the written file.
    """
    output = path[:-4] + '_converted.pdf'
    reader = PdfReader(path)
    writer = PdfWriter()
    for page in reader.pages:
        page.Rotate = 90
        writer.addpage(page)
    writer.write(output)
    # The print that used to follow the return could never run and was dropped.
    return output
def split(path, number_of_pages, output):
    """Write the first ``number_of_pages`` pages of ``path`` to ``output``.

    Parameters:
        path : Path of the source PDF.
        number_of_pages : How many leading pages to copy. Values beyond the
            length of the document copy the whole document.
        output : Path of the PDF to create.

    Returns:
        None
    """
    pdf_obj = PdfReader(path)
    total_pages = len(pdf_obj.pages)
    writer = PdfWriter()
    # Valid indexes end at total_pages - 1; the former "<=" check let the
    # index total_pages through and raised IndexError.
    for page in range(min(number_of_pages, total_pages)):
        writer.addpage(pdf_obj.pages[page])
    writer.write(output)
def pdf(rm_files_path, path_original_pdf, path_annotated_pdf, path_oap_pdf):
    """Render a PDF with annotations.

    For every page number ``n`` of the original PDF, the file
    ``<rm_files_path>/<n>.rm`` (when present) is rendered with
    ``_render_rm_file`` and merged onto that page.

    Parameters:
        rm_files_path : Directory holding the per-page ``.rm`` files.
        path_original_pdf : Path of the PDF to annotate.
        path_annotated_pdf : Output path for the full annotated document.
        path_oap_pdf : Output path for the PDF which includes only the
            annotated pages.
    """
    # NOTE(review): closing the handle right away relies on pdfrw reading the
    # whole stream while the reader is constructed.
    with open(path_original_pdf, "rb") as original_file:
        base_pdf = PdfReader(original_file)

    # Parse remarkable files and write into pdf
    annotations_pdf = []
    for page_nr in range(base_pdf.numPages):
        rm_file_name = "%s/%d" % (rm_files_path, page_nr)
        rm_file = "%s.rm" % rm_file_name
        if not os.path.exists(rm_file):
            annotations_pdf.append(None)
            continue

        base_page = base_pdf.pages[page_nr]
        page_layout = base_page.MediaBox
        crop_box = base_page.CropBox
        if page_layout is None:
            page_layout = base_page.ArtBox

            if page_layout is None:
                # No usable page size: leave the page without annotations.
                annotations_pdf.append(None)
                continue

        image_width, image_height = float(page_layout[2]), float(page_layout[3])
        annotated_page = _render_rm_file(rm_file_name,
                                         image_width=image_width,
                                         image_height=image_height,
                                         crop_box=crop_box)
        if len(annotated_page.pages) <= 0:
            annotations_pdf.append(None)
        else:
            annotations_pdf.append(annotated_page.pages[0])

    # Merge annotations pdf and original pdf
    writer_full = PdfWriter()
    writer_oap = PdfWriter()
    for i in range(base_pdf.numPages):
        annotations_page = annotations_pdf[i]
        if annotations_page is not None:
            merger = PageMerge(base_pdf.pages[i])
            merger.add(annotations_page).render()
            writer_oap.addpage(base_pdf.pages[i])
        writer_full.addpage(base_pdf.pages[i])

    writer_full.write(path_annotated_pdf)
    writer_oap.write(path_oap_pdf)
def Add_Title_Page(input_file, title_file, output_file, page):
    """Prepend one page of ``title_file`` to ``input_file``.

    The page at 0-based index ``page`` of ``title_file`` is written first,
    followed by all pages of ``input_file``; the result goes to
    ``output_file``.
    """
    body_pages = PdfReader(input_file).pages
    combined = PdfWriter()
    title_page = PdfReader(title_file).pages[page]

    combined.addpage(title_page)
    combined.addpages(body_pages)
    combined.write(output_file)
def get(self, id):
    """Write the requested page or page range of teste.pdf to ``teste.pdffrag``.

    ``id`` is a 1-based page number or an inclusive range written as
    ``"first-last"`` (assumed to arrive as a string or integer from the
    request route; confirm against the caller).

    Raises:
        ValueError: ``id`` is not a number or a ``first-last`` range.
    """
    inpfn = 'teste.pdf'

    # The loop over ranges had been commented out, which left `onerange`
    # unassigned and made every call fail; parse the single request here.
    onerange = [int(part) for part in str(id).split('-')]
    onerange = (onerange + onerange[-1:])[:2]

    outfn = '%sfrag' % os.path.basename(inpfn)
    pages = PdfReader(inpfn).pages
    outdata = PdfWriter()
    for pagenum in range(onerange[0], onerange[1] + 1):
        outdata.addpage(pages[pagenum - 1])
    outdata.write(outfn)
    # pdfout = base64.encodestring(open(outfn,"rb").read())
    # self.write('<iframe src="data:application/pdf;base64,'+pdfout+'" style="position:fixed; top:0px; left:0px; bottom:0px; right:0px; width:100%; height:100%; border:none; margin:0; padding:0; overflow:hidden; z-index:999999;"/>')
So she did an 8.5x11" output with 0.5" margin all around (actual size of useful area 7.5x10") and we scaled it up by 4.8. We also copy the Info dict to the new PDF. ''' import sys import os from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict def adjust(page, margin=36, scale=4.8): info = PageMerge().add(page) x1, y1, x2, y2 = info.xobj_box viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin) page = PageMerge().add(page, viewrect=viewrect) page[0].scale(scale) return page.render() inpfn, = sys.argv[1:] outfn = 'poster.' + os.path.basename(inpfn) reader = PdfReader(inpfn) writer = PdfWriter(outfn) writer.addpage(adjust(reader.pages[0])) writer.trailer.Info = IndirectPdfDict(reader.Info or {}) writer.write()
import re
import sys
import os
from pdfrw import PdfReader, PdfWriter

loc_pages = "pages"
loc_books = "books"

# Store the human-coded page nums in simple txt files separated by commas
for f in os.listdir(loc_pages):
    with open(os.path.join(loc_pages, f)) as handle:
        data = handle.read().strip('\n')
    data = [int(p) for p in data.split(',')]
    # Function-call form: the bare print statement is a syntax error on
    # Python 3.
    print(data)

    # Corresponding book has same filename, diff extension
    path_book = os.path.splitext(f)[0] + ".pdf"
    path_book = os.path.join(loc_books, path_book)

    # Try to open it
    pages = PdfReader(path_book).pages
    out_data = PdfWriter()
    # Page numbers in the txt files are 1-based.
    for p in data:
        out_data.addpage(pages[p - 1])
    out_data.write('subset.%s' % os.path.basename(path_book))
args = parser.parse_args()

# The shuffling magic
even = PdfReader(args.evenFile[0])
odd = PdfReader(args.oddFile[0])
isEvenReversed = args.evenrev
isOddReversed = args.oddrev

all = PdfWriter()

# Bring both page lists into reading order first, so a single interleaving
# loop covers every combination of the two "reversed" flags.
odd_pages = list(odd.pages)
even_pages = list(even.pages)
if isOddReversed:
    odd_pages.reverse()
if isEvenReversed:
    even_pages.reverse()

# Alternate odd and even pages. Walking up to the longer list keeps an
# unpaired trailing page instead of dropping it or, through a negative index,
# repeating a page from the other file.
for i in range(max(len(odd_pages), len(even_pages))):
    if i < len(odd_pages):
        all.addpage(odd_pages[i])
    if i < len(even_pages):
        all.addpage(even_pages[i])

all.write(args.resultFile[0])
# /usr/bin/python
# coding=utf-8
from pdfrw import PdfReader, PdfWriter

# Path to the synopsis (dissertation abstract)
synopsis_path = '../synopsis.pdf'
# Path to the title pages of the GIA scientific report
# (must be two pages: the title page and a blank one)
gia_title_path = './gia_title.pdf'

synopsis = PdfReader(synopsis_path)
gia_title = PdfReader(gia_title_path)

sci_rep = PdfWriter()

# The first two synopsis pages are swapped for the report's title pages;
# every later page is taken over unchanged.
for index, synopsis_page in enumerate(synopsis.pages):
    chosen_page = gia_title.pages[index] if index < 2 else synopsis_page
    sci_rep.addpage(chosen_page)

# Save the result
sci_rep.write('./sci_rep.pdf')
def go(inpfn, outfn):
    """Build ``outfn`` from sheets that ``get4`` assembles out of ``inpfn``.

    The input is read without decompression and ``get4`` is called with the
    list of remaining pages until that list is empty.
    NOTE(review): this presumes get4 removes the pages it consumes from the
    list; confirm against its definition.
    """
    remaining_pages = PdfReader(inpfn, decompress=False).pages
    sheet_writer = PdfWriter()

    while remaining_pages:
        sheet = get4(remaining_pages)
        sheet_writer.addpage(sheet)

    sheet_writer.write(outfn)
page.AA = PdfDict() # You probably should just wrap each JS action with a try/catch, # because Chrome does no error reporting or even logging otherwise; # you just get a silent failure. page.AA.O = make_js_action(""" try { %s } catch (e) { app.alert(e.message); } """ % (script)) page.Annots = PdfArray(annots) return page if len(sys.argv) > 1: js_file = open(sys.argv[1], 'r') fields = [] for line in js_file: if not line.startswith('/// '): break pieces = line.split() params = [pieces[1]] + [float(token) for token in pieces[2:]] fields.append(make_field(*params)) js_file.seek(0) out = PdfWriter() out.addpage(make_page(fields, js_file.read())) out.write('result.pdf')
import os
import sys
from pdfrw import PdfReader, PdfWriter

if len(sys.argv) != 2:
    print("Usage: InvertOrder.py FILETOINVERT")
    # Exit with a non-zero status so calling scripts can detect the misuse.
    sys.exit(1)

filename = sys.argv[1]

# Copy the pages back to front.
output = PdfWriter()
for p in reversed(PdfReader(filename).pages):
    output.addpage(p)

# The result is stored next to the input with "_inv" before the extension.
fname, fext = os.path.splitext(filename)
outname = fname + "_inv" + fext
print("Writing output to "+outname)
output.write(outname)
var BALL_HEIGHT = %(BALL_HEIGHT)s; var BRICK_ROW_COUNT = %(BRICK_ROW_COUNT)s; var BRICK_COLUMN_COUNT = %(BRICK_COLUMN_COUNT)s; var BRICK_WIDTH = %(BRICK_WIDTH)s; var BRICK_HEIGHT = %(BRICK_HEIGHT)s; var BRICK_PADDING = %(BRICK_PADDING)s; var BRICK_OFFSET_BOTTOM = %(BRICK_OFFSET_BOTTOM)s; var BRICK_OFFSET_LEFT = %(BRICK_OFFSET_LEFT)s; %(script)s """ % locals()) page.Contents.stream = """ BT /F1 24 Tf 150 300 Td (Move your mouse down here!) Tj 40 -100 Td (also, README below...) Tj ET """ readme = PdfReader('README.pdf') out = PdfWriter() out.addpage(page) for readme_page in readme.pages: out.addpage(readme_page) out.write('breakout.pdf')
def write_async(self, outfile, process_semaphore, progress_cb=None):
    """Assemble the PDF for ``self._pages`` and write it to ``outfile``.

    This is a generator-based coroutine (it uses ``yield from``). One PDF
    page dictionary is created per entry of ``self._pages``; the pages are
    then filled in concurrently with their images, invisible text and link
    annotations. The document is written to a temporary file with pdfrw and
    post-processed by ``QPDF_CMD`` into ``outfile``.

    Parameters:
        outfile : Path of the PDF to create.
        process_semaphore : Passed on to the image helpers and to
            ``run_command_async``.
        progress_cb : Optional callable; it receives the fraction of
            finished pages each time a page is completed.
    """
    pdf_writer = PdfWriter(version="1.5")

    # Transparency group dictionary assigned to every page.
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency

    # Font resources assigned to every page that carries text.
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()

    # Register empty page objects first so links can point at any page.
    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes a internal copy of the pages
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # A mask is only requested for foregrounds without a color.
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem),
                get_pdf_background(psem),
                asyncio.gather(*[fg.pdf_image(psem)
                                 for fg in page.foreground]),
                asyncio.gather(*[get_pdf_mask(fg, psem)
                                 for fg in page.foreground])))
        pdf_page.MediaBox = PdfArray([0, 0,
                                      PdfNumber(page.width),
                                      PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        # Counter used to name the image XObjects Im0, Im1, ...
        im_index = 0
        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" +
                           "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        # Last fill color emitted; avoids repeating identical "rg" operators.
        current_color = None
        if page.color != self._factory.WHITE:
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            # Fill the unit rectangle with the page color.
            graphics += ("0 0 1 1 re " +
                         "f\n")

        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                # The mask is registered under its own name but is not drawn
                # by a "Do" operator here.
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            if (foreground.color is not None and
                    current_color != foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics + graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None
        # Text is emitted with font F1 and text rendering mode 3.
        before_text = ("BT\n" +
                       "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(),
                    PdfString().from_bytes(
                        t.text.encode("utf-16-be"), bytes_encoding="hex"))
            if t.external_link is not None or t.internal_link is not None:
                # Link annotation covering the rectangle of the text element.
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [PdfNumber(t.x), PdfNumber(t.y),
                                  PdfNumber(t.x + t.width),
                                  PdfNumber(t.y + t.height)]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page index, (x, y)).
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page,
                        PdfName.XYZ,
                        PdfNumber(target_x),
                        PdfNumber(target_y),
                        0]
        text = text.rstrip(" \n")
        if text:
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text + text + after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                pdf_contents.Filter = [PdfName.FlateDecode]
                # The stream is kept as text: the compressed bytes are
                # mapped one-to-one to characters through latin-1.
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"),
                    9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            progress_cb(finished_pages / len(self._pages))

    # Counter shared by all make_page coroutines (see nonlocal above).
    finished_pages = 0
    yield from asyncio.gather(
        *[make_page(page, pdf_page, process_semaphore)
          for page, pdf_page in zip(self._pages, pdf_pages)])

    # Write the raw document, then let the external tool produce outfile.
    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        cmd = [QPDF_CMD,
               "--stream-data=preserve",
               "--object-streams=preserve",
               "--normalize-content=n"]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([path.abspath(path.join(temp_dir, "temp.pdf")),
                    path.abspath(outfile)])
        yield from run_command_async(cmd, process_semaphore)
usage: 4up.py my.pdf Creates 4up.my.pdf with a single output page for every 4 input pages. """ import sys import os from pdfrw import PdfReader, PdfWriter, PageMerge def get4(srcpages): scale = 0.5 srcpages = PageMerge() + srcpages x_increment, y_increment = (scale * i for i in srcpages.xobj_box[2:]) for i, page in enumerate(srcpages): page.scale(scale) page.x = x_increment if i & 1 else 0 page.y = 0 if i & 2 else y_increment return srcpages.render() inpfn, = sys.argv[1:] outfn = "4up." + os.path.basename(inpfn) pages = PdfReader(inpfn).pages writer = PdfWriter() for index in range(0, len(pages), 4): writer.addpage(get4(pages[index : index + 4])) writer.write(outfn)
#!/usr/bin/env python

'''
usage:   subset.py my.pdf page[range] [page[range]] ...
         eg. subset.py my.pdf 1-3 5 7-9

Creates subset.my.pdf
'''

import sys
import os

from pdfrw import PdfReader, PdfWriter

inpfn = sys.argv[1]
ranges = sys.argv[2:]
# Validate explicitly: an assert disappears when Python runs with -O.
if not ranges:
    sys.exit("Expected at least one range")

# Each argument is a 1-based page "N" or an inclusive range "N-M".
ranges = ([int(y) for y in x.split('-')] for x in ranges)
outfn = 'subset.%s' % os.path.basename(inpfn)
pages = PdfReader(inpfn).pages
outdata = PdfWriter(outfn)

for onerange in ranges:
    # A single page N is treated as the range N-N.
    onerange = (onerange + onerange[-1:])[:2]
    for pagenum in range(onerange[0], onerange[1] + 1):
        outdata.addpage(pages[pagenum - 1])
outdata.write()