def fixpage(page, watermark):
    """Stamp *watermark* (a form XObject) on top of *page* and return the page.

    The page's content stream is wrapped in q/Q so the watermark is drawn
    in the page's original coordinate system.
    """
    # Find the page's resource dictionary. Create if none
    resources = page.inheritable.Resources
    if resources is None:
        resources = page.Resources = PdfDict()

    # Find or create the parent's xobject dictionary
    xobjdict = resources.XObject
    if xobjdict is None:
        xobjdict = resources.XObject = PdfDict()

    # Allow for an infinite number of cascaded watermarks:
    # probe /Watermark.0, /Watermark.1, ... until a free name is found.
    index = 0
    while 1:
        watermark_name = '/Watermark.%d' % index
        if watermark_name not in xobjdict:
            break
        index += 1
    xobjdict[watermark_name] = watermark

    # Turn the contents into an array if it is not already one
    contents = page.Contents
    if not isinstance(contents, PdfArray):
        contents = page.Contents = PdfArray([contents])

    # Save initial graphics state before executing the page's own content
    contents.insert(0, IndirectPdfDict(stream='q\n'))

    # Restore initial state, then paint the watermark XObject on top
    contents.append(IndirectPdfDict(stream='Q %s Do\n' % watermark_name))
    return page
def fixpage(page, count=[0]):
    """Rotate/duplicate *page* for booklet-style imposition and return it.

    NOTE: the mutable default ``count=[0]`` is deliberate here — it acts as a
    persistent call counter so alternating pages get different offsets.
    """
    count[0] += 1
    evenpage = not (count[0] & 1)

    # For demo purposes, just go with the MediaBox and toast the others
    box = [float(x) for x in page.MediaBox]
    assert box[0] == box[1] == 0, "demo won't work on this PDF"
    for key, value in sorted(page.iteritems()):
        if 'box' in key.lower():
            del page[key]

    # NOTE(review): `startsize` is computed but never used below — confirm intent.
    startsize = tuple(box[2:])
    # New page is rotated: width becomes the old height, height doubles the old width.
    finalsize = box[3], 2 * box[2]
    page.MediaBox = PdfArray((0, 0) + finalsize)
    page.Rotate = (int(page.Rotate or 0) + 90) % 360

    contents = page.Contents
    if contents is None:
        return page
    # Normalize a single content dict into a list (old-style and/or idiom kept as-is)
    contents = isinstance(contents, dict) and [contents] or contents

    # 90-degree rotation matrix, translated so the page lands on-sheet
    prefix = '0 1 -1 0 %s %s cm\n' % (finalsize[0], 0)
    if evenpage:
        # Even pages are shifted to the upper half of the doubled sheet
        prefix = '1 0 0 1 %s %s cm\n' % (0, finalsize[1] / 2) + prefix
    first_prefix = 'q\n-1 0 0 -1 %s %s cm\n' % finalsize + prefix
    second_prefix = '\nQ\n' + prefix
    first_prefix = IndirectPdfDict(stream=first_prefix)
    second_prefix = IndirectPdfDict(stream=second_prefix)

    # Draw the original content twice, each time with its own transform prefix
    contents = PdfArray(([second_prefix] + contents) * 2)
    contents[0] = first_prefix
    page.Contents = contents
    return page
def upscale(file_name, scale=1.5, margin_x=0, margin_y=0, suffix='scaled', tempdir=None):
    """Upscale a PDF to a larger size.

    :param str file_name: path of the source PDF.
    :param float scale: scale factor applied to every page.
    :param margin_x, margin_y: margins (points) cropped from each side first.
    :param str suffix: suffix appended to the output file name (ignored when
        *tempdir* is given).
    :param tempdir: directory for a temporary output file, or None.
    :returns str: path of the written output file.
    """
    def adjust(page):
        # Crop the margins, then scale the remaining view rectangle.
        info = PageMerge().add(page)
        x1, y1, x2, y2 = info.xobj_box
        viewrect = (margin_x, margin_y, x2 - x1 - 2 * margin_x, y2 - y1 - 2 * margin_y)
        page = PageMerge().add(page, viewrect=viewrect)
        page[0].scale(scale)
        return page.render()

    # Set output file name
    if tempdir:
        output = NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False).name
    elif suffix:
        output = os.path.join(os.path.dirname(file_name), add_suffix(file_name, suffix))
    else:
        # NOTE(review): this temp file is deleted as soon as the object is
        # garbage-collected, so the name may dangle — confirm intent.
        output = NamedTemporaryFile(suffix='.pdf').name

    reader = PdfReader(file_name)
    writer = PdfWriter(output)
    # Iterate the pages directly instead of the old index loop
    # (`for i in list(range(0, len(reader.pages)))`).
    for page in reader.pages:
        writer.addpage(adjust(page))
    writer.trailer.Info = IndirectPdfDict(reader.Info or {})
    writer.write()
    return output
def go(inpfn, outfn):
    """Adjust the single page of *inpfn* and write the result to *outfn*."""
    source = PdfReader(inpfn, decompress=False)
    # Tuple-unpack to assert the document has exactly one page.
    only_page, = source.pages
    out = PdfWriter()
    out.addpage(adjust(only_page))
    # Carry the source metadata over to the new file.
    out.trailer.Info = IndirectPdfDict(source.Info)
    out.write(outfn)
def fixpage(*pages):
    """Lay out *pages* side-by-side on a single new page and return it.

    Pages are converted to form XObjects and drawn left-to-right; the result
    page is as wide as the sum of the inputs and as tall as the tallest one.
    """
    pages = [pagexobj(x) for x in pages]

    # Small tuple subclass so we can attach a .stream attribute per entry.
    class PageStuff(tuple):
        pass

    x = y = 0
    for i, page in enumerate(pages):
        index = '/P%s' % i
        # Translate each page right by the accumulated width of its predecessors.
        shift_right = x and '1 0 0 1 %s 0 cm ' % x or ''
        stuff = PageStuff((index, page))
        stuff.stream = 'q %s%s Do Q\n' % (shift_right, index)
        x += page.BBox[2]
        y = max(y, page.BBox[3])
        pages[i] = stuff

    # Multiple copies of first page used as a placeholder to
    # get blank page on back.
    # NOTE(review): removing from `pages` while zip iterates it is delicate —
    # relies on zip's lazy indexing; confirm before restructuring.
    for p1, p2 in zip(pages, pages[1:]):
        if p1[1] is p2[1]:
            pages.remove(p1)

    return IndirectPdfDict(
        Type=PdfName.Page,
        Contents=PdfDict(stream=''.join(page.stream for page in pages)),
        MediaBox=PdfArray([0, 0, x, y]),
        Resources=PdfDict(XObject=PdfDict(pages), ),
    )
def pdfrw(self):
    """Adjust every page of ``self.file_name`` and write to ``self.output``."""
    reader = PdfReader(self.file_name)
    writer = PdfWriter(self.output)
    # Iterate pages directly instead of the old index loop
    # (`for i in list(range(0, len(reader.pages)))`).
    for page in reader.pages:
        writer.addpage(self._pdfrw_adjust(page))
    writer.trailer.Info = IndirectPdfDict(reader.Info or {})
    writer.write()
def make_cid_system_info_object():
    """Build the CIDSystemInfo dictionary for an embedded CID font.

    :returns PdfDict: CID System Info PdfDict object.
    """
    return IndirectPdfDict(
        Registry=PdfString('(Adobe)'),
        Ordering=PdfString('(UCS)'),
        Supplement=0,
    )
def resize_2_a4(infn):
    """Resize every page of *infn* to A4 and write '<name>-A4.pdf' next to it."""
    outfn = infn[:-4] + '-A4.pdf'
    # Work out the target geometry before touching the source document.
    a4_size = get_size('A4.pdf', 0)
    params = get_scale_margin(infn, a4_size, 0)
    source = PdfReader(infn)
    target = PdfWriter(outfn)
    for page in source.pages:
        target.addpage(adjust(page, params))
    target.trailer.Info = IndirectPdfDict(source.Info or {})
    target.write()
def make_font_file_object(tt_font):
    """Build an embedded font-file stream from the TrueType font itself.

    :param TrueTypeFont tt_font: utility wrapper around the font program.
    :returns PdfDict: font file PdfDict object stream.
    """
    # TODO: make subset font here
    with open(tt_font.ttfPath, 'rb') as fh:
        raw = fh.read()
    # pdfrw compresses streams itself; it wants text, hence Latin-1 decode.
    return IndirectPdfDict(stream=raw.decode('Latin-1'))
def concatenate(paths, output):
    """Concatenate the PDFs in *paths* into one file written to *output*."""
    writer = PdfWriter()
    for source in paths:
        writer.addpages(PdfReader(source).pages)
    writer.trailer.Info = IndirectPdfDict(
        Title='Combined PDF Title',
        Author='Michael Driscoll',
        Subject='PDF Combinations',
        Creator='The Concatenator',
    )
    writer.write(output)
def pdfrw(pdf_files, output):
    """Merge *pdf_files* into a single PDF at *output*; return its path."""
    writer = PdfWriter()
    for source in pdf_files:
        writer.addpages(PdfReader(source).pages)
    writer.trailer.Info = IndirectPdfDict(
        Title='HPA Design',
        Author='HPA Design',
        Subject='HPA Design',
        Creator='HPA Design',
    )
    writer.write(output)
    return output
def _pdf_samples(self, request, response, samples):
    """Render labels for every sample into *response* as one PDF."""
    writer = PdfWriter()
    for sample in samples:
        # Each sample contributes its own label page(s) to the writer.
        self._pdf_sample(writer, sample)
    writer.trailer.Info = IndirectPdfDict(
        Title="Sample Labels",
        Author=str(request.user),
        Subject="Sample Labels",
        Creator="Turtleweb",
    )
    writer.write(response)
def main(infiles, outfile, rows, cols, title, landscape):
    """Tile pages from *infiles* onto a rows x cols grid and return the writer.

    NOTE(review): `landscape` is accepted but never used in this body —
    confirm whether it should influence the grid layout.
    """
    pages = []
    for pdf in infiles:
        pages.extend(PdfReader(pdf).pages)
    pages_per_page = rows * cols
    pdf_writer = PdfWriter(outfile)
    # Consume the flat page list in grid-sized chunks.
    for page_group in grouper(pages_per_page, pages):
        pdf_writer.addpage(put_pages_on_grid(page_group, rows, cols))
    pdf_writer.trailer.Info = IndirectPdfDict(Title=title, Author="StaG-mwc", Creator=__file__)
    return pdf_writer
def make_to_unicode_object():
    """Build a ToUnicode CMap stream so readers can map CIDs to Unicode.

    :returns PdfDict: toUnicode CMAP PdfDict object.
    """
    # See section 9.10.3 ToUnicode CMaps of PDF 1.6 Spec
    # TODO: For now we put an empty (identity) mapping in.
    cmap_lines = (
        "/CIDInit /ProcSet findresource begin",
        "12 dict begin",
        "begincmap",
        "/CIDSystemInfo",
        "<</Registry (Adobe)",
        "/Ordering (UCS)",
        "/Supplement 0",
        ">> def",
        "/CMapName /Adobe-Identity-UCS def",
        "/CMapType 2 def",
        "1 begincodespacerange",
        "<0000> <FFFF>",
        "endcodespacerange",
        "1 beginbfrange",
        "<0000> <FFFF> <0000>",
        "endbfrange",
        "endcmap",
        "CMapName currentdict /CMap defineresource pop",
        "end",
        "end",
    )
    return IndirectPdfDict(stream='\n'.join(cmap_lines))
def make_cid_font_object(tt_font):
    """Build a CIDFontType2 dictionary, the descendant of a Type0 font.

    :param TrueTypeFont tt_font: utility wrapper around the font program.
    :returns PdfDict: CID Font Type 2 PdfDict object.
    """
    # Default width is rounded to an integer per the PDF spec expectations.
    default_width = int(round(tt_font.metrics.defaultWidth, 0))
    return IndirectPdfDict(
        Type=PdfName('Font'),
        Subtype=PdfName('CIDFontType2'),
        BaseFont=PdfName(tt_font.fontName),
        CIDSystemInfo=FreeText.make_cid_system_info_object(),
        FontDescriptor=FreeText.make_font_descriptor_object(tt_font),
        DW=default_width,
        Widths=PdfArray(tt_font.metrics.widths),
        CIDToGIDMap=FreeText.make_cid_to_gid_map_object(tt_font),
    )
def make_composite_font_object(font_file_path):
    """Build a Type0 composite font for the annotation's Resources dict.

    :param str font_file_path: path to the TrueType font to embed.
    :returns PdfDict: font PdfDict, ready for the Resources 'Font'
        subdictionary.
    """
    # TODO: Get font name from font program itself
    tt_font = get_true_type_font(font_file_path, DEFAULT_BASE_FONT)
    descendant = FreeText.make_cid_font_object(tt_font)
    return IndirectPdfDict(
        Type=PdfName('Font'),
        Subtype=PdfName('Type0'),
        BaseFont=PdfName(tt_font.fontName),
        Encoding=PdfName('Identity-H'),
        DescendantFonts=PdfArray([descendant]),
        ToUnicode=FreeText.make_to_unicode_object(),
    )
def update_and_move(self, targetdir: str, doctitle: str, tags: List[str], date: str):
    """Update metadata of pdf and move to target directory.

    Arguments:
        targetdir {str} -- Target directory where pdf shall be placed.
        doctitle {str} -- New document title of pdf.
        tags {List[str]} -- Keywords/tags which shall be added to pdf.
        date {str} -- Date which will be entered into pdf filename.
    """
    pdf = PdfReader(self.filepath)

    # Check for correct file ending (case-sensitive '.pdf' check)
    if doctitle[-4:] != ".pdf":
        filename = date + " " + doctitle + ".pdf"
    else:
        filename = date + " " + doctitle
        doctitle = doctitle[0:-4]

    # Check for unique filename: append "-<n>" until the name is free.
    # NOTE(review): the first while iteration re-substitutes "-1" before
    # incrementing, so "-1.pdf" is tested twice — harmless but redundant.
    n = 1
    if os.path.isfile(os.path.join(targetdir, filename)):
        filename = filename[0:-4] + "-" + str(n) + ".pdf"
        while os.path.isfile(os.path.join(targetdir, filename)):
            regex = re.compile(r"-\d{1,}.pdf", re.IGNORECASE)
            filename = regex.sub("-" + str(n) + ".pdf", filename)
            n = n + 1

    # pdf.Info.Keywords = tags
    # pdf.Info.Title = doctitle

    # Write data
    writer = PdfWriter()
    writer.addpages(pdf.pages)
    writer.trailer.Info = IndirectPdfDict(Title=doctitle, Keywords=tags)
    writer.write(os.path.join(targetdir, filename))

    # try to delete file ##
    try:
        os.remove(self.filepath)
    except OSError as e:
        # if failed, report it back to the user ##
        print("Error: %s - %s." % (e.filename, e.strerror))
def make_font_descriptor_object(tt_font):
    """Build a FontDescriptor with metrics calculated from the font.

    :param TrueTypeFont tt_font: utility wrapper around the font program.
    :returns PdfDict: Font Descriptor PdfDict object.
    """
    metrics = tt_font.metrics

    def _round_int(value):
        # All descriptor metrics are stored as rounded integers.
        return int(round(value, 0))

    return IndirectPdfDict(
        Type=PdfName('FontDescriptor'),
        FontName=PdfName(tt_font.fontName),
        Flags=metrics.flags,
        FontBBox=metrics.bbox,
        ItalicAngle=int(metrics.italicAngle),
        Ascent=_round_int(metrics.ascent),
        Descent=_round_int(metrics.descent),
        CapHeight=_round_int(metrics.capHeight),
        StemV=_round_int(metrics.stemV),
        MissingWidth=_round_int(metrics.defaultWidth),
        FontFile2=FreeText.make_font_file_object(tt_font))
def concatenate(input_paths, output_path, details=None):
    """Concatenate an ordered sequence of PDFs into one output file.

    Args:
        input_paths: A sequence of paths to pdf files.
        output_path: The desired path for the concatenated pdf.
        details: A dictionary of metadata values desired for the final pdf.
    """
    writer = PdfWriter()
    for source in input_paths:
        writer.addpages(PdfReader(source).pages)
    info = IndirectPdfDict()
    writer.trailer.Info = info
    if details is not None:
        # Each metadata key becomes a PdfName entry on the Info dict.
        for key, value in details.items():
            info[PdfName(key)] = value
    writer.write(output_path)
def make_cid_to_gid_map_object(tt_font): """Make a CID to GID map that is used to map character ids to glyph ids in the font. :param TrueTypeFont tt_font: Our utility class used to parse and calculate font metrics from a true type font. :returns PdfDict: CIDtoGID PdfDict object. """ # Let's make this as large as possibly addressable for now, it will compress nicely. mapping_size = 256 * 256 cid_to_gid_map = ["\x00"] * mapping_size * 2 for cc, glyph_name in tt_font.metrics.cmap.items(): # TODO: What is the expectation here since PDF only supports two bytes lookups? if cc >= mapping_size: continue glyph_id = tt_font.get_glyph_id(glyph_name) cid_to_gid_map[cc * 2] = chr(glyph_id >> 8) cid_to_gid_map[cc * 2 + 1] = chr(glyph_id & 0xFF) cid_to_gid_map = ''.join(cid_to_gid_map) # Let's let pdfrw handle the compressing of streams return IndirectPdfDict(stream=cid_to_gid_map)
def update_field(self, name, value):
    """Set the value of AcroForm field *name* and rebuild its appearance.

    Returns the updated field dictionary.
    """
    file_fields = self.file.Root.AcroForm.Fields
    field = file_fields[self.field_index(name)]
    rct = field.Rect
    height = round(float(rct[3]) - float(rct[1]), 2)
    width = round(float(rct[2]) - float(rct[0]), 2)
    xobj = IndirectPdfDict(
        BBox = [0, 0, width, height],
        FormType = 1,
        # Fixed: the PDF resource key is /ProcSet (was misspelled 'Prosec',
        # which produced a meaningless /Prosec entry in the output).
        Resources = PdfDict(ProcSet = [PdfName.PDF, PdfName.Text]),
        Subtype = PdfName.Form,
        Type = PdfName.XObject
    )
    # Appearance stream shown when the field is not focused
    xobj.stream = "/Tx BMC\nBT\n /Helvetica 8.0 Tf\n 1.0 5.0 Td\n 0 g\n (" + value + ") Tj\nET EMC"
    file_fields[self.field_index(name)].AP = PdfDict(N = xobj)
    # Value used when the field is focused (and as the stored field value)
    field.update(PdfDict(V=value))
    self.fields_info = self.read_fields()
    return field
def render(source, *, progress_cb=lambda x: None,
           expand_pages=True, template_alpha=0.3,
           only_annotated=False, black='black', white='white',
           gray=None, highlight=HIGHLIGHT_DEFAULT_COLOR):
    """Render a source document as a PDF file.

    source: The reMarkable document to be rendered.  This may be
        - A filename or pathlib.Path to a zip file containing the document,
          such as is provided by the Cloud API.
        - A filename or pathlib.Path to a root-level file from the document,
          such as might be copied off the device directly.
        - An object implementing the Source API.  See rmrl.sources for
          examples and further documentation.
    progress_cb: A function which will be called with a progress percentage
        between 0 and 100.  The first 50% indicate rendering the annotations,
        and the second the merging of these into the base PDF file.  If this
        callback raises an error, this function will abort gracefully and
        propagate the error up the stack.
    expand_pages: Boolean value (default True) indicating whether pages
        should be made larger, to reflect the view provided by the
        reMarkable device.
    template_alpha: Opacity of the template backgrounds in notebooks.  0
        makes the templates invisible, 1 makes them fully dark.
    only_annotated: Boolean value (default False) indicating whether only
        pages with annotations should be output.
    black: A string giving the color to use as "black" in the document.
        Can be a color name or a hex string.  Default: 'black'
    white: A string giving the color to use as "white" in the document.
        See `black` parameter for format.  Default: 'white'
    gray: A string giving the color to use as "gray" in the document.
        See `black` parameter for format.  Default: None, which means to
        pick an average between the "white" and "black" values.
    highlight: A string giving the color to use for the highlighter.
        See `black` parameter for format.
    """
    colors = parse_colors(black, white, gray, highlight)
    vector = True  # TODO: Different rendering styles
    source = sources.get_source(source)

    # If this is using a base PDF, the percentage is calculated
    # differently.
    uses_base_pdf = source.exists('{ID}.pdf')

    # Generate page information
    # If a PDF file was uploaded, but never opened, there may not be
    # a .content file. So, just load a barebones one with a 'pages'
    # key of zero length, so it doesn't break the rest of the
    # process.
    pages = []
    if source.exists('{ID}.content'):
        with source.open('{ID}.content', 'r') as f:
            pages = json.load(f).get('pages', [])

    # Render each page as a pdf
    tmpfh = tempfile.TemporaryFile()
    pdf_canvas = canvas.Canvas(tmpfh, (PDFWIDTH, PDFHEIGHT))
    # TODO: check pageCompression

    # Don't load all the pages into memory, because large notebooks
    # about 500 pages could use up to 3 GB of RAM. Create them by
    # iteration so they get released by garbage collector.
    changed_pages = []
    annotations = []
    for i in range(0, len(pages)):
        page = document.DocumentPage(source, pages[i], i, colors=colors)
        if source.exists(page.rmpath):
            changed_pages.append(i)
        page.render_to_painter(pdf_canvas, vector, template_alpha)
        annotations.append(page.get_grouped_annotations())
        progress_cb((i + 1) / len(pages) * 50)
    pdf_canvas.save()
    tmpfh.seek(0)

    # This new PDF represents just the notebook. If there was a
    # parent PDF, merge it now.
    if uses_base_pdf and not changed_pages:
        # Since there is no stroke data, just return the PDF data
        progress_cb(100)
        log.info('exported pdf')
        return source.open('{ID}.pdf', 'rb')

    # PDF exists, stroke data exists, so mix them together.
    if uses_base_pdf:
        rmpdfr = PdfReader(tmpfh)
        basepdfr = PdfReader(source.open('{ID}.pdf', 'rb'))
    else:
        basepdfr = PdfReader(tmpfh)
        # Alias, which is used for annotations and layers.
        rmpdfr = basepdfr

    # If making a 'layered' PDF (with optional content groups,
    # OCGs), associate the annotations with the layer.
    # This property list is put into the rmpdfr document, which
    # will not have any existing properties.
    ocgprop = IndirectPdfDict(OCGs=PdfArray(), D=PdfDict(Order=PdfArray()))

    for i in range(0, len(basepdfr.pages)):
        basepage = basepdfr.pages[i]
        rmpage = rmpdfr.pages[i]

        # Apply OCGs
        apply_ocg = False  # TODO configurable? bool(int(QSettings().value(
        # 'pane/notebooks/export_pdf_ocg')))
        if apply_ocg:
            ocgorderinner = do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations)
        else:
            ocgorderinner = None

        # Apply annotations to the rmpage. This must come after
        # applying OCGs, because the annotation may belong to
        # one of those groups.
        apply_annotations(rmpage, annotations[i], ocgorderinner)

        # If this is a normal notebook with highlighting,
        # just add the annotations and forget about the rest,
        # which are page geometry transformations.
        if uses_base_pdf:
            merge_pages(basepage, rmpage, i in changed_pages, expand_pages)

        progress_cb(((i + 1) / rmpdfr.numPages * 50) + 50)

    # Apply the OCG order. The basepdf may have already had OCGs
    # and so we must not overwrite them. NOTE: there are other
    # properties that ought to be carried over, but this is the
    # minimum required.
    if apply_ocg:
        if '/OCProperties' in basepdfr.Root:
            basepdfr.Root.OCProperties.OCGs += ocgprop.OCGs
            basepdfr.Root.OCProperties.D.Order += ocgprop.D.Order
        else:
            basepdfr.Root.OCProperties = ocgprop

    stream = tempfile.SpooledTemporaryFile(SPOOL_MAX)
    pdfw = PdfWriter(stream)
    if not only_annotated:
        # We are writing out everything, so we can take this shortcut:
        pdfw.write(trailer=basepdfr)
    else:
        for i, page in enumerate(basepdfr.pages):
            if i in changed_pages:
                pdfw.addpage(page)
        pdfw.write()
    stream.seek(0)
    log.info('exported pdf')
    return stream
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict

# Parse an optional "-o <outfile>" pair out of the argument list.
argv = sys.argv[1:]
if '-o' in argv:
    outfn = argv[argv.index('-o') + 1]
    # Remove the value first, then the flag, so indices stay valid.
    del argv[argv.index('-o') + 1]
    del argv[argv.index('-o')]
else:
    outfn = 'output.pdf'

# Remaining args: the main PDF and the PDF to place underneath each page.
inpfn, underfn = argv
under = PdfReader(underfn)
trailer = PdfReader(inpfn)
for page, upage in zip(trailer.pages, under.pages):
    # prepend=1 draws the underlay behind the existing page content.
    PageMerge(page).add(upage, prepend=1).render()
if trailer.Info is None:
    trailer.Info = IndirectPdfDict({})
# meta data comes from underneath.pdf
# NOTE(review): assumes under.Info is present — verify for PDFs without an Info dict.
trailer.Info.Title = under.Info.Title
trailer.Info.Author = under.Info.Author
trailer.Info.Subject = under.Info.Subject
PdfWriter(outfn, trailer=trailer).write()
So she did an 8.5x11" output with 0.5" margin all around (actual size
of useful area 7.5x10") and we scaled it up by 4.8.

We also copy the Info dict to the new PDF.

'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    # Crop *margin* points from each side of the page, then scale it up.
    info = PageMerge().add(page)
    x1, y1, x2, y2 = info.xobj_box
    viewrect = (margin, margin, x2 - x1 - 2 * margin, y2 - y1 - 2 * margin)
    page = PageMerge().add(page, viewrect=viewrect)
    page[0].scale(scale)
    return page.render()


# Single input filename expected on the command line.
inpfn, = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)
reader = PdfReader(inpfn)
writer = PdfWriter()
writer.addpage(adjust(reader.pages[0]))
# Preserve the source document's metadata in the output.
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write(outfn)
1) Concatenating multiple input PDFs.
2) adding metadata to the PDF.

If you do not need to add metadata, look at subset.py, which has
a simpler interface to PdfWriter.

'''

import sys
import os

import find_pdfrw
from pdfrw import PdfReader, PdfWriter, IndirectPdfDict

# All command-line arguments are input PDFs; at least one is required.
inputs = sys.argv[1:]
assert inputs
outfn = 'output.pdf'

writer = PdfWriter()
for inpfn in inputs:
    # decompress=False: streams are passed through untouched for speed.
    writer.addpages(PdfReader(inpfn, decompress=False).pages)

writer.trailer.Info = IndirectPdfDict(
    Title='your title goes here',
    Author='your name goes here',
    Subject='what is it all about?',
    Creator='some script goes here',
)
writer.write(outfn)
from pdfrw import PdfReader

"""
x = PdfReader('source/07922XXX2258-2017Apr13-2017May15.pdf')
print x.keys()
print x.Info
print x.Root.keys()
print len(x.pages)
print x.pages[0]
print x.pages[0].Contents
print x.pages[0].Contents.stream
"""

# writing pdfs
from pdfrw import PdfWriter
writer = PdfWriter()
# y.addpage(x.pages[0])
# y.write('out.pdf')
# NOTE(review): `pdf_filenames` is not defined anywhere in this visible
# code — it must be supplied earlier in the file or this will NameError.
for pdf_filename in pdf_filenames:
    writer.addpages(PdfReader(pdf_filename).pages)

from pdfrw import IndirectPdfDict
writer.trailer.Info = IndirectPdfDict(
    Title='pdf bundle',
    Author='Adobe',
    Subject='pdf',
    Creator='Adobe',
)
writer.write('out.pdf')
def process_pdf_file(inputFilename):
    """Scan *inputFilename* for known watermarks / promo pages, strip them,
    and rewrite the file in place (keeping a .bak of the original) when
    anything was removed.

    NOTE(review): the bare `except:` clauses below deliberately swallow
    malformed-PDF errors; confirm this best-effort behavior is intended.
    """
    try:
        print(f'Checking {inputFilename}')
        skip_this_page = False
        total_watermarks_skipped = 0
        try:
            reader = PdfReader(inputFilename)
        except:
            pass
        else:
            writer = PdfWriter()
            wm_width = 0
            page_count = 0
            counts = dict()
            sample_pages = []
            # Look through all pages for a potential primary watermark item
            if reader is not None:
                for idx, page in enumerate(reader.pages):
                    if '/Resources' in page and '/XObject' in page['/Resources']:
                        for xobj in page['/Resources']['/XObject']:
                            # Warning: Masks may or may not indicate WM presence
                            if '/Mask' in page['/Resources']['/XObject'][str(xobj)]:
                                if '/Width' in page['/Resources']['/XObject'][str(xobj)]:
                                    # Tally image widths: the watermark tends to
                                    # recur with one characteristic width.
                                    cur_width = int(page['/Resources']['/XObject'][str(xobj)]['/Width'])
                                    counts[cur_width] = counts.get(cur_width, 0) + 1
                                    sample_pages.append(idx + 1)
                    page_count += 1
                if counts:
                    # Most frequent masked-image width is the watermark candidate.
                    wm_width = max(counts, key=lambda key: counts[key])
                    if counts[wm_width] != page_count and len(counts) < 4:
                        # Ambiguous: candidate doesn't appear on every page — report
                        # and disable width-based removal for safety.
                        print('*' * 40)
                        print(f'* Potential watermarks found but only occurs in {counts[wm_width]} of {page_count} pages')
                        print(f'* Sample pages: {sample_pages[0:9]}')
                        counts = sorted(counts.items(), reverse=True, key=lambda x: x[1])
                        print(f'* {counts}')
                        print('*' * 40)
                        wm_width = 0

                # Process all pages removing prop pages and watermark objects
                for idx, page in enumerate(reader.pages):
                    skip_this_page = False

                    # ************** Prop Pages **************
                    # Google
                    try:
                        skip_this_page = 'google' in page['/Annots'][0]['/A']['/URI']
                    except:
                        pass

                    # HathiTrust
                    # Looks like this may need another method for checking for the existence of the JxCBE
                    if not skip_this_page:
                        try:
                            if idx == 0:
                                skip_this_page = '/JxCBE' in page['/Resources']['/XObject']['/CLC']['/Resources']['/XObject']
                        except:
                            pass
                    if not skip_this_page:
                        try:
                            if idx == 0:
                                skip_this_page = '/JxCBE' in page['/Resources']['/XObject']['/CCA']['/Resources']['/XObject']
                        except:
                            pass

                    # Internet Archive / Microsoft
                    if not skip_this_page:
                        try:
                            if idx == 2:
                                skip_this_page = page['/Resources']['/XObject']['/Im001']['/Length'] == '8420'
                        except:
                            pass

                    # ************** Watermarks **************
                    # Dump Google watermarks
                    if '/Resources' in page and '/XObject' in page['/Resources'] and '/Wm' in page['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject'].pop('/Wm')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page['/Resources']:
                        for xobj in page['/Resources']['/XObject']:
                            if '/Mask' in page['/Resources']['/XObject'][str(xobj)]:
                                if '/Width' in page['/Resources']['/XObject'][str(xobj)]:
                                    cur_width = int(page['/Resources']['/XObject'][str(xobj)]['/Width'])
                                    if cur_width == wm_width:
                                        junk = page['/Resources']['/XObject'].pop(str(xobj))
                                        total_watermarks_skipped += 1
                        for xobj in page['/Resources']['/XObject']:
                            if page['/Resources']['/XObject'][str(xobj)]['/Width'] == '156':
                                junk = page['/Resources']['/XObject'].pop(str(xobj))
                                total_watermarks_skipped += 1

                    # Dump HathiTrust watermarks
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                            '/CBJ' in page['/Resources']['/XObject'] and \
                            '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                            '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                            '/PxCBA' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject'].pop('/PxCBA')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                            '/CBJ' in page['/Resources']['/XObject'] and \
                            '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                            '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                            '/PxCBF' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject'].pop('/PxCBF')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                            '/CBJ' in page['/Resources']['/XObject'] and \
                            '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                            '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                            '/PxCBG' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject'].pop('/PxCBG')
                        total_watermarks_skipped += 1

                    # Add the page unless it's the prop page
                    if not skip_this_page and page.Contents is not None:
                        writer.addpage(page)

            # Copy and clean up the metadata (drop /Producer)
            if reader['/Info']:
                new_meta_dict = {}
                for info in reader['/Info']:
                    if '/Producer' not in info:
                        new_meta_dict[info] = reader['/Info'].get(info)
                writer.trailer.Info = IndirectPdfDict(new_meta_dict)

            # Write the new file if cleanup was necessary
            if total_watermarks_skipped or (len(reader.pages) != len(writer.pagearray)):
                filename, file_extension = os.path.splitext(inputFilename)
                os.rename(inputFilename, filename + '.bak')
                writer.write(inputFilename)
                if len(reader.pages) != len(writer.pagearray):
                    print(f' {len(reader.pages) - len(writer.pagearray)} pages deleted', )
                if total_watermarks_skipped:
                    print(f' {total_watermarks_skipped} page watermark references removed')
                print(f' Clean file written to {inputFilename}')
    except Exception as e:
        print('Exception: ', e)
from pdfrw import PdfReader, IndirectPdfDict, BookmarkedPdfWriter
from datetime import datetime

output = BookmarkedPdfWriter()
# Fixed: `xrange` is Python 2 only and raises NameError on Python 3;
# `range` behaves identically in this loop.
for i in range(3):
    totalPages = len(output.pagearray)
    output.addpages(
        PdfReader(
            'static_pdfs/global/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf').pages)
    bmname = 'Bm (%s) - %s' % (i + 1, 'Root')
    # Nested bookmark tree: root -> Child 1 -> Child 1.1
    t1 = output.addBookmark(bmname, totalPages)
    t2 = output.addBookmark("Child 1", totalPages + 1, t1)
    output.addBookmark("Child 1.1", totalPages + 2, t2)

# PDF date string format: D:YYYYMMDDHHmmSS
now = datetime.utcnow()
date = 'D:%04d%02d%02d%02d%02d%02d' % (now.year, now.month, now.day,
                                       now.hour, now.minute, now.second)
info = output.trailer.Info = IndirectPdfDict()
info.Title = 'Test PDF with Bookmarks'
info.Author = 'asdasd'
info.Creator = 'random dude'
info.Producer = 'another random dude'
info.CreationDate = date
output.write('result.pdf')
def cli(verbose, input, output):
    """
    input: input file or files
    output: output folder, will create if not found.
    """
    if verbose:
        click.echo(f"Current args: {input} {output}")
    path = Path(input)
    folder = path.resolve()
    file_name = '.'
    if path.is_file():
        folder = path.absolute().parent
        file_name = path.name
    if verbose:
        click.echo(f"Current path: {path} {folder} {path.name}")
    files = [
        entry.path for entry in os.scandir(folder)
        if file_name in entry.name and entry.name.endswith('.pdf')
    ]
    if verbose:
        click.echo(f"Found {len(files)} files")
    number = 1
    out_path = os.path.realpath(output)
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    for file in files:
        # Fixed: `file` is entry.path, which is absolute when `folder` is —
        # joining it directly discarded out_path and overwrote the source
        # file. Join only the basename into the output folder.
        out_file = os.path.join(out_path, os.path.basename(file))
        trailer = PdfReader(file)
        if trailer.Info and trailer.Info.Title:
            click.echo(f'Current title: {trailer.Info.Title}')
        else:
            click.echo("Current file doesn't have an existing title")
        if not trailer.Info:
            trailer.Info = IndirectPdfDict(
                Title='your title goes here',
                Author='Title change',
                Subject='This is a file with a changed title',
                Creator='Title Change 0.1',
            )
        trailer.Info.Title = click.prompt(
            f'Write the new metadata title for {file}', type=str)
        PdfWriter(out_file, trailer=trailer).write()
        if verbose:
            click.echo(
                f"Wrote {os.path.basename(file)}, {number}/{len(files)}")
        number += 1
    click.echo('Done!')
warn(f"O DRE {dre} está presente na pauta, mas não foi " f"encontrado no lote! Ele vai ficar sem prova!") continue this_min = min(int(file.stem) for file in dre_to_pages_map[dre]) this_max = max(int(file.stem) for file in dre_to_pages_map[dre]) if prev_max is not None: assert this_min == prev_max + 1 prev_max = this_max print() print("> Gerando os arquivos finais...", end='') for dre in dre_to_pages_map: prova_writer = PdfWriter() for filename in dre_to_pages_map[dre]: prova_writer.addpages(PdfReader(filename).pages) prova_writer.trailer.Info = IndirectPdfDict( Title=f"P1 AlgLin 2020 PLE: {dre}") prova_writer.write(os.fspath(final_dir / f"{dre}.pdf")) print() provas_dir = (pathlib.Path() / args.PROVAS_DIR).resolve() print(f"> Colocando as provas no diretório {provas_dir} ...", end='') provas_dir.mkdir() for filename in final_dir.iterdir(): shutil.copyfile(os.fspath(filename), os.fspath(provas_dir / filename.name)) print() print("> Gerando o zip...", end='') shutil.make_archive(os.fspath(provas_dir), "zip", root_dir=os.fspath(provas_dir.parent),