def remove_metadata_text(infile, ext):
    """Strip descriptive metadata from a PDF or DOCX file, in place.

    Args:
        infile: Path of the file to rewrite. The cleaned file is written
            back to this same path.
        ext: File extension without a leading dot (``"pdf"`` or ``"docx"``),
            compared case-insensitively. Any other value is a no-op.

    Returns:
        None.
    """
    kind = ext.lower()
    if kind == "pdf":
        # Replace the whole Info dictionary, then rewrite the file in place.
        trailer = PdfReader(infile)
        trailer.Info = {}
        PdfWriter(infile, trailer=trailer).write()
    elif kind == "docx":
        meta_fields = (
            "author", "category", "comments", "content_status",
            "identifier", "keywords", "language", "subject", "title",
        )
        document = Document(infile)
        properties = document.core_properties
        for meta_field in meta_fields:
            setattr(properties, meta_field, "")
        # The original re-opened the saved file into an unused local; that
        # extra parse had no effect on the result and has been dropped.
        document.save(str(infile))
def set_pdf_field_value(self, pdf_file, field, fieldValue):
    """Set one Info-dictionary field of a PDF and write a copy of the file.

    Args:
        pdf_file: Path of the PDF to read.
        field: Name of the Info field, with or without the leading ``/``
            (e.g. ``"/Title"`` or ``"Title"``).
        fieldValue: Value to store; it is converted with ``str()``.

    Returns:
        Path of the written copy (same base name as ``pdf_file``, placed in
        ``output_dir``), or ``pdf_file`` unchanged if anything failed. The
        failure is reported through ``Logger.printMessage``.
    """
    try:
        trailer = PdfReader(pdf_file)
        if trailer.Info is None:
            trailer.Info = pdfrw.objects.pdfdict.PdfDict()
        # Plain attribute assignment instead of exec() on a formatted source
        # string: values containing quotes or backslashes no longer break the
        # statement, and a crafted field/value can no longer run arbitrary
        # code.
        setattr(trailer.Info, field.replace('/', ''), str(fieldValue))
        new_file = os.path.join(output_dir, os.path.basename(pdf_file))
        PdfWriter(new_file, trailer=trailer).write()
        return new_file
    except Exception:
        # Best-effort: report and hand back the untouched input path.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        Logger.printMessage(pdf_file, is_error=True)
        return pdf_file
def remove_metadata_text(infile, ext):
    """Clear the metadata of a PDF or DOCX document, overwriting the file.

    Args:
        infile: Path of the document; the result is saved to the same path.
        ext: Extension without a leading dot, matched case-insensitively.
            ``"pdf"`` and ``"docx"`` are handled; anything else does nothing.

    Returns:
        None.
    """
    normalized_ext = ext.lower()
    if normalized_ext == "pdf":
        # Swap the Info dictionary for an empty one and rewrite in place.
        trailer = PdfReader(infile)
        trailer.Info = {}
        PdfWriter(infile, trailer=trailer).write()
    elif normalized_ext == "docx":
        meta_fields = [
            "author", "category", "comments", "content_status",
            "identifier", "keywords", "language", "subject", "title",
        ]
        document = Document(infile)
        properties = document.core_properties
        for meta_field in meta_fields:
            setattr(properties, meta_field, "")
        # Saving is the last step; the former re-open of the saved file only
        # bound an unused local and has been removed.
        document.save(str(infile))
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict

# Usage: [-o output.pdf] input.pdf underneath.pdf
#
# Merges each page of underneath.pdf beneath the corresponding page of
# input.pdf, copies Title/Author/Subject from underneath.pdf and writes the
# result to the output file (default: output.pdf).

argv = sys.argv[1:]

if '-o' in argv:
    flag_index = argv.index('-o')
    if flag_index + 1 >= len(argv):
        # Previously an IndexError with no explanation.
        sys.exit('error: -o requires an output file name')
    outfn = argv[flag_index + 1]
    del argv[flag_index:flag_index + 2]
else:
    outfn = 'output.pdf'

if len(argv) != 2:
    # Previously an unpacking ValueError with no explanation.
    sys.exit('usage: [-o output.pdf] input.pdf underneath.pdf')
inpfn, underfn = argv

under = PdfReader(underfn)
trailer = PdfReader(inpfn)

# zip() stops at the shorter document, so extra pages in either file are
# left unmerged.
for page, upage in zip(trailer.pages, under.pages):
    PageMerge(page).add(upage, prepend=1).render()

if trailer.Info is None:
    trailer.Info = IndirectPdfDict({})

# meta data comes from underneath.pdf
under_info = under.Info
if under_info is not None:
    # Guard added: underneath.pdf may have no Info dictionary at all, in
    # which case there is nothing to copy.
    trailer.Info.Title = under_info.Title
    trailer.Info.Author = under_info.Author
    trailer.Info.Subject = under_info.Subject

PdfWriter(outfn, trailer=trailer).write()
def refingerprint_pdf(filename, directory, copy_count, suffix):
    """Write ``copy_count`` copies of a PDF, each with a fresh random ID.

    The source file is read from ``/tmp/<directory>/<filename>``. Every copy
    gets a random ``randomMetaData`` Info entry and a new 32-character
    upper-case hex document ID, and is written next to the source file.
    A ``file_info.json`` summary is written to the same directory.

    Args:
        filename: Name of the source PDF inside the working directory. It is
            also used as the prefix of each generated file name.
        directory: Name of the working directory under ``/tmp``.
        copy_count: Number of copies to generate.
        suffix: Optional text inserted into each copy's file name; falsy
            values are ignored.

    Returns:
        None.

    Raises:
        Any exception raised while reading, writing or scheduling cleanup is
        propagated to the caller after cleanup has been requested.
    """
    base_dir = os.path.join('/tmp/', directory)
    try:
        base_file_path = os.path.join(base_dir, filename)
        # file size kb
        file_size = round(os.path.getsize(base_file_path) / 1024)

        content = PdfReader(base_file_path)
        if content.ID is None:
            file_id = 'No ID'
        else:
            # Drop the PDF string delimiters around the first ID entry.
            file_id = str(content.ID[0]).translate(
                str.maketrans('', '', '<>()'))
            # Bad file IDs can contain characters that cannot be encoded.
            # encode() raises UnicodeEncodeError on Python 3, so catch the
            # common base class rather than UnicodeDecodeError only.
            try:
                file_id.encode('utf-8')
            except UnicodeError:
                file_id = 'Unreadable'

        processed_files = []
        for copy_number in range(1, copy_count + 1):
            if suffix:
                save_filename = '{}-{}-{}.pdf'.format(
                    filename, suffix, copy_number)
            else:
                save_filename = '{}-{}.pdf'.format(filename, copy_number)
            file_path = os.path.join(base_dir, save_filename)
            download_link = '/fingerprinter/download/%s?file=%s' % (
                directory, save_filename)

            # Re-read the source for every copy so changes do not accumulate.
            content = PdfReader(base_file_path)

            # add some random meta data
            if not content.Info:
                content.Info = PDFInfo()
            content.Info.randomMetaData = binascii.b2a_hex(
                os.urandom(20)).upper()

            # change id to random id; an md5 hexdigest is always exactly
            # 32 characters, so no truncation or padding is needed.
            md = md5(filename.strip().encode('utf-8'))
            md.update(str(time.time()).encode('utf-8'))
            md.update(os.urandom(10))
            new_id = md.hexdigest().upper()
            content.ID = [new_id, new_id]

            PdfWriter(file_path, trailer=content).write()
            processed_files.append({
                'filename': save_filename,
                'download_link': download_link,
                'id': content.ID[0]
            })

        # save copy of info in file directory
        file_info = {
            'filename': filename,
            'size': file_size,
            'id': file_id,
            'directory_name': directory,
            'processed_files': processed_files
        }
        with open(os.path.join(base_dir, 'file_info.json'), 'w') as out_file:
            json.dump(file_info, out_file, indent=4)
    finally:
        # Request deletion of the generated files on success and on failure
        # alike (presumably an asynchronous task -- confirm against the
        # definition of delete_refingerprint). Any in-flight exception
        # continues to propagate with its original traceback.
        delete_refingerprint.delay(base_dir)
def cli(verbose, input, output):
    """
    input: input file or files
    output: output folder, will create if not found.
    """
    # NOTE(review): the docstring above is left untouched because it is
    # presumably rendered as the command's --help text by click -- confirm
    # against the decorators on this function.
    #
    # Prompts for a new metadata title for each matching PDF and writes the
    # retitled copy into the output folder under the same base name.
    if verbose:
        click.echo(f"Current args: {input} {output}")

    path = Path(input)
    folder = path.resolve()
    # '.' occurs in every '*.pdf' name, so a directory input selects all PDFs.
    file_name = '.'
    if path.is_file():
        folder = path.absolute().parent
        file_name = path.name
    if verbose:
        click.echo(f"Current path: {path} {folder} {path.name}")

    with os.scandir(folder) as entries:
        files = [
            entry.path for entry in entries
            if file_name in entry.name and entry.name.endswith('.pdf')
        ]
    if verbose:
        click.echo(f"Found {len(files)} files")

    out_path = os.path.realpath(output)
    # exist_ok covers the "created concurrently" race without errno checks.
    os.makedirs(out_path, exist_ok=True)

    for number, file in enumerate(files, start=1):
        # `file` is an absolute path; joining it directly would discard
        # out_path and overwrite the input file, so join the base name only.
        out_file = os.path.join(out_path, os.path.basename(file))
        trailer = PdfReader(file)
        if trailer.Info and trailer.Info.Title:
            click.echo(f'Current title: {trailer.Info.Title}')
        else:
            click.echo("Current file doesn't have an existing title")
        if not trailer.Info:
            # No Info dictionary yet: start from placeholder metadata.
            trailer.Info = IndirectPdfDict(
                Title='your title goes here',
                Author='Title change',
                Subject='This is a file with a changed title',
                Creator='Title Change 0.1',
            )
        trailer.Info.Title = click.prompt(
            f'Write the new metadata title for {file}', type=str)
        PdfWriter(out_file, trailer=trailer).write()
        if verbose:
            click.echo(
                f"Wrote {os.path.basename(file)}, {number}/{len(files)}")
    click.echo('Done!')
def _fill_pdf_metadata(out_file, issuer, issuer_address, column_fields, data,
                       global_columns, verify_issuer, conf,
                       interactive=False):
    """Embed issuer, metadata and owner information into a PDF's Info dict.

    The file at ``out_file`` is rewritten in place. When the row in ``data``
    carries both an owner public key and an owner address, the rewritten file
    is hashed (SHA-256), the hash is signed through the node proxy's
    ``signmessage`` call, and the signature is stored as ``owner_proof`` in a
    second rewrite.

    Args:
        out_file: Path of the PDF to update.
        issuer: Issuer name.
        issuer_address: Issuer address stored under ``identity.address``.
        column_fields: JSON text with a ``columns`` list, or a falsy value.
        data: Mapping of column names to values for this document.
        global_columns: JSON text with a ``fields`` list, or a falsy value.
        verify_issuer: JSON text whose ``methods`` entry is stored as the
            issuer's verification info.
        conf: Configuration object; ``testnet``, ``full_node_url``,
            ``full_node_rpc_user`` and ``full_node_rpc_password`` are read.
        interactive: When true, print a progress dot after processing.

    Returns:
        None.
    """
    schema_version = 2

    issuer_record = {
        "name": issuer,
        "identity": {
            "address": issuer_address,
            "verification": json.loads(verify_issuer)['methods']
        }
    }

    metadata = {}

    # Per-document fields: only columns that are present in `data`.
    if column_fields:
        for spec in json.loads(column_fields)['columns']:
            name = list(spec)[0]
            if name in data:
                props = spec[name]
                props['value'] = data[name]
                metadata[name] = props

    # Global fields are applied second, so they override column data.
    if global_columns:
        for spec in json.loads(global_columns)['fields']:
            name = list(spec)[0]
            metadata[name] = spec[name]

    # Special owner columns are handled explicitly.
    # TODO we should probably check if the public key is valid
    owner = None
    owner_address = None
    owner_pk = None
    has_owner = all(
        column in data and data[column]
        for column in ('__OWNER_PK__', '__OWNER_ADDRESS__')
    )
    if has_owner:
        # TODO maybe just calculate address from public key?
        owner_address = data['__OWNER_ADDRESS__']
        owner_pk = data['__OWNER_PK__']
        owner = {
            "name": data['__OWNER_NAME__'],
            # TODO needed? - can be derived
            "owner_address": data['__OWNER_ADDRESS__'],
            "pk": owner_pk
        }

    # Without an owner the field stays an empty string (not "null").
    owner_json = json.dumps(owner) if has_owner else ''
    pdf_metadata = PdfDict(version=schema_version,
                           issuer=json.dumps(issuer_record),
                           metadata=json.dumps(metadata),
                           owner=owner_json,
                           owner_proof='',
                           chainpoint_proof='')

    document = PdfReader(out_file)
    if document.Info:
        document.Info.update(pdf_metadata)
    else:
        document.Info = pdf_metadata
    PdfWriter().write(out_file, document)

    # If an owner exists, hash the written PDF, sign the hash with the node
    # and store the signature as owner_proof.
    if owner:
        with open(out_file, 'rb') as handle:
            sha256_hash = hashlib.sha256(handle.read()).hexdigest()

        setup('testnet' if conf.testnet else 'mainnet')

        # TODO: update when NodeProxy accepts full url!
        host, port = conf.full_node_url.split(':')
        proxy = NodeProxy(conf.full_node_rpc_user,
                          conf.full_node_rpc_password, host,
                          port).get_proxy()

        # Due to an old unresolved issue still pending in Bitcoin v0.20.0,
        # signmessage does not support signing with a bech32 key. Use the
        # public key to get the base58check encoding that signmessage
        # accepts.
        if owner_address.startswith(('bc', 'tb')):
            # NOTE: the address encoding changes here from bech32 to legacy;
            # take care if it is used again in this function.
            owner_address = PublicKey(owner_pk).get_address().to_string()

        signature = proxy.signmessage(owner_address, sha256_hash)

        # Second rewrite: add owner_proof to the metadata.
        proof_metadata = PdfDict(owner_proof=signature)
        document = PdfReader(out_file)
        document.Info.update(proof_metadata)
        PdfWriter().write(out_file, document)

    if interactive:
        # print progress
        print('.', end="", flush=True)