def copy_pdf_id(src, target):
    """
    Copies the PDF ID from src to target and rewrites target in place.
    Both arguments are given as file paths.
    """
    source_pdf = PdfReader(src)
    target_pdf = PdfReader(target)

    # Only the ID entry of the target is replaced; everything else that was
    # read from the target is written back unchanged.
    target_pdf.ID = source_pdf.ID

    writer = PdfWriter(target, trailer=target_pdf)
    writer.write()
def refingerprint_pdf(filename, directory, copy_count, suffix):
    """
    Writes ``copy_count`` re-fingerprinted copies of a PDF.

    The source file is read from ``/tmp/<directory>/<filename>``. Every copy
    gets a random ``randomMetaData`` Info entry and a fresh random
    32 character hex ID, and is saved in the same directory as
    ``<filename>[-<suffix>]-<n>.pdf``. A summary of the source file and of
    all copies is written to ``file_info.json`` in that directory.

    ``delete_refingerprint.delay`` is called with the working directory
    whether processing succeeded or failed. Any exception raised while
    processing is propagated to the caller with its original traceback.
    """
    # Resolved before the try block so the cleanup below can always refer
    # to it, even when the very first processing step fails.
    base_dir = os.path.join('/tmp/', directory)
    try:
        base_file_path = os.path.join(base_dir, filename)
        # file size in kb
        file_size = round(os.path.getsize(base_file_path) / 1024)

        content = PdfReader(base_file_path)
        if content.ID is None:
            file_id = 'No ID'
        else:
            file_id = str(content.ID[0])
            for bracket in '<>()':
                file_id = file_id.replace(bracket, '')
            # bad file_ids can contain strange characters; depending on the
            # Python version this surfaces as a decode or an encode error,
            # both of which are UnicodeError subclasses
            try:
                file_id.encode('utf-8')
            except UnicodeError:
                file_id = 'Unreadable'

        if suffix:
            name_prefix = filename + '-' + suffix
        else:
            name_prefix = filename
        # the file name part of the hash input is the same for every copy
        id_seed = filename.strip().encode('utf-8')

        processed_files = []
        for copy_number in range(1, copy_count + 1):
            save_filename = name_prefix + '-' + str(copy_number) + '.pdf'
            file_path = os.path.join(base_dir, save_filename)
            download_link = '/fingerprinter/download/%s?file=%s' % (
                directory, save_filename)

            # re-read the source so every copy starts from the original
            content = PdfReader(base_file_path)
            # add some random meta data
            content.Info.randomMetaData = binascii.b2a_hex(
                os.urandom(20)).upper()

            # change id to random id; an md5 hex digest is always exactly
            # 32 characters, so no trimming or padding is needed
            md = md5(id_seed)
            md.update(str(time.time()).encode('utf-8'))
            md.update(os.urandom(10))
            new_id = md.hexdigest().upper()
            content.ID = [new_id, new_id]

            PdfWriter(file_path, trailer=content).write()
            processed_files.append({
                'filename': save_filename,
                'download_link': download_link,
                'id': content.ID[0],
            })

        # save copy of info in file directory
        file_info = {
            'filename': filename,
            'size': file_size,
            'id': file_id,
            'directory_name': directory,
            'processed_files': processed_files,
        }
        info_path = os.path.join(base_dir, 'file_info.json')
        with open(info_path, 'w') as out_file:
            json.dump(file_info, out_file, indent=4)
    finally:
        # hand the working directory to the cleanup task on success and on
        # failure alike
        delete_refingerprint.delay(base_dir)
def fingerprinter_upload(request):
    """
    Handles a PDF upload and renders the re-fingerprinted copies of it.

    Reads the uploaded file from ``request.FILES['pdf-file']`` together with
    the optional POST fields ``copy-count`` (falls back to 1 when missing or
    not an integer) and ``file-suffix``. Every copy gets a random
    ``randomMetaData`` Info entry and a fresh random 32 character hex ID, is
    written to a new random directory below ``<STATIC_ROOT>/fingerprints``
    and is additionally copied to ``<STATIC_ROOT>/drop-pdf`` under a unique
    name.

    Returns the rendered ``refingerprint_results.html`` template.
    Raises Http404 when no file was uploaded.
    """
    pdf_file = request.FILES.get('pdf-file')
    if pdf_file is None:
        raise Http404('file not provided')

    suffix = request.POST.get('file-suffix', '')
    try:
        copy_count = int(request.POST.get('copy-count', 1))
    except (TypeError, ValueError):
        copy_count = 1

    # make save directory
    rand_path = randomword(9)
    fingerprint_dir = os.path.join(settings.BASE_DIR, settings.STATIC_ROOT,
                                   'fingerprints', rand_path)
    os.makedirs(fingerprint_dir)

    filename, extension = os.path.splitext(pdf_file.name)
    # handle non ascii chars in file name
    # (strangely only wsgi seems to choke on those)
    # `bytes` is `str` on Python 2, so this matches text (unicode) names on
    # both Python 2 and Python 3.
    if not isinstance(filename, bytes):
        try:
            filename = unidecode(filename)
        except Exception:
            filename = re.sub(r'[^\x00-\x7F]+', '.', filename)

    # Transliteration can introduce '/' (e.g. a fraction character) and the
    # suffix comes straight from the request, so strip path separators
    # before these values are used to build file system paths.
    filename = re.sub(r'[\\/]+', '-', filename)
    suffix = re.sub(r'[\\/]+', '-', suffix)

    file_content = pdf_file.read()
    content = PdfReader(io.BytesIO(file_content))
    if content.ID is None:
        file_id = 'No ID'
    else:
        file_id = str(content.ID[0])
        for bracket in '<>()':
            file_id = file_id.replace(bracket, '')
        # bad file_ids can contain strange characters; depending on the
        # Python version this surfaces as a decode or an encode error, both
        # of which are UnicodeError subclasses
        try:
            file_id.encode('utf-8')
        except UnicodeError:
            file_id = 'Unreadable'

    file_info = {
        'filename': pdf_file.name,
        'size': pdf_file.size,
        'id': file_id,
        'directory_name': rand_path,
    }

    if suffix:
        name_prefix = filename + '-' + suffix
    else:
        name_prefix = filename
    # hashlib needs bytes; the file name part is the same for every copy
    if isinstance(filename, bytes):
        id_seed = filename
    else:
        id_seed = filename.encode('utf-8')
    drop_pdf_dir = os.path.join(settings.BASE_DIR, settings.STATIC_ROOT,
                                'drop-pdf')

    processed_files = []
    for copy_index in range(copy_count):
        copy_label = str(copy_index + 1)
        save_filename = name_prefix + '-' + copy_label + extension
        file_path = os.path.join(fingerprint_dir, save_filename)

        # re-parse the upload so every copy starts from the original
        content = PdfReader(io.BytesIO(file_content))
        # add some random meta data
        content.Info.randomMetaData = binascii.b2a_hex(
            os.urandom(20)).upper()

        # change id to random id; an md5 hex digest is always exactly
        # 32 characters, so no trimming or padding is needed
        md = hashlib.md5(id_seed)
        md.update(str(time.time()).encode('utf-8'))
        md.update(os.urandom(10))
        new_id = md.hexdigest().upper()
        content.ID = [new_id, new_id]
        PdfWriter(file_path, trailer=content).write()

        # copy file into online annotator with unique name
        annotation_name = filename + '-' + suffix + '-' \
            + copy_label + '-' + rand_path + extension
        shutil.copy(file_path, os.path.join(drop_pdf_dir, annotation_name))

        # For some reason nested directories do not provide files from
        # static. We need to clean up the double "settings" file and sanify
        # the basic setup, but for now serve the file from a dedicated URL.
        processed_files.append({
            'filename': save_filename,
            'download_path': os.path.join(rand_path, save_filename),
            'docdrop_link': annotation_name,
            'id': content.ID[0],
        })

    data = {
        'processed_files': processed_files,
        'file_info': file_info,
        'archive_name': filename,
    }
    return render_to_response('refingerprint_results.html', data)