def MergePDF(self, params):
    """Merge numbered PDF files into a single output PDF.

    The inputs are ``params['filepath'] + '<i>.pdf'`` for every ``i`` in
    ``range(params['fileCount'])``.  The merged document is written to
    ``params['filepath'] + params['outfile']``.

    Args:
        params: mapping providing ``'filepath'`` (path prefix),
            ``'fileCount'`` (number of numbered input files) and
            ``'outfile'`` (name appended to the prefix for the result).

    Returns:
        None.  Progress is reported on stdout.
    """
    pdf_fileName = self.getFileName(params['filepath'])
    if len(pdf_fileName) < 1:
        print('there is not any files')
        return

    output = PdfFileWriter()
    outputPages = 0
    # pyPdf resolves page content lazily, so every source file must stay open
    # until output.write() has finished; they are all closed in the finally.
    sources = []
    try:
        for i in range(0, params['fileCount']):
            filename = params['filepath'] + str(i) + '.pdf'
            print('*********************%s************************' % i)

            # Read the source PDF file.
            source = open(filename, "rb")
            sources.append(source)
            reader = PdfFileReader(source)

            # An encrypted PDF must be decrypted before pyPdf can use it.
            if reader.isEncrypted:
                reader.decrypt("map")

            # Total number of pages in this source file.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print(pageCount)

            # Add each page to the output document.
            for iPage in range(0, pageCount):
                output.addPage(reader.getPage(iPage))

        print("All Pages Number:" + str(outputPages))

        # Finally write the merged PDF.
        filePath = params['filepath'] + params['outfile']
        with open(filePath, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for source in sources:
            source.close()
    print("finished")
def render(self):
    """Assemble the final PDF and return it as a rewound temporary file.

    The document is built from: page 0 of the PDF produced by
    ``self.generate_page_one()`` merged with page 0 of ``lib/kfza_base.pdf``,
    followed by every page of ``lib/<self.base_pdf>``, and, when
    ``self.context.course.extra_questions`` is truthy, page 1 of a freshly
    generated PDF merged with the same base page.

    Returns:
        A ``TemporaryFile`` positioned at offset 0 containing the PDF.
    """
    output = PdfFileWriter()
    base1_path = "%s/lib/%s" % (path.dirname(__file__), "kfza_base.pdf")
    bpdf = "%s/lib/%s" % (path.dirname(__file__), self.base_pdf)

    # Both source files stay open until output.write() has run: pyPdf reads
    # page content lazily, so writing after the sources are closed fails.
    with open(base1_path, 'rb') as base1:
        with open(bpdf, 'rb') as pdf:
            b1_pdf = PdfFileReader(base1)
            wm = b1_pdf.getPage(0)
            p1 = PdfFileReader(self.generate_page_one())
            page1 = p1.getPage(0)
            page1.mergePage(wm)
            output.addPage(page1)

            pf = PdfFileReader(pdf)
            if pf.isEncrypted:
                pf.decrypt('')
            for page in range(pf.getNumPages()):
                output.addPage(pf.getPage(page))

            if self.context.course.extra_questions:
                b1_pdf = PdfFileReader(base1)
                wm = b1_pdf.getPage(0)
                p1 = PdfFileReader(self.generate_page_one())
                page1 = p1.getPage(1)
                page1.mergePage(wm)
                output.addPage(page1)

            ntf = TemporaryFile()
            output.write(ntf)
    ntf.seek(0)
    return ntf
def MergePDF(filepath, outfile):
    """Merge the PDF files listed by ``getFileName(filepath)`` into one PDF.

    Entries whose name does not contain ``".pdf"`` are skipped.  The result
    is written to ``filepath + outfile``.

    Args:
        filepath: value handed to ``getFileName`` and used as the prefix of
            the output path.
        outfile: name appended to ``filepath`` for the merged document.
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)

    # pyPdf reads page content lazily, so the sources must remain open until
    # the merged document has been written; they are closed in the finally.
    sources = []
    try:
        for each in pdf_fileName:
            print("file:" + each)
            if each.find(".pdf") < 0:
                continue

            # Read the source PDF file.
            source = open(each, "rb")
            sources.append(source)
            reader = PdfFileReader(source)

            # An encrypted PDF must be decrypted before pyPdf can use it.
            if reader.isEncrypted:
                reader.decrypt("map")

            # Total number of pages in this source file.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print(pageCount)

            # Add each page to the output document.
            for iPage in range(0, pageCount):
                output.addPage(reader.getPage(iPage))

        print("All Pages Number:" + str(outputPages))

        # Finally write the merged PDF.
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for source in sources:
            source.close()
    print("finished")
def _get_images_from_pdf(pdf_filename, resolution, verbose, delete_files,
                         temp_dir, make_thumbs, thumb_size, thumb_dir,
                         thumb_prefix, pool_count=1):
    """Split a PDF into one-page PDFs and dispatch workers to convert them.

    Each page is written to ``<temp_dir>/document-page-<i>.pdf`` and its
    index is put on the module-level ``__pdf_queue``.  ``pool_count`` tasks
    running ``_pdf_converter_worker`` are then submitted, and the function
    polls ``__pdf_texts`` until it holds one entry per page.

    Args:
        pdf_filename: path of the PDF to split.
        resolution, delete_files, make_thumbs, thumb_size, thumb_dir,
            thumb_prefix: forwarded unchanged to ``_pdf_converter_worker``.
        verbose: when truthy, progress messages are printed.
        temp_dir: directory for the per-page PDFs (created if missing).
        pool_count: number of worker tasks submitted to the pool.

    Returns:
        True when every page was converted, False if an exception occurred
        (the exception text is printed).
    """
    success = False
    try:
        if verbose:
            print("Splitting PDF into multiple pdf's for processing ...")

        # Make sure there is a place to put our temporary pdfs.
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)

        # If thumbnails are requested, make sure their folder exists.
        if make_thumbs:
            if not os.path.exists(thumb_dir):
                os.makedirs(thumb_dir)

        # Read the input pdf; the handle is closed once all pages are split.
        with open(pdf_filename, "rb") as source:
            inputpdf = PdfFileReader(source)
            if inputpdf.getIsEncrypted():
                inputpdf.decrypt('')
            page_count = inputpdf.numPages

            if verbose:
                print("Writing out %i pages ..." % page_count)

            # Create all of the temporary single-page pdfs.
            for i in range(page_count):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                filename = "{0}/document-page-{1}.pdf".format(temp_dir, i)
                with open(filename, "wb") as outputStream:
                    output.write(outputStream)
                __pdf_queue.put(i)

        if verbose:
            print("Dispatching pdf workers ...")

        # Spin up our workers to convert the pdfs to images.
        pool = Pool()
        try:
            pool.map_async(
                _pdf_converter_worker,
                [(x, resolution, verbose, delete_files, temp_dir,
                  make_thumbs, thumb_size, thumb_dir, thumb_prefix)
                 for x in range(pool_count)]
            )

            # NOTE(review): this wait never ends if a worker dies without
            # publishing its result -- confirm whether a timeout is wanted.
            while __pdf_texts.qsize() != page_count:
                time.sleep(.25)
        finally:
            # No further tasks are submitted; let the workers wind down.
            pool.close()

        if verbose:
            print("Done converting PDF.")

        success = True
    except Exception as e:
        print(str(e))
    # The flag was computed but never handed back to the caller before.
    return success
def merge_pdf(new_filename, pdfs, encryp=False, user_pw="", owner_pw=None,
              lvl=128):
    """Merge several PDFs into one PDF called ``new_filename``.

    Args:
        new_filename: path of the merged PDF to create.
        pdfs: iterable of ``(path, password)`` tuples; the password is only
            used when the corresponding file is encrypted.
        encryp: when truthy, the merged PDF is encrypted.
        user_pw: user password for the merged PDF.
        owner_pw: owner password for the merged PDF.
        lvl: ``128`` selects 128-bit encryption, any other value selects the
            weaker mode offered by ``PdfFileWriter.encrypt``.

    Raises:
        PasswordError: if an encrypted input rejects its password.
    """
    output = PdfFileWriter()
    # Sources stay open until the merged file is written because pyPdf reads
    # page content lazily; they are closed in the finally clause.
    sources = []
    try:
        for path, pw in pdfs:
            source = open(path, "rb")
            sources.append(source)
            pdf = PdfFileReader(source)
            if pdf.isEncrypted:
                if pdf.decrypt(pw) == 0:
                    raise PasswordError
            for page_num in range(pdf.getNumPages()):
                output.addPage(pdf.getPage(page_num))

        if encryp:
            output.encrypt(user_pw, owner_pw, lvl == 128)

        # Open the destination exactly once; the previous code re-opened it
        # inside the with-block and leaked the second handle.
        with open(new_filename, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for source in sources:
            source.close()
def OCR(self, fn, resolution=300, verbose=False, part=''):
    """Run OCR over every page of a PDF and collect the recognised text.

    Each page is written to a temporary one-page PDF, rendered to JPEG with
    PythonMagick at ``resolution``, and passed to ``image_to_string``.

    Args:
        fn: path of the PDF to process.
        resolution: density handed to PythonMagick for rendering.
        verbose: when truthy, the current page number is printed.
        part: tag embedded in the temporary file names
            (``tmp-<part>-<page>.pdf`` / ``.jpg``).

    Returns:
        An ``OCRResult`` built from the full text, its cleaned form, the page
        count and the per-page ``OCRPage`` list; ``False`` if the PDF is
        encrypted and cannot be opened with an empty password.
    """
    part = str(part)
    pagedata = []
    texts = []

    with open(fn, 'rb') as source:
        pdf = PdfFileReader(source)
        if pdf.getIsEncrypted() and not pdf.decrypt(''):
            return False

        for i, p in enumerate(pdf.pages, 1):
            if verbose:
                print(' --- ' + str(i))

            # Temporary filenames for ImageMagick conversion
            pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
            pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

            try:
                # Write this page out as its own PDF
                output = PdfFileWriter()
                output.addPage(p)
                with open(pgfile, 'wb') as outputStream:
                    output.write(outputStream)

                # Convert this page to a high-resolution JPEG
                img = PythonMagick.Image()
                img.density(str(resolution))
                img.read(pgfile)
                img.write(pgfilejpg)

                # OCR the converted JPG; a fourth band is dropped first
                im = Image.open(pgfilejpg)
                if len(im.split()) == 4:
                    r, g, b, a = im.split()
                    im = Image.merge('RGB', (r, g, b))
                t = image_to_string(im)
            finally:
                # Cleanup, also when a conversion step failed
                for leftover in (pgfile, pgfilejpg):
                    if os.path.exists(leftover):
                        os.remove(leftover)

            # Add to data object
            pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
            texts.append(t)

    # Produce the output data object
    text = ''.join(texts)
    return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
def OCR(self, fn, resolution=300, verbose=False, part=''):
    """Run OCR over every page of a PDF and collect the recognised text.

    Each page is written to a temporary one-page PDF, rendered to JPEG with
    PythonMagick at ``resolution``, and passed to ``image_to_string``.

    Args:
        fn: path of the PDF to process.
        resolution: density handed to PythonMagick for rendering.
        verbose: when truthy, the current page number is printed.
        part: tag embedded in the temporary file names
            (``tmp-<part>-<page>.pdf`` / ``.jpg``).

    Returns:
        An ``OCRResult`` built from the full text, its cleaned form, the page
        count and the per-page ``OCRPage`` list; ``False`` if the PDF is
        encrypted and cannot be opened with an empty password.
    """
    part = str(part)
    pagedata = []
    texts = []

    with open(fn, 'rb') as source:
        pdf = PdfFileReader(source)
        if pdf.getIsEncrypted() and not pdf.decrypt(''):
            return False

        for i, p in enumerate(pdf.pages, 1):
            if verbose:
                print(' --- ' + str(i))

            # Temporary filenames for ImageMagick conversion
            pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
            pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

            try:
                # Write this page out as its own PDF
                output = PdfFileWriter()
                output.addPage(p)
                with open(pgfile, 'wb') as outputStream:
                    output.write(outputStream)

                # Convert this page to a high-resolution JPEG
                img = PythonMagick.Image()
                img.density(str(resolution))
                img.read(pgfile)
                img.write(pgfilejpg)

                # OCR the converted JPG; a fourth band is dropped first
                im = Image.open(pgfilejpg)
                if len(im.split()) == 4:
                    r, g, b, a = im.split()
                    im = Image.merge('RGB', (r, g, b))
                t = image_to_string(im)
            finally:
                # Cleanup, also when a conversion step failed
                for leftover in (pgfile, pgfilejpg):
                    if os.path.exists(leftover):
                        os.remove(leftover)

            # Add to data object
            pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
            texts.append(t)

    # Produce the output data object
    text = ''.join(texts)
    return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
def download_pdf(url, dest_dir='/home/hjiang/pmscrapy/pdf_folder'):
    """Download a PDF and re-save all of its pages under ``dest_dir``.

    The remote file is read fully into memory, decrypted with an empty
    password when it is encrypted, and rewritten page by page to
    ``<dest_dir>/<basename(url)>``.

    Args:
        url: address of the PDF to fetch.
        dest_dir: directory receiving the saved copy; defaults to the
            location that was previously hard-coded.
    """
    response = urlopen(Request(url))
    try:
        remoteFile = response.read()
    finally:
        response.close()

    pdfFile = PdfFileReader(StringIO(remoteFile))
    if pdfFile.isEncrypted:
        pdfFile.decrypt('')

    writer = PdfFileWriter()
    for pageNum in range(pdfFile.getNumPages()):
        writer.addPage(pdfFile.getPage(pageNum))

    # The with-statement closes the output even if writing fails.
    with open('%s/%s' % (dest_dir, basename(url)), "wb") as outputStream:
        writer.write(outputStream)
def _get_pdf_reader(self, pdf_stream):
    """Wrap ``pdf_stream`` in a ``PdfFileReader``, unlocking it if needed.

    An encrypted document is decrypted with the empty password.

    Raises:
        ValueError: if decryption with the empty password fails (the failure
            is also logged on ``self._logger``).
    """
    reader = PdfFileReader(pdf_stream)
    if not reader.isEncrypted:
        return reader

    if reader.decrypt("") == 0:
        self._logger.error("Failed to decrypt PDF file.")
        raise ValueError('Failed to decrypt PDF file.')
    return reader
def read_pdf(filename):
    """Open a PDF file with pyPdf.

    For an encrypted file the user is prompted (via ``prompt_for_pw``) until
    a password is accepted.

    Raises:
        CommandError: if ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        raise CommandError("{} does not exist".format(filename))

    # The handle is intentionally left open: the returned reader uses it.
    reader = PdfFileReader(open(filename, "rb"))
    if reader.isEncrypted:
        # Keep asking until decrypt() reports a match.
        while not reader.decrypt(prompt_for_pw(filename)):
            print("The password did not match.")
    return reader
def merge_vac(icao_code_list, directory, options):
    """
    Merges pages corresponding to icao_code_list in directory to a single
    pdf file.

    Args:
        icao_code_list: ICAO codes; ``<directory>/<code>.pdf`` is read for
            each of them.
        directory: folder holding the inputs and receiving the outputs.
        options: object providing ``merge_better`` (skip the first page of
            every chart when truthy) and ``a5`` (additionally run ``pdfnup``
            on the merged file when truthy).
    """
    from pyPdf import PdfFileWriter, PdfFileReader

    vac_all_file = "%s/%s.pdf" % (directory, VAC_ALL_FILE)
    vac_a5_file = "%s/%s.a5.pdf" % (directory, VAC_ALL_FILE)
    start_page = 1 if options.merge_better else 0

    info("Merging all VAC charts into one pdf file (%s)..." % vac_all_file,
         options)

    out_pdf = PdfFileWriter()
    # Inputs stay open until the merged file is written (pyPdf reads page
    # content lazily) and are closed in the finally clause.
    in_files = []
    try:
        for icao_code in icao_code_list:
            in_file = open("%s/%s.pdf" % (directory, icao_code), "rb")
            in_files.append(in_file)
            in_pdf = PdfFileReader(in_file)
            # Only decrypt files that are actually encrypted.
            if in_pdf.isEncrypted:
                in_pdf.decrypt("")
            for i in range(start_page, in_pdf.numPages):
                out_pdf.addPage(in_pdf.getPage(i))

        with open(vac_all_file, "wb") as out_file:
            out_pdf.write(out_file)
    finally:
        for in_file in in_files:
            in_file.close()

    if options.a5:
        info("Converting merged file to 2x1 A5 in A4 format (%s)..." % vac_a5_file,
             options)
        # NOTE(review): the paths are interpolated into a shell command
        # unquoted; a directory containing spaces or shell metacharacters
        # will break it -- confirm where `directory` comes from.
        (status, output) = commands.getstatusoutput(
            "pdfnup %s --outfile %s" % (vac_all_file, vac_a5_file))
        if status != 0:
            fail("Failed to convert merged file to 2x1 A5 in A4 format.\n%s" % output)
def decrypt(pdf_path, out_path, password):
    """Write a decrypted copy of ``pdf_path`` to ``out_path``.

    Title and author are carried over from the source document; a missing
    value is replaced by the translated string ``'Unknown'``.

    Args:
        pdf_path: path of the encrypted PDF.
        out_path: path of the decrypted copy to create.
        password: password for the source; converted with ``str()``.

    Raises:
        DecryptionError: if the password is rejected.
    """
    # The source must remain open while the copy is written because pyPdf
    # reads page content lazily; the with-statement closes it afterwards.
    with open(os.path.abspath(pdf_path), 'rb') as in_file:
        pdf = PdfFileReader(in_file)

        if pdf.decrypt(str(password)) == 0:
            raise DecryptionError(pdf_path)

        # Tolerate a document without an information dictionary.
        doc_info = pdf.documentInfo
        title = doc_info.title if doc_info and doc_info.title else _('Unknown')
        author = doc_info.author if doc_info and doc_info.author else _('Unknown')

        out_pdf = PdfFileWriter(title=title, author=author)
        for page in pdf.pages:
            out_pdf.addPage(page)

        with open(out_path, 'wb') as out_file:
            out_pdf.write(out_file)
def split_pdf(path_pdf):
    """Split a PDF into pages and pick the one dated closest to today.

    Every page is written to ``./tmp/`` and renamed to the name returned by
    ``change_name``.  The files in ``./tmp/`` are then scanned: the first
    whose name, with its ``-``-separated parts reversed, parses as a date
    within five days of today is moved to ``./planning.pdf``.

    Args:
        path_pdf: path of the PDF to split; its first seven characters are
            used as the prefix of the intermediate page files.

    Returns:
        ``"planning.pdf"`` when a matching page was found, otherwise
        ``False``.
    """
    with open(path_pdf, "rb") as source:
        inputpdf = PdfFileReader(source)
        # Only decrypt when the document is actually encrypted.
        if inputpdf.isEncrypted:
            inputpdf.decrypt('')

        if not path.exists('./tmp'):
            makedirs('./tmp/')

        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            newname = path_pdf[:7] + "-" + str(i) + ".pdf"
            # PDF data is binary: open in "w+b" so nothing is translated on
            # platforms that distinguish text mode.  "+" stays because
            # change_name() receives the stream after writing.
            with open("./tmp/" + newname, "w+b") as outputStream:
                output.write(outputStream)
                DateName = change_name(outputStream)
            rename("./tmp/" + newname, "./tmp/" + DateName)

    # Today's date truncated to day precision.
    DateNow = arrow.get(datetime.now().strftime('%Y-%m-%d'))

    for pdf in listdir('./tmp/'):
        # Reverse the '-'-separated name parts before parsing them as a date.
        PdfDate = '-'.join(pdf.split('-')[::-1])
        DateFile = arrow.get(PdfDate)
        delta = DateFile - DateNow
        if -5 <= delta.days <= 5:
            rename("./tmp/" + pdf, "./" + "planning.pdf")
            remove_all()
            return "planning.pdf"
    return False
def MergePDF(filepath, outfile):
    """Merge the files listed by ``getFileName(filepath)`` into one PDF.

    Entries whose name contains ``.DS_Store`` are skipped.  The result is
    written to ``filepath + outfile``.

    Args:
        filepath: value handed to ``getFileName`` and used as the prefix of
            the output path.
        outfile: name appended to ``filepath`` for the merged document.
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)
    print('总的 %s' % (pdf_fileName,))

    # pyPdf reads page content lazily, so the sources must remain open until
    # the merged document has been written; they are closed in the finally.
    sources = []
    try:
        for each in pdf_fileName:
            if '.DS_Store' in each:
                continue

            # Read the source PDF file.
            source = open(each, "rb")
            sources.append(source)
            reader = PdfFileReader(source)

            # An encrypted PDF must be decrypted before pyPdf can use it.
            if reader.isEncrypted:
                print('input.isEncrypted %s' % reader.isEncrypted)
                reader.decrypt("map")

            # Total number of pages in this source file.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print(pageCount)

            # Add each page to the output document.
            for iPage in range(0, pageCount):
                output.addPage(reader.getPage(iPage))

        print("All Pages Number:" + str(outputPages))

        # Finally write the merged PDF.
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for source in sources:
            source.close()
    print("finished")
class PdfBox(object):
    ''' Wraps pyPdf utils into a pdf object'''

    # Class-level defaults; every instance overrides what it needs in
    # __init__.
    pdfReader = None
    pdfInfo = None
    currentpage = 0
    extractedPages = {}
    filepath = ""
    isencrypted = False
    password = ""
    author = ""
    title = ""
    subject = ""
    pages = 0
    initialized = False

    def __init__(self, filepath, password=None):
        """Open ``filepath`` and load its metadata.

        Args:
            filepath: path of the PDF to open.
            password: password used if the document is encrypted.
        """
        self.filepath = filepath
        # Always give the instance its own cache, so a failed initialisation
        # never falls back to the dict shared on the class.
        self.extractedPages = {}
        # The stream is kept so that close() can release it.
        self._stream = open(filepath, "rb")
        self.pdfReader = PdfFileReader(self._stream)
        if password:
            self.password = password
        if self.initializePdf(self.password):
            self.pdfInfo = self.pdfReader.getDocumentInfo()
            # Leave the defaults in place when there is no document info.
            if self.pdfInfo is not None:
                self.author = self.pdfInfo.author
                self.title = self.pdfInfo.title
                self.subject = self.pdfInfo.subject
            self.pages = self.pdfReader.getNumPages()

    def initializePdf(self, password=None):
        """Make the reader usable, decrypting the document when required.

        Args:
            password: password to try; defaults to ``self.password``.

        Returns:
            True when the document is readable (not encrypted, or decrypted
            successfully), False when the password was rejected.
        """
        if password is None:
            password = self.password

        if not self.pdfReader.getIsEncrypted():
            self.initialized = True
            return True

        self.isencrypted = True
        if self.pdfReader.decrypt(password):
            self.initialized = True
            return True
        return False

    def getPage(self, pagenum):
        """Return the extracted text of page ``pagenum``, caching the result."""
        self.currentpage = pagenum
        if pagenum in self.extractedPages:
            return self.extractedPages[pagenum]

        text = self.pdfReader.getPage(pagenum).extractText()
        self.extractedPages[pagenum] = text
        return text

    def close(self):
        """Close the underlying file; pages not yet cached become unreadable."""
        stream = getattr(self, "_stream", None)
        if stream is not None:
            stream.close()
            self._stream = None
import sys
from pyPdf import PdfFileReader, PdfFileWriter

# Copy a PDF and protect the copy with a password.
# Usage: python <script> input.pdf output.pdf password

# Compare by value: `is not` tests object identity, which is not a reliable
# way to compare integers.
if len(sys.argv) != 4:
    print("Example:")
    print("python %s input.pdf output.pdf password" % __file__)
    sys.exit(0)

input_file = sys.argv[1]
output_file = sys.argv[2]
password = sys.argv[3]

# The input stays open while the output is written because pyPdf reads page
# content lazily.
with open(input_file, "rb") as pdf:
    reader = PdfFileReader(pdf)
    if reader.isEncrypted:
        reader.decrypt('')

    writer = PdfFileWriter()
    for i in range(reader.getNumPages()):
        writer.addPage(reader.getPage(i))

    # The with-statement closes the output; no explicit close() is needed.
    with open(output_file, "wb") as outputStream:
        writer.encrypt(password)
        writer.write(outputStream)
from pyPdf import PdfFileReader,PdfFileWriter

# Decrypt Walrus.pdf, rotate every page clockwise by 270 degrees and save
# the result as new.pdf in the same folder.
my_path ='C:/Users/Intel i3/Desktop/python/Real_Python/book1-exercises-master/Course materials/Chapter 12/Practice files/Walrus.pdf'
out_path ='C:/Users/Intel i3/Desktop/python/Real_Python/book1-exercises-master/Course materials/Chapter 12/Practice files/new.pdf'

# The input stays open until the output is written (pyPdf reads page content
# lazily); both files are closed by their with-statements.
with open(my_path, 'rb') as in_file:
    pdf_file = PdfFileReader(in_file)
    pdf_file.decrypt('IamtheWalrus')
    page = pdf_file.getNumPages()

    outfile = PdfFileWriter()
    for page_no in range(0, page):
        text = pdf_file.getPage(page_no)
        text = text.rotateClockwise(270)
        outfile.addPage(text)

    with open(out_path, 'wb') as out_file:
        outfile.write(out_file)
def _append_all_pages(pdfWriter, path, open_streams):
    """Add every page of the PDF at ``path`` to ``pdfWriter``.

    The opened stream is recorded in ``open_streams`` so that the caller can
    close it after the writer has produced its output.
    """
    stream = open(path, "rb")
    open_streams.append(stream)
    reader = PdfFileReader(stream)
    try:
        numPages = reader.getNumPages()
    except Exception:
        # Handles the case where pyPdf mistakenly thinks a pdf is encrypted;
        # will not work with encryption 3,4.
        reader.decrypt('')
        numPages = reader.getNumPages()
    for pageCount in range(numPages):
        pdfWriter.addPage(reader.getPage(pageCount))


def appendPDFwithPDF(outFile, toAppend):
    '''
    Combine multiple pdf files into ``outFile``.

    When ``outFile`` already exists its pages come first, followed by the
    pages of the file(s) in ``toAppend``.  The caller's list is not modified.

    @author William Panting
    @param outFile a string representing the path of the file that is to be
        created/modified
    @param toAppend a string representing the path of the file that is to be
        appended to the original file, or an ordered list of multiple strings
        representing files
    @return bool true if successful false if not
    '''
    import tempfile

    # The out file must not be a directory.
    if os.path.isdir(outFile):
        logging.error('Input error: outFile cannot be a directory.')
        return False

    # An existing out file is added page by page like the other pdfs, so it
    # goes to the front of the work list (built as a new list, leaving the
    # caller's list untouched).
    if os.path.isfile(outFile):
        if isinstance(toAppend, str):
            toAppend = [outFile, toAppend]
        elif isinstance(toAppend, list):
            toAppend = [outFile] + toAppend

    if isinstance(toAppend, list):
        # Verify every member before any file is opened.
        for path in toAppend:
            if not isPDF(path):
                logging.error('Error with input: '+str(path)+' --Each member of the list to append must be a valid pdf.')
                return False
        paths = toAppend
    elif isPDF(toAppend):
        paths = [toAppend]
    else:
        logging.error('Error with input: '+str(toAppend)+' --The input to Append must be a file path or list of file paths.')
        return False

    pdfWriter = PdfFileWriter()
    open_streams = []
    spool = tempfile.TemporaryFile()
    try:
        for path in paths:
            _append_all_pages(pdfWriter, path, open_streams)

        # outFile may itself be one of the sources, and pyPdf reads page
        # content lazily, so the result is spooled to a temporary file first
        # and copied over outFile only once all sources are closed.
        pdfWriter.write(spool)
        for stream in open_streams:
            stream.close()
        open_streams = []

        spool.seek(0)
        with open(outFile, 'wb') as pdfStream:
            pdfStream.write(spool.read())
    finally:
        for stream in open_streams:
            stream.close()
        spool.close()
    return True
import os
import copy
from pyPdf import PdfFileReader, PdfFileWriter

# Decrypt Walrus.pdf, rotate each page counter-clockwise by 90 degrees and
# split it into a left and a right half, saved as consecutive pages.
path = '/home/alberick/Documents/python/books/realpython-jean/part1/book1-exercises/Course materials/Chapter 12/Practice files'
input_file_name = os.path.join(path, 'Walrus.pdf')
output_file_name = os.path.join(path, 'Output/Walrus.pdf')

output_PDF = PdfFileWriter()

# The input stays open until the output is written (pyPdf reads page content
# lazily); both files are closed by their with-statements.
with open(input_file_name, 'rb') as source:
    input_file = PdfFileReader(source)
    input_file.decrypt('IamtheWalrus')

    for page_num in range(input_file.getNumPages()):
        page = input_file.getPage(page_num)
        page.rotateCounterClockwise(90)

        # Copy before cropping so that each half gets its own media box.
        page_left = page
        page_right = copy.copy(page)

        upper_right = page_left.mediaBox.upperRight

        # Left half: pull the right edge in to the middle.
        page_left.mediaBox.upperRight = (upper_right[0] / 2, upper_right[1])
        output_PDF.addPage(page_left)

        # Right half: push the left edge out to the middle.
        page_right.mediaBox.upperLeft = (upper_right[0] / 2, upper_right[1])
        output_PDF.addPage(page_right)

    with open(output_file_name, 'wb') as output_file:
        output_PDF.write(output_file)
import sys
from pyPdf import PdfFileReader

helpmsg = "Simple PDF brute force script\n"
helpmsg += "Cracks pwds of the format <first 4 chars of email>0000-9999."
helpmsg += "Example: snow0653\n\n"
helpmsg += "Usage: pdfbrute.py <encrypted_pdf_file> <email_address>"

# Both the file and the e-mail address are required (argv[2] is read below).
if len(sys.argv) < 3:
    print(helpmsg)
    sys.exit()

with open(sys.argv[1], "rb") as stream:
    pdffile = PdfFileReader(stream)

    if not pdffile.isEncrypted:
        print("[!] The file is not protected with any password. Exiting.")
        # A bare `exit` only names the function; it has to be called.
        sys.exit()

    print("[+] Attempting to Brute force. This could take some time...")

    prefix = sys.argv[2][:4]
    # 0000 through 9999 inclusive, as the help text advertises.
    for i in range(10000):
        a = "%s%04d" % (prefix, i)
        if pdffile.decrypt(a) > 0:
            print("[+] Password is: " + a)
            print("[...] Exiting..")
            sys.exit()
def process_file(id_source):
    """Compute the page position of every public comment on a source PDF.

    Comments and locations are fetched with
    ``annotations.getPublicCommentsByFile(id_source)`` and the PDF is read
    from ``tmp/<id_source>.pdf``.

    Returns:
        ``(comments_res, page_comment)`` where ``comments_res`` maps a
        comment id to a dict with the keys ``location_ID``, ``source_ID``,
        ``ensemble_ID``, ``x``, ``y``, ``w``, ``h``, ``page`` and ``parent``,
        and ``page_comment`` maps ``str(page)`` to the list of comment ids
        recorded for that page.  ``False`` is returned when the PDF is
        encrypted and cannot be decrypted with an empty password.
    """
    from pyPdf import PdfFileReader
    from numpy import array
    # Mutable holder so that the nested print_child() can update last_page.
    S = {}
    locations, comments = annotations.getPublicCommentsByFile(id_source)
    srcfile = "tmp/%s.pdf" % (id_source)
    # NOTE(review): this file handle is never closed explicitly.
    pdf = PdfFileReader(file(srcfile, "rb"))
    if pdf.isEncrypted and pdf.decrypt("") == 0:
        print "PDF file encrypted with non-empty password: %s" % (srcfile, )
        return False
    trim_box = pdf.pages[ 0].trimBox  # Sacha's coordinate system now uses this box
    crop_box = pdf.pages[0].cropBox  # ConTeXt's page inclusion uses this box
    fudge = (int(trim_box[2]) - int(trim_box[0])) / 612.0  # for the assumption of 612bp width
    bp_per_pixel = 72.0 / 150 * fudge
    roots = {}         # root comment id -> its location dict
    children_of = {}   # parent comment id -> list of child comment ids
    comments_res = {}  # comment id -> computed placement (returned)
    page_comment = {}  # str(page) -> list of comment ids (returned)
    for k in comments:
        node = int(k)
        parent = comments[k]['id_parent']
        if parent:
            if parent not in children_of:
                children_of[parent] = []
            children_of[parent].append(node)
        else:
            # A comment without a parent is a root; remember its location and
            # precompute the centre used for ordering.
            loc_id = comments[node]['ID_location']
            loc = locations[loc_id]
            if loc['page'] != 0:
                loc['center_x'] = loc['left'] + loc['w'] / 2.0
                loc['center_y'] = loc['top'] + loc['h'] / 2.0
            else:
                loc['center_x'] = None
                loc['center_y'] = None
            roots[node] = loc

    # NOTE(review): oneline() and texify() are not called anywhere in this
    # function; texify() relies on a module-level `re` import.
    def oneline(s):
        return s.replace('\n', ' ')

    def texify(s):
        s = s.strip()
        patterns = [(r'\\', r'\\\\'), (r'%', r'\%'), (r'\$', r'\$'), ('_', r'\_'), (r'\&', r'\&'), (r'\^', r'\^\\null{}'), (r'#', r'\#'), (r'\|', r'$|$')]
        for p in patterns:
            s = re.sub(p[0], p[1], s)
        return s

    def rect2array(rect):
        # Concatenate the lower-left and upper-right corners into one array.
        return array(rect.lowerLeft + rect.upperRight, dtype=float)

    def rectangle_height(rect):
        return rect.upperRight[1] - rect.lowerLeft[1]

    S["last_page"] = -1

    def print_child(n, levels=0):
        # Record the placement of comment n, then recurse into its replies
        # (levels is the reply depth; 0 means a root comment).
        loc_id = comments[n]['ID_location']
        location = locations[loc_id]
        page = int(location['page'])
        if levels == 0 and page > S["last_page"]:
            S["last_page"] = page
        if levels == 0 and page != 0:  # a root comment not on page 0 needs callout
            root = roots[n]
            # Sacha's coords are from top left corner, relative to TrimBox
            # but in pixels (not postscript points).
            # evaluate comment_box_px, with this coord system, as [llx lly w h]
            comment_box_px = array(
                [root['left'], root['top'] + root['h'], root['w'], root['h']])
            comment_box_bp = comment_box_px * bp_per_pixel
            # convert y coordinate to use bottom edge of trim_box as y=0
            comment_box_bp[1] = int(rectangle_height(trim_box)) - int(
                comment_box_bp[1])
            # convert to coordinates relative to CropBox
            comment_box_bp[0:2] += (rect2array(trim_box) - rect2array(crop_box))[0:2]
            comments_res[n] = {}
            comments_res[n]['location_ID'] = loc_id
            comments_res[n]['source_ID'] = id_source
            comments_res[n]['ensemble_ID'] = locations[loc_id]['id_ensemble']
            comments_res[n]['x'] = comment_box_bp[0]
            comments_res[n]['y'] = comment_box_bp[1]
            comments_res[n]['w'] = comment_box_bp[2]
            comments_res[n]['h'] = comment_box_bp[3]
            comments_res[n]['page'] = page
            comments_res[n]['parent'] = -1
            strPage = str(page)
            if strPage in page_comment:
                tmp = page_comment[strPage]
                tmp.append(n)
                page_comment[strPage] = tmp
            else:
                page_comment[strPage] = [n]
        elif levels != 0 and page != 0:
            # A reply reuses the box already computed for its parent.
            parent = comments[n]['id_parent']
            comments_res[n] = {}
            comments_res[n]['location_ID'] = loc_id
            comments_res[n]['source_ID'] = id_source
            comments_res[n]['ensemble_ID'] = locations[loc_id]['id_ensemble']
            comments_res[n]['x'] = comments_res[parent]['x']
            comments_res[n]['y'] = comments_res[parent]['y']
            comments_res[n]['w'] = comments_res[parent]['w']
            comments_res[n]['h'] = comments_res[parent]['h']
            comments_res[n]['page'] = page
            comments_res[n]['parent'] = parent
            strPage = str(page)
            if strPage in page_comment:
                tmp = page_comment[strPage]
                tmp.append(n)
                page_comment[strPage] = tmp
            else:
                page_comment[strPage] = [n]
        if n in children_of:
            for k in sorted(children_of[n]):
                print_child(k, levels + 1)

    # NOTE(review): this local function shadows the builtin cmp(), and
    # sorted(roots, cmp) below passes it as a Python 2 comparison function.
    def cmp(a, b):
        if roots[a]['page'] == 0 and roots[b]['page'] == 0:
            return a - b  # order by comment id
        for key in ['page', 'center_y', 'center_x']:
            if roots[a][key] != roots[b][key]:
                return int(1e6 * roots[a][key] - 1e6 * roots[b][key])
        return 0

    for root_id in sorted(roots, cmp):
        print_child(root_id, 0)
    return comments_res, page_comment
import sys
from pyPdf import PdfFileReader

helpmsg = "Simple PDF brute force script\n"
helpmsg += "Cracks pwds of the format <first 4 chars of email>0000-9999."
helpmsg += "Example: snow0653\n\n"
helpmsg += "Usage: pdfbrute.py <encrypted_pdf_file> <email_address>"

# Both the file and the e-mail address are required (argv[2] is read below).
if len(sys.argv) < 3:
    print(helpmsg)
    sys.exit()

with open(sys.argv[1], "rb") as stream:
    pdffile = PdfFileReader(stream)

    if not pdffile.isEncrypted:
        print("[!] The file is not protected with any password. Exiting.")
        # A bare `exit` only names the function; it has to be called.
        sys.exit()

    print("[+] Attempting to Brute force. This could take some time...")

    prefix = sys.argv[2][:4]
    # 0000 through 9999 inclusive, as the help text advertises.
    for i in range(10000):
        a = "%s%04d" % (prefix, i)
        if pdffile.decrypt(a) > 0:
            print("[+] Password is: " + a)
            print("[...] Exiting..")
            sys.exit()
class GenerateMarkPdf(object):
    """Stamp a text marker onto every page of a PDF and save the result.

    ``args`` is stored as ``self.params``.  According to the original notes
    the list is laid out as:

        0  script name
        1  name of the original document
        2  marker position on the X axis
        3  marker position on the Y axis
        4  data to write
        5  second data to write
        6  output file name

    Only indexes 1 (input path) and 4 (stamped text) are read by this class.
    """

    def __init__(self, args):
        # Keep the parameter list for the later processing steps.
        self.params = args

    def setFilename(self, filename):
        """Create the output writer and remember the output file name."""
        self.output = PdfFileWriter()
        self.filename = filename

    def logger(self, data):
        """Append a timestamped line to ``/var/log/pdf_error.log``.

        The log file must exist at that location with suitable permissions.
        Logging is best-effort: any failure is reported on stdout instead of
        being raised.
        """
        try:
            # The with-statement only closes the file when it was opened, so
            # a failing open() no longer triggers a second error.
            with open("/var/log/pdf_error.log", "a") as logfile:
                today = strftime("%Y-%m-%d %H:%M:%S", gmtime())
                logfile.write(str(today) + " " + str(data) + "\r\n")
        except Exception:
            print("Nie mozna utworzyc pliku")

    def setOrginalPdfFile(self):
        """Open the original PDF (``self.params[1]``) as ``self.input1``.

        Errors are logged rather than raised.
        """
        try:
            # The handle stays open on purpose: createNewPdf() reads the
            # pages from it later.
            self.input1 = PdfFileReader(open(self.params[1], "rb"))
            if self.input1.isEncrypted:
                # decrypt() needs a password argument; try the empty one.
                self.input1.decrypt("")
        except IOError as ie:
            self.logger("Nie ma takiego pliku " + " " + str(ie))
        except Exception as e:
            self.logger(e)

    def watermark(self, x, y, z, w):
        """Build a one-page PDF carrying ``self.params[4]`` at ``(x, y)``.

        ``z`` and ``w`` are accepted for the position of a second text but
        are not used.

        Returns:
            A ``PdfFileReader`` over the generated in-memory PDF.
        """
        packet = StringIO.StringIO()
        data = self.params[4]
        can = canvas.Canvas(packet)
        can.drawString(int(x), int(y), str(data))
        can.save()
        # Move the cursor back to the start of the buffer.
        packet.seek(0)
        return PdfFileReader(packet)

    def createNewPdf(self):
        """Stamp every page of the input and write ``self.filename``.

        Write errors are logged rather than raised.
        """
        iloscStron = int(self.input1.getNumPages())
        for i in range(iloscStron):
            # Original page
            page = self.input1.getPage(i)
            x = 20
            y = page.mediaBox[3] - 10
            z = page.mediaBox[2] - 100
            w = page.mediaBox[3] - 10
            stamp = self.watermark(x, y, z, w).getPage(0)
            page.mergePage(stamp)
            self.output.addPage(page)

        try:
            # The with-statement closes the output even if writing fails.
            with open(self.filename, "wb") as outputStream:
                self.output.write(outputStream)
        except Exception as e:
            self.logger("Blad " + str(e))
def process_file(id_source):
    """Generate a TeX file with the public comments of a source PDF and
    typeset it.

    Comments and locations come from
    ``annotations.getPublicCommentsByFile(id_source)``.  The repository file
    is symlinked to ``/tmp/orig_<id_source>.pdf``, the TeX source is written
    to ``/tmp/<id_source>.tex``, and a shell command runs ``texexec`` on it
    and moves the resulting PDF below ``settings.HTTPD_MEDIA``.

    Returns:
        ``None`` normally (also when the repository file is missing), or
        ``False`` when the PDF is encrypted and cannot be decrypted with an
        empty password.
    """
    from processing.tex_template import tex_header
    from pyPdf import PdfFileReader
    from numpy import array
    logging.info("begin %s" % (id_source, ))
    OUTPUT = []  # chunks of TeX source, joined with newlines at the end
    S = {}       # mutable holder so print_child() can update last_page
    locations, comments = annotations.getPublicCommentsByFile(id_source)
    repfile = "%s/%s/%s" % (settings.HTTPD_MEDIA,settings.REPOSITORY_DIR, id_source)
    if not os.path.exists(repfile):
        logging.warning("%s not found. Skipping..." % (repfile, ))
        return
    srcfile = "/tmp/orig_%s.pdf" % (id_source, )
    if not os.path.exists(srcfile):
        os.symlink(repfile, srcfile)
    # NOTE(review): this file handle is never closed explicitly.
    pdf = PdfFileReader(file(srcfile, "rb"))
    if pdf.isEncrypted and pdf.decrypt("")==0:
        print "PDF file encrypted with non-empty password: %s" % (srcfile,)
        return False
    trim_box = pdf.pages[0].trimBox  # Sacha's coordinate system now uses this box
    crop_box = pdf.pages[0].cropBox  # ConTeXt's page inclusion uses this box
    fudge = (int(trim_box[2])-int(trim_box[0]))/612.0  # for the assumption of 612bp width
    bp_per_pixel = 72.0/150 * fudge
    roots = {}        # root comment id -> its location dict
    children_of = {}  # parent comment id -> list of child comment ids
    for k in comments:
        node = int(k)
        parent = comments[k]['id_parent']
        if parent:
            if parent not in children_of:
                children_of[parent] = []
            children_of[parent].append(node)
        else:
            # A comment without a parent is a root; remember its location and
            # precompute the centre used for ordering.
            loc_id = comments[node]['ID_location']
            loc = locations[loc_id]
            if loc['page'] != 0:
                loc['center_x'] = loc['left'] + loc['w']/2.0
                loc['center_y'] = loc['top'] + loc['h']/2.0
            else:
                loc['center_x'] = None
                loc['center_y'] = None
            roots[node] = loc

    # NOTE(review): oneline() is not called anywhere in this function.
    def oneline(s):
        return s.replace('\n', ' ')

    def texify(s):
        # Apply the substitution patterns below, in order, to the stripped
        # comment body (relies on a module-level `re` import).
        s = s.strip()
        patterns = [(r'\\', r'\\\\'), (r'%', r'\%'), (r'\$', r'\$'), ('_', r'\_'), (r'\&', r'\&'), (r'\^', r'\^\\null{}'), (r'#', r'\#'), (r'\|', r'$|$')]
        for p in patterns:
            s = re.sub(p[0], p[1], s)
        return s

    def rect2array(rect):
        # Concatenate the lower-left and upper-right corners into one array.
        return array(rect.lowerLeft+rect.upperRight, dtype=float)

    def rectangle_height(rect):
        return rect.upperRight[1]-rect.lowerLeft[1]

    S["last_page"] = -1

    def print_child(n, levels=0):
        # Emit the TeX for comment n, then recurse into its replies
        # (levels is the reply depth; 0 means a root comment).
        body = comments[n]['body']
        loc_id = comments[n]['ID_location']
        location = locations[loc_id]
        page = int(location['page'])
        if levels == 0 and page > S["last_page"]:
            # First root comment seen on a later page: start a new section.
            OUTPUT.append('\n%% Comments on page %d of %s [%s]' % (page, "myfile", os.path.basename(srcfile)))
            if page == 0:
                sectitle = 'Global comments'
            else:
                sectitle = 'Comments on page %d' % page
            OUTPUT.append(r'\title{%s} \def\whatpage{%d}' % (sectitle, page))
            S["last_page"] = page
        if comments[n]['admin'] == 1:
            me = 1
        else:
            me = 0
        msg = '\n'+r'\comment{note-%s}{%d}{%s}{%d}{%d}' % (n, levels, texify(body), me, int(n))
        # Non-ASCII characters are dropped from the emitted comment line.
        OUTPUT.append(unicode(msg).encode("ascii", "ignore"))
        if levels == 0 and page != 0:  # a root comment not on page 0 needs callout
            root = roots[n]
            # Sacha's coords are from top left corner, relative to TrimBox
            # but in pixels (not postscript points).
            # evaluate comment_box_px, with this coord system, as [llx lly w h]
            comment_box_px = array([root['left'], root['top']+root['h'], root['w'], root['h']])
            comment_box_bp = comment_box_px * bp_per_pixel
            # convert y coordinate to use bottom edge of trim_box as y=0
            comment_box_bp[1] = int(rectangle_height(trim_box))-int(comment_box_bp[1])
            # convert to coordinates relative to CropBox
            comment_box_bp[0:2] += (rect2array(trim_box)-rect2array(crop_box))[0:2]
            OUTPUT.append('\setpospxywhd{note-%d-dest}{1}' % n)
            OUTPUT.append('{%fbp}{%fbp}{%fbp}{%fbp}{0pt}' % tuple(comment_box_bp))
            OUTPUT.append('''\startpositionoverlay{callouts} \setMPpositiongraphic{note-%d}{callout}{to=note-%d-dest} \stoppositionoverlay''' % (n, n))
        if n in children_of:
            for k in sorted(children_of[n]):
                print_child(k, levels+1)

    # NOTE(review): this local function shadows the builtin cmp(), and
    # sorted(roots, cmp) below passes it as a Python 2 comparison function.
    def cmp(a,b):
        if roots[a]['page'] == 0 and roots[b]['page'] == 0:
            return a-b  # order by comment id
        for key in ['page', 'center_y', 'center_x']:
            if roots[a][key] != roots[b][key]:
                return int(1e6*roots[a][key] - 1e6*roots[b][key])
        return 0

    tex_params = {'crop_wd': crop_box[2]-crop_box[0], 'crop_ht': crop_box[3]-crop_box[1], 'srcfile': srcfile }
    OUTPUT.append( tex_header % tex_params)
    OUTPUT.append( '\n\\starttext')
    for root_id in sorted(roots, cmp):
        print_child(root_id, 0)
    OUTPUT.append( '\n\\stoptext')
    texfile = "/tmp/%s.tex" % (id_source, )
    f = open(texfile, "w")
    f.write("\n".join(OUTPUT))
    f.close()
    # NOTE(review): id_source and the settings paths are interpolated into a
    # shell command unquoted -- confirm that they can never contain shell
    # metacharacters.
    cmd = "(cd /tmp; texexec --timeout=120 %s; mv %s.pdf %s/%s/%s)" % (texfile, id_source, settings.HTTPD_MEDIA, settings.ANNOTATED_DIR, id_source)
    os.system(cmd)
    logging.info("end %s" % (id_source, ))
def export_to_file(self, file_out, only_selected=False):
    """Export to file

    Builds a new PDF from the rows of ``self.model``: for each row the page
    ``row[3]`` of input document ``row[2]`` (both 1-based) is copied, rotated
    by ``row[6]`` degrees and cropped by the fractions in ``row[7:11]``.

    Args:
        file_out: path of the PDF to create.
        only_selected: when truthy, only rows whose path is in the icon
            view's current selection are exported.

    Raises:
        Exception: if an input document is encrypted and cannot be opened
            with an empty password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    # Inputs stay open until the output is written (pyPdf reads page content
    # lazily) and are closed in the finally clause.
    open_streams = []
    try:
        for pdfdoc in self.pdfqueue:
            stream = open(pdfdoc.copyname, 'rb')
            open_streams.append(stream)
            pdfdoc_inp = PdfFileReader(stream)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    stat = 0
                if stat != 1:
                    errmsg = _('File %s is encrypted.\n'
                               'Support for encrypted files has not been implemented yet.\n'
                               'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                # FIXME: otherwise ask for a password and decrypt the file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]
            npage = row[3]
            current_page = copy(pdf_input[nfile-1].getPage(npage-1))
            angle = row[6]
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]
            if angle != 0:
                current_page.rotateClockwise(angle)
            if crop != [0., 0., 0., 0.]:
                # Number of quarter turns (rounded), made explicit with
                # floor division instead of relying on Python 2 "/".
                rotate_times = int(((angle + angle0) % 360 + 45) // 90) % 4
                crop_init = crop
                if rotate_times != 0:
                    # Permute the crop sides to follow the page rotation.
                    perm = [0, 2, 1, 3]
                    for it in range(rotate_times):
                        perm.append(perm.pop(0))
                    perm.insert(1, perm.pop(2))
                    crop = [crop_init[perm[side]] for side in range(4)]
                (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                x1_new = int(x1 + (x2-x1) * crop[0])
                x2_new = int(x2 - (x2-x1) * crop[1])
                y1_new = int(y1 + (y2-y1) * crop[3])
                y2_new = int(y2 - (y2-y1) * crop[2])
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)

            pdf_output.addPage(current_page)

        # finally, write "output" to the requested file
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for stream in open_streams:
            stream.close()
# 12.2 review exercises import os import copy from pyPdf import PdfFileReader, PdfFileWriter path = "C:/Real Python/Course materials/Chapter 12/Practice files" input_file_name = os.path.join(path, "Walrus.pdf") input_file = PdfFileReader(file(input_file_name, "rb")) output_PDF = PdfFileWriter() input_file.decrypt("IamtheWalrus") # decrypt the input file for page_num in range(0, input_file.getNumPages()): # rotate pages (call everything page_left for now; will make a copy) page_left = input_file.getPage(page_num) page_left.rotateCounterClockwise(90) page_right = copy.copy(page_left) # split each page in half upper_right = page_left.mediaBox.upperRight # get original page corner # crop and add left-side page page_left.mediaBox.upperRight = (upper_right[0] / 2, upper_right[1]) output_PDF.addPage(page_left) # crop and add right-side page page_right.mediaBox.upperLeft = (upper_right[0] / 2, upper_right[1]) output_PDF.addPage(page_right) # save new pages to an output file output_file_name = os.path.join(path, "Output/Updated Walrus.pdf") with file(output_file_name, "wb") as output_file:
path = for f in full: filename = "myfile.txt" path_to_file = pjoin("C:", "foo", "bar", "baz", filename) FILE = open(path_to_file, "w") from pyPdf import PdfFileReader # p = '/home/matt/Desktop/new/The_Warren_Buffett_Way.pdf' p = '/home/matt/Desktop/new/the_mckinsey_way.pdf' pdf = PdfFileReader(file(p, 'rb')) if pdf.isEncrypted: pdf.decrypt('') pdf.documentInfo pdf.getNumPages() 267 info = pdf.getDocumentInfo() info.author info.title p = '/home/matt/Desktop/new/The_McKinsey_Way.pdf' pdf = PdfFileReader(file(p, 'rb')) if pdf.isEncrypted: try: pdf.decrypt('')
def employer_resume_book_create(request):
    """Generate the requesting recruiter's resume book and store it.

    POST parameters read:
      resume_book_id  -- optional; when present and non-empty the existing
                         ResumeBook with that id is regenerated under its
                         stored name.  Otherwise the recruiter's single
                         undelivered book is fetched or created and given
                         a fresh "<user>_<timestamp>" name.
      delivery_format -- "separate" produces a zip archive holding one PDF
                         per visible student; any other value produces one
                         merged PDF that starts with generated contents
                         pages followed by every student's resume.

    The file is written below ``s.MEDIA_ROOT`` + "employer/resumebook/" and
    then saved into the book's ``resume_book`` file field.

    Returns an empty HttpResponse.  Raises Http404 when ``resume_book_id``
    does not match any ResumeBook.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and delete the duplicates.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        resume_book_name = "%s_%s" % (str(request.user), now)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST["delivery_format"] == "separate":
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name)
        output = zipfile.ZipFile(file_name, "w")
        try:
            for student in resume_book.students.visible():
                name = "%s %s (%s, %s).pdf" % (
                    student.first_name,
                    student.last_name,
                    student.graduation_year,
                    student.degree_program,
                )
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                output.write(resume_path, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        students = resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name")

        # --- contents pages ---------------------------------------------
        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        now = datetime.now()
        first_line = "Created on %s at %s" % (now.strftime("%m/%d/%Y"), now.strftime("%I:%M %p"))
        c.drawString(1 * cm, 28.5 * cm, first_line)
        c.drawString(1 * cm, 28 * cm, str(request.user.recruiter))
        c.drawString(1 * cm, 27.5 * cm, str(request.user.recruiter.employer))
        c.drawString(16 * cm, 28.5 * cm, "Created using Umeqo")
        c.drawString(8.5 * cm, 26.5 * cm, "Resume Book Contents")

        # 51 entries at 0.5 cm spacing fill the sheet from 25.5 cm down to
        # 0.5 cm.  The break is keyed on the per-page counter so that every
        # full page is flushed, not only the first one.
        lines_per_page = 51
        pad_from_top = 0
        for student in students:
            line_y = (25.5 - pad_from_top * 0.5) * cm
            c.drawString(6.5 * cm, line_y, "%s %s" % (student.first_name, student.last_name))
            c.drawString(12 * cm, line_y, "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            if pad_from_top == lines_per_page:
                c.showPage()
                c.save()
                output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        c.showPage()
        c.save()
        output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))

        # --- resumes ----------------------------------------------------
        # Each reader is handed an open file and the pages are only
        # serialised by output.write(), so the files stay open until then;
        # all of them are closed afterwards, even when there are none.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as output_stream:
                output.write(output_stream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
def processFile(self, curr_file):
    """Extract document-info metadata from a PDF and record it.

    When ``curr_file`` contains ".pdf", the file is opened with pyPdf
    (decrypting with an empty password if needed) and one row
    ``[" | " + name, created, author, producer, modified, last_saved]`` is
    appended to ``self.container``.  Fields that are missing stay "-".

    curr_file -- path of the file to inspect.

    Returns None.  Extraction is best effort: a file that cannot be opened
    or parsed, or that carries no document info, is skipped silently and
    adds no row.
    """
    if ".pdf" not in curr_file:
        return

    def format_stamp(raw):
        # PDF dates look like "D:YYYYMMDDHHMM..."; render "MM/DD/YYYY HH:MM AM".
        data = raw.strip("D:|'")
        clock = time.strftime(
            "%I:%M %p", time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    try:
        # Keep the file open while the info dictionary is read, then let the
        # context manager close it (the handle used to be leaked).
        with open(curr_file, 'rb') as stream:
            pdfFile = PdfFileReader(stream)
            if pdfFile.getIsEncrypted():
                pdfFile.decrypt('')
            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return

            if "/CreationDate" in docInfo:
                created = format_stamp(docInfo["/CreationDate"])

            if "/Author" in docInfo:
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"

            if "/Producer" in docInfo:
                # str.strip("(Windows)") treats its argument as a set of
                # characters and ate leading/trailing letters of unrelated
                # producers ("iText" -> "Text"); drop the marker itself.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                if len(producer) == 0:
                    producer = "-"
                # collapse runs of spaces left behind by the substitution
                producer = re.sub(r' {2,}', ' ', producer)

            if "/ModDate" in docInfo:
                modded = format_stamp(docInfo["/ModDate"])

        # strips '/' off file name (if it includes directory name)
        if "/" in curr_file:
            curr_file = curr_file[curr_file.rfind("/") + 1:]
        if "\\" in curr_file:
            curr_file = curr_file.replace("\\", "")

        # trim information if it's too long
        # The abbreviated form is 15 + 3 + 13 = 31 characters, so only names
        # longer than that are shortened; shorter ones used to be lengthened
        # with overlapping text.
        if len(curr_file) > 31:
            curr_file = curr_file[:15] + "..." + curr_file[-13:]
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "

        # appends each piece of information as one row
        self.container.append(
            [" | " + curr_file, created, author, producer, modded, last_saved])
    except Exception:
        # Deliberate best effort: unreadable or malformed files are skipped.
        return
# 12.2 review exercises import os import copy from pyPdf import PdfFileReader, PdfFileWriter path = "C:/Real Python/Course materials/Chapter 8/Practice files" inputFileName = os.path.join(path, "Walrus.pdf") inputFile = PdfFileReader(file(inputFileName, "rb")) outputPDF = PdfFileWriter() inputFile.decrypt("IamtheWalrus") # decrypt the input file for pageNum in range(0, inputFile.getNumPages()): # rotate pages (call everything pageLeft for now; will make a copy) pageLeft = inputFile.getPage(pageNum) pageLeft.rotateCounterClockwise(90) pageRight = copy.copy(pageLeft) # split each page in half upperRight = pageLeft.mediaBox.upperRight # get original page corner # crop and add left-side page pageLeft.mediaBox.upperRight = (upperRight[0]/2, upperRight[1]) outputPDF.addPage(pageLeft) # crop and add right-side page pageRight.mediaBox.upperLeft = (upperRight[0]/2, upperRight[1]) outputPDF.addPage(pageRight) # save new pages to an output file outputFileName = os.path.join(path, "Output/Updated Walrus.pdf") with file(outputFileName, "wb") as outputFile:
def employer_resume_book_create(request):
    """Generate the requesting recruiter's resume book and store it.

    POST parameters read:
      resume_book_id  -- optional; when present and non-empty the existing
                         ResumeBook with that id is regenerated under its
                         stored name.  Otherwise the recruiter's single
                         undelivered book is fetched or created and given
                         a fresh "<user>_<timestamp>" name.
      delivery_format -- 'separate' produces a zip archive holding one PDF
                         per visible student; any other value produces one
                         merged PDF that starts with generated contents
                         pages followed by every student's resume.

    The file is written below ``s.MEDIA_ROOT`` + "employer/resumebook/" and
    then saved into the book's ``resume_book`` file field.

    Returns an empty HttpResponse.  Raises Http404 when ``resume_book_id``
    does not match any ResumeBook.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and delete the duplicates.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        now = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        resume_book_name = "%s_%s" % (str(request.user), now,)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST['delivery_format'] == 'separate':
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name,)
        output = zipfile.ZipFile(file_name, 'w')
        try:
            for student in resume_book.students.visible():
                name = "%s %s (%s, %s).pdf" % (student.first_name,
                                               student.last_name,
                                               student.graduation_year,
                                               student.degree_program)
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                output.write(resume_path, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        students = resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name")

        # --- contents pages ---------------------------------------------
        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        now = datetime.now()
        first_line = "Created on %s at %s" % (now.strftime('%m/%d/%Y'), now.strftime('%I:%M %p'))
        c.drawString(1*cm, 28.5*cm, first_line)
        c.drawString(1*cm, 28*cm, str(request.user.recruiter))
        c.drawString(1*cm, 27.5*cm, str(request.user.recruiter.employer))
        c.drawString(16*cm, 28.5*cm, "Created using Umeqo")
        c.drawString(8.5*cm, 26.5*cm, "Resume Book Contents")

        # 51 entries at 0.5 cm spacing fill the sheet from 25.5 cm down to
        # 0.5 cm.  The break is keyed on the per-page counter so that every
        # full page is flushed, not only the first one.
        lines_per_page = 51
        pad_from_top = 0
        for student in students:
            line_y = (25.5 - pad_from_top*.5)*cm
            c.drawString(6.5*cm, line_y, "%s %s" % (student.first_name, student.last_name))
            c.drawString(12*cm, line_y, "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            if pad_from_top == lines_per_page:
                c.showPage()
                c.save()
                output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        c.showPage()
        c.save()
        output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))

        # --- resumes ----------------------------------------------------
        # Each reader is handed an open file and the pages are only
        # serialised by output.write(), so the files stay open until then;
        # all of them are closed afterwards, even when there are none.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as output_stream:
                output.write(output_stream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
def export_to_file(self, file_out, only_selected=False):
    """Write the pages listed in ``self.model`` to a new PDF file.

    Every document in ``self.pdfqueue`` is opened from its ``copyname``.
    Each model row then selects one page: ``row[2]`` is the 1-based index
    of the document in the queue and ``row[3]`` the 1-based page number.
    The page is rotated clockwise by ``row[6]`` degrees and cropped by the
    fractions ``row[7]``..``row[10]`` (applied to the left, right, top and
    bottom edges of the media box respectively) before it is appended to
    the output.

    file_out      -- path of the PDF file to create.
    only_selected -- when true, rows whose ``path`` is not part of the
                     current icon-view selection are skipped.

    Raises Exception when an input file is encrypted and cannot be
    decrypted with an empty password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    # The readers are handed open streams and pages are only serialised by
    # pdf_output.write(), so the inputs stay open until the export is done
    # and are closed together in the ``finally`` clause below.
    open_streams = []
    try:
        for pdfdoc in self.pdfqueue:
            stream = open(pdfdoc.copyname, 'rb')
            open_streams.append(stream)
            pdfdoc_inp = PdfFileReader(stream)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    stat = 0
                if stat != 1:
                    errmsg = _(
                        'File %s is encrypted.\n'
                        'Support for encrypted files has not been implemented yet.\n'
                        'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                # FIXME: otherwise ask for a password and decrypt the file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]
            npage = row[3]
            current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
            angle = row[6]
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]
            if angle != 0:
                current_page.rotateClockwise(angle)
            if crop != [0., 0., 0., 0.]:
                # Number of quarter turns of the final page.  Dividing by
                # 90.0 makes the rounding behave the same whether "/" is
                # integer division (Python 2) or true division (Python 3).
                rotate_times = int(round(((angle + angle0) % 360) / 90.0)) % 4
                if rotate_times != 0:
                    # Re-map the crop fractions onto the edges of the
                    # unrotated media box.
                    perm = [0, 2, 1, 3]
                    for _step in range(rotate_times):
                        perm.append(perm.pop(0))
                    perm.insert(1, perm.pop(2))
                    crop = [crop[perm[side]] for side in range(4)]
                # The media box (not the crop box) is what gets shrunk.
                x1, y1 = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                x2, y2 = [float(xy) for xy in current_page.mediaBox.upperRight]
                x1_new = int(x1 + (x2 - x1) * crop[0])
                x2_new = int(x2 - (x2 - x1) * crop[1])
                y1_new = int(y1 + (y2 - y1) * crop[3])
                y2_new = int(y2 - (y2 - y1) * crop[2])
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)

            pdf_output.addPage(current_page)

        # finally, write "output" to document-output.pdf
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for stream in open_streams:
            stream.close()
def processFile(self, curr_file):
    """Extract document-info metadata from a PDF and record it.

    When ``curr_file`` contains ".pdf", the file is opened with pyPdf
    (decrypting with an empty password if needed) and one row
    ``[" | " + name, created, author, producer, modified, last_saved]`` is
    appended to ``self.container``.  Fields that are missing stay "-".

    curr_file -- path of the file to inspect.

    Returns None.  Extraction is best effort: a file that cannot be opened
    or parsed, or that carries no document info, is skipped silently and
    adds no row.
    """
    if ".pdf" not in curr_file:
        return

    def format_stamp(raw):
        # PDF dates look like "D:YYYYMMDDHHMM..."; render "MM/DD/YYYY HH:MM AM".
        data = raw.strip("D:|'")
        clock = time.strftime(
            "%I:%M %p", time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    try:
        # Keep the file open while the info dictionary is read, then let the
        # context manager close it (the handle used to be leaked).
        with open(curr_file, 'rb') as stream:
            pdfFile = PdfFileReader(stream)
            if pdfFile.getIsEncrypted():
                pdfFile.decrypt('')
            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return

            if "/CreationDate" in docInfo:
                created = format_stamp(docInfo["/CreationDate"])

            if "/Author" in docInfo:
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"

            if "/Producer" in docInfo:
                # str.strip("(Windows)") treats its argument as a set of
                # characters and ate leading/trailing letters of unrelated
                # producers ("iText" -> "Text"); drop the marker itself.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                if len(producer) == 0:
                    producer = "-"
                # collapse runs of spaces left behind by the substitution
                producer = re.sub(r' {2,}', ' ', producer)

            if "/ModDate" in docInfo:
                modded = format_stamp(docInfo["/ModDate"])

        # strips '/' off file name (if it includes directory name)
        if "/" in curr_file:
            curr_file = curr_file[curr_file.rfind("/") + 1:]
        if "\\" in curr_file:
            curr_file = curr_file.replace("\\", "")

        # trim information if it's too long
        # The abbreviated form is 15 + 3 + 13 = 31 characters, so only names
        # longer than that are shortened; shorter ones used to be lengthened
        # with overlapping text.
        if len(curr_file) > 31:
            curr_file = curr_file[:15] + "..." + curr_file[-13:]
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "

        # appends each piece of information as one row
        self.container.append([
            " | " + curr_file, created, author, producer, modded, last_saved
        ])
    except Exception:
        # Deliberate best effort: unreadable or malformed files are skipped.
        return