def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata = False, make_backup = True):
    """Rewrite ``pdf_file`` with updated document-information metadata.

    The file is first renamed to a backup, every page of the backup is
    copied into a fresh writer, the metadata is merged in, and the result
    is written back under the original name.

    :param pdf_file: path of the PDF to update; it is rewritten in place.
    :param metadata: mapping of info keys *without* the leading ``/`` to
        values; every value is stored as ``str(value)``.
    :param substitute_all_metadata: when false (default) the existing info
        entries are carried over first and then overridden by ``metadata``;
        when true only ``metadata`` is written.
    :param make_backup: ``True`` keeps ``<pdf_file stem>.bak``; a string is
        used as the backup path (and kept); ``False`` deletes the backup
        once the new file has been written.
    :raises OSError: if the file cannot be renamed, read or written.
    """
    if isinstance(make_backup, str):
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()
        info = pdf_in.documentInfo
        # documentInfo is None for a PDF without an /Info dictionary, so
        # there is nothing to carry over in that case.
        if not substitute_all_metadata and info is not None:
            for key in info:
                infoDict.update({key: info[key]})
        for key in metadata:
            infoDict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        # The source must still be open here: the writer pulls page data
        # from the reader's stream while writing.
        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

    # Equality (not truthiness) is kept on purpose so that values such as
    # None keep the backup exactly as before.
    if make_backup == False:  # noqa: E712
        os.unlink(bak_file)
def extract_information(pdf_path, page_number=92, audio_file="file.mp3"):
    """Read a PDF's document info and turn one page of it into speech.

    The text of page ``page_number`` is extracted, printed, and passed to
    gTTS, which saves the spoken version to ``audio_file``.

    :param pdf_path: path of the PDF to read.
    :param page_number: 0-based index of the page to read aloud. Defaults
        to 92, the page that used to be hard-coded.
    :param audio_file: path of the MP3 to create. Defaults to ``file.mp3``,
        the name that used to be hard-coded.
    :return: the document information object returned by
        ``PdfFileReader.getDocumentInfo()``.
    :raises IndexError: presumably, when the PDF has no page
        ``page_number`` (raised by ``getPage``) -- TODO confirm.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        # Extract once and reuse; extraction used to run twice per call.
        testread = pdf.getPage(page_number).extractText().strip()
        print(testread)

    # The text has always been printed a second time after the file is
    # closed; kept so the console output does not change.
    print(testread)

    # Pass the language by keyword: recent gTTS releases take ``tld`` as the
    # second positional argument, so a positional 'en' would be misread.
    tts = gTTS(testread, lang='en')
    tts.save(audio_file)
    return information
def pdf_metadata_load(pdf_file):
    """Return the document-information dictionary of ``pdf_file``.

    :param pdf_file: path of the PDF to inspect.
    :return: whatever ``PdfFileReader.documentInfo`` yields for the file
        (``None`` when the PDF carries no /Info dictionary).
    :raises OSError: if the file cannot be opened.
    """
    # Only the metadata is needed, so the pages are no longer copied into a
    # throw-away writer as they used to be.
    with open(pdf_file, 'rb') as fin:
        return PdfFileReader(fin).documentInfo
def merge_pdfs(paths, output):
    """Concatenate several PDFs, in the given order, into one file.

    :param paths: iterable of PDF sources accepted by ``PdfFileReader``.
    :param output: path of the merged PDF to create.
    """
    merged = PdfFileWriter()

    for source in paths:
        reader = PdfFileReader(source)
        # Lazily walk the pages of this source and append each in turn
        source_pages = (reader.getPage(index) for index in range(reader.getNumPages()))
        for source_page in source_pages:
            merged.addPage(source_page)

    # Everything is collected; dump the combined document to disk
    with open(output, 'wb') as sink:
        merged.write(sink)
def find_table_page(pdf: PdfFileReader) -> Tuple[str, int]:
    """
    Locate the page that holds the data table.

    Pages are scanned in order, starting from the second one, until the
    extracted text matches ``TABLE_CAPTION_PATTERN``.

    Returns a tuple with:
    - the text extracted from that page, as produced by ``extract_text``
    - the page number (0-based)

    Raises ``TableExtractionError`` when no page matches.
    """
    # Page 0 never holds the table, so scanning starts at index 1
    for page_index in range(1, pdf.getNumPages()):
        page_text = extract_text(pdf, page=page_index)
        if not TABLE_CAPTION_PATTERN.search(page_text):
            continue
        return page_text, page_index

    raise TableExtractionError('could not find the table in the pdf')
def pypdf3():
    """Stamp the watermark's first page onto every page of ``document``.

    Uses ``document``, ``watermark``, ``output_filename`` and ``self`` from
    the enclosing scope. When ``self.underneath`` is set the watermark is
    drawn on a blank page first and the document page is merged on top of
    it; otherwise the watermark is merged onto the document page itself.

    Returns ``output_filename`` after the stamped PDF has been written.

    NOTE(review): the previous docstring read "Much slower than PyPDF3
    method.", which looks like a copy/paste slip -- confirm what it was
    meant to compare against.
    """
    # Source document, destination writer and page count
    source = PdfFileReader(document)
    result = PdfFileWriter()
    total_pages = source.getNumPages()

    # The stamp is always the first page of the watermark file
    stamp_reader = PdfFileReader(watermark)
    wtrmrk_page = stamp_reader.getPage(0)
    offset_x = (wtrmrk_page.mediaBox.getWidth() / 2) + 0
    offset_y = (wtrmrk_page.mediaBox.getHeight() / 2) + 80
    if Info(stamp_reader).rotate is not None:
        angle = -int(Info(stamp_reader).rotate)
    else:
        angle = 0

    def apply_stamp(target):
        """Merge the watermark page onto ``target``."""
        if angle != 0:
            target.mergeRotatedTranslatedPage(wtrmrk_page, angle, offset_x, offset_y)
        else:
            # An unrotated watermark is merged without any offset
            target.mergeTranslatedPage(wtrmrk_page, 0, 0)

    for index in range(total_pages):
        if self.underneath:
            # Watermark first on a blank page, document page on top of it
            size = Info(source).dimensions
            stamped = PageObject().createBlankPage(source, size['w'], size['h'])
            apply_stamp(stamped)
            stamped.mergePage(source.getPage(index))
        else:
            stamped = source.getPage(index)
            apply_stamp(stamped)
        result.addPage(stamped)

    # Persist the stamped document
    with open(output_filename, "wb") as sink:
        result.write(sink)
    return output_filename
def main():
    """Command-line entry point: place consecutive pages side by side.

    Usage: ``python 2-up.py input_file output_file``.

    Pages are taken in pairs; the second page of each pair is merged onto
    the first, shifted right by the first page's upper-right x coordinate.
    When the document has an odd number of pages the last page is appended
    on its own instead of being dropped.

    Exits with status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)

    source_path, target_path = sys.argv[1], sys.argv[2]
    print("2-up input " + source_path)

    # The input stays open until the output has been written, because the
    # writer reads page data from it while writing.
    with open(source_path, "rb") as source:
        input1 = PdfFileReader(source)
        output = PdfFileWriter()
        page_total = input1.getNumPages()

        for index in range(0, page_total - 1, 2):
            lhs = input1.getPage(index)
            rhs = input1.getPage(index + 1)
            lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
            output.addPage(lhs)
            # Progress on a single line (works on Python 2 and 3 alike)
            sys.stdout.write(str(index) + " ")
            sys.stdout.flush()

        # An odd page count leaves one page without a partner
        if page_total % 2 == 1:
            output.addPage(input1.getPage(page_total - 1))

        print("writing " + target_path)
        # ``open`` replaces the Python-2-only ``file`` builtin
        with open(target_path, "wb") as outputStream:
            output.write(outputStream)

    print("done.")
def prepare(self):
    """Dump the input PDF's text to a file, flatten it, then transform it.

    Steps:
      1. extract the text of every page of ``self.inputPath`` and append it
         to a text file under ``./.TXT``;
      2. rewrite that file with all ``\\n`` and ``\\r`` characters removed;
      3. pass the file name to ``self.transform``.

    The text file is opened in append mode, so content already present in
    it is kept.
    """
    # The target path does not depend on the page, so compute it once. This
    # also keeps it defined when the PDF has no pages.
    # NOTE(review): ``hash`` may be the builtin or a project helper -- confirm.
    filename = join_paths("./.TXT", hash(self.inputPath))

    # Process PDF input file to raw text file
    with open(self.inputPath, "rb") as pdf_fh:
        reader = PdfFileReader(pdf_fh)
        total = reader.getNumPages()
        # One append handle for the whole run instead of one per page
        with open(filename, "a") as text_fh:
            for page in tqdm(range(0, reader.numPages)):
                page_text = reader.getPage(page).extractText()
                print("Reading page", page, "of", total)
                text_fh.write(page_text)

    # Cleaning the TEXT file for better processing
    with open(filename, "r") as text_fh:
        lines = text_fh.readlines()
    lines = [l.replace("\n", "").replace("\r", "") for l in lines]
    with open(filename, "w") as text_fh:
        text_fh.writelines(lines)

    print("Cleaning... => ", filename)
    self.transform(filename)
def pypdf3(self):
    """Re-draw every page of ``self.file_name`` scaled onto a new canvas.

    Each source page is merged onto a blank page of size
    ``self.target_w`` x ``self.target_h`` using ``self.scale`` and the
    ``self.margin_x`` / ``self.margin_y`` offsets. The result is written to
    ``self.output``, which is also the return value.
    """
    source = PdfFileReader(self.file_name)
    result = PdfFileWriter()

    for index in range(source.getNumPages()):
        original = source.getPage(index)
        # Fresh blank canvas of the target size for this page
        canvas = PageObject.createBlankPage(width=self.target_w, height=self.target_h)
        canvas.mergeScaledTranslatedPage(original, self.scale, self.margin_x, self.margin_y)
        result.addPage(canvas)

    with open(self.output, "wb") as sink:
        result.write(sink)
    return self.output
def reorder(input_filename: str, output_filename: str) -> None:
    """Write the pages of ``input_filename`` to ``output_filename`` in a new order.

    The order is the sequence of page indices returned by
    ``_make_sequence(number_of_pages)``.

    :param input_filename: path of an existing PDF.
    :param output_filename: path of the PDF to create; it must not exist yet.
    :raises AssertionError: if the input is missing or the output already
        exists. Raised explicitly so the checks also run under ``python -O``.
    """
    if not os.path.exists(input_filename):
        raise AssertionError("input file does not exist: {!r}".format(input_filename))
    if os.path.exists(output_filename):
        raise AssertionError("output file already exists: {!r}".format(output_filename))

    # Context managers close both files even when reading or writing fails.
    with open(input_filename, 'rb') as input_stream:
        output = PdfFileWriter()
        input_pdf = PdfFileReader(input_stream)
        for page_number in _make_sequence(input_pdf.getNumPages()):
            output.addPage(input_pdf.getPage(page_number))

        # Written while the input is still open: the writer reads page data
        # from the input stream at this point.
        with open(output_filename, "wb") as output_stream:
            output.write(output_stream)
def write_pdf(pdf_obj, destination):
    """
    Copy every page of a PDF object into a new file

    :param pdf_obj: PDF source handed to ``PdfFileReader``
    :param destination: Destination path
    """
    source = PdfFileReader(pdf_obj)
    copy = PdfFileWriter()

    # Transfer the pages one by one, keeping their order
    for index in range(source.getNumPages()):
        copy.addPage(source.getPage(index))

    # Persist the assembled document at the destination
    with open(destination, "wb") as sink:
        copy.write(sink)
def pdfHandler(file_dir):
    """Print a short report about the PDF at ``file_dir``.

    Prints the page count, the types of the form-field, document-info and
    XMP-metadata objects, every document-info entry, and every
    ``dc_relation`` value of the XMP metadata.

    :param file_dir: path of the PDF file to inspect.
    :raises OSError: if the file cannot be opened.
    """
    # The file is now closed when the report is done (it used to leak).
    with open(file_dir, 'rb') as stream:
        input1 = PdfFileReader(stream)
        # NOTE(review): the message names 'document1.pdf' regardless of
        # ``file_dir``; left untouched because it is program output.
        print('document1.pdf has {} pages.'.format(str(input1.getNumPages())))

        fields = input1.getFields()
        print(type(fields))

        documentInfo = input1.getDocumentInfo()
        print(type(documentInfo))
        if documentInfo is not None:
            for key in documentInfo.keys():
                print('{} : {}'.format(key, documentInfo.get(key)))

        metaData = input1.getXmpMetadata()
        print(type(metaData))
        if metaData is not None:
            for relation in metaData.dc_relation:
                print('relation: {}'.format(relation))
def add_encryption(path, encryptPath, fileDicts):
    """Write a password-protected copy of each listed PDF.

    :param path: directory containing the source PDFs.
    :param encryptPath: directory that receives the encrypted copies; each
        copy keeps the source file name.
    :param fileDicts: mapping of file name to the user password for that file.
    """
    for fileName in fileDicts:
        input_pdf = os.path.join(path, fileName)
        output_pdf = os.path.join(encryptPath, fileName)

        # A fresh writer per file: a shared writer would make every output
        # also contain the pages of all previously processed files.
        pdf_writer = PdfFileWriter()
        pdf_reader = PdfFileReader(input_pdf)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))
        pdf_writer.encrypt(user_pwd=fileDicts[fileName], owner_pwd=None, use_128bit=True)

        # Delete the output file if it already exists
        if os.path.exists(output_pdf):
            os.remove(output_pdf)
        with open(output_pdf, 'wb') as fh:
            pdf_writer.write(fh)
def slicer(document, first_page=None, last_page=None, suffix='sliced', tempdir=None):
    """Slice a PDF document to remove pages.

    Keeps pages ``first_page`` .. ``last_page`` (``first_page`` is 1-based
    and both ends are inclusive) and writes them to a new file.

    :param document: path of the source PDF.
    :param first_page: first page to keep; ``None`` means start at the
        first page.
    :param last_page: last page to keep; ``None`` means keep through the
        last page.
    :param suffix: suffix passed to ``add_suffix`` to build the output name
        next to ``document``; ignored when ``tempdir`` is given.
    :param tempdir: when set, the output is a new temporary ``.pdf`` file
        in this directory.
    :return: path of the sliced PDF.
    :raises AssertionError: if the requested range does not fit the document.
    """
    # Set output file name
    if tempdir:
        with NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False) as temp:
            output = temp.name
    elif suffix:
        output = os.path.join(os.path.dirname(document), add_suffix(document, suffix))
    else:
        with NamedTemporaryFile(suffix='.pdf') as temp:
            output = temp.name

    pages = Info(document).pages

    # Reindex page selections for simple user input. The old test was
    # ``if not None``, which is always true, so ``first_page=None`` crashed.
    start = first_page - 1 if first_page is not None else 0
    stop = last_page if last_page is not None else pages

    # Validate page range by comparing selection to number of pages in PDF document
    invalid = 'Number of pages: ' + str(
        pages) + ' ----> Page Range Input: ' + str(start) + '-' + str(
        stop)
    if not (start <= stop <= pages):
        # Same exception type as the former ``assert``, but not stripped by -O
        raise AssertionError(invalid)

    pdf = PdfFileReader(document)
    writer = PdfFileWriter()
    for page in list(range(pdf.getNumPages()))[start:stop]:
        writer.addPage(pdf.getPage(page))

    with open(output, 'wb') as out:
        writer.write(out)
    return output
def rename(pdf,doi):
    """Store ``doi`` in the ``/doi`` entry of a PDF's document information.

    All pages and the existing info entries are copied into a new PDF that
    also carries ``/doi``; the new file is written next to the original as
    ``<pdf>.temppdf`` and then moved over it.

    :param pdf: path of the PDF to update (replaced in place).
    :param doi: DOI string to store.
    :raises OSError: if the file cannot be read, written or replaced.
    """
    import os

    temppdf = pdf + '.temppdf'

    # Both handles are closed before the swap, and also when copying fails.
    with open(pdf, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()
        info = pdf_in.documentInfo
        # documentInfo is None for a PDF without an /Info dictionary
        if info is not None:
            for key in info:
                infoDict.update({NameObject(key): createStringObject(info[key])})
                # Show the key name without its leading '/'; ``key[0]`` only
                # ever printed that slash.
                print('{}:{}'.format(key[1:], info[key]))

        # add the DOI entry
        infoDict.update({NameObject('/doi'): createStringObject(u''+doi)})

        # It does not appear possible to alter in place, so write a sibling
        # file first.
        with open(temppdf, 'wb') as fout:
            writer.write(fout)

    # Single step instead of unlink + rename, so the original is never
    # missing in between.
    os.replace(temppdf, pdf)
    print('The DOI have been updated to:{0}'.format(doi))
def upscale(file_name, scale=1.5, margin_x=0, margin_y=0, suffix='scaled', tempdir=None):
    """Upscale a PDF to a large size.

    Every page is merged, scaled by ``scale`` and shifted by the margins,
    onto a blank page whose size is the document's dimensions times
    ``scale``.

    :param file_name: path of the source PDF.
    :param scale: scaling factor applied to the page size and content.
    :param margin_x: horizontal offset passed to the merge.
    :param margin_y: vertical offset passed to the merge.
    :param suffix: suffix passed to ``add_suffix`` to build the output name
        next to ``file_name``; ignored when ``tempdir`` is given.
    :param tempdir: when set, the output is a new temporary ``.pdf`` file
        in this directory.
    :return: path of the scaled PDF.
    """
    # Set output file name. The temporary files are only used to reserve a
    # name, so their handles are closed right away.
    if tempdir:
        with NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False) as temp:
            output = temp.name
    elif suffix:
        output = os.path.join(os.path.dirname(file_name), add_suffix(file_name, suffix))
    else:
        with NamedTemporaryFile(suffix='.pdf') as temp:
            output = temp.name

    reader = PdfFileReader(file_name)
    writer = PdfFileWriter()
    dims = dimensions(file_name)
    target_w = dims['w'] * scale
    target_h = dims['h'] * scale

    # Number of pages in input document
    page_count = reader.getNumPages()
    for page_number in range(page_count):
        wtrmrk = reader.getPage(page_number)
        page = PageObject.createBlankPage(width=target_w, height=target_h)
        page.mergeScaledTranslatedPage(wtrmrk, scale, margin_x, margin_y)
        writer.addPage(page)

    with open(output, "wb") as outputStream:
        writer.write(outputStream)
    return output
from PyPDF3 import PdfFileWriter, PdfFileReader

# Example script: build a new PDF from the pages of document1.pdf,
# applying a different transformation to each of the first five pages.
output = PdfFileWriter()
input1 = PdfFileReader(open("document1.pdf", "rb"))

# print how many pages input1 has
# (function-call form, so the script also parses on Python 3):
print("document1.pdf has %d pages." % input1.getNumPages())

# add page 1 from input1 to output document, unchanged
output.addPage(input1.getPage(0))

# add page 2 from input1, but rotated clockwise 90 degrees
output.addPage(input1.getPage(1).rotateClockwise(90))

# add page 3 from input1, rotated the other way:
output.addPage(input1.getPage(2).rotateCounterClockwise(90))
# alt: output.addPage(input1.getPage(2).rotateClockwise(270))

# add page 4 from input1, but first add a watermark from another PDF:
page4 = input1.getPage(3)
watermark = PdfFileReader(open("watermark.pdf", "rb"))
page4.mergePage(watermark.getPage(0))
output.addPage(page4)

# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(4)
page5.mediaBox.upperRight = (
    page5.mediaBox.getUpperRight_x() / 2,
    page5.mediaBox.getUpperRight_y() / 2
)
output.addPage(page5)

# add some Javascript to launch the print window on opening this PDF.
#tabloid - landscape page_actual_height = 11*72 page_actual_width = 17*72 else: print("error. We only do tabloid and landscape") pdb.set_trace() pdfFileObj = open(infile_path, 'rb') # creating a pdf Reader object pdfReader = PdfFileReader(pdfFileObj) # creating a pdf writer object for new pdf pdfWriter = PdfFileWriter() num_pages = pdfReader.getNumPages() #create a list of pages pages = [pdfReader.getPage(k) for k in range(num_pages)] trimboxes = [k.trimBox for k in pages] page_width = trimboxes[0].getWidth() page_height = trimboxes[0].getHeight() #****************** #set the total height of the pdf # page_buffer_tabloid = 52 # page_ybuffer_letter = 0 # page_xbuffer_letter = 55.5 # total_height = num_pages * page_actual_height + page_ybuffer_letter
from PyPDF3 import PdfFileWriter, PdfFileReader
import sys

# Command-line script: copy a PDF while leaving out selected pages.
#   args: infile p1 p2 .. pn outfile
# The page numbers are compared directly with the reader's 0-based page
# indices.

# Show the usage when asked for it, and also when no outfile was given:
# with a single argument the input used to double as the output path.
if len(sys.argv) < 3 or sys.argv[1] == '-h':
    print('''args: infile p1 p2 .. pn outfile
Program outputs outfile with p1, p2, ..., pn removed from infile.''')
    sys.exit()

# The second positional parameter of PdfFileReader is ``strict``, not a file
# mode; the truthy 'rb' that used to be passed there meant strict=True.
infile = PdfFileReader(sys.argv[1], strict=True)
outfile = PdfFileWriter()

# A set makes the result independent of the order of the page arguments and
# of duplicates; the previous pointer walk silently dropped extra pages in
# both cases.
page_del = set(map(int, sys.argv[2:-1]))

for i in range(infile.getNumPages()):
    if i not in page_del:
        outfile.addPage(infile.getPage(i))

with open(sys.argv[-1], 'wb') as f:
    outfile.write(f)
def zipper(opts, cord_path, base_path, rec_path, rec_pdf_exists, output_name, matching, empty_path):
    """Interleave coordination, base and recommended PDF pages into one PDF.

    The coordination PDF may be longer than the base PDF; the surplus pages
    at its front ("leader" pages) are copied first. The remaining pages are
    combined in one of two ways:

    * ``matching`` true: each coordination page is followed by the base (and
      recommended) page whose text names the same TCC as the first
      ``TCC Curve`` entry found on the coordination page;
    * ``matching`` false: pages are combined by position.

    The result is written next to the coordination PDF.

    :param opts: mapping that must contain the key ``'output'``; a truthy
        value is used as the output file name.
    :param cord_path: path of the coordination PDF.
    :param base_path: path of the base PDF.
    :param rec_path: path of the recommended PDF (used only when
        ``rec_pdf_exists`` is true).
    :param rec_pdf_exists: whether a recommended PDF is available.
    :param output_name: name fragment for the default output file name.
    :param matching: select text matching (true) or positional zipping.
    :param empty_path: path of a PDF whose first page is used as filler.
    :raises SystemExit: with code -7 when the coordination PDF is shorter
        than the base PDF.
    """
    from contextlib import ExitStack

    def convert_quietly(path, label):
        """Log the conversion, then run it with INFO logging suppressed."""
        # Log first, then disable: the "Recommended" message used to be
        # emitted after INFO had been disabled and so never appeared.
        logger.info("Converting %s PDF to string", label)
        logging.disable(logging.INFO)
        try:
            return pdf_pages_to_list_of_strings(path)
        finally:
            # Always restore logging, even if the conversion fails
            logging.disable(logging.NOTSET)

    regex_cord = r'(TCC Curve: )(TCC_[\d]+[a-zA-Z]?)([-_#$\w\d\[\] ]*)'
    regex_base_rec = r'(TCC Name: )(TCC_[\d]+[a-zA-Z]?)([-_#$\w\d\[\] ]*)'

    # ######### PDF Write Setup ######### #
    # The ExitStack closes every input file on all exit paths, including
    # the early SystemExit below.
    with ExitStack() as stack:
        def open_reader(path):
            """Open ``path`` and wrap it in a non-strict reader."""
            return PdfFileReader(stack.enter_context(open(path, 'rb')), False)

        cord_pdf = open_reader(cord_path)
        base_pdf = open_reader(base_path)
        rec_pdf = open_reader(rec_path) if rec_pdf_exists else None
        empty_pdf = open_reader(empty_path)

        cord_count = cord_pdf.getNumPages()
        base_count = base_pdf.getNumPages()

        # The coordination PDF carries leader pages at the front that are not
        # sliced in, so it must be at least as long as the base PDF.
        if cord_count < base_count:
            prompt = 'Coordination PDF is shorter than the Base PDF'
            eprint(prompt)
            logger.critical(prompt)
            # Same effect as exit(-7) without relying on the site builtin
            raise SystemExit(-7)

        # The difference in length is the number of leader pages
        diff_length = cord_count - base_count
        logger.info('Diff Length: %s', str(diff_length))

        output = PdfFileWriter()
        for ii in range(diff_length):
            output.addPage(cord_pdf.getPage(ii))

        if matching:
            cord_str_pages = convert_quietly(cord_path, "Coordination")
            base_str_pages = convert_quietly(base_path, "Base")
            rec_str_pages = convert_quietly(rec_path, "Recommended") if rec_pdf_exists else []

            for ii in range(diff_length, len(cord_str_pages)):
                output.addPage(cord_pdf.getPage(ii))
                tcc_matches = re.finditer(regex_cord, cord_str_pages[ii], re.MULTILINE)
                # NOTE(review): only the first TCC match of a page is
                # processed (the loop ends with an unconditional break), and
                # the filler page is added when no recommended page is looked
                # up -- confirm both against the intended layout.
                for tcc_match in tcc_matches:
                    tcc_name = tcc_match.group(2)
                    logger.info("Attempting to find: %s", tcc_name)

                    base_num = find_matching_page(tcc_name, base_str_pages, regex_base_rec, 'Base PDF')
                    if base_num != -1:
                        logger.info('Found on base page: %s', str(base_num))

                    rec_page_flag = check_for_rec(cord_str_pages[ii])
                    rec_num = 0
                    if rec_pdf_exists and rec_page_flag:
                        rec_num = find_matching_page(tcc_name, rec_str_pages, regex_base_rec, 'Rec PDF')
                        if rec_num != -1:
                            logger.info('Found on rec page: %s', str(rec_num))
                    else:
                        output.addPage(empty_pdf.getPage(0))

                    if base_num > 0:
                        output.addPage(base_pdf.getPage(base_num))
                    if rec_num > 0:
                        output.addPage(rec_pdf.getPage(rec_num))
                    break
        else:
            # Positional zipping: page jj of base/rec follows the
            # coordination page at the same offset past the leader pages.
            for jj in range(base_count):
                output.addPage(cord_pdf.getPage(jj + diff_length))
                output.addPage(base_pdf.getPage(jj))
                if rec_pdf_exists:
                    output.addPage(rec_pdf.getPage(jj))

        # Finally, output everything to the PDF.
        # An explicit name in opts wins; otherwise the name is derived from
        # ``output_name``. The file always lands next to the coordination PDF.
        if opts['output']:
            output_name = opts['output']
        else:
            output_name = "8.0 - Coordination Results & Recommendations_" + output_name + "2018_NEW.pdf"
        output_name = os.path.join(os.path.dirname(os.path.abspath(cord_path)), output_name)

        # Written while the inputs are still open: the writer reads page data
        # from them at this point.
        with open(output_name, "wb") as w:
            output.write(w)