def test_split(self):
    """Splitting the sample PDF yields one file per page plus ``doc_data.txt``."""
    expected_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
    produced = pypdftk.split(TEST_PDF_PATH)

    # The returned list holds one extra entry besides the pages: the
    # document-info text file, which this test expects in first position.
    self.assertEqual(len(produced) - 1, expected_pages)
    first_entry = produced[0]
    self.assertTrue('doc_data.txt' in first_entry)

    # Every reported path must actually exist on disk.
    for produced_path in produced:
        self.assertTrue(os.path.exists(produced_path))
def test_split_output_dir(self):
    """Splitting into an explicit ``out_dir`` writes every file there.

    Checks that the number of returned paths is the page count plus one
    (the extra entry being pdftk's document-info file) and that each
    returned file really exists inside the requested output directory.
    """
    import shutil  # local import: only needed to clean up the temp dir

    output_dir = mkdtemp()
    # Remove the scratch directory even when an assertion below fails.
    self.addCleanup(shutil.rmtree, output_dir, ignore_errors=True)

    total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
    paths = pypdftk.split(TEST_PDF_PATH, out_dir=output_dir)
    self.assertEqual(len(paths) - 1, total_pages)
    for p in paths:
        out_path = os.path.join(output_dir, os.path.basename(p))
        # Previously this asserted the truthiness of the path string itself,
        # which is always true; check for the file on disk instead.
        self.assertTrue(os.path.exists(out_path))
def run(idir, bdir, ofile):
    """Concatenate the first page of every locally available paper PDF.

    Reads the paper records via ``read_all_info(idir)``, and for each record
    whose ``pubTypeSlot`` is ``'Conference'`` or ``'Journal'`` and whose
    ``pdfLink`` is a non-http path, fetches ``bdir + "/" + pdfLink`` with
    ``get_pdf``, splits it into pages, and queues its ``page_01.pdf``.
    All queued first pages are concatenated into ``ofile``.

    Records without ``pdfLink`` or with an http link are reported on stdout
    and skipped. The scratch directory is removed even if a step fails.
    """
    _authors, _venues, papers = read_all_info(idir)
    fpdf_names = []
    tmpdirname = tempfile.mkdtemp()
    try:
        for p in papers:
            if p['pubTypeSlot'] not in ('Conference', 'Journal'):
                continue
            if 'pdfLink' not in p:
                print("pdfLink missing:", p['id'])
                continue
            if p['pdfLink'].startswith("http"):
                print("local link missing:", p['id'])
                continue

            source = bdir + "/" + p['pdfLink']
            # Index of this paper among the ones actually processed; used to
            # give each paper its own file and page directory.
            i = len(fpdf_names)
            dest = "%s/%d.pdf" % (tmpdirname, i)
            print("getting %s, putting it %s" % (source, dest))
            get_pdf(source, dest)

            tdir = "%s/%d/" % (tmpdirname, i)
            os.mkdir(tdir)
            # The first page is queued before the split creates it.
            fpdf_names.append(tdir + "page_01.pdf")
            pypdftk.split(dest, tdir)

        pypdftk.concat(fpdf_names, out_file=ofile)
    finally:
        # Always drop the scratch directory, including on failure.
        shutil.rmtree(tmpdirname)
def split(input_filename):
    """Burst ``input_filename`` into single-page PDFs in a new temp directory.

    The directory is created under the current working directory. Anything
    the split produces that is not a ``.pdf`` (PDFtk also emits a document
    info text file) is deleted.

    Returns a tuple ``(directory, pdfs)`` where ``pdfs`` is the list of page
    files ordered by the page number embedded in their names.
    """
    pdf_directory = tempfile.mkdtemp(dir="./")

    pdfs = []
    for produced in pypdftk.split(input_filename, pdf_directory):
        extension = produced.rsplit(".", 1)[1]
        if extension == "pdf":
            pdfs.append(produced)
            continue
        # Not a page: remove the stray non-PDF output.
        os.remove(produced)

    def page_number(path):
        # Digits sit between the last 'page_' marker and the '.pdf' suffix.
        start = path.rfind('page_') + 5
        return int(path[start:-4])

    pdfs.sort(key=page_number)
    return (pdf_directory, pdfs)
def test_split():
    """Smoke test: burst the fixed sample PDF into per-page files in ``./Out``."""
    pypdftk.split("./Out/some_file.pdf", "./Out")
def print_f99_pdftk_html(
    stamp_print="",
    paginate=False,
    begin_image_num=None,
    page_count=False,
    file_content=None,
    silent_print=False,
    filing_timestamp=None,
    rep_id=None,
    attachment_file_content=None,
):
    """Render an F99 filing to a page-numbered PDF through an HTML template.

    The filing JSON comes either from ``file_content`` (when ``page_count``
    is set, or when ``paginate``/``silent_print`` is set together with
    ``begin_image_num``) or from the uploaded ``json_file`` of the current
    Flask request. In the request case ``silent_print``, ``page_count``,
    ``begin_image_num`` and ``filing_timestamp`` are read from the form.

    The template is filled with the filing data, converted to PDF with
    pdfkit, split into pages, and each page is stamped with a
    ``PAGE n / total`` overlay (plus image number and, on page one, the
    filing timestamp when ``silent_print`` is on). An optional attachment is
    numbered the same way and appended. Unless ``page_count`` or
    ``paginate`` is set, the result is uploaded to S3.

    ``stamp_print`` is accepted for interface compatibility and not used.

    Returns:
        ``(flask response, status code)`` on the handled paths, or the
        result of ``error(...)`` when any exception is raised.
    """
    try:
        txn_img_num = begin_image_num

        # True when the filing JSON was handed in directly, not uploaded.
        uses_inline_content = bool(
            file_content
            and (page_count or ((paginate or silent_print) and begin_image_num))
        )

        if uses_inline_content or (
                not paginate and "json_file" in request.files):
            if uses_inline_content:
                json_file_md5 = md5_for_text(file_content)
                json_data = json.loads(file_content)
            else:
                json_file = request.files.get("json_file")

                def form_flag(name):
                    # Form booleans arrive as text: "true"/"1" mean on.
                    return (request.form.get(name) or "").lower() in [
                        "true", "1"]

                silent_print = form_flag("silent_print")
                page_count = form_flag("page_count")

                if silent_print:
                    txn_img_num = request.form.get("begin_image_num", None)
                    if not txn_img_num:
                        if flask.request.method == "POST":
                            envelope = common.get_return_envelope(
                                "false",
                                "begin_image_num is missing from your request")
                            status_code = status.HTTP_400_BAD_REQUEST
                            return flask.jsonify(**envelope), status_code
                    txn_img_num = int(txn_img_num)
                    filing_timestamp = request.form.get(
                        "filing_timestamp", None)

                json_file_md5 = md5_for_file(json_file)
                json_file.stream.seek(0)

                # Save the upload under its md5 name, then load it back.
                request_path = current_app.config[
                    "REQUEST_FILE_LOCATION"].format(json_file_md5)
                json_file.save(request_path)
                with open(request_path) as request_fh:
                    json_data = json.load(request_fh)

            md5_directory = current_app.config["OUTPUT_DIR_LOCATION"].format(
                json_file_md5)

            # Remember whether the directory pre-existed: in paginate /
            # page_count mode only a directory created here is removed.
            is_dir_exist = os.path.isdir(md5_directory)
            os.makedirs(md5_directory, exist_ok=True)

            if not os.path.exists(md5_directory + "images"):
                shutil.copytree("templates/forms/F99/images",
                                md5_directory + "images")
            shutil.copyfile("templates/forms/F99/form-text.css",
                            md5_directory + "form-text.css")

            infile = current_app.config["HTML_FORM_TEMPLATES_LOCATION"].format(
                "template")
            outfile = md5_directory + json_file_md5 + ".html"
            form99_json_data = json_data["data"]

            with open(infile) as inf:
                txt = inf.read()
            soup = bs4.BeautifulSoup(txt, features="html5lib")

            # Fields copied verbatim into the element with the same id.
            for tag, field in (
                    ("label", "committeeName"),
                    ("label", "street1"),
                    ("label", "street2"),
                    ("label", "city"),
                    ("label", "state"),
                    ("label", "zipCode"),
                    ("span", "committeeId"),
            ):
                soup.find(tag, attrs={"id": field}).string = (
                    form99_json_data[field])

            # Full treasurer name: the non-empty name parts, comma separated.
            name_list = [
                "LastName", "FirstName", "MiddleName", "Prefix", "Suffix"
            ]
            treasurer_parts = [
                form99_json_data.get("treasurer" + item)
                for item in name_list
                if form99_json_data.get("treasurer" + item)
            ]
            soup.find("label", attrs={
                "id": "treasurerFullName"
            }).string = ", ".join(treasurer_parts)

            soup.find("label", attrs={
                "id": "treasurerName"
            }).string = ((form99_json_data.get("treasurerLastName", "") + ", "
                          + form99_json_data.get("treasurerFirstName", "")
                          ).strip().rstrip(",").strip())

            f99_html_data = form99_json_data["text"]
            soup.find("label", attrs={"id": "text"}).string = f99_html_data
            # The reason value is the id of the label to tick.
            soup.find("label", attrs={
                "id": form99_json_data["reason"]
            }).string = "X"

            date_array = form99_json_data["dateSigned"].split("/")
            soup.find("span", attrs={
                "id": "dateSignedMonth"
            }).string = str(date_array[0])
            soup.find("span", attrs={
                "id": "dateSignedDate"
            }).string = str(date_array[1])
            soup.find("span", attrs={
                "id": "dateSignedYear"
            }).string = str(date_array[2])

            with open(outfile, "w") as output_file:
                # Assigning markup through ``.string`` escapes its angle
                # brackets; turn them back so the text renders as HTML.
                output_file.write(
                    str(soup).replace("&lt;", "<").replace("&gt;", ">"))

            # F99 PDF page padding options
            options = {
                "margin-top": "0.40in",
                "margin-right": "0.20in",
                "margin-bottom": "0.40in",
                "margin-left": "0.20in",
            }
            pdfkit.from_file(outfile,
                             md5_directory + json_file_md5 + ".pdf",
                             options=options)

            total_no_of_pages = pypdftk.get_num_pages(
                md5_directory + json_file_md5 + ".pdf")

            def by_page_number(name):
                # os.listdir order is arbitrary; shorter names first, then
                # alphabetical, gives page_01 ... page_99, page_100 order.
                return (len(name), name)

            has_attachment = bool(
                ((paginate or page_count) and attachment_file_content)
                or (not paginate and "attachment_file" in request.files))

            if has_attachment:
                attachment_title_file = current_app.config[
                    "FORM_TEMPLATES_LOCATION"].format("Attachment_Title")

                if (paginate or page_count) and attachment_file_content:
                    # NOTE(review): the parsed JSON value is used below as
                    # an object with ``.save()`` - confirm what callers pass.
                    attachment_file = json.loads(attachment_file_content)
                else:
                    attachment_file = request.files.get("attachment_file")

                attachment_file.save(
                    os.path.join(md5_directory + "attachment_temp.pdf"))
                os.makedirs(md5_directory + "attachment", exist_ok=True)
                os.makedirs(md5_directory + "final_attachment", exist_ok=True)
                pypdftk.split(md5_directory + "attachment_temp.pdf",
                              md5_directory + "attachment")
                os.remove(md5_directory + "attachment/doc_data.txt")

                attachment_no_of_pages = pypdftk.get_num_pages(
                    os.path.join(md5_directory + "attachment_temp.pdf"))
                # Attachment pages are numbered after the F99 pages.
                attachment_page_no = total_no_of_pages
                total_no_of_pages += attachment_no_of_pages

                for filename in sorted(
                        os.listdir(md5_directory + "attachment"),
                        key=by_page_number):
                    attachment_page_no += 1
                    page_dict = {
                        "PAGESTR": ("PAGE " + str(attachment_page_no) + " / "
                                    + str(total_no_of_pages)),
                    }
                    if silent_print:
                        page_dict["IMGNO"] = txn_img_num + attachment_page_no

                    title_page = (md5_directory
                                  + "attachment/attachment_page_"
                                  + str(attachment_page_no) + ".pdf")
                    # Pass the field values; without them the output path
                    # was taken as the form data.
                    pypdftk.fill_form(attachment_title_file, page_dict,
                                      title_page)
                    pypdftk.stamp(
                        md5_directory + "attachment/" + filename,
                        title_page,
                        md5_directory + "final_attachment/attachment_"
                        + str(attachment_page_no) + ".pdf",
                    )

                pypdftk.concat(
                    directory_files(md5_directory + "final_attachment/"),
                    md5_directory + "attachment.pdf",
                )
                os.remove(md5_directory + "attachment_temp.pdf")

            os.makedirs(md5_directory + "pages", exist_ok=True)
            os.makedirs(md5_directory + "final_pages", exist_ok=True)
            pypdftk.split(md5_directory + json_file_md5 + ".pdf",
                          md5_directory + "pages")
            os.remove(md5_directory + "pages/doc_data.txt")

            page_number_file = current_app.config[
                "FORM_TEMPLATES_LOCATION"].format("Page_Number")

            f99_page_no = 1
            for filename in sorted(os.listdir(md5_directory + "pages"),
                                   key=by_page_number):
                page_dict = {
                    "PAGESTR": ("PAGE " + str(f99_page_no) + " / "
                                + str(total_no_of_pages)),
                }
                if silent_print:
                    page_dict["IMGNO"] = txn_img_num
                    txn_img_num += 1
                    # need to print timestamp on first page only
                    if filing_timestamp and f99_page_no == 1:
                        page_dict["FILING_TIMESTAMP"] = filing_timestamp

                page_number_pdf = (md5_directory + "pages/page_number_"
                                   + str(f99_page_no).zfill(6) + ".pdf")
                pypdftk.fill_form(page_number_file, page_dict,
                                  page_number_pdf)
                pypdftk.stamp(
                    page_number_pdf,
                    md5_directory + "pages/" + filename,
                    md5_directory + "final_pages/page_"
                    + str(f99_page_no).zfill(6) + ".pdf",
                )
                f99_page_no += 1

            numbered_pdf = json_file_md5 + "_temp.pdf"
            pypdftk.concat(
                directory_files(md5_directory + "final_pages/"),
                numbered_pdf,
            )

            if has_attachment:
                pypdftk.concat(
                    [numbered_pdf, md5_directory + "attachment.pdf"],
                    md5_directory + "all_pages.pdf",
                )
                shutil.rmtree(md5_directory + "attachment")
                shutil.rmtree(md5_directory + "final_attachment")
                os.remove(md5_directory + "attachment.pdf")
                # The intermediate file is no longer needed once merged.
                os.remove(numbered_pdf)
            else:
                shutil.move(numbered_pdf, md5_directory + "all_pages.pdf")

            # clean up task
            shutil.rmtree(md5_directory + "pages")
            shutil.rmtree(md5_directory + "final_pages")
            os.remove(md5_directory + json_file_md5 + ".pdf")

            response = {
                "total_pages": total_no_of_pages,
            }

            if not page_count and not paginate:
                s3 = boto3.client("s3")
                extraArgs = {
                    "ContentType": "application/pdf",
                    "ACL": "public-read"
                }

                if silent_print:
                    # str() keeps the URL consistent with the uploaded key
                    # below when rep_id is not already a string.
                    response["pdf_url"] = (current_app.config['S3_FILE_URL']
                                           + str(rep_id) + '.pdf')
                    s3.upload_file(
                        md5_directory + 'all_pages.pdf',
                        current_app.config[
                            'AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
                        current_app.config['AWS_FECFILE_OUTPUT_DIRECTORY']
                        + '/' + str(rep_id) + '.pdf',
                        ExtraArgs=extraArgs)
                else:
                    # NOTE(review): the trailing comma makes this a
                    # one-element tuple, so the JSON carries a list here but
                    # a plain string in the silent_print branch. Left as is
                    # in case clients rely on it - confirm and unify.
                    response["pdf_url"] = (
                        current_app.config["PRINT_OUTPUT_FILE_URL"].format(
                            json_file_md5) + "all_pages.pdf",
                    )
                    s3.upload_file(
                        md5_directory + "all_pages.pdf",
                        current_app.config[
                            "AWS_FECFILE_COMPONENTS_BUCKET_NAME"],
                        md5_directory + "all_pages.pdf",
                        ExtraArgs=extraArgs,
                    )
            else:
                if not is_dir_exist:
                    shutil.rmtree(md5_directory)
                if paginate:
                    txn_img_json = {
                        "summary": {
                            "committeeId":
                            form99_json_data.get("committeeId", None),
                            "begin_image_num": begin_image_num,
                            "end_image_num": txn_img_num
                        }
                    }
                    response["txn_img_json"] = txn_img_json

            envelope = common.get_return_envelope(data=response)
            status_code = (status.HTTP_200_OK if page_count or paginate else
                           status.HTTP_201_CREATED)
            return flask.jsonify(**envelope), status_code

        else:
            if paginate or page_count or silent_print:
                envelope = common.get_return_envelope(False, "")
            else:
                envelope = common.get_return_envelope(
                    False, "json_file is missing from your request")
            return flask.jsonify(**envelope), status.HTTP_400_BAD_REQUEST
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure.
        traceback.print_exception(*sys.exc_info())
        return error("Error generating print preview, error message: "
                     + str(e))
def page_pdfs(self):
    """Return the split output for ``self.fn``, reusing a stored result.

    The split is run whenever ``self._page_pdfs`` is empty or unset-like
    (falsy); otherwise the stored value is returned as is.
    """
    import pypdftk

    cached = self._page_pdfs
    if cached:
        return cached
    self._page_pdfs = pypdftk.split(self.fn)
    return self._page_pdfs
def print_f99_pdftk_html(stamp_print):
    """Render the uploaded F99 JSON to a page-numbered PDF and upload it.

    Requires a ``json_file`` upload on the current Flask request. The F99
    HTML template is filled from the JSON ``data`` section, converted to PDF
    with pdfkit, split into pages, and each page is stamped with a
    ``PAGE n / total`` overlay. An optional ``attachment_file`` upload is
    numbered the same way and appended. The final ``all_pages.pdf`` is
    uploaded to S3.

    ``stamp_print`` is accepted for interface compatibility and not used.

    Returns:
        ``(flask response, status code)`` for POST requests: 201 with the
        PDF URL, or 400 when ``json_file`` is missing. Other request
        methods fall through and return ``None``.
    """
    if 'json_file' in request.files:
        json_file = request.files.get('json_file')

        # generate md5 for json file
        json_file_md5 = utils.md5_for_file(json_file)
        json_file.stream.seek(0)
        md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
            json_file_md5)
        os.makedirs(md5_directory, exist_ok=True)

        if not os.path.exists(md5_directory + "images"):
            shutil.copytree("templates/forms/F99/images",
                            md5_directory + "images")
        shutil.copyfile("templates/forms/F99/form-text.css",
                        md5_directory + "form-text.css")

        infile = current_app.config['HTML_FORM_TEMPLATES_LOCATION'].format(
            'template')
        request_path = current_app.config['REQUEST_FILE_LOCATION'].format(
            json_file_md5)
        json_file.save(request_path)
        outfile = md5_directory + json_file_md5 + '.html'
        with open(request_path) as request_fh:
            json_data = json.load(request_fh)
        form99_json_data = json_data['data']

        # load the template
        with open(infile) as inf:
            txt = inf.read()
        soup = bs4.BeautifulSoup(txt)

        # Fields copied verbatim into the element with the same id.
        for tag, field in (
                ('label', 'committeeName'),
                ('label', 'street1'),
                ('label', 'street2'),
                ('label', 'city'),
                ('label', 'state'),
                ('label', 'zipCode'),
                ('span', 'committeeId'),
        ):
            soup.find(tag, attrs={'id': field}).string = (
                form99_json_data[field])

        soup.find('label', attrs={
            'id': 'treasurerFullName'
        }).string = ', '.join([
            form99_json_data['treasurerLastName'],
            form99_json_data['treasurerFirstName'],
            form99_json_data['treasurerMiddleName'],
            form99_json_data['treasurerPrefix'],
            form99_json_data['treasurerSuffix'],
        ])
        soup.find('label', attrs={
            'id': 'treasurerName'
        }).string = (form99_json_data['treasurerLastName'] + ', '
                     + form99_json_data['treasurerFirstName'])

        f99_html_data = form99_json_data['text']
        soup.find('label', attrs={'id': 'text'}).string = f99_html_data
        # The reason value is the id of the label to tick.
        soup.find('label', attrs={
            'id': form99_json_data['reason']
        }).string = 'X'

        date_array = form99_json_data['dateSigned'].split("/")
        soup.find('span', attrs={
            'id': 'dateSignedMonth'
        }).string = str(date_array[0])
        soup.find('span', attrs={
            'id': 'dateSignedDate'
        }).string = str(date_array[1])
        soup.find('span', attrs={
            'id': 'dateSignedYear'
        }).string = str(date_array[2])

        with open(outfile, "w") as output_file:
            # Assigning markup through ``.string`` escapes its angle
            # brackets; turn them back so the text renders as HTML.
            output_file.write(
                str(soup).replace("&lt;", "<").replace("&gt;", ">"))

        # F99 PDF page padding options
        options = {
            'margin-top': '0.36in',
            'margin-right': '0.25in',
            'margin-bottom': '0.39in',
            'margin-left': '0.25in'
        }
        pdfkit.from_file(outfile,
                         md5_directory + json_file_md5 + '.pdf',
                         options=options)

        total_no_of_pages = pypdftk.get_num_pages(
            md5_directory + json_file_md5 + '.pdf')
        page_number_file = current_app.config[
            'FORM_TEMPLATES_LOCATION'].format('Page_Number')

        def by_page_number(name):
            # os.listdir order is arbitrary; shorter names first, then
            # alphabetical, gives page_01 ... page_99, page_100 order.
            return (len(name), name)

        has_attachment = 'attachment_file' in request.files

        if has_attachment:
            # reading Attachment title file
            attachment_title_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
            attachment_file = request.files.get('attachment_file')
            attachment_file.save(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            os.makedirs(md5_directory + 'attachment', exist_ok=True)
            os.makedirs(md5_directory + 'final_attachment', exist_ok=True)
            pypdftk.split(md5_directory + 'attachment_temp.pdf',
                          md5_directory + 'attachment')
            os.remove(md5_directory + 'attachment/doc_data.txt')

            attachment_no_of_pages = pypdftk.get_num_pages(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            # Attachment pages are numbered after the F99 pages.
            attachment_page_no = total_no_of_pages
            total_no_of_pages += attachment_no_of_pages

            for filename in sorted(os.listdir(md5_directory + 'attachment'),
                                   key=by_page_number):
                attachment_page_no += 1
                title_page = (md5_directory + 'attachment/attachment_page_'
                              + str(attachment_page_no) + '.pdf')
                pypdftk.fill_form(
                    attachment_title_file, {
                        "PAGESTR":
                        "PAGE " + str(attachment_page_no) + " / "
                        + str(total_no_of_pages)
                    }, title_page)
                pypdftk.stamp(
                    md5_directory + 'attachment/' + filename,
                    title_page,
                    md5_directory + 'final_attachment/attachment_'
                    + str(attachment_page_no) + '.pdf')

            pypdftk.concat(
                directory_files(md5_directory + 'final_attachment/'),
                md5_directory + 'attachment.pdf')
            os.remove(md5_directory + 'attachment_temp.pdf')

        os.makedirs(md5_directory + 'pages', exist_ok=True)
        os.makedirs(md5_directory + 'final_pages', exist_ok=True)
        pypdftk.split(md5_directory + json_file_md5 + '.pdf',
                      md5_directory + 'pages')
        os.remove(md5_directory + 'pages/doc_data.txt')

        f99_page_no = 1
        for filename in sorted(os.listdir(md5_directory + 'pages'),
                               key=by_page_number):
            page_number_pdf = (md5_directory + 'pages/page_number_'
                               + str(f99_page_no) + '.pdf')
            pypdftk.fill_form(
                page_number_file, {
                    "PAGESTR":
                    "PAGE " + str(f99_page_no) + " / "
                    + str(total_no_of_pages)
                }, page_number_pdf)
            pypdftk.stamp(
                page_number_pdf,
                md5_directory + 'pages/' + filename,
                md5_directory + 'final_pages/page_' + str(f99_page_no)
                + '.pdf')
            f99_page_no += 1

        numbered_pdf = json_file_md5 + '_temp.pdf'
        pypdftk.concat(directory_files(md5_directory + 'final_pages/'),
                       numbered_pdf)

        if has_attachment:
            pypdftk.concat(
                [numbered_pdf, md5_directory + 'attachment.pdf'],
                md5_directory + 'all_pages.pdf')
            shutil.rmtree(md5_directory + 'attachment')
            shutil.rmtree(md5_directory + 'final_attachment')
            os.remove(md5_directory + 'attachment.pdf')
            # The intermediate file is no longer needed once merged.
            os.remove(numbered_pdf)
        else:
            shutil.move(numbered_pdf, md5_directory + 'all_pages.pdf')

        # clean up task
        shutil.rmtree(md5_directory + 'pages')
        shutil.rmtree(md5_directory + 'final_pages')
        os.remove(md5_directory + json_file_md5 + '.pdf')

        # push output file to AWS
        s3 = boto3.client('s3')
        s3.upload_file(
            md5_directory + 'all_pages.pdf',
            current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
            md5_directory + 'all_pages.pdf',
            ExtraArgs={
                'ContentType': "application/pdf",
                'ACL': "public-read"
            })
        response = {
            'pdf_url':
            current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5)
            + 'all_pages.pdf'
        }

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(data=response)
            status_code = status.HTTP_201_CREATED
            return flask.jsonify(**envelope), status_code
    else:
        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code
def print_f99_pdftk(stamp_print):
    """Fill the F99 PDF form from the uploaded JSON and upload the result.

    Requires a ``json_file`` upload on the current Flask request. The
    miscellaneous text is split by ``split_f99_text_pages`` into a main page
    and continuation pages; the F99 form and one continuation form per extra
    page are filled and concatenated. An optional ``attachment_file`` upload
    is page-numbered and appended. The final ``all_pages.pdf`` is uploaded
    to S3.

    Args:
        stamp_print: unless equal to ``'stamp'``, the ``FILING_TIMESTAMP``
            and ``IMGNO`` fields are blanked before filling the form.

    Returns:
        ``(flask response, status code)`` for POST requests: 201 with the
        PDF URL, or 400 when ``json_file`` is missing. Other request
        methods fall through and return ``None``.
    """
    # check if json_file is in the request
    if 'json_file' in request.files:
        total_no_of_pages = 1
        page_no = 1
        json_file = request.files.get('json_file')

        # generate md5 for json file
        json_file_md5 = utils.md5_for_file(json_file)
        json_file.stream.seek(0)
        md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
            json_file_md5)
        os.makedirs(md5_directory, exist_ok=True)
        infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('F99')

        # save json file as md5 file name, then load it back
        request_path = current_app.config['REQUEST_FILE_LOCATION'].format(
            json_file_md5)
        json_file.save(request_path)
        outfile = md5_directory + json_file_md5 + '_temp.pdf'
        with open(request_path) as request_fh:
            json_data = json.load(request_fh)

        # setting timestamp and imgno to empty as these needs to show up
        # after submission
        if stamp_print != 'stamp':
            json_data['FILING_TIMESTAMP'] = ''
            json_data['IMGNO'] = ''

        f99_pages_text_json = json.loads(split_f99_text_pages(json_data))
        json_data['MISCELLANEOUS_TEXT'] = f99_pages_text_json['main_page']
        additional_pages = f99_pages_text_json['additional_pages']
        total_no_of_pages += len(additional_pages)

        has_attachment = 'attachment_file' in request.files

        if has_attachment:
            # reading Attachment title file
            attachment_title_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
            attachment_file = request.files.get('attachment_file')
            attachment_file.save(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            os.makedirs(md5_directory + 'attachment', exist_ok=True)
            os.makedirs(md5_directory + 'final_attachment', exist_ok=True)
            pypdftk.split(md5_directory + 'attachment_temp.pdf',
                          md5_directory + 'attachment')
            os.remove(md5_directory + 'attachment/doc_data.txt')

            attachment_no_of_pages = pypdftk.get_num_pages(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            # Attachment pages are numbered after the F99 pages.
            attachment_page_no = total_no_of_pages
            total_no_of_pages += attachment_no_of_pages

            # os.listdir order is arbitrary; shorter names first, then
            # alphabetical, gives page_01 ... page_99, page_100 order so
            # each page receives its own number.
            for filename in sorted(os.listdir(md5_directory + 'attachment'),
                                   key=lambda name: (len(name), name)):
                attachment_page_no += 1
                title_page = (md5_directory + 'attachment/attachment_page_'
                              + str(attachment_page_no) + '.pdf')
                pypdftk.fill_form(
                    attachment_title_file, {
                        "PAGESTR":
                        "PAGE " + str(attachment_page_no) + " / "
                        + str(total_no_of_pages)
                    }, title_page)
                pypdftk.stamp(
                    md5_directory + 'attachment/' + filename,
                    title_page,
                    md5_directory + 'final_attachment/attachment_'
                    + str(attachment_page_no) + '.pdf')

            pypdftk.concat(
                directory_files(md5_directory + 'final_attachment/'),
                md5_directory + 'attachment.pdf')
            os.remove(md5_directory + 'attachment_temp.pdf')
            shutil.rmtree(md5_directory + 'attachment')
            shutil.rmtree(md5_directory + 'final_attachment')

        json_data['PAGESTR'] = "PAGE " + str(page_no) + " / " + str(
            total_no_of_pages)
        pypdftk.fill_form(infile, json_data, outfile, flatten=False)

        if additional_pages:
            continuation_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('F99_CONT')
            os.makedirs(md5_directory + 'merge', exist_ok=True)
            merged_tmp = md5_directory + json_file_md5 + '_all_pages_temp.pdf'

            for additional_page_counter, additional_page in enumerate(
                    additional_pages):
                page_no += 1
                continuation_outfile = md5_directory + 'merge/' + str(
                    additional_page_counter) + '.pdf'
                # Each entry is keyed by its own position, as a string.
                pypdftk.fill_form(
                    continuation_file, {
                        "PAGESTR":
                        "PAGE " + str(page_no) + " / "
                        + str(total_no_of_pages),
                        "CONTINOUS_TEXT":
                        additional_page[str(additional_page_counter)]
                    }, continuation_outfile)
                # Append via a scratch file and copy back over ``outfile``.
                pypdftk.concat([outfile, continuation_outfile], merged_tmp)
                shutil.copy(merged_tmp, outfile)

            os.remove(merged_tmp)

        # Add the F99 attachment
        if has_attachment:
            pypdftk.concat([outfile, md5_directory + 'attachment.pdf'],
                           md5_directory + 'all_pages.pdf')
            os.remove(md5_directory + 'attachment.pdf')
        else:
            shutil.copy(outfile, md5_directory + 'all_pages.pdf')
        os.remove(md5_directory + json_file_md5 + '_temp.pdf')

        # push output file to AWS
        s3 = boto3.client('s3')
        s3.upload_file(
            md5_directory + 'all_pages.pdf',
            current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
            md5_directory + 'all_pages.pdf',
            ExtraArgs={
                'ContentType': "application/pdf",
                'ACL': "public-read"
            })
        response = {
            'pdf_url':
            current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5)
            + 'all_pages.pdf'
        }

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(data=response)
            status_code = status.HTTP_201_CREATED
            return flask.jsonify(**envelope), status_code
    else:
        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code
import subprocess import pypdftk import shlex from os import listdir from os.path import isfile, join from os import chdir from os import getcwd from natsort import natsorted, ns f = raw_input("Enter the file name : ") test = f cwd = getcwd() subprocess.call('mkdir temp', shell=True) temp_dir = cwd + '/temp' page_paths=[] page_paths = pypdftk.split(f, 'temp') #print(page_paths) s = raw_input("Enter the page nos. to be printed in grayscale separated by spaces : ") pageslist = map(int, s.split(' ')) #print(pageslist, type(pageslist)) onlyfiles = [f for f in listdir('temp/') if isfile(join('temp/', f))] #print(onlyfiles) mergelist = [] for file in onlyfiles: if file.startswith('page_') : pg = file.split('_')[1] #print(pg, type(pg), int(pg[:2]) < 10, pg[:2]) if int(pg[:2]) < 10 : c = 'mv -f '+file+' page_'+(pg[1])+'.pdf' #print(c) subprocess.call(c, shell=True, cwd=temp_dir)