def main(input_filename, output_filename, correct_length):
    """Split a PDF into documents and merge them into "good" and "padded" files.

    The input is split into single-page PDFs, every page is converted to an
    image, and ``split_documents`` groups the (pdf page, image) pairs into
    documents.  Pages of documents whose ``isPadded`` flag is false are
    concatenated into ``<output_filename>_good.pdf``; pages of the remaining
    documents go to ``<output_filename>_padded.pdf``.  Either file is only
    written when it would contain at least one page.  The per-page temp files
    and their two directories are removed afterwards.

    Returns:
        0 on completion.
    """
    pdf_directory, pages = split(input_filename)
    image_directory, images = convert_to_images(pages)

    # zip() is a one-shot iterator on Python 3.  The pairs are needed twice
    # (document splitting and cleanup), so materialise them; otherwise the
    # cleanup loop sees nothing and os.rmdir() fails on non-empty directories.
    pages_with_images = list(zip(pages, images))
    # Same reasoning: docs is traversed twice below.
    docs = list(split_documents(pages_with_images, correct_length))

    # split into docs with and without padding
    good_docs = [doc for doc in docs if not doc.isPadded]
    padded_docs = [doc for doc in docs if doc.isPadded]

    # flatten and pull out just the pdf filenames
    good_pdfs = [pdf for doc in good_docs for pdf in doc.pdf_pages]
    padded_pdfs = [pdf for doc in padded_docs for pdf in doc.pdf_pages]

    if good_pdfs:
        pypdftk.concat(good_pdfs, output_filename + '_good.pdf')
    if padded_pdfs:
        pypdftk.concat(padded_pdfs, output_filename + '_padded.pdf')

    # cleanup temp files
    for pdf_name, image_name in pages_with_images:
        os.remove(pdf_name)
        os.remove(image_name)
    os.rmdir(pdf_directory)
    os.rmdir(image_directory)

    show_summary(good_docs, padded_docs)
    print("Merged results written to {0}_good.pdf and {0}_padded.pdf".format(
        output_filename))
    return 0
def run(idir, bdir, ofile):
    """Collect one page per locally available paper PDF and merge them into ``ofile``.

    Papers come from ``read_all_info(idir)``.  Only entries whose
    ``pubTypeSlot`` is ``'Conference'`` or ``'Journal'`` are considered.
    Entries without a ``pdfLink`` or with an ``http`` link are reported and
    skipped.  For the others the PDF is fetched from ``bdir`` into a temporary
    directory, split into pages with ``pypdftk.split``, and its
    ``page_01.pdf`` is queued for the final concatenation.

    The temporary directory is removed even when fetching, splitting or
    concatenating raises.
    """
    _, _, papers = read_all_info(idir)
    tmpdirname = tempfile.mkdtemp()
    try:
        fpdf_names = []
        for p in papers:
            if p['pubTypeSlot'] not in ('Conference', 'Journal'):
                continue
            if 'pdfLink' not in p:
                print("pdfLink missing:", p['id'])
            elif p['pdfLink'].startswith("http"):
                print("local link missing:", p['id'])
            else:
                source = bdir + "/" + p['pdfLink']
                # Number the working files by position in the merge list.
                i = len(fpdf_names)
                dest = "%s/%d.pdf" % (tmpdirname, i)
                print("getting %s, putting it %s" % (source, dest))
                get_pdf(source, dest)
                tdir = "%s/%d/" % (tmpdirname, i)
                os.mkdir(tdir)
                fpdf_names.append(tdir + "page_01.pdf")
                pypdftk.split(dest, tdir)
        pypdftk.concat(fpdf_names, out_file=ofile)
    finally:
        # Previously skipped on any exception, leaking the temp directory.
        shutil.rmtree(tmpdirname)
def process(config):
    """Parse the JSON configuration, process each item, and merge the results.

    Every entry of ``config["items"]`` gets a ``"data"`` dict made of the
    global ``config["data"]`` (if any) overlaid with the item's own ``"data"``
    (if any).  Each item is then handed to ``process_item`` and the returned
    PDFs are concatenated into ``config["output"]`` under ``BASE_OUTPUT_DIR``.

    Generation is skipped when ``config["output"]`` already exists and
    ``config["overwrite"]`` is false (it defaults to True).

    On any error while building the items, ``clean_failure`` is called and
    the function returns without merging or reporting success.
    """
    tmp_dir = tempfile.mkdtemp(prefix="pdfFactory-")
    callback_url = config.get("callback", None)

    if not os.path.lexists(config["output"]) or config.get("overwrite", True):
        try:
            merge_list = []
            for item in config["items"]:
                # Item-level data overrides the global data.
                item_data = {}
                if "data" in config:
                    item_data.update(config["data"])
                if "data" in item:
                    item_data.update(item["data"])
                item["data"] = item_data
                merge_list.append(process_item(item, tmp_dir))
        except KeyError as e:
            log.error("Error when parsing JSON file, some important values are missing !")
            log.error("Missing value: %s", e)
            clean_failure(tmp_dir, callback_url)
            # Stop here: falling through would merge a partial list and then
            # report success to the callback.
            return
        except Exception:
            log.error("Cannot proceed, got an unknown error:\n %s\n", traceback.format_exc())
            clean_failure(tmp_dir, callback_url)
            return

        # Merge resulting PDF
        log.debug("Merging pdf: %s...", str(merge_list))
        # So A/foo/../B don't become A/
        output = os.path.join(BASE_OUTPUT_DIR, config["output"])
        check_output_folder(BASE_OUTPUT_DIR, output, create_folder=True)
        pypdftk.concat(merge_list, output)
    else:
        log.info("Document '%s' already exists and do not need re-generation.", config["output"])

    success_callback(True, callback_url)
    clean_tmp(tmp_dir)
def test_concat():
    """Concatenate the three sample pages under ./Out into one output PDF."""
    merged_path = "./Out/sample_output.pdf"
    page_paths = [
        "./Out/page_01.pdf",
        "./Out/page_02.pdf",
        "./Out/page_03.pdf",
    ]
    pypdftk.concat(page_paths, merged_path)
def _sc1_split_date(page_dict, value, month_key, day_key, year_key):
    """Store the parts of an ``MM/DD/YYYY`` string under three keys; no-op when empty."""
    if not value:
        return
    date_array = value.split("/")
    page_dict[month_key] = date_array[0]
    page_dict[day_key] = date_array[1]
    page_dict[year_key] = date_array[2]


def _sc1_set_due_date(page_dict, due_date):
    """Split the loan due date into month/day/year, or keep free text as the year field."""
    if due_date in ["none", "null", " ", "", None]:
        return
    if "-" in due_date:
        # Dash-separated dates are read as year-month-day.
        date_array = due_date.split("-")
        if len(date_array) == 3:
            page_dict["loanDueDateMonth"] = date_array[1]
            page_dict["loanDueDateDay"] = date_array[2]
            page_dict["loanDueDateYear"] = date_array[0]
            return
    elif "/" in due_date:
        # Slash-separated dates are read as month/day/year.
        date_array = due_date.split("/")
        if len(date_array) == 3:
            page_dict["loanDueDateMonth"] = date_array[0]
            page_dict["loanDueDateDay"] = date_array[1]
            page_dict["loanDueDateYear"] = date_array[2]
            return
    # Anything that is not a three-part date goes into the year field as is.
    page_dict["loanDueDateYear"] = due_date


def _sc1_full_name(sc1, keys):
    """Join the non-empty values of ``keys`` in ``sc1`` with single spaces."""
    return " ".join(sc1.get(key) for key in keys if sc1.get(key))


def print_sc1_line(
    f3x_data, md5_directory, sc1, sc1_start_page, total_no_of_pages, image_num=None
):
    """Render one SC1 page and append it to ``SC/all_pages.pdf``.

    The page number used is ``sc1_start_page + 1``.  Field values are taken
    from ``sc1`` (the SC1 transaction) and ``f3x_data`` (committee name and
    id), written to ``<md5_directory>SC/page_<n>.pdf`` through the "SC1" form
    template, and that file is then merged into (or becomes)
    ``<md5_directory>SC/all_pages.pdf``.

    Args:
        f3x_data: mapping providing ``committeeName`` and ``committeeId``.
        md5_directory: output directory prefix; it is concatenated directly
            with ``"SC/..."``.
        sc1: mapping describing the SC1 transaction.
        sc1_start_page: page number preceding this page.
        total_no_of_pages: total page count printed on the page.
        image_num: optional running image number; when truthy it is stamped
            on the page and incremented.

    Returns:
        The (possibly incremented) ``image_num``.

    Raises:
        TypeError / ValueError: when one of the amount fields is missing or
            not numeric (they are passed through ``float``).
    """
    sc1_start_page += 1
    sc1_schedule_page_dict = {}
    if image_num:
        sc1_schedule_page_dict["IMGNO"] = image_num
        image_num += 1
    sc1_schedule_page_dict["PAGENO"] = sc1_start_page
    sc1_schedule_page_dict["TRANSACTION_ID"] = sc1.get("transactionId")
    sc1_schedule_page_dict["TOTALPAGES"] = total_no_of_pages
    sc1_schedule_page_dict["committeeName"] = f3x_data.get("committeeName")
    sc1_schedule_page_dict["committeeId"] = f3x_data.get("committeeId")
    sc1_schedule_page_dict["lenderName"] = sc1.get("lenderOrganizationName")

    # Fields whose template name equals the JSON key.
    for key in [
        "lenderStreet1",
        "lenderStreet2",
        "lenderCity",
        "lenderState",
        "lenderZipCode",
        "loanInterestRate",
        "isLoanRestructured",
        "otherPartiesLiable",
        "pledgedCollateralIndicator",
        "pledgeCollateralDescription",
        "perfectedInterestIndicator",
        "futureIncomeIndicator",
        "SCPageNo",
    ]:
        sc1_schedule_page_dict[key] = sc1.get(key)

    _sc1_split_date(
        sc1_schedule_page_dict,
        sc1.get("loanIncurredDate"),
        "loanIncurredDateMonth",
        "loanIncurredDateDay",
        "loanIncurredDateYear",
    )
    _sc1_set_due_date(sc1_schedule_page_dict, sc1.get("loanDueDate"))
    _sc1_split_date(
        sc1_schedule_page_dict,
        sc1.get("originalLoanDate"),
        "originalLoanDateMonth",
        "originalLoanDateDay",
        "originalLoanDateYear",
    )
    _sc1_split_date(
        sc1_schedule_page_dict,
        sc1.get("depositoryAccountEstablishedDate"),
        "ACCOUNT_EST_DATE_MM",
        "ACCOUNT_EST_DATE_DD",
        "ACCOUNT_EST_DATE_YY",
    )

    sc1_schedule_page_dict["loanAmount"] = "{0:.2f}".format(
        float(sc1.get("loanAmount"))
    )
    sc1_schedule_page_dict["creditAmountThisDraw"] = "{0:.2f}".format(
        float(sc1.get("creditAmountThisDraw"))
    )
    sc1_schedule_page_dict["totalOutstandingBalance"] = "{0:.2f}".format(
        float(sc1.get("totalOutstandingBalance"))
    )
    sc1_schedule_page_dict["BACK_REF_TRAN_ID"] = sc1.get(
        "backReferenceTransactionIdNumber"
    )
    sc1_schedule_page_dict["pledgeCollateralAmount"] = "{0:.2f}".format(
        float(sc1.get("pledgeCollateralAmount"))
    )
    sc1_schedule_page_dict["PLEDGE_DESC"] = sc1.get("futureIncomeDescription")
    sc1_schedule_page_dict["PLEDGE_ESTIMATED_AMOUNT"] = "{0:.2f}".format(
        float(sc1.get("futureIncomeEstimate"))
    )

    sc1_schedule_page_dict["COMMITTEE_TREASURER_NAME"] = _sc1_full_name(
        sc1,
        [
            "treasurerPrefix",
            "treasurerLastName",
            "treasurerFirstName",
            "treasurerMiddleName",
            "treasurerSuffix",
        ],
    )
    sc1_schedule_page_dict["DEPOSITORY_NAME"] = sc1.get("depositoryAccountLocation")
    sc1_schedule_page_dict["DEPOSITORY_STREET1"] = sc1.get("depositoryAccountStreet1")
    sc1_schedule_page_dict["DEPOSITORY_STREET2"] = sc1.get("depositoryAccountStreet2")
    sc1_schedule_page_dict["DEPOSITORY_CITY"] = sc1.get("depositoryAccountCity")
    sc1_schedule_page_dict["DEPOSITORY_STATE"] = sc1.get("depositoryAccountState")
    sc1_schedule_page_dict["DEPOSITORY_ZIP"] = sc1.get("depositoryAccountZipCode")
    sc1_schedule_page_dict["BASIS"] = sc1.get("basisOfLoanDescription")
    _sc1_split_date(
        sc1_schedule_page_dict,
        sc1.get("treasurerSignedDate"),
        "TREASUER_SIGN_DATE_MM",
        "TREASUER_SIGN_DATE_DD",
        "TREASUER_SIGN_DATE_YY",
    )

    sc1_schedule_page_dict["AUTH_REP_NAME"] = _sc1_full_name(
        sc1,
        [
            "authorizedPrefix",
            "authorizedLastName",
            "authorizedFirstName",
            "authorizedMiddleName",
            "authorizedSuffix",
        ],
    )
    sc1_schedule_page_dict["AUTH_REP_TITLE"] = sc1.get("authorizedTitle")
    _sc1_split_date(
        sc1_schedule_page_dict,
        sc1.get("authorizedSignedDate"),
        "AUTH_REP_SIGN_MM",
        "AUTH_REP_SIGN_DD",
        "AUTH_REP_SIGN_YY",
    )

    sc1_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format("SC1")
    sc1_outfile = md5_directory + "SC" + "/page_" + str(sc1_start_page) + ".pdf"
    pypdftk.fill_form(sc1_infile, sc1_schedule_page_dict, sc1_outfile)

    # Memo text changes
    # NOTE(review): nothing above stores "memoDescription" or "transactionId"
    # in the page dict, so this branch looks unreachable as written.  Kept
    # unchanged; confirm whether memo pages are expected for SC1.
    if sc1_schedule_page_dict.get("memoDescription"):
        memo_dict = {}
        temp_memo_outfile = md5_directory + "SC/page_memo_temp.pdf"
        memo_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format("TEXT")
        memo_outfile = md5_directory + "SC/page_memo_" + str(sc1_start_page) + ".pdf"
        memo_dict["scheduleName_1"] = "SC1"
        memo_dict["memoDescription_1"] = sc1_schedule_page_dict["memoDescription"]
        memo_dict["PAGESTR"] = (
            "PAGE " + str(sc1_start_page + 1) + " / " + str(total_no_of_pages)
        )
        if sc1_schedule_page_dict.get("transactionId"):
            memo_dict["transactionId_1"] = sc1_schedule_page_dict["transactionId"]
        if image_num:
            memo_dict["IMGNO"] = image_num
            image_num += 1
        # build page
        pypdftk.fill_form(memo_infile, memo_dict, memo_outfile)
        pypdftk.concat([sc1_outfile, memo_outfile], temp_memo_outfile)
        os.remove(memo_outfile)
        os.rename(temp_memo_outfile, sc1_outfile)

    all_pages_file = md5_directory + "SC/all_pages.pdf"
    if path.isfile(all_pages_file):
        temp_all_pages_file = md5_directory + "SC/temp_all_pages.pdf"
        pypdftk.concat([all_pages_file, sc1_outfile], temp_all_pages_file)
        os.rename(temp_all_pages_file, all_pages_file)
    else:
        # First SC page: the page itself becomes all_pages.pdf.  The arguments
        # used to be reversed, renaming the non-existent all_pages.pdf.
        os.rename(sc1_outfile, all_pages_file)
    return image_num
def _sc_set_due_date(page_dict, due_date):
    """Split the loan due date into month/day/year, or keep free text as the year field."""
    if not due_date:
        return
    if "-" in due_date:
        # Dash-separated dates are read as year-month-day.
        date_array = due_date.split("-")
        if len(date_array) == 3:
            page_dict["loanDueDateMonth"] = date_array[1]
            page_dict["loanDueDateDay"] = date_array[2]
            page_dict["loanDueDateYear"] = date_array[0]
            return
    elif "/" in due_date:
        # Slash-separated dates are read as month/day/year.
        date_array = due_date.split("/")
        if len(date_array) == 3:
            page_dict["loanDueDateMonth"] = date_array[0]
            page_dict["loanDueDateDay"] = date_array[1]
            page_dict["loanDueDateYear"] = date_array[2]
            return
    page_dict["loanDueDateYear"] = due_date


def _sc_base_page_dict(f3x_data, sc, total_no_of_pages):
    """Build the form fields shared by every page printed for one SC schedule."""
    page_dict = {}
    page_dict["TRANSACTION_ID"] = sc.get("transactionId")
    page_dict["totalPages"] = total_no_of_pages
    page_dict["committeeName"] = f3x_data.get("committeeName")
    page_dict["pageSubtotal"] = "{0:.2f}".format(float(sc.get("loanBalance")))
    for item in [
        "memoCode",
        "memoDescription",
        "lenderStreet1",
        "lenderStreet2",
        "lenderCity",
        "lenderState",
        "lenderZipCode",
        "electionOtherDescription",
        "isLoanSecured",
    ]:
        page_dict[item] = sc.get(item)
    for item in [
        "loanAmountOriginal",
        "loanPaymentToDate",
        "loanBalance",
        "loanInterestRate",
    ]:
        page_dict[item] = "{0:.2f}".format(float(sc.get(item)))

    election_code = sc.get("electionCode")
    if election_code:
        # First character is the election type, the next four the year.
        page_dict["electionType"] = election_code[0:1]
        page_dict["electionYear"] = election_code[1:5]

    if sc.get("lenderOrganizationName"):
        page_dict["lenderName"] = sc.get("lenderOrganizationName")
    else:
        # Individual lender: assemble the name from its non-empty parts.
        page_dict["lenderName"] = " ".join(
            sc.get(item)
            for item in [
                "lenderLastName",
                "lenderFirstName",
                "lenderMiddleName",
                "lenderPrefix",
                "lenderSuffix",
            ]
            if sc.get(item)
        )

    if sc.get("loanIncurredDate"):
        date_array = sc.get("loanIncurredDate").split("/")
        page_dict["loanIncurredDateMonth"] = date_array[0]
        page_dict["loanIncurredDateDay"] = date_array[1]
        page_dict["loanIncurredDateYear"] = date_array[2]
    _sc_set_due_date(page_dict, sc.get("loanDueDate"))
    return page_dict


def _sc_add_guarantor_fields(page_dict, slot, guarantor):
    """Copy one SC2 child into the numbered (``_<slot>``) fields of a page."""
    page_dict["name_{}".format(slot)] = ",".join(
        guarantor.get(key)
        for key in ["prefix", "lastName", "firstName", "middleName", "suffix"]
        if guarantor.get(key)
    )
    for key in [
        "street1",
        "street2",
        "city",
        "state",
        "zipCode",
        "employer",
        "occupation",
    ]:
        page_dict["{}_{}".format(key, slot)] = guarantor.get(key)
    page_dict["guaranteedAmount_{}".format(slot)] = "{0:.2f}".format(
        float(guarantor.get("guaranteedAmount"))
    )


def _sc_append_to_all_pages(md5_directory, page_file):
    """Append ``page_file`` to SC/all_pages.pdf, or turn it into that file if none exists."""
    all_pages_file = md5_directory + "SC/all_pages.pdf"
    if path.isfile(all_pages_file):
        temp_all_pages_file = md5_directory + "SC/temp_all_pages.pdf"
        pypdftk.concat([all_pages_file, page_file], temp_all_pages_file)
        os.rename(temp_all_pages_file, all_pages_file)
    else:
        os.rename(page_file, all_pages_file)


def _sc_emit_page(
    md5_directory,
    sc_infile,
    page_dict,
    sc_start_page,
    page_num,
    total_no_of_pages,
    image_num,
):
    """Fill one SC page (plus its memo page, if any) and append it to all_pages.pdf.

    Returns the updated ``(page_num, image_num)`` pair.
    """
    page_dict["pageNo"] = sc_start_page + page_num
    page_num += 1
    if image_num:
        page_dict["IMGNO"] = image_num
        image_num += 1

    sc_outfile = md5_directory + "SC" + "/page_" + str(sc_start_page) + ".pdf"
    pypdftk.fill_form(sc_infile, page_dict, sc_outfile)

    # Memo text changes
    if page_dict.get("memoDescription"):
        memo_dict = {}
        temp_memo_outfile = md5_directory + "SC/page_memo_temp.pdf"
        memo_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format("TEXT")
        memo_outfile = md5_directory + "SC/page_memo_" + str(sc_start_page) + ".pdf"
        memo_dict["scheduleName_1"] = "SC13"
        memo_dict["PAGESTR"] = (
            "PAGE " + str(sc_start_page + page_num) + " / " + str(total_no_of_pages)
        )
        memo_dict["memoDescription_1"] = page_dict["memoDescription"]
        if page_dict.get("TRANSACTION_ID"):
            memo_dict["transactionId_1"] = page_dict["TRANSACTION_ID"]
        # The memo occupies a page of its own.
        page_num += 1
        if image_num:
            memo_dict["IMGNO"] = image_num
            image_num += 1
        # build memo page and glue it behind the schedule page
        pypdftk.fill_form(memo_infile, memo_dict, memo_outfile)
        pypdftk.concat([sc_outfile, memo_outfile], temp_memo_outfile)
        os.remove(memo_outfile)
        os.rename(temp_memo_outfile, sc_outfile)

    _sc_append_to_all_pages(md5_directory, sc_outfile)
    return page_num, image_num


def print_sc_line(
    f3x_data,
    md5_directory,
    sc_schedules,
    sc_start_page,
    total_no_of_pages,
    image_num=None,
):
    """Render every SC schedule to PDF pages and collect them in ``SC/all_pages.pdf``.

    One page is printed per schedule, or one page per group of up to four
    SC2 children when the schedule has any.  Children whose
    ``transactionTypeIdentifier`` is ``"SC1"`` are not printed here; they are
    collected and returned.  The running total of ``loanBalance`` is written
    as ``scheduleTotal`` on the last page of the schedule whose
    ``transactionId`` equals that of the last entry in ``sc_schedules``.

    Args:
        f3x_data: mapping providing ``committeeName``.
        md5_directory: output directory prefix; concatenated directly with
            ``"SC/..."``.
        sc_schedules: sequence of SC schedule mappings.
        sc_start_page: page number of the first page printed.
        total_no_of_pages: total page count printed on the pages.
        image_num: optional running image number; when truthy it is stamped
            on each page and incremented.

    Returns:
        ``(sc1_list, last_page_number, totalOutstandingLoans, image_num)``.
        If anything raises, the traceback is printed and ``None`` is returned
        (unchanged from the previous behaviour).
    """
    try:
        sc_schedule_total = 0
        os.makedirs(md5_directory + "SC/", exist_ok=True)
        sc_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format("SC")
        sc1_list = []
        totalOutstandingLoans = "0.00"
        page_num = 0
        last_transaction_id = (
            sc_schedules[-1].get("transactionId") if sc_schedules else None
        )

        for sc in sc_schedules:
            sc_schedule_page_dict = _sc_base_page_dict(
                f3x_data, sc, total_no_of_pages
            )
            sc_schedule_total += float(sc_schedule_page_dict["pageSubtotal"])
            is_last_schedule = last_transaction_id == sc_schedule_page_dict.get(
                "TRANSACTION_ID"
            )

            children = sc.get("child") or []
            sc2 = [
                child
                for child in children
                if child.get("transactionTypeIdentifier") == "SC2"
            ]
            sc1_list.extend(
                child
                for child in children
                if child.get("transactionTypeIdentifier") == "SC1"
            )

            if sc2:
                # Four SC2 entries fit on one page.
                sc2_pages = [sc2[start:start + 4] for start in range(0, len(sc2), 4)]
                for page_index, page_entries in enumerate(sc2_pages):
                    # Work on a copy so numbered fields from a fuller page
                    # cannot leak into the next one.  This replaces the old
                    # alias-and-delete cleanup, which deleted "name{n}"
                    # instead of "name_{n}" and always raised KeyError.
                    single_page_dict = dict(sc_schedule_page_dict)
                    for slot, guarantor in enumerate(page_entries, start=1):
                        _sc_add_guarantor_fields(single_page_dict, slot, guarantor)
                    if is_last_schedule and page_index == len(sc2_pages) - 1:
                        totalOutstandingLoans = single_page_dict[
                            "scheduleTotal"
                        ] = "{0:.2f}".format(sc_schedule_total)
                    page_num, image_num = _sc_emit_page(
                        md5_directory,
                        sc_infile,
                        single_page_dict,
                        sc_start_page,
                        page_num,
                        total_no_of_pages,
                        image_num,
                    )
            else:
                if is_last_schedule:
                    totalOutstandingLoans = sc_schedule_page_dict[
                        "scheduleTotal"
                    ] = "{0:.2f}".format(sc_schedule_total)
                page_num, image_num = _sc_emit_page(
                    md5_directory,
                    sc_infile,
                    sc_schedule_page_dict,
                    sc_start_page,
                    page_num,
                    total_no_of_pages,
                    image_num,
                )

        return sc1_list, sc_start_page + page_num - 1, totalOutstandingLoans, image_num
    except Exception:
        # printing stack trace
        traceback.print_exception(*sys.exc_info())
def test_concat(self):
    """Concatenating the sample PDF with itself three times triples the page count."""
    single_count = pypdftk.get_num_pages(TEST_PDF_PATH)
    sources = [TEST_PDF_PATH] * 3
    merged_path = pypdftk.concat(sources)
    merged_count = pypdftk.get_num_pages(merged_path)
    self.assertEqual(single_count * 3, merged_count)
def print_sh4_line(
    f3x_data,
    md5_directory,
    line_number,
    sh4_list,
    page_cnt,
    current_page_num,
    total_no_of_pages,
    image_num=None,
):
    """Render the SH4 pages for one line number and merge them into ``SH4/all_pages.pdf``.

    Each page holds up to three entries of ``sh4_list``.  Every page is filled
    through the "SH4" form template into
    ``<md5_directory>SH4/<line_number>/page_<n>.pdf``, memo pages are handled
    by ``build_memo_page``, and the files of that directory are then
    concatenated and appended to (or renamed into) the schedule-wide
    ``SH4/all_pages.pdf``.  The grand totals are written on the last page only.

    Returns:
        The (possibly incremented) ``image_num``; nothing is rendered when
        ``sh4_list`` is empty.
    """
    if not sh4_list:
        return image_num

    leftover = len(sh4_list) % 3
    # Number of entries on the final page: a full page when evenly divisible.
    last_page_cnt = leftover if leftover != 0 else 3
    fed_running_total = 0
    nonfed_running_total = 0

    schedule_dir = md5_directory + "SH4/"
    line_dir = schedule_dir + line_number
    os.makedirs(line_dir, exist_ok=True)
    sh4_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format("SH4")

    for page_num in range(page_cnt):
        current_page_num += 1
        memo_array = []
        schedule_page_dict = {
            "lineNumber": line_number,
            "pageNo": current_page_num,
            "totalPages": total_no_of_pages,
        }
        if image_num:
            schedule_page_dict["IMGNO"] = image_num
            image_num += 1
        is_final_page = page_num + 1 == page_cnt

        # This call prepares data to render on PDF
        build_sh4_per_page_schedule_dict(
            is_final_page,
            last_page_cnt,
            page_num * 3,
            schedule_page_dict,
            sh4_list,
            memo_array,
        )

        fed_subtotal = float(schedule_page_dict["subFedShare"])
        nonfed_subtotal = float(schedule_page_dict["subNonFedShare"])
        schedule_page_dict["subTotalFedNonFedShare"] = "{0:.2f}".format(
            fed_subtotal + nonfed_subtotal
        )
        fed_running_total += fed_subtotal
        nonfed_running_total += nonfed_subtotal

        if is_final_page:
            schedule_page_dict["TotalFedShare"] = "{0:.2f}".format(fed_running_total)
            schedule_page_dict["totalNonFedShare"] = "{0:.2f}".format(
                nonfed_running_total
            )
            schedule_page_dict["TotalFedNonFedShare"] = "{0:.2f}".format(
                fed_running_total + nonfed_running_total
            )

        schedule_page_dict["committeeName"] = f3x_data["committeeName"]
        sh4_outfile = line_dir + "/page_" + str(page_num) + ".pdf"
        pypdftk.fill_form(sh4_infile, schedule_page_dict, sh4_outfile)

        # Memo text changes and build memo pages and return updated current_page_num
        current_page_num, image_num = build_memo_page(
            memo_array,
            md5_directory,
            line_number,
            current_page_num,
            page_num,
            total_no_of_pages,
            sh4_outfile,
            name="SH4",
            image_num=image_num,
        )

    # Merge this line's pages, then fold them into the schedule-wide file.
    line_all_pages = line_dir + "/all_pages.pdf"
    schedule_all_pages = schedule_dir + "all_pages.pdf"
    pypdftk.concat(directory_files(line_dir + "/"), line_all_pages)
    if not path.isfile(schedule_all_pages):
        os.rename(line_all_pages, schedule_all_pages)
    else:
        merged_tmp = schedule_dir + "temp_all_pages.pdf"
        pypdftk.concat([schedule_all_pages, line_all_pages], merged_tmp)
        os.rename(merged_tmp, schedule_all_pages)
    return image_num
def test_concat(self):
    """Three concatenated copies of the sample PDF must have three times its pages."""
    pages_per_copy = pypdftk.get_num_pages(TEST_PDF_PATH)
    merged_pdf = pypdftk.concat([TEST_PDF_PATH for _ in range(3)])
    self.assertEqual(pages_per_copy * 3, pypdftk.get_num_pages(merged_pdf))
def print_sb_line(
    f3x_data,
    md5_directory,
    line_number,
    sb_list,
    page_cnt,
    current_page_num,
    total_no_of_pages,
    image_num=None,
):
    """Render the SB pages for one line number and merge them into ``SB/all_pages.pdf``.

    Each page holds up to three entries of ``sb_list``.  Pages are filled
    through the "SB" form template into
    ``<md5_directory>SB/<line_number>/page_<n>.pdf``; memo pages are handled
    by ``build_memo_page``.  The running ``pageSubtotal`` sum is written as
    ``scheduleTotal`` on the last page.

    Returns:
        ``(current_page_num, image_num)``.  If the outer ``try`` catches an
        exception, the traceback is printed and the function returns ``None``.

    NOTE(review): this block arrived with its indentation flattened, so the
    nesting below is reconstructed.  Because the inner ``except`` follows the
    merge steps, they are shown inside the per-page ``try``; confirm against
    the original file whether the merge is meant to run once per page or once
    per line, and at which level the ``return`` sits.
    """
    try:
        if sb_list:
            # Number of entries on the final page: a full page when evenly divisible.
            last_page_cnt = 3 if len(sb_list) % 3 == 0 else len(sb_list) % 3
            schedule_total = 0
            os.makedirs(md5_directory + "SB/" + line_number, exist_ok=True)
            sb_infile = current_app.config["FORM_TEMPLATES_LOCATION"].format(
                "SB")
            for page_num in range(page_cnt):
                current_page_num += 1
                memo_array = []
                last_page = False
                schedule_page_dict = {}
                schedule_page_dict["lineNumber"] = line_number
                schedule_page_dict["pageNo"] = current_page_num
                schedule_page_dict["totalPages"] = total_no_of_pages
                if image_num:
                    schedule_page_dict["IMGNO"] = image_num
                    image_num += 1
                # Index of the first sb_list entry shown on this page.
                page_start_index = page_num * 3
                if page_num + 1 == page_cnt:
                    last_page = True
                # This call prepares data to render on PDF
                build_sb_per_page_schedule_dict(
                    last_page,
                    last_page_cnt,
                    page_start_index,
                    schedule_page_dict,
                    sb_list,
                    memo_array,
                )
                try:
                    schedule_total += float(schedule_page_dict["pageSubtotal"])
                    # The schedule total is printed on the last page only.
                    if page_cnt == page_num + 1:
                        schedule_page_dict["scheduleTotal"] = "{0:.2f}".format(
                            schedule_total)
                    schedule_page_dict["committeeName"] = f3x_data["committeeName"]
                    sb_outfile = (md5_directory + "SB/" + line_number + "/page_" +
                                  str(page_num) + ".pdf")
                    pypdftk.fill_form(sb_infile, schedule_page_dict, sb_outfile)
                    # Memo text changes and build memo pages and return updated current_page_num
                    current_page_num, image_num = build_memo_page(
                        memo_array,
                        md5_directory,
                        line_number,
                        current_page_num,
                        page_num,
                        total_no_of_pages,
                        sb_outfile,
                        name="SB",
                        image_num=image_num,
                    )
                    # Merge the files of this line's directory into one PDF.
                    pypdftk.concat(
                        directory_files(md5_directory + "SB/" + line_number + "/"),
                        md5_directory + "SB/" + line_number + "/all_pages.pdf",
                    )
                    # Append to the schedule-wide file, or create it from this line's pages.
                    if path.isfile(md5_directory + "SB/all_pages.pdf"):
                        pypdftk.concat(
                            [
                                md5_directory + "SB/all_pages.pdf",
                                md5_directory + "SB/" + line_number + "/all_pages.pdf",
                            ],
                            md5_directory + "SB/temp_all_pages.pdf",
                        )
                        os.rename(
                            md5_directory + "SB/temp_all_pages.pdf",
                            md5_directory + "SB/all_pages.pdf",
                        )
                    else:
                        os.rename(
                            md5_directory + "SB/" + line_number + "/all_pages.pdf",
                            md5_directory + "SB/all_pages.pdf",
                        )
                except:
                    # Best effort: log the failure, print the traceback, keep going.
                    logging.error('**** Start - Error inside if condition ****')
                    # printing stack trace
                    traceback.print_exception(*sys.exc_info())
                    logging.error('**** End - Error inside if condition ****')
        return current_page_num, image_num
    except:
        # printing stack trace
        traceback.print_exception(*sys.exc_info())
def make_pdf(self):
    """Fill one form PDF per page of products and merge them into ``print.pdf``.

    For every entry of ``self.pages`` the form template's field names are
    paired, in order, with ``self.details`` followed by the reference, lot,
    quantity and description of each product on that page (all upper-cased);
    unused fields are filled with empty strings.  When ``self.checklist`` is
    set, an end page carrying the client's name and today's date
    (Australia/Brisbane) is added; the pair goes into fields 3-4 when
    ``self.new`` is set, otherwise into fields 1-2.  The filled pages are
    concatenated into ``../dynamic/print.pdf`` relative to ``THIS_FOLDER``.
    """
    #Import dependencies
    from PyPDF2 import PdfFileReader
    from datetime import datetime
    import os
    import pypdftk
    import pytz

    def read_field_names(template_path):
        #Read the template's field names, closing the file afterwards
        with open(template_path, "rb") as template_file:
            reader = PdfFileReader(template_file)
            fields = reader.getFields(tree=None, retval=None, fileobj=None)
            return list(fields.keys())

    pdf_pages = []
    try:
        template_name = os.path.join(THIS_FOLDER, "./pdf_templates/form.pdf")
        #The template is the same for every page, so read it once, lazily
        field_names = None

        #Cycle through pages
        for page in self.pages:
            if field_names is None:
                field_names = read_field_names(template_name)

            #Make a copy of field_values
            field_values = self.details[:]

            #Add values from each page
            for product in page:
                field_values += [
                    product.reference, product.lot, product.quantity,
                    product.description
                ]

            #Pad out unused fields, zip into dict for writing
            field_values += [""] * (len(field_names) - len(field_values))
            field_dict = dict(
                zip(field_names, (value.upper() for value in field_values)))

            #Fill a temp copy of the form from the input data
            pdf_pages.append(pypdftk.fill_form(template_name, field_dict))

        if self.checklist:
            end_form_template_name = os.path.join(
                THIS_FOLDER, "./pdf_templates/end_page.pdf")
            end_field_names = read_field_names(end_form_template_name)

            #Populate end field values with name and date, position depending on options
            end_field_values = [""] * 4
            index = 2 if self.new else 0
            tz = pytz.timezone("Australia/Brisbane")
            current_date = datetime.now(tz)
            #Replace exactly two slots; [index:index + 1] grew the list to five
            end_field_values[index:index + 2] = [
                self.client.first_name + " " + self.client.last_name,
                current_date.strftime("%d/%m/%Y")
            ]

            #Zip end field values and names into dict
            end_field_dict = dict(zip(end_field_names, end_field_values))
            pdf_pages.append(
                pypdftk.fill_form(end_form_template_name, end_field_dict))

        pypdftk.concat(pdf_pages,
                       os.path.join(THIS_FOLDER, "../dynamic/print.pdf"))
    finally:
        #fill_form without an output path writes to a temp file; remove them
        for temp_pdf in pdf_pages:
            try:
                os.remove(temp_pdf)
            except OSError:
                pass
def print_pdftk(stamp_print):
    """Render the uploaded F3L JSON file to one PDF and upload it to S3.

    The JSON file is read from ``request.files['json_file']``, saved under
    its MD5, and its ``data`` section is used to fill the F3L template.
    Schedule pages are produced by ``process_schedules`` and merged with the
    summary page (and the optional memo-text page) into ``all_pages.pdf``
    inside the MD5 output directory, which is then uploaded to the
    configured S3 bucket.

    Args:
        stamp_print: when not equal to ``'stamp'``, blank ``FILING_TIMESTAMP``
            and ``IMGNO`` values are prepared (see the review note below).

    Returns:
        For POST requests a ``(response, status_code)`` tuple: 201 with the
        ``pdf_url`` of the generated file, or 400 when ``json_file`` is not
        part of the request.  For any other request method ``None``.
    """
    # check if json_file is in the request
    if 'json_file' not in request.files:
        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request'
            )
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code
        return None

    total_no_of_pages = 0
    page_no = 1
    has_sa_schedules = has_sb_schedules = False
    json_file = request.files.get('json_file')

    # generate md5 for json file
    # FIXME: check if PDF already exist with md5, if exist return pdf instead of re-generating PDF file.
    json_file_md5 = utils.md5_for_file(json_file)
    # rewind the upload so that save() below writes the file from its start
    json_file.stream.seek(0)
    md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(json_file_md5)
    os.makedirs(md5_directory, exist_ok=True)
    infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('F3L')

    # save json file as md5 file name
    request_file = current_app.config['REQUEST_FILE_LOCATION'].format(json_file_md5)
    json_file.save(request_file)
    outfile = md5_directory + json_file_md5 + '_temp.pdf'
    summary_pdf = md5_directory + 'F3L_Summary.pdf'
    all_pages_pdf = md5_directory + 'all_pages.pdf'

    # load json file; the context manager closes the handle again
    with open(request_file) as request_fp:
        f3l_json = json.load(request_fp)

    # setting timestamp and imgno to empty as these needs to show up after submission
    # NOTE(review): `output` is never merged into the form data below, so
    # these blank values currently have no effect - confirm the intent.
    output = {}
    if stamp_print != 'stamp':
        output['FILING_TIMESTAMP'] = ''
        output['IMGNO'] = ''

    # read data from json file
    f3l_data = f3l_json['data']

    # check if summary is present in fecDataFile
    f3l_summary = []
    if 'summary' in f3l_data:
        f3l_summary = f3l_data['summary']

    # split coverage start date and coverage end date to set month, day, and year
    if f3l_data['coverageStartDate']:
        coverage_start_date_array = f3l_data['coverageStartDate'].split("/")
        f3l_data['coverageStartDateMonth'] = coverage_start_date_array[0]
        f3l_data['coverageStartDateDay'] = coverage_start_date_array[1]
        f3l_data['coverageStartDateYear'] = coverage_start_date_array[2]

    if f3l_data['coverageEndDate']:
        coverage_end_date_array = f3l_data['coverageEndDate'].split("/")
        f3l_data['coverageEndDateMonth'] = coverage_end_date_array[0]
        f3l_data['coverageEndDateDay'] = coverage_end_date_array[1]
        f3l_data['coverageEndDateYear'] = coverage_end_date_array[2]

    # checking for signed date, it is only available for submitted reports
    # (unlike the coverage dates it is separated by "-")
    if f3l_data['date_signed']:
        date_signed_array = f3l_data['date_signed'].split("-")
        f3l_data['dateSignedMonth'] = date_signed_array[0]
        f3l_data['dateSignedDay'] = date_signed_array[1]
        f3l_data['dateSignedYear'] = date_signed_array[2]

    # build treasurer name to map it to PDF template
    treasurer_full_name = [
        f3l_data[key]
        for key in (
            'treasurerLastName',
            'treasurerFirstName',
            'treasurerMiddleName',
            'treasurerPrefix',
            'treasurerSuffix',
        )
    ]
    f3l_data['treasurerFullName'] = ",".join(map(str, treasurer_full_name))
    f3l_data['treasurerName'] = f3l_data['treasurerLastName'] + "," + f3l_data['treasurerFirstName']
    f3l_data['efStamp'] = '[Electronically Filed]'

    # checking if json contains summary details, for individual transactions print there wouldn't be summary
    if len(f3l_summary) > 0:
        total_no_of_pages = 1
        f3l_data_summary_array = [f3l_data, f3l_summary]
        if 'memoText' in f3l_data and f3l_data['memoText']:
            # the memo text gets a page of its own
            total_no_of_pages += 1
    else:
        f3l_data_summary_array = [f3l_data]

    # flatten data and summary into one dict; summary keys win on clashes
    f3l_data_summary = {i: j for x in f3l_data_summary_array for i, j in x.items()}

    # process all schedules and build the PDF's
    process_output, total_no_of_pages = process_schedules(f3l_data, md5_directory, total_no_of_pages)
    has_sa_schedules = process_output.get('has_sa_schedules')
    has_sb_schedules = process_output.get('has_sb_schedules')

    if len(f3l_summary) > 0:
        get_summary_detail(f3l_summary, f3l_data, f3l_data_summary)
        f3l_data_summary['PAGESTR'] = "PAGE " + str(page_no) + " / " + str(total_no_of_pages)
        pypdftk.fill_form(infile, f3l_data_summary, outfile)
        shutil.copy(outfile, summary_pdf)
        os.remove(outfile)

        # Memo text changes
        if 'memoText' in f3l_data_summary and f3l_data_summary['memoText']:
            memo_dict = {}
            temp_memo_outfile = md5_directory + 'F3L_Summary_memo.pdf'
            memo_infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('TEXT')
            memo_dict['scheduleName_1'] = 'F3L' + f3l_data_summary['amendmentIndicator']
            memo_dict['memoDescription_1'] = f3l_data_summary['memoText']
            memo_dict['PAGESTR'] = "PAGE " + str(2) + " / " + str(total_no_of_pages)
            pypdftk.fill_form(memo_infile, memo_dict, temp_memo_outfile)
            # append the memo page to the summary via the temp file
            pypdftk.concat([summary_pdf, temp_memo_outfile], outfile)
            shutil.copy(outfile, summary_pdf)
            os.remove(outfile)

        # check if all_pages already exists
        if os.path.exists(all_pages_pdf):
            os.remove(all_pages_pdf)

        # checking for sa transactions
        if has_sa_schedules:
            pypdftk.concat([summary_pdf, md5_directory + 'SA/all_pages.pdf'], all_pages_pdf)
            os.remove(md5_directory + 'SA/all_pages.pdf')
            shutil.rmtree(md5_directory + 'SA')
        else:
            shutil.copy(summary_pdf, all_pages_pdf)

        # checking for sb transactions
        if has_sb_schedules:
            pypdftk.concat([all_pages_pdf, md5_directory + 'SB/all_pages.pdf'],
                           md5_directory + 'temp_all_pages.pdf')
            shutil.move(md5_directory + 'temp_all_pages.pdf', all_pages_pdf)
            os.remove(md5_directory + 'SB/all_pages.pdf')
            shutil.rmtree(md5_directory + 'SB')
    else:
        # no summary, expecting it to be from individual transactions
        # NOTE(review): if both schedules are present the SB file replaces
        # the SA file instead of being appended - confirm this cannot happen.
        if has_sa_schedules:
            if os.path.exists(all_pages_pdf):
                os.remove(all_pages_pdf)
            shutil.move(md5_directory + 'SA/all_pages.pdf', all_pages_pdf)
            shutil.rmtree(md5_directory + 'SA')

        if has_sb_schedules:
            if os.path.exists(all_pages_pdf):
                os.remove(all_pages_pdf)
            shutil.move(md5_directory + 'SB/all_pages.pdf', all_pages_pdf)
            shutil.rmtree(md5_directory + 'SB')

    # push output file to AWS; the local path doubles as the object key
    s3 = boto3.client('s3')
    s3.upload_file(all_pages_pdf,
                   current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
                   all_pages_pdf,
                   ExtraArgs={'ContentType': "application/pdf", 'ACL': "public-read"})

    response = {
        # 'file_name': '{}.pdf'.format(json_file_md5),
        'pdf_url': current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5) + 'all_pages.pdf'
    }

    # return response
    if flask.request.method == "POST":
        envelope = common.get_return_envelope(
            data=response
        )
        status_code = status.HTTP_201_CREATED
        return flask.jsonify(**envelope), status_code
    return None
def process_sa_line(f3l_data, md5_directory, line_number, sa_line, sa_line_page_cnt, sa_line_start_page,
                    sa_line_last_page_cnt, total_no_of_pages):
    """Build the Schedule A pages of one line number and add them to SA/all_pages.pdf.

    For each of the ``sa_line_page_cnt`` pages the SA3L template is filled
    with the values prepared by ``build_sa_per_page_schedule_dict``.  Memo
    entries collected for a page are rendered on TEXT template pages (two
    memos per page, at most four memos) and appended to that page's PDF.
    All PDFs of the line are then concatenated and merged into
    ``<md5_directory>SA/all_pages.pdf``.

    Args:
        f3l_data: form data; only ``committeeName`` is read here.
        md5_directory: output directory, used as a string prefix.
        line_number: line number, used as sub-directory name and form value.
        sa_line: transactions of this line; nothing is done when it is empty.
        sa_line_page_cnt: number of schedule pages to produce.
        sa_line_start_page: page number preceding the first page of the line.
        sa_line_last_page_cnt: passed through to the per-page builder.
        total_no_of_pages: total page count written on every page.

    Returns:
        ``True`` when ``sa_line`` has entries and pages were produced,
        otherwise ``False``.
    """
    if len(sa_line) == 0:
        return False

    # form field names of the two memo slots available on one TEXT page
    memo_slot_keys = (
        ('scheduleName_1', 'memoDescription_1', 'transactionId_1'),
        ('scheduleName_2', 'memoDescription_2', 'transactionId_2'),
    )

    sa_line_start_page += 1
    schedule_total = 0.00
    schedule_aggregate_total = 0.00
    line_directory = md5_directory + 'SA/' + line_number
    os.makedirs(line_directory, exist_ok=True)
    sa_infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('SA3L')

    for sa_page_no in range(sa_line_page_cnt):
        memo_array = []
        last_page = (sa_page_no + 1) == sa_line_page_cnt
        sa_schedule_page_dict = {
            'lineNumber': line_number,
            'pageNo': sa_line_start_page + sa_page_no,
            'totalPages': total_no_of_pages,
        }

        # This call prepares data to render on PDF; it fills the page dict
        # and memo_array in place (each page starts four entries further on)
        build_sa_per_page_schedule_dict(last_page, sa_line_last_page_cnt, sa_page_no * 4,
                                        sa_schedule_page_dict, sa_line, memo_array)

        page_subtotal = float(sa_schedule_page_dict['pageSubtotal'])
        page_aggregate_total = float(sa_schedule_page_dict['pageAggSubtotal'])
        schedule_total += page_subtotal
        schedule_aggregate_total += page_aggregate_total

        if last_page:
            sa_schedule_page_dict['scheduleTotal'] = '{0:.2f}'.format(schedule_total)
            # NOTE(review): this renders the last page's aggregate, while
            # schedule_aggregate_total is accumulated but never used - confirm
            # which value the form expects.
            sa_schedule_page_dict['scheduleAggTotal'] = '{0:.2f}'.format(page_aggregate_total)

        sa_schedule_page_dict['committeeName'] = f3l_data['committeeName']
        sa_outfile = line_directory + '/page_' + str(sa_page_no) + '.pdf'
        pypdftk.fill_form(sa_infile, sa_schedule_page_dict, sa_outfile)

        # Memo text changes
        if len(memo_array) >= 1:
            temp_memo_outfile = line_directory + '/page_memo_temp.pdf'
            memo_infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('TEXT')
            memo_outfile = line_directory + '/page_memo_' + str(sa_page_no) + '.pdf'

            # memos 0-1 go on a first memo page, memos 2-3 on a second one
            for first_memo in (0, 2):
                memo_pair = memo_array[first_memo:first_memo + 2]
                if not memo_pair:
                    break

                memo_dict = {}
                for memo, (name_key, description_key, transaction_key) in zip(memo_pair, memo_slot_keys):
                    memo_dict[name_key] = memo['scheduleName']
                    memo_dict[description_key] = memo['memoDescription']
                    memo_dict[transaction_key] = memo['transactionId']

                # build the memo page and append it to the schedule page PDF
                pypdftk.fill_form(memo_infile, memo_dict, memo_outfile)
                pypdftk.concat([sa_outfile, memo_outfile], temp_memo_outfile)
                os.remove(memo_outfile)
                os.rename(temp_memo_outfile, sa_outfile)
                # the extra page shifts the numbering of the following pages
                sa_line_start_page += 1

    line_all_pages = line_directory + '/all_pages.pdf'
    pypdftk.concat(directory_files(line_directory + '/'), line_all_pages)

    # if all_pages.pdf exists in SA folder, concatenate line number pdf to all_pages.pdf
    sa_all_pages = md5_directory + 'SA/all_pages.pdf'
    if path.isfile(sa_all_pages):
        sa_temp_all_pages = md5_directory + 'SA/temp_all_pages.pdf'
        pypdftk.concat([sa_all_pages, line_all_pages], sa_temp_all_pages)
        os.rename(sa_temp_all_pages, sa_all_pages)
    else:
        os.rename(line_all_pages, sa_all_pages)

    return True