Python split примеры, pypdftk.split Python примеры использования

Пример #1

0

Показать файл

Файл: test.py Проект: revolunet/pypdftk

 def test_split(self):
     total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
     paths = pypdftk.split(TEST_PDF_PATH)
     self.assertEqual(len(paths) - 1, total_pages)
     self.assertTrue('doc_data.txt' in paths[0])
     for p in paths:
         self.assertTrue(os.path.exists(p))

Пример #2

0

Показать файл

 def test_split(self):
     total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
     paths = pypdftk.split(TEST_PDF_PATH)
     self.assertEqual(len(paths) - 1, total_pages)
     self.assertTrue('doc_data.txt' in paths[0])
     for p in paths:
         self.assertTrue(os.path.exists(p))

Пример #3

0

Показать файл

 def test_split_output_dir(self):
     output_dir = mkdtemp()
     total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
     paths = pypdftk.split(TEST_PDF_PATH, out_dir=output_dir)
     self.assertEqual(len(paths) - 1, total_pages)
     for p in paths:
         out_path = os.path.join(output_dir, os.path.basename(p))
         self.assertTrue(out_path)

Пример #4

0

Показать файл

Файл: test.py Проект: revolunet/pypdftk

 def test_split_output_dir(self):
     output_dir = mkdtemp()
     total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
     paths = pypdftk.split(TEST_PDF_PATH, out_dir=output_dir)
     self.assertEqual(len(paths) - 1, total_pages)
     for p in paths:
         out_path = os.path.join(output_dir, os.path.basename(p))
         self.assertTrue(out_path)

Пример #5

0

Показать файл

Файл: first_pages.py Проект: sameersingh/bibere

def run(idir, bdir, ofile):
    authors, venues, papers = read_all_info(idir)

    fpdf_names = []
    tmpdirname = tempfile.mkdtemp()
    for p in papers:
        if p['pubTypeSlot'] == 'Conference' or p['pubTypeSlot'] == 'Journal':
            if 'pdfLink' not in p:
                print("pdfLink missing:", p['id'])
            elif p['pdfLink'].startswith("http"):
                print("local link missing:", p['id'])
            else:
                source = bdir + "/" + p['pdfLink']
                i = len(fpdf_names)
                dest = "%s/%d.pdf" % (tmpdirname, i)
                print("getting %s, putting it %s" % (source, dest))
                get_pdf(source, dest)
                tdir = "%s/%d/" % (tmpdirname, i)
                os.mkdir(tdir)
                fpdf_names.append(tdir + "page_01.pdf")
                pypdftk.split(dest, tdir)
    pypdftk.concat(fpdf_names, out_file=ofile)
    shutil.rmtree(tmpdirname)

Пример #6

0

Показать файл

def run(idir, bdir, ofile):
    authors, venues, papers = read_all_info(idir)

    fpdf_names = []
    tmpdirname = tempfile.mkdtemp()
    for p in papers:
        if p['pubTypeSlot'] == 'Conference' or p['pubTypeSlot'] == 'Journal':
            if 'pdfLink' not in p:
                print("pdfLink missing:", p['id'])
            elif p['pdfLink'].startswith("http"):
                print("local link missing:", p['id'])
            else:
                source = bdir + "/" + p['pdfLink']
                i = len(fpdf_names)
                dest = "%s/%d.pdf" % (tmpdirname, i)
                print("getting %s, putting it %s" % (source, dest))
                get_pdf(source, dest)
                tdir = "%s/%d/" % (tmpdirname, i)
                os.mkdir(tdir)
                fpdf_names.append(tdir + "page_01.pdf")
                pypdftk.split(dest, tdir)
    pypdftk.concat(fpdf_names, out_file=ofile)
    shutil.rmtree(tmpdirname)

Пример #7

0

Показать файл

Файл: normalize.py Проект: ottobonn/exam-normalizer

def split(input_filename):
    """Split the input file given by input_filename into individual pages.
    The page files will be written as PDFs into a temporary directory.
    Returns a tuple, where the first element is the output directory name,
    and the second is the list of PDFs of the pages."""
    pdf_directory = tempfile.mkdtemp(dir="./")
    pages = pypdftk.split(input_filename, pdf_directory)
    # Keep only PDFs. PDFTk puts out a document info text file as well, which
    # we delete.
    pdfs = []
    for page in pages:
        if page.rsplit(".", 1)[1] == "pdf":
            pdfs.append(page)
        else:
            os.remove(page)
    # sort pdfs into correct order (by extracting the actual page number)
    pdfs.sort(key=lambda s: int(s[s.rfind('page_') + 5:-4]))
    return (pdf_directory, pdfs)

Пример #8

0

Показать файл

Файл: normalize.py Проект: ottobonn/exam-normalizer

def split(input_filename):
    """Split the input file given by input_filename into individual pages.
    The page files will be written as PDFs into a temporary directory.
    Returns a tuple, where the first element is the output directory name,
    and the second is the list of PDFs of the pages."""
    pdf_directory = tempfile.mkdtemp(dir="./")
    pages = pypdftk.split(input_filename, pdf_directory)
    # Keep only PDFs. PDFTk puts out a document info text file as well, which
    # we delete.
    pdfs = []
    for page in pages:
        if page.rsplit(".", 1)[1] == "pdf":
            pdfs.append(page)
        else:
            os.remove(page)
    # sort pdfs into correct order (by extracting the actual page number)
    pdfs.sort(key=lambda s: int(s[s.rfind('page_') + 5:-4]))
    return (pdf_directory, pdfs)

Пример #9

0

Показать файл

Файл: test_file.py Проект: cfourrou/pypdftk

def test_split():
    input_file = "./Out/some_file.pdf"
    pypdftk.split(input_file, "./Out")

Пример #10

0

Показать файл

def print_f99_pdftk_html(
    stamp_print="",
    paginate=False,
    begin_image_num=None,
    page_count=False,
    file_content=None,
    silent_print=False,
    filing_timestamp=None,
    rep_id=None,
    attachment_file_content=None,
):
    # check if json_file is in the request
    # HTML("templates/forms/test.html").write_pdf("output/pdf/test/test.pdf")
    # HTML(string='''<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><div><b>This is bold text</b></div><div><u>This is underline text</u></div><div><i>This is italics text</i><u><br></u></div><div align='center'><u>Title</u></div><div align='left'><u><br></u></div><ol><li>one</li><li>two</li><li>three</li></ol>''').write_pdf("output/pdf/test/test.pdf")
    # pdfkit.from_file("templates/forms/test.html", "output/pdf/test/test.pdf")
    # pypdftk.stamp(current_app.config['FORM_TEMPLATES_LOCATION'].format('F99'), "output/pdf/test/test.pdf", "output/pdf/test/output.pdf")
    try:
        silent_print = silent_print
        txn_img_num = begin_image_num
        filing_timestamp = filing_timestamp

        if ((page_count and file_content) or
            ((paginate or silent_print) and file_content and begin_image_num)
                or (not paginate and "json_file" in request.files)):

            if page_count and file_content:
                json_file_md5 = md5_for_text(file_content)
                json_data = json.loads(file_content)

            elif (paginate
                  or silent_print) and file_content and begin_image_num:
                # generate md5 for file_content
                json_file_md5 = md5_for_text(file_content)
                json_data = json.loads(file_content)

            elif not paginate and "json_file" in request.files:
                json_file = request.files.get("json_file")
                silent_print = (True if request.form.get("silent_print")
                                and request.form.get("silent_print").lower()
                                in ["true", "1"] else False)
                page_count = (True if request.form.get("page_count")
                              and request.form.get("page_count").lower()
                              in ["true", "1"] else False)

                if silent_print:
                    txn_img_num = request.form.get("begin_image_num", None)

                    if not txn_img_num:
                        if flask.request.method == "POST":
                            envelope = common.get_return_envelope(
                                "false",
                                "begin_image_num is missing from your request")
                            status_code = status.HTTP_400_BAD_REQUEST
                            return flask.jsonify(**envelope), status_code
                    txn_img_num = int(txn_img_num)

                    filing_timestamp = request.form.get(
                        "filing_timestamp", None)

                json_file_md5 = md5_for_file(json_file)
                json_file.stream.seek(0)

                # save json file as md5 file name
                json_file.save(
                    current_app.config["REQUEST_FILE_LOCATION"].format(
                        json_file_md5))

                # load json file
                json_data = json.load(
                    open(current_app.config["REQUEST_FILE_LOCATION"].format(
                        json_file_md5)))

            md5_directory = current_app.config["OUTPUT_DIR_LOCATION"].format(
                json_file_md5)

            # if paginate or page_count is True and directory exist then don't remove it
            is_dir_exist = False
            if os.path.isdir(md5_directory):
                is_dir_exist = True

            os.makedirs(md5_directory, exist_ok=True)
            # os.makedirs(md5_directory + "images", exist_ok=True)
            if not os.path.exists(md5_directory + "images"):
                shutil.copytree("templates/forms/F99/images",
                                md5_directory + "images")
            shutil.copyfile("templates/forms/F99/form-text.css",
                            md5_directory + "form-text.css")
            infile = current_app.config["HTML_FORM_TEMPLATES_LOCATION"].format(
                "template")
            outfile = md5_directory + json_file_md5 + ".html"

            form99_json_data = json_data["data"]

            with open(infile) as inf:
                txt = inf.read()
                soup = bs4.BeautifulSoup(txt, features="html5lib")
                soup.find("label", attrs={
                    "id": "committeeName"
                }).string = form99_json_data["committeeName"]
                soup.find("label", attrs={
                    "id": "street1"
                }).string = form99_json_data["street1"]
                soup.find("label", attrs={
                    "id": "street2"
                }).string = form99_json_data["street2"]
                soup.find("label", attrs={
                    "id": "city"
                }).string = form99_json_data["city"]
                soup.find("label", attrs={
                    "id": "state"
                }).string = form99_json_data["state"]
                soup.find("label", attrs={
                    "id": "zipCode"
                }).string = form99_json_data["zipCode"]
                soup.find("span", attrs={
                    "id": "committeeId"
                }).string = form99_json_data["committeeId"]

                name_list = [
                    "LastName", "FirstName", "MiddleName", "Prefix", "Suffix"
                ]

                treasurerFullName = ""
                for item in name_list:
                    item = "treasurer" + item
                    if form99_json_data.get(item):
                        treasurerFullName += form99_json_data.get(item) + ", "
                soup.find("label", attrs={
                    "id": "treasurerFullName"
                }).string = treasurerFullName[:-2]

                soup.find("label", attrs={
                    "id": "treasurerName"
                }).string = ((form99_json_data.get("treasurerLastName", "") +
                              ", " +
                              form99_json_data.get("treasurerFirstName", "")
                              ).strip().rstrip(",").strip())

                f99_html_data = form99_json_data["text"]
                soup.find("label", attrs={"id": "text"}).string = f99_html_data
                soup.find("label", attrs={
                    "id": form99_json_data["reason"]
                }).string = "X"

                date_array = form99_json_data["dateSigned"].split("/")
                soup.find("span", attrs={
                    "id": "dateSignedMonth"
                }).string = str(date_array[0])
                soup.find("span", attrs={
                    "id": "dateSignedDate"
                }).string = str(date_array[1])
                soup.find("span", attrs={
                    "id": "dateSignedYear"
                }).string = str(date_array[2])

                with open(outfile, "w") as output_file:
                    output_file.write(
                        str(soup).replace("&lt;", "<").replace("&gt;", ">"))

                # F99 PDF page padding options
                options = {
                    "margin-top": "0.40in",
                    "margin-right": "0.20in",
                    "margin-bottom": "0.40in",
                    "margin-left": "0.20in",
                }

                # HTML(outfile).write_pdf(md5_directory + json_file_md5 + '.pdf', stylesheets=[CSS(current_app.config['FORMS_LOCATION'].format('F99.css'))])
                pdfkit.from_file(outfile,
                                 md5_directory + json_file_md5 + ".pdf",
                                 options=options)
                # pdfkit.from_file(outfile, md5_directory + json_file_md5 + '.pdf')

                total_no_of_pages = pypdftk.get_num_pages(md5_directory +
                                                          json_file_md5 +
                                                          ".pdf")

            # checking if attachment_file exist
            if ((paginate or page_count) and attachment_file_content) or (
                    not paginate and "attachment_file" in request.files):
                # reading Attachment title file
                attachment_title_file = current_app.config[
                    "FORM_TEMPLATES_LOCATION"].format("Attachment_Title")

                if (paginate or page_count) and attachment_file_content:
                    attachment_file = json.loads(attachment_file_content)
                else:
                    attachment_file = request.files.get("attachment_file")

                attachment_file.save(
                    os.path.join(md5_directory + "attachment_temp.pdf"))
                os.makedirs(md5_directory + "attachment", exist_ok=True)
                os.makedirs(md5_directory + "final_attachment", exist_ok=True)
                pypdftk.split(md5_directory + "attachment_temp.pdf",
                              md5_directory + "attachment")
                os.remove(md5_directory + "attachment/doc_data.txt")
                attachment_no_of_pages = pypdftk.get_num_pages(
                    os.path.join(md5_directory + "attachment_temp.pdf"))
                attachment_page_no = total_no_of_pages
                total_no_of_pages += attachment_no_of_pages

                # we are doing this to assign page numbers to attachment file
                for filename in os.listdir(md5_directory + "attachment"):
                    attachment_page_no += 1
                    page_dict = {}
                    page_dict["PAGESTR"] = ("PAGE " + str(attachment_page_no) +
                                            " / " + str(total_no_of_pages))

                    if silent_print:
                        page_dict["IMGNO"] = txn_img_num + attachment_page_no

                    pypdftk.fill_form(
                        attachment_title_file,
                        md5_directory + "attachment/attachment_page_" +
                        str(attachment_page_no) + ".pdf",
                    )
                    pypdftk.stamp(
                        md5_directory + "attachment/" + filename,
                        md5_directory + "attachment/attachment_page_" +
                        str(attachment_page_no) + ".pdf",
                        md5_directory + "final_attachment/attachment_" +
                        str(attachment_page_no) + ".pdf",
                    )
                pypdftk.concat(
                    directory_files(md5_directory + "final_attachment/"),
                    md5_directory + "attachment.pdf",
                )
                os.remove(md5_directory + "attachment_temp.pdf")

            os.makedirs(md5_directory + "pages", exist_ok=True)
            os.makedirs(md5_directory + "final_pages", exist_ok=True)
            pypdftk.split(md5_directory + json_file_md5 + ".pdf",
                          md5_directory + "pages")
            os.remove(md5_directory + "pages/doc_data.txt")

            f99_page_no = 1
            for filename in os.listdir(md5_directory + "pages"):
                page_dict = {}
                page_dict["PAGESTR"] = ("PAGE " + str(f99_page_no) + " / " +
                                        str(total_no_of_pages))

                if silent_print:
                    page_dict["IMGNO"] = txn_img_num
                    txn_img_num += 1
                    # need to print timestamp on first page only
                    if filing_timestamp and f99_page_no == 1:
                        page_dict["FILING_TIMESTAMP"] = filing_timestamp

                page_number_file = current_app.config[
                    "FORM_TEMPLATES_LOCATION"].format("Page_Number")
                pypdftk.fill_form(
                    page_number_file,
                    page_dict,
                    md5_directory + "pages/page_number_" +
                    str(f99_page_no).zfill(6) + ".pdf",
                )
                pypdftk.stamp(
                    md5_directory + "pages/page_number_" +
                    str(f99_page_no).zfill(6) + ".pdf",
                    md5_directory + "pages/" + filename,
                    md5_directory + "final_pages/page_" +
                    str(f99_page_no).zfill(6) + ".pdf",
                )
                f99_page_no += 1

            pypdftk.concat(
                directory_files(md5_directory + "final_pages/"),
                json_file_md5 + "_temp.pdf",
            )

            if ((paginate or page_count) and attachment_file_content) or (
                    not paginate and "attachment_file" in request.files):
                pypdftk.concat(
                    [
                        json_file_md5 + "_temp.pdf",
                        md5_directory + "attachment.pdf"
                    ],
                    md5_directory + "all_pages.pdf",
                )
                shutil.rmtree(md5_directory + "attachment")
                shutil.rmtree(md5_directory + "final_attachment")
                os.remove(md5_directory + "attachment.pdf")
            else:
                shutil.move(json_file_md5 + "_temp.pdf",
                            md5_directory + "all_pages.pdf")

            # clean up task
            shutil.rmtree(md5_directory + "pages")
            shutil.rmtree(md5_directory + "final_pages")
            os.remove(md5_directory + json_file_md5 + ".pdf")

            # if flask.request.method == "POST":

            response = {
                # 'file_name': ent_app.conf'{}.pdf'.format(json_file_md5),
                "total_pages": total_no_of_pages,
            }

            if not page_count and not paginate:
                s3 = boto3.client("s3")
                extraArgs = {
                    "ContentType": "application/pdf",
                    "ACL": "public-read"
                }

                if silent_print:
                    response["pdf_url"] = current_app.config[
                        'S3_FILE_URL'] + rep_id + '.pdf'
                    s3.upload_file(
                        md5_directory + 'all_pages.pdf',
                        current_app.
                        config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
                        current_app.config['AWS_FECFILE_OUTPUT_DIRECTORY'] +
                        '/' + str(rep_id) + '.pdf',
                        ExtraArgs=extraArgs)
                else:
                    response["pdf_url"] = (
                        current_app.config["PRINT_OUTPUT_FILE_URL"].format(
                            json_file_md5) + "all_pages.pdf", )

                    s3.upload_file(
                        md5_directory + "all_pages.pdf",
                        current_app.
                        config["AWS_FECFILE_COMPONENTS_BUCKET_NAME"],
                        md5_directory + "all_pages.pdf",
                        ExtraArgs=extraArgs,
                    )
            else:
                if not is_dir_exist:
                    shutil.rmtree(md5_directory)
                if paginate:
                    txn_img_json = {
                        "summary": {
                            "committeeId":
                            form99_json_data.get("committeeId", None),
                            "begin_image_num":
                            begin_image_num,
                            "end_image_num":
                            txn_img_num
                        }
                    }
                    response["txn_img_json"] = txn_img_json

            envelope = common.get_return_envelope(data=response)
            status_code = (status.HTTP_200_OK if page_count or paginate else
                           status.HTTP_201_CREATED)
            return flask.jsonify(**envelope), status_code

            # elif page_count or paginate:
            #     if not is_dir_exist:
            #         shutil.rmtree(md5_directory)
            #         response = {
            #             "total_pages": total_no_of_pages,
            #         }
            #     elif paginate:
            #         txn_img_json = {
            #             'summary' : {
            #                 'committeeId': form99_json_data.get('committeeId', None)
            #             }
            #         }
            #         response['txn_img_json'] = txn_img_json
            #     return True, response
            # elif silent_print and not flask.request.method == "POST":
            #     return True, {}
        else:
            if paginate or page_count or silent_print:
                envelope = common.get_return_envelope(False, "")
            else:
                # elif flask.request.method == "POST":
                envelope = common.get_return_envelope(
                    False, "json_file is missing from your request")
            return flask.jsonify(**envelope), status.HTTP_400_BAD_REQUEST
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        return error("Error generating print preview, error message: " +
                     str(e))

Пример #11

0

Показать файл

Файл: test_file.py Проект: pmartynov/pypdftk

def test_split():
    input_file = "./Out/some_file.pdf"
    pypdftk.split(input_file, "./Out")

Пример #12

0

Показать файл

 def page_pdfs(self):
     import pypdftk
     if not self._page_pdfs:
         self._page_pdfs = pypdftk.split(self.fn)
     return self._page_pdfs

Пример #13

0

Показать файл

def print_f99_pdftk_html(stamp_print):
    # check if json_file is in the request
    # HTML("templates/forms/test.html").write_pdf("output/pdf/test/test.pdf")
    # HTML(string='''<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><div><b>This is bold text</b></div><div><u>This is underline text</u></div><div><i>This is italics text</i><u><br></u></div><div align='center'><u>Title</u></div><div align='left'><u><br></u></div><ol><li>one</li><li>two</li><li>three</li></ol>''').write_pdf("output/pdf/test/test.pdf")
    # pdfkit.from_file("templates/forms/test.html", "output/pdf/test/test.pdf")
    # pypdftk.stamp(current_app.config['FORM_TEMPLATES_LOCATION'].format('F99'), "output/pdf/test/test.pdf", "output/pdf/test/output.pdf")

    if 'json_file' in request.files:
        total_no_of_pages = 1
        page_no = 1
        json_file = request.files.get('json_file')
        # generate md5 for json file
        json_file_md5 = utils.md5_for_file(json_file)
        json_file.stream.seek(0)
        md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
            json_file_md5)
        os.makedirs(md5_directory, exist_ok=True)
        # os.makedirs(md5_directory + "images", exist_ok=True)
        if not os.path.exists(md5_directory + "images"):
            shutil.copytree("templates/forms/F99/images",
                            md5_directory + "images")
        shutil.copyfile("templates/forms/F99/form-text.css",
                        md5_directory + "form-text.css")
        infile = current_app.config['HTML_FORM_TEMPLATES_LOCATION'].format(
            'template')
        json_file.save(
            current_app.config['REQUEST_FILE_LOCATION'].format(json_file_md5))
        outfile = md5_directory + json_file_md5 + '.html'
        json_data = json.load(
            open(current_app.config['REQUEST_FILE_LOCATION'].format(
                json_file_md5)))
        form99_json_data = json_data['data']
        # load the file
        with open(infile) as inf:
            txt = inf.read()
            soup = bs4.BeautifulSoup(txt)
            soup.find('label', attrs={
                'id': 'committeeName'
            }).string = form99_json_data['committeeName']
            soup.find('label', attrs={
                'id': 'street1'
            }).string = form99_json_data['street1']
            soup.find('label', attrs={
                'id': 'street2'
            }).string = form99_json_data['street2']
            soup.find('label', attrs={
                'id': 'city'
            }).string = form99_json_data['city']
            soup.find('label', attrs={
                'id': 'state'
            }).string = form99_json_data['state']
            soup.find('label', attrs={
                'id': 'zipCode'
            }).string = form99_json_data['zipCode']
            soup.find('span', attrs={
                'id': 'committeeId'
            }).string = form99_json_data['committeeId']
            soup.find('label', attrs={'id': 'treasurerFullName'}).string = form99_json_data['treasurerLastName'] + \
                                                                           ', ' + form99_json_data['treasurerFirstName'] \
                                                                           + ', ' + form99_json_data['treasurerMiddleName'] \
                                                                           + ', ' + form99_json_data['treasurerPrefix'] \
                                                                           + ', ' + form99_json_data['treasurerSuffix']
            soup.find('label', attrs={'id': 'treasurerName'}).string = form99_json_data['treasurerLastName'] + \
                                                                       ', ' + form99_json_data['treasurerFirstName']
            f99_html_data = form99_json_data['text']
            soup.find('label', attrs={'id': 'text'}).string = f99_html_data
            soup.find('label', attrs={
                'id': form99_json_data['reason']
            }).string = 'X'

            date_array = form99_json_data['dateSigned'].split("/")
            soup.find('span', attrs={
                'id': 'dateSignedMonth'
            }).string = str(date_array[0])
            soup.find('span', attrs={
                'id': 'dateSignedDate'
            }).string = str(date_array[1])
            soup.find('span', attrs={
                'id': 'dateSignedYear'
            }).string = str(date_array[2])

            with open(outfile, "w") as output_file:
                output_file.write(
                    str(soup).replace("&lt;", "<").replace("&gt;", ">"))

            # F99 PDF page padding options
            options = {
                'margin-top': '0.36in',
                'margin-right': '0.25in',
                'margin-bottom': '0.39in',
                'margin-left': '0.25in'
            }
            # HTML(outfile).write_pdf(md5_directory + json_file_md5 + '.pdf', stylesheets=[CSS(current_app.config['FORMS_LOCATION'].format('F99.css'))])
            pdfkit.from_file(outfile,
                             md5_directory + json_file_md5 + '.pdf',
                             options=options)

            total_no_of_pages = pypdftk.get_num_pages(md5_directory +
                                                      json_file_md5 + '.pdf')
            page_number_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('Page_Number')

        # checking if attachment_file exist
        if 'attachment_file' in request.files:
            # reading Attachment title file
            attachment_title_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
            attachment_file = request.files.get('attachment_file')
            attachment_file.save(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            os.makedirs(md5_directory + 'attachment', exist_ok=True)
            os.makedirs(md5_directory + 'final_attachment', exist_ok=True)
            pypdftk.split(md5_directory + 'attachment_temp.pdf',
                          md5_directory + 'attachment')
            os.remove(md5_directory + 'attachment/doc_data.txt')
            attachment_no_of_pages = pypdftk.get_num_pages(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            attachment_page_no = total_no_of_pages
            total_no_of_pages += attachment_no_of_pages

            # we are doing this to assign page numbers to attachment file
            for filename in os.listdir(md5_directory + 'attachment'):
                attachment_page_no += 1
                pypdftk.fill_form(
                    attachment_title_file, {
                        "PAGESTR":
                        "PAGE " + str(attachment_page_no) + " / " +
                        str(total_no_of_pages)
                    }, md5_directory + 'attachment/attachment_page_' +
                    str(attachment_page_no) + '.pdf')
                pypdftk.stamp(
                    md5_directory + 'attachment/' + filename,
                    md5_directory + 'attachment/attachment_page_' +
                    str(attachment_page_no) + '.pdf',
                    md5_directory + 'final_attachment/attachment_' +
                    str(attachment_page_no) + '.pdf')
            pypdftk.concat(
                directory_files(md5_directory + 'final_attachment/'),
                md5_directory + 'attachment.pdf')
            os.remove(md5_directory + 'attachment_temp.pdf')
            # shutil.rmtree(md5_directory + 'attachment')
            # shutil.rmtree(md5_directory + 'final_attachment')
            # pypdftk.concat([md5_directory + json_file_md5 + '.pdf', md5_directory + 'attachment.pdf'], md5_directory + 'all_pages_temp.pdf')
        # else:
        #     shutil.move(md5_directory + json_file_md5 + '.pdf', md5_directory + 'all_pages_temp.pdf')
        os.makedirs(md5_directory + 'pages', exist_ok=True)
        os.makedirs(md5_directory + 'final_pages', exist_ok=True)
        pypdftk.split(md5_directory + json_file_md5 + '.pdf',
                      md5_directory + 'pages')
        os.remove(md5_directory + 'pages/doc_data.txt')
        f99_page_no = 1
        for filename in os.listdir(md5_directory + 'pages'):
            pypdftk.fill_form(
                page_number_file, {
                    "PAGESTR":
                    "PAGE " + str(f99_page_no) + " / " + str(total_no_of_pages)
                }, md5_directory + 'pages/page_number_' + str(f99_page_no) +
                '.pdf')
            pypdftk.stamp(
                md5_directory + 'pages/page_number_' + str(f99_page_no) +
                '.pdf', md5_directory + 'pages/' + filename, md5_directory +
                'final_pages/page_' + str(f99_page_no) + '.pdf')
            f99_page_no += 1

        pypdftk.concat(directory_files(md5_directory + 'final_pages/'),
                       json_file_md5 + '_temp.pdf')

        if 'attachment_file' in request.files:
            pypdftk.concat([
                json_file_md5 + '_temp.pdf', md5_directory + 'attachment.pdf'
            ], md5_directory + 'all_pages.pdf')
            shutil.rmtree(md5_directory + 'attachment')
            shutil.rmtree(md5_directory + 'final_attachment')
            os.remove(md5_directory + 'attachment.pdf')
        else:
            shutil.move(json_file_md5 + '_temp.pdf',
                        md5_directory + 'all_pages.pdf')

        # clean up task
        shutil.rmtree(md5_directory + 'pages')
        shutil.rmtree(md5_directory + 'final_pages')
        # os.remove(md5_directory + json_file_md5 + '.html')
        # shutil.rmtree(md5_directory + 'images')
        # os.remove(md5_directory + 'form-text.css')
        os.remove(md5_directory + json_file_md5 + '.pdf')

        # for f99_page_no in range(f99_no_of_pages):
        #     pypdftk.fill_form(page_number_file,
        #                   {"PAGESTR": "PAGE " + str(f99_page_no+1) + " / " + str(total_no_of_pages)},
        #                   md5_directory + 'pages/page_' + str(f99_page_no+1) + '.pdf')
        #     pypdftk.stamp(md5_directory + json_file_md5 + '.pdf', md5_directory +
        #                   'pages/page_' + str(f99_page_no+1) + '.pdf', md5_directory + json_file_md5 + '_temp.pdf')

        # json_data['PAGESTR'] = "PAGE " + str(page_no) + " / " + str(total_no_of_pages)

        # json_data['MISCELLANEOUS_TEXT'] = ''
        # xfdf_path = pypdftk.gen_xfdf(json_data)
        # pypdftk.fill_form(infile, json_data, outfile)

        # HTML(string='''<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><font face='Helvetica' size=10 ''' + f99_full_text).\
        #     write_pdf("output/pdf/test/test.pdf")
        # pypdftk.stamp(outfile, "output/pdf/test/test.pdf", "output/pdf/test/output.pdf")

        # additional_page_counter = 0
        # if len(f99_pages_text_json['additional_pages']) > 0:
        #     continuation_file = current_app.config['FORM_TEMPLATES_LOCATION'].format('F99_CONT')
        #     os.makedirs(md5_directory + 'merge', exist_ok=True)
        #     for additional_page in f99_pages_text_json['additional_pages']:
        #         page_no += 1
        #         continuation_outfile = md5_directory + 'merge/' + str(additional_page_counter)+'.pdf'
        #         pypdftk.fill_form(continuation_file, {"PAGESTR": "PAGE "+str(page_no)+" / " + str(total_no_of_pages),
        #                                               "CONTINOUS_TEXT": additional_page[str(additional_page_counter)]}, continuation_outfile)
        #         pypdftk.concat([outfile, continuation_outfile], md5_directory + json_file_md5 + '_all_pages_temp.pdf')
        #         shutil.copy(md5_directory + json_file_md5 + '_all_pages_temp.pdf', outfile)
        #         additional_page_counter += 1
        #         os.remove(md5_directory + json_file_md5 + '_all_pages_temp.pdf')
        #
        # # Add the F99 attachment
        # if 'attachment_file' in request.files:
        #     pypdftk.concat([outfile, md5_directory + 'attachment.pdf'], md5_directory + 'all_pages.pdf')
        #     os.remove(md5_directory + 'attachment.pdf')
        # else:
        #     shutil.copy(outfile, md5_directory + 'all_pages.pdf')
        # os.remove(md5_directory + json_file_md5 +'_temp.pdf')
        # push output file to AWS
        s3 = boto3.client('s3')
        s3.upload_file(
            md5_directory + 'all_pages.pdf',
            current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
            md5_directory + 'all_pages.pdf',
            ExtraArgs={
                'ContentType': "application/pdf",
                'ACL': "public-read"
            })
        response = {
            # 'file_name': '{}.pdf'.format(json_file_md5),
            'pdf_url':
            current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5) +
            'all_pages.pdf'
        }

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(data=response)
            status_code = status.HTTP_201_CREATED
            return flask.jsonify(**envelope), status_code

    else:

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code

Пример #14

0

Показать файл

def print_f99_pdftk(stamp_print):
    # check if json_file is in the request

    if 'json_file' in request.files:
        total_no_of_pages = 1
        page_no = 1
        json_file = request.files.get('json_file')
        # generate md5 for json file
        json_file_md5 = utils.md5_for_file(json_file)
        json_file.stream.seek(0)
        md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
            json_file_md5)
        os.makedirs(md5_directory, exist_ok=True)
        infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('F99')
        # save json file as md5 file name
        json_file.save(
            current_app.config['REQUEST_FILE_LOCATION'].format(json_file_md5))
        outfile = md5_directory + json_file_md5 + '_temp.pdf'
        json_data = json.load(
            open(current_app.config['REQUEST_FILE_LOCATION'].format(
                json_file_md5)))
        # setting timestamp and imgno to empty as these needs to show up after submission
        if stamp_print != 'stamp':
            json_data['FILING_TIMESTAMP'] = ''
            json_data['IMGNO'] = ''

        f99_pages_text_json = json.loads(split_f99_text_pages(json_data))
        json_data['MISCELLANEOUS_TEXT'] = f99_pages_text_json['main_page']
        total_no_of_pages += len(f99_pages_text_json['additional_pages'])
        # checking if attachment_file exist
        if 'attachment_file' in request.files:
            # reading Attachment title file
            attachment_title_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
            attachment_file = request.files.get('attachment_file')
            attachment_file.save(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            os.makedirs(md5_directory + 'attachment', exist_ok=True)
            os.makedirs(md5_directory + 'final_attachment', exist_ok=True)
            pypdftk.split(md5_directory + 'attachment_temp.pdf',
                          md5_directory + 'attachment')
            os.remove(md5_directory + 'attachment/doc_data.txt')
            attachment_no_of_pages = pypdftk.get_num_pages(
                os.path.join(md5_directory + 'attachment_temp.pdf'))
            attachment_page_no = total_no_of_pages
            total_no_of_pages += attachment_no_of_pages

            # we are doing this to assign page numbers to attachment file
            for filename in os.listdir(md5_directory + 'attachment'):
                attachment_page_no += 1
                pypdftk.fill_form(
                    attachment_title_file, {
                        "PAGESTR":
                        "PAGE " + str(attachment_page_no) + " / " +
                        str(total_no_of_pages)
                    }, md5_directory + 'attachment/attachment_page_' +
                    str(attachment_page_no) + '.pdf')
                pypdftk.stamp(
                    md5_directory + 'attachment/' + filename,
                    md5_directory + 'attachment/attachment_page_' +
                    str(attachment_page_no) + '.pdf',
                    md5_directory + 'final_attachment/attachment_' +
                    str(attachment_page_no) + '.pdf')
            pypdftk.concat(
                directory_files(md5_directory + 'final_attachment/'),
                md5_directory + 'attachment.pdf')
            os.remove(md5_directory + 'attachment_temp.pdf')
            shutil.rmtree(md5_directory + 'attachment')
            shutil.rmtree(md5_directory + 'final_attachment')

        json_data['PAGESTR'] = "PAGE " + str(page_no) + " / " + str(
            total_no_of_pages)

        pypdftk.fill_form(infile, json_data, outfile, flatten=False)
        additional_page_counter = 0
        if len(f99_pages_text_json['additional_pages']) > 0:
            continuation_file = current_app.config[
                'FORM_TEMPLATES_LOCATION'].format('F99_CONT')
            os.makedirs(md5_directory + 'merge', exist_ok=True)
            for additional_page in f99_pages_text_json['additional_pages']:
                page_no += 1
                continuation_outfile = md5_directory + 'merge/' + str(
                    additional_page_counter) + '.pdf'
                pypdftk.fill_form(
                    continuation_file, {
                        "PAGESTR":
                        "PAGE " + str(page_no) + " / " +
                        str(total_no_of_pages),
                        "CONTINOUS_TEXT":
                        additional_page[str(additional_page_counter)]
                    }, continuation_outfile)
                pypdftk.concat([outfile, continuation_outfile], md5_directory +
                               json_file_md5 + '_all_pages_temp.pdf')
                shutil.copy(
                    md5_directory + json_file_md5 + '_all_pages_temp.pdf',
                    outfile)
                additional_page_counter += 1
                os.remove(md5_directory + json_file_md5 +
                          '_all_pages_temp.pdf')

        # Add the F99 attachment
        if 'attachment_file' in request.files:
            pypdftk.concat([outfile, md5_directory + 'attachment.pdf'],
                           md5_directory + 'all_pages.pdf')
            os.remove(md5_directory + 'attachment.pdf')
        else:
            shutil.copy(outfile, md5_directory + 'all_pages.pdf')
        os.remove(md5_directory + json_file_md5 + '_temp.pdf')
        # push output file to AWS
        s3 = boto3.client('s3')
        s3.upload_file(
            md5_directory + 'all_pages.pdf',
            current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
            md5_directory + 'all_pages.pdf',
            ExtraArgs={
                'ContentType': "application/pdf",
                'ACL': "public-read"
            })
        response = {
            # 'file_name': '{}.pdf'.format(json_file_md5),
            'pdf_url':
            current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5) +
            'all_pages.pdf'
        }

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(data=response)
            status_code = status.HTTP_201_CREATED
            return flask.jsonify(**envelope), status_code

    else:

        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code

Пример #15

0

Показать файл

Файл: main.py Проект: shoaibkhan94/PdfGrayscaler

import subprocess
import pypdftk
import shlex
from os import listdir
from os.path import isfile, join
from  os import  chdir
from os import getcwd
from natsort import natsorted, ns

f = raw_input("Enter the file name : ")
test = f
cwd = getcwd()
subprocess.call('mkdir temp', shell=True)
temp_dir = cwd + '/temp'
page_paths=[]
page_paths = pypdftk.split(f, 'temp')
#print(page_paths)
s = raw_input("Enter the page nos. to be printed in grayscale separated by spaces : ")
pageslist = map(int, s.split(' '))
#print(pageslist, type(pageslist))
onlyfiles = [f for f in listdir('temp/') if isfile(join('temp/', f))]
#print(onlyfiles)
mergelist = []
for file in onlyfiles:
    if file.startswith('page_') :
        pg = file.split('_')[1]
        #print(pg, type(pg), int(pg[:2]) < 10, pg[:2])
        if int(pg[:2]) < 10 :
            c = 'mv -f '+file+' page_'+(pg[1])+'.pdf'
            #print(c)
            subprocess.call(c, shell=True, cwd=temp_dir)

Python split примеры использования