Пример #1
0
def pdf_metadata_save(pdf_file, metadata, substitute_all_metadata = False, make_backup = True):
    if type(make_backup) is str:
        bak_file = make_backup
    else:
        bak_file = os.path.splitext(pdf_file)[0] + ".bak"
    os.rename(pdf_file, bak_file)

    with open(bak_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()

        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()

        info = pdf_in.documentInfo
        if not substitute_all_metadata:
            for key in info:
                #infoDict.update({NameObject(key): createStringObject(info[key])})
                infoDict.update({key: info[key]})

        for key in metadata:
            infoDict.update({NameObject('/' + key): createStringObject(str(metadata[key]))})

        with open(pdf_file, 'wb') as fout:
            writer.write(fout)

        if make_backup == False:
            os.unlink(bak_file)
Пример #2
0
def extract_information(pdf_path):
    testread = ""
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        information = pdf.getDocumentInfo()
        testread = pdf.getPage(92).extractText().strip()
        print(pdf.getPage(92).extractText().strip())
        number_of_pages = pdf.getNumPages()

    # txt = f"""
    # Information about {pdf_path}:

    # Author: {information.author}
    # Creator: {information.creator}
    # Producer: {information.producer}
    # Subject: {information.subject}
    # Title: {information.title}
    # Number of pages: {number_of_pages}
    # """
    print(testread)

    # define variables
    s = testread.strip()
    file = "file.mp3"

    # initialize tts, create mp3 and play
    tts = gTTS(s, 'en')
    tts.save(file)
    #os.system("mpg123 " + file)

    return information
Пример #3
0
def pdf_metadata_load(pdf_file):
    with open(pdf_file, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()

        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        infoDict = writer._info.getObject()

    return pdf_in.documentInfo
Пример #4
0
def merge_pdfs(paths, output):
    pdf_writer = PdfFileWriter()

    for path in paths:
        pdf_reader = PdfFileReader(path)
        for page in range(pdf_reader.getNumPages()):
            # Add each page to the writer object
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Write out the merged PDF
    with open(output, 'wb') as out:
        pdf_writer.write(out)
Пример #5
0
def find_table_page(pdf: PdfFileReader) -> Tuple[str, int]:
    """ Finds the page containing the data table, then returns a tuple with:
    - the text extracted from the page, pre-processed
    - the page number (0-based)
    """
    num_pages = pdf.getNumPages()

    for i in range(1,
                   num_pages):  # skip the first page, the table is not there
        text = extract_text(pdf, page=i)
        if TABLE_CAPTION_PATTERN.search(text):
            return text, i
    else:
        raise TableExtractionError('could not find the table in the pdf')
Пример #6
0
        def pypdf3():
            """Much slower than PyPDF3 method."""
            # 5b. Get our files ready
            document_reader = PdfFileReader(document)
            output_file = PdfFileWriter()

            # Number of pages in input document
            page_count = document_reader.getNumPages()

            # Watermark objects
            watermark_reader = PdfFileReader(watermark)
            wtrmrk_page = watermark_reader.getPage(0)
            wtrmrk_width = (wtrmrk_page.mediaBox.getWidth() / 2) + 0
            wtrmrk_height = (wtrmrk_page.mediaBox.getHeight() / 2) + 80
            wtrmrk_rotate = -int(Info(watermark_reader).rotate) if Info(watermark_reader).rotate is not None else 0

            # 5c. Go through all the input file pages to add a watermark to them
            for page_number in range(page_count):
                # Merge the watermark with the page
                if not self.underneath:
                    input_page = document_reader.getPage(page_number)
                    if wtrmrk_rotate != 0:
                        input_page.mergeRotatedTranslatedPage(wtrmrk_page, wtrmrk_rotate, wtrmrk_width, wtrmrk_height)
                    else:
                        wtrmrk_width = 0
                        wtrmrk_height = 0
                        input_page.mergeTranslatedPage(wtrmrk_page, wtrmrk_width, wtrmrk_height)
                else:
                    size = Info(document_reader).dimensions
                    input_page = PageObject().createBlankPage(document_reader, size['w'], size['h'])
                    if wtrmrk_rotate != 0:
                        input_page.mergeRotatedTranslatedPage(wtrmrk_page, wtrmrk_rotate, wtrmrk_width, wtrmrk_height)
                    else:
                        wtrmrk_width = 0
                        wtrmrk_height = 0
                        input_page.mergeTranslatedPage(wtrmrk_page, wtrmrk_width, wtrmrk_height)
                    input_page.mergePage(document_reader.getPage(page_number))

                # Add page from input file to output document
                output_file.addPage(input_page)

            # 5d. finally, write "output" to PDF
            with open(output_filename, "wb") as outputStream:
                output_file.write(outputStream)
            return output_filename
Пример #7
0
def main():
    if (len(sys.argv) != 3):
        print("usage: python 2-up.py input_file output_file")
        sys.exit(1)
    print("2-up input " + sys.argv[1])
    input1 = PdfFileReader(open(sys.argv[1], "rb"))
    output = PdfFileWriter()
    for iter in range(0, input1.getNumPages() - 1, 2):
        lhs = input1.getPage(iter)
        rhs = input1.getPage(iter + 1)
        lhs.mergeTranslatedPage(rhs, lhs.mediaBox.getUpperRight_x(), 0, True)
        output.addPage(lhs)
        print(str(iter) + " "),
        sys.stdout.flush()

    print("writing " + sys.argv[2])
    outputStream = file(sys.argv[2], "wb")
    output.write(outputStream)
    print("done.")
Пример #8
0
    def prepare(self):
        # Process PDF input file to raw text file
        with open(self.inputPath, "rb") as fh:
            reader = PdfFileReader(fh)
            for page in tqdm(range(0, reader.numPages)):
                page_text = reader.getPage(page).extractText()
                print("Reading page", page, "of", reader.getNumPages())
                filename = join_paths("./.TXT", hash(self.inputPath))
                with open(filename, "a") as fh:
                    fh.write(page_text)

        # Cleaning the TEXT file for better processing
        with open(filename, "r") as fh:
            lines = fh.readlines()
            lines = [l.replace("\n", "").replace("\r", "") for l in lines]
        with open(filename, "w") as fh:
            fh.writelines(lines)
            print("Cleaning... => ", filename)
        self.transform(filename)
Пример #9
0
    def pypdf3(self):
        reader = PdfFileReader(self.file_name)
        writer = PdfFileWriter()

        # Number of pages in input document
        page_count = reader.getNumPages()

        for page_number in range(page_count):
            wtrmrk = reader.getPage(page_number)

            page = PageObject.createBlankPage(width=self.target_w,
                                              height=self.target_h)
            page.mergeScaledTranslatedPage(wtrmrk, self.scale, self.margin_x,
                                           self.margin_y)
            writer.addPage(page)

        with open(self.output, "wb") as outputStream:
            writer.write(outputStream)
        return self.output
Пример #10
0
def reorder(input_filename: str, output_filename: str) -> None:
    assert os.path.exists(input_filename)
    assert os.path.exists(output_filename) is False

    input_stream = open(input_filename, 'rb')
    output = PdfFileWriter()
    input_pdf = PdfFileReader(input_stream)

    pages = input_pdf.getNumPages()
    order = _make_sequence(pages)

    for page_number in order:
        page = input_pdf.getPage(page_number)
        output.addPage(page)

    output_stream = open(output_filename, "wb")
    output.write(output_stream)
    input_stream.close()
    output_stream.close()
Пример #11
0
def write_pdf(pdf_obj, destination):
    """
    Write PDF object to file
    :param pdf_obj: PDF object to be written to file
    :param destination: Desintation path
    """
    reader = PdfFileReader(pdf_obj)    # Create new PDF object
    writer = PdfFileWriter()

    page_count = reader.getNumPages()

    # add the "watermark" (which is the new pdf) on the existing page
    for page_number in range(page_count):
        page = reader.getPage(page_number)
        writer.addPage(page)

    # finally, write "output" to a real file
    with open(destination, "wb") as outputStream:
        writer.write(outputStream)
Пример #12
0
def pdfHandler(file_dir):
    input1 = PdfFileReader(open(file_dir, 'rb'))

    print('document1.pdf has {} pages.'.format(str(input1.getNumPages())))

    fields = input1.getFields()
    print(type(fields))

    documentInfo = input1.getDocumentInfo()
    print(type(documentInfo))
    if documentInfo is not None:
        for key in documentInfo.keys():
            print('{} : {}'.format(key, documentInfo.get(key)))

    metaData = input1.getXmpMetadata()
    print(type(metaData))
    if metaData is not None:
        # print(metaData)
        for relation in metaData.dc_relation:
            print('relation: {}'.format(relation))
Пример #13
0
def add_encryption(path, encryptPath, fileDicts):
    pdf_writer = PdfFileWriter()
    for fileName in fileDicts:
        input_pdf = os.path.join(path, fileName)
        output_pdf = os.path.join(encryptPath, fileName)
        pdf_reader = PdfFileReader(input_pdf)

        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        pdf_writer.encrypt(user_pwd=fileDicts[fileName],
                           owner_pwd=None,
                           use_128bit=True)

        #输出文件已存在便删除
        if os.path.exists(output_pdf):
            os.remove(output_pdf)

        with open(output_pdf, 'wb') as fh:
            pdf_writer.write(fh)
Пример #14
0
def slicer(document,
           first_page=None,
           last_page=None,
           suffix='sliced',
           tempdir=None):
    """Slice a PDF document to remove pages."""
    # Set output file name
    if tempdir:
        with NamedTemporaryFile(suffix='.pdf', dir=tempdir,
                                delete=False) as temp:
            output = temp.name
    elif suffix:
        output = os.path.join(os.path.dirname(document),
                              add_suffix(document, suffix))
    else:
        with NamedTemporaryFile(suffix='.pdf') as temp:
            output = temp.name

    # Reindex page selections for simple user input
    first_page = first_page - 1 if not None else None

    # Validate page range by comparing selection to number of pages in PDF document
    pages = Info(document).pages
    invalid = 'Number of pages: ' + str(
        pages) + ' ----> Page Range Input: ' + str(first_page) + '-' + str(
            last_page)
    assert first_page <= last_page <= pages, invalid

    pdf = PdfFileReader(document)
    writer = PdfFileWriter()

    pages = list(range(pdf.getNumPages()))[first_page:last_page]
    for page in pages:
        writer.addPage(pdf.getPage(page))

    with open(output, 'wb') as out:
        writer.write(out)
    return output
Пример #15
0
def rename(pdf,doi):
    #inpfn = 'Chem. Rev. 2019, 119, 10241-10287-VIP-acs.chemrev.9b00008.pdf'
 

    fin = open(pdf, 'rb')
    pdf_in = PdfFileReader(fin)

    writer = PdfFileWriter()

    for page in range(pdf_in.getNumPages()):
        writer.addPage(pdf_in.getPage(page))

    infoDict = writer._info.getObject()

    info = pdf_in.documentInfo
    for key in info:
        infoDict.update({NameObject(key): createStringObject(info[key])})
        print(key[0]+':'+ info[key])

    # add the grade
    infoDict.update({NameObject('/doi'): createStringObject(u''+doi)})

    # It does not appear possible to alter in place.
    temppdf=pdf+'.temppdf'
    fout = open(temppdf, 'wb')


    writer.write(fout)


    fin.close()
    fout.close()

 
    import os
    os.unlink(pdf)
    os.rename(temppdf, pdf)
    print('The DOI have been updated to:{0}'.format(doi))
Пример #16
0
def upscale(file_name,
            scale=1.5,
            margin_x=0,
            margin_y=0,
            suffix='scaled',
            tempdir=None):
    """Upscale a PDF to a large size."""
    # Set output file name
    if tempdir:
        output = NamedTemporaryFile(suffix='.pdf', dir=tempdir,
                                    delete=False).name
    elif suffix:
        output = os.path.join(os.path.dirname(file_name),
                              add_suffix(file_name, suffix))
    else:
        output = NamedTemporaryFile(suffix='.pdf').name

    reader = PdfFileReader(file_name)
    writer = PdfFileWriter()
    dims = dimensions(file_name)
    target_w = dims['w'] * scale
    target_h = dims['h'] * scale

    # Number of pages in input document
    page_count = reader.getNumPages()

    for page_number in range(page_count):
        wtrmrk = reader.getPage(page_number)

        page = PageObject.createBlankPage(width=target_w, height=target_h)
        page.mergeScaledTranslatedPage(wtrmrk, scale, margin_x, margin_y)
        writer.addPage(page)

    with open(output, "wb") as outputStream:
        writer.write(outputStream)

    return output
Пример #17
0
from PyPDF3 import PdfFileWriter, PdfFileReader

output = PdfFileWriter()
input1 = PdfFileReader(open("document1.pdf", "rb"))

# print how many pages input1 has:
print "document1.pdf has %d pages." % input1.getNumPages()

# add page 1 from input1 to output document, unchanged
output.addPage(input1.getPage(0))

# add page 2 from input1, but rotated clockwise 90 degrees
output.addPage(input1.getPage(1).rotateClockwise(90))

# add page 3 from input1, rotated the other way:
output.addPage(input1.getPage(2).rotateCounterClockwise(90))
# alt: output.addPage(input1.getPage(2).rotateClockwise(270))

# add page 4 from input1, but first add a watermark from another PDF:
page4 = input1.getPage(3)
watermark = PdfFileReader(open("watermark.pdf", "rb"))
page4.mergePage(watermark.getPage(0))
output.addPage(page4)

# add page 5 from input1, but crop it to half size:
page5 = input1.getPage(4)
page5.mediaBox.upperRight = (page5.mediaBox.getUpperRight_x() / 2,
                             page5.mediaBox.getUpperRight_y() / 2)
output.addPage(page5)

# add some Javascript to launch the print window on opening this PDF.
Пример #18
0
			#tabloid - landscape
			page_actual_height = 11*72
			page_actual_width = 17*72
	else:
		print("error. We only do tabloid and landscape")

	pdb.set_trace()

	pdfFileObj = open(infile_path, 'rb') 

	# creating a pdf Reader object 
	pdfReader = PdfFileReader(pdfFileObj) 

	# creating a pdf writer object for new pdf 
	pdfWriter = PdfFileWriter() 
	num_pages = pdfReader.getNumPages()

	#create a list of pages
	pages = [pdfReader.getPage(k) for k in range(num_pages)]
	

	trimboxes = [k.trimBox for k in pages]
	page_width = trimboxes[0].getWidth()
	page_height = trimboxes[0].getHeight()

#******************
	#set the total height of the pdf
	# page_buffer_tabloid = 52
	# page_ybuffer_letter = 0
	# page_xbuffer_letter = 55.5
	# total_height = num_pages * page_actual_height + page_ybuffer_letter
Пример #19
0
from PyPDF3 import PdfFileWriter, PdfFileReader
import sys

if len(sys.argv) == 1 or sys.argv[1] == '-h':
    print('''args: infile p1 p2 .. pn outfile
             Program outputs outfile with p1, p2, ..., pn removed from infile.'''
          )
    exit()

infile = PdfFileReader(sys.argv[1], 'rb')
outfile = PdfFileWriter()

page_del = list(map(int, sys.argv[2:-1]))
ptr = 0
for i in range(infile.getNumPages()):
    if ptr == len(page_del) or i < page_del[ptr]:
        p = infile.getPage(int(i))
        outfile.addPage(p)
    elif i == page_del[ptr]:
        ptr += 1

with open(sys.argv[-1], 'wb') as f:
    outfile.write(f)
Пример #20
0
def zipper(opts, cord_path, base_path, rec_path, rec_pdf_exists, output_name,
           matching, empty_path):
    # ######### PDF Write Setup ######### #
    # Open the input PDFs
    cord_pdf = PdfFileReader(open(cord_path, 'rb'), False)
    base_pdf = PdfFileReader(open(base_path, 'rb'), False)
    rec_pdf = ''
    if rec_pdf_exists:
        rec_pdf = PdfFileReader(open(rec_path, 'rb'), False)
    empty_pdf = PdfFileReader(open(empty_path, 'rb'), False)

    # Check that the coordination PDF is longer than the base (and therefore rec) pdf too.
    # The Coordination PDF includes pages at the front that do not get sliced in, and instead actually sit
    # in the front. If the Coordination pdf is less than the Base or Rec, these are missing, or there was another error
    if cord_pdf.getNumPages() < base_pdf.getNumPages():
        prompt = 'Coordination PDF is shorter than the Base PDF'
        eprint(prompt)
        logger.critical(prompt)
        exit(-7)

    # Find the difference in length of the PDFs, these are the leader pages of the coordination
    diff_length = cord_pdf.getNumPages() - base_pdf.getNumPages()
    logger.info('Diff Length: %s', str(diff_length))

    output = PdfFileWriter()

    for ii in range(diff_length):
        output.addPage(cord_pdf.getPage(ii))

    if matching:

        logger.info("Converting Coordination PDF to string")
        logging.disable(logging.INFO)
        cord_str_pages = pdf_pages_to_list_of_strings(cord_path)
        logging.disable(logging.NOTSET)

        logger.info("Converting Base PDF to string")
        logging.disable(logging.INFO)
        base_str_pages = pdf_pages_to_list_of_strings(base_path)
        logging.disable(logging.NOTSET)

        rec_str_pages = []
        if rec_pdf_exists:
            logging.disable(logging.INFO)
            logger.info("Converting Recommended PDF to string")
            rec_str_pages = pdf_pages_to_list_of_strings(rec_path)
            logging.disable(logging.NOTSET)

        regex_cord = r'(TCC Curve: )(TCC_[\d]+[a-zA-Z]?)([-_#$\w\d\[\] ]*)'
        regex_base_rec = r'(TCC Name: )(TCC_[\d]+[a-zA-Z]?)([-_#$\w\d\[\] ]*)'

        for ii in range(diff_length, len(cord_str_pages)):
            output.addPage(cord_pdf.getPage(ii))
            tcc_matches = re.finditer(regex_cord, cord_str_pages[ii],
                                      re.MULTILINE)

            for match_num, tcc_match in enumerate(tcc_matches, start=1):
                tcc_name = tcc_match.group(2)
                logger.info("Attempting to find: " + tcc_name)
                base_num = find_matching_page(tcc_name, base_str_pages,
                                              regex_base_rec, 'Base PDF')
                if base_num != -1:
                    logger.info('Found on base page: %s', str(base_num))
                rec_page_flag = check_for_rec(cord_str_pages[ii])
                rec_num = 0
                if rec_pdf_exists and rec_page_flag:
                    rec_num = find_matching_page(tcc_name, rec_str_pages,
                                                 regex_base_rec, 'Rec PDF')
                    if rec_num != -1:
                        logger.info('Found on rec page: %s', str(rec_num))
                    else:
                        output.addPage(empty_pdf.getPage(0))
                if base_num > 0:
                    output.addPage(base_pdf.getPage(base_num))
                    if rec_num > 0:
                        output.addPage(rec_pdf.getPage(rec_num))
                    break
    else:
        for jj in range(base_pdf.getNumPages()):
            output.addPage(cord_pdf.getPage(jj + diff_length))
            output.addPage(base_pdf.getPage(jj))
            if rec_pdf_exists:
                output.addPage(rec_pdf.getPage(jj))

    # Finally, output everything to the PDF
    # The output name is chosen based on what the name of the coordination file is
    if opts['output']:
        output_name = opts['output']
    else:
        output_name = "8.0 - Coordination Results & Recommendations_" + output_name + "2018_NEW.pdf"
        output_name = os.path.join(os.path.dirname(os.path.abspath(cord_path)),
                                   output_name)
    with open(output_name, "wb") as w:
        output.write(w)