Пример #1
0
 def merge(self, pdf_one, pdf_two, filename='my.pdf', output_dir='D:/pdf/'):
     '''
     Interleave two scanned PDFs back into their original page order.

     pdf_one holds the scanned front sides and pdf_two the scanned back
     sides. Front pages are taken first-to-last while back pages are taken
     last-to-first, and the two are written alternately.

     :param pdf_one: path of the PDF with the front sides
     :param pdf_two: path of the PDF with the back sides
     :param filename: name of the merged output file
     :param output_dir: directory the merged file is written into
     :return: None
     '''
     import os

     with open(pdf_one, 'rb') as input_one, open(pdf_two, 'rb') as input_two:
         pdf_input_one = PdfFileReader(input_one)
         pdf_input_two = PdfFileReader(input_two)
         numOne = pdf_input_one.getNumPages()
         numTwo = pdf_input_two.getNumPages()
         print(numOne, numTwo)
         pdf_output = PdfFileWriter()
         for index_one in range(numOne):
             index_two = numTwo - 1 - index_one
             print(index_one, index_two)
             pdf_output.addPage(pdf_input_one.getPage(index_one))
             # When the back-side scan has fewer pages the index turns
             # negative; skip it instead of letting it wrap around and
             # pick an unrelated page.
             if index_two >= 0:
                 pdf_output.addPage(pdf_input_two.getPage(index_two))
         # Write before the inputs are closed: the writer may still need to
         # read page content from them.
         with open(os.path.join(output_dir, filename), 'wb') as output_stream:
             pdf_output.write(output_stream)
     print('Done!')
Пример #2
0
    def write_with_template(self,out_file="",template_filename=""):
        """
        Merge the buffered PDF content onto the template and write the result.

        The first page parsed from ``self.buffer`` is merged into the first
        page of the template, and that single page is saved to ``out_file``.

        :param out_file: path of the PDF file to create
        :param template_filename: template PDF path; falls back to
            ``self.template_filename`` when empty
        """
        # Load the template.
        template_filename = template_filename or self.template_filename
        reader = PdfFileReader(template_filename)
        page = reader.getPage(0)

        # Rewind so the buffer can be parsed from its beginning.
        self.buffer.seek(0)
        new_pdf = PdfFileReader(self.buffer)

        # Merge the generated content into the template page.
        page.mergePage(new_pdf.getPage(0))

        writer = PdfFileWriter()
        writer.addPage(page)

        with open(out_file, 'wb') as f:
            writer.write(f)
Пример #3
0
def PdfPrettyPrint(inputname, outputname):
    """Re-order the pages of a PDF in groups of four and write the result.

    For every group starting at page ``i`` the output order is
    ``i``, ``i + 2``, ``i + 1`` (rotated 180 degrees) and ``i + 3``
    (rotated 180 degrees). Positions past the end of the document are
    filled with blank pages.

    :param inputname: path of the PDF to read
    :param outputname: path of the PDF to create
    :return: True
    """
    # (offset inside the group, rotate the page by 180 degrees?)
    layout = ((0, False), (2, False), (1, True), (3, True))

    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        wrt = PdfFileWriter()
        pdfnums = ipt.getNumPages()
        for start in range(0, pdfnums, 4):
            for offset, rotate in layout:
                index = start + offset
                if index >= pdfnums:
                    wrt.addBlankPage()
                    continue
                page = ipt.getPage(index)
                if rotate:
                    page.rotateClockwise(180)
                wrt.addPage(page)
        # The input stays open until the output has been written.
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return True
Пример #4
0
    def process_pdf_automatically(self):
        """Join the first page of every PDF in a chosen folder, ordered by date.

        Asks the user for a directory, reads each ``.pdf`` in it (except
        ``join.pdf``), extracts the date that follows the word "corte" on the
        first page and the bank name found on the last page, then writes the
        first pages sorted by that date to ``<dir>/<self.name>.pdf``.

        Raises IndexError when a file has no recognisable date or bank name,
        and ValueError when a date does not match ``%d/%m/%Y``.
        """
        self.statusBar().showMessage('Procesando...')
        if self.name == "":
            self.show_dialog("No fue posible crear el archivo PDF")
            self.statusBar().showMessage('')
            return

        self.dir = QFileDialog.getExistingDirectory()
        files = [x for x in os.listdir(self.dir + '/') if
                 x.endswith('.pdf') and x != "join.pdf"]
        outfile = PdfFileWriter()

        # The inputs must stay open until the output has been written, so
        # they are collected here and closed together at the end.
        handles = []
        try:
            ls = []
            for i in files:
                handle = open(self.dir + '/' + str(i), 'rb')
                handles.append(handle)
                pdf = PdfFileReader(handle)
                page = pdf.getPage(0)
                last = pdf.getPage(pdf.getNumPages() - 1)
                banco = re.findall("(bbva|santander)",
                                   last.extractText().lower())
                text = page.extractText()
                fecha = \
                re.findall("(corte.*[0-9]{1,2}[/][0-9]{1,2}[/][0-9]{2,4})",
                           text.lower())[0]
                fecha = \
                re.findall("([0-9]{1,2}[/][0-9]{1,2}[/][0-9]{2,4})", fecha)[0]
                ls.append({'page': page,
                           'bank': Counter(banco).most_common()[0][0].upper(),
                           'date': fecha})

            # Sort the records themselves. Matching records against a sorted
            # list of dates added a page several times whenever two files
            # shared the same date. The sort is stable, so files with equal
            # dates keep their listing order.
            ls.sort(key=lambda item: datetime.strptime(item['date'],
                                                       '%d/%m/%Y'))
            for item in ls:
                outfile.addPage(item['page'])

            self.statusBar().showMessage('Creando PDF...')

            save_in = self.dir + '/' + self.name + '.pdf'

            with open(save_in, 'wb') as f:
                outfile.write(f)
        finally:
            for handle in handles:
                handle.close()

        self.statusBar().showMessage('Creación del PDF Exitosa')
        self.show_dialog("Acción realizada con éxito")
Пример #5
0
    def run(self):
        """Download the attachment, drop its last page and report the result.

        Returns immediately when ``self.beforeHandler`` returns a truthy
        value. Otherwise the PDF at ``self.attachUrl`` is downloaded to a
        temporary file, copied without its last page to a second temporary
        file, and ``self.success`` is called with that file's path. Any
        exception is passed to ``self.error`` (when set). Both temporary files
        are removed before returning, so the success callback must consume the
        file while it runs.
        """
        if self.beforeHandler(self._id, self.attachUrl):
            return
        filename = self.tempDir + str(random.random())
        filename1 = self.tempDir + str(random.random()) + '.pdf'
        try:
            urllib.request.urlretrieve(self.attachUrl, filename)
            # Context managers guarantee both streams are closed before the
            # cleanup below tries to delete the files, even on failure.
            with open(filename, 'rb') as input_stream:
                pdf_input = PdfFileReader(input_stream)
                pdf_output = PdfFileWriter()
                # Copy every page except the last one.
                for page in range(pdf_input.getNumPages() - 1):
                    pdf_output.addPage(pdf_input.getPage(page))
                with open(filename1, 'wb') as output_stream:
                    pdf_output.write(output_stream)
            if self.success is not None:
                self.success(self._id, filename1)
        except Exception as e:
            if self.error is not None:
                self.error(e, self.attachUrl)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
            if os.path.exists(filename1):
                os.remove(filename1)
 def convert_to_text(self):
     """Return the extracted text of the first page of ``self.input_file``."""
     # The context manager closes the file even when parsing fails.
     with open(self.input_file, 'rb') as f:
         return PdfFileReader(f).getPage(0).extractText()
Пример #7
0
    def _create_pdf_from_rtf_files(self):
        """Convert ``self.files`` to PDF and merge them into one document.

        Each file is converted with ``change_filetype`` and progress is
        emitted after every conversion. For every resulting PDF the page count
        and a chapter title are recorded in ``self.pages`` and
        ``self.chapters``. The merged document is written to
        ``self.master_file_name``, or to ``tmp.pdf`` when ``self.create_toc``
        is set. The converted PDFs are appended to ``self.trash``.
        """
        pdfs = []
        self.progress.emit(0)
        for count, file in enumerate(self.files):
            pdfs.append(change_filetype(file, "pdf", self.engine))
            self.progress.emit(count + 1)

        merger = PdfFileMerger()
        pages = []
        chapters = []
        try:
            for file in pdfs:
                read_pdf = PdfFileReader(file)
                page_content = read_pdf.getPage(0).extractText()
                try:
                    chapter = helper_functions.get_chapter_from_pdf_txt(
                        page_content)
                except Exception:
                    # Fall back to a title derived from the file name.
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    chapter = os.path.basename(file)
                    chapter = chapter.split(".")[0]
                    chapter = chapter.replace("_", " ")
                chapters.append(chapter)

                pages.append(read_pdf.getNumPages())
                merger.append(fileobj=file)
            self.pages = pages
            self.chapters = chapters
            if not self.create_toc:
                merger.write(self.master_file_name)
            else:
                merger.write("tmp.pdf")
        finally:
            # Release the merger's file handles even when a step fails.
            merger.close()
        self.trash += pdfs
Пример #8
0
def extract_pdf_pypdf2(pdf_path, page_index=2):
    """Return the extracted text of one page of a PDF.

    :param pdf_path: path of the PDF to read
    :param page_index: zero-based index of the page; defaults to 2 (the third
        page), which was previously hard-coded
    :return: the text of the requested page

    An encrypted document is decrypted with an empty password first.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            pdf.decrypt('')
        return pdf.getPage(page_index).extractText()
Пример #9
0
def RemovePdfOwnerPassword(inputname, outputname):
    '''
    Decrypt a PDF with an empty password and copy its pages to a new file.

    :param inputname: path of the PDF to read
    :param outputname: path of the PDF to create
    :return: 0 on success, -1 when decryption raises KeyError('/Encrypt'),
        which is reported as "not an encrypted pdf"
    '''
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # ``KeyError.message`` exists only on Python 2; ``args`` works on
            # both major versions.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())

        wrt = PdfFileWriter()
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))
        # The input stays open until the output has been written.
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
Пример #10
0
    def translate(self):
        '''Read the PDF text, translate it line by line and write both out.

        Every non-empty line is written through ``self.write`` followed by its
        translation (or a failure marker when ``translate_func`` returns a
        falsy value). A running line counter is printed as progress, and a
        completion message is logged at the end.
        '''
        index = 0
        # The context manager closes the PDF even if translation fails.
        with open(self.fullPath, 'rb') as f:
            pdf = PdfFileReader(f)
            for i in range(pdf.getNumPages()):
                content = pdf.getPage(i).extractText().split('\n')
                content = self.removeBlankFromList(content)

                # Per the original note: in the joined text, a gap of more
                # than one space between words is treated as a line break.
                # NOTE(review): enter_symbol is not visible here - confirm.
                content_list = self.enter_symbol(content)

                for line in content_list:
                    line = line.strip()
                    if not line:
                        continue
                    ret = translate_func(line)
                    trans = ret if ret else '翻译失败'
                    self.write(line + '\n')
                    self.write(trans)
                    index += 1
                    print(index, end=' ', flush=True)

        Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
Пример #11
0
def merge_pdfs(origin, num_pages, aux, verso=None, above=False, allPage=False):
    """
    General-purpose merging function shared by several plugins so that they
    do not have to reinvent the wheel. It merges origin as the back, aux above.

    :param origin: bytes of the PDF whose pages are processed
    :param num_pages: number of pages of ``origin`` to process
    :param aux: bytes of the PDF whose first page is merged with the pages
    :param verso: passed to ``_merge_verso`` for the pages that do not get
        ``aux`` (odd indexes, unless ``allPage`` is set)
    :param above: forwarded to the merge task and to ``_merge_verso``
    :param allPage: merge ``aux`` with every page instead of every other one
    :return: bytes of the merged PDF
    :raises MergePDFException: on any failure; the traceback is logged
    """
    try:
        output = PdfFileWriter()
        input_result = PdfFileReader(io.BytesIO(origin))
        pages = []

        for i in range(num_pages):
            page_origin = input_result.getPage(i)

            if allPage or i % 2 == 0:
                page_aux = PdfFileReader(io.BytesIO(aux)).getPage(0)
                pages.append(tasks.merge.delay(page_origin, page_aux, above))
            else:
                _merge_verso(verso, page_origin, above, pages)

        for page in pages:
            # Anything that is not already a page is a pending task result
            # whose value has to be fetched first.
            if not isinstance(page, PyPDF2.pdf.PageObject):
                page = page.get()
            output.addPage(page)

        out_io = io.BytesIO()
        output.write(out_io)
        return out_io.getvalue()

    except Exception:
        labresult.app.logger.error(traceback.format_exc())
        raise MergePDFException('Error while merging PDFs')
def getDataUsingPyPdf2(filename):
    """Return the text of every page of a PDF, each followed by a newline.

    :param filename: path of the PDF to read
    :return: the concatenated page texts
    """
    # The context manager closes the file once all pages have been read.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
Пример #13
0
def readPDFfile(infile):
    """Return the text of every page of a PDF, each followed by a newline.

    :param infile: path of the PDF to read
    :return: the concatenated page texts
    """
    # The original line had an unbalanced parenthesis and no ``open`` call,
    # so the snippet could not be parsed at all.
    with open(infile, "rb") as stream:
        pdf = PdfFileReader(stream)
        return "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )
class PdfInput:
    """Read text and rendered page images from one PDF file.

    The file is opened on construction and stays open until ``close`` is
    called; the object can be used as a context manager for that purpose.
    Page numbers passed to the methods are 1-based.
    """

    def __init__(self, filename):
        self.filename = filename
        self.stream = open(filename, "rb")
        self.pdf_reader = PdfFileReader(self.stream)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying PDF file."""
        self.stream.close()

    def get_number_of_pages(self):
        """Return the page count reported by the reader."""
        return self.pdf_reader.getNumPages()

    def get_page_text(self, pageno):
        """Return the extracted text of page ``pageno`` (1-based)."""
        return self.pdf_reader.getPage(pageno - 1).extractText()

    def get_page_image(self, pageno):
        """Render page ``pageno`` through a temporary file and return it loaded."""
        with tempfile.NamedTemporaryFile() as scratch, \
                self._open_page_image(pageno, scratch.name) as rendered:
            # Pull the pixel data in before the temporary file goes away.
            rendered.load()
            return rendered

    def get_page_png_file(self, pageno):
        """Render page ``pageno`` and return the open temporary ``.png`` file."""
        png_file = tempfile.NamedTemporaryFile(suffix=".png")
        with self._open_page_image(pageno, png_file.name) as rendered:
            rendered.save(png_file)
        return png_file

    def _open_page_image(self, pageno, image_file):
        """Run Ghostscript to render one page into ``image_file`` and load it."""
        print("pdf -> png...", file=sys.stderr)

        command = [
            "gs",
            "-sDEVICE=png16m",
            "-dNOPAUSE",
            f"-dFirstPage={pageno}",
            f"-dLastPage={pageno}",
            f"-sOutputFile={image_file}",
            "-r300",
            "-q",
            self.filename,
            "-c",
            "quit",
        ]
        # Ghostscript's own output is discarded; its exit status is not checked.
        subprocess.run(command,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

        return load(image_file)
Пример #15
0
def getPdffileBookmark(filename, bookmark_file_savepath):
    """Dump the outline (bookmarks) of a PDF to a UTF-8 text file.

    Every bookmark becomes one line: one tab per nesting level, the title,
    a tab and the 1-based page number, terminated by CRLF.

    :param filename: path of the PDF to read
    :param bookmark_file_savepath: path of the text file to create
    """
    with open(filename, "rb") as pdf_stream:
        pdf = PdfFileReader(pdf_stream)

        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)

        # Map each page's indirect-object id to its 1-based page number so a
        # bookmark destination can be translated into a page number.
        pageLabels = {}
        for i in range(pagecount):
            pageLabels[pdf.getPage(i).indirectRef.idnum] = i + 1

        outlines = pdf.getOutlines()
        print(outlines)

        def write_entries(entries, level, out):
            # Nested lists hold the children of the preceding bookmark.
            # Recursing handles any depth; the previous hand-unrolled loops
            # tested the wrong variable at the third level and kept a stale
            # indent for siblings that followed a nested list.
            for entry in entries:
                if isinstance(entry, list):
                    write_entries(entry, level + 1, out)
                elif isinstance(entry, PyPDF2.generic.Destination):
                    out.write('\t' * level + entry['/Title'] + '\t' +
                              str(pageLabels[entry.page.idnum]) + '\r\n')

        with codecs.open(bookmark_file_savepath, 'w',
                         encoding='utf-8') as bookmark_file:
            for outline in outlines:
                print(len(outline), outline)
                write_entries([outline], 0, bookmark_file)
Пример #16
0
def searchPDF(filename,search_term):
    """Return the 1-based numbers of the pages whose text contains a term.

    :param filename: path of the PDF to search
    :param search_term: text to look for; matching is case-insensitive
    :return: list of page numbers in ascending order
    """
    search_term = search_term.lower()
    # The context manager closes the file once every page has been scanned.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        return [
            i + 1
            for i in range(pdf.getNumPages())
            if search_term in pdf.getPage(i).extractText().lower()
        ]
Пример #17
0
 def convertPDFAlternative(self, path):
     """Append the text of every page of the PDF at ``path`` to ``self.pages``.

     :param path: path of the PDF to read
     :return: False when ``path`` does not exist, True otherwise
     """
     from PyPDF2.pdf import PdfFileReader
     if not os.path.exists(path):
         return False
     # The context manager closes the file once all pages have been read.
     with open(path, "rb") as stream:
         pdf = PdfFileReader(stream)
         for i in range(pdf.getNumPages()):
             print(i)
             self.pages.append(pdf.getPage(i).extractText())
     return True
Пример #18
0
 def get(self, request, *args, **kwargs):
     """Draw the sample entries onto the form template and return it as a PDF.

     An overlay is drawn on an in-memory canvas by ``motion_purpose_draw``,
     merged into the first page of the template and sent back as the
     attachment ``hello.pdf``.
     """
     font_name = "HeiseiMin-W3"
     pdfmetrics.registerFont(UnicodeCIDFont(font_name))

     template = PdfFileReader('media/pdf/riyuu-format4.pdf')
     result = PdfFileWriter()

     overlay_stream = io.BytesIO()
     overlay = canvas.Canvas(overlay_stream)
     overlay.setFont(font_name, 11)

     # Positional arguments handed to motion_purpose_draw below.
     first_row = 295
     before_column = 748
     after_column = 776.5
     row_step = 11.9
     rows = [
         {'label': '便器からの立ち座り',
          'before_flag': True,
          'after_flag': False},
         {'label': 'トイレまでの移動',
          'before_flag': False,
          'after_flag': True},
         {'label': 'トイレ出入口の出入(扉の開閉含む)',
          'before_flag': True,
          'after_flag': False},
     ]

     material = PdfMaterial.objects.get(key="welfare_equipment")
     overlay = self.motion_purpose_draw(overlay, before_column, after_column,
                                        material.materials, rows,
                                        first_row, row_step)
     overlay.showPage()
     overlay.save()

     # Rewind so the freshly drawn overlay can be parsed from the start.
     overlay_stream.seek(0)
     overlay_page = PdfFileReader(overlay_stream).getPage(0)

     merged_page = template.getPage(0)
     merged_page.mergePage(overlay_page)
     result.addPage(merged_page)

     response_stream = io.BytesIO()
     result.write(response_stream)
     response_stream.seek(0)
     print('finish')
     return FileResponse(response_stream, as_attachment=True,
                         filename='hello.pdf')
Пример #19
0
def getDataUsingPyPdf2(filename):
    """Return the whitespace-normalised text of a PDF as ASCII bytes.

    The text of all pages is joined, non-breaking spaces are replaced by
    ordinary spaces, runs of whitespace collapse to a single space, and
    characters outside ASCII are dropped.

    :param filename: path of the PDF to read
    :return: ``bytes`` with the normalised text
    """
    # The context manager closes the file once all pages have been read.
    with open(filename, "rb") as stream:
        pdf = PdfFileReader(stream)
        content = "".join(
            pdf.getPage(i).extractText() + "\n"
            for i in range(pdf.getNumPages())
        )

    content = " ".join(content.replace("\xa0", " ").strip().split())
    return content.encode("ascii", "ignore")
Пример #20
0
def add_filler(options):
    """Collect the pages of the configured filler PDFs.

    Reads ``options['filler']``: nothing is collected unless ``include`` is
    truthy and ``order`` is non-empty. Each name in ``order`` is opened as
    ``<directory>/<name>.pdf``; files that cannot be opened are skipped with
    a warning. Pages with unexpected dimensions trigger a warning but are
    still included.

    :return: list of page objects in the configured order
    """
    filler_data = options['filler']
    if not filler_data["include"]:
        return []
    if len(filler_data["order"]) == 0:
        thread_print("WARNING: No filler ordering was specified, filler will not be added")
        return []

    collected = []
    for filename in filler_data["order"]:
        pdf_path = os.path.join(filler_data["directory"], f'{filename}.pdf')
        try:
            # The handle is intentionally left open; the returned pages are
            # still backed by this reader.
            filler = PdfFileReader(open(pdf_path, 'rb'))
            for index in range(filler.getNumPages()):
                candidate = filler.getPage(index)
                if not validate_mediabox(candidate.mediaBox, options):
                    thread_print(f'WARNING: Page {index + 1} in "{filename}" has incorrect dimensions\nExpected {options["page-size"]["width"]} x {options["page-size"]["height"]}, received {float(candidate.mediaBox.getWidth()) / inch} x {float(candidate.mediaBox.getHeight()) / inch}.')
                collected.append(filler.getPage(index))
        except OSError:
            thread_print(f'WARNING: Unable to open file "{filename}.pdf", this item will be skipped.')
            continue
    return collected
Пример #21
0
 def get(self, request, *args, **kwargs):
     """Stamp a test string onto the sample PDF and return it as a download.

     The text is drawn on an in-memory canvas, merged into the first page of
     ``media/pdf/sample.pdf`` and sent back as the attachment ``hello.pdf``.
     """
     font_name = "HeiseiKakuGo-W5"
     pdfmetrics.registerFont(UnicodeCIDFont(font_name))

     overlay_stream = io.BytesIO()
     overlay = canvas.Canvas(overlay_stream)
     base_page = PdfFileReader('media/pdf/sample.pdf').getPage(0)

     overlay.setFont(font_name, 24)
     overlay.drawString(0, 820, "テスト")
     overlay.showPage()
     overlay.save()

     # Rewind so the freshly drawn overlay can be parsed from the start.
     overlay_stream.seek(0)
     base_page.mergePage(PdfFileReader(overlay_stream).getPage(0))

     result = PdfFileWriter()
     result.addPage(base_page)
     response_stream = io.BytesIO()
     result.write(response_stream)
     response_stream.seek(0)
     return FileResponse(response_stream, as_attachment=True,
                         filename='hello.pdf')
Пример #22
0
def pdfSplit(pdf_main, pdf_part):
    """Write the last page of a PDF, rotated 90 degrees clockwise, to a new file.

    :param pdf_main: PDF to read (anything ``PdfFileReader`` accepts)
    :param pdf_part: path of the single-page PDF to create
    :return: the number of pages of ``pdf_main`` minus one, or ``False`` when
        anything goes wrong. A one-page input yields 0, so callers should
        test the result with ``is False`` rather than for truthiness.
    """
    try:
        pdf_read_obj = PdfFileReader(pdf_main)
        pdf_write_obj = PdfFileWriter()
        page_num = pdf_read_obj.getNumPages()
        page_last_obj = pdf_read_obj.getPage(page_num - 1)
        page_last_obj.rotateClockwise(90)
        pdf_write_obj.addPage(page_last_obj)
        # Close the output explicitly so the data is flushed to disk.
        with open(pdf_part, 'wb') as out_stream:
            pdf_write_obj.write(out_stream)
        return page_num - 1
    except Exception:
        # Existing contract: every failure is reported as False.
        return False
Пример #23
0
def clickOK():
    """Print the page count of the selected PDF and the text of one page.

    The file name (without ``.pdf``) comes from ``selectPDF`` and the
    zero-based page index from ``entry1``. "except" is printed when the index
    is not a number or the page text cannot be extracted.
    """
    # The context manager closes the file when the handler finishes.
    with open(selectPDF.get() + '.pdf', 'rb') as stream:
        File = PdfFileReader(stream)
        page_cound = File.getNumPages()
        pprint.pprint(page_cound)

        # NOTE(review): entry1 is assumed to return text (e.g. a Tk Entry).
        # Comparing that text with the integer loop index never matched, so
        # it is converted explicitly here.
        try:
            wanted = int(entry1.get())
        except (TypeError, ValueError):
            print("except")
            return

        ageList = []
        if 0 <= wanted < page_cound:
            try:
                ageList.append(File.getPage(wanted).extractText())
                pprint.pprint(ageList[0])
            except Exception:
                print("except")
Пример #24
0
def PdfPassword(filepath, password):
    """Write a password-protected copy of a PDF next to the original.

    The copy is named ``temp_<ts>_<original name>`` and is encrypted with
    ``password``. A message is printed and the process exits when
    ``filepath`` is not an existing file or does not end in ``.pdf``.
    """
    # Guard clauses: bail out early on anything that is not an existing PDF.
    if not os.path.isfile(filepath):
        print("Check The File Path")
        sys.exit()

    directory, basename = os.path.split(filepath)
    suffix = os.path.splitext(filepath)[1]
    if suffix != ".pdf":
        print(f"Not A PDF File Given, File Has Extension: {suffix}")
        sys.exit()

    target = os.path.join(directory, f"temp_{ts}_{basename}")

    # Copy every page of the source into a fresh writer.
    encrypted = PdfFileWriter()
    source = PdfFileReader(filepath)
    for idx in range(source.numPages):
        encrypted.addPage(source.getPage(idx))

    encrypted.encrypt(password, use_128bit=True)

    with open(target, "wb") as handle:
        encrypted.write(handle)

    print('File Written To Path:', target)
Пример #25
0
def PdfMultiplePassword(filepaths, password):
    """Write a password-protected copy of each PDF in ``filepaths``.

    Copies are written as ``merge_enc_<n>_<ts>.pdf`` with ``n`` counting from
    1. A message is printed and the process exits when a path is not an
    existing file, or when any path does not end in ``.pdf``.
    """
    exists_flags = [os.path.isfile(candidate) for candidate in filepaths]
    suffixes = [os.path.splitext(candidate)[1] for candidate in filepaths]

    # Report the first path that does not exist.
    if not all(exists_flags):
        first_missing = exists_flags.index(False)
        print(f"File Doesn't Exists: {filepaths[first_missing]}")
        sys.exit()

    # Every input has to carry the .pdf extension.
    if any(suffix != ".pdf" for suffix in suffixes):
        print("Submit Only PDF Files")
        sys.exit()

    for count, path in enumerate(filepaths, start=1):
        pdf_writer = PdfFileWriter()
        pdf_reader = PdfFileReader(path)

        # Copy every page of this input into its own writer.
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

        output_file = f"merge_enc_{count}_{ts}.pdf"
        pdf_writer.encrypt(password, use_128bit=True)

        with open(output_file, 'wb') as handle:
            pdf_writer.write(handle)

        print('File Written To Path:', output_file)
Пример #26
0
def calculate_locations(filename,keywords):
    """Find where each keyword occurs on every page of a PDF.

    Each page is laid out with pdfminer and searched with ``get_location``;
    the lower-left corner of the page's trim box (read through PyPDF2) is
    passed along as the x/y origin.

    :param filename: path of the PDF to analyse
    :param keywords: iterable of keywords to look for
    :return: list of ``LocationKeeper`` objects, one per location found
    :raises PDFTextExtractionNotAllowed: when the document forbids extraction
    """
    locations = []
    # Both parsers read the same file; the handles are closed on exit.
    with open(filename, 'rb') as fp, open(filename, 'rb') as reader_fp:
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resources, layout parameters and the aggregating device
        # that produces one layout object per page. The earlier extra pass
        # over all pages with a plain device produced nothing and is gone.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        reader = PdfFileReader(reader_fp)

        for pagenum, miner_page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(miner_page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            page = reader.getPage(pagenum)
            x = page.trimBox[0].as_numeric()
            y = page.trimBox[1].as_numeric()
            # Handling special case
            if x > 0 and y < 0:
                x = 0
            for keyword in keywords:
                # Single-argument print calls behave the same on Python 2
                # and Python 3.
                print('********************************')
                co_ordinates = get_location(keyword, layout, x, y)
                print('Keyword %s , location %s' % (keyword, co_ordinates))
                print('********************************')
                if co_ordinates is not None:
                    for location in co_ordinates:
                        print("PageNum-->%s" % pagenum)
                        locations.append(
                            LocationKeeper(keyword, location, pagenum))
    return locations
Пример #27
0
def getTitlePDFfromBookmarkfile(pdf_filepath, bookmark_filepath,
                                pdf_filepath_output):
    """Extract the pages around the "目录" bookmark into a new PDF.

    The bookmark file holds tab-separated lines ending in a page number. The
    start page is taken from the last line containing "目录"; the end page is
    derived from the first listed page number greater than the start.

    :param pdf_filepath: path of the PDF to read
    :param bookmark_filepath: path of the UTF-8 bookmark text file
    :param pdf_filepath_output: path of the PDF to create
    :return: None; nothing is written when no usable page range is found
    """
    with codecs.open(bookmark_filepath, 'r',
                     encoding='utf-8') as bookmark_file:
        lines = bookmark_file.readlines()

    page_start = 0
    for line in lines:
        if line.find(u'目录') >= 0:
            line = line.strip()
            print(line)
            print(line.split('\t'))
            page_start = int(line.split('\t')[1])
    page_start -= 1
    print(page_start)

    page_list = []
    for line in lines:
        line = line.strip()
        if line.find('\t') >= 0:
            page_list.append(int(line.rsplit('\t', 1)[1]))

    page_end = 0
    for page_number in page_list:
        if page_number > page_start:
            page_end = page_number
            break
        # NOTE(review): this decrement runs once per skipped entry and is
        # overwritten as soon as a match is found. It may have been meant to
        # run after the loop; kept as-is to preserve the current range.
        page_end -= 1
    print(page_end)
    if page_end <= page_start and page_start >= 0 and page_end > 0:
        print('not find title page')
        return

    # Context managers make sure the output is flushed and every handle is
    # released.
    with open(pdf_filepath, "rb") as pdf_stream:
        pdf = PdfFileReader(pdf_stream)
        output = PdfFileWriter()
        for i in range(page_start, page_end + 1):
            output.addPage(pdf.getPage(i))
        with open(pdf_filepath_output, 'wb') as stream:
            output.write(stream)
Пример #28
0
def split_pdf(inFile, outFile):
    '''
    Copy a PDF without its first two pages.

    :param inFile:     path of the input file
    :param outFile:    path of the output file
    :return: None
    '''
    # Context managers close both files and flush the output.
    with open(inFile, 'rb') as source:
        pdfFileReader = PdfFileReader(source)
        pdfFileWriter = PdfFileWriter()
        page_count = pdfFileReader.getNumPages()
        print(page_count)
        # Everything from the third page on goes into the new file.
        for i in range(2, page_count):
            pdfFileWriter.addPage(pdfFileReader.getPage(i))
        with open(outFile, 'wb') as target:
            pdfFileWriter.write(target)
Пример #29
0
def enumerate_pages(files, options, style=0, start=None, page_map=None, write_pages=False, no_filler_before=None, verbose=False):
    """Read the given PDFs and optionally collect their numbered pages.

    Each file gets a label from a running counter: integers when ``style``
    is 0, successive characters starting at ``'A'`` otherwise. Pages of a
    multi-page file are labelled ``<counter>.<page>``.

    :param files: paths of the PDFs to read, in output order
    :param options: settings passed on to the validation/numbering helpers
    :param style: 0 for numeric labels, anything else for letters
    :param start: first label; defaults to 1 or ``'A'`` depending on style
    :param page_map: optional dict that receives ``label -> file``
    :param write_pages: pages are collected only when this is true
    :param no_filler_before: when truthy, filler pages are loaded and may be
        interlaced before every file not listed here
    :param verbose: accepted but not used in this function
    :return: list of collected pages (empty unless ``write_pages`` is set)
    """
    numeric = style == 0
    if start is None:
        start = 1 if numeric else 'A'
    counter = start

    # Filler is only loaded when the caller asked for interlacing.
    filler = add_filler(options) if no_filler_before else None
    valid_filler_indeces = []
    pages = []

    for file in files:
        check_stop_script()
        try:
            if page_map is not None:
                page_map[counter] = file
            if filler and file not in no_filler_before:
                valid_filler_indeces.append(len(pages))

            # The handle is left open; collected pages are still backed by it.
            reader = PdfFileReader(open(file, 'rb'))
            num_pages = reader.getNumPages()
            page_num = f'{counter}'

            for i in range(num_pages):
                input_page = reader.getPage(i)

                # Pages with the wrong dimensions are reported and skipped.
                mediabox = input_page.mediaBox
                if not validate_mediabox(mediabox, options):
                    thread_print(f'WARNING: Page {i + 1} in "{file}" has incorrect dimensions\nExpected {options["page-size"]["width"]} x {options["page-size"]["height"]}, received {float(mediabox.getWidth()) / inch} x {float(mediabox.getHeight()) / inch}.')
                    continue

                if not write_pages:
                    continue
                if num_pages > 1:
                    page_num = f'{counter}.{i + 1}'
                if options["enumerate-pages"]:
                    pages.append(add_page_num(input_page, page_num, options))
                else:
                    pages.append(input_page)

        except OSError:
            thread_print(f'Error when parsing "{file}"')

        # Advance the label whether or not the file could be read.
        counter = (counter + 1) if numeric else chr(ord(counter) + 1)

    if filler:
        interlace_filler(pages, filler, valid_filler_indeces)

    return pages
Пример #30
0
 def _removePropertyEndPage(self, file_pdf):
     '''Remove the last page of the PDF at ``file_pdf``, rewriting it in place.

     The shortened document is written to ``<file_pdf>tmp.pdf`` first and then
     moved over the original file.
     '''
     tmp_pdf = file_pdf + 'tmp.pdf'
     # Both handles are closed before the files are removed/renamed, even
     # when copying fails.
     with open(file_pdf, "rb") as fd_in:
         pdf_in = PdfFileReader(fd_in)
         pdf_out = PdfFileWriter()
         for num in range(pdf_in.getNumPages() - 1):
             pdf_out.addPage(pdf_in.getPage(num))
         with open(tmp_pdf, "wb") as fd_out:
             pdf_out.write(fd_out)
     os.remove(file_pdf)
     os.rename(tmp_pdf, file_pdf)
     print('   > 已把最后一页删除')
class PdfLoader:
    """Read text and rendered page images from one PDF file.

    Page numbers passed to the methods are 1-based.
    """

    def __init__(self, filename):
        self.filename = filename
        # Kept on the instance so that close() can release it.
        self._stream = open(filename, "rb")
        self.pdf_reader = PdfFileReader(self._stream)

    def close(self):
        """Close the underlying PDF file."""
        self._stream.close()

    def get_number_of_pages(self):
        """Return the page count reported by the reader."""
        return self.pdf_reader.getNumPages()

    def get_page_text(self, pageno):
        """Return the extracted text of page ``pageno`` (1-based)."""
        page = self.pdf_reader.getPage(pageno - 1)
        return page.extractText()

    def get_page_image(self, pageno):
        """Render page ``pageno`` with Ghostscript and return the image.

        :raises LoadError: when Ghostscript exits with a non-zero status
        """
        # Only the unique name is needed; Ghostscript creates the file itself.
        tmp_file = tempfile.NamedTemporaryFile("wb")
        path = tmp_file.name
        tmp_file.close()

        with open(os.devnull, 'wb') as stdio:
            return_value = call(["gs",
                                 "-sDEVICE=png16m",
                                 "-dNOPAUSE", "-dFirstPage=%d" % pageno,
                                 "-dLastPage=%d" % pageno,
                                 "-sOutputFile=%s" % path,
                                 "-r300",
                                 "-q",
                                 self.filename,
                                 "-c",
                                 "quit"],
                                stdout=stdio,
                                stderr=stdio)

        if return_value != 0:
            try:
                os.unlink(path)
            except OSError:
                # Best effort: the output may never have been created.
                pass
            raise LoadError()

        img = Image.open(path)
        try:
            # Load the pixel data before the backing file is deleted.
            img.load()
        finally:
            os.unlink(path)
        return img
Пример #32
0
def merge_page_nums(pages: List[PageObject], options, filename='page_nums.pdf'):
    """Merge each given page onto the matching page of the page-number PDF.

    The page-number PDF is read from ``<options["folder-dir"]>/tmp/<filename>``.
    For the i-th entry of ``pages`` the i-th page of that PDF is fetched,
    ``pages[i]`` is merged into it with ``mergePage`` and the result is
    collected.

    :param pages: pages to merge into the page-number pages, in order.
    :param options: mapping that provides the ``"folder-dir"`` entry.
    :param filename: name of the page-number PDF inside the ``tmp`` folder.
    :return: list of the merged page-number pages, one per input page.
    """
    tmp_dir = os.path.join(options["folder-dir"], "tmp")
    overlap_path = os.path.join(tmp_dir, "page_num_overlap.pdf")
    merged = []

    with open(os.path.join(tmp_dir, filename), 'rb') as numbers_stream:
        numbers_reader = PdfFileReader(numbers_stream)
        for index, overlay in enumerate(pages):
            numbered = numbers_reader.getPage(index)
            numbered.mergePage(overlay)

            # For some reason the text doesn't appear properly unless the
            # page goes through a writer first.
            thread_print("Writing extra output file because this is somehow necessary")
            scratch = PdfFileWriter()
            scratch.addPage(numbered)
            # The actual write is disabled; the file is only (re)created
            # empty on every iteration.
            with open(overlap_path, 'wb'):
                pass

            merged.append(numbered)

    return merged
Пример #33
0
def addBlankpage(inFile, outFile):
    '''
    Copy every page of ``inFile`` to ``outFile`` and append one blank page.

    :param inFile: source PDF, passed straight to ``PdfFileReader``.
    :param outFile: path of the PDF file to create or overwrite.
    '''
    pdfFileWriter = PdfFileWriter()

    # Build the reader; an open binary file object would work as well:
    # PdfFileReader(open(inFile, 'rb'))
    pdfFileReader = PdfFileReader(inFile)

    # Queue every source page on the writer.
    for index in range(pdfFileReader.getNumPages()):
        pdfFileWriter.addPage(pdfFileReader.getPage(index))

    # Append an empty page after the last copied page.
    pdfFileWriter.addBlankPage()

    # Serialise once, after the document is complete, instead of rewriting
    # the output for every page; the context manager closes the handle.
    with open(outFile, 'wb') as out_stream:
        pdfFileWriter.write(out_stream)
Пример #34
0
def pdf_read(file_name):
    """Print the page count and the extracted text of every page of a PDF.

    The text of each page is followed by a newline; nothing is returned.

    :param file_name: path of the PDF file to read.
    """
    # ``open`` exists on both Python 2 and 3, and the ``with`` block makes
    # sure the handle is closed once extraction is done.
    with open(file_name, "rb") as stream:
        pdf = PdfFileReader(stream)
        pagecount = pdf.getNumPages()
        print('pagecount:', pagecount)
        # Join in one pass rather than growing a string page by page.
        content = "".join(
            pdf.getPage(i).extractText() + "\n" for i in range(pagecount)
        )
    # Single-argument call form prints the same on Python 2 and 3.
    print(content)
Пример #35
0
 def _create_hyperlinks(self, link_locations, page_locations):
     """Copy ``tmp2.pdf`` to ``self.filename`` with one link per location.

     For every index ``i`` a link with rectangle ``link_locations[i]`` is
     added that targets page ``page_locations[i] - 1``.  The page carrying
     the link is derived from ``i`` and the per-page item count in
     ``settings`` for the current ``self.toc_orientation`` ("P" or "L");
     any other orientation puts the link on page index 1.

     :param link_locations: link rectangles, indexable by position.
     :param page_locations: destination page numbers, same indexing.
     """
     source = PdfFileReader("tmp2.pdf")
     target = PdfFileWriter()

     # Carry all pages over unchanged.
     for number in range(source.getNumPages()):
         target.addPage(source.getPage(number))

     for position in range(len(link_locations)):
         if self.toc_orientation == "P":
             toc_page = math.floor(position / settings["Items on vertical toc"])
         elif self.toc_orientation == "L":
             toc_page = math.floor(position / settings["Items on horizontal toc"])
         else:
             toc_page = 1

         target.addLink(
             pagenum=toc_page,
             pagedest=page_locations[position] - 1,
             rect=link_locations[position],
             fit="/Fit",
             border=[0, 0, 0],
         )

     with open(self.filename, 'wb') as stream:
         target.write(stream)
Пример #36
0
def crop(pdf_in, pdf_out):
    """
    Crop every page of a PDF to the paper size detected for that page.

    Parameters
    pdf_in - absolute path to the source PDF
    pdf_out - absolute path for the resulting PDF
    :return: status (always True)
    :raises ValueError: if the press named for a page is not in
        ``PrintingPress._registry``

    NOTE: the second parameter, pdf_out, was added temporarily. In
    production the crop result is meant to be saved into the same file.
    """
    import subprocess  # local import: only the fallback branch needs it

    status = True

    # Paper size for every page, keyed by 1-based page number,
    # like {1: ('Speedmaster', 900, 640), 2: ('Dominant', 640, 450)}
    papers = analyze_papersize(pdf_in)

    # TODO: rework this temporary crop fallback for the case where no
    # paper-size information is available.
    if papers == {}:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        subprocess.call(["perl", "pdfcrop.pl", pdf_in, pdf_out])
        return status

    # The input stays open until the output has been written, and both
    # handles are closed even if a page fails.
    with open(pdf_in, "rb") as source:
        reader = PdfFileReader(source)
        writer = PdfFileWriter()

        # Number of pages
        pages_qty = reader.getNumPages()

        for index in range(pages_qty):
            paper = papers[index + 1]
            paper_machine = paper[0]
            paper_w = paper[1]
            paper_h = paper[2]

            # Look the press up afresh for every page so that a miss can
            # never silently reuse the previous page's press.  As before,
            # the last registry entry with a matching name wins.
            machine = None
            for m in PrintingPress._registry:
                if paper_machine == m.name:
                    machine = m
            if machine is None:
                raise ValueError(
                    'Unknown printing press {!r} for page {}'.format(
                        paper_machine, index + 1))

            plate_w = machine.plate_w

            page = reader.getPage(index)

            # EXAMPLE
            # The resulting document has a trim box that is 200x200 points
            # and starts at 25,25 points inside the media box.
            # The crop box is 25 points inside the trim box.
            #   page.trimBox.lowerLeft = (25, 25)
            #   page.trimBox.upperRight = (225, 225)
            #   page.cropBox.lowerLeft = (50, 50)
            #   page.cropBox.upperRight = (200, 200)

            print('Crop page {} to paper {}x{}'.format(index+1, paper_w, paper_h))
            # left offset, bottom offset
            # (machine.klapan is presumably the gripper margin -- confirm)
            page.mediaBox.lowerLeft = ((pt(plate_w - paper_w)/2), pt(machine.klapan))
            # width + offset, height + offset
            page.mediaBox.upperRight = (pt(paper_w + (plate_w - paper_w)/2), pt(paper_h + machine.klapan))

            writer.addPage(page)

        with open(pdf_out, "wb") as outputstream:
            writer.write(outputstream)

    return status