Пример #1
1
def diff_pdf_pages(pdf1_path, pdf2_path):
    """Return the indexes of the pages that differ between two PDFs.

    Despite their names, both arguments are the raw PDF *contents* (bytes):
    each one is wrapped in ``io.BytesIO`` before being parsed.

    :param pdf1_path: bytes of the first PDF; a falsy value means "no first PDF"
    :param pdf2_path: bytes of the second PDF
    :return: list of 0-based page indexes.  When ``pdf1_path`` is falsy, every
        page index of the second PDF is returned.  Otherwise the indexes are
        taken over the pages of the first PDF: a page is listed when it has no
        counterpart in the second PDF or when the two single-page documents
        serialise to different bytes.

    NOTE(review): pages that exist only in the second PDF are not reported
    when a first PDF is given - confirm this asymmetry is intended.
    """
    second = PdfFileReader(io.BytesIO(pdf2_path))
    second_count = second.getNumPages()

    if not pdf1_path:
        return list(range(second_count))

    first = PdfFileReader(io.BytesIO(pdf1_path))

    def _page_bytes(reader, index):
        # Serialise one page as a stand-alone PDF so pages can be compared bytewise.
        single = PdfFileWriter()
        single.addPage(reader.getPage(index))
        sink = io.BytesIO()
        single.write(sink)
        return sink.getvalue()

    changed = []
    for index in range(first.getNumPages()):
        missing = index >= second_count
        if missing or _page_bytes(first, index) != _page_bytes(second, index):
            changed.append(index)
    return changed
def main():
    """Command-line entry point: process each page of a PDF and rebuild it from images.

    Every page of the input PDF is handed to ``process_page`` together with a
    flag telling whether its index is below ``--skip``.  The images expected at
    ``./temp/<index>.jpg`` (presumably produced by ``process_page`` - TODO
    confirm) are then converted into a single PDF written to ``--output``.
    The ``./temp/`` directory is removed afterwards, even when processing fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_pdf_path', metavar='PATH')
    # The result is always written to --output, so require it up front instead
    # of failing on ``None.write`` after every page has already been processed.
    parser.add_argument('-o', '--output', metavar='out', type=argparse.FileType('wb'),
                        required=True, help='Output PDF file')
    parser.add_argument('-s', '--skip', type=int, default=0,
                        help='Skip over the first n page(s).')
    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logging.basicConfig(level='INFO', format='%(asctime)s - %(levelname)s - %(message)s')

    directory = './temp/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    try:
        images_path = []
        # Close the input PDF as soon as all pages have been processed.
        with open(args.input_pdf_path, "rb") as input_stream:
            pdf = PdfFileReader(input_stream)
            page_total = pdf.getNumPages()
            for i in range(page_total):
                logger.info("Processing page {}/{}".format(i + 1, page_total))
                images_path.append("./temp/{}.jpg".format(i))
                process_page(pdf, i, i < args.skip)

        logger.info('Writing to output PDF file')
        args.output.write(img2pdf.convert(*list(map(img2pdf.input_images, images_path))))
        logger.info('Done')
    finally:
        # Always clean up the scratch directory, including on errors.
        shutil.rmtree(directory, True)
Пример #3
0
    def write_pdf(self, output):
        """Render this object as a PDF and write it to *output*.

        The RML template is chosen with ``select_template`` from three
        candidates (subject-type slug, subject type, generic ``subject.rml``),
        rendered with this object and the current site, and converted to PDF.
        When the print setup has a background PDF, each generated page is
        merged onto the background page with the same index; generated pages
        beyond the background's length are used as they are.

        :param output: writable binary file-like object
        :return: the same *output* object
        """
        # candidate templates, in the order they are offered to select_template
        candidates = [
            'leprikon/{}/{}.rml'.format(self.pdf_export, self.subject.subject_type.slug),
            'leprikon/{}/{}.rml'.format(self.pdf_export, self.subject.subject_type.subject_type),
            'leprikon/{}/subject.rml'.format(self.pdf_export),
        ]
        chosen = select_template(candidates)
        context = {
            'object': self,
            'site': LeprikonSite.objects.get_current(),
        }
        rendered_rml = chosen.render(context)
        pdf_content = trml2pdf.parseString(rendered_rml.encode('utf-8'))

        # no background configured: the plain PDF is the final result
        if not self.print_setup.background:
            output.write(pdf_content)
            return output

        background_pdf = PdfFileReader(self.print_setup.background.file)
        generated_pdf = PdfFileReader(BytesIO(pdf_content))
        merged = PdfFileWriter()
        for index in range(generated_pdf.getNumPages()):
            if index >= background_pdf.getNumPages():
                # background has fewer pages: keep the generated page untouched
                merged.addPage(generated_pdf.getPage(index))
                continue
            base_page = background_pdf.getPage(index)
            base_page.mergePage(generated_pdf.getPage(index))
            merged.addPage(base_page)
        merged.write(output)
        return output
Пример #4
0
    def iter_pdf_page_text(self, filename):
        """Yield the text of every page of the PDF at *filename*, in page order.

        The page count and the document information are logged first.  Each
        page's text has its newlines replaced by spaces and is then encoded to
        ASCII, silently dropping every non-ASCII character.

        :param filename: path of the PDF file; also stored on ``self.filename``
        :return: generator of ASCII-encoded page texts
        """
        self.filename = filename
        # The file stays open only while the generator is being consumed.
        with open(filename, "rb") as pdf_file:
            reader = PdfFileReader(pdf_file)
            page_count = reader.getNumPages()
            logging.info("pdf scanner found %d pages in %s" % (page_count, filename))
            logging.info("METADATA: " + str(reader.getDocumentInfo()))

            for pgnum in range(page_count):
                text = reader.getPage(pgnum).extractText()
                # Replace newlines *before* encoding: the result is the same,
                # but calling replace() with str arguments on the encoded
                # bytes raises TypeError on Python 3.
                text = text.replace('\n', ' ')
                yield text.encode('ascii', 'ignore')
Пример #5
0
def imp_exp_pdf(inputfile, outputfile, size, margin, padding):
    """Import a PDF, re-lay its pages out in batches of ten and export the result.

    Pages are collected ten at a time and each batch is passed to
    ``output_one_page`` together with *size*, *margin*, *padding* and the
    shared writer; a final, shorter batch is flushed the same way.  Progress
    is printed after every full batch.

    :param inputfile: path of the PDF to read
    :param outputfile: path of the PDF to write
    :param size: forwarded unchanged to ``output_one_page``
    :param margin: forwarded unchanged to ``output_one_page``
    :param padding: forwarded unchanged to ``output_one_page``
    """
    output = PdfFileWriter()
    # Keep the source open until the writer has been saved: the pages handed
    # to the writer may still be read from the input stream at write time.
    with open(inputfile, 'rb') as source:
        reader = PdfFileReader(source, strict=False)
        total_pages = reader.getNumPages()
        batch = []

        for i in range(total_pages):
            batch.append(reader.getPage(i))
            if len(batch) == 10:
                output_one_page(batch, size, margin, padding, output)
                batch = []
                print("Printed {} of {}  [{:.2f}%]".format(
                    i + 1, total_pages, (i + 1) / float(total_pages) * 100))

        # flush the trailing, partially filled batch
        if batch:
            output_one_page(batch, size, margin, padding, output)

        print('')
        print('Completed converting.')
        print('Saving...')
        with open(outputfile, "wb") as output_stream:
            output.write(output_stream)
    print('END OF PROGRAM')
Пример #6
0
def union(input_files, output_file):
    """Concatenate PDFs and images into a single PDF.

    Inputs whose name ends with ``.pdf`` (case-insensitively) are appended
    page by page.  Any other input is opened with PIL, converted to a PDF in
    memory at a resolution of 100 and appended the same way.

    :param input_files: iterable of input file paths, in output order
    :param output_file: path of the merged PDF to write
    """
    import io

    output = PdfFileWriter()
    open_files = []  # closed only after the merged PDF has been written
    try:
        for input_file in input_files:
            if input_file.lower().endswith('.pdf'):
                stream = open(input_file, 'rb')
                open_files.append(stream)
            else:  # input_file isn't pdf ex. jpeg, png
                # Convert in memory: writing "<name>.pdf" next to the image
                # could overwrite, and then delete, an unrelated real PDF.
                stream = io.BytesIO()
                im = PIL.Image.open(input_file)
                im.save(stream, 'PDF', resolution=100.0)
                stream.seek(0)

            reader = PdfFileReader(stream)
            for i in range(reader.getNumPages()):
                output.addPage(reader.getPage(i))

        with open(output_file, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        for handle in open_files:
            handle.close()

    print('completed.')
    print('Union of some file is ' + output_file)
Пример #7
0
 def _iter_pdf(self, filename):
     """Yield the text of every page of the PDF at *filename*, in page order.

     Newlines in each page's text are replaced by spaces and the text is then
     encoded to ASCII, silently dropping every non-ASCII character.  The file
     is open only while the generator is being consumed.

     :param filename: path of the PDF file
     :return: generator of ASCII-encoded page texts
     """
     with open(filename, 'rb') as f:
         reader = PdfFileReader(f)
         page_count = reader.getNumPages()
         logging.debug("pdf scanner found %d pages in %s", page_count, filename)
         for pgnum in range(page_count):
             text = reader.getPage(pgnum).extractText()
             # Replace newlines *before* encoding: the result is the same, but
             # replace() with str arguments on bytes raises TypeError on Python 3.
             text = text.replace('\n', ' ')
             yield text.encode('ascii', 'ignore')
Пример #8
0
 def iter_pdf_page_text(self, filename):
     """Yield the text of every page of the PDF at *filename*, in page order.

     Newlines in each page's text are replaced by spaces and the text is then
     encoded to ASCII, silently dropping every non-ASCII character.

     :param filename: passed directly to ``PdfFileReader``; also stored on
         ``self.filename``
     :return: generator of ASCII-encoded page texts
     """
     self.filename = filename
     reader = PdfFileReader(filename)
     page_count = reader.getNumPages()
     logging.info("pdf scanner found %d pages in %s" % (page_count, filename))
     for pgnum in range(page_count):
         text = reader.getPage(pgnum).extractText()
         # Replace newlines *before* encoding: the result is the same, but
         # replace() with str arguments on bytes raises TypeError on Python 3.
         text = text.replace('\n', ' ')
         yield text.encode('ascii', 'ignore')
Пример #9
0
def add_page_numbers(inputfile, outputfile, startno=None, endno=None, fontname="Helvetica", fontsize=12,
                     pagenoformat="- %i -", pagesize=A4, posx=280, posy=800):
    """
    Adds page numbers to the input PDF file and stores the modified PDF in output.
    Optionally, the page range can be limited.
    :param inputfile: the input PDF
    :type inputfile: str
    :param outputfile: the output PDF
    :type outputfile: str
    :param startno: the first page to number, 1-based, use None to start from first page
    :type startno: int
    :param endno: the last page to number, 1-based, use None to end with last page
    :type endno: int
    :param fontname: the name of the font to use, eg 'Helvetica'
    :type fontname: str
    :param fontsize: the size of the font, eg 12
    :type fontsize: int
    :param pagenoformat: the format string for the page number, eg '- %i -'
    :type pagenoformat: str
    :param pagesize: the page size, eg A4
    :type pagesize: object
    :param posx: the X position for the page number
    :type posx: int
    :param posy: the Y position for the page number
    :type posy: int
    """
    import io

    outputpdf = PdfFileWriter()
    # The input must stay open until the output has been written, so the
    # whole operation happens inside this ``with`` block.
    with open(inputfile, "rb") as inputstream:
        inputpdf = PdfFileReader(inputstream)
        numpages = inputpdf.getNumPages()

        if startno is None:
            startno = 1
        if endno is None:
            endno = numpages
        for i in range(numpages):
            page = i + 1
            current = inputpdf.getPage(i)
            # add page number?
            # taken from here: http://stackoverflow.com/a/17538003
            if startno <= page <= endno:
                # A one-page PDF holding only the number is rendered in memory
                # and stamped onto the current page.  PDF data is binary, so
                # a bytes buffer is used.
                packet = io.BytesIO()
                can = canvas.Canvas(packet, pagesize=pagesize)
                can.setFont(fontname, fontsize)
                can.drawString(posx, posy, pagenoformat % page)
                can.save()
                packet.seek(0)
                pagenopdf = PdfFileReader(packet)
                logger.info("Page " + str(page) + " added")
                current.mergePage(pagenopdf.getPage(0))
            else:
                logger.info("Page " + str(page))
            outputpdf.addPage(current)

        with open(outputfile, "wb") as outputstream:
            outputpdf.write(outputstream)
Пример #10
0
 def process(self, content, mimetype='application/pdf'):
     """Process a PDF document.
     Args:
         content: Binary content of the document.
         mimetype: Id of MIME type (content ignored if it isn't `application/pdf`).
     Returns:
         Tuple:
             Relevancy of the document (based on keywords)
             Metadata extracted from the document (dictionary).
     The metadata always contains `_relevancy`; for PDFs it also contains
     `_num_pages` and one entry per document-information key.
     """
     metadata = {}
     if mimetype != 'application/pdf':
         # Anything that is not a PDF is ignored and scores zero.
         metadata['_relevancy'] = 0
         return 0, metadata

     relevancy = 0
     # Obtain metadata
     doc = PdfFileReader(BytesIO(content))
     info = doc.getDocumentInfo()
     if info:
         for k in info:
             metadata[k] = info.getText(k)
     # Extra metadata
     num_pages = doc.getNumPages()
     metadata['_num_pages'] = num_pages
     # Process title, subject and metadata keywords
     # TODO guess title from page text when not provided
     if self.keywords:
         keywords = [word.lower() for word in self.keywords]
         # A metadata entry may be present but hold None (getText returns
         # None for values that are not text strings), so fall back to ''
         # instead of concatenating None with a string.
         relevant = ' '.join(
             metadata.get(field) or ''
             for field in ('/Title', '/Subject', '/Keywords')
         ).lower()
         for word in keywords:
             if word in relevant:
                 # Each relevant keyword increases relevancy in 10 points
                 relevancy += 10
         # Process pages.
         distance_factor = 1
         for p in range(num_pages):
             # Break if factor is too low
             if distance_factor < 0.01:
                 break
             try:
                 text = doc.getPage(p).extractText().lower()
             except Exception:
                 # Some bad formed PDFs raise decoding errors. Skip page.
                 pass
             else:
                 for word in keywords:
                     relevancy += distance_factor * text.count(word)
             # Each new page reduces relevancy factor in a half.  Divide by a
             # float so the factor does not truncate to 0 on Python 2.
             distance_factor /= 2.0
     # Relevancy is significant by the nearest tenth
     relevancy = round(relevancy, 1)
     metadata['_relevancy'] = relevancy
     return relevancy, metadata
Пример #11
0
def manipulatePDF():
    """Insert every test-table PDF into the BDSD PDF and write 'merger output.pdf'.

    Reads the module globals ``BDSDFullName``, ``contentTXT``, ``totalInTxt``
    and ``testTableDict``.  The insertion page of each test table is looked up
    in the mapping returned by ``BDSDContentFillter(contentTXT)``; after each
    insertion that mapping is updated through ``dictIncrement`` with the
    insertion page and the number of inserted pages.  A test table whose item
    is missing from the mapping is reported and skipped.

    The process exits when the BDSD page count differs from ``totalInTxt + 1``.
    """
    global BDSDFullName,contentTXT,totalInTxt
    global testTableName, testTableFullName,testTableDict
    openFiles = []  # every source PDF must stay open until the output is written
    try:
        bdsdFile = open(BDSDFullName[0],'rb')
        openFiles.append(bdsdFile)
        input0 = PdfFileReader(bdsdFile)
        merger1 = PdfFileMerger()
        numBDSD = input0.getNumPages()
        merger1.append(fileobj = input0, pages = (0,numBDSD)) #generate an instance for BDSD file
        tableCount = 0 #count how many test tables are inserted to BDSD file.
        testItemsPagesInitial = BDSDContentFillter(contentTXT)
        if numBDSD != totalInTxt+1:
            print('''\nError!\nNumber of pages in "content.txt" are different from the "BDSD file".
This process is forced to stop. Please check both files and then start over again.''')
            raw_input('Press any key to quit...')
            sys.exit(0)
        exceptCount = False
        for testTable in testTableDict:
            try:
                startPage = int(testItemsPagesInitial[getTestItem(testTableDict[testTable])])
            except KeyError:
                exceptCount = True
                errorMessage1_1 =  "\nError: '%s'" % testTable
                errorMessage1_2 =  "Above file is failed to merge into BDSD. You may want to abort this process and check both:\n    1. file name of test table, or\n    2. BDSD page number."
                print(errorMessage1_1)
                print(errorMessage1_2)
                message(errorMessage1_1)
                message(errorMessage1_2)
                # No insertion page is known for this table: skip it instead of
                # merging at an unbound or stale position.
                continue
            tableFile = open(testTable,'rb')
            openFiles.append(tableFile)
            #open testtable and put all pages of it into a reader object.
            fileObj = PdfFileReader(tableFile)
            tableCount += 1
            pageIncrement = fileObj.getNumPages()
            pages = range(0, pageIncrement)
            merger1.merge(startPage , fileObj, pages)
            testItemsPagesInitial = dictIncrement(testItemsPagesInitial,startPage,pageIncrement)
        try:
            with open('merger output.pdf','wb') as outputFile:
                merger1.write(outputFile)
        except Exception:
            # A failed write also means the output cannot be trusted.
            exceptCount = True
            errorMessage =  "\nError: There's an error occured during generate the final output PDF file, please feedback this issue to ChuRui, thanks a lot.\n"
            print(errorMessage)
            message(errorMessage)
        if exceptCount:
            warningMessage= "Warning: output PDF file couldn't be used in case there is an Error.\n"
            print(warningMessage)
            message(warningMessage)
        else:
            print("\n%d Test Tables successfully merged to \"%s\", please check the output file." % (tableCount, BDSDFullName[0]))
    finally:
        for openFile in openFiles:
            openFile.close()
Пример #12
0
def merge_pdf(infnList, outfn):
  """
  Merge chapter PDFs into one bookmarked PDF.

  Each entry's PDF is read from ``gen/<title>/<title>.pdf`` next to this
  module, and each child chapter from ``gen/<title>/<child title>.pdf``.
  A top-level bookmark is added for every entry and a nested bookmark for
  every child chapter.  The whole ``gen`` directory is deleted after the
  merged file has been written.

  :param infnList: list of dicts with a 'title' key and an optional
      'child_chapters' list of dicts that each have a 'title' key
  :param outfn: file name of the merged PDF to write
  :return: None
  """
  gen_root = os.path.join(os.path.dirname(__file__), 'gen')
  pdf_output = PdfFileWriter()
  open_files = []  # source files stay open until the output has been written

  def append_pdf(path, title, start_page, parent=None):
    """Append all pages of *path*, bookmark its first page; return (bookmark, page count)."""
    stream = open(path, 'rb')
    open_files.append(stream)
    pdf_input = PdfFileReader(stream)
    # number of pages in this PDF
    page_count = pdf_input.getNumPages()
    for i in range(page_count):
      pdf_output.addPage(pdf_input.getPage(i))
    bookmark = pdf_output.addBookmark(title, pagenum=start_page, parent=parent)
    return bookmark, page_count

  pagenum = 0
  try:
    for pdf in infnList:
      # merge the first-level chapter first
      first_level_title = pdf['title']
      dir_name = os.path.join(gen_root, first_level_title)
      parent_bookmark, page_count = append_pdf(
          os.path.join(dir_name, first_level_title + '.pdf'),
          first_level_title, pagenum)
      pagenum += page_count

      # then its child chapters, if any
      for child in pdf.get('child_chapters') or []:
        second_level_title = child['title']
        _, page_count = append_pdf(
            os.path.join(dir_name, second_level_title + '.pdf'),
            second_level_title, pagenum, parent=parent_bookmark)
        pagenum += page_count

    # write the merged document
    with open(outfn, 'wb') as out_stream:
      pdf_output.write(out_stream)
  finally:
    for stream in open_files:
      stream.close()

  # delete all generated chapter files
  shutil.rmtree(gen_root)
Пример #13
0
 def get_pdf_dimensions(self, path):
     """Get pdf dimensions using PyPDF2.

     :param path: path of the PDF file
     :return: list with one dict per page holding 'page' (0-based index),
         'width' (lower-right x of the media box) and 'height' (upper-left y
         of the media box); None when the file cannot be opened or parsed, or
         when it has no pages
     """
     try:
         stream = open(path, "rb")
     except Exception:
         return None
     with stream:
         try:
             pdf = PdfFileReader(stream)
         except Exception:
             return None
         page_list = []
         for page_number in range(pdf.getNumPages()):
             page = pdf.getPage(page_number)
             page_list.append({'page': page_number, 'width': page.mediaBox.getLowerRight_x(), 'height': page.mediaBox.getUpperLeft_y()})
     # an empty document yields None, not an empty list
     return page_list or None
Пример #14
0
def merge_pdf_stack(request):
    """Merge two fixed PDFs plus a generated table of contents into 'output.pdf'.

    A table-of-contents document listing each file with its page range is
    built in memory.  The two PDFs are appended to a merger with one bookmark
    each, followed by the table-of-contents document, and the result is
    written to ``output.pdf`` in the current directory.

    :param request: the incoming request, only forwarded to ``render``
    :return: the rendered ``test.html`` response
    """
    pdfs = ["pdf1.pdf", "pdf2.pdf"]

    toc_buffer = BytesIO()

    doc = MyDocTemplateMerge(toc_buffer,
                             pagesize=PAGE_SIZE,
                             rightMargin=MARGIN_SIZE,
                             leftMargin=MARGIN_SIZE,
                             topMargin=85,
                             bottomMargin=18)

    open_files = []  # sources stay open until the merged file has been written
    try:
        readers = []
        for fname in pdfs:
            stream = open(fname, 'rb')
            open_files.append(stream)
            readers.append((fname, PdfFileReader(stream)))

        content = [Paragraph('Table of contents', ParagraphStyle('normal'))]
        # NOTE(review): numbering starts at 2 as if the table of contents were
        # page 1, yet it is appended *after* the documents below - confirm
        # which of the two is intended.
        no_page = 2
        for fname, reader in readers:
            number_of_page = reader.getNumPages()
            # last page of the range is first + count - 1
            content.append(Paragraph('%s          %s-%s' % (fname, no_page, no_page + number_of_page - 1),
                                     ParagraphStyle('normal')))
            no_page = no_page + number_of_page

        doc.build(content)
        merger = PdfFileMerger()
        merger.setPageMode('/UseOC')

        for fname, reader in readers:
            merger.append(reader, bookmark=fname, import_bookmarks=False)

        merger.append(toc_buffer)
        with open("output.pdf", "wb") as output:
            merger.write(output)
    finally:
        for stream in open_files:
            stream.close()

    return render(request, "test.html")
Пример #15
0
def AddPrint(infile, outpath):
  """Copy PDFs, adding JavaScript that opens the print dialog when they are opened.

  :param infile: path or glob pattern of the input PDF(s)
  :param outpath: output file or directory; when falsy, ``autoprint`` next to
      the input is used.  With several inputs it must be (or become) a
      directory.  An output directory equal to the input directory is
      replaced by its ``autoprint`` sub-path so inputs are not overwritten.
  """
  files = glob.glob(infile)
  indir = os.path.dirname(infile)
  if not outpath:
    outpath = os.path.join(indir, "autoprint")
    print("Defaulting output to " + outpath)

  outisdir = os.path.isdir(outpath)
  outexists = os.path.exists(outpath)

  if len(files) > 1 and not(outisdir):
    outpath = os.path.dirname(outpath)
    outisdir = os.path.isdir(outpath)
    outexists = os.path.exists(outpath)

  # dirname() is '' for a pattern without a directory part; samefile('')
  # would raise, so compare against the current directory in that case.
  if outisdir and os.path.samefile(indir or os.curdir, outpath):
    outpath = os.path.join(outpath, "autoprint")
    outisdir = os.path.isdir(outpath)
    outexists = os.path.exists(outpath)

  if not outexists and len(files) > 1:
    os.makedirs(outpath)
    outisdir = os.path.isdir(outpath)
    outexists = os.path.exists(outpath)

  # We have multiple files check if the output is a directory.
  if len(files) > 1 and not(outisdir):
    print('Out path must be a directory if infile is multiple files')
    return

  for f in files:
    if outisdir:
      outfile = os.path.join(outpath, os.path.basename(f))
    else:
      outfile = outpath

    output = PdfFileWriter()
    # keep the source open until the copy has been written
    with open(f, "rb") as inputStream:
      reader = PdfFileReader(inputStream)

      # print how many pages input has:
      print("Processing: '%s', %d pages" % (f, reader.getNumPages()))

      for x in range(0, reader.getNumPages()):
        output.addPage(reader.getPage(x))

      # add some Javascript to launch the print window on opening this PDF.
      output.addJS("this.print({bUI:true,bSilent:false,bShrinkToFit:true});")

      # write output to disk
      with open(outfile, "wb") as outputStream:
        output.write(outputStream)
    print("Written: %s" % outfile)
Пример #16
0
def page_count(obj):
    """Print page counts for one PDF or a histogram of page counts for many.

    :param obj: a ``str`` path - its page count is printed; or a ``list`` of
        paths - a table of "page count / number of files with that count" is
        printed.  Any other type is ignored.
    """
    def _num_pages(filename):
        # open, count and close again so no handle is leaked per file
        with open(filename, 'rb') as stream:
            return PdfFileReader(stream).getNumPages()

    if isinstance(obj, str):
        print(_num_pages(obj))
    elif isinstance(obj, list):
        page_map = {}
        for filename in obj:
            pages = _num_pages(filename)
            page_map[pages] = page_map.get(pages, 0) + 1
        print("page\tcount:")
        for pages in page_map:
            print(str(pages) + '\t' + str(page_map[pages]))
Пример #17
0
def main():
    """Merge the PDFs in ./PDFs into two files under ./Merged and index them.

    Files whose name starts with "Single Round Match" go to
    ``srmmerged.pdf``; all others go to ``othermerged.pdf``.  For each group a
    text file records, per source file, the 0-based page offset at which it
    starts in the merged PDF, in the form ``<offset>,, <file name>``.
    """
    files = os.listdir("./PDFs")
    sort_intuitive(files)

    # The last entry after sorting is discarded, as in the original code; the
    # reason is not visible here - TODO confirm it is still wanted.
    files = files[:-1]

    srms = [f for f in files if f.startswith("Single Round Match")]
    others = [f for f in files if not f.startswith("Single Round Match")]

    def merge_group(names, index_file, merged_path):
        """Merge *names* into *merged_path*, writing one offset line per file."""
        curr = 0
        merger = PdfFileMerger()
        streams = []  # kept open until the merged file has been written
        try:
            for f in names:
                stream = open("./PDFs/" + f, "rb")
                streams.append(stream)
                inp = PdfFileReader(stream)
                numpages = inp.getNumPages()
                index_file.write(str(curr) + ",, " + f + "\n")
                curr += numpages
                merger.append(inp)
            merger.write(merged_path)
        finally:
            for stream in streams:
                stream.close()

    with open("./Merged/srmpagenumbers.txt", "w") as srmpagenumbers, \
            open("./Merged/otherpagenumbers.txt", "w") as otherpagenumbers:
        merge_group(srms, srmpagenumbers, "./Merged/srmmerged.pdf")
        merge_group(others, otherpagenumbers, "./Merged/othermerged.pdf")
Пример #18
0
def add_files(category, filenames_, input_abs_dir):
    """
    Append the pdf files of *category* (str) to the proceedings.
        *filenames_* lists file names relative to *input_abs_dir* (str).
        After each file, one blank page is appended whenever the running
        page total is odd, which keeps the total even.
        Updates the module-level ``cumulative_page_count``.
    """
    global proceedings_pdf, cumulative_page_count, blank_page_pdf

    mprint('(For {})'.format(category))

    for name in filenames_:
        pdf_path = os.path.join(input_abs_dir, name)
        mprint('\t' + os.path.relpath(pdf_path, working_dir))

        # the handle is left open; it is not closed anywhere in this function
        reader = PdfFileReader(open(pdf_path, 'rb'))
        page_total = reader.getNumPages()
        proceedings_pdf.appendPagesFromReader(reader)
        cumulative_page_count += page_total

        # pad with a blank page when the total so far is odd
        if cumulative_page_count % 2 != 0:
            cumulative_page_count += 1
            proceedings_pdf.appendPagesFromReader(blank_page_pdf)
Пример #19
0
def pdf_to_csv_with_PyPDF():
    """
    Iterates through all the pdf files stored in the ``_PDF_PATH`` folder and
    exports their content to ``data.csv`` under ``_DATA_PATH``.
    The csv file has two columns: document_id (the file name) and
    document_text (the text of all pages, each followed by a space).
    Files that cannot be read are reported on stdout and skipped.
    """
    bar = progressbar.ProgressBar()
    csv_data_file = _DATA_PATH + "data.csv"
    with open(csv_data_file, "w", newline='') as csvfile:
        data_writer = csv.writer(csvfile)
        data_writer.writerow(["document_id","document_text"])
        for fn in bar(os.listdir(_PDF_PATH)):
            file_path = os.path.join(_PDF_PATH, fn)
            if not file_path.endswith(".pdf"):
                continue
            try:
                # close each PDF as soon as its text has been extracted
                with open(file_path, 'rb') as pdf_stream:
                    input_file = PdfFileReader(pdf_stream)
                    text = "".join(
                        input_file.getPage(p).extractText() + " "
                        for p in range(input_file.getNumPages()))
            except utils.PdfReadError:
                print("Error al leer el PDF: {0}".format(fn))
            except Exception as e:
                print("Error desconocido en el PDF: {0}".format(fn))
                print("Error: {0}".format(e))
            else:
                #TODO: Check if text is not empty
                data_writer.writerow([fn,text])
Пример #20
0
 def pdf_get_no_pages(self, input_file):
     """Return number of pages in a pdf using PyPDF2.

     :param input_file: path of the PDF file
     :return: the page count, or None when the file cannot be opened or parsed
     """
     try:
         with open(input_file, "rb") as stream:
             return PdfFileReader(stream).getNumPages()
     except Exception:
         return None
Пример #21
0
def tearpage(filename, startpage=1):
    """
    Copy filename to a tempfile, write pages startpage..N to filename.

    The file is rewritten in place.  When the copy cannot be parsed,
    ``_fixPdf`` is called once and parsing is retried.

    :param filename: PDF filepath
    :param startpage: 0-based index of the page that becomes the new first
        page (the default of 1 drops the first page)
    """
    # Work on a temporary copy; it is deleted when the ``with`` block exits,
    # i.e. only after the new file has been fully written.
    with tempfile.NamedTemporaryFile() as tmp:
        shutil.copy(filename, tmp.name)

        source = open(tmp.name, 'rb')
        try:
            # Read the copied pdf
            try:
                input_file = PdfFileReader(source)
            except PdfReadError:
                source.close()
                _fixPdf(filename, tmp.name)
                source = open(tmp.name, 'rb')
                input_file = PdfFileReader(source)
            num_pages = input_file.getNumPages()

            # Keep every page from startpage onwards
            output_file = PdfFileWriter()
            for i in range(startpage, num_pages):
                output_file.addPage(input_file.getPage(i))

            with open(filename, "wb") as outputStream:
                output_file.write(outputStream)
        finally:
            source.close()
Пример #22
0
def _merge_pdf(documents):
    '''Merge PDF files into one.

    The merged document is written to a new temporary file that the caller
    is responsible for removing.

    :param documents: list of path of pdf files
    :returns: path of the merged pdf
    '''
    writer = PdfFileWriter()
    streams = []  # We have to close the streams *after* PdfFilWriter's call to write()
    try:
        for document in documents:
            pdfreport = open(document, 'rb')
            streams.append(pdfreport)
            reader = PdfFileReader(pdfreport, overwriteWarnings=False)
            for page in range(0, reader.getNumPages()):
                writer.addPage(reader.getPage(page))

        # The result is a PDF, so name it accordingly and open the descriptor
        # in binary mode: text mode breaks (or corrupts) binary output.
        merged_file_fd, merged_file_path = tempfile.mkstemp(suffix='.pdf', prefix='report.merged.tmp.')
        with closing(os.fdopen(merged_file_fd, 'wb')) as merged_file:
            writer.write(merged_file)
    finally:
        for stream in streams:
            try:
                stream.close()
            except Exception:
                # best effort: a failing close must not mask the real error
                pass

    return merged_file_path
Пример #23
0
def mergePDFList(self, pdf_data_list, start_on_recto=False):
  """Merge multiple PDFs in a new PDF.

  Both input and output are raw PDF data as byte strings, so pdf_data_list
  must be a list of byte strings, and the output is the merged pdf as a byte
  string.  Empty entries are skipped.
  If "start_on_recto" is set to true, a blank page is added after every PDF
  with an odd number of pages in order to have each PDF start on a recto
  page. This is useful if you have to print the merged pdf in recto/verso
  mode.
  """
  # PDF data is binary: io.BytesIO works on both Python 2 and Python 3,
  # whereas the StringIO module only exists on Python 2.
  from io import BytesIO
  from PyPDF2 import PdfFileWriter, PdfFileReader

  output = PdfFileWriter()

  for pdf_data in pdf_data_list:
    if not pdf_data:
      continue
    pdf_reader = PdfFileReader(BytesIO(pdf_data))
    page_count = pdf_reader.getNumPages()
    for page in range(page_count):
      output.addPage(pdf_reader.getPage(page))
    if start_on_recto and page_count % 2:
      output.addBlankPage()

  outputStream = BytesIO()
  output.write(outputStream)
  return outputStream.getvalue()
Пример #24
0
def add_update_pdf_metadata(filename, update_dictionary):
    """Add or replace document-information entries of the PDF at *filename*.

    The PDF is copied page by page into a new document whose information
    dictionary holds the existing entries overridden by *update_dictionary*;
    the new document then replaces the original file.

    :param filename: path of the PDF to modify in place
    :param update_dictionary: mapping of entry name (without the leading
        '/') to its new value; values must not be None
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member
    import os

    def add_prefix(value):
        return '/' + value

    full_update_dictionary = {add_prefix(k): v for k, v in update_dictionary.items()}

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()

        # Existing entries first, then the updates so that they win.  A PDF
        # without a document-information dictionary yields None here.
        merged = dict(pdf_input.documentInfo or {})
        merged.update(full_update_dictionary)

        for key, value in merged.items():
            assert value is not None
            info_dict.update({NameObject(key): createStringObject(value)})

        # mkstemp returns an already-open descriptor: write through it so it
        # gets closed instead of being leaked.
        temp_fd, temp_file_name = tempfile.mkstemp(prefix="email2pdf_add_update_pdf_metadata", suffix=".pdf")

        with os.fdopen(temp_fd, 'wb') as file_out:
            pdf_output.write(file_out)

    shutil.move(temp_file_name, filename)
Пример #25
0
def get_png_image_frompdf( input_pdf_file, newWidth = None, verify = True ):
    """Convert a one-page PDF to a PNG image through the CloudConvert HTTP API.

    :param input_pdf_file: path of an existing ``.pdf`` file with exactly one page
    :param newWidth: optional target width (an int above 10); the height is
        scaled to keep the page's aspect ratio
    :param verify: forwarded to ``requests.post`` as its ``verify`` argument
    :return: the image opened from the response body
    :raises ValueError: when the service does not answer with status 200
    """
    import io

    assert( os.path.basename( input_pdf_file ).endswith( '.pdf' ) )
    assert( os.path.isfile( input_pdf_file ) )
    # read the page size, then release the file again
    with open( input_pdf_file, 'rb' ) as pdf_handle:
        ipdf = PdfFileReader( pdf_handle )
        assert( ipdf.getNumPages() == 1 )
        mbox = ipdf.getPage( 0 ).mediaBox
        width = int( mbox.getWidth( ) )
        height = int( mbox.getHeight( ) )
    apiKey = get_cloudconvert_api_key( )
    params = { 'apikey' : apiKey,
               'input' : 'upload',
               'inputformat' : 'pdf',
               'outputformat' : 'png',
    }
    if newWidth is not None:
        assert( isinstance( newWidth, int ) )
        assert( newWidth > 10 )
        newHeight = int( height * 1.0 * newWidth / width )
        params['converteroptions[resize]'] = '%dx%d' % ( newWidth, newHeight )
    #
    ## upload the file; the handle is closed once the request has completed
    with open( input_pdf_file, 'rb' ) as upload_handle:
        response = requests.post( "https://api.cloudconvert.com/convert", params = params,
                                  files = { 'file' : upload_handle }, verify = verify )
    if response.status_code != 200:
        raise ValueError("Error, could not upload and convert PDF file %s." % input_pdf_file )
    # the response body is binary image data, so wrap it in a bytes buffer
    img = Image.open( io.BytesIO( response.content ) )
    return img
Пример #26
0
def toStringFormatParalell(path, rank, size, comm):
    """Split per-page text extraction of a PDF across message-passing ranks.

    Rank 0 sends every rank 1..size-1 a dict with 'inicio', 'fin' and 'path'
    (tag 1), then concatenates the lists those ranks send back (tag 2) and
    returns the result.  Every other rank runs ``pdftotext`` once per page of
    its range, lower-cases the text it reads back, replaces accented vowels
    and 'ñ', removes every character outside a-z, and sends its list of page
    strings to rank 0.

    :param path: path of the PDF file
    :param rank: rank of the calling process (compared with 0)
    :param size: number of processes; used to split the page count
    :param comm: object providing ``send``/``recv`` with ``dest``/``source``
        and ``tag`` (presumably an mpi4py communicator - TODO confirm)
    :return: on rank 0 the combined list of page strings; the other ranks
        reach the end of the function and return None
    """

    pdf = PdfFileReader(open(path, "rb"))    
    numero_paginas = pdf.getNumPages()
    print("******************************************",numero_paginas)

    # pages per rank and leftover pages, computed over *all* ranks although
    # only ranks 1..size-1 receive work below
    intervalo = int(numero_paginas/size)
    resto = numero_paginas%size
    fin, inicio = 0, 0

    if(rank==0):

        for i in range(1, size):

            # NOTE(review): this loop only runs when rank == 0 and i starts
            # at 1, so ``i == rank`` is never true here and ``resto`` is
            # never added to any range.
            if(i == rank):
                fin += intervalo
                inicio = (fin - intervalo) + 1
                fin += resto
                data = {'inicio':inicio, 'fin': fin, 'path': path}
                comm.send(data, dest=i, tag=1)

            else:

                fin += intervalo
                inicio = (fin - intervalo) + 1
                data = {'inicio':inicio, 'fin': fin, 'path': path}
                comm.send(data, dest=i, tag=1)

    if(rank!=0):

        data = comm.recv(source=0, tag=1)
        contenido_pagina = ""
        lista = list()
        # the worker loop uses range(inicio, fin), so 'fin' itself is excluded
        for i in range(data['inicio'], data['fin']):

            # NOTE(review): if rank is an int, ``rank + ".txt"`` raises
            # TypeError - confirm the type callers pass.
            txt = data['path'].replace(".pdf", rank + ".txt")

            # NOTE(review): no output file name is given to pdftotext, yet
            # the text is read back from ``txt`` below - verify the names
            # match.  The path is also interpolated into a shell command.
            subprocess.call(
            "pdftotext -f " + str(i + 1) + " -l " + str(i + 1) +
            " " + data['path'], shell=True)

            contenido_pagina = open(txt).read().lower()
            contenido_pagina = contenido_pagina.replace('á', 'a')
            contenido_pagina = contenido_pagina.replace('é', 'e')
            contenido_pagina = contenido_pagina.replace('í', 'i')
            contenido_pagina = contenido_pagina.replace('ó', 'o')
            contenido_pagina = contenido_pagina.replace('ú', 'u')
            contenido_pagina = contenido_pagina.replace('ñ', 'n')
            contenido_pagina = re.sub('[^a-z]', '', contenido_pagina)
            lista.append(contenido_pagina)
            #subprocess.call("rm -R " + txt, shell=True)

        comm.send(lista, dest=0, tag=2)

    if(rank == 0):
        # collect the per-rank lists in rank order
        book = []
        for i in range(1,size):
            book += comm.recv(source=i, tag=2)

        return book
Пример #27
0
 def test_cat(self):
     """Make sure files are properly concatenated."""
     check_call([STAPLER, 'cat', ONEPAGE_PDF, FIVEPAGE_PDF,
                 self.outputfile])
     self.assertTrue(os.path.isfile(self.outputfile))
     # one page + five pages must give six pages
     with open(self.outputfile, 'rb') as outfile:
         pdf = PdfFileReader(outfile)
         self.assertEqual(pdf.getNumPages(), 6)
Пример #28
0
def split(paperpdf, splitpdf):
    """Split every page of a PDF vertically into a left and a right half.

    For each input page two pages are written, left half first: the same page
    with its media box cut at the horizontal midpoint.

    :param paperpdf: path of the PDF to read
    :param splitpdf: path of the PDF to write (twice as many pages)
    """
    output = PdfFileWriter()

    # The file is opened twice on purpose: each half needs its own page
    # object (presumably because a single reader hands out the same page
    # object, so both halves would share one media box - TODO confirm).
    with open(paperpdf, "rb") as l, open(paperpdf, "rb") as r:
        left = PdfFileReader(l)
        right = PdfFileReader(r)

        pagecount = left.getNumPages()
        print("%s has %s pages to split." % (paperpdf,pagecount))

        for num in range(pagecount):
            left_page = left.getPage(num)
            right_page = right.getPage(num)
            box = left_page.mediaBox
            # The midpoint lies halfway between the left and right edges; the
            # left edge is not necessarily at x = 0.
            midpoint = (
                    (box.getLowerLeft_x() + box.getUpperRight_x()) / 2,
                    box.getUpperRight_y()
                    )

            left_page.mediaBox.upperRight = midpoint
            output.addPage(left_page)

            right_page.mediaBox.upperLeft = midpoint
            output.addPage(right_page)

        print("Writing %s pages to %s" % (output.getNumPages(), splitpdf))
        with open(splitpdf, "wb") as s:
            output.write(s)
Пример #29
0
def extract_text(link):
	"""Return the text of every page of the PDF referenced by ``link``.

	The storage key is ``"pdfs/"`` plus ``link`` with its first 25
	characters dropped (presumably a fixed URL prefix -- confirm against
	the callers).  If the file is not in storage yet, ``add_file(link)``
	is called first.

	Returns an empty string when the file cannot be added or cannot be
	parsed as a PDF; otherwise the page texts concatenated in page order.
	"""
	amazon_file_name = "pdfs/" + link[25:]
	if not default_storage.exists(amazon_file_name):
		try:
			add_file(link)
		except Exception:
			return ''

	pdf = default_storage.open(amazon_file_name, 'rb')
	try:
		try:
			pdf_file = PdfFileReader(pdf)
		except Exception:
			# Single-argument print(...) behaves the same on Python 2 and 3.
			print("BAD FILE-- %s " % (link))
			# Previously execution continued and crashed with a NameError
			# because pdf_file was never bound.
			return ''

		page_texts = [
			pdf_file.getPage(number).extractText()
			for number in range(pdf_file.getNumPages())
		]
		# join avoids the quadratic cost of repeated string concatenation.
		return ''.join(page_texts)
	finally:
		# Always release the storage handle, also on errors.
		pdf.close()
Пример #30
0
def toStringFormat(path):
    """Return the normalised text of each page of the PDF at ``path``.

    Every page is converted on its own with the external ``pdftotext``
    tool, lower-cased, stripped of Spanish accents (and n-tilde) and then
    reduced to the characters ``a``-``z`` only.

    Args:
        path: location of the PDF file.

    Returns:
        A list with one cleaned string per page, in page order.

    Raises:
        IOError/OSError: if the PDF or the text file produced by
            ``pdftotext`` cannot be opened.
    """
    # Local import so the block does not rely on a module-level "import os".
    import os

    # Intermediate text file next to the PDF.  splitext only touches the
    # real extension; the previous path.replace(".pdf", ".txt") rewrote
    # every ".pdf" in the path and, for names not containing ".pdf",
    # returned the PDF path itself, which was then read and deleted.
    txt = os.path.splitext(path)[0] + ".txt"

    accent_map = (
        ('á', 'a'),
        ('é', 'e'),
        ('í', 'i'),
        ('ó', 'o'),
        ('ú', 'u'),
        ('ñ', 'n'),
    )

    lista = list()

    # PyPDF2 is used only to count the pages; close the handle right after.
    with open(path, "rb") as pdf_file:
        numero_paginas = PdfFileReader(pdf_file).getNumPages()

    for i in range(numero_paginas):
        pagina = str(i + 1)  # pdftotext page numbers are 1-based
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters are passed through untouched.  The output
        # file is named explicitly so it always matches ``txt``.
        subprocess.call(
            ["pdftotext", "-f", pagina, "-l", pagina, path, txt])

        # NOTE(review): the text is decoded as ISO-8859-1 as before; confirm
        # this matches the encoding pdftotext emits in this environment.
        with codecs.open(txt, encoding='ISO-8859-1') as txt_file:
            contenido_pagina = txt_file.read().lower()

        for accented, plain in accent_map:
            contenido_pagina = contenido_pagina.replace(accented, plain)
        contenido_pagina = re.sub('[^a-z]', '', contenido_pagina)
        lista.append(contenido_pagina)

        # Remove the intermediate file without spawning a shell.
        os.remove(txt)

    return lista
def extract_data_from_pdf(cwd=os.getcwd()):
    """Collect monthly cash-flow figures from FRANK e-statements in ``cwd``.

    Every file in ``cwd`` whose name starts with ``FRANK`` and ends with
    ``.pdf`` is decrypted with the password typed by the user, and the
    inflow/outflow figures are read from one of its last pages.

    Args:
        cwd: directory to scan.  The default is evaluated once, when the
            function is defined.

    Returns:
        dict mapping year (int) to a list of
        ``(month, inflow, outflow, netflow)`` tuples.

    The script exits (via ``exit()``) after printing a message when the
    password is wrong or a statement cannot be processed.  The decrypted
    temporary copy is removed on every exit path.
    """
    # when typing the password, it will not be shown in the terminal
    password = getpass('Enter password: ')
    # NOTE(review): the original line was damaged (the end of the getpass
    # call and the start of the next statement were masked out).  The
    # temporary file path is reconstructed as <cwd>/temp.pdf -- confirm.
    temp_dir = os.path.join(cwd, 'temp.pdf')

    # data grouped by year: key = year, value = [(month, inflow, outflow, netflow), ...]
    data_dict = dict()

    try:
        for file_name in os.listdir(cwd):
            # only FRANK OCBC e-Statements in the folder are of interest
            if not (file_name.startswith("FRANK") and file_name.endswith(".pdf")):
                continue
            file_dir = os.path.join(cwd, file_name)

            try:
                # since PyPDF2 cannot open the encrypted file, pikepdf opens
                # it and saves a decrypted copy to a temporary pdf file that
                # is then used for extraction
                temp_file = Pdf.open(file_dir, password=password)
                temp_file.save(temp_dir)
                pdf_obj = PdfFileReader(temp_dir)
            except PasswordError:
                print(
                    'Wrong password! Please rerun the script again to create the summary.'
                )
                exit()
            except Exception:
                print(
                    'Whoops. Something went wrong, please rerun the script again. In case the error still happens, \
                please report this issue in https://github.com/nicklimmm/banking-statement-summarizer/issues!'
                )
                exit()

            try:
                date_created = pdf_obj.getDocumentInfo()['/CreationDate']
                year = int(date_created[2:6])
                # the statement is created 1 month later, so we decrement by 1 to get the actual data
                month = int(date_created[6:8]) - 1

                # a statement received in January describes December of the previous year
                if month == 0:
                    year -= 1
                    month = 12

                # the summary sits on one of the back pages: the first page of a
                # 3-page file, otherwise the second-to-last page
                num_pages = pdf_obj.getNumPages()
                if num_pages == 3:
                    page_obj = pdf_obj.getPage(num_pages - 3)
                else:
                    page_obj = pdf_obj.getPage(num_pages - 2)

                # four whitespace-separated amounts; the last match on the page is used
                text = page_obj.extractText().encode('ascii').decode('ascii')
                pattern = r'[0-9,\.]+\s+[0-9,\.]+\s+[0-9,\.]+\s+[0-9,\.]+[0-9]'
                result = re.findall(pattern, text)
                inflow, outflow, _, _ = list(
                    map(float, result[-1].replace(',', '').split()))
                netflow = round(inflow - outflow, 2)

                if year not in data_dict:
                    data_dict[year] = []
                data_dict[year].append((month, inflow, outflow, netflow))
            except Exception:
                print(
                    'Something went wrong... Please rerun the script or report the issue in GitHub.'
                )
                exit()
    finally:
        # to prevent unauthorized access to the decrypted pdf: remove it on
        # every path (normal return, exit() and unexpected errors).  The
        # existence check covers the case where no statement was decrypted.
        if os.path.exists(temp_dir):
            os.remove(temp_dir)

    return data_dict
Пример #32
0
from PyPDF2 import PdfFileWriter, PdfFileReader

if __name__ == '__main__':
    # Open the source document and count its pages
    reader = PdfFileReader('in.pdf')
    page_count = reader.getNumPages()

    # Every second entry of the /PageLabels /Nums array, starting with the
    # first, is taken as the page index at which a label range starts
    label_entries = reader.trailer["/Root"]["/PageLabels"]["/Nums"]
    range_starts = label_entries[::2]

    # The page right before each subsequent range start is kept (the last
    # overlay of the preceding frame), plus the very last page of the file
    keep_indices = [start - 1 for start in range_starts[1:]]
    keep_indices.append(page_count - 1)

    # Copy the selected pages, adjusting the crop box of each one
    writer = PdfFileWriter()
    for index in keep_indices:
        page = reader.getPage(index)
        page.cropBox.lowerLeft = (0, 10)
        writer.addPage(page)

    # Save the result
    with open('out.pdf', 'wb') as out_file:
        writer.write(out_file)
Пример #33
0
import progressbar
from PyPDF2 import PdfFileReader, PdfFileWriter

# Command line: <segments.json> <original.pdf> <output.pdf>
# The JSON description is read inside a context manager so its file handle
# is closed as soon as parsing is done (it used to be leaked).
# NOTE: the name ``input`` shadows the builtin; it is kept because code
# further down in this script may still refer to it.
with open(sys.argv[1]) as segments_file:
    input = json.load(segments_file)
origfile = sys.argv[2]
outfile = sys.argv[3]
pages_sub_segments = input['segments']
#infiles = input['files']
shape = input['shape']

j = 0
parts = []

# Single-argument print(...) gives the same output on Python 2 and 3.
print('reading file ...')
# The PDF handle is left open on purpose: the page objects collected below
# are used later in the script (presumably PyPDF2 still needs the stream
# when they are written -- confirm).
pdf = PdfFileReader(open(origfile, 'rb'))
pages = [pdf.getPage(i) for i in range(0, pdf.getNumPages())]

# Width of every segment flagged as a column, across all pages.
colwidths = []
for segments in pages_sub_segments:
    colwidths += [
        seg_info['segment']['right'] - seg_info['segment']['left']
        for seg_info in segments if seg_info['iscolumn']
    ]
# Use the widest column; fall back to half of shape[0] when no column exists.
if not colwidths:
    colwidth = shape[0] / 2
else:
    colwidth = max(colwidths)


def crop(page, section):
    #print 'page dimensions', page.mediaBox, page.mediaBox.upperLeft, page.mediaBox.upperRight
Пример #34
0
 def pages(self):
     '''int: Page count of the PDF file stored at this object's file path'''
     # The file is opened only for the duration of the count.
     with open(self.__filepath, 'rb') as stream:
         page_total = PdfFileReader(stream).getNumPages()
     return page_total
Пример #35
0
def techs_selected(request, model=None, id=None):
    """Show the technologies chosen in this session and build their PDF.

    On a valid POST the summary PDF is generated: a temporary PDF is
    created by ``create_PDF_selected_techs`` and its pages are copied into
    the final, timestamped file under ``settings.STATIC_ROOT + 'pdf_tmp/'``.

    Args:
        request: the incoming HTTP request.
        model: unused in this function.
        id: unused in this function.

    Returns:
        dict with the template context.  After a successful PDF build it
        additionally holds ``all_chosen_techs`` and a non-empty
        ``pdf_file`` URL; otherwise ``pdf_file`` is ``''``.
    """
    groups = TechGroup.objects.all()
    choices = TechChoice.objects.filter(session=get_session(request)).order_by('order')

    all_techs = []
    relevance = []
    empty = []  # shared placeholder for technologies without relevancy info

    # For every chosen technology collect the objects explaining why it is
    # only "maybe" or "not" applicable; `relevance` stays parallel to
    # `all_techs`.
    for tc in choices:
        tech = Technology.objects.get(pk=tc.technology.id)
        all_techs.append(tech)
        applicable = tech.applicable(get_session(request))
        relevance_added = False
        if applicable == tech.TECH_USE_MAYBE:
            relevancy_objects = list(tech.maybe_relevant(get_session(request)))
            if relevancy_objects:
                relevance.append(relevancy_objects)
                relevance_added = True
        if applicable == tech.TECH_USE_NO:
            relevancy_objects = list(tech.not_relevant(get_session(request)))
            if relevancy_objects:
                relevance.append(relevancy_objects)
                relevance_added = True
        if not relevance_added:
            relevance.append(empty)

    all_chosen_techs = list(zip(all_techs, relevance))

    if request.method == 'POST':  # the form has been submitted
        form = PDF_prefs(request.POST)  # a form bound to the POST data
        if form.is_valid():  # all validation rules pass
            # NOTE: the per-article "include Akvopedia" options of the form
            # are currently not evaluated; only the summary pages are built.

            # create list of factors and criteria
            answers = get_or_create_answers(get_session(request))

            criterion_list = []
            applicable_list = []
            change_list = []
            factor_list = []
            old_factor = ''

            # change_list marks the answers at which a new factor starts, so
            # the PDF knows when to print a factor heading.
            for answer in answers:
                criterion_list.append(answer.criterion)
                applicable_list.append(answer.applicable)
                new_factor = answer.criterion.factor
                factor_list.append(new_factor)
                change_list.append(new_factor != old_factor)
                old_factor = new_factor

            zipped_answerlist = list(zip(factor_list, change_list, criterion_list, applicable_list))

            # file names carry a timestamp of the moment of generation
            today = datetime.datetime.today()
            s_name_temp = today.strftime("watercompass-%a-%b-%d-%Y_%H-%M-%S.temp.pdf")
            s_name_final = today.strftime("watercompass-%a-%b-%d-%Y_%H-%M-%S.pdf")

            # first create the basic PDF (written under the temporary name)
            create_PDF_selected_techs(all_chosen_techs, zipped_answerlist, True, True, s_name_temp)

            output_dir = settings.STATIC_ROOT + 'pdf_tmp/'
            output = PdfFileWriter()

            # Copy every page of the temporary PDF into the final file.  Both
            # handles are managed by `with` so they are closed even when
            # reading or writing fails; the source stays open until the
            # output has been written.
            with open(output_dir + s_name_temp, "rb") as temp_stream:
                reader = PdfFileReader(temp_stream)
                for i in range(reader.getNumPages()):
                    output.addPage(reader.getPage(i))

                with open(output_dir + s_name_final, "wb") as outputStream:
                    output.write(outputStream)

            return {
                'techgroups': groups,
                'all_chosen_techs': all_chosen_techs,
                'session': request.session,
                'form': form,
                'pdf_file': '/technologies/pdf/' + s_name_final,
                'chosen_techs': choices,
            }
    else:
        form = PDF_prefs()  # an unbound form

    # GET request or invalid POST: no PDF available
    return {
        'techgroups': groups,
        'session': request.session,
        'form': form,
        'pdf_file': '',
        'chosen_techs': choices,
    }
Пример #36
0
    os.mkdir(outputDir)
# else:
#     if os.listdir(outputDir):
#         shutil.rmtree(outputDir)
#         os.mkdir(outputDir)

# Ask for the first and last page of the table of contents

prompt_start, prompt_end = '目录起始页:', '目录终止页:'
prelog_startIdx, prelog_endIdx = int(input(prompt_start)), int(
    input(prompt_end))

# Take those table-of-contents pages and write them to a new file

rd = PdfFileReader(open(pdf_path, 'rb'))  # PdfFileReader -> rd
page_cnt = rd.getNumPages()
wt = PdfFileWriter()
for prelogue_pageIdx in range(prelog_startIdx,
                              prelog_endIdx + 1):  # indices match the page numbers shown in the PDF
    page_obj = rd.getPage(prelogue_pageIdx - 1)  # getPage counts from zero, hence real page number - 1
    wt.addPage(page_obj)
# os.path.join instead of a hard-coded '\\' so the path is also valid
# outside Windows
prelog_path = os.path.join(outputDir, '目录集合.pdf')
# The handle must be closed for the output to be flushed to disk; the
# context manager guarantees that even if writing fails.
with open(prelog_path, 'wb') as output_fd:
    wt.write(output_fd)

print('生成所有目录!')

print('目录地址:{}; 目标地址:{}'.format(prelog_path, outputDir))

# save pics
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import copy

# Source document and the writer collecting the split pages
source_path = r"practice_files\half and half.pdf"
source_pdf = PdfFileReader(source_path)
split_pdf = PdfFileWriter()

page_total = source_pdf.getNumPages()
for index in range(page_total):
    # Two page objects for the same source page: the original and a copy
    left_half = source_pdf.getPage(index)
    right_half = copy.copy(left_half)

    # Point on the top edge at half of the page's right-hand x coordinate
    corner = left_half.mediaBox.upperRight
    middle_top = (corner[0] / 2, corner[1])

    # Left half: move the upper-right corner to the middle
    left_half.mediaBox.upperRight = middle_top
    split_pdf.addPage(left_half)

    # Right half: move the upper-left corner to the middle
    right_half.mediaBox.upperLeft = middle_top
    split_pdf.addPage(right_half)

# Write all halves to the result file
target_path = "The Little Mermaid.pdf"
with open(target_path, "wb") as target_file:
    split_pdf.write(target_file)
Пример #38
0
def book_pdf(from_filename, output_filename):
    """Rotate, halve and reorder the pages of a PDF, then write the result.

    Steps, in order:
      1. Every page of ``from_filename`` is rotated by 90 degrees
         (counter-clockwise for even indices, clockwise for odd ones) and
         the rotated document is written to an in-memory buffer.
      2. The buffer is read back twice; for each page the two copies are
         cropped to complementary halves (with a small overlap) and each
         half is assigned a page number taken from ``reorder``.
      3. The halves are written to ``output_filename`` in ascending order
         of their assigned page numbers.

    Args:
        from_filename: source PDF; converted with ``str()`` before opening.
        output_filename: destination path, opened through ``Path``.

    Raises:
        AssertionError: from ``reorder`` when the computed page order has
            duplicates or the wrong length.
        Exception: when the page order runs out of entries while cropping.
    """
    from PyPDF2 import PdfFileReader, PdfFileWriter
    from io import BytesIO
    pdf_reader = PdfFileReader(str(from_filename))
    pdf_writer = PdfFileWriter()
    buffer = BytesIO()

    #######################################
    # Rotation
    #######################################
    n = pdf_reader.getNumPages()
    print(f'File: {from_filename} pre rotate constains {n} pages')
    # NOTE: the loop variable reuses the name ``n``.  After the loop ``n``
    # holds the LAST PAGE INDEX (page count - 1), not the page count; the
    # call to reorder(n) and the prints further down rely on that.
    for n in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(n)
        # Alternate the rotation direction between even and odd pages.
        if n % 2 == 0:
            page.rotateCounterClockwise(90)
        else:
            page.rotateClockwise(90)
        pdf_writer.addPage(page)

    # The rotated document is kept in memory only.
    pdf_writer.write(buffer)

    #######################################
    # Cropping
    #######################################
    import numpy as np
    import math
    from collections import Counter, OrderedDict

    def reorder(n):
        '''Return the output page numbers in the order they get assigned.

        ``n`` is the last index of the scanned pages, so ``(n + 1) * 2``
        page numbers are produced.  They come in groups of four,
        ``[a + 1, n_all - a, n_all - a + 1, a]`` for odd ``a``, and the
        groups are emitted in reverse order.

        Example for n = 7 (16 output pages):
            8, 9, 10, 7
            6, 11, 12, 5
            4, 13, 14, 3
            2, 15, 16, 1

        Asserts that no number occurs twice and that exactly
        ``(n + 1) * 2`` numbers were generated.
        '''
        n_all = (n + 1) * 2
        # total_range = int(math.ceil(n_all//4.0)*2)
        total_range = int(math.ceil(n_all / 2.0))
        out = []
        for a in range(1, total_range, 2):
            tmp = []
            tmp.append(a + 1)
            tmp.append(n_all - a)
            tmp.append(n_all - a + 1)
            tmp.append(a)
            out.append(tmp)
        # Reverse the group order, then flatten into one sequence.
        arr = np.concatenate(np.array(out)[::-1])
        duplicates = [
            item for item, count in Counter(arr).items() if count > 1
        ]
        assert len(
            duplicates
        ) == 0, f"Page ordering - Duplicate error: {duplicates}\n{arr}"
        assert float(n + 1) == len(
            arr
        ) / 2.0, f"Page ordering - length error: golden={float(n+1)}, revised={len(arr)/2.0}"
        return list(arr)

    # Crop
    # Two readers over the same buffer, so every page exists as two separate
    # page objects that can be cropped to different halves.
    pdf_reader0 = PdfFileReader(buffer)
    pdf_reader1 = PdfFileReader(buffer)
    pdf_writer = PdfFileWriter()
    # ``n`` is the last page index here (see the note in the rotation step).
    pages = reorder(n)
    print(
        f'File: {from_filename} after booking constains {n+1}*2={n*2+2} pages')
    print(f'Page ordering: {pages}')
    # Fraction of the page dimension kept as overlap around the middle cut.
    threshold = 0.02
    # Maps assigned output page number -> cropped page object.
    out = {}
    for n in range(pdf_reader.getNumPages()):
        page1 = pdf_reader0.getPage(n)
        page2 = pdf_reader1.getPage(n)
        # get dimensions
        # The first two: x,y coordinates of the lower-left  corner
        # The secnd two: x,y coordinates of the Upper-right corner
        (ll_height, ll_width, ur_height, ur_width) = page1.mediaBox

        # print(page1["/Rotate"], page2["/Rotate"])
        # Right (first) page
        # When rotation comes into play, x,y dimensions are still from original image
        if page1["/Rotate"] == 90:
            page1.mediaBox.lowerLeft = (ll_height,
                                        int(ur_width / 2.0 -
                                            threshold * ur_width))
        else:  # == -90
            page1.mediaBox.upperRight = (ur_height,
                                         int(ur_width / 2.0 +
                                             threshold * ur_width))

        # Second copy: gets the opposite half of what page1 received.
        if page2["/Rotate"] == 90:
            page2.mediaBox.upperRight = (ur_height,
                                         int(ur_width / 2.0 +
                                             threshold * ur_width))
        else:
            page2.mediaBox.lowerLeft = (ll_height,
                                        int(ur_width / 2.0 -
                                            threshold * ur_width))

        # Force page ordering as book enforces
        # Each scanned page consumes the next two numbers of the ordering.
        # NOTE(review): the bare ``except`` hides the original error (for
        # example an IndexError from popping an exhausted list).
        try:
            n1 = pages.pop(0)
            n2 = pages.pop(0)
            # print(f'id: {n} right:{n1}, left:{n2}')
            out[n1] = page1
            out[n2] = page2
        except:
            raise Exception(
                f'Page ordering - During crop reordered pages does not match')
    print("Done cropping")

    # Reorder pages
    # Emit the halves sorted by their assigned page number.
    od = OrderedDict(sorted(out.items()))
    for k, v in od.items():
        # print(k)
        pdf_writer.addPage(v)
    print("Done reordering")

    # Writing output
    # NOTE(review): ``Path`` is not imported inside this function; it is
    # expected to be available at module level (presumably pathlib.Path).
    with Path(output_filename).open(mode="wb") as output_file:
        pdf_writer.write(output_file)
# 17.2 - Challenge: Use GUI Elements to Help a User Modify Files
# Solution to challenge

# save part of a PDF based on a user-supplied page range using a GUI

import easygui as gui
from PyPDF2 import PdfFileReader, PdfFileWriter

# let the user choose an input file
input_file_path = gui.fileopenbox("", "Select a PDF to trim...", "*.pdf")
if input_file_path is None:  # exit on "Cancel"
    exit()

# get the page length of the input file
input_file = PdfFileReader(input_file_path)
total_pages = input_file.getNumPages()

# let the user choose a beginning page
page_start = gui.enterbox("Enter the number of the first page to use:",
                          "Where to begin?")
if page_start is None:  # exit on "Cancel"
    exit()
# check for possible problems and try again:
#    1) input page number isn't a (non-negative) digit
# or 2) input page number is 0
# or 3) page number is greater than total number of pages
while (not page_start.isdigit() or page_start == "0"
       or int(page_start) > total_pages):
    gui.msgbox("Please provide a valid page number.", "Whoops!")
    page_start = gui.enterbox("Enter the number of the first page to use:",
                              "Where to begin?")
    # "Cancel" in the repeated prompt also returns None; without this check
    # the loop condition would fail on None.isdigit()
    if page_start is None:  # exit on "Cancel"
        exit()
Пример #40
0
    def update_file_info(self, file):
        # set defaults to blank
        file.add_string_attribute('title', '')
        file.add_string_attribute('description', '')
        file.add_string_attribute('album', '')
        file.add_string_attribute('creator', '')
        file.add_string_attribute('tracknumber', '')
        file.add_string_attribute('genre', '')
        file.add_string_attribute('date', '')
        file.add_string_attribute('bitrate', '')
        file.add_string_attribute('samplerate', '')
        file.add_string_attribute('length', '')
        file.add_string_attribute('datetime_original', '')
        file.add_string_attribute('exposure_time', '')
        file.add_string_attribute('fnumber', '')
        file.add_string_attribute('focal_length', '')
        file.add_string_attribute('gps_altitude', '')
        file.add_string_attribute('gps_latitude', '')
        file.add_string_attribute('gps_longitude', '')
        file.add_string_attribute('iso_speed', '')
        file.add_string_attribute('get_orientation', '')
        file.add_string_attribute('model', '')
        file.add_string_attribute('resolution_unit', '')
        file.add_string_attribute('xresolution', '')
        file.add_string_attribute('yresolution', '')
        file.add_string_attribute('shutter_speed_value', '')
        file.add_string_attribute('aperture_value', '')
        file.add_string_attribute('brightness_value', '')
        file.add_string_attribute('exposure_bias_value', '')
        file.add_string_attribute('max_aperture_value', '')
        file.add_string_attribute('metering_mode', '')
        file.add_string_attribute('light_source', '')
        file.add_string_attribute('flash', '')
        file.add_string_attribute('exposure_mode', '')
        file.add_string_attribute('gain_control', '')
        file.add_string_attribute('width', '')
        file.add_string_attribute('height', '')
        file.add_string_attribute('pages', '')

        if file.get_uri_scheme() != 'file':
            return

        # strip file:// to get absolute path
        filename = urllib.parse.unquote_plus(file.get_uri()[7:])

        # mp3 handling
        if file.is_mime_type('audio/mpeg'):
            # attempt to read ID3 tag
            try:
                audio = EasyID3(filename)
                # sometimes the audio variable will not have one of these items
                # defined, that's why there is this long try / except attempt
                try:
                    if 'title' in audio.keys():
                        file.add_string_attribute('title', audio['title'][0])
                    else:
                        file.add_string_attribute('title', '')
                except Exception:
                    file.add_string_attribute('title', _('Error'))
                try:
                    file.add_string_attribute('album', audio['album'][0])
                except Exception:
                    file.add_string_attribute('album', _('Error'))
                try:
                    file.add_string_attribute('creator', audio['artist'][0])
                except Exception:
                    file.add_string_attribute('creator', _('Error'))
                try:
                    file.add_string_attribute('tracknumber',
                                              audio['tracknumber'][0])
                except Exception:
                    file.add_string_attribute('tracknumber', _('Error'))
                try:
                    file.add_string_attribute('genre', audio['genre'][0])
                except Exception:
                    file.add_string_attribute('genre', _('Error'))
                try:
                    file.add_string_attribute('date', audio['date'][0])
                except Exception:
                    file.add_string_attribute('date', _('Error'))
            except Exception:
                # [SabreWolfy] some files have no ID3 tag and will throw this
                # exception:
                file.add_string_attribute('title', '')
                file.add_string_attribute('description', '')
                file.add_string_attribute('album', '')
                file.add_string_attribute('creator', '')
                file.add_string_attribute('tracknumber', '')
                file.add_string_attribute('genre', '')
                file.add_string_attribute('date', '')
            # try to read MP3 information (bitrate, length, samplerate)
            try:
                mpfile = open(filename)
                mpinfo = MPEGInfo(mpfile)
                file.add_string_attribute('bitrate',
                                          str(mpinfo.bitrate / 1000) + ' Kbps')
                file.add_string_attribute('samplerate',
                                          str(mpinfo.sample_rate) + ' Hz')
                # [SabreWolfy] added consistent formatting of times in format
                # hh:mm:ss
                # [SabreWolfy[ to allow for correct column sorting by length
                mp3length = '%02i:%02i:%02i' % ((int(mpinfo.length / 3600)),
                                                (int(mpinfo.length / 60 % 60)),
                                                (int(mpinfo.length % 60)))
                mpfile.close()
                file.add_string_attribute('length', mp3length)
            except Exception:
                file.add_string_attribute('bitrate', _('Error'))
                file.add_string_attribute('length', _('Error'))
                file.add_string_attribute('samplerate', _('Error'))
                try:
                    mpfile.close()
                except Exception:
                    pass

        # image handling
        if file.get_mime_type().split('/')[0] in ('image'):
            # EXIF handling routines
            try:
                metadata = GExiv2.Metadata(filename)
                try:
                    file.add_string_attribute(
                        'datetime_original',
                        metadata.get_tag_string('Exif.Image.DateTime'))
                except Exception:
                    file.add_string_attribute('datetime_original', '')
                try:
                    file.add_string_attribute(
                        'creator', metadata.get_tag_string('Xmp.dc.creator'))
                except Exception:
                    file.add_string_attribute('creator', '')
                try:
                    file.add_string_attribute(
                        'description',
                        metadata.get_tag_string('Exif.Image.ImageDescription'))
                except Exception:
                    file.add_string_attribute('description', '')
                try:
                    x = str(metadata.get_tag_string('Xmp.dc.title'))
                    file.add_string_attribute('title', x[17:])
                except Exception:
                    file.add_string_attribute('title', '')
                try:
                    file.add_string_attribute('exposure_time',
                                              metadata.get_exposure_time())
                except Exception:
                    file.add_string_attribute('exposure_time', '')
                try:
                    file.add_string_attribute('fnumber',
                                              metadata.get_fnumber())
                except Exception:
                    file.add_string_attribute('fnumber', '')
                try:
                    file.add_string_attribute('focal_length',
                                              metadata.get_focal_length())
                except Exception:
                    file.add_string_attribute('focal_length', '')
                try:
                    file.add_string_attribute('gps_altitude',
                                              metadata.get_gps_altitude())
                except Exception:
                    file.add_string_attribute('gps_altitude', '')
                try:
                    file.add_string_attribute('gps_latitude',
                                              metadata.get_gps_latitude())
                except Exception:
                    file.add_string_attribute('gps_latitude', '')
                try:
                    file.add_string_attribute('gps_longitude',
                                              metadata.get_gps_longitude())
                except Exception:
                    file.add_string_attribute('gps_longitude', '')
                try:
                    file.add_string_attribute('iso_speed',
                                              metadata.get_iso_speed())
                except Exception:
                    file.add_string_attribute('iso_speed', '')
                file.add_string_attribute('orientation',
                                          get_orientation(metadata))
                try:
                    file.add_string_attribute(
                        'model', metadata.get_tag_string('Exif.Image.Model'))
                except Exception:
                    file.add_string_attribute('model', '')
                file.add_string_attribute('resolution_unit',
                                          get_resolution_unit(metadata))
                try:
                    file.add_string_attribute(
                        'xresolution',
                        metadata.get_tag_string('Exif.Image.XResolution'))
                except Exception:
                    file.add_string_attribute('xresolution', '')
                try:
                    file.add_string_attribute(
                        'yresolution',
                        metadata.get_tag_string('Exif.Image.YResolution'))
                except Exception:
                    file.add_string_attribute('yresolution', '')
                try:
                    file.add_string_attribute(
                        'shutter_speed_value',
                        metadata.get_tag_string(
                            'Exif.Photo.ShutterSpeedValue'))
                except Exception:
                    file.add_string_attribute('shutter_speed_value', '')
                try:
                    file.add_string_attribute(
                        'aperture_value',
                        metadata.get_tag_string('Exif.Photo.ApertureValue'))
                except Exception:
                    file.add_string_attribute('aperture_value', '')
                try:
                    file.add_string_attribute(
                        'brightness_value',
                        metadata.get_tag_string('Exif.Photo.BrightnessValue'))
                except Exception:
                    file.add_string_attribute('brightness_value', '')
                try:
                    file.add_string_attribute(
                        'brightness_value',
                        metadata.get_tag_string('Exif.Photo.BrightnessValue'))
                except Exception:
                    file.add_string_attribute('brightness_value', '')
                try:
                    file.add_string_attribute(
                        'exposure_bias_value',
                        metadata.get_tag_string(
                            'Exif.Photo.ExposureBiasValue'))
                except Exception:
                    file.add_string_attribute('exposure_bias_value', '')
                try:
                    file.add_string_attribute(
                        'max_aperture_value',
                        metadata.get_tag_string('Exif.Photo.MaxApertureValue'))
                except Exception:
                    file.add_string_attribute('max_aperture_value', '')
                file.add_string_attribute('metering_mode',
                                          get_metering_mode(metadata))
                file.add_string_attribute('light_source',
                                          get_light_source(metadata))
                file.add_string_attribute('flash', get_flash(metadata))
                file.add_string_attribute('exposure_mode',
                                          get_exposure_mode(metadata))
                file.add_string_attribute('gain_control',
                                          get_gain_control(metadata))
            except Exception:
                file.add_string_attribute('datetime_original', '')
                file.add_string_attribute('creator', '')
                file.add_string_attribute('title', '')
                file.add_string_attribute('description', '')
                file.add_string_attribute('exposure_time', '')
                file.add_string_attribute('fnumber', '')
                file.add_string_attribute('focal_length', '')
                file.add_string_attribute('gps_altitude', '')
                file.add_string_attribute('gps_latitude', '')
                file.add_string_attribute('gps_longitude', '')
                file.add_string_attribute('iso_speed', '')
                file.add_string_attribute('get_orientation', '')
                file.add_string_attribute('model', '')
                file.add_string_attribute('resolution_unit', '')
                file.add_string_attribute('xresolution', '')
                file.add_string_attribute('yresolution', '')
                file.add_string_attribute('shutter_speed_value', '')
                file.add_string_attribute('aperture_value', '')
                file.add_string_attribute('brightness_value', '')
                file.add_string_attribute('exposure_bias_value', '')
                file.add_string_attribute('max_aperture_value', '')
                file.add_string_attribute('metering_mode', '')
                file.add_string_attribute('light_source', '')
                file.add_string_attribute('flash', '')
                file.add_string_attribute('exposure_mode', '')
                file.add_string_attribute('gain_control', '')
            try:
                im = Image.open(filename)
                try:
                    file.add_string_attribute('width', str(im.size[0]))
                except Exception:
                    file.add_string_attribute('width', _('Error'))
                try:
                    file.add_string_attribute('height', str(im.size[1]))
                except Exception:
                    file.add_string_attribute('height', _('Error'))
            except Exception:
                file.add_string_attribute('width', '')
                file.add_string_attribute('height', '')

        # video/flac handling
        if file.is_mime_type('video/x-msvideo') or\
                file.is_mime_type('video/mpeg') or\
                file.is_mime_type('video/x-ms-wmv') or\
                file.is_mime_type('video/mp4') or\
                file.is_mime_type('audio/x-flac') or\
                file.is_mime_type('video/x-flv') or\
                file.is_mime_type('video/x-matroska') or\
                file.is_mime_type('audio/x-wav'):
            metadata = MediaInfo(filename)
            file.add_string_attribute('format', metadata.get_format())
            file.add_string_attribute('duration',
                                      metadata.get_duration_string())
            file.add_string_attribute('overall_bitrate',
                                      metadata.get_overallbitrate())
            file.add_string_attribute('frame_count', metadata.get_framecount())
            file.add_string_attribute('video_format',
                                      metadata.get_videoformat())
            file.add_string_attribute('width', metadata.get_width())
            file.add_string_attribute('height', metadata.get_height())
            file.add_string_attribute('bit_depth', metadata.get_bitdepth())
            file.add_string_attribute('audio_format',
                                      metadata.get_audioformat())
            file.add_string_attribute('title', metadata.get_title())
            #JRB
            #file.add_string_attribute('description',
            #                          metadata.get_description())
        # pdf handling
        if file.is_mime_type('application/pdf'):
            try:
                f = open(filename, 'rb')
                pdf = PdfFileReader(f)
                info = pdf.getDocumentInfo()
                try:
                    file.add_string_attribute(
                        'title', info.title if info.title is not None else '')
                except Exception:
                    file.add_string_attribute('title', _('Error'))
                try:
                    file.add_string_attribute(
                        'description',
                        info.subject if info.subject is not None else '')
                except Exception:
                    file.add_string_attribute('description', _('Error'))
                try:
                    file.add_string_attribute(
                        'creator',
                        info.author if info.author is not None else '')
                except Exception:
                    file.add_string_attribute('creator', _('Error'))
                try:
                    file.add_string_attribute('pages', str(pdf.getNumPages()))
                except Exception:
                    file.add_string_attribute('pages', _('Error'))
                if pdf.getNumPages() > 0:
                    try:
                        width = abs(
                            pdf.getPage(0).mediaBox.upperRight[0] -
                            pdf.getPage(0).mediaBox.lowerLeft[0])
                        file.add_string_attribute(
                            'width',
                            str(int(float(width) * math.sqrt(2.0) / 4.0)))
                    except Exception:
                        file.add_string_attribute('width', '')
                    try:
                        height = abs(
                            pdf.getPage(0).mediaBox.upperRight[1] -
                            pdf.getPage(0).mediaBox.lowerLeft[1])
                        file.add_string_attribute(
                            'height',
                            str(int(float(height) * math.sqrt(2.0) / 4.0)))
                    except Exception:
                        file.add_string_attribute('height', '')
                else:
                    file.add_string_attribute('width', '')
                    file.add_string_attribute('height', '')
                f.close()
            except Exception:
                file.add_string_attribute('title', _('Error'))
                file.add_string_attribute('description', _('Error'))
                file.add_string_attribute('creator', _('Error'))
                file.add_string_attribute('pages', _('Error'))
                file.add_string_attribute('width', _('Error'))
                file.add_string_attribute('height', _('Error'))
        self.get_columns()
Пример #41
0
#Allen Higgins C00197373
#Zoltan Fuzesi C00197361
#Robert Scully C00196960

from tabula import convert_into
from PyPDF2 import PdfFileReader

import os

# Ask for the PDF to convert; the CSV is written under the same base name.
filename = raw_input('Enter PDF file Name to convert to CSV:--> ')

try:
    # PdfFileReader accepts the path directly; its second positional
    # parameter is `strict`, not a file mode.
    reader = PdfFileReader(filename)
    totalPages = reader.getNumPages()
    # splitext also copes with names that lack a four-character suffix.
    outputFileName = os.path.splitext(filename)[0] + '.csv'
    # NOTE(review): range(2, totalPages) leaves out page 1 and the last
    # page -- confirm that this is the intended selection.
    convert_into(filename,
                 outputFileName,
                 output_format="csv",
                 pages=range(2, totalPages))
except IOError:
    print(
        'File not found. Please check name of file or if the file has been created'
    )
except Exception as e:
    # Anything else is not a missing file; report what actually failed.
    print('Could not convert %s: %s' % (filename, e))
Пример #42
0
def write_files_to_output(directory, output_path):
    """Merge watermarked copies of nested PDFs into a single output PDF.

    Walks ``directory/<version>/<sub-folder>/*.pdf``. Every page of every
    PDF found is stamped near the bottom centre with
    ``<version>_<student number>`` and appended to one document, which is
    written to ``output_path``. A PDF with an odd page count is followed by
    one blank A4 page.

    The student number is the first ``s``/``S`` followed by seven digits
    found in the file name or, failing that, in the text of the first page;
    ``'0'`` is used when neither contains one.

    :param directory: root folder whose sub-folders name the versions.
    :param output_path: path of the merged PDF to create.
    """
    # Compiled once; raw string so that \d is not treated as a str escape.
    pattern = re.compile(r'[sS]\d{7}')
    output = PdfFileWriter()
    # PyPDF2 pulls page content from the source streams when the writer is
    # written, so the inputs stay open until then and are closed afterwards.
    open_files = []

    try:
        for version_dir in os.listdir(directory):
            version_path = os.path.join(directory, version_dir)
            if not os.path.isdir(version_path):
                continue
            print(version_dir)

            for inner_file in os.listdir(version_path):
                inner_path = os.path.join(version_path, inner_file)
                if not os.path.isdir(inner_path):
                    continue

                for pdf_file in os.listdir(inner_path):
                    if not pdf_file.endswith(".pdf"):
                        continue

                    pdf_path = os.path.join(inner_path, pdf_file)
                    source = open(pdf_path, 'rb')
                    open_files.append(source)
                    existing_pdf = PdfFileReader(source)

                    # Prefer the file name; fall back to the first page text.
                    result = pattern.search(pdf_file)
                    if result is None:
                        first_page_text = existing_pdf.getPage(0).extractText()
                        result = pattern.search(first_page_text)
                    number = result.group(0) if result is not None else '0'
                    version_student = str(version_dir) + '_' + str(number)

                    # Draw the watermark on an in-memory one-page PDF.
                    packet = BytesIO()
                    can = canvas.Canvas(packet, pagesize=A4)
                    can.setFillColor(Color(0, 50, 100, alpha=0.5))
                    can.setFont("Helvetica", 30)
                    x, _ = can._pagesize
                    can.drawCentredString(x / 2, 10, str(version_student))
                    can.save()

                    # Rewind the buffer so it can be read back as a PDF.
                    packet.seek(0)
                    watermark_page = PdfFileReader(packet).getPage(0)

                    n_pages = existing_pdf.getNumPages()
                    for i in range(n_pages):
                        page = existing_pdf.getPage(i)
                        page.mergePage(watermark_page)
                        output.addPage(page)
                    if n_pages % 2 != 0:
                        # addBlankPage takes PDF user-space units (points),
                        # so use the A4 size in points rather than the
                        # millimetre figures 210 x 297.
                        output.addBlankPage(A4[0], A4[1])

                    print(pdf_path)

        with open(output_path, "wb") as file_output:
            output.write(file_output)
    finally:
        for handle in open_files:
            handle.close()
Пример #43
0
class PDFProcessor(object):
    """Give access to the metadata, page images and text of one PDF file.

    ``language`` is used as a key into ``TESSERACT_LANGUAGE`` for OCR and
    ``config`` may carry a ``TESSERACT_DATA_PATH`` entry.
    """

    def __init__(self, filename, language=None, config=None):
        self.filename = filename
        self.pdf_reader = PdfFileReader(filename)
        self.num_pages = self.pdf_reader.getNumPages()
        self.language = language
        self.config = config if config else {}

    def get_meta(self):
        """Return a dict holding the document title under ``'title'``."""
        return {'title': self.pdf_reader.getDocumentInfo().title}

    def get_images(self, pages=None, resolution=300):
        """Yield one rendered image per requested page (all pages by default).

        Each image is only open while the consumer is processing it.
        """
        page_numbers = range(self.num_pages) if pages is None else pages
        for number in page_numbers:
            with self.get_image(number, resolution=resolution) as rendered:
                yield rendered

    @contextlib.contextmanager
    def get_image(self, page_no, resolution=300):
        """Context manager yielding page ``page_no`` as an image.

        The page is rendered on a white background with the alpha channel
        switched off.
        """
        # "<file>[<n>]" is the page selector handed to Image.
        page_spec = "{}[{}]".format(self.filename, page_no)
        white = wand.color.Color('#fff')
        with Image(filename=page_spec,
                   resolution=resolution,
                   background=white) as rendered:
            rendered.alpha_channel = False
            yield rendered

    def get_text(self, pages=None):
        """Yield the stripped text of each requested page.

        Text comes from pdflib when that module is available, otherwise
        from the PyPDF2 reader; a page without text is passed to
        ``ocr_page``.
        """
        if pages is None:
            pages = range(self.num_pages)
        if pdflib is None:
            pdflib_pages = None
        else:
            pdflib_pages = list(pdflib.Document(self.filename))
        for page_no in pages:
            if pdflib_pages is None:
                text = self.pdf_reader.getPage(page_no).extractText()
            else:
                text = ' '.join(pdflib_pages[page_no].lines).strip()
            if not text.strip():
                text = self.ocr_page(page_no)
            yield text.strip()

    def ocr_page(self, page_no):
        """Return the OCR text of one page, or ``''`` without tesserocr."""
        if tesserocr is None:
            return ''
        with self.get_image(page_no, resolution=300) as img:
            pil_image = PILImage.frombytes('RGB', img.size,
                                           img.make_blob('RGB'))
            lang = TESSERACT_LANGUAGE[self.language]
            data_path = self.config.get('TESSERACT_DATA_PATH', '')
            return tesserocr.image_to_text(pil_image,
                                           lang=lang,
                                           path=data_path)

    def run_ocr(self, timeout=180):
        """Run froide's ``run_ocr`` helper on the file and return its result."""
        from froide.helper.document import run_ocr

        return run_ocr(self.filename,
                       language=TESSERACT_LANGUAGE[self.language],
                       timeout=timeout)

    def save_pages(self, path, **kwargs):
        """Save each page image and yield the file name it was saved under.

        ``path`` is a format string with a ``{page}`` field; numbering
        starts at 1. Extra keyword arguments go to ``get_images``.
        """
        for page_number, img in enumerate(self.get_images(**kwargs), 1):
            target = path.format(page=page_number)
            img.save(filename=target)
            yield target
Пример #44
0
def main():
    """Turn annotated beamer slides into a movie description XML file.

    Every page of the input PDF is rendered to ``pageNNN.png`` (scaled so
    that it is about 1080 pixels high) and produces one ``<video>`` element
    in the output XML. A ``/Text`` annotation titled ``Wait``, ``Audio`` or
    ``Video`` selects the element type and supplies its contents; a page
    without such an annotation becomes a 5-unit ``wait`` entry.

    Raises:
        Exception: if a page carries more than one ``/Text`` annotation.
    """
    parser = argparse.ArgumentParser(
        description='make movies from beamer slides')
    parser.add_argument('input', default='texput.pdf', nargs='?')
    parser.add_argument('output', default='input.xml', nargs='?')

    args = parser.parse_args()

    doc = fitz.open(args.input)

    # `with` closes the output even when a page raises part-way through.
    with open(args.output, 'w') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>' + "\n")
        output.write('<movie>' + "\n")

        with open(args.input, 'rb') as f:
            pdf = PdfFileReader(f)
            number_of_pages = pdf.getNumPages()
            print("Reading through {} pages...".format(number_of_pages))

            for i in range(number_of_pages):
                print("Page {}".format(i))

                page = pdf.getPage(i)

                fitzpage = doc[i]
                # Render once at 4x to learn the pixel height, then derive
                # the zoom that gives a 1080-pixel-high image.
                pix = fitzpage.getPixmap(matrix=fitz.Matrix(4, 4), alpha=True)
                zoom = 4.0 * float(1080) / pix.height
                pix = fitzpage.getPixmap(matrix=fitz.Matrix(zoom, zoom),
                                         alpha=True)

                png = 'page{:03d}.png'.format(i)
                pix.writePNG(png)

                # Defaults for pages without a recognised annotation.
                kind = 'wait'
                contents = '5'

                # Pages without any annotation have no /Annots entry.
                annotations = page['/Annots'] if '/Annots' in page else []

                count = 0
                for annot in annotations:
                    annotation = annot.getObject()
                    # Other subtypes, such as /Link, cause errors
                    if annotation['/Subtype'] != "/Text":
                        continue
                    label = annotation['/T']
                    if label in ('Wait', 'Audio', 'Video'):
                        kind = label.lower()
                        contents = annotation['/Contents']
                    count = count + 1
                    if count > 1:
                        raise Exception("Too many annotations on page " +
                                        str(i + 1))

                if kind == 'audio':
                    output.write(
                        '  <video src="{}" slide="{}"/>'.format(contents, png)
                        + "\n")
                elif kind == 'video':
                    output.write(
                        '  <video src="{}" overlay="{}"/>'.format(
                            contents, png) + "\n")
                elif kind == 'wait':
                    output.write('  <video src="{}" in="0" out="{}"/>'.format(
                        png, contents) + "\n")

        output.write('</movie>' + "\n")

    print("Wrote " + args.output)
Пример #45
0
"""
This file is for diabetes data pre-processing

"""
import PyPDF2

from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_document = "Cover letter Deepmind.pdf"
pdf = PdfFileReader(pdf_document)

# Write every page of the source document to its own single-page PDF.
for page in range(pdf.getNumPages()):
    # A new writer instance per page (note the call parentheses), so each
    # output file holds exactly one page.
    pdf_writer = PdfFileWriter()
    current_page = pdf.getPage(page)
    pdf_writer.addPage(current_page)

    # Output files are numbered from 1.
    outputFilename = "example-page-{}.pdf".format(page + 1)
    with open(outputFilename, "wb") as out:
        pdf_writer.write(out)

    print("created", outputFilename)

#with open('diabetes_file.txt', 'w') as f:
#    f.write("\n\n".join(pdfFileobj))
Пример #46
0
# Build a list of tuples for each file type the file dialog should display
my_filetypes = [('PDF files','*.pdf'), ("All files", "*.*")]

application_window = tkinter.Tk()
# Ask the user to select a single file name.
answer = filedialog.askopenfilename(parent=application_window, initialdir=os.getcwd(), title="Please select a file:", filetypes=my_filetypes)
print(answer)

# NOTE(review): the selected path is printed a second time here.
print(answer)

FILE_PATH = answer



# Open the chosen PDF; the handle stays open because pages are read from
# `input1` further down.
input1 = PdfFileReader(open(FILE_PATH, mode='rb'))
n_pages=input1.getNumPages()
# NOTE(review): the message names "document1.pdf" regardless of which file
# was selected.
print("document1.pdf has %d pages." % n_pages)
# State for the page loop that follows.
end_of_document=False

rt = RichText("")
page_index=0;
inc_page=True
#n_pages=6

while (end_of_document== False):
    print("Página %d ." % page_index)    
    if (inc_page == True):   
        page=input1.getPage(page_index)
        txt=page.extractText()
        end_of_page=txt.find("\n\n")
        if (end_of_page >0):
Пример #47
0
def getPdfPageNum(path):
    """Return the number of pages of the PDF file at ``path``."""
    with open(path, "rb") as pdf_stream:
        return PdfFileReader(pdf_stream).getNumPages()
Пример #48
0
import os
from PyPDF2 import PdfFileReader

path = "C:/Users/Catrell Washington/Pride"

input_file_name = os.path.join(path, "Pride.pdf")
output_file_name = os.path.join(path, "Pride.txt")

# Both files are managed by `with` so they are closed even if extraction
# fails part-way through.
with open(input_file_name, "rb") as pdf_stream, \
        open(output_file_name, "w") as output_file:
    input_file = PdfFileReader(pdf_stream)

    # The document info or its title can be missing; write an empty title
    # line in that case instead of failing on None + str.
    info = input_file.getDocumentInfo()
    title = (info.title if info is not None else None) or ""
    total_pages = input_file.getNumPages()  # get the total page count

    output_file.write(title + "\n")
    output_file.write("Number of Pages: {}\n\n".format(total_pages))

    for page_num in range(0, total_pages):
        text = input_file.getPage(page_num).extractText()
        # Runs of three spaces are turned into line breaks.
        text = text.replace("   ", "\n")
        output_file.write(text)
        del_name.unlink()
        file_names.remove(del_name)

convert_pt_to_mm = 25.4 / 72.0
output_data = {}
output_data[non_standard] = PdfFileWriter()
for key in sheet_size_height.keys():
    output_data[key] = PdfFileWriter()
    output_data[key + '_page_1'] = PdfFileWriter()

# Read the pages from each PDF file and sort them by sheet format
read_streams = []
for name in file_names:
    read_streams.append(open(name, 'rb'))
    pdf_document = PdfFileReader(read_streams[-1])
    pages_count = pdf_document.getNumPages()
    for index in range(0, pages_count):
        current_page = pdf_document.getPage(index)
        page_added = False
        # The page dimensions have to be converted from points to mm
        page_height = round(
            float(current_page.mediaBox.getHeight()) * convert_pt_to_mm)
        page_width = round(
            float(current_page.mediaBox.getWidth()) * convert_pt_to_mm)
        for key in sheet_size_height.keys():
            standard_height = sheet_size_height[key]
            standard_width = sheet_size_width[key]
            # Both landscape and portrait orientations of the sheet are checked
            if (check_size(standard_height, page_height)
                    and check_size(standard_width, page_width)) or (
                        check_size(standard_width, page_height)
Пример #50
0
 def getArchiveFilenameList(self):
     """Return one ``/NNNN.jpg`` name per page of the PDF at ``self.path``.

     Names are numbered from 1 and zero-padded to four digits.
     """
     # Only the page count is needed, so the file is closed right away
     # instead of being left open.
     with open(self.path, 'rb') as pdf_file:
         page_count = PdfFileReader(pdf_file).getNumPages()
     return ["/%04d.jpg" % page for page in range(1, page_count + 1)]
def transform_comment_data_set(comment_data_set):
    """Extract text, keywords and addresses for every comment to analyze.

    For each entry whose ``'analyze'`` flag is true, the PDF at
    ``'public_comment_path_pdf'`` is converted to text and stored at
    ``'public_comment_path'``. If the directly extracted text is empty or
    bogus, OCR is used instead (split processing for PDFs over 50 pages).
    The entry's ``'keyword_list'`` and ``'address'`` are then filled from
    that text file and from ``'summary'``. Entries are modified in place.

    :param comment_data_set: mapping whose values are comment dicts.
    :return: dict keyed by ``'comment_number'`` with the summary, address,
        keyword_list, name, topic, date and category of every analyzed
        entry.
    """
    for comment_data in comment_data_set.values():
        if not comment_data['analyze']:
            print(
                f"System cannot find {comment_data['public_comment_path_pdf']}"
            )
            continue

        pdf_path = comment_data['public_comment_path_pdf']
        text_path = f"{comment_data['public_comment_path']}"

        print(f"Analyze {pdf_path}")
        text_comments = get_sentence_list.get_sentence_list(
            process.get_word_block(tdfp.convert_pdf_to_xml(pdf_path)))
        comment_data['keyword_list'] = {}
        comment_data['address'] = []

        if text_comments and not bogus.is_bogus_text(text_comments):
            with open(text_path, 'w') as w:
                for comment in text_comments:
                    w.write(comment)
        else:
            # No usable embedded text: fall back to OCR, choosing the
            # strategy by page count.
            with open(pdf_path, 'rb') as infile:
                page_num = PdfFileReader(infile).getNumPages()
            if page_num <= 50:
                orc.extract_text_by_orc(pdf_path, text_path)
            else:
                separate.separate_pdf_and_ocr(pdf_path, text_path)

        print("Extract keywords")
        if os.path.isfile(text_path):
            # Read the file once. Calling read() and then readlines() on
            # the same handle leaves readlines() with nothing to return,
            # so the address extraction would never see any line.
            with open(text_path) as r:
                lines = r.readlines()
            comment_data['keyword_list'] = get_keyword_list.get_keyword_list(
                ''.join(lines))
            comment_data['address'].extend(get_address.get_address(lines))
            comment_data['address'].extend(
                get_spe_data.get_address(comment_data['summary']))

    return {
        comment_data['comment_number']: {
            'summary':
            comment_data['summary'],
            'address':
            comment_data['address'],
            'keyword_list':
            comment_data['keyword_list'],
            'name':
            get_spe_data.get_name(comment_data['summary']),
            'topic':
            get_spe_data.get_topic(comment_data['summary']),
            'date':
            comment_data['date'],
            'category':
            get_category(comment_data["keyword_list"], comment_data["summary"])
        }
        for comment_data in comment_data_set.values()
        if comment_data['analyze']
    }
Пример #52
0
from PyPDF2 import PdfFileWriter, PdfFileReader

from PyPDF2Highlight import createHighlight, addHighlightToPage

pdfInput = PdfFileReader(open("early-stopping-1703.09580.pdf", "rb"))
pdfOutput = PdfFileWriter()

page1 = pdfInput.getPage(0)
number_of_pages = pdfInput.getNumPages()
page_content = page1.extractText()
import textract
text = textract.process("early-stopping-1703.09580.pdf")
print page_content.encode('utf-8')

highlight = createHighlight(488.725021, 202.392357, 523.153376, 211.298922, {
    "author": "",
    "contents": "Bla-bla-bla"
})

addHighlightToPage(highlight, page1, pdfOutput)

pdfOutput.addPage(page1)

outputStream = open("output.pdf", "wb")
pdfOutput.write(outputStream)
Пример #53
0
__author__ = 'Chetan'

import PyPDF2
from PyPDF2 import PdfFileReader

# Open the book in binary mode and hand the stream to the PDF reader.
pdf = open("diveintopython.pdf", 'rb')
readerObj = PdfFileReader(pdf)
print "PDF Reader Object is:\n", readerObj

# Details of diveintopython book
print "Details of diveintopython book"
print "Number of pages", readerObj.getNumPages()
print "Title:", readerObj.getDocumentInfo().title
print "Author:", readerObj.getDocumentInfo().author

print "Book Outline"
# Only outline entries that are not lists have their /Title printed;
# list entries (presumably nested sub-headings -- confirm) are skipped.
for heading in readerObj.getOutlines():
    if type(heading) is not list:
        print dict(heading).get('/Title')

print "Reading Page 1"
# NOTE(review): getPage() appears to take a zero-based index, which would
# make this the second page of the document -- confirm.
page = readerObj.getPage(1)
print page.extractText()

# Release the file handle once all reads are done.
pdf.close()
Пример #54
0
from PyPDF2 import PdfFileReader, PdfFileWriter

# os.path.join is used below but the snippet never imported os.
import os

path = ("C:/Users/cmello/Documents/"
        "Python Basics Book Dan Bader/"
        "exercises_chapter_13/")

input_file_path = os.path.join(path, "output/Walrus.pdf")
input_pdf = PdfFileReader(input_file_path)
output_pdf = PdfFileWriter()

# Decrypt the PDF file; the pages cannot be read without this.
input_pdf.decrypt("IamtheWalrus")

# Rotate every page 270 degrees clockwise and collect it in the writer.
num_pages = input_pdf.getNumPages()
for n in range(0, num_pages):
    page = input_pdf.getPage(n)
    page.rotateClockwise(270)
    output_pdf.addPage(page)

output_file_path = os.path.join(path, "output/Walrus Rotated2.pdf")

with open(output_file_path, "wb") as output_file:
    output_pdf.write(output_file)



"""
CIRO START FROM STEP # 3
import copy
Пример #55
0
from PyPDF2 import PdfFileWriter, PdfFileReader

# PdfFileReader's second positional parameter is `strict`, not a file mode,
# so the files are opened in binary mode explicitly. They stay open until
# the result is written because the writer still reads from them.
with open('dummy.pdf', 'rb') as source, open('dummy2.pdf', 'rb') as extra:
    infile = PdfFileReader(source)
    infile2 = PdfFileReader(extra)
    output = PdfFileWriter()

    # Page to be inserted into the copy.
    p2 = infile2.getPage(0)

    # range() works on Python 2 and 3 alike (xrange is Python 2 only).
    for i in range(infile.getNumPages()):
        p = infile.getPage(i)
        output.addPage(p)
        # Insert the extra page right after the fourth page (index 3).
        if i == 3:
            output.addPage(p2)

    with open('newfile.pdf', 'wb') as f:
        output.write(f)
Пример #56
0
from PyPDF2 import PdfFileReader
from pathlib import Path

# Source PDF and the text file that receives the extracted content.
pdf_path = r"chapter 14 working with PDF\Pride_and_Prejudice.pdf"
output_file_path = r"chapter 14 working with PDF\Pride_and_Prejudice.txt"

pdf = PdfFileReader(pdf_path)

with open(output_file_path, 'w') as output_file:
    # Header: document title followed by the page count.
    title = pdf.documentInfo.title
    num_pages = pdf.getNumPages()
    header = f"{title}\nNumber of pages: {num_pages}\n\n"
    output_file.write(header)

    # Body: the extracted text of every page, in document order.
    for page in pdf.pages:
        output_file.write(page.extractText())
'''
first_page = pdf.getPage(0)
print(first_page.extractText())



for page in pdf.pages:
    print(page.extractText())  # prints the whole book
'''
Пример #57
0
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter

# How to watermark PDF pages below:

pdf_file = "super.pdf"
watermark = "wtr.pdf"
merged_file = "merged.pdf"

# The inputs stay open until the merged document has been written, because
# the writer still reads page data from them.
with open(pdf_file, "rb") as input_file, open(watermark,
                                              "rb") as watermark_file:
    input_pdf = PdfFileReader(input_file)
    watermark_pdf = PdfFileReader(watermark_file)
    watermark_page = watermark_pdf.getPage(0)

    output = PdfFileWriter()

    # Stamp the watermark onto every page and collect the result.
    for i in range(input_pdf.getNumPages()):
        pdf_page = input_pdf.getPage(i)
        pdf_page.mergePage(watermark_page)
        output.addPage(pdf_page)

    # Use a separate name for the handle so that `merged_file` keeps
    # holding the output path instead of a closed file object.
    with open(merged_file, "wb") as merged_stream:
        output.write(merged_stream)
Пример #58
0
import os
import sys
from PyPDF2 import PdfFileReader

import shlex

nombreArchivo = sys.argv[1]
# Only the page count is needed from the PDF, so close it right away.
with open(nombreArchivo, 'rb') as pdf_file:
    reader = PdfFileReader(pdf_file)
    pags = reader.getNumPages()


def _print_pages(page_numbers):
    """Send the given pages of the input file to the printer (pdftk | lpr)."""
    # The file name comes from the command line: quote it so that spaces
    # and shell metacharacters reach pdftk literally.
    cmd = "pdftk %s cat " % shlex.quote(nombreArchivo)
    cmd += "".join("%d " % n for n in page_numbers)
    cmd += "output - | lpr"
    print(cmd)
    os.system(cmd)


# First pass: the odd pages in ascending order.
_print_pages(range(1, pags + 1, 2))

if input("De vuelta las hojas (ingrese x para abortar) ") != "x":
    # Second pass: the even pages in descending order.
    n = pags
    if pags % 2 == 1:
        # Odd page count: a blank sheet is printed first (presumably for
        # the back of the last page) and the pass starts one page lower.
        n -= 1
        os.system("lpr blank.pdf")

    _print_pages(range(n, 0, -2))
c.save()

## 2. Add Watermark to PDF
### Open the watermark file
with io.open('./watermark.pdf', mode='rb') as watermark_file:
    watermark = PdfFileReader(watermark_file)
    watermark_page = watermark.getPage(0)

    ### Start the PDF writer buffer
    output = PdfFileWriter()

    ### Open the input file
    with io.open(input_file_path, mode='rb') as input_file:
        input_pdf = PdfFileReader(input_file)
        page_count = input_pdf.getNumPages()

        ### Add watermark to every page
        for page_number in range(page_count):
            input_page = input_pdf.getPage(page_number)
            input_page.mergePage(watermark_page)
            output.addPage(input_page)

        ### Open the output file once, after all pages are merged, and save
        ### the result while the input files are still open
        with io.open(merged_file_path, mode='wb') as merged_file:
            output.write(merged_file)

## 3. Clean files
Пример #60
0
 def getArchiveFilenameList(self):
     """Return one ``/NNNN.jpg`` name per page of the PDF at ``self.path``.

     Names are numbered from 1 and zero-padded to four digits.
     """
     # Only the page count is needed, so the file is closed right away
     # instead of being left open.
     with open(self.path, 'rb') as pdf_file:
         page_count = PdfFileReader(pdf_file).getNumPages()
     return ["/%04d.jpg" % page for page in range(1, page_count + 1)]