Пример #1
0
def extract_pdf_pypdf2(pdf_path, page_index=2):
    """Return the extracted text of a single page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Index of the page handed to ``getPage``. Defaults to 2,
            the value that used to be hard-coded, so existing callers keep
            their behaviour.

    Returns:
        The value produced by ``extractText()`` for the selected page.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            # Attempt to unlock the document with the empty password.
            pdf.decrypt('')
        # The page must be read while the file is still open.
        return pdf.getPage(page_index).extractText()
Пример #2
0
def RemovePdfOwnerPassword(inputname, outputname):
    '''
    Copy every page of an encrypted PDF into a new PDF file.

    The input is decrypted with the empty password, its document info is
    printed, and all pages are added to a fresh ``PdfFileWriter`` that is
    then written to ``outputname``.

    :param inputname: path of the PDF to read.
    :param outputname: path of the PDF to create (overwritten if present).
    :return: 0 on success, -1 when decrypting raised ``KeyError('/Encrypt')``,
        which this function reports as "not an encrypted pdf".
    :raises KeyError: any other ``KeyError`` raised while decrypting.
    '''
    # The reader pulls page data from the input file on demand, so the input
    # must stay open until the writer has finished writing.
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # ``e.message`` only exists on Python 2; ``e.args[0]`` holds the
            # missing key on both Python 2 and Python 3.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())

        wrt = PdfFileWriter()
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))

        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
Пример #3
0
    def createNewBooks(self, pdf_file, stPage, endPage, filename='my.pdf'):
        """Write pages ``stPage`` .. ``endPage - 1`` of a PDF to a new file.

        Args:
            pdf_file: Path of the source PDF.
            stPage: Index of the first page to copy (the first page of a
                document has index 0).
            endPage: Index one past the last page to copy.
            filename: Path of the PDF to create; defaults to ``'my.pdf'``.

        Returns:
            The string ``'Complete knifing'``.
        """
        # Keep the source open until the output is written: the reader
        # fetches page contents from it on demand.
        with open(pdf_file, "rb") as source:
            pdf_input = PdfFileReader(source)
            if pdf_input.isEncrypted:
                # Original author's note: PyPDF2 may report PDFs as encrypted,
                # so decrypt before reading. decrypt() returns a status code,
                # not a reader, so its result must NOT replace ``pdf_input``.
                pdf_input.decrypt('')

            pdf_output = PdfFileWriter()
            for i in range(stPage, endPage):
                # Pick the requested page and add it to the new PDF.
                pdf_output.addPage(pdf_input.getPage(i))

            with open(filename, 'wb') as output_stream:
                pdf_output.write(output_stream)

        return 'Complete knifing'
Пример #4
0
 def processFile(self, curr_file):
    # Interactively extract metadata from one PDF and append a summary row
    # to self.container.
    #
    # curr_file: path (or name) of the file; anything without ".pdf" in its
    #            name is ignored.
    # Returns None in every case. The row appended on success is
    #   [" | " + file name, created, author, producer, modded, last_saved]
    # where fields that were not found keep the placeholder '-'.
    #
    # NOTE: this is Python 2 code (print statements, raw_input,
    # `except Exception, err`). It blocks on raw_input() several times.
    # NOTE(review): `extractedFrom` is declared global but never used in
    # this method.
    global extractedFrom
    author = '-'
    date = '-'
    generator = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'
    if ".pdf" in curr_file:
       try:
          raw_input("Processing " + str(curr_file) + ".\nPress Enter to continue...")
          # pdfFile = PdfFileReader(file(curr_file, 'rb'))
          # NOTE(review): the file object opened here is never explicitly closed.
          pdfFile = PdfFileReader(open(curr_file, 'rb'))
          if pdfFile.getIsEncrypted():
             print "File is encrypted (maybe, this sometimes has false positives). Trying to decrypt."
             # Try the empty password.
             # NOTE(review): the result of decrypt() is not checked before
             # "Success!" is printed.
             pdfFile.decrypt('')
             print "Success! File decrypted."
          docInfo = pdfFile.getDocumentInfo()
          if not docInfo:
             # No metadata at all: nothing is appended to self.container.
             return
          last_saved = '-'
          print " The RAW document information"
          for section in docInfo:
             print docInfo[section]
          raw_input("Press Enter to continue...")
          # Look through the whole info dictionary for the fields of interest.
          if "/CreationDate" in docInfo:
             print " Processing CREATION DATE information"
             print " Creation Date RAW: " + docInfo["/CreationDate"]
             # strip() removes any of the characters D : | ' from both ends;
             # the slices below then expect the layout YYYYMMDDHHMM...
             data = docInfo["/CreationDate"].strip("D:|'")
             year = data[0:4]
             date = data[4:6] + "/" + data[6:8]
             created_time = data[8:10] + ":" + data[10:12]
             print " The value of 'created_time' is: " + str(created_time)
             print " The data type is: " + str(type(created_time))
             print " Expecting H:M format"
             raw_input(" Does it match?")
             # Convert 24-hour "HH:MM" to 12-hour "HH:MM AM/PM".
             created_time = time.strftime("%I:%M %p", time.strptime(created_time, "%H:%M"))
             created = date + "/" + year + " " + created_time
          if "/Author" in docInfo:
             print " Processing AUTHOR information"
             author = docInfo["/Author"] + " "
             # A space was just appended, so length <= 1 means the field was empty.
             if len(author) <=1:
                author = "-"
          if "/Producer" in docInfo:
             print " Processing PRODUCER information"
             # NOTE(review): strip("(Windows)") removes any of the characters
             # ( ) W i n d o w s from both ends, not the literal "(Windows)".
             producer = docInfo["/Producer"].strip("(Windows)")
             # Replace every non-word character with a space.
             producer = re.sub(r'[^\w]', ' ', producer)
             if len(producer) == 0:
                producer = "-"
             # Collapse runs of spaces into a single space.
             while True:
                if "  " in producer:
                   producer = producer.replace("  ", " ")
                else:
                   break
          if "/ModDate" in docInfo:
             print " Processing MODIFIED DATE information"
             # Same parsing as for /CreationDate above.
             data = docInfo["/ModDate"].strip("D:|'")
             year = data[0:4]
             date = data[4:6] + "/" + data[6:8]
             modded_time = data[8:10] + ":" + data[10:12]
             modded_time = time.strftime("%I:%M %p", time.strptime(modded_time, "%H:%M"))
             modded = date + "/" + year + " "  + modded_time
          # Drop the directory part (everything up to the last '/') of the name.
          if "/" in curr_file:
             curr_file = curr_file[curr_file.rfind("/")+1:]
          # Backslashes are removed, not treated as path separators.
          if "\\" in curr_file:
             curr_file = curr_file.replace("\\","")
          # Trim information if it is too long.
          # NOTE(review): the result is first 15 chars + "..." + last 13 chars
          # (31 chars), so names of 16-30 characters come out longer than they
          # went in.
          if len(curr_file) > 15: # trims file name
             curr_file = curr_file[:15] + "..." + curr_file[-13:]
          # NOTE(review): the threshold is 30 characters but only 20 are kept.
          if len(producer) > 30:
             producer = producer[:20] + " [snipped] "
          if len(author) > 20:
             author = author[:20] + " [snipped] "
          # Append the collected row; last_saved is always '-' in this method.
          self.container.append([" | " + curr_file,created,author,producer,modded,last_saved])
          print "Parsing Completed."
       except NotImplementedError:
          print "Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually."
       except Exception, err:
          # Any other failure is reported generically; `err` itself is not printed.
          print "Parsing failed somewhere in the TRY statement."
          return
Пример #5
0
    def getPdfMetadata(self, path=None):
        '''
        Read the metadata of the PDF at ``path`` and store it on ``self.book``.

        The following attributes of ``self.book`` may be assigned:
        ``numberOfPages``, ``tag`` (the PDF subject, appended to an existing
        tag), ``bookName`` (title), ``publisher`` (creator), ``createdOn``
        (current time), ``publishedOn`` (parsed from ``/CreationDate``),
        ``fileSize`` and ``authors`` (a one-element list whose author name
        falls back to ``'Unknown'``).

        Every PDF lookup is best-effort: a failure is logged and the
        corresponding attribute is left untouched.

        :param path: path of the PDF file; when falsy nothing is read or set.
        :return: None. Results are stored on ``self.book``.
        '''
        logger.debug('getPdfMetadata path: %s', path)

        if not path:
            return

        stream = None
        pdf_toread = None
        pdf_info = None
        try:
            # Open the file once and keep it open while metadata is read;
            # the handle is closed in the ``finally`` clause below.
            try:
                stream = open(path, "rb")
                pdf_toread = PdfFileReader(stream)
                encrypted = pdf_toread.isEncrypted
                logger.debug('getIsEncrypted : %s ', encrypted)
                if encrypted:
                    try:
                        # Try the empty password.
                        pdf_toread.decrypt('')
                    except Exception as e:
                        logger.error(e, exc_info=True)
            except Exception as e:
                logger.error(e, exc_info=True)

            try:
                pdf_info = pdf_toread.getDocumentInfo()
                page_count = pdf_toread.getNumPages()
                logger.debug('NumPages:%s', page_count)
                self.book.numberOfPages = page_count

                subject = None
                raw_subject = pdf_info.subject
                # isinstance() rather than ``type(...) == str``: PyPDF2 returns
                # a str subclass, which an exact type comparison never matches.
                if raw_subject and isinstance(raw_subject, str):
                    subject = str(raw_subject)

                if subject:
                    if self.book.tag:
                        self.book.tag = self.book.tag + '' + subject
                    else:
                        self.book.tag = subject
            except Exception as e:
                logger.error(e, exc_info=True)

            try:
                title = pdf_info.title
                if title is not None and title.strip() != '':
                    self.book.bookName = str(title)
            except Exception as e:
                logger.error(e, exc_info=True)

            try:
                if pdf_info.creator:
                    # str() of encoded bytes would store "b'...'" on Python 3.
                    self.book.publisher = str(pdf_info.creator)
            except Exception as e:
                logger.error(e, exc_info=True)

            self.book.createdOn = datetime.now()
            try:
                # Characters 2..9 of the value are parsed as YYYYMMDD
                # (presumably skipping a leading "D:" prefix).
                self.book.publishedOn = datetime.strptime(
                    str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.error('CreationDate not found')

            val = 'Unknown'
            try:
                author_name = pdf_info.author
                if author_name is not None and author_name.strip() != '':
                    val = author_name
            except Exception as e:
                logger.error(e, exc_info=True)
        finally:
            if stream is not None:
                stream.close()

        file_size = Util().convert_bytes(os.path.getsize(path))
        logger.debug(file_size)
        self.book.fileSize = file_size

        author = Author()
        author.authorName = val
        self.book.authors = [author]
Пример #6
0
    def getPdfMetadata(self, path=None):
        '''
        This method will get the pdf metadata and return book object.
        '''
        # NOTE(review): despite the docstring nothing is returned; the results
        # are stored as attributes of self.book. When `path` is falsy only the
        # print below runs.
        # NOTE: Python 2 code (print statements, unicode()).

        print path

        if path:
            # First reader: used only to print the encryption flag.
            # NOTE(review): `input` shadows the builtin, and the file opened
            # here is never explicitly closed.
            try:
                input = PdfFileReader(open(path, "rb"))
                print 'getPdfMetadata', input.getIsEncrypted()
            except:
                pass
            pdf_info = None
            # Second reader: the one actually used for all metadata below.
            # NOTE(review): if this fails the error is swallowed, pdf_toread
            # stays unbound and the next try block fails with NameError.
            try:
                pdf_toread = PdfFileReader(open(path, "rb"))
                if pdf_toread.isEncrypted:
                    try:
                        # Try the empty password.
                        pdf_toread.decrypt('')
                    except:
                        traceback.print_exc()
            except:
                pass
            # Page count and subject -> self.book.numberOfPages / self.book.tag.
            try:
                pdf_info = pdf_toread.getDocumentInfo()
                print 'Pages:', pdf_toread.getNumPages()
                self.book.numberOfPages = pdf_toread.getNumPages()
                #             value = pdf_info.subject
                if type(pdf_info.subject) == str:
                    # Ignore errors even if the string is not proper UTF-8 or has
                    # broken marker bytes.
                    # Python built-in function unicode() can do this.
                    value = unicode(pdf_info.subject, "utf-8", errors="ignore")

                else:
                    # Assume the value object has proper __unicode__() method
                    value = unicode(pdf_info.subject)
                    print 'else'
                # The subject becomes the tag, or is appended to an existing
                # tag with an empty-string separator.
                if not self.book.tag :
                    self.book.tag = value
                else:
                    self.book.tag = self.book.tag + '' + value
            except:
                traceback.print_exc()
            # Title -> self.book.bookName (only when non-blank).
            try:
                if pdf_info.title != None and pdf_info.title.strip() != '':
                    self.book.bookName = str(pdf_info.title)
            except:
                # NOTE(review): print_exc() returns None, so the literal
                # "None" is printed after the message.
                print 'unable to set bookName', traceback.print_exc()

            # Creator -> self.book.publisher; failures are silently ignored.
            try:
                if pdf_info.creator:
                    self.book.publisher = str(pdf_info.creator.encode('utf-8'))
            except:
                pass
            self.book.createdOn = datetime.now()
            # Characters 2..9 of /CreationDate are parsed as YYYYMMDD
            # (presumably skipping a leading "D:" prefix - TODO confirm).
            # Any failure, not only a missing key, prints "CreationDate not found".
            try:
                print str(pdf_info['/CreationDate'])[2:10]
                date = datetime.strptime(str(pdf_info['/CreationDate'])[2:10] , '%Y%m%d')
                self.book.publishedOn = date
            except:
                print 'CreationDate not found'

            print path
            # The size is computed twice: once for printing, once for storing.
            print Util().convert_bytes(os.path.getsize(path))
            self.book.fileSize = Util().convert_bytes(os.path.getsize(path))




#             if 'ISBN'.lower() in str(pdf_info['/Subject']).lower():
#                 self.book.isbn_13 = str(pdf_info['/Subject'])[6:]

            # Exactly one Author is attached; its name falls back to 'Unknown'
            # when the PDF has no usable author or reading it fails.
            author = Author()
            val = 'Unknown'
            try:
                if pdf_info.author !=None and pdf_info.author.strip()!='':
                    val = pdf_info.author
                    val = val.encode("utf8", "ignore")
            except:
                pass
            author.authorName = val


            authorList = list()
            authorList.append(author)
            self.book.authors = authorList
Пример #7
0
def main():
    """Command-line entry point: swap one colour for another in a PDF.

    Parses the command line, opens the input PDF (decrypting it when a
    password is given), swaps ``from_color`` for ``to_color`` on the
    requested pages (all pages when ``-p`` is absent) and writes the result
    to the output file.

    Returns:
        0 on success. An out-of-range page number is reported through
        ``parser.error``, which terminates the program.
    """
    parser = argparse.ArgumentParser(description="Swaps colors of pdf file.")
    parser.add_argument("to_color",
                        help="hex string of color that will replace")
    parser.add_argument("input", help="path to input pdf file", type=str)
    # Optional arguments. The single-value options deliberately do not use
    # ``nargs=1``: that would store user-supplied values as one-element lists
    # while the defaults are plain strings.
    parser.add_argument("-p",
                        help="page numbers",
                        nargs="*",
                        type=int,
                        metavar="pageno",
                        dest="pages")
    parser.add_argument("-P",
                        help="password to pdf input file",
                        metavar="password",
                        dest="password",
                        type=str)
    parser.add_argument("-c",
                        help="color to be swapped(default black)",
                        default="#000000",
                        metavar="from_color",
                        dest="from_color")
    parser.add_argument("-o",
                        help="filename of output pdf",
                        default="output.pdf",
                        metavar="filename",
                        dest="output")
    parser.add_argument(
        "-O",
        help="save directory for output file(default current directory)",
        metavar="directory",
        dest="outputDir")
    parser.add_argument("-d",
                        help="debugging mode",
                        action="store_true",
                        dest="debug_mode")
    args = parser.parse_args()

    # Input path management: when the given path does not exist, fall back to
    # a file of the same base name in the current working directory.
    currentPath = os.getcwd()
    if os.path.exists(args.input):
        filepath = args.input
    else:
        input_filename = os.path.split(args.input)[1]
        filepath = os.path.join(currentPath, input_filename)

    # Open the file with the reader.
    reader = PdfFileReader(filepath)
    if args.password is not None:
        reader.decrypt(args.password)

    colorWriter = PdfColorConverter(debug=args.debug_mode)
    colorWriter.appendPagesFromReader(reader)

    # Extract the colours.
    from_color = RGBColor(*hexStringToRGB(args.from_color))
    to_color = RGBColor(*hexStringToRGB(args.to_color))

    # Perform the colour swaps.
    if args.pages is not None:
        for page in args.pages:
            if page > (colorWriter.getNumPages() - 1):
                # parser.error() prints the message and exits the program.
                parser.error("page index to high: %d" % page)
            colorWriter.swapColor(page, from_color, to_color)
    else:
        for page in range(colorWriter.getNumPages()):
            colorWriter.swapColor(page, from_color, to_color)

    # Save the output PDF.
    outputDir = currentPath if args.outputDir is None else args.outputDir
    path = os.path.join(outputDir, args.output)
    with open(path, "wb") as outputStream:
        colorWriter.write(outputStream)

    return 0