Пример #1
0
def RemovePdfOwnerPassword(inputname, outputname):
    '''
    Copy every page of an encrypted PDF into a new PDF file.

    The input is decrypted with the empty password, its document info is
    printed, and all of its pages are written to ``outputname``.

    :param inputname: path of the encrypted PDF to read
    :param outputname: path of the PDF file to create
    :return: 0 on success, -1 when the input PDF is not encrypted
    :raises KeyError: any KeyError raised by ``decrypt`` other than the
        one for a missing ``/Encrypt`` entry is re-raised unchanged
    '''
    # Both files are managed by ``with`` so they are closed on the early
    # return and on any exception. The input stays open until the write
    # has completed (assumption: the writer may still read page data from
    # the input stream while writing - confirm against the PDF library).
    with open(inputname, 'rb') as inputfile:
        wrt = PdfFileWriter()
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # KeyError has no ``message`` attribute on Python 3; ``args``
            # carries the missing key on both Python 2 and Python 3.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise
        print(ipt.getDocumentInfo())
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
Пример #2
0
def metadata(path: Path) -> Metadata:
    """
    Reads a given PDF file and produces a Metadata object.

    The title is the longest of three candidates: the title stored in the
    PDF's document info, the title guessed by pdftitle, and the file name
    without its extension. Diagnostic output is written with
    ``typer.echo(..., err=True)``.

    :param path: path to a PDF file
    :return: the metadata extracted from the PDF file
    """
    with path.open('rb') as f:
        reader = PdfFileReader(f)
        info = reader.getDocumentInfo()
        page_count = reader.getNumPages()
        # getDocumentInfo() can return None when the PDF has no info
        # dictionary, so guard the attribute access. The fields are read
        # while the file is still open in case the reader resolves them
        # lazily from the stream.
        info_title = info.title if info is not None else None
        info_author = info.author if info is not None else None

    typer.echo(f'PDF metadata: {info}', err=True)

    # Decide which possible title to use:
    # - the title annotated in the PDF metadata
    # - the title read by pdftitle (largest text on the first page)
    # - the file name without extension
    pdftitle_title = pdftitle.get_title_from_file(str(path))
    typer.echo(f'Title according to pdftitle: {pdftitle_title}', err=True)

    # path.stem is always a string, so the candidate list is never empty.
    title_candidates = [
        t for t in (info_title, pdftitle_title, path.stem) if t is not None
    ]

    # The current heuristic is just to use the longest of the candidates
    title = max(title_candidates, key=len)

    return Metadata(
        title=title,
        author=info_author,
        page_count=page_count
    )
Пример #3
0
 def __init__(self, path):
     """
     Remember ``path`` and try to read the PDF's document info and page count.

     ``meta_data`` and ``pages`` keep their ``None`` defaults for whatever
     could not be read; errors are printed instead of being raised.

     :param path: path of the PDF file to inspect
     """
     self.path = path
     self.pages = None
     self.meta_data = None
     try:
         with open(self.path, 'rb') as fp:
             pdf = PdfFileReader(fp)
             self.meta_data = pdf.getDocumentInfo()
             self.pages = pdf.getNumPages()
     except (IOError, TypeError) as e:
         print(e)
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt and SystemExit still propagate.
         print("Unexpected error:", sys.exc_info()[0])
Пример #4
0
 def __init__(self, path):
     """
     Store ``path`` and attempt to load the PDF's document info and page count.

     On failure the error is printed and the attributes that were not
     reached stay ``None``; nothing is raised to the caller.

     :param path: path of the PDF file to inspect
     """
     self.path = path
     self.pages = None
     self.meta_data = None
     try:
         with open(self.path, 'rb') as fp:
             pdf = PdfFileReader(fp)
             self.meta_data = pdf.getDocumentInfo()
             self.pages = pdf.getNumPages()
     except (IOError, TypeError) as e:
         print(e)
     except Exception:
         # Catch ``Exception`` only: a bare ``except`` would also swallow
         # KeyboardInterrupt and SystemExit.
         print("Unexpected error:", sys.exc_info()[0])
Пример #5
0
def getPDFInformation(filepath):
    """
    Read information of pdf file (title, author)

    A missing title falls back to the file name taken from ``filepath``;
    a missing author falls back to the empty string.

    :param filepath: filepath to pdf file
    :return: dict with title and author information
    """
    import os

    with open(filepath, 'rb') as f:
        reader = PdfFileReader(f)
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that the same as "no metadata entries".
        info = reader.getDocumentInfo() or {}
        if "/Title" in info:
            title = info['/Title']
        else:
            # basename() handles the platform's path separators, not only '/'.
            title = os.path.basename(filepath)
        author = info['/Author'] if "/Author" in info else ""
        return {'title': title, 'author': author}
Пример #6
0
def getPDFInformation(filepath):
    """
    Read information of pdf file (title, author)

    When the PDF has no title entry the file name of ``filepath`` is used
    instead; when it has no author entry the author is the empty string.

    :param filepath: filepath to pdf file
    :return: dict with title and author information
    """
    import os

    with open(filepath, 'rb') as f:
        reader = PdfFileReader(f)
        # A PDF without an info dictionary makes getDocumentInfo() return
        # None; fall back to an empty mapping so the lookups below work.
        info = reader.getDocumentInfo() or {}
        ret = {}
        if "/Title" in info:
            ret['title'] = info['/Title']
        else:
            # Portable replacement for splitting the path on '/'.
            ret['title'] = os.path.basename(filepath)
        if "/Author" in info:
            ret['author'] = info['/Author']
        else:
            ret['author'] = ""
        return ret
Пример #7
0
 def processFile(self, curr_file):
     """
     Extract metadata from one PDF and append a row to ``self.container``.

     The appended row is
     ``[" | " + file name, created, author, producer, modded, last_saved]``
     where every value that could not be determined is ``'-'``. The method
     is interactive: it pauses on ``raw_input`` prompts (so it targets
     Python 2) and prints progress information.

     Nothing is appended when ``curr_file`` does not contain ``".pdf"``,
     when the PDF has no document info, or when parsing fails.

     :param curr_file: path of the file to process
     """
     author = '-'
     created = '-'
     producer = '-'
     modded = '-'
     last_saved = '-'
     if ".pdf" not in curr_file:
         return
     try:
         raw_input("Processing " + str(curr_file) + ".\nPress Enter to continue...")
         # ``with`` guarantees the handle is closed on every exit path,
         # including the early return and the exception handlers below.
         with open(curr_file, 'rb') as pdf_stream:
             pdfFile = PdfFileReader(pdf_stream)
             if pdfFile.getIsEncrypted():
                 print("File is encrypted (maybe, this sometimes has false positives). Trying to decrypt.")
                 pdfFile.decrypt('')
                 print("Success! File decrypted.")
             docInfo = pdfFile.getDocumentInfo()
             if not docInfo:
                 return
             print(" The RAW document information")
             for section in docInfo:
                 print(docInfo[section])
             raw_input("Press Enter to continue...")
             # looks at the entire dictionary to parse for information
             if "/CreationDate" in docInfo:
                 print(" Processing CREATION DATE information")
                 print(" Creation Date RAW: " + docInfo["/CreationDate"])
                 # strip() removes any of the characters D : | ' from both
                 # ends, leaving the digits of the date at the front.
                 data = docInfo["/CreationDate"].strip("D:|'")
                 year = data[0:4]
                 date = data[4:6] + "/" + data[6:8]
                 created_time = data[8:10] + ":" + data[10:12]
                 print(" The value of 'created_time' is: " + str(created_time))
                 print(" The data type is: " + str(type(created_time)))
                 print(" Expecting H:M format")
                 raw_input(" Does it match?")
                 # Convert the 24-hour "HH:MM" into 12-hour "HH:MM AM/PM".
                 created_time = time.strftime("%I:%M %p", time.strptime(created_time, "%H:%M"))
                 created = date + "/" + year + " " + created_time
             if "/Author" in docInfo:
                 print(" Processing AUTHOR information")
                 author = docInfo["/Author"] + " "
                 # Only the appended space is left: the author was empty.
                 if len(author) <= 1:
                     author = "-"
             if "/Producer" in docInfo:
                 print(" Processing PRODUCER information")
                 # Remove the literal "(Windows)" marker. str.strip() would
                 # treat the argument as a set of characters and eat
                 # unrelated leading/trailing letters such as the final
                 # "d" of "Word".
                 producer = docInfo["/Producer"].replace("(Windows)", "")
                 producer = re.sub(r'[^\w]', ' ', producer)
                 if len(producer) == 0:
                     producer = "-"
                 # Collapse runs of spaces left behind by the substitution.
                 while "  " in producer:
                     producer = producer.replace("  ", " ")
             if "/ModDate" in docInfo:
                 print(" Processing MODIFIED DATE information")
                 data = docInfo["/ModDate"].strip("D:|'")
                 year = data[0:4]
                 date = data[4:6] + "/" + data[6:8]
                 modded_time = data[8:10] + ":" + data[10:12]
                 modded_time = time.strftime("%I:%M %p", time.strptime(modded_time, "%H:%M"))
                 modded = date + "/" + year + " " + modded_time
         # strips '/' off file name (if it includes directory name)
         if "/" in curr_file:
             curr_file = curr_file[curr_file.rfind("/") + 1:]
         if "\\" in curr_file:
             curr_file = curr_file.replace("\\", "")
         # trim information if it's too long
         if len(curr_file) > 15:  # trims file name
             curr_file = curr_file[:15] + "..." + curr_file[-13:]
         if len(producer) > 30:
             producer = producer[:20] + " [snipped] "
         if len(author) > 20:
             author = author[:20] + " [snipped] "
         # appends each piece of information. output will show ONLY if at least ONE file has data in a column
         self.container.append([" | " + curr_file, created, author, producer, modded, last_saved])
         print("Parsing Completed.")
     except NotImplementedError:
         print("Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually.")
     except Exception:
         print("Parsing failed somewhere in the TRY statement.")
         return
Пример #8
0
    def getPdfMetadata(self, path=None):
        '''
        Read the metadata of the PDF at ``path`` into ``self.book``.

        Fills in the page count, tag (from the PDF subject), book name,
        publisher, publication date, creation timestamp, file size and
        author list. Reading is best effort: a failure while reading the
        PDF is logged and the remaining fields are still set, with the
        author falling back to 'Unknown'. Nothing happens when ``path``
        is empty or None. The method returns None.

        :param path: path of the PDF file to inspect
        '''
        logger.debug('getPdfMetadata path: %s', path)

        if not path:
            return

        author_name = 'Unknown'
        try:
            # A single reader on a single handle; ``with`` closes the file
            # once every field has been read from it.
            with open(path, "rb") as pdf_file:
                pdf_toread = PdfFileReader(pdf_file)
                logger.debug('getIsEncrypted : %s ', pdf_toread.getIsEncrypted())
                if pdf_toread.isEncrypted:
                    try:
                        pdf_toread.decrypt('')
                    except Exception as e:
                        logger.error(e, exc_info=True)
                author_name = self._applyPdfInfo(pdf_toread)
        except Exception as e:
            logger.error(e, exc_info=True)

        self.book.createdOn = datetime.now()

        file_size = Util().convert_bytes(os.path.getsize(path))
        logger.debug(file_size)
        self.book.fileSize = file_size

        author = Author()
        author.authorName = author_name
        self.book.authors = [author]

    def _applyPdfInfo(self, pdf_toread):
        '''
        Copy page count and document-info fields from an open reader into
        ``self.book``; each field is attempted independently and failures
        are logged.

        :param pdf_toread: reader whose underlying file is still open
        :return: the author name from the document info, or 'Unknown'
        '''
        pdf_info = None
        try:
            pdf_info = pdf_toread.getDocumentInfo()
            page_count = pdf_toread.getNumPages()
            logger.debug('NumPages:%s', page_count)
            self.book.numberOfPages = page_count
            subject = pdf_info.subject
            # isinstance() also accepts str subclasses, which an exact
            # ``type(...) == str`` comparison would reject.
            if subject and isinstance(subject, str):
                if self.book.tag:
                    self.book.tag = self.book.tag + subject
                else:
                    self.book.tag = subject
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            if pdf_info.title is not None and pdf_info.title.strip() != '':
                self.book.bookName = str(pdf_info.title)
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            if pdf_info.creator:
                # Convert the text directly: str() of an encoded bytes
                # object would store its "b'...'" repr on Python 3.
                self.book.publisher = str(pdf_info.creator)
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            # Characters 2..9 of the raw value are parsed as YYYYMMDD
            # (presumably the value starts with a "D:" prefix - confirm).
            date = datetime.strptime(
                str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
            self.book.publishedOn = date
        except Exception as e:
            logger.error(e, exc_info=True)
            logger.error('CreationDate not found')

        try:
            if pdf_info.author is not None and pdf_info.author.strip() != '':
                return pdf_info.author
        except Exception as e:
            logger.error(e, exc_info=True)
        return 'Unknown'
Пример #9
0
    def getPdfMetadata(self, path=None):
        '''
        Read the metadata of the PDF at ``path`` into ``self.book``.

        Fills in the page count, tag (from the PDF subject), book name,
        publisher, publication date, creation timestamp, file size and
        author list, printing progress along the way. Each PDF field is
        best effort: a failure prints a traceback and the remaining fields
        are still processed, with the author falling back to 'Unknown'.
        Nothing besides printing ``path`` happens when ``path`` is empty
        or None. The method returns None.

        Uses the ``unicode`` builtin, so it targets Python 2.

        :param path: path of the PDF file to inspect
        '''

        print(path)

        if not path:
            return

        pdf_info = None
        pdf_toread = None
        pdf_file = None
        val = 'Unknown'
        try:
            try:
                # One reader on one handle; the handle is closed in the
                # ``finally`` below once all fields have been read.
                pdf_file = open(path, "rb")
                pdf_toread = PdfFileReader(pdf_file)
                print('getPdfMetadata %s' % (pdf_toread.getIsEncrypted(),))
                if pdf_toread.isEncrypted:
                    try:
                        pdf_toread.decrypt('')
                    except Exception:
                        traceback.print_exc()
            except Exception:
                traceback.print_exc()

            try:
                pdf_info = pdf_toread.getDocumentInfo()
                page_count = pdf_toread.getNumPages()
                print('Pages: %s' % (page_count,))
                self.book.numberOfPages = page_count
                subject = pdf_info.subject
                # A missing subject must not be converted: unicode(None)
                # would yield the literal text "None" as the tag.
                if subject is not None:
                    if isinstance(subject, str):
                        # Ignore errors even if the string is not proper
                        # UTF-8 or has broken marker bytes.
                        value = unicode(subject, "utf-8", errors="ignore")
                    else:
                        # Assume the value object has proper __unicode__() method
                        value = unicode(subject)
                        print('else')
                    if not self.book.tag:
                        self.book.tag = value
                    else:
                        self.book.tag = self.book.tag + '' + value
            except Exception:
                traceback.print_exc()

            try:
                if pdf_info.title is not None and pdf_info.title.strip() != '':
                    self.book.bookName = str(pdf_info.title)
            except Exception:
                # print_exc() returns None, so it is called on its own
                # instead of being passed to print.
                print('unable to set bookName')
                traceback.print_exc()

            try:
                if pdf_info.creator:
                    self.book.publisher = str(pdf_info.creator.encode('utf-8'))
            except Exception:
                traceback.print_exc()

            self.book.createdOn = datetime.now()
            try:
                # Characters 2..9 of the raw value are parsed as YYYYMMDD
                # (presumably the value starts with a "D:" prefix - confirm).
                raw_date = str(pdf_info['/CreationDate'])[2:10]
                print(raw_date)
                self.book.publishedOn = datetime.strptime(raw_date, '%Y%m%d')
            except Exception:
                print('CreationDate not found')

            try:
                if pdf_info.author is not None and pdf_info.author.strip() != '':
                    val = pdf_info.author.encode("utf8", "ignore")
            except Exception:
                traceback.print_exc()
        finally:
            if pdf_file is not None:
                pdf_file.close()

        print(path)
        file_size = Util().convert_bytes(os.path.getsize(path))
        print(file_size)
        self.book.fileSize = file_size

        author = Author()
        author.authorName = val
        self.book.authors = [author]