Пример #1
0
    def getPdfMetadata(self, path=None):
        '''
        Read the metadata of the PDF at ``path`` and copy it onto ``self.book``.

        Sets ``numberOfPages``, ``tag`` (from the subject), ``bookName``
        (title), ``publisher`` (creator), ``publishedOn`` (creation date),
        ``createdOn`` (current time), ``fileSize`` and ``authors``.  Every
        PDF field is extracted on a best-effort basis: a failure is logged
        and the remaining fields are still processed.  Nothing happens when
        ``path`` is falsy.

        :param path: location of the PDF file to inspect.
        :return: None; the results are stored on ``self.book``.
        '''
        logger.debug('getPdfMetadata path: %s', path)

        if not path:
            return

        pdf_file = None
        pdf_toread = None
        pdf_info = None
        try:
            # Open the file a single time and keep it open while metadata is
            # read (the reader may still need the stream when values are
            # fetched - TODO confirm); the finally clause below closes it.
            try:
                pdf_file = open(path, "rb")
                pdf_toread = PdfFileReader(pdf_file)
                is_encrypted = pdf_toread.isEncrypted
                logger.debug('getIsEncrypted : %s ', is_encrypted)
                if is_encrypted:
                    try:
                        # Try the empty password before giving up.
                        pdf_toread.decrypt('')
                    except Exception as e:
                        logger.error(e, exc_info=True)
            except Exception as e:
                logger.error(e, exc_info=True)

            if pdf_toread is not None:
                try:
                    pdf_info = pdf_toread.getDocumentInfo()
                except Exception as e:
                    logger.error(e, exc_info=True)

                # Kept separate from the info lookup so that a page-count
                # failure does not prevent the other fields from being set.
                try:
                    page_count = pdf_toread.getNumPages()
                    logger.debug('NumPages:%s', page_count)
                    self.book.numberOfPages = page_count
                except Exception as e:
                    logger.error(e, exc_info=True)

            if pdf_info is not None:
                try:
                    subject = pdf_info.subject
                    # isinstance() instead of an exact type comparison so
                    # that str subclasses are accepted as well.
                    if subject and isinstance(subject, str):
                        if self.book.tag:
                            self.book.tag = self.book.tag + subject
                        else:
                            self.book.tag = subject
                except Exception as e:
                    logger.error(e, exc_info=True)

                try:
                    title = pdf_info.title
                    if title is not None and title.strip() != '':
                        self.book.bookName = str(title)
                except Exception as e:
                    logger.error(e, exc_info=True)

                try:
                    creator = pdf_info.creator
                    if creator:
                        # Converting the text directly; str() of encoded
                        # bytes would store the "b'...'" representation.
                        self.book.publisher = str(creator)
                except Exception as e:
                    logger.error(e, exc_info=True)

            self.book.createdOn = datetime.now()

            try:
                # Characters 2-9 are taken as YYYYMMDD, i.e. the value is
                # assumed to start with a two-character prefix such as "D:".
                date = datetime.strptime(
                    str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
                self.book.publishedOn = date
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.error('CreationDate not found')

            # Not wrapped in try/except: an unreadable file size propagates
            # to the caller.
            file_size = Util().convert_bytes(os.path.getsize(path))
            logger.debug(file_size)
            self.book.fileSize = file_size

            author = Author()
            val = 'Unknown'
            if pdf_info is not None:
                try:
                    author_name = pdf_info.author
                    if author_name is not None and author_name.strip() != '':
                        val = author_name
                except Exception as e:
                    logger.error(e, exc_info=True)
            author.authorName = val

            self.book.authors = [author]
        finally:
            if pdf_file is not None:
                pdf_file.close()
Пример #2
0
 def processFile(self, curr_file):
    """
    Extract metadata from one PDF and append a row to self.container.

    The row is [" | " + file name, created, author, producer, modified,
    last_saved]; a column that could not be determined holds "-".  The
    method is interactive: it pauses on raw_input() prompts and prints
    progress information.  Files whose name does not contain ".pdf" are
    ignored.  Parsing errors are reported on stdout and no row is added.

    curr_file -- path of the file to process.
    """
    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'
    if ".pdf" not in curr_file:
       return
    try:
       raw_input("Processing " + str(curr_file) + ".\nPress Enter to continue...")
       # The context manager closes the file on every exit path, including
       # the early returns and the exception handlers below.
       with open(curr_file, 'rb') as pdf_stream:
          pdfFile = PdfFileReader(pdf_stream)
          if pdfFile.getIsEncrypted():
             print("File is encrypted (maybe, this sometimes has false positives). Trying to decrypt.")
             # A rejected password is reported through a falsy return value,
             # not an exception (per the PyPDF2 decrypt() documentation).
             if not pdfFile.decrypt(''):
                print("Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually.")
                return
             print("Success! File decrypted.")
          docInfo = pdfFile.getDocumentInfo()
          if not docInfo:
             return
          print(" The RAW document information")
          for section in docInfo:
             print(docInfo[section])
          raw_input("Press Enter to continue...")
          # looks at the entire dictionary to parse for information
          if "/CreationDate" in docInfo:
             print(" Processing CREATION DATE information")
             raw_created = docInfo["/CreationDate"]
             print(" Creation Date RAW: " + raw_created)
             # strip() removes the characters D : | ' from both ends
             data = raw_created.strip("D:|'")
             year = data[0:4]
             date = data[4:6] + "/" + data[6:8]
             created_time = data[8:10] + ":" + data[10:12]
             print(" The value of 'created_time' is: " + str(created_time))
             print(" The data type is: " + str(type(created_time)))
             print(" Expecting H:M format")
             raw_input(" Does it match?")
             # 24-hour HH:MM -> 12-hour clock with AM/PM
             created_time = time.strftime("%I:%M %p", time.strptime(created_time, "%H:%M"))
             created = date + "/" + year + " " + created_time
          if "/Author" in docInfo:
             print(" Processing AUTHOR information")
             author = docInfo["/Author"] + " "
             if len(author) <= 1:
                author = "-"
          if "/Producer" in docInfo:
             print(" Processing PRODUCER information")
             # replace() drops the literal "(Windows)" marker; strip() would
             # treat the argument as a character set and eat letters such
             # as W, i, n, d, o, s from both ends of the name.
             producer = docInfo["/Producer"].replace("(Windows)", "")
             producer = re.sub(r'[^\w]', ' ', producer)
             # collapse the runs of spaces left by the substitution
             producer = re.sub(r' {2,}', ' ', producer)
             # checked after collapsing so an all-blank value becomes "-"
             if not producer.strip():
                producer = "-"
          if "/ModDate" in docInfo:
             print(" Processing MODIFIED DATE information")
             data = docInfo["/ModDate"].strip("D:|'")
             year = data[0:4]
             date = data[4:6] + "/" + data[6:8]
             modded_time = data[8:10] + ":" + data[10:12]
             modded_time = time.strftime("%I:%M %p", time.strptime(modded_time, "%H:%M"))
             modded = date + "/" + year + " " + modded_time
       # strips '/' off file name (if it includes directory name)
       if "/" in curr_file:
          curr_file = curr_file[curr_file.rfind("/")+1:]
       if "\\" in curr_file:
          curr_file = curr_file.replace("\\","")
       # trim information if it's too long
       # Only names longer than the trimmed form (15 + 3 + 13 characters)
       # are shortened; otherwise head and tail would overlap and the
       # "trimmed" name would come out longer than the original.
       if len(curr_file) > 31:
          curr_file = curr_file[:15] + "..." + curr_file[-13:]
       if len(producer) > 30:
          producer = producer[:20] + " [snipped] "
       if len(author) > 20:
          author = author[:20] + " [snipped] "
       # appends each piece of information. output will show ONLY if at least ONE file has data in a column
       self.container.append([" | " + curr_file,created,author,producer,modded,last_saved])
       print("Parsing Completed.")
    except NotImplementedError:
       print("Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually.")
    except Exception:
       print("Parsing failed somewhere in the TRY statement.")
       return
Пример #3
0
    def getPdfMetadata(self, path=None):
        '''
        Read the metadata of the PDF at ``path`` and copy it onto ``self.book``.

        Sets ``numberOfPages``, ``tag`` (from the subject), ``bookName``
        (title), ``publisher`` (creator), ``publishedOn`` (creation date),
        ``createdOn`` (current time), ``fileSize`` and ``authors``.  Every
        PDF field is extracted on a best-effort basis: a failure prints a
        traceback and the remaining fields are still processed.  Only the
        path is printed when ``path`` is falsy.

        :param path: location of the PDF file to inspect.
        :return: None; the results are stored on ``self.book``.
        '''

        print(path)

        if not path:
            return

        pdf_file = None
        pdf_toread = None
        pdf_info = None
        try:
            # Open the file a single time and keep it open while metadata is
            # read (the reader may still need the stream when values are
            # fetched - TODO confirm); the finally clause below closes it.
            try:
                pdf_file = open(path, "rb")
                pdf_toread = PdfFileReader(pdf_file)
                is_encrypted = pdf_toread.isEncrypted
                print('getPdfMetadata %s' % (is_encrypted,))
                if is_encrypted:
                    try:
                        # Try the empty password before giving up.
                        pdf_toread.decrypt('')
                    except Exception:
                        traceback.print_exc()
            except Exception:
                traceback.print_exc()

            if pdf_toread is not None:
                try:
                    pdf_info = pdf_toread.getDocumentInfo()
                except Exception:
                    traceback.print_exc()

                # Kept separate from the info lookup so that a page-count
                # failure does not prevent the other fields from being set.
                try:
                    page_count = pdf_toread.getNumPages()
                    print('Pages: %s' % (page_count,))
                    self.book.numberOfPages = page_count
                except Exception:
                    traceback.print_exc()

            if pdf_info is not None:
                try:
                    subject = pdf_info.subject
                    # A missing subject is skipped; converting None would
                    # append the literal text "None" to the tag.
                    if subject is not None:
                        if isinstance(subject, str):
                            # Byte string: decode as UTF-8 and drop any
                            # bytes that are not valid.
                            value = unicode(subject, "utf-8", errors="ignore")
                        else:
                            # Assume the value object has proper __unicode__() method
                            value = unicode(subject)
                        if not self.book.tag:
                            self.book.tag = value
                        else:
                            self.book.tag = self.book.tag + value
                except Exception:
                    traceback.print_exc()

                try:
                    title = pdf_info.title
                    if title is not None and title.strip() != '':
                        # Encoded like the publisher below so that non-ASCII
                        # titles do not raise during conversion.
                        self.book.bookName = str(title.encode('utf-8'))
                except Exception:
                    print('unable to set bookName')
                    traceback.print_exc()

                try:
                    creator = pdf_info.creator
                    if creator:
                        self.book.publisher = str(creator.encode('utf-8'))
                except Exception:
                    traceback.print_exc()

            self.book.createdOn = datetime.now()

            try:
                # Characters 2-9 are taken as YYYYMMDD, i.e. the value is
                # assumed to start with a two-character prefix such as "D:".
                date_text = str(pdf_info['/CreationDate'])[2:10]
                print(date_text)
                self.book.publishedOn = datetime.strptime(date_text, '%Y%m%d')
            except Exception:
                print('CreationDate not found')

            print(path)
            # Not wrapped in try/except: an unreadable file size propagates
            # to the caller.
            file_size = Util().convert_bytes(os.path.getsize(path))
            print(file_size)
            self.book.fileSize = file_size

            author = Author()
            val = 'Unknown'
            if pdf_info is not None:
                try:
                    author_name = pdf_info.author
                    if author_name is not None and author_name.strip() != '':
                        val = author_name.encode("utf8", "ignore")
                except Exception:
                    traceback.print_exc()
            author.authorName = val

            self.book.authors = [author]
        finally:
            if pdf_file is not None:
                pdf_file.close()