Пример #1
0
    def test_read_pdf(self):
        """Read the sample PDF stored next to this test and check its first page.

        Logs the title, the page count and the contents, keys, annotations
        and items of the first page, then verifies the extracted text.
        """
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        here = os.path.split(__file__)[0]
        pdffile = os.path.join(here, "data", "1305.0445.pdf")
        assert os.path.exists(pdffile)

        with open(pdffile, "rb") as stream:
            reader = PdfFileReader(stream)
            title = reader.getDocumentInfo().title
            traw = reader.getDocumentInfo().title_raw
            npage = reader.getNumPages()
            fLOG("title", title, "*", traw)
            fLOG("nb pages", npage)

            first_page = reader.getPage(0)
            fLOG("cont", first_page.getContents())
            for key in first_page:
                fLOG("obj", key, "*", key.title())
            for annotation in first_page.raw_get("/Annots"):
                fLOG("annot", annotation, dir(annotation))
            for entry in first_page.items():
                fLOG("item", entry)

            text = first_page.extractText()
            fLOG("text---", text)
            assert " " in text
            assert "\n" in text
            # Raise with the whole text so a failure shows what was extracted.
            if "algorithms: their inability" not in text:
                raise Exception(text)
Пример #2
0
    def get_pdf_text(self, response):
        """ Peek inside PDF to check possible violations.

        The document title (when the PDF has one) comes first, followed by
        the extracted text of every page in order.

        @param response: object whose ``body`` attribute holds the PDF data
        @return: PDF content as searchable plain-text string
        @raise ImportError: when pyPdf is not installed
        """

        try:
            from pyPdf import PdfFileReader
        except ImportError:
            print("Needed: easy_install pyPdf")
            raise

        reader = PdfFileReader(StringIO.StringIO(response.body))

        parts = []

        # The whole information dictionary is optional and so is the title;
        # either may be None.
        info = reader.getDocumentInfo()
        if info is not None and info.title:
            parts.append(info.title)

        for page in reader.pages:
            # XXX: Does handle unicode properly?
            parts.append(page.extractText())

        return u"".join(parts)
Пример #3
0
def add(request):
    """
    Upload a document.

    On a valid POST the document is saved for the requesting user and the
    client is redirected to the edit page of the new document. Title and
    author are pre-filled from the PDF metadata; when the metadata cannot
    be read, or a field is missing or empty, a placeholder is stored instead.
    Any other request renders the upload form.
    """

    if request.method == "POST":

        form = AddDocumentForm(request.POST, request.FILES)
        if form.is_valid():
            document = form.save(commit=False)
            document.user = request.user

            try:
                from pyPdf import PdfFileReader
                info = PdfFileReader(document.file).getDocumentInfo()
                # getDocumentInfo() is None for a PDF without metadata.
                if info is not None:
                    title, author = info.title, info.author
                else:
                    title = author = None
            except Exception:
                # Metadata is a convenience only: a missing pyPdf or an
                # unreadable file must not block the upload.
                title = author = None

            document.title = title or "( Insert title )"
            document.author = author or "( Insert author )"

            document.save()
            return HttpResponseRedirect('/documents/edit/' + str(document.id))
    else:
        form = AddDocumentForm()

    context = {
        'form': form,
    }
    return render_to_response('add.html', context,
                              context_instance=RequestContext(request))
Пример #4
0
 def pre_save_handler(sender, instance, **kwargs):
     """Copy page count and document metadata from ``instance.pdf_file`` onto ``instance``.

     Sets ``num_pages``, ``title``, ``author`` and ``info`` and prints the title.
     """
     reader = PdfFileReader(instance.pdf_file)
     instance.num_pages = reader.numPages

     metadata = reader.getDocumentInfo()
     instance.title = metadata.title
     instance.author = metadata.author
     instance.info = metadata
     print("title = %s" % (metadata.title,))
Пример #5
0
def PDFInfo(inputfiles):
    """Print useful information about each PDF file, followed by the totals.

    inputfiles -- iterable of paths to PDF files

    For every file the title, size in KBytes, the two timestamps from
    os.stat, page count, author, creator and producer are printed; the
    summed page count and size are printed at the end.
    """
    totalpagenum = 0
    totalfilesize = 0
    for inputfile in inputfiles:
        # The with-block closes the file once its metadata has been read.
        with open(inputfile, "rb") as stream:
            reader = PdfFileReader(stream)
            fileinfo = os.stat(inputfile)
            filesizekb = fileinfo.st_size // 1024
            pagenum = reader.getNumPages()
            # getDocumentInfo() is None for a PDF without metadata; every
            # field is then reported as None, like a missing single field.
            docinfo = reader.getDocumentInfo()
            title = getattr(docinfo, "title", None)
            author = getattr(docinfo, "author", None)
            creator = getattr(docinfo, "creator", None)
            producer = getattr(docinfo, "producer", None)
        print("\n\n")
        print(inputfile)
        print("\n")
        print("\tTitel:\t\t %s" % (title,))
        print("\tGroesse (KBytes):\t\t %s" % (filesizekb,))
        # Index 8 / 9 of the stat result: st_mtime / st_ctime as integers.
        print("\tzuletzt geaendert:\t\t %s" % (_formatDate(fileinfo[8]),))
        print("\terstellt:\t\t %s" % (_formatDate(fileinfo[9]),))
        print("\tSeiten :\t\t %s" % (pagenum,))
        print("\tAutor :\t\t %s" % (author,))
        print("\tQuelldokument erstellt mit :\t\t %s" % (creator,))
        print("\tIn PDF konvertiert durch :\t\t %s" % (producer,))
        totalpagenum += pagenum
        totalfilesize += filesizekb
    print(80 * "-")
    print("GESAMTINFO:")
    print("Seiten insgesamt: %s" % (totalpagenum,))
    print("Dateigroesse insgesamt (kb): %s" % (totalfilesize,))
Пример #6
0
 def validate(self, path, ext):
     """Check that the file at ``path`` can be parsed as a PDF.

     Opens the file, reads the document information and walks every page.
     Returns ``(0, "")`` on success and ``(1, message)`` when anything
     raises. ``ext`` is accepted but not used.
     """
     try:
         with open(path, "rb") as handle:
             reader = PdfFileReader(handle)
             reader.getDocumentInfo()
             # Iterating is enough; the pages themselves are not needed.
             for _page in reader.pages:
                 pass
     except Exception as err:
         return (1, str(err))
     return (0, "")
Пример #7
0
def printMeta(fileName):
    """Print the title and every metadata entry of the PDF at ``fileName``.

    ``get_toc(fileName)`` is also called; its result is not used here.
    """
    # Everything is read inside the with-block: the reader pulls data from
    # the stream on demand, so the file must stay open until we are done.
    with open(fileName, "rb") as stream:
        pdfFile = PdfFileReader(stream)
        docInfo = pdfFile.getDocumentInfo()

        print("title = %s" % (getattr(docInfo, "title", None),))
        print("[*] PDF MetaData For: " + str(fileName))

        # NOTE(review): the return value was never used; the call is kept in
        # case get_toc has side effects -- confirm and drop it if not.
        get_toc(fileName)

        # docInfo is None when the PDF carries no metadata at all.
        for metaItem in docInfo or ():
            # %s formatting also copes with values that are not strings.
            print("[+]%s:%s" % (metaItem, docInfo[metaItem]))
Пример #8
0
def print_meta(file_name):
    """Print a highlighted header and every metadata entry of a PDF file."""
    # Read inside the with-block so the file is closed afterwards.
    with open(file_name, 'rb') as stream:
        info = PdfFileReader(stream).getDocumentInfo()
        print(Style.BRIGHT + Back.GREEN + 'PDF MetaData For: ' + str(file_name) + Style.RESET_ALL)

        # info is None when the PDF carries no metadata at all.
        for meta_item in info or ():
            # %s formatting also copes with values that are not strings.
            print('[+] %s:%s' % (meta_item, info[meta_item]))
Пример #9
0
def get_metadata(pdf):
    """Print the document-information dictionary of the PDF at path ``pdf``."""
    # The with-block closes the file once the metadata has been printed.
    with open(pdf, "rb") as stream:
        pdf_info = PdfFileReader(stream).getDocumentInfo()
        print(str(pdf_info))
def printMeta(fileName):
    """Print every metadata entry of the PDF at ``fileName``."""
    # Read inside the with-block so the file is closed afterwards.
    with open(fileName, 'rb') as stream:
        docInfo = PdfFileReader(stream).getDocumentInfo()
        print('[*] PDF MetaData For:' + str(fileName))
        # docInfo is None when the PDF carries no metadata at all.
        for metaItem in docInfo or ():
            # %s formatting also copes with values that are not strings.
            print('[+]%s:%s' % (metaItem, docInfo[metaItem]))
Пример #11
0
def main():
    """Print the metadata of the PDF named by the last command-line argument."""
    fileName = sys.argv[-1]
    # Read inside the with-block so the file is closed afterwards.
    with open(fileName, 'rb') as stream:
        info = PdfFileReader(stream).getDocumentInfo()
        print("The Metadata for the file " + fileName + " are: \n")
        # info is None when the PDF carries no metadata at all.
        for key in info or ():
            print("%s : %s" % (key, info[key]))
Пример #12
0
    def _getPDFText(self, filename, d):
        """Collect the text of every page of a PDF and record its metadata.

        Each entry of the document-information dictionary is passed to
        ``d.addConceptKeyType`` with the first character of its key removed.
        Page text that is not a ``str`` is NFKD-normalized and reduced to
        ASCII before it is stored.

        Returns a list with one string per page, each followed by ". ", or
        None when reading the PDF fails (the error is logged).
        """
        logger.debug(u"filename: %s" % filename)
        newparatextlist = list()

        try:
            with open(filename, u"rb") as stream:
                pdfDoc = PdfFileReader(stream)

                # A PDF without metadata yields None; the page text is
                # still worth extracting in that case.
                pdfDict = pdfDoc.getDocumentInfo()
                if pdfDict is not None:
                    for key in pdfDict.keys():
                        d.addConceptKeyType(key[1:], pdfDict[key])

                for page in pdfDoc.pages:
                    text = page.extractText()
                    if not isinstance(text, str):
                        # Keep the converted text; the original computed it
                        # and threw the result away.
                        text = unicodedata.normalize(u'NFKD', text).encode(u'ascii', u'ignore')

                    logger.debug(u"PDF : %s" % text)

                    newparatextlist.append(text + u". ")

            return newparatextlist

        except Exception as msg:
            logger.error(u"%s" % msg)
            return None
Пример #13
0
    def test_backlog_list(self):
        """Print all stories of a backlog as PDF, in A4 and in letter format."""
        user = factories.UserFactory.create(
            email='*****@*****.**', password='******')
        backlog = factories.create_project_sample_backlog(user)
        for _ in range(0, 10):
            factories.create_sample_story(user, backlog=backlog)
        # special printing of -1 points
        story = factories.UserStory.objects.all()[0]
        story.points = -1
        story.save()
        url_plus = "{0}?backlog_id={1}".format(
            reverse("print_stories"), backlog.pk)
        self.app.get(url_plus, status=302)

        def print_pdf(side, paper_format):
            """Submit the print form with every story ticked; return the parsed PDF."""
            response = self.app.get(url_plus, user=user)
            form = response.forms['print_pdf_form']
            for key, _field in form.fields.items():
                if key and "story-" in key:
                    form[key] = True
            form['print-side'] = side
            form['print-format'] = paper_format
            response = form.submit()
            self.assertEqual(response['Content-Type'], "application/pdf")
            pdf = PdfFileReader(StringIO.StringIO(response.content))
            info = pdf.getDocumentInfo()
            self.assertEqual(pdf.getNumPages(), 6)
            self.assertEqual("backlogman.com", info['/Author'])
            return pdf

        a4_pdf = print_pdf("long", "a4")
        # A4 is not "round" in PDF unit format real value are
        # approximately : [0, 0, 841.88980, 595.27560]
        self.assertEqual([0, 0, 841, 595],
                         [int(x) for x in a4_pdf.getPage(0)["/MediaBox"]])

        letter_pdf = print_pdf("short", "letter")
        self.assertEqual([0, 0, 792, 612], letter_pdf.getPage(0)["/MediaBox"])
Пример #14
0
def printMeta(filename):
    """Print every metadata entry of the PDF at ``filename``."""
    # Read inside the with-block so the file is closed afterwards.
    with open(filename, 'rb') as stream:
        docInfo = PdfFileReader(stream).getDocumentInfo()

        print('[+] PDF MetaData for : ' + str(filename))
        # docInfo is None when the PDF carries no metadata at all.
        for metaItem in docInfo or ():
            # %s formatting also copes with values that are not strings.
            print('[+]%s:%s' % (metaItem, docInfo[metaItem]))
Пример #15
0
def extractTitle(dirPath):
    """Map 1-based file positions to the PDF titles of the files in ``dirPath``.

    The file names come from ``listFileNames(dirPath)``. A file whose PDF
    has no title, or no metadata at all, maps to None.
    """
    fileTitles = {}
    for position, name in enumerate(listFileNames(dirPath), 1):
        # One file is open at a time; the with-block closes it again.
        with open("%s/%s" % (dirPath, name), 'rb') as stream:
            info = PdfFileReader(stream).getDocumentInfo()
            fileTitles[position] = info.title if info is not None else None
    return fileTitles
Пример #16
0
def get_name(filename):
    '''Build a new file name from the title and subject of a PDF.

    Returns "<title>.pdf", or "<title>_<subject>.pdf" when the PDF also has
    a subject ("/" in the subject becomes "-" and " " becomes "_").
    When the PDF has no title, ``filename`` is returned unchanged. When the
    file cannot be read, "NO CHANGES!" is printed and ``filename`` is
    returned unchanged as well.
    '''
    # Default first, so a failure below can no longer leave the result unset.
    new_name = filename
    try:
        with open(filename, "rb") as file_obj:
            info = PdfFileReader(file_obj).getDocumentInfo()
            title = info.title
            subject = info.subject
        if title:
            if subject:
                cleaned = str(subject).replace("/", "-").replace(" ", "_")
                new_name = "{0}_{1}.pdf".format(str(title), cleaned)
            else:
                new_name = "{0}.pdf".format(str(title))
    except Exception:
        print("NO CHANGES!")
    return new_name
Пример #17
0
 def _extract(self, pdfname):
     """Copy the metadata of the PDF at ``pdfname`` into ``self.metaData``.

     Keys are stored without their first character. When reading the
     metadata fails, the entry ``"Error"`` is set instead. Opening the file
     or constructing the reader may still raise. Returns ``self.metaData``.
     """
     # The with-block closes the file once the metadata has been copied.
     with open(pdfname, 'rb') as stream:
         pdf = PdfFileReader(stream)
         try:
             meta_info = pdf.getDocumentInfo()
             for meta_obj in meta_info:
                 self.metaData[meta_obj[1:]] = meta_info[meta_obj]
         except Exception:
             self.metaData["Error"] = "Ocurrio un error"
     return self.metaData
Пример #18
0
def pdf_name_extract(file):
    """Return the ``/Title`` metadata entry of a PDF file.

    ``file`` is a path. Returns None when the text after its last "." is
    not exactly "pdf", and '' when the title cannot be looked up.
    """
    if file.split('.')[-1] != 'pdf':
        return None
    # The with-block closes the file once the title has been read.
    with open(file, 'rb') as stream:
        info = PdfFileReader(stream).getDocumentInfo()
        try:
            return info['/Title']
        except Exception:
            # No title entry, or no metadata at all (info is None).
            return ''
Пример #19
0
    def slice(self, ifile, ofile=None, marginv=0, marginh=0, columnwidth=0, centerwidth=0, scale=0.9):
        """Turn every page of ``ifile`` into two pages, left column then right.

        ifile       -- path of the PDF to read
        ofile       -- path to write; defaults to ``PdfSlicer.getOutName(ifile)``
        marginv     -- amount cut from the top and bottom of each column
        marginh     -- amount cut from the outer left and right edges
        columnwidth -- width of one column; used only together with centerwidth
        centerwidth -- width of the gap between the columns
        scale       -- factor applied to both axes of every output page

        When columnwidth or centerwidth is 0 the page is split at half of its
        upper-right x coordinate. Prints the title and the progress.
        """
        output = PdfFileWriter()
        explicit_columns = columnwidth != 0 and centerwidth != 0

        # The source stays open until the output is written: the pages added
        # to the writer still refer to data of the source file.
        with open(ifile, "rb") as source:
            reader = PdfFileReader(source)
            # print the title of document1.pdf
            print("title = %s" % (getattr(reader.getDocumentInfo(), "title", None),))
            print("Processing page: ")
            for i in range(reader.getNumPages()):
                print(i + 1)
                # add left column as page
                page = PageObject.createBlankPage(reader)
                page.mergePage(reader.getPage(i))
                if explicit_columns:
                    page.mediaBox.upperRight = (
                        page.mediaBox.getUpperLeft_x() + marginh + columnwidth,
                        page.mediaBox.getUpperRight_y() - marginv
                    )
                else:
                    page.mediaBox.upperRight = (
                        page.mediaBox.getUpperRight_x() / 2,
                        page.mediaBox.getUpperRight_y() - marginv
                    )
                page.mediaBox.lowerLeft = (
                    page.mediaBox.getLowerLeft_x() + marginh,
                    page.mediaBox.getLowerLeft_y() + marginv,
                )
                page.scale(scale, scale)
                output.addPage(page)

                # add right column as page
                page = PageObject.createBlankPage(reader)
                page.mergePage(reader.getPage(i))
                if explicit_columns:
                    page.mediaBox.lowerLeft = (
                        page.mediaBox.getLowerLeft_x() + marginh + columnwidth + centerwidth,
                        page.mediaBox.getLowerLeft_y() + marginv,
                    )
                else:
                    page.mediaBox.lowerLeft = (
                        page.mediaBox.getUpperRight_x() / 2,
                        page.mediaBox.getLowerLeft_y() + marginv,
                    )
                page.mediaBox.upperRight = (
                    page.mediaBox.getUpperRight_x() - marginh,
                    page.mediaBox.getUpperRight_y() - marginv
                )
                page.scale(scale, scale)
                output.addPage(page)

            # finally, write "output"
            if ofile is not None:
                target = ofile
            else:
                target = PdfSlicer.getOutName(ifile)
            with open(target, "wb") as outputStream:
                output.write(outputStream)
Пример #20
0
def pdfgooglesearch():
    """Search Google for PDF files of ``domain`` and store their metadata.

    Submits the query "filetype:pdf site:<domain>", downloads every result
    link as "<domain><counter>.pdf", reads its document information and
    inserts one row (sid, file type, file name, metadata) per downloaded
    file into the ``metadata`` table, then commits.

    Relies on the module-level names ``domain``, ``sid``, ``executor`` and
    ``connection``.
    """
    filetype = "pdf"
    print("Searching google for files...")
    # set up browser
    browse = mechanize.Browser()
    cookiejar = cookielib.LWPCookieJar()
    browse.set_cookiejar(cookiejar)
    browse.set_handle_equiv(True)
    browse.set_handle_redirect(True)
    browse.set_handle_referer(True)
    browse.set_handle_robots(False)
    browse.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    browse.addheaders = [
        (
            "User-agent",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
        )
    ]

    browse.open("https://www.google.com")
    browse.select_form(nr=0)
    browse.form["q"] = "filetype:%s site:%s" % (filetype, domain)
    browse.submit()
    results = browse.response().read()
    soup = BeautifulSoup(results, "lxml")

    rows = []
    counter = 1
    for anchor in soup.find_all("a", href=re.compile("/url")):
        link = anchor.get("href")
        if not link.startswith("/url?q="):
            continue
        # Keep the target URL up to (not including) the file extension.
        link = link[len("/url?q="):].split("." + filetype)[0]
        try:
            filename = "%s%s.%s" % (domain, counter, filetype)
            downfile = browse.retrieve(str(link + "." + filetype), filename)[0]
            with open(downfile, "rb") as stream:
                pdf_info = PdfFileReader(stream).getDocumentInfo()
                rows.append((sid, str(filetype), str(filename), str(pdf_info)))
            counter += 1
        except Exception:
            # Best effort: a result that cannot be downloaded or parsed is
            # skipped and the next link is tried.
            continue

    # Insert once, after all links were handled. Doing this inside the link
    # loop re-inserted every row collected so far on each iteration.
    for row in rows:
        executor.execute("INSERT INTO metadata VALUES (?,?,?,?)", row)
    connection.commit()
def printMeta(fileName):
    """Print the metadata of a PDF, showing date entries in readable form.

    A value of the form "D:YYYYMMDDHHMMSS..." is printed as
    "<Month> DD, YYYY at HH:MM:SS"; every other value is printed as is.
    """
    # Read inside the with-block so the file is closed afterwards.
    with open(fileName, 'rb') as stream:
        docInfo = PdfFileReader(stream).getDocumentInfo()
        print('\n[*] PDF MetaData For: {0}\n'.format(str(fileName)))
        # docInfo is None when the PDF carries no metadata at all.
        for metaItem in docInfo or ():
            value = docInfo[metaItem]
            shown = value
            # Only a "D:" prefix followed by 14 digits is treated as a date;
            # checking just the first letter misread values such as a title
            # starting with "D" and crashed on int().
            if hasattr(value, 'startswith') and value.startswith("D:"):
                stamp = value[2:16]
                if len(stamp) == 14 and stamp.isdigit() and 1 <= int(stamp[4:6]) <= 12:
                    shown = (calendar.month_name[int(stamp[4:6])] + ' ' + stamp[6:8]
                             + ', ' + stamp[:4] + ' at ' + stamp[8:10] + ':'
                             + stamp[10:12] + ':' + stamp[12:14])
            print('[+] {0:20}{1}'.format(metaItem.replace('/', '') + ':', shown))
Пример #22
0
  def getContentInformation(self):
    """
    Returns the information about the PDF document with
    pdfinfo.

    A copy of self._content_information is returned when that
    attribute already exists. Otherwise the document data is written
    to a temporary file, the output of pdfinfo is parsed into a
    dictionary and entries read by pyPdf are added for keys that
    pdfinfo did not provide.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    # Use the stored result when there is one.
    try:
      return self._content_information.copy()
    except AttributeError:
      pass
    # pdfinfo works on a file, so the data goes to a temporary file first.
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)
    command_result = None
    # NOTE(review): no except/finally clause of this try is visible in this
    # part of the file.
    try:

      # First, we use pdfinfo to get standard metadata
      command = ['pdfinfo', '-meta', '-box', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        # ENOENT: the pdfinfo executable could not be found.
        if e.errno == errno.ENOENT:
          raise ConversionError('pdfinfo was not found')
        raise

      # Each line is split at the first ':' into key and value; later ':'
      # characters stay part of the value.
      result = {}
      for line in command_result.splitlines():
        item_list = line.split(':')
        key = item_list[0].strip()
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

      # Then we use pyPdf to get extra metadata
      try:
        from pyPdf import PdfFileReader
        from pyPdf.utils import PdfReadError
      except ImportError:
        # if pyPdf not found, pass
        pass
      else:
        try:
          pdf_file = PdfFileReader(tmp)
          for info_key, info_value in pdf_file.getDocumentInfo().iteritems():
            info_key = info_key.lstrip("/")
            if isinstance(info_value, unicode):
              info_value = info_value.encode("utf-8")
            # setdefault: values already found by pdfinfo are kept.
            result.setdefault(info_key, info_value)
        except PdfReadError:
          LOG("PDFDocument.getContentInformation", 0,
            "pyPdf is Unable to read PDF, probably corrupted PDF here : %s" % \
            (self.getRelativeUrl(),))
Пример #23
0
def printMeta(fileName):
    """Print every metadata entry of the PDF at ``fileName``.

    When the file cannot be read an error line is printed instead.
    """
    try:
        # Read inside the with-block so the file is closed afterwards.
        with open(fileName, 'rb') as stream:
            docInfo = PdfFileReader(stream).getDocumentInfo()
            print('[*] PDF MetaData For: ' + str(fileName))
            # docInfo is None when the PDF carries no metadata at all; that
            # is an empty listing, not a read error.
            for metaItem in docInfo or ():
                print('[+] %s:%s' % (metaItem, docInfo[metaItem]))
        print('\n\n\n\n\n')
    except Exception:
        print("[!] Error reading file ===================>\t" + fileName)
        print('\n\n\n\n\n\n')
Пример #24
0
def run(directory='/home/chris/Documents/Literature'):
    """Print the PDF title of every readable file in ``directory``.

    The process changes into ``directory`` first. Entries that cannot be
    opened or parsed as a PDF are skipped. Always returns 0.
    """
    os.chdir(directory)
    for fileName in os.listdir('.'):
        try:
            # The with-block closes each file once its title has been read.
            with open(fileName, "rb") as stream:
                title = PdfFileReader(stream).getDocumentInfo().title
            # print the title of the document
            print('##1 %s ##2 %s' % (fileName, title))
        except Exception:
            # Best effort: directories, other file types and damaged PDFs
            # are expected here and simply skipped.
            continue

    return 0
Пример #25
0
    def run(self):
        """Merge pages 0, 1 and 3 of ``self.infile`` onto one page of ``self.outfile``.

        The destination page size is derived from ``self.opts.size`` and
        ``self.opts.orientation``. The source pages are scaled so that
        ``self.opts.count`` pages fit the destination height and are stacked
        from top to bottom. Prints the total height and the scale factor.
        """
        def getSrcDim(srcPage):
            """Return (width, height) of the page's media box as floats."""
            return (float(srcPage.mediaBox.getWidth()),
                    float(srcPage.mediaBox.getHeight()))

        def getDestDim():
            """Return the configured size, swapped for landscape orientation."""
            # NOTE(review): returns None for any other orientation value.
            if self.opts.orientation == const.PORTRAIT:
                return self.opts.size
            elif self.opts.orientation == const.LANDSCAPE:
                return (self.opts.size[1], self.opts.size[0])

        def getScale(srcPage):
            """Return source width divided by destination width."""
            destWidth, destHeight = getDestDim()
            return (getSrcDim(srcPage)[const.WIDTH]/float(destWidth))

        def getScaledDestDim(srcPage):
            """Return the destination size multiplied by the integer part of the scale."""
            # NOTE(review): int() truncates the scale, so a source narrower
            # than the destination gives a factor of 0 -- confirm intended.
            return [x * int(getScale(srcPage)) for x in getDestDim()]

        # The input stays open until the output is written: the merged pages
        # still refer to data of the input file.
        with open(self.infile, "rb") as source:
            reader = PdfFileReader(source)
            writer = PdfFileWriter(
                documentInfo=reader.getDocumentInfo(), authors=["Vimala"])

            srcPage = reader.getPage(0)
            height = getSrcDim(srcPage)[const.HEIGHT]
            totalHeight = self.opts.count * height

            destPage = writer.addBlankPage(*getScaledDestDim(srcPage))

            print(totalHeight)
            fitScale = getScaledDestDim(srcPage)[const.HEIGHT] / float(totalHeight)
            print(fitScale)
            srcPage.scale(fitScale, fitScale)

            destPage.mergeTranslatedPage(srcPage, 0, height * 2 - .2 * height)

            srcPage = reader.getPage(1)
            srcPage.scale(fitScale, fitScale)
            destPage.mergeTranslatedPage(srcPage, 0, height - .1 * height)

            # NOTE(review): page index 3 follows index 1; index 2 is not
            # used -- confirm this is intended.
            srcPage = reader.getPage(3)
            srcPage.scale(fitScale, fitScale)
            destPage.mergeTranslatedPage(srcPage, 0, 0)

            with open(self.outfile, "wb") as target:
                writer.write(target)
Пример #26
0
def get_pdfinfo(pdf_file):
    """
    Obtains information from pdf_file

    Returns a dictionary with the keys "numPages" and "filepath" (absolute
    path) plus every entry of the document information, or None when the
    PDF is encrypted.
    """
    # The with-block closes the file on every path, including the early
    # return for encrypted documents.
    with open(pdf_file, "rb") as stream:
        pdf = PdfFileReader(stream)
        if pdf.isEncrypted:
            return None

        info = {}
        info["numPages"] = pdf.numPages
        info["filepath"] = os.path.abspath(pdf_file)

        # The information dictionary is None for a PDF without metadata.
        doc_info = pdf.getDocumentInfo()
        if doc_info is not None:
            # Copy entry by entry while the file is open, in case the reader
            # fetches a value from the stream only when it is looked up.
            for key in doc_info:
                info[key] = doc_info[key]
        return info
Пример #27
0
def _first_page_text(srcfile):
    """Return what ``pdf2txt -p 1`` writes to stdout for ``srcfile``."""
    # An argument list without a shell passes the file name through verbatim,
    # so spaces or shell metacharacters in it cannot alter the command.
    process = subprocess.Popen(["pdf2txt", "-p", "1", srcfile],
                               stdout=subprocess.PIPE)
    return process.communicate()[0]


def _longest_word(text):
    """Return the longest whitespace-separated word of ``text``, or ''."""
    words = text.split()
    return max(words, key=len) if words else ''


def PDFtara(srcfile):
    """Print one line identifying the PDF at ``srcfile``.

    Nothing happens unless the lower-cased name ends with "pdf". Otherwise
    exactly one of these lines is printed:

    * "doi:<DOI>" when the text of the first page contains a DOI;
    * "meta:<author>;<w1> <w2> <w3>" when the PDF has an author entry: the
      longest word of the first author and the up to three longest words of
      the title, all lower-cased;
    * "raw:<line>;<line>;..." otherwise: the non-empty lines among the first
      20 lines before the word "abstract" (any case), with ":" replaced by " ".
    """
    if not srcfile.lower().endswith("pdf"):
        return

    with open(srcfile, "rb") as stream:
        info = PdfFileReader(stream).getDocumentInfo()
        # The information dictionary is None for a PDF without metadata.
        author_field = info.author if info is not None else None
        title = (info.title if info is not None else None) or ''

    text = _first_page_text(srcfile)

    # The DOI is looked up first, so a missing or short title can no longer
    # prevent a DOI from being reported.
    m = re.search(r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b', text)
    if m:
        print('doi:' + m.group())
        return

    if author_field:
        author = _longest_word(author_field.split(',')[0])
        title1 = _longest_word(title)
        title2 = _longest_word(title.replace(title1, ''))
        title3 = _longest_word(title.replace(title2, '').replace(title1, ''))
        title_words = [word.lower() for word in (title1, title2, title3) if word]
        print('meta:' + author.lower() + ';' + ' '.join(title_words))
    else:
        # Text before the first "abstract"; the capture group only matters
        # for list items that are not used here.
        head = re.split("(abstract)", text, flags=re.IGNORECASE)[0]
        rawcontents = 'raw:' + ''.join(
            line.replace(':', ' ') + ';'
            for line in head.split('\n')[:20]
            if line != "")
        print(rawcontents)
Пример #28
0
 def AddWatermark(self,watermark,filein,fileout):
     """Write the first page of ``filein``, watermarked, to ``fileout``.

     A one-page watermark PDF is drawn with reportlab into
     "temp_watermark.pdf", merged onto the first page of ``filein`` and the
     result is written to ``fileout``. The temporary file is always removed;
     ``filein`` is removed after a successful run. Prints the title and the
     page count of ``filein``.

     NOTE(review): ``watermark`` is not used; the text drawn is always
     "A WATERMARK!" -- confirm whether the parameter was meant to be the text.
     """
     #Use reportlab to create a PDF that will be used
     #as a watermark on another PDF.
     c = canvas.Canvas("temp_watermark.pdf")
     c.setFont("Courier", 60)
     #This next setting will make the text of our
     #watermark gray, nice touch for a watermark.
     c.setFillGray(0.5,0.5)
     #Set up our watermark document. Our watermark
     #will be rotated 45 degrees from the direction
     #of our underlying document.
     c.saveState()
     c.translate(500,100)
     c.rotate(45)
     c.drawCentredString(0, 0, "A WATERMARK!")
     c.drawCentredString(0, 300, "A WATERMARK!")
     c.drawCentredString(0, 600, "A WATERMARK!")
     c.restoreState()
     c.save()

     try:
         output = PdfFileWriter()
         # Both inputs stay open until the output is written and are closed
         # before any file is removed.
         with open(filein, "rb") as source, open("temp_watermark.pdf", "rb") as overlay:
             #Read in the PDF that will have the watermark applied to it.
             input1 = PdfFileReader(source)

             #Just to demo this function from pyPdf.
             #If the PDF has a title, this will print it out.
             print("title = %s" % (getattr(input1.getDocumentInfo(), "title", None),))

             #Open up the original PDF.
             page1 = input1.getPage(0)

             #Read in the file created above by ReportLab for our watermark.
             twatermark = PdfFileReader(overlay)
             #Apply the watermark by merging the two PDF files.
             page1.mergePage(twatermark.getPage(0))
             #Send the resultant PDF to the output stream.
             output.addPage(page1)

             #Just to demo this function from pyPdf.
             #Print the number of pages of the input PDF.
             print("watermarked_pdf.pdf has %s pages." % input1.getNumPages())

             #write the output of our new, watermarked PDF.
             with open(fileout, "wb") as outputStream:
                 output.write(outputStream)
     finally:
         # The temporary watermark is removed even when a step above fails.
         os.remove("temp_watermark.pdf")
     os.remove(filein)
     
Пример #29
0
 def read(self, filename, pages, density):
     """Load the requested pages of the PDF at ``filename``.

     Stores the document information in ``self.info``, the page indices in
     ``self.mypages`` (all pages when ``pages`` is None) and one
     ``Page('<filename>[<index>]', density)`` per index in ``self.pages``.
     Progress is written to stdout.
     """
     # PDFs are binary data, so the file is opened with "rb"; the with-block
     # closes it once info and page count are read.
     with open(filename, "rb") as stream:
         input1 = PdfFileReader(stream)
         self.info = input1.getDocumentInfo()
         n = input1.getNumPages()
     self.pages = []
     if pages is None:
         self.mypages = range(n)
     else:
         self.mypages = pages
     for i in self.mypages:
         sys.stdout.write("\rReading page %d of %d" % (i + 1, n))
         sys.stdout.flush()
         self.pages.append(Page('%s[%d]' % (filename, i), density))
     sys.stdout.write("\n")
     sys.stdout.flush()
Пример #30
0
def extractor():
    """
    Processes the files in the PDFFiles list.  If the document
    is missing data, an error line is printed and processing continues,
    so you can inspect the exceptions by hand.

    For each file "<name>--<title> - <subject>" is printed; the files are
    read from BASEDIR.
    """
    for name in PDFFiles:
        try:
            # The with-block closes each file once its metadata is printed.
            with open(BASEDIR + name, 'rb') as stream:
                pdf_info = PdfFileReader(stream).getDocumentInfo()
                print(name + "--" + pdf_info['/Title'] + " - " + pdf_info['/Subject'])
        except Exception:
            print(name + ' ' + ' ERROR: Data missing or corrupt')
Пример #31
0

# Full paths of the files to inspect, one per entry of ``directory``.
full = []

for d in directory:
	full.append('/home/matt/Desktop/new/' + d)

# Print each path in lower case with spaces and dashes turned into underscores.
# NOTE(review): the changed name is only printed; ``full`` keeps the original paths.
for f in full:
	f = f.lower()
	f = f.replace(" ","_")
	f = f.replace("-","_")
	print(f)

# Print the title stored in the metadata of every file.
for filename in full:
	input1 = PdfFileReader(file(filename, 'rb'))
	title = input1.getDocumentInfo().title
	print title

# Print each file name without its directory and its last four characters.
for f in full:
    print(os.path.basename(f[:-4]))



# NOTE(review): the lines below look like pasted interactive-session text and
# an unfinished assignment; they are not valid Python as written.
>>> myFiles = ['accounts.txt', 'details.csv', 'invite.docx']
>>> for filename in myFiles:
        print(os.path.join('C:\\Users\\asweigart', filename))

path = 
for f in full:

    filename = "myfile.txt"
Пример #32
0
    def do_update_file_info(self, file):
        """Read the metadata of ``file`` and attach it as file attributes.

        Depending on the MIME type, ID3/MPEG data (MP3), EXIF data and pixel
        size (images), kaa metadata (video, FLAC, WAV) or title and author
        (PDF) are collected into a FileExtensionInfo and handed to
        ``self.set_file_attributes``. Every value is optional: a value that
        cannot be read is left unset.
        """
        info = FileExtensionInfo()

        # strip file:// to get absolute path
        filename = urllib.unquote(file.get_uri()[7:])

        def assign(attribute, read_value):
            """Set ``info.<attribute>`` to ``read_value()``; leave it unset on failure."""
            try:
                setattr(info, attribute, read_value())
            except Exception:
                # Not every file carries every value; a missing one is fine.
                pass

        def is_any_mime_type(mime_types):
            """Return True when ``file`` has one of the given MIME types."""
            return any(file.is_mime_type(mime) for mime in mime_types)

        # mp3 handling
        if file.is_mime_type('audio/mpeg'):
            # attempt to read ID3 tag
            try:
                audio = EasyID3(filename)
            except Exception:
                audio = None
            if audio is not None:
                assign('title', lambda: audio["title"][0])
                assign('album', lambda: audio["album"][0])
                assign('artist', lambda: audio["artist"][0])
                assign('tracknumber',
                       lambda: "{:0>2}".format(audio["tracknumber"][0]))
                assign('genre', lambda: audio["genre"][0])
                assign('date', lambda: audio["date"][0])

            # try to read MP3 information (bitrate, length, samplerate)
            try:
                # MP3 data is binary, hence "rb"; the with-block closes the
                # file on every path.
                with open(filename, "rb") as mpfile:
                    mpinfo = MPEGInfo(mpfile)
                    info.bitrate = str(mpinfo.bitrate / 1000) + " Kbps"
                    info.samplerate = str(mpinfo.sample_rate) + " Hz"
                    # [SabreWolfy] added consistent formatting of times in format hh:mm:ss
                    # [SabreWolfy] to allow for correct column sorting by length
                    info.length = "%02i:%02i:%02i" % (
                        (int(mpinfo.length / 3600)),
                        (int(mpinfo.length / 60 % 60)),
                        (int(mpinfo.length % 60)))
            except Exception:
                pass

        # image handling
        elif is_any_mime_type(('image/jpeg', 'image/png',
                               'image/gif', 'image/bmp')):
            # EXIF handling routines
            try:
                metadata = pyexiv2.ImageMetadata(filename)
                metadata.read()
            except Exception:
                metadata = None
            if metadata is not None:
                assign('exif_datetime_original', lambda: str(
                    metadata['Exif.Photo.DateTimeOriginal'].raw_value))
                assign('exif_software', lambda: str(
                    metadata['Exif.Image.Software'].raw_value))
                assign('exif_flash', lambda: str(
                    metadata['Exif.Photo.Flash'].raw_value))
                assign('exif_rating', lambda: str(
                    metadata['Xmp.xmp.Rating'].raw_value))
            # try read image info directly
            try:
                im = PIL.Image.open(filename)
                info.pixeldimensions = str(im.size[0]) + 'x' + str(im.size[1])
            except Exception as e:
                # NOTE(review): this used to catch ``error``, a name that is
                # not visible in this part of the file -- confirm.
                print(e)

        # video/flac handling
        elif is_any_mime_type(('video/x-msvideo', 'video/mpeg',
                               'video/x-ms-wmv', 'video/mp4',
                               'audio/x-flac', 'video/x-flv',
                               'video/x-matroska', 'audio/x-wav')):
            try:
                metadata = kaa.metadata.parse(filename)
            except Exception:
                metadata = None
            if metadata is not None:
                assign('length', lambda: "%02i:%02i:%02i" % (
                    (int(metadata.length / 3600)),
                    (int(metadata.length / 60 % 60)),
                    (int(metadata.length % 60))))
                assign('pixeldimensions', lambda: str(
                    metadata.video[0].width) + 'x' + str(
                        metadata.video[0].height))
                assign('bitrate', lambda: str(
                    round(metadata.audio[0].bitrate / 1000)))
                assign('samplerate', lambda: str(int(
                    metadata.audio[0].samplerate)) + ' Hz')
                # Plain copies: (attribute of info, attribute of metadata).
                for target, source in (('title', 'title'),
                                       ('artist', 'artist'),
                                       ('genre', 'genre'),
                                       ('tracknumber', 'trackno'),
                                       ('date', 'userdate'),
                                       ('album', 'album')):
                    assign(target,
                           lambda source=source: getattr(metadata, source))

        # pdf handling
        elif file.is_mime_type('application/pdf'):
            try:
                with open(filename, "rb") as stream:
                    pdf = PdfFileReader(stream)
                    assign('title', lambda: pdf.getDocumentInfo().title)
                    assign('artist', lambda: pdf.getDocumentInfo().author)
            except Exception:
                pass

        self.set_file_attributes(file, info)
Пример #33
0
import os
import re

# Retrieve all the files from the current directory.
for fileName in os.listdir('.'):
    try:
        # Process only the pdf files (the test looks at the last three
        # characters of the lower-cased name).
        if fileName.lower()[-3:] != "pdf":
            continue

        # Print the file name.
        print("Processing " + fileName)

        # Retrieve the Title of the pdf.
        pdfReader = PdfFileReader(file(fileName, "rb"))
        title = pdfReader.getDocumentInfo().title
        # Close the underlying file stream of the pdf reader.
        pdfReader.stream.close()
        
        # Not all the PDFs contain the Title meta-info.
        # If the Title info is not available print the "Title: None" message.
        if title is None:
            print("Title: None")
        else:
            # Print the Title.
            print("Title: " + title)

            # Format the Title by removing any special characters.
            newName = re.sub('[^-a-zA-Z0-9_.() ]+', '', title) + ".pdf"

            # Ask the user for confirmation because sometimes the Title
Пример #34
0
from pyPdf import PdfFileReader, PdfFileWriter
import os

#1. Extract info from pdf file

path = "/home/jeanne/Desktop/Realpython/Chapter12 practice files"
input_file = PdfFileReader(
    file(os.path.join(path, "The whistling gypsy.pdf"), "rb"))
title = input_file.getDocumentInfo().title
author = input_file.getDocumentInfo().author
pages = input_file.getNumPages()
print "{}\n{}\n{}\n".format(title, author, pages)

#
path = "/home/jeanne/Desktop/Realpython/Chapter12 practice files"
file_name = PdfFileReader(
    file(os.path.join(path, "The whistling gypsy.pdf"), "rb"))
output_file_name = os.path.join(path, "Output/The whistling gypsy.txt")

output_file = open(output_file_name, "wb")
for page_num in range(1, file_name.getNumPages()):
    text = file_name.getPage(page_num).extractText()
    text = text.replace(" ", "\n")
    text = text.encode("utf-12")
    output_file.write(text)
output_file.close()

#
path = "/home/jeanne/Desktop/Realpython/Chapter12 practice files"
input_file = PdfFileReader(
    file(os.path.join(path, "The whistling gypsy.pdf"), "rb"))
Пример #35
0
import os
from pyPdf import PdfFileReader

path = "/Users/KevinKoshy/PycharmProjects/RealPythonEg"

input_file_name = os.path.join(path, "Pride and Prejudice.pdf")
input_file = PdfFileReader(file(input_file_name, "rb"))

print "Number of pages = ", input_file.getNumPages()
print "Title = ", input_file.getDocumentInfo().title
Пример #36
0
'''THIS IS A PYTHON 2 CODE'''

import pyPdf

from pyPdf import PdfFileReader
file = PdfFileReader(
    open('path\to\file.pdf', 'rb')
)  # First open the file and then pass the object as an args to the PdfFileReader
info = file.getDocumentInfo()  # Returns a dictionary

for meta_item in info:
    print "{}     Info: {}".format(meta_item, info[meta_item])
# pyPdf available at http://pybrary.net/pyPdf/
from pyPdf import PdfFileWriter, PdfFileReader
import os

for fileName in os.listdir('.'):
    try:
        if fileName.lower()[-3:] != "pdf": continue
        input1 = PdfFileReader(file(fileName, "rb"))

        # print the title of document1.pdf
        print '##1', fileName, '##2', input1.getDocumentInfo().title
    except:
        print '##1', fileName, '##2'
def printMeta(fileName):
    pdfFile = PdfFileReader(file(fileName, 'rb'))
    docInfo = pdfFile.getDocumentInfo()
    print '[*] PDF MetaData For: ' + str(fileName)
    for metaItem in docInfo:
        print '[+] ' + metaItem + ':' + docInfo[metaItem]
Пример #39
0
from pyPdf import PdfFileReader
import os

for fileName in os.listdir('/home/zhuwenjun/Documents/test/'):
    os.chdir('/home/zhuwenjun/Documents/test/')
    actfile = file(fileName, "rb")
    try:
        if fileName.lower()[-3:] != "pdf":
            continue
        input1 = PdfFileReader(actfile)
        print '  ##1', fileName, '##2', input1.getDocumentInfo().title
    except:
        print '  ##1', fileName, '##2'

    try:
        timeofpdf = input1.getDocumentInfo()['/ModDate'][2:6]
        trgtfilename = timeofpdf + '-' + input1.getDocumentInfo(
        ).title + ".pdf"
    except:
        print "\n## ERROR ## %s Title could not be extracted. PDF file may be encrypted!" % fileName
        continue

    del input1
    actfile.close()
    print 'Trying to rename from:', fileName, 'to', trgtfilename
    print '\n'

    try:
        os.rename(fileName, trgtfilename)
    except:
        print fileName, 'could not be renamed!'
Пример #40
0
def generateRandom():
    """Return a 4-part code that has not been handed out before.

    The code is the concatenation of four ``getNumber()`` results.  Every
    returned code is appended to the module-level ``pile`` so that it is
    never returned twice.

    NOTE(review): the loop does not terminate once every possible code is in
    ``pile``; the size of the code space depends on ``getNumber``, which is
    defined outside this block.
    """
    # A loop instead of recursion: each rejected candidate used to add a
    # stack frame.
    while True:
        v = str(getNumber()) + str(getNumber()) + str(getNumber()) + str(
            getNumber())
        # The old test `pile.index(v) ... == False` was also true for index
        # 0, so the first code in `pile` was treated as unseen and returned
        # again.  A plain membership test has no such blind spot.
        if v not in pile:
            pile.append(v)
            return v


def decryptPdf(f):
    if (f.getIsEncrypted()):
        ran = generateRandom()
        t = f.decrypt(seed + ran)
        if (t == 1):
            return {'pdf': f, 'r': ran, 's': seed + ran}
        else:
            return decryptPdf(f)
    else:
        print 'cola'
        return f.getDocumentInfo().title


f = PdfFileReader(file("EstadodeCuenta.pdf", "rb"))
o = decryptPdf(f)
ff = o.get('pdf')
ff.decrypt(o.get('s'))
print ff.getIsEncrypted()
print "title = %s" % (f.getDocumentInfo().title)
pp(o)
Пример #41
0
# 8.1 review exercises

import os
from pyPdf import PdfFileReader, PdfFileWriter

path = "C:/Real Python/Course materials/Chapter 8/Practice files"
inputFileName = os.path.join(path, "The Whistling Gypsy.pdf")
inputFile = PdfFileReader(file(inputFileName, "rb"))

# Display meta-data about file
print "Title:", inputFile.getDocumentInfo().title
print "Author:", inputFile.getDocumentInfo().author
print "Number of pages:", inputFile.getNumPages()

# Specify and open output text file
outputFileName = os.path.join(path, "Output/The Whistling Gypsy.txt")
with open(outputFileName, "w") as outputFile:
    # Extract every page of text
    for pageNum in range(0, inputFile.getNumPages()):
        text = inputFile.getPage(pageNum).extractText()
        text = text.encode("utf-8")  # convert text to unicode
        outputFile.write(text)

# Save file without cover page
outputPDF = PdfFileWriter()
for pageNum in range(1, inputFile.getNumPages()):
    outputPDF.addPage(inputFile.getPage(pageNum))

outputFileName = os.path.join(path,
                              "Output/The Whistling Gypsy un-covered.pdf")
with file(outputFileName, "wb") as outputFile:
Пример #42
0
# pyPdf available at http://pybrary.net/pyPdf/
# http://stackoverflow.com/questions/911672/extracting-titles-from-pdf-files
# Some documents have no title information, which raises an error.
from pyPdf import PdfFileWriter, PdfFileReader
import os

for fileName in os.listdir('.'):
    actfile = file(fileName, "rb")
    try:
        if fileName.lower()[-3:] != "pdf": continue
        input1 = PdfFileReader(actfile)
        # print the title of document1.pdf
        print '##1', fileName, '##2', input1.getDocumentInfo().title()
    except:
        print '##1', fileName, '##2'

    try:
        trgtfilename = input1.getDocumentInfo().title + "_" + fileName
    except:
        print "\n## ERROR ## %s Title could not be extracted. PDF file may be encrypted!" % fileName
    continue

    del input1
    actfile.close()

    print 'Trying to rename from:', fileName, '\n to ', trgtfilename
    try:
        os.rename(fileName, trgtfilename)
    except:
        print fileName, ' could not be renamed!'
        print '\n## ERROR ## Maybe the filename already exists or the document is already opened!'
Пример #43
0
def _campo_magic(info, etiqueta):
    """Return the value of the ``etiqueta:`` field of a libmagic description, or '#'."""
    encontrados = re.findall(re.escape(etiqueta) + r':.*', info)
    if not encontrados:
        return '#'
    # Split on the FIRST colon only: splitting on every colon cut values that
    # contain colons themselves (e.g. the "10:15:00" part of a timestamp).
    return encontrados[0].split(':', 1)[1].split(',')[0]


def _texto_xpath(doc, prefijo, nombre, ns):
    """Return the ASCII text of the first ``prefijo:nombre`` element, or '#'."""
    if prefijo not in ns:
        # Namespace not declared in the document, so the element cannot exist.
        return '#'
    nodos = doc.xpath('//%s:%s' % (prefijo, nombre), namespaces=ns)
    if len(nodos) > 0:
        return unidecode.unidecode(unicode(nodos[0].text))
    return '#'


def _texto_etiqueta(xml, etiqueta):
    """Return the text between ``<etiqueta>`` and ``</etiqueta>`` in ``xml``, or '#'."""
    encontrados = re.findall(r'<%s>.*</%s>' % (etiqueta, etiqueta), xml)
    if len(encontrados) > 0:
        return encontrados[0].split('>')[1].split('<')[0]
    return '#'


def _metadatos_pdf(fichero):
    """Read the document information of a PDF into a flat dict."""
    metadata_pdf = {}
    tipo_metadatos = [
        'Title', 'CreationDate', 'Author', 'Producer', 'Creator',
        'ModDate', 'Company', 'Comments', 'Keywords', 'SourceModified',
        'Subject'
    ]

    try:
        flujo = open(fichero, "rb")
    except IOError:
        return metadata_pdf

    # The with-block closes the file; it used to stay open forever.
    with flujo:
        try:
            pdf_toread = PdfFileReader(flujo)
        except Exception:
            return metadata_pdf

        # getDocumentInfo() may return None for a PDF without an info
        # dictionary; treat that as "no entries" instead of crashing.
        pdf_info = pdf_toread.getDocumentInfo() or {}

        # Every known field defaults to '#', then real values overwrite it.
        for i in tipo_metadatos:
            metadata_pdf[i] = '#'

        for k, v in pdf_info.iteritems():
            # Keys look like "/Title": keep the part after the slash.
            clave = unidecode.unidecode(unicode(k.split('/')[1]))
            metadata_pdf[clave] = unidecode.unidecode(unicode(v))

    metadata_pdf['Fichero'] = fichero
    metadata_pdf['Tipo'] = 'PDF'
    return metadata_pdf


def _metadatos_office_antiguo(fichero, ext, info):
    """Build the metadata dict of a legacy Office file from its libmagic description."""
    return {
        'Fichero': fichero,
        'Tipo': ext,
        'creator': _campo_magic(info, 'Author'),
        'lastModifiedBy': _campo_magic(info, 'Last Saved By'),
        'created': _campo_magic(info, 'Create Time/Date'),
        'modified': _campo_magic(info, 'Saved Time/Date'),
        'title': _campo_magic(info, 'Title'),
        'revision': _campo_magic(info, 'Revision Number'),
        'lastPrinted': _campo_magic(info, 'Last Printed'),
        'keywords': '#',
        'Application': _campo_magic(info, 'Creating Application'),
        'Paginas': _campo_magic(info, 'Pages'),
        'Palabras': _campo_magic(info, 'Words'),
        'Caracteres': _campo_magic(info, 'Characters'),
        'Lineas': '#',
        'Parrafos': '#',
        'Slides': '#',
        'PresentationFormat': '#'
    }


def _metadatos_ooxml(fichero, ext):
    """Read docProps/core.xml and docProps/app.xml of a zipped Office file."""
    try:
        zf = zipfile.ZipFile(fichero)
    except Exception:
        return {}

    # Read both members up front so the archive is always closed.
    try:
        core_xml = zf.read('docProps/core.xml')
        app_xml = zf.read('docProps/app.xml')
    finally:
        zf.close()

    # Build the namespace map from the declarations found in core.xml.
    # Undeclared prefixes are left out (their fields then default to '#')
    # instead of raising IndexError.
    ns = {}
    for prefijo in ('dc', 'dcterms', 'cp'):
        declaraciones = re.findall(r'xmlns:%s="https?:.*"' % prefijo, core_xml)
        if declaraciones:
            ns[prefijo] = declaraciones[0].split('"')[1]

    doc = lxml.etree.fromstring(core_xml)

    return {
        'Fichero': fichero,
        'Tipo': ext,
        'creator': _texto_xpath(doc, 'dc', 'creator', ns),
        'lastModifiedBy': _texto_xpath(doc, 'cp', 'lastModifiedBy', ns),
        'created': _texto_xpath(doc, 'dcterms', 'created', ns),
        'modified': _texto_xpath(doc, 'dcterms', 'modified', ns),
        'title': _texto_xpath(doc, 'dc', 'title', ns),
        'revision': _texto_xpath(doc, 'cp', 'revision', ns),
        'lastPrinted': _texto_xpath(doc, 'cp', 'lastPrinted', ns),
        'keywords': _texto_xpath(doc, 'cp', 'keywords', ns),
        'Application': _texto_etiqueta(app_xml, 'Application'),
        'Paginas': _texto_etiqueta(app_xml, 'Pages'),
        'Palabras': _texto_etiqueta(app_xml, 'Words'),
        'Caracteres': _texto_etiqueta(app_xml, 'Characters'),
        'Lineas': _texto_etiqueta(app_xml, 'Lines'),
        'Parrafos': _texto_etiqueta(app_xml, 'Paragraphs'),
        'Slides': _texto_etiqueta(app_xml, 'Slides'),
        'PresentationFormat': _texto_etiqueta(app_xml, 'PresentationFormat')
    }


def analizar_file(fichero):
    """Extract the metadata of a PDF or Office document.

    The kind of file is decided from the libmagic description of ``fichero``
    and from its extension:

    * description contains ``PDF``: the PDF document information.
    * extension ``doc``/``ppt``/``xls``: legacy Office formats (not zip
      archives), parsed from the libmagic description itself.
    * description contains ``Word``/``Excel``/``PowerPoint``: zipped Office
      formats, read from ``docProps/core.xml`` and ``docProps/app.xml``.

    :param fichero: path of the file to analyse.
    :returns: a dict of metadata in which missing values are ``'#'``; an
        empty dict when the PDF or zip container cannot be opened; ``None``
        when the file matches none of the kinds above.
    """
    ext = fichero.split('.')[-1]
    extension = magic.from_file(fichero)

    if 'PDF' in extension:
        return _metadatos_pdf(fichero)

    if ext == 'doc' or ext == 'ppt' or ext == 'xls':
        return _metadatos_office_antiguo(fichero, ext, extension)

    if 'Word' in extension or 'Excel' in extension or 'PowerPoint' in extension:
        return _metadatos_ooxml(fichero, ext)
Пример #44
0
def split_scan(scan_id=None, redirect=None, scan=None):
    """
    Split a Scan into ScanPage's

    The scan's PDF is burst into one PDF per page with pdftk, every page is
    converted to a JPEG, and a ScanPage is created (or updated) for each
    image.  The scan and its documents are flagged ``under_construction``
    while this runs.

    :param scan_id: primary key of the Scan; used only when ``scan`` is not
        given.
    :param redirect: opaque value handed back unchanged to the caller.
    :param scan: the Scan instance to split (takes precedence over
        ``scan_id``).
    :returns: ``redirect``.
    """
    # Mark under construction.
    scan = scan or Scan.objects.get(pk=scan_id)
    scan.under_construction = True
    scan.save()
    for doc in scan.document_set.all():
        doc.under_construction = True
        doc.save()

    # Check that the pdf works, and try to fix if not.
    try:
        with open(scan.pdf.path, 'rb') as fh:
            reader = PdfFileReader(fh)
            try:
                if 'Quartz' in reader.getDocumentInfo()['/Producer']:
                    # Assume that anything produced by Mac OS X Quartz needs to be
                    # fixed. It's buggy.
                    raise PdfReadError()
            except KeyError:
                pass
    except PdfReadError:
        logger.debug("Error reading pdf %s, try to fix." % scan.pdf.path)
        _fix_pdf(scan.pdf.path)

    directory, name_with_ext = os.path.split(scan.pdf.path)
    basename = os.path.splitext(name_with_ext)[0]
    page_basename = basename + "-page"

    # Burst pdf into single pages.
    pdf_pages = []
    burst_dir = tempfile.mkdtemp(suffix="pdfburst")

    # The base name is escaped: it is file-name text, not a pattern, and
    # characters such as "(" or "+" used to stop every page from matching.
    page_re = re.compile(re.escape(page_basename) + r"-(\d\d\d)\.pdf")

    # try/finally so the temporary directory is removed even when bursting
    # or moving a page fails.
    try:
        # We used to use PyPDF to do this, but it consumes too much memory
        # for large PDFs, and pdftk burst is more robust on some pdfs.
        proc = subprocess.Popen([
            settings.NICE_CMD, settings.PDFTK_CMD, scan.pdf.path, "burst",
            "output",
            os.path.join(burst_dir, page_basename + "-%03d.pdf")
        ])
        proc.communicate()
        for page in sorted(os.listdir(burst_dir)):
            match = page_re.match(page)
            if match:
                # The burst files are renumbered one lower on the way out
                # (NNN -> NNN - 1), so final names start at 000 when the
                # burst output starts at 001.
                page_dest = os.path.join(
                    directory,
                    "%s-%03d.pdf" % (page_basename, int(match.group(1)) - 1))
                shutil.move(os.path.join(burst_dir, page), page_dest)
                pdf_pages.append(page_dest)
    finally:
        shutil.rmtree(burst_dir)

    jpgs = []
    for page in pdf_pages:
        img = _pdfimages_page_to_jpg(page)
        jpgs.append(img)

    scanpages = []
    for i, filename in enumerate(jpgs):
        # Reuse the ScanPage at this position if one already exists.
        try:
            scanpage = ScanPage.objects.get(scan=scan, order=i)
        except ScanPage.DoesNotExist:
            scanpage = ScanPage(scan=scan, order=i)
        scanpage.image = os.path.relpath(filename, settings.MEDIA_ROOT)
        scanpage.save()
        scanpages.append(scanpage)
        # Pre-cache some thumbnails (performance optimization for moderation).
        get_thumbnail(scanpage.image.path, "900")
        get_thumbnail(scanpage.image.path, "100")
        get_thumbnail(scanpage.image.path, "15")
    scan.scanpages = scanpages

    # update document images, if any.
    chain = update_document_images.map([d.pk for d in scan.document_set.all()])
    chain()
    for doc in scan.document_set.all():
        doc.under_construction = False
        doc.save()
    scan.under_construction = False
    scan.save()
    return redirect
Пример #45
0
import os
from pyPdf import PdfFileReader, PdfFileWriter

path = "C:\Users\New Owner\Documents\Python Learning\Practice"

inputfilename = os.path.join(path, "GarbegeyPDF.pdf")
inputfile = PdfFileReader(file(inputfilename, "rb"))

print "Number of pages:", inputfile.getNumPages()
print "Title:", inputfile.getDocumentInfo().creator

outputPDF = PdfFileWriter()

for page in range(inputfile.getNumPages()):
    if page % 5 == 0:
        outputPDF.addPage(inputfile.getPage(page))

outputfilename = os.path.join(path, "GarbegeyPDF.pdf")
outputfile = file(outputfilename, "wb")
outputPDF.write(outputfile)
outputfile.close()
Пример #46
0
#!/usr/bin/python

from pyPdf import PdfFileWriter, PdfFileReader

output = PdfFileWriter()
input1 = PdfFileReader(file("main.pdf", "rb"))

# print the title of document1.pdf
print "title = %s" % (input1.getDocumentInfo().title)

# add page 1 from input1 to output document, unchanged
watermark = PdfFileReader(file("watermark.pdf", "rb"))
#print input1.getPage(1).mergePage(watermark.getPage(0))
watermark.getPage(0).mergePage(input1.getPage(0))
#output.addPage(input1.getPage(1))
output.addPage(watermark.getPage(0))

# print how many pages input1 has:
print "document1.pdf has %s pages." % input1.getNumPages()

# finally, write "output" to document-output.pdf
outputStream = file("document-output.pdf", "wb")
output.write(outputStream)
outputStream.close()
Пример #47
0
    def processFile(self, curr_file):
        """Append one row of PDF metadata for ``curr_file`` to ``self.container``.

        The row is ``[" | " + file name, created, author, producer, modified,
        last saved]``; fields that are not available are ``'-'``.

        Nothing is appended (and ``None`` is returned) when the name does not
        contain ``.pdf``, when the document has no information dictionary, or
        when reading/parsing fails for any reason -- this is a best-effort
        collector.

        :param curr_file: path of the file to inspect.
        """

        def format_pdf_date(raw):
            """Format a PDF date ("D:YYYYMMDDHHMM...") as "MM/DD/YYYY HH:MM AM/PM"."""
            data = raw.strip("D:|'")
            clock = time.strftime(
                "%I:%M %p",
                time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
            return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

        author = '-'
        created = '-'
        producer = '-'
        modded = '-'
        last_saved = '-'

        if ".pdf" not in curr_file:
            return

        try:
            # The with-block closes the file; it used to stay open.  All
            # values are read while the stream is still open.
            with open(curr_file, 'rb') as stream:
                pdfFile = PdfFileReader(stream)
                if pdfFile.getIsEncrypted():
                    pdfFile.decrypt('')
                docInfo = pdfFile.getDocumentInfo()
                if not docInfo:
                    return

                # looks at the entire dictionary to parse for information
                if "/CreationDate" in docInfo:
                    created = format_pdf_date(docInfo["/CreationDate"])
                if "/Author" in docInfo:
                    author = docInfo["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"
                if "/Producer" in docInfo:
                    # str.strip("(Windows)") removes any of those CHARACTERS
                    # from both ends ("Microsoft Word" lost its final "d");
                    # replace() removes the literal "(Windows)" marker.
                    producer = docInfo["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # collapse runs of spaces
                    while "  " in producer:
                        producer = producer.replace("  ", " ")
                if "/ModDate" in docInfo:
                    modded = format_pdf_date(docInfo["/ModDate"])

            # strips '/' off file name (if it includes directory name)
            curr_file = curr_file[curr_file.rfind("/") + 1:]
            curr_file = curr_file.replace("\\", "")

            # trim information if it's too long.  The trimmed name is
            # 15 + 3 + 13 = 31 characters, so only longer names are trimmed;
            # the old limit of 15 made names of 16-31 characters longer.
            if len(curr_file) > 31:
                curr_file = curr_file[:15] + "..." + curr_file[-13:]
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            # appends each piece of information. output will show ONLY if at
            # least ONE file has data in a column
            self.container.append([
                " | " + curr_file, created, author, producer, modded,
                last_saved
            ])
        except Exception:
            # Best effort: an unreadable or malformed PDF is skipped.
            return
Пример #48
0
	def update_file_info(self, file):
		"""Fill the extra string attributes of ``file`` from its metadata.

		All attributes are first reset to ''.  For a ``file://`` URI the
		handlers matching the mime type then overwrite them: ID3/MPEG info
		for mp3, EXIF and pixel size for images, kaa.metadata for
		video/flac/wav, and title/author for PDF.  A value that cannot be
		read is replaced by a placeholder such as "[n/a]".

		:param file: the file-info object; it must offer
			``add_string_attribute``, ``get_uri_scheme``, ``get_uri`` and
			``is_mime_type``.
		"""
		def put(attr, compute, fallback):
			"""Set ``attr`` to ``compute()``, or to ``fallback`` if that raises."""
			try:
				file.add_string_attribute(attr, compute())
			except Exception:
				file.add_string_attribute(attr, fallback)

		def put_all(attrs, value):
			"""Set every attribute in ``attrs`` to the same ``value``."""
			for attr in attrs:
				file.add_string_attribute(attr, value)

		def is_any(mime_types):
			"""Tell whether ``file`` has one of the given mime types."""
			return any(file.is_mime_type(mime) for mime in mime_types)

		tag_attrs = ('title', 'album', 'artist', 'tracknumber', 'genre', 'date')
		exif_attrs = ('exif_datetime_original', 'exif_software', 'exif_flash',
			'exif_pixeldimensions', 'exif_rating')

		# set defaults to blank
		put_all(tag_attrs, '')
		put_all(('bitrate', 'samplerate', 'length'), '')
		put_all(exif_attrs, '')
		put_all(('pixeldimensions',), '')

		if file.get_uri_scheme() != 'file':
			return

		# strip file:// to get absolute path
		filename = urllib.unquote(file.get_uri()[7:])

		# mp3 handling
		if file.is_mime_type('audio/mpeg'):
			# attempt to read ID3 tag
			try:
				audio = EasyID3(filename)
			except Exception:
				# some files have no ID3 tag and will throw this exception
				put_all(tag_attrs, "[no ID3]")
			else:
				# any single tag may be missing, hence one fallback per tag
				for tag in tag_attrs:
					put(tag, lambda tag=tag: audio[tag][0], "[n/a]")

			# try to read MP3 information (bitrate, length, samplerate)
			try:
				# binary mode: this is audio data, not text; the with-block
				# closes the file on every path
				with open(filename, 'rb') as mpfile:
					mpinfo = MPEGInfo(mpfile)
					file.add_string_attribute('bitrate', str(mpinfo.bitrate/1000) + " Kbps")
					file.add_string_attribute('samplerate', str(mpinfo.sample_rate) + " Hz")
					# consistent hh:mm:ss format so the column sorts by length
					mp3length = "%02i:%02i:%02i" % ((int(mpinfo.length/3600)), (int(mpinfo.length/60%60)), (int(mpinfo.length%60)))
				file.add_string_attribute('length', mp3length)
			except Exception:
				put_all(('bitrate', 'length', 'samplerate'), "[n/a]")

		# image handling
		if is_any(('image/jpeg', 'image/png', 'image/gif', 'image/bmp')):
			# EXIF handling routines
			try:
				metadata = pyexiv2.ImageMetadata(filename)
				metadata.read()
			except Exception:
				# no exif data?
				put_all(exif_attrs, "")
			else:
				def raw(key):
					"""Return the raw value of one metadata key as a string."""
					return str(metadata[key].raw_value)

				put('exif_datetime_original', lambda: raw('Exif.Photo.DateTimeOriginal'), "")
				put('exif_software', lambda: raw('Exif.Image.Software'), "")
				put('exif_flash', lambda: raw('Exif.Photo.Flash'), "")
				# NOTE(review): written as Y 'x' X, while 'pixeldimensions'
				# below is size[0] 'x' size[1] -- confirm the intended order.
				put('exif_pixeldimensions', lambda: raw('Exif.Photo.PixelYDimension') + 'x' + raw('Exif.Photo.PixelXDimension'), "")
				put('exif_rating', lambda: raw('Xmp.xmp.Rating'), "")

			# try read image info directly
			def image_size():
				im = Image.open(filename)
				return str(im.size[0]) + 'x' + str(im.size[1])
			put('pixeldimensions', image_size, "[image read error]")

		# video/flac handling
		if is_any(('video/x-msvideo', 'video/mpeg', 'video/x-ms-wmv', 'video/mp4', 'audio/x-flac', 'video/x-flv', 'video/x-matroska', 'audio/x-wav')):
			try:
				info = kaa.metadata.parse(filename)
			except Exception:
				# 'tracknumber' (not 'track') is the attribute declared in
				# the defaults above; the wrong name left it blank on errors.
				put_all(('length', 'pixeldimensions', 'bitrate', 'samplerate',
					'title', 'artist', 'genre', 'tracknumber', 'date', 'album'), 'error')
			else:
				put('length', lambda: "%02i:%02i:%02i" % ((int(info.length/3600)), (int(info.length/60%60)), (int(info.length%60))), '[n/a]')
				put('pixeldimensions', lambda: str(info.video[0].width) + 'x' + str(info.video[0].height), '[n/a]')
				put('bitrate', lambda: str(round(info.audio[0].bitrate/1000)), '[n/a]')
				put('samplerate', lambda: str(int(info.audio[0].samplerate)) + ' Hz', '[n/a]')
				put('title', lambda: info.title, '[n/a]')
				put('artist', lambda: info.artist, '[n/a]')
				put('genre', lambda: info.genre, '[n/a]')
				put('tracknumber', lambda: info.trackno, '[n/a]')
				put('date', lambda: info.userdate, '[n/a]')
				put('album', lambda: info.album, '[n/a]')

		# pdf handling
		if file.is_mime_type('application/pdf'):
			try:
				# the with-block closes the file even when parsing fails
				with open(filename, "rb") as f:
					pdf = PdfFileReader(f)
					put('title', lambda: pdf.getDocumentInfo().title, "[n/a]")
					put('artist', lambda: pdf.getDocumentInfo().author, "[n/a]")
			except Exception:
				put_all(('title', 'artist'), "[no info]")

		self.get_columns()
Пример #49
0
def printMeta(fileName):
    """Print every entry of a PDF's document-information dictionary.

    Each entry is written on its own line as ``[+] <key> : <value>``.
    Nothing is printed when the PDF carries no information dictionary.

    Args:
        fileName: Path of the PDF file to inspect.
    """
    # The with-block releases the handle even when parsing fails; the
    # previous version never closed it.
    with open(fileName, 'rb') as pdfStream:
        pdfFile = PdfFileReader(pdfStream)
        docInfo = pdfFile.getDocumentInfo()
        # getDocumentInfo() yields None for a PDF without an /Info entry.
        if docInfo is None:
            return
        # Values are looked up while the stream is still open.
        for metaItem in docInfo:
            print('[+] ' + metaItem + ' : ' + docInfo[metaItem])
Пример #50
0
import os
from pyPdf import PdfFileReader, PdfFileWriter

# Directory holding the practice files, and the PDF examined below.
path = '/home/alberick/Documents/python/books/realpython-jean/part1/book1-exercises/Course materials/Chapter 12/Practice files'
target_file_name = os.path.join(path, 'The Whistling Gypsy.pdf')
# The stream is not closed here; target_file is read again further down.
target_file = PdfFileReader(open(target_file_name, 'rb'))

# (title, author, page count) -- indexed positionally by the template below.
book_info = (
    target_file.getDocumentInfo().title,
    target_file.getDocumentInfo().author,
    target_file.getNumPages(),
)

print('''The title of this pdf is "{0[0]}"
Author is {0[1]}
Total number of pages is{0[2]}'''.format(book_info))

print('\n')
print('Exercise 2')

# Write the extracted text of every page, UTF-8 encoded, into one file.
output_file_name = os.path.join(path, 'Output/The Whistling Gypsy.txt')
with open(output_file_name, 'wb') as output_file:
    page_count = target_file.getNumPages()
    for page in range(page_count):
        text = target_file.getPage(page).extractText()
        output_file.write(text.encode('utf-8'))

# Banner separating the output of exercise 3 from the previous one.
print '\n'
print 'Exercise 3'

# Path of the PDF this exercise writes.
output_file_name = os.path.join(path, 'Output/The Whistling Gypsy.pdf')

with open(output_file_name, 'wb') as output_file:
    pdf_writer = PdfFileWriter()

    # Page indices start at 1 here, so index 0 is not visited.
    # NOTE(review): the listing stops at this loop header -- the loop body is
    # not shown, so the snippet is not runnable as it stands.
    for page in range(1, target_file.getNumPages()):
Пример #51
0
OUTPUT = 'ml1.pdf'
INPUT = 'NOFO.pdf'

# There is no interface through pyPDF with which to set this other than getting
# your hands dirty like so:
output = PdfFileWriter()

# The input stays open until the output has been written: the page copy and
# the final write both happen inside this block, and the handle is released
# afterwards even if something raises (it used to be left open).
with open(INPUT, 'rb') as fin:
    pdf_in = PdfFileReader(fin)

    # Mutate the writer's info dictionary in place; printed before and after
    # so the change is visible.
    infoDict = output._info.getObject()
    print(infoDict)
    infoDict.update({
        NameObject('/Title'): createStringObject(u'title'),
        NameObject('/Author'): createStringObject(u'author'),
        NameObject('/Subject'): createStringObject(u'subject'),
        NameObject('/Creator'): createStringObject(u'a script')
    })
    print(infoDict)

    # Copy every page of the input into the writer unchanged.
    for page in range(pdf_in.getNumPages()):
        output.addPage(pdf_in.getPage(page))

    # The with-block closes the output stream on failure as well.
    with open(OUTPUT, 'wb') as outputStream:
        output.write(outputStream)

from pyPdf import PdfFileReader, PdfFileWriter

# Re-open the file named by OUTPUT and print its document information.
# The handle is closed once the information has been printed instead of
# being left open.
with open(OUTPUT, 'rb') as written:
    pdf = PdfFileReader(written)

    print(pdf.getDocumentInfo())
Пример #52
0
def printMeta(filename):
    """Print a header line followed by every metadata entry of a PDF.

    Entries are written as ``[+] <key>:<value>``. Only the header is printed
    when the PDF carries no information dictionary.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # The with-block closes the handle; it was previously never closed.
    with open(filename, 'rb') as pdfStream:
        pdfFile = PdfFileReader(pdfStream)
        docInfo = pdfFile.getDocumentInfo()
        print("[*] The Metadata for " + str(filename))
        # getDocumentInfo() yields None for a PDF without an /Info entry.
        if docInfo is None:
            return
        for metaItem in docInfo:
            # Print the whole key: metaItem[0] only ever showed its first
            # character, making every entry look alike.
            print("[+] " + metaItem + ":" + docInfo[metaItem])
    def pdf_watermark_slow(self, pathname, Wm_f, wt1='', **kwargs):
        """Merge each page of ``pathname`` onto a watermark page and save the result.

        For every input page the first page of the watermark PDF is parsed
        afresh, the input page is merged onto it, and the merged page is added
        to the output that is finally written to ``Wm_f``.

        Args:
            pathname: Path of the PDF to watermark.
            Wm_f: Path the watermarked PDF is written to.
            wt1: Path of an existing watermark PDF (an already open binary
                stream is tolerated too). When left as ``''``,
                ``self.watermark_file`` is asked to generate one at
                ``self.Watermarked_PDF_Dir + "/watermarker_slow_<url>.pdf"``.
            **kwargs: ``url_wtm`` -- the text passed to ``self.watermark_file``;
                only required when ``wt1`` is ``''``.

        Returns:
            ``Wm_f``, whether or not merging succeeded; a merge or write
            failure is only reported on stdout.

        Raises:
            KeyError: ``wt1`` is ``''`` and ``url_wtm`` was not supplied.
            IOError: the watermark file or ``pathname`` cannot be opened.
        """
        from pyPdf import PdfFileWriter, PdfFileReader

        if wt1 == '':
            # Only this branch needs the URL. Looking it up here fails with a
            # clear KeyError instead of an unbound-local error later on.
            url_watermark = kwargs['url_wtm']
            url_watermark2 = url_watermark.replace(".", "_")
            url_watermark2 = url_watermark2.replace("://", "__")
            generated = (self.Watermarked_PDF_Dir + "/" + "watermarker_slow_" +
                         url_watermark2 + ".pdf")
            try:
                self.watermark_file(generated, url_watermark, center_text=True)
            except Exception:
                print("erro in writing new watermarker files")
            # Second attempt when the first one left no file behind.
            if not os.path.isfile(generated):
                try:
                    self.watermark_file(generated, url_watermark,
                                        center_text=True)
                except Exception:
                    print('if not os.path.isfile(self.Watermarked_PDF_Dir+"/" + "watermarker_slow_"+url_watermark2+".pdf"): is not working')
            wt1 = generated

        # The reader needs a stream, not a path. Opening it here also
        # guarantees `sa` exists when the caller supplied wt1 (it was unbound
        # in that case, so every page failed).
        owns_watermark = not hasattr(wt1, 'read')
        sa = open(wt1, 'rb') if owns_watermark else wt1
        try:
            with open(pathname, 'rb') as fh:
                pdf = PdfFileReader(fh)

                # getDocumentInfo() is None for PDFs without an /Info entry.
                info = pdf.getDocumentInfo()
                print("title = %s" % (info.title if info is not None else None))

                output = PdfFileWriter()
                try:
                    for i in range(pdf.getNumPages()):
                        p = pdf.getPage(i)
                        # mergePage() modifies the page it is called on, so a
                        # pristine watermark page is parsed for every page.
                        wt = PdfFileReader(sa).getPage(0)
                        wt.mergePage(p)
                        output.addPage(wt)

                    # Pages are read lazily: write while both inputs are open.
                    with open(Wm_f, 'wb') as outputStream:
                        output.write(outputStream)
                    print('output.write(outputStream) is done' +
                          'wtl is :%s' % (wt1,))
                except Exception:
                    # Best effort, as before: report and still return Wm_f.
                    print('Please make correct Wattermarket')
        finally:
            # Only close the watermark stream if this method opened it.
            if owns_watermark:
                sa.close()
        # Wm_f is full address
        return Wm_f
Пример #54
0
# 11.1 review exercises

import os
from pyPdf import PdfFileReader, PdfFileWriter

path = "C:/Real Python/refactor/chp12/practice_files"
input_file_name = os.path.join(path, "The Whistling Gypsy.pdf")
# The stream is left open: input_file is read again for every step below.
input_file = PdfFileReader(open(input_file_name, "rb"))

# Display meta-data about file.
# Each line is formatted into a single string first, so it prints as
# "Title: ..." under both the print statement and the print function;
# print("Title:", x) shows a tuple under the former.
print("Title: %s" % input_file.getDocumentInfo().title)
print("Author: %s" % input_file.getDocumentInfo().author)
print("Number of pages: %s" % input_file.getNumPages())

# Specify and open output text file
# NOTE(review): the loop writes UTF-8 encoded bytes to a file opened in "w";
# that relies on Python 2 string semantics -- confirm the target interpreter.
output_file_name = os.path.join(path, "Output/The Whistling Gypsy.txt")
with open(output_file_name, "w") as output_file:
    # Extract every page of text
    for page_num in range(input_file.getNumPages()):
        text = input_file.getPage(page_num).extractText()
        text = text.encode("utf-8")  # encode the text as UTF-8 bytes
        output_file.write(text)

# Save file without cover page: copying starts at index 1, skipping page 0.
output_PDF = PdfFileWriter()
for page_num in range(1, input_file.getNumPages()):
    output_PDF.addPage(input_file.getPage(page_num))

# Path of the PDF output file (presumably the destination for output_PDF --
# the code that would write it is not shown).
output_file_name = os.path.join(path,
                                "Output/The Whistling Gypsy un-covered.pdf")
# NOTE(review): the listing stops right after this `with` header; its body is
# missing, so the snippet is not runnable as it stands.
with open(output_file_name, "wb") as output_file:
Пример #55
0
def printMeta(fileName):
    """Print a header line followed by every metadata entry of a PDF.

    Entries are written as ``[+] <key>: <value>``. Only the header is printed
    when the PDF carries no information dictionary.

    Args:
        fileName: Path of the PDF file to inspect.
    """
    # The with-block closes the handle; it was previously never closed.
    with open(fileName, 'rb') as pdfStream:
        pdfFile = PdfFileReader(pdfStream)
        docInfo = pdfFile.getDocumentInfo()
        print("[*] PDF MeataData For: " + str(fileName))
        # getDocumentInfo() yields None for a PDF without an /Info entry.
        if docInfo is None:
            return
        # Values are looked up while the stream is still open.
        for metaItem in docInfo:
            print("[+] " + metaItem + ": " + docInfo[metaItem])