Пример #1
0
def _get_images_from_pdf(pdf_filename, resolution, verbose, delete_files,
        temp_dir, make_thumbs, thumb_size, thumb_dir, thumb_prefix, pool_count=1):
    """Split a PDF into one-page PDFs and dispatch workers to convert them.

    Every page of ``pdf_filename`` is written to
    ``<temp_dir>/document-page-<n>.pdf`` and its index is put on the
    module-level ``__pdf_queue``.  ``pool_count`` ``_pdf_converter_worker``
    tasks are then submitted to a process pool, and the call blocks until
    ``__pdf_texts`` holds one entry per page.

    ``temp_dir`` (and ``thumb_dir`` when ``make_thumbs`` is true) are
    created if missing.  ``resolution``, ``delete_files``, ``thumb_size``
    and ``thumb_prefix`` are only forwarded to the workers.  When
    ``verbose`` is true, progress messages are printed.

    Any exception is caught and printed instead of being propagated.

    NOTE(review): ``success`` is tracked but never returned in the visible
    code, so callers always receive ``None`` -- confirm whether a trailing
    ``return success`` was intended.
    """
    success = False
    try:
        if verbose:
            print("Splitting PDF into multiple pdf's for processing ...")

        # make sure there is a place to put our temporary pdfs
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)

        # make sure the thumbnail folder exists if we are going to make thumbs
        if make_thumbs and not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)

        # The reader pulls page data from the stream on demand, so the
        # source stays open for the whole split and is closed right after.
        with open(pdf_filename, "rb") as pdf_stream:
            inputpdf = PdfFileReader(pdf_stream)
            if inputpdf.getIsEncrypted():
                inputpdf.decrypt('')

            page_count = inputpdf.numPages
            if verbose:
                print("Writing out %i pages ..." % page_count)

            # create all of the temporary pdfs
            for i in range(page_count):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                filename = "{0}/document-page-{1}.pdf".format(temp_dir, i)
                with open(filename, "wb") as outputStream:
                    output.write(outputStream)
                __pdf_queue.put(i)

        if verbose:
            print("Dispatching pdf workers ...")

        # spin up our workers to convert the pdfs to images
        worker_args = [
            (x, resolution, verbose, delete_files,
             temp_dir, make_thumbs, thumb_size,
             thumb_dir, thumb_prefix)
            for x in range(pool_count)
        ]
        pool = Pool()
        try:
            pool.map_async(_pdf_converter_worker, worker_args)
        finally:
            # Nothing else is submitted; closing lets the worker processes
            # exit once their tasks have completed.
            pool.close()

        # wait until every page has produced a result
        while __pdf_texts.qsize() != page_count:
            time.sleep(.25)

        if verbose:
            print("Done converting PDF.")

        success = True

    except Exception as e:
        print(str(e))
Пример #2
0
    def OCR(self, fn, resolution=300, verbose=False, part=''):
        """OCR every page of the PDF ``fn`` and collect the recognised text.

        Each page is written to a temporary single-page PDF, rendered to a
        JPEG through PythonMagick and handed to ``image_to_string``.  The
        two temporary files are removed again even when a step fails.

        Args:
            fn: Path of the PDF to read.
            resolution: Density given to PythonMagick before the page is read.
            verbose: When true, print a progress marker for every page.
            part: Tag embedded in the temporary file names
                (``tmp-<part>-<page>.pdf`` / ``.jpg``).

        Returns:
            An ``OCRResult`` built from the concatenated text, its cleaned-up
            form, the number of pages and the list of ``OCRPage`` objects;
            ``False`` when the PDF is encrypted and cannot be decrypted with
            an empty password.
        """
        part = str(part)
        pagedata = []
        chunks = []

        # Pages are read from the stream on demand, so it stays open for the
        # whole loop and is closed deterministically afterwards.
        with open(fn, 'rb') as stream:
            pdf = PdfFileReader(stream)
            if pdf.getIsEncrypted() and not pdf.decrypt(''):
                return False

            for i, p in enumerate(pdf.pages, 1):
                if verbose:
                    print(' --- ' + str(i))

                # Temporary filenames for ImageMagick conversion
                pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
                pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

                try:
                    # Write this page out as a stand-alone PDF
                    output = PdfFileWriter()
                    output.addPage(p)
                    with open(pgfile, 'wb') as outputStream:
                        output.write(outputStream)

                    # Convert this page to a high-resolution JPEG
                    img = PythonMagick.Image()
                    img.density(str(resolution))
                    img.read(pgfile)
                    img.write(pgfilejpg)

                    # OCR the converted JPG; a fourth band is dropped first
                    im = Image.open(pgfilejpg)
                    bands = im.split()
                    if len(bands) == 4:
                        im = Image.merge('RGB', bands[:3])

                    t = image_to_string(im)
                finally:
                    # Cleanup, including after a failed conversion
                    for tmp_name in (pgfile, pgfilejpg):
                        if os.path.exists(tmp_name):
                            os.remove(tmp_name)

                # Add to data object
                pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
                chunks.append(t)

        # Produce the output data object
        text = ''.join(chunks)
        return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
Пример #3
0
	def OCR(self, fn, resolution=300, verbose=False, part=''):
		"""OCR every page of the PDF ``fn`` and collect the recognised text.

		Each page is written to a temporary single-page PDF, rendered to a
		JPEG through PythonMagick and handed to ``image_to_string``.  The
		two temporary files are removed again even when a step fails.

		Args:
			fn: Path of the PDF to read.
			resolution: Density given to PythonMagick before the page is read.
			verbose: When true, print a progress marker for every page.
			part: Tag embedded in the temporary file names
				(``tmp-<part>-<page>.pdf`` / ``.jpg``).

		Returns:
			An ``OCRResult`` built from the concatenated text, its cleaned-up
			form, the number of pages and the list of ``OCRPage`` objects;
			``False`` when the PDF is encrypted and cannot be decrypted with
			an empty password.
		"""
		part = str(part)
		pagedata = []
		chunks = []

		# Pages are read from the stream on demand, so it stays open for the
		# whole loop and is closed deterministically afterwards.
		with open(fn, 'rb') as stream:
			pdf = PdfFileReader(stream)
			if pdf.getIsEncrypted() and not pdf.decrypt(''):
				return False

			for i, p in enumerate(pdf.pages, 1):
				if verbose:
					print(' --- ' + str(i))

				# Temporary filenames for ImageMagick conversion
				pgfile = 'tmp-' + part + '-' + str(i) + '.pdf'
				pgfilejpg = 'tmp-' + part + '-' + str(i) + '.jpg'

				try:
					# Write this page out as a stand-alone PDF
					output = PdfFileWriter()
					output.addPage(p)
					with open(pgfile, 'wb') as outputStream:
						output.write(outputStream)

					# Convert this page to a high-resolution JPEG
					img = PythonMagick.Image()
					img.density(str(resolution))
					img.read(pgfile)
					img.write(pgfilejpg)

					# OCR the converted JPG; a fourth band is dropped first
					im = Image.open(pgfilejpg)
					bands = im.split()
					if len(bands) == 4:
						im = Image.merge('RGB', bands[:3])

					t = image_to_string(im)
				finally:
					# Cleanup, including after a failed conversion
					for tmp_name in (pgfile, pgfilejpg):
						if os.path.exists(tmp_name):
							os.remove(tmp_name)

				# Add to data object
				pagedata.append(OCRPage(i, t, self.OCRCleanup(t)))
				chunks.append(t)

		# Produce the output data object
		text = ''.join(chunks)
		return OCRResult(text, self.OCRCleanup(text), len(pagedata), pagedata)
Пример #4
0
class PdfBox(object):
    """Wrap a pyPdf ``PdfFileReader`` together with its metadata and a
    per-page cache of extracted text.

    The class-level values below are defaults; ``__init__`` overrides them
    per instance.
    """
    pdfReader = None
    pdfInfo = None
    currentpage = 0
    extractedPages = {}
    filepath = ""
    isencrypted = False
    password = ""
    author = ""
    title = ""
    subject = ""
    pages = 0
    initialized = False
    _stream = None

    def __init__(self, filepath, password=None):
        """Open ``filepath`` and, if it can be read, load its metadata.

        ``password`` is used to decrypt an encrypted document; when it is
        omitted an empty password is tried.  If decryption fails the object
        is still created, with ``initialized`` left ``False`` and no
        metadata loaded.
        """
        self.filepath = filepath
        # Always give the instance its own cache so that it can never fall
        # back to the dict shared through the class attribute.
        self.extractedPages = {}
        self._stream = open(filepath, "rb")
        try:
            self.pdfReader = PdfFileReader(self._stream)
        except Exception:
            # Do not leak the handle when the file is not a readable PDF.
            self.close()
            raise
        if password:
            self.password = password
        if self.initializePdf(self.password):
            self.pages = self.pdfReader.getNumPages()
            self.pdfInfo = self.pdfReader.getDocumentInfo()
            # getDocumentInfo() yields None for a PDF without an info
            # dictionary; keep the default metadata in that case.
            if self.pdfInfo is not None:
                self.author = self.pdfInfo.author
                self.title = self.pdfInfo.title
                self.subject = self.pdfInfo.subject

    def close(self):
        """Close the underlying file; pages cannot be read afterwards."""
        if self._stream is not None:
            self._stream.close()
            self._stream = None

    def initializePdf(self, password=None):
        """Make the reader usable, decrypting it when necessary.

        ``password`` falls back to ``self.password`` when it is ``None``.
        Returns ``True`` when the document is unencrypted or was decrypted
        successfully, ``False`` otherwise.
        """
        if password is None:
            password = self.password
        if not self.pdfReader.getIsEncrypted():
            self.initialized = True
            return True
        self.isencrypted = True
        if self.pdfReader.decrypt(password):
            self.initialized = True
            return True
        return False

    def getPage(self, pagenum):
        """Return the extracted text of page ``pagenum``.

        The page becomes ``currentpage``; its text is extracted on first
        access and served from ``extractedPages`` afterwards.
        """
        self.currentpage = pagenum
        if pagenum in self.extractedPages:
            return self.extractedPages[pagenum]
        text = self.pdfReader.getPage(pagenum).extractText()
        self.extractedPages[pagenum] = text
        return text
Пример #5
0
    def export_to_file(self, file_out, only_selected=False):
        """Write the pages listed in ``self.model`` to the PDF ``file_out``.

        Every document in ``self.pdfqueue`` is opened from its ``copyname``.
        For each model row the referenced page (``row[2]`` = 1-based file
        number, ``row[3]`` = 1-based page number) is copied, rotated by
        ``row[6]`` degrees and cropped by the fractions in ``row[7:11]``.

        Args:
            file_out: Path of the PDF to create.
            only_selected: When true, rows whose path is not in the icon
                view's current selection are skipped.

        Raises:
            Exception: If an input file is encrypted and cannot be decrypted
                with an empty password.
        """
        selection = self.iconview.get_selected_items()
        pdf_output = PdfFileWriter()
        pdf_input = []
        input_streams = []
        try:
            for pdfdoc in self.pdfqueue:
                stream = open(pdfdoc.copyname, 'rb')
                input_streams.append(stream)
                pdfdoc_inp = PdfFileReader(stream)
                if pdfdoc_inp.getIsEncrypted():
                    try:  # Workaround for lp:#355479
                        stat = pdfdoc_inp.decrypt('')
                    except Exception:
                        stat = 0
                    if stat != 1:
                        errmsg = _(
                            'File %s is encrypted.\n'
                            'Support for encrypted files has not been implemented yet.\n'
                            'File export failed.') % pdfdoc.filename
                        raise Exception(errmsg)
                    # FIXME: otherwise ask for password and decrypt file
                pdf_input.append(pdfdoc_inp)

            for row in self.model:

                if only_selected and row.path not in selection:
                    continue

                # add pages from input to output document
                nfile = row[2]
                npage = row[3]
                current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
                angle = row[6]
                # Read the page's own rotation before rotateClockwise() is
                # applied to it below.
                angle0 = current_page.get("/Rotate", 0)
                crop = [row[7], row[8], row[9], row[10]]
                if angle != 0:
                    current_page.rotateClockwise(angle)
                if crop != [0., 0., 0., 0.]:
                    rotate_times = int(round(((angle + angle0) % 360) / 90) % 4)
                    if rotate_times != 0:
                        # Re-map the crop sides to account for the rotation.
                        perm = [0, 2, 1, 3]
                        for it in range(rotate_times):
                            perm.append(perm.pop(0))
                        perm.insert(1, perm.pop(2))
                        crop = [crop[perm[side]] for side in range(4)]
                    (x1, y1) = [float(xy)
                                for xy in current_page.mediaBox.lowerLeft]
                    (x2, y2) = [float(xy)
                                for xy in current_page.mediaBox.upperRight]
                    width = x2 - x1
                    height = y2 - y1
                    x1_new = int(x1 + width * crop[0])
                    x2_new = int(x2 - width * crop[1])
                    y1_new = int(y1 + height * crop[3])
                    y2_new = int(y2 - height * crop[2])
                    current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                    current_page.mediaBox.upperRight = (x2_new, y2_new)

                pdf_output.addPage(current_page)

            # finally, write the assembled document; the with-block makes
            # sure the output is flushed and closed
            with open(file_out, 'wb') as out_stream:
                pdf_output.write(out_stream)
        finally:
            # Input streams are needed until the output has been written.
            for stream in input_streams:
                stream.close()
Пример #6
0
    def export_to_file(self, file_out, only_selected=False):
        """Write the pages listed in ``self.model`` to the PDF ``file_out``.

        Every document in ``self.pdfqueue`` is opened from its ``copyname``.
        For each model row the referenced page (``row[2]`` = 1-based file
        number, ``row[3]`` = 1-based page number) is copied, rotated by
        ``row[6]`` degrees and cropped by the fractions in ``row[7:11]``.

        Args:
            file_out: Path of the PDF to create.
            only_selected: When true, rows whose path is not in the icon
                view's current selection are skipped.

        Raises:
            Exception: If an input file is encrypted and cannot be decrypted
                with an empty password.
        """
        selection = self.iconview.get_selected_items()
        pdf_output = PdfFileWriter()
        pdf_input = []
        input_streams = []
        try:
            for pdfdoc in self.pdfqueue:
                stream = open(pdfdoc.copyname, 'rb')
                input_streams.append(stream)
                pdfdoc_inp = PdfFileReader(stream)
                if pdfdoc_inp.getIsEncrypted():
                    try:  # Workaround for lp:#355479
                        stat = pdfdoc_inp.decrypt('')
                    except Exception:
                        stat = 0
                    if stat != 1:
                        errmsg = _('File %s is encrypted.\n'
                                   'Support for encrypted files has not been implemented yet.\n'
                                   'File export failed.') % pdfdoc.filename
                        raise Exception(errmsg)
                    # FIXME: otherwise ask for password and decrypt file
                pdf_input.append(pdfdoc_inp)

            for row in self.model:

                if only_selected and row.path not in selection:
                    continue

                # add pages from input to output document
                nfile = row[2]
                npage = row[3]
                current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
                angle = row[6]
                # Read the page's own rotation before rotateClockwise() is
                # applied to it below.
                angle0 = current_page.get("/Rotate", 0)
                crop = [row[7], row[8], row[9], row[10]]
                if angle != 0:
                    current_page.rotateClockwise(angle)
                if crop != [0., 0., 0., 0.]:
                    # Nearest quarter turn; forced to int so range() below
                    # also works when the angle is stored as a float.
                    rotate_times = int(((angle + angle0) % 360 + 45) // 90) % 4
                    if rotate_times != 0:
                        # Re-map the crop sides to account for the rotation.
                        perm = [0, 2, 1, 3]
                        for it in range(rotate_times):
                            perm.append(perm.pop(0))
                        perm.insert(1, perm.pop(2))
                        crop = [crop[perm[side]] for side in range(4)]
                    (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                    (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                    width = x2 - x1
                    height = y2 - y1
                    x1_new = int(x1 + width * crop[0])
                    x2_new = int(x2 - width * crop[1])
                    y1_new = int(y1 + height * crop[3])
                    y2_new = int(y2 - height * crop[2])
                    current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                    current_page.mediaBox.upperRight = (x2_new, y2_new)

                pdf_output.addPage(current_page)

            # finally, write the assembled document; the with-block makes
            # sure the output is flushed and closed
            with open(file_out, 'wb') as out_stream:
                pdf_output.write(out_stream)
        finally:
            # Input streams are needed until the output has been written.
            for stream in input_streams:
                stream.close()
Пример #7
0
def employer_resume_book_create(request):
    """Build the file for a recruiter's resume book and attach it to the book.

    With a non-empty ``resume_book_id`` in the POST data that existing book
    is re-delivered under its stored name; otherwise the recruiter's
    undelivered book is fetched or created (surplus duplicates are deleted)
    and given a ``<user>_<timestamp>`` name.

    When ``delivery_format`` is ``"separate"`` a zip archive holding one
    PDF per visible student is written; otherwise a single PDF is written
    that starts with contents pages and continues with every resume.  The
    file is created under ``MEDIA_ROOT/employer/resumebook/`` and saved on
    ``resume_book.resume_book``.

    Returns an empty ``HttpResponse``.  Raises ``Http404`` when
    ``resume_book_id`` does not match a resume book.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and delete the duplicates.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        resume_book_name = "%s_%s" % (str(request.user), timestamp)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST["delivery_format"] == "separate":
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name)
        output = zipfile.ZipFile(file_name, "w")
        try:
            for student in resume_book.students.visible():
                # ZipFile.write() reads the file itself, so the resume does
                # not have to be opened here.
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                name = "%s %s (%s, %s).pdf" % (
                    student.first_name,
                    student.last_name,
                    student.graduation_year,
                    student.degree_program,
                )
                output.write(resume_path, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        # One queryset drives both the contents pages and the merge, so the
        # listing always matches the resumes that follow it.
        students = resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name")

        # Contents page header
        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        created_at = datetime.now()
        first_line = "Created on %s at %s" % (
            created_at.strftime("%m/%d/%Y"), created_at.strftime("%I:%M %p"))
        c.drawString(1 * cm, 28.5 * cm, first_line)
        c.drawString(1 * cm, 28 * cm, str(request.user.recruiter))
        c.drawString(1 * cm, 27.5 * cm, str(request.user.recruiter.employer))
        c.drawString(16 * cm, 28.5 * cm, "Created using Umeqo")
        c.drawString(8.5 * cm, 26.5 * cm, "Resume Book Contents")

        # Rows are 0.5cm apart starting at 25.5cm, so the 51st row sits at
        # 0.5cm and a 52nd would fall off the page.
        rows_per_page = 51
        pad_from_top = 0
        for student in students:
            row_y = (25.5 - pad_from_top * 0.5) * cm
            c.drawString(6.5 * cm, row_y, "%s %s" % (student.first_name, student.last_name))
            c.drawString(12 * cm, row_y, "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            # Start a new contents page every time the current one is full,
            # not only after the first 51 rows.
            if pad_from_top == rows_per_page:
                c.showPage()
                c.save()
                output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        c.showPage()
        c.save()
        output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))

        # Append every resume.  The input files must stay open until the
        # combined document has been written, then all of them are closed.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as outputStream:
                output.write(outputStream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
Пример #8
0
    def processFile(self, curr_file):
        """Read the metadata of the PDF ``curr_file`` into ``self.container``.

        Nothing happens unless ``".pdf"`` occurs in ``curr_file``.  The
        creation date, author, producer and modification date are taken
        from the document info dictionary and appended, with the shortened
        file name, as one row of
        ``[name, created, author, producer, modified, last_saved]``.
        Missing fields are shown as ``"-"``.

        This is best-effort: a file without document info, or any error
        while reading it, adds no row.  Always returns ``None``.
        """
        if ".pdf" not in curr_file:
            return

        def format_pdf_date(raw):
            # "D:YYYYMMDDHHMM..." -> "MM/DD/YYYY HH:MM AM/PM"
            data = raw.strip("D:|'")
            clock = time.strftime(
                "%I:%M %p",
                time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
            return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

        author = '-'
        created = '-'
        producer = '-'
        modded = '-'
        last_saved = '-'
        try:
            # Info values may be resolved from the stream on access, so the
            # file stays open until every field has been read.
            with open(curr_file, 'rb') as stream:
                pdfFile = PdfFileReader(stream)
                if pdfFile.getIsEncrypted():
                    pdfFile.decrypt('')
                docInfo = pdfFile.getDocumentInfo()
                if not docInfo:
                    return
                if "/CreationDate" in docInfo:
                    created = format_pdf_date(docInfo["/CreationDate"])
                if "/Author" in docInfo:
                    author = docInfo["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"
                if "/Producer" in docInfo:
                    # str.strip() removes a *set of characters* and used to
                    # eat letters such as a leading "i" or "W"; remove the
                    # literal "(Windows)" marker instead.
                    producer = docInfo["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # collapse runs of spaces into a single space
                    producer = re.sub(r' {2,}', ' ', producer)
                if "/ModDate" in docInfo:
                    modded = format_pdf_date(docInfo["/ModDate"])

            # strips the directory part (and any backslashes) off the name
            curr_file = curr_file[curr_file.rfind("/") + 1:].replace("\\", "")

            # trim information if it's too long; the shortened name is
            # 15 + 3 + 13 = 31 characters, so shorter names are kept whole
            if len(curr_file) > 31:
                curr_file = curr_file[:15] + "..." + curr_file[-13:]
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            # appends each piece of information. output will show ONLY if
            # at least ONE file has data in a column
            self.container.append([
                " | " + curr_file, created, author, producer, modded,
                last_saved
            ])
        except Exception:
            # Deliberately best-effort: unreadable files are skipped.
            return
Пример #9
0
def employer_resume_book_create(request):
    """Build the file for a recruiter's resume book and attach it to the book.

    With a non-empty ``resume_book_id`` in the POST data that existing book
    is re-delivered under its stored name; otherwise the recruiter's
    undelivered book is fetched or created (surplus duplicates are deleted)
    and given a ``<user>_<timestamp>`` name.

    When ``delivery_format`` is ``'separate'`` a zip archive holding one
    PDF per visible student is written; otherwise a single PDF is written
    that starts with contents pages and continues with every resume.  The
    file is created under ``MEDIA_ROOT/employer/resumebook/`` and saved on
    ``resume_book.resume_book``.

    Returns an empty ``HttpResponse``.  Raises ``Http404`` when
    ``resume_book_id`` does not match a resume book.
    """
    resume_book_id = request.POST.get("resume_book_id")
    redelivering = bool(resume_book_id)
    if redelivering:
        try:
            resume_book = ResumeBook.objects.get(id=resume_book_id)
        except ResumeBook.DoesNotExist:
            raise Http404("No resume book exists with id of %s" % resume_book_id)
    else:
        try:
            resume_book, created = ResumeBook.objects.get_or_create(
                recruiter=request.user.recruiter, delivered=False)
        except ResumeBook.MultipleObjectsReturned:
            # Keep the first undelivered book and delete the duplicates.
            resume_books = ResumeBook.objects.filter(
                recruiter=request.user.recruiter, delivered=False)
            for i, rb in enumerate(resume_books):
                if i != 0:
                    rb.delete()
                else:
                    resume_book = rb

    if redelivering:
        resume_book_name = resume_book.name
    else:
        timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        resume_book_name = "%s_%s" % (str(request.user), timestamp)
        resume_book.name = resume_book_name
        resume_book.save()

    file_path = "%semployer/resumebook/" % (s.MEDIA_ROOT,)
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    if request.POST['delivery_format'] == 'separate':
        # Create the zip file
        file_name = "%s%s" % (file_path, resume_book_name)
        output = zipfile.ZipFile(file_name, 'w')
        try:
            for student in resume_book.students.visible():
                # ZipFile.write() reads the file itself, so the resume does
                # not have to be opened here.
                resume_path = "%s%s" % (s.MEDIA_ROOT, str(student.resume))
                name = "%s %s (%s, %s).pdf" % (
                    student.first_name, student.last_name,
                    student.graduation_year, student.degree_program)
                output.write(resume_path, name, zipfile.ZIP_DEFLATED)
        finally:
            output.close()
    else:
        output = PdfFileWriter()
        file_name = "%s%s.pdf" % (file_path, resume_book_name)
        # One queryset drives both the contents pages and the merge, so the
        # listing always matches the resumes that follow it.
        students = resume_book.students.visible().order_by(
            "graduation_year", "first_name", "last_name")

        # Contents page header
        report_buffer = cStringIO.StringIO()
        c = Canvas(report_buffer)
        created_at = datetime.now()
        first_line = "Created on %s at %s" % (
            created_at.strftime('%m/%d/%Y'), created_at.strftime('%I:%M %p'))
        c.drawString(1*cm, 28.5*cm, first_line)
        c.drawString(1*cm, 28*cm, str(request.user.recruiter))
        c.drawString(1*cm, 27.5*cm, str(request.user.recruiter.employer))
        c.drawString(16*cm, 28.5*cm, "Created using Umeqo")
        c.drawString(8.5*cm, 26.5*cm, "Resume Book Contents")

        # Rows are 0.5cm apart starting at 25.5cm, so the 51st row sits at
        # 0.5cm and a 52nd would fall off the page.
        rows_per_page = 51
        pad_from_top = 0
        for student in students:
            row_y = (25.5 - pad_from_top*.5)*cm
            c.drawString(6.5*cm, row_y, "%s %s" % (student.first_name, student.last_name))
            c.drawString(12*cm, row_y, "%s, %s" % (student.graduation_year, student.degree_program))
            pad_from_top += 1
            # Start a new contents page every time the current one is full,
            # not only after the first 51 rows.
            if pad_from_top == rows_per_page:
                c.showPage()
                c.save()
                output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))
                report_buffer = cStringIO.StringIO()
                c = Canvas(report_buffer)
                pad_from_top = 0
        c.showPage()
        c.save()
        output.addPage(PdfFileReader(cStringIO.StringIO(report_buffer.getvalue())).getPage(0))

        # Append every resume.  The input files must stay open until the
        # combined document has been written, then all of them are closed.
        resume_files = []
        try:
            for student in students:
                resume_file = open("%s%s" % (s.MEDIA_ROOT, str(student.resume)), "rb")
                resume_files.append(resume_file)
                resume = PdfFileReader(resume_file)
                if resume.getIsEncrypted():
                    resume.decrypt("")
                for page in range(resume.getNumPages()):
                    output.addPage(resume.getPage(page))
            with open(file_name, "wb") as outputStream:
                output.write(outputStream)
        finally:
            for resume_file in resume_files:
                resume_file.close()

    with open(file_name, "rb") as resume_book_contents:
        resume_book.resume_book.save(file_name, File(resume_book_contents))
    return HttpResponse()
Пример #10
0
	def processFile(self, curr_file):
		"""Read the metadata of the PDF ``curr_file`` into ``self.container``.

		Nothing happens unless ``".pdf"`` occurs in ``curr_file``.  The
		creation date, author, producer and modification date are taken
		from the document info dictionary and appended, with the shortened
		file name, as one row of
		``[name, created, author, producer, modified, last_saved]``.
		Missing fields are shown as ``"-"``.

		This is best-effort: a file without document info, or any error
		while reading it, adds no row.  Always returns ``None``.
		"""
		if ".pdf" not in curr_file:
			return

		def format_pdf_date(raw):
			# "D:YYYYMMDDHHMM..." -> "MM/DD/YYYY HH:MM AM/PM"
			data = raw.strip("D:|'")
			clock = time.strftime("%I:%M %p", time.strptime(data[8:10] + ":" + data[10:12], "%H:%M"))
			return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

		author = '-'
		created = '-'
		producer = '-'
		modded = '-'
		last_saved = '-'
		try:
			# Info values may be resolved from the stream on access, so the
			# file stays open until every field has been read.
			with open(curr_file, 'rb') as stream:
				pdfFile = PdfFileReader(stream)
				if pdfFile.getIsEncrypted():
					pdfFile.decrypt('')
				docInfo = pdfFile.getDocumentInfo()
				if not docInfo:
					return
				if "/CreationDate" in docInfo:
					created = format_pdf_date(docInfo["/CreationDate"])
				if "/Author" in docInfo:
					author = docInfo["/Author"] + " "
					if len(author) <= 1:
						author = "-"
				if "/Producer" in docInfo:
					# str.strip() removes a *set of characters* and used to
					# eat letters such as a leading "i" or "W"; remove the
					# literal "(Windows)" marker instead.
					producer = docInfo["/Producer"].replace("(Windows)", "")
					producer = re.sub(r'[^\w]', ' ', producer)
					if len(producer) == 0:
						producer = "-"
					# collapse runs of spaces into a single space
					producer = re.sub(r' {2,}', ' ', producer)
				if "/ModDate" in docInfo:
					modded = format_pdf_date(docInfo["/ModDate"])

			# strips the directory part (and any backslashes) off the name
			curr_file = curr_file[curr_file.rfind("/") + 1:].replace("\\", "")

			# trim information if it's too long; the shortened name is
			# 15 + 3 + 13 = 31 characters, so shorter names are kept whole
			if len(curr_file) > 31:
				curr_file = curr_file[:15] + "..." + curr_file[-13:]
			if len(producer) > 30:
				producer = producer[:20] + " [snipped] "
			if len(author) > 20:
				author = author[:20] + " [snipped] "

			# appends each piece of information. output will show ONLY if
			# at least ONE file has data in a column
			self.container.append([" | " + curr_file, created, author, producer, modded, last_saved])
		except Exception:
			# Deliberately best-effort: unreadable files are skipped.
			return