예제 #1
1
def convert_filetype(filename_ori, filename_res, filetype):
    """Re-encode an image file in another format.

    :param filename_ori: Path of the image to read.
    :param filename_res: Path the converted image is written to.
    :param filetype: Target format name handed to ``Image.convert``
        (for example ``'png'``).
    """
    # Open the source inside a ``with`` block too, so its resources are
    # released as soon as the converted copy has been written.
    with Image(filename=filename_ori) as original:
        with original.convert(filetype) as converted:
            converted.save(filename=filename_res)
예제 #2
0
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: The rendered page; the caller owns (and should close) it.
    """

    check_dependencies(__optional_dependencies__[PDF])
    # Import libraries within this function so as to avoid
    # import-time dependence
    import PyPDF2

    # TODO: When we start using this again, document which
    # system-level libraries are required.
    from wand.image import Image

    # Copy the requested page into a one-page in-memory PDF.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    img = Image(file=pdf_bytes, resolution=resolution)
    # ``Image.convert`` returns a converted copy and leaves the receiver
    # untouched, so calling it and dropping the result did nothing.
    # Setting ``format`` changes the image that is actually returned.
    img.format = "png"

    return img
예제 #3
0
def getannots(pdfannots, pageno, fh):
    """Build ``Annotation`` records for one page and export region crops.

    For every annotation whose subtype is listed in ``ANNOT_SUBTYPES`` an
    ``Annotation`` is created and the module-level ``index`` counter is
    advanced. "Ink" and "Square" annotations additionally have their
    rectangle cropped out of the page and saved as
    ``./images/<index>.png``.

    :param pdfannots: Iterable of annotation dictionaries (objects with
        ``.get``) belonging to the page.
    :param pageno: Zero-based page number passed to ``getPage``.
    :param fh: Open PDF file handle given to ``PdfFileReader``.
    :returns: List of ``Annotation`` objects in input order.
    """
    global index

    def getcolour(colour):
        # Map the known highlighter RGB triples to a colour name.
        if (colour == [1.0, 0.90196, 0.0]):
            return "yellow"
        elif (colour == [0.26667, 0.78431, 0.96078]):
            return "blue"
        elif (colour == [0.92549, 0.0, 0.54902]):
            return "pink"
        elif (colour == [0.90196, 0.10588, 0.10588]):
            return "red"
        else:
            return "none"

    annots = []
    input1 = PdfFileReader(fh)
    targetPage = input1.getPage(pageno)
    newpath = "./images/"
    for pa in pdfannots:
        subtype = pa.get('Subtype')
        # An annotation without a subtype cannot be classified; previously
        # it slipped past this filter and crashed on ``subtype.name``.
        if subtype is None or subtype.name not in ANNOT_SUBTYPES:
            continue
        print(subtype)
        if subtype.name in ("Ink", "Square"):
            print("yes")
            print(type(pa.get('Rect')))
            coord = pa.get('Rect')
            targetPage.cropBox.lowerLeft = (coord[0], coord[1])
            targetPage.trimBox.lowerLeft = (coord[0], coord[1])
            targetPage.mediaBox.lowerLeft = (coord[0], coord[1])
            targetPage.cropBox.upperRight = (coord[2], coord[3])
            targetPage.trimBox.upperRight = (coord[2], coord[3])
            targetPage.mediaBox.upperRight = (coord[2], coord[3])
            # Use a fresh writer per crop: a shared writer keeps every page
            # added so far, so later exports would contain several pages.
            output = PdfFileWriter()
            output.addPage(targetPage)
            pdf_bytes = io.BytesIO()
            output.write(pdf_bytes)
            pdf_bytes.seek(0)
            if not os.path.exists(newpath):
                os.makedirs(newpath)
            with Image(file=pdf_bytes, resolution=300) as img:
                # ``convert`` only returns a copy; set the format in place.
                img.format = "png"
                img.save(filename=newpath + str(index) + ".png")

        colour = pa.get('C')
        contents = pa.get('Contents')
        if contents is not None:
            contents = str(contents, 'iso8859-15')  # 'utf-8'
            # Normalise line endings to "\n".
            contents = contents.replace('\r\n', '\n').replace('\r', '\n')
        a = Annotation(index, pageno, subtype.name.lower(),
                       pa.get('QuadPoints'), pa.get('Rect'), contents,
                       getcolour(colour))
        annots.append(a)

        index += 1

    return annots
예제 #4
0
def pdf_page_to_jpg(src_pdf, pagenum=0, resolution=72):
    """Render one PDF page as a JPEG-format Wand image.

    :param src_pdf: PDF reader object providing ``getPage``.
    :param int pagenum: Page number to take.
    :param int resolution: Rendering resolution in DPI.
    :returns: The rendered page; the caller owns (and should close) it.
    """
    # Copy the requested page into a one-page in-memory PDF.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    img = WandImage(file=pdf_bytes, resolution=resolution)
    # ``convert`` returns a converted copy, so its discarded result had no
    # effect on ``img``; set the format on the returned image instead.
    img.format = "jpg"

    return img
예제 #5
0
 def process_page(self, f, page_number):
     """Render a page from ``f``, post-process it and return it as bytes.

     :param f: Open file object holding the page; it is closed here.
     :param page_number: Page number, checked against
         ``self.pages_with_signatures``.
     :returns: ``io.BytesIO`` wrapping the processed image blob.
     """
     page = Image(file=f, resolution=130)
     f.close()
     # Stamp a signature only when signatures are enabled for this page.
     if self.signatures and page_number in self.pages_with_signatures:
         page = self.add_signature(page)
     # page.noise("multiplicative_gaussian", attenuate=-0.15)
     page.despeckle()
     page.sharpen(radius=8, sigma=4)
     page.gamma(1.3)
     # Small random tilt on a white background. With true division the
     # angle lies in [-0.5, 0.5]; under Python 2 integer division it would
     # be an integer - TODO confirm the interpreter version.
     page.rotate(randint(-100, 100) / 200,
                 background=Color('rgb(255, 255, 255)'))
     # NOTE(review): Wand's ``convert`` returns a converted copy, so this
     # discarded call appears to have no effect on ``page`` - confirm
     # whether an in-place format/colorspace change was intended.
     page.convert('RGB')
     file_like = io.BytesIO(page.make_blob())
     return file_like
예제 #6
0
def convert_pdf_to_jpg(tablename, nrows):
    """Render ``figures/<tablename>_OUTPUT.pdf`` to a cropped JPEG.

    The PDF is rendered at 500 DPI, written next to it as
    ``<tablename>_OUTPUT.jpeg``, cropped to the table area and the source
    PDF is then deleted.

    :param tablename: Base name of the table files inside ``figures``.
    :param nrows: Number of table rows; determines the crop height.
    """
    # Import the submodule explicitly: a bare ``import PIL`` does not
    # guarantee that ``PIL.Image`` has been loaded.
    from PIL import Image as PILImage

    print("Converting")
    # os.path.join yields the same backslash paths on Windows and also
    # works on other platforms.
    path = os.path.join(os.getcwd(), 'figures')
    filename = os.path.join(path, tablename + '_OUTPUT.pdf')
    imgname = os.path.join(path, tablename + '_OUTPUT.jpeg')

    with Image(filename=filename, resolution=500) as img:
        with img.convert('jpeg') as converted:
            converted.save(filename=imgname)

    # Crop the rendered image; the box height grows with nrows.
    box = (360, 490, 3733, 510 + 95 * nrows)
    with PILImage.open(imgname) as rendered:
        clip = rendered.crop(box)
        clip.save(imgname)
    os.remove(filename)
예제 #7
0
def ocr(filename):
    """Run line-level OCR on a PDF or raster image.

    Uses the module-level ``tool`` and ``lang`` objects.

    :param filename: Path to a ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png``.
    :returns: ``(lines, width, height)`` where ``lines`` are the recognised
        line boxes that pass the size/content filters and ``width`` /
        ``height`` are the pixel dimensions of the analysed image.
    :raises ValueError: If the file extension is not supported.
    """
    _, ext = os.path.splitext(filename)
    ext = ext.lower()
    if ext == '.pdf':
        # Render the PDF to JPEG bytes and hand those to PIL.
        with Image(filename=filename, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                blob = image_jpeg.make_blob()
        img = PImage.open(io.BytesIO(blob))
    elif ext in ('.jpg', '.jpeg', '.png'):
        img = PImage.open(filename)
    else:
        # Previously this fell through and failed later with an unbound
        # ``img`` variable.
        raise ValueError("Unsupported file type for OCR: %r" % (filename,))

    word_boxes = tool.image_to_string(
        img,
        lang=lang,
        builder=pyocr.builders.LineBoxBuilder(tesseract_layout=1)
    )
    X, Y = img.size
    res = []
    for line in word_boxes:
        # Drop boxes narrower than 1% of the image width.
        if line.position[1][0] - line.position[0][0] < 0.01 * X:
            continue
        # NOTE(review): the height threshold is scaled by the width X,
        # not the height Y - confirm this is intended.
        if line.position[1][1] - line.position[0][1] < 0.003 * X:
            continue
        if line.content.strip() == '':
            continue
        res.append(line)
    return res, X, Y
예제 #8
0
파일: views.py 프로젝트: ngjonnathan/SIGPAE
def extract_text_from_image(path):
    """OCR every page of the document at ``path`` and return the text.

    Uses the first available pyocr tool and the third language in that
    tool's language list.

    :param path: Path of the document to render (at 200 DPI) and OCR.
    :returns: The recognised text of all pages concatenated in order.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[2]

    req_image = []

    # Render each page to JPEG bytes; the Wand images are closed as soon
    # as their blobs have been extracted.
    with Image(filename=path, resolution=200) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    final_text = [
        tool.image_to_string(Pi.open(io.BytesIO(img)),
                             lang=lang,
                             builder=pyocr.builders.TextBuilder())
        for img in req_image
    ]

    return ''.join(final_text)
예제 #9
0
def image_opener(key):
    """Handler to locate file based on key.

    .. note::
        If the file is a PDF then only the first page will be
        returned as an image.

    :param key: A key encoded in the format "<bucket>:<version>:<object_key>".
    :returns: A file-like object positioned at its start.
    """
    if hasattr(g, 'obj'):
        obj = g.obj
    else:
        obj = protect_api(key)

    fp = obj.file.storage().open('rb')

    # If ImageMagick with Wand is installed, extract first page
    # for PDF/text.
    if HAS_IMAGEMAGICK and obj.mimetype in ['application/pdf', 'text/plain']:
        tempfile_ = tempfile.TemporaryFile()
        try:
            # The stream must go through the ``file`` keyword: the first
            # positional parameter of Wand's Image is ``image``, which
            # expects another image object.
            with Image(file=fp) as document:
                with Image(document.sequence[0]) as first_page:
                    with first_page.convert(format='png') as converted:
                        converted.save(file=tempfile_)
        except Exception:
            tempfile_.close()
            raise
        finally:
            # The source stream is no longer needed once rendered.
            fp.close()
        # Rewind so that readers start at the beginning of the PNG.
        tempfile_.seek(0)
        return tempfile_
    return fp
예제 #10
0
def pdf_to_img(infile, newname):
    """Convert a PDF file into JPG image file(s).

    The PDF is rendered at 300 DPI, its pages are passed to
    ``join_img_list`` and the result is handed to ``img_to_save``.

    :param infile: Path and name of the PDF file.
    :param newname: Output name forwarded to ``img_to_save``.
    """
    # Keep both Wand images open until saving is done, then release them.
    with Image(filename=infile, resolution=300) as image_pdf:
        with image_pdf.convert('jpg') as image_jpeg:
            img_list = join_img_list(image_jpeg)
            img_to_save(img_list, infile, newname)
예제 #11
0
def main():
    """OCR ``./pdf_file/stackoverflow.pdf`` and print the text per page.

    Picks the first available pyocr tool and its first language, renders
    the PDF at 400 DPI, saves a JPEG copy under ``./pdf2img/`` and prints
    the list of recognised page texts.
    """
    available = pyocr.get_available_tools()
    if not available:
        print("No OCR tool found")
        sys.exit(1)
    tool = available[0]
    print("Will use tool '%s'" % (tool.get_name()))

    languages = tool.get_available_languages()
    print("Available languages: %s" % ", ".join(languages))
    lang = languages[0]
    print("Will use lang '%s'" % (lang))

    source = Image(filename="./pdf_file/stackoverflow.pdf", resolution=400)
    rendered = source.convert('jpeg')
    rendered.save(filename='./pdf2img/stackoverflow.jpeg')

    # One JPEG blob per page, then one OCR pass per blob.
    page_blobs = [
        Image(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]
    page_texts = [
        tool.image_to_string(PI.open(io.BytesIO(blob)),
                             lang=lang,
                             builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
    print(page_texts)
예제 #12
0
def get_text(filepath):
    '''
    Uses OCR to get the text from a pdf.
    Returns a list where each element is the text of a single page of that pdf.
    ex: final_text = get_text('/Users/matthewwong/dsi-capstone/PDFs/decrypted/
        A1/Certifications/Certificates Flood Cert.pdf')

    The first available pyocr tool and its first language are used; the
    pdf is rendered at 300 DPI.
    '''
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[0]

    req_image = []

    # Release every Wand image once its JPEG bytes have been taken.
    with Image(filename=filepath, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    return [
        tool.image_to_string(
            PI.open(io.BytesIO(img)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        for img in req_image
    ]
 def getPageAsPng(self, pageNumber, width=640):
     """Return the given page as a png wand.image.Image.

     :param pageNumber: Page number, counted from 1.
     :param width: Target width in pixels; the height is scaled to keep
         the aspect ratio.
     :returns: A new PNG-format image owned by the caller.
     """
     # ``self.pdf.sequence`` is indexed from 0 while page numbers start
     # from 1. The working copy is closed once the PNG copy exists.
     with Image(image=self.pdf.sequence[pageNumber - 1]) as img:
         img.resize(width, int(width * img.height / (1.0 * img.width)))
         return img.convert('png')
예제 #14
0
def convert(trimmed):
    """OCR each PDF in ``trimmed`` and write the text next to it.

    For every path the temp files are cleaned, the PDF is rendered at
    300 DPI, each page is OCR'd, and the combined text is written to the
    same path with ``.pdf`` replaced by ``.txt``. Files that cannot be
    opened are reported and skipped.

    :param trimmed: Sequence of PDF file paths.
    """
    print('Converting...')
    tool = pyocr.get_available_tools()[0]  # tesseract
    # chi_tra; position depends on which languages are installed
    lang = tool.get_available_languages()[17]
    for n, filepath in enumerate(trimmed):
        clean_temp_files()
        print('done cleaning.')
        print('\r' + str(n) + '/' + str(len(trimmed)) + ' ', end='')
        try:
            image_pdf = Image(filename=filepath, resolution=300)
        except Exception as e:
            print('\rException: \n' + repr(e))
            continue
        txt_name = filepath.replace('.pdf', '.txt')
        image_jpeg = image_pdf.convert('jpeg')
        page_blobs = [
            Image(image=frame).make_blob('jpeg')
            for frame in image_jpeg.sequence
        ]
        final_text = "".join(
            tool.image_to_string(PI.open(io.BytesIO(blob)),
                                 lang=lang,
                                 builder=pyocr.builders.TextBuilder())
            for blob in page_blobs
        )
        with open(txt_name, 'w', encoding='utf-8') as f:
            print(txt_name)
            f.write(final_text)
0
    def single_ocr2txt__(self, src_path, ocr_name, target_path):
        """OCR a PDF and append its text to a ``.txt`` file.

        :param src_path: Path of the PDF to read (rendered at 300 DPI).
        :param ocr_name: File name of the PDF; ``.pdf`` is replaced by
            ``.txt`` to form the output file name.
        :param target_path: Directory the text file is written into.
        """
        req_image = []

        # Split the PDF into per-page JPEG blobs.
        with Image(filename=src_path, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        req_image.append(img_page.make_blob('jpeg'))

        # Extract the text of every page image.
        final_text = [
            pytesseract.image_to_string(PI.open(io.BytesIO(img)),
                                        lang='eng')
            for img in req_image
        ]

        ocr_name = ocr_name.replace('.pdf', '.txt')
        file_path = os.path.join(target_path, ocr_name)

        # Create the directory that will actually contain the output
        # file. Previously only the parent of ``target_path`` was
        # created, so a missing ``target_path`` made ``open`` fail.
        out_dir = os.path.dirname(file_path)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # Store the text.
        with open(file_path, 'a', encoding='utf-8') as f:
            f.writelines(final_text)
예제 #16
0
    def pdf_run(self, image_file_name, filename, path):
        """Render each page of a PDF to a grayscale PNG file.

        Pages are rendered at 300 DPI, converted to 8-bit grayscale,
        level-adjusted and saved as ``<path>/saram_<filename><page>.png``.

        :param image_file_name: Path of the PDF to render.
        :param filename: Name fragment used in the output file names.
        :param path: Output directory.
        """
        image_pdf = Image(filename=image_file_name, resolution=300)
        image_page = image_pdf.convert("png")

        page = 1
        process_start = time.time()

        for img in image_page.sequence:
            # Start the per-page timer before the work; it used to start
            # after saving, so the reported time was always about zero.
            page_start = time.time()

            with Image(image=img) as img_per_page:
                img_per_page.type = 'grayscale'
                img_per_page.depth = 8
                img_per_page.density = 300

                try:
                    img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                       channel=None)
                except AttributeError as e:
                    print("Update Wand library: %s" % e)

                img_buf = path + '/' + "saram_" + filename + str(page) + ".png"

                # NOTE(review): this makes the output directory
                # world-writable on every page - confirm it is required.
                os.chmod(path, 0o777)
                img_per_page.save(filename=img_buf)

                page_elaboration = time.time() - page_start
                print("page %s - size %s - process %2d sec."
                      % (page, img_per_page.size, page_elaboration))

            page += 1
            img.destroy()

        process_end = time.time() - process_start
        print("Total elaboration time: %s" % process_end)
예제 #17
0
    def process(self, pdf_filename, pdf_resolution, imageformat,
                do_orientation):
        """OCR every page of a PDF and return the combined text.

        Each page is turned into an 8-bit grayscale image, level-adjusted,
        written to ``buffer.png`` and passed to ``self.image2txt_pyocr``.

        :param pdf_filename: Path of the PDF to read.
        :param pdf_resolution: Rendering resolution, also set as density.
        :param imageformat: Format used for conversion and for the blob
            given to the OCR step.
        :param do_orientation: Forwarded to ``self.image2txt_pyocr``.
        :returns: Page texts, each followed by a newline, concatenated.
        """
        source = Image(filename=pdf_filename, resolution=pdf_resolution)
        rendered = source.convert(imageformat)

        chunks = []
        process_start = time.time()
        for page, frame in enumerate(rendered.sequence, start=1):
            page_image = Image(image=frame)
            page_image.type = 'grayscale'
            page_image.depth = 8
            page_image.density = pdf_resolution
            try:
                page_image.level(black=0.3, white=1.0, gamma=1.5,
                                 channel=None)
            except AttributeError as e:
                print("Update Wand library: %s" % e)
            page_image.save(filename="buffer.png")

            page_start = time.time()
            txt = self.image2txt_pyocr(page_image.make_blob(imageformat),
                                       do_orientation)
            page_elaboration = time.time() - page_start
            print("page %s - size %s - process %2d sec. - text %s" %
                  (page, page_image.size, page_elaboration, len(txt)))
            chunks.append("%s\n" % txt)
            frame.destroy()

        process_end = time.time() - process_start
        print("Total elaboration time: %s" % process_end)

        return "".join(chunks)
예제 #18
0
def pdf_to_text(filename):
    """OCR a PDF with the ``tesseract`` command and print the text.

    All pages are rendered at 300 DPI and stacked vertically into one PNG,
    which is then passed to ``tesseract``. Prints ``ERROR`` if the OCR
    step fails.

    :param filename: Path of the PDF to read.
    """
    with IM(filename=filename, resolution=300) as pdf:
        pages = len(pdf.sequence)
        with IM(width=pdf.width, height=pdf.height * pages) as image:
            # ``range`` and ``print(...)`` work on Python 2 and 3 alike.
            for i in range(pages):
                image.composite(
                    pdf.sequence[i],
                    top=pdf.height * i,
                    left=0
                )
            img = image.convert('png')

    with img, tempfile.NamedTemporaryFile(prefix="tess_") as temp_file:
        img.save(filename=temp_file.name)

        out_base = None
        try:
            # Only the unique name is needed as tesseract's output base.
            temp = tempfile.NamedTemporaryFile(delete=False)
            temp.close()
            out_base = temp.name
            process = subprocess.Popen(
                ['tesseract', temp_file.name, out_base],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            process.communicate()

            with open(out_base + '.txt', 'r') as handle:
                contents = handle.read()
            print(contents)
        except Exception:
            print("ERROR")
        finally:
            # Remove the scratch files on failure as well as on success.
            if out_base is not None:
                for leftover in (out_base + '.txt', out_base):
                    if os.path.exists(leftover):
                        os.remove(leftover)
def extract_pages(filename):
    """Return the second-to-last and the last page of a PDF as images.

    The PDF is rendered at 300 DPI and converted to JPEG first.

    :param filename: Path of the PDF to read.
    :returns: Tuple ``(second_to_last_page, last_page)``.
    """
    source = Image(filename=filename, resolution=300)
    rendered = source.convert('jpeg')
    frames = rendered.sequence
    last_page = frames[-1]
    previous_page = frames[-2]

    return previous_page, last_page
예제 #20
0
def pdf_to_jpg(PDF_FILE_NAME):
    """Render every page of a PDF to a JPG inside a sibling directory.

    Pages are written as ``<base>_<n>.jpg`` (n counted from 1) and moved
    into the directory ``<base>_图片目录``, which is emptied first if it
    already exists. ``<base>`` is the PDF path without its extension.

    :param PDF_FILE_NAME: Path of the PDF to convert.
    """
    # splitext removes only the final extension. ``split('.')[0]`` cut at
    # the first dot anywhere in the path, so "./a.pdf" produced an empty
    # base name.
    base_name = os.path.splitext(PDF_FILE_NAME)[0]
    JPG_FILE_DIR = base_name + u'_图片目录'
    # Start from an empty image directory: remove an existing one first.
    if os.path.exists(JPG_FILE_DIR):
        shutil.rmtree(JPG_FILE_DIR)
    os.makedirs(JPG_FILE_DIR)

    # Read the PDF. A higher resolution makes opening slower; adjust as
    # needed.
    req_image = []
    with Image(filename=PDF_FILE_NAME, resolution=300) as image_pdf:
        with image_pdf.convert('jpg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpg'))

    # Write each page blob to its own file and move it into the directory.
    for jpg_file_num, img in enumerate(req_image, start=1):
        JPG_FILE_NAME = base_name + '_' + str(jpg_file_num) + '.jpg'
        with open(JPG_FILE_NAME, 'wb') as ff:
            ff.write(img)
        shutil.move(JPG_FILE_NAME, JPG_FILE_DIR)
예제 #21
0
    def dt_pdf_image_to_txt(self, filename, codec='utf-8'):
        """Convert an image-based PDF file to text.

        Each page is rendered at 300 DPI to JPEG and passed to
        ``self.dt_image_to_text``; non-empty page texts are concatenated.

        :param filename: Input file name.
        :param codec: Currently unused by this method.
        :returns: The text of the file, or ``""`` if anything failed
            (the failure is logged).
        """
        import logging

        document = ""
        try:
            filename = filename.encode('utf-8')
            texts = []
            with Image(filename=filename, resolution=300) as image_pdf:
                with image_pdf.convert('jpeg') as image_jpeg:
                    for img in image_jpeg.sequence:
                        with Image(image=img) as img_page:
                            blob = img_page.make_blob('jpeg')
                        txt = self.dt_image_to_text(None, blob)
                        if txt:
                            texts.append(txt)
            document = ''.join(texts)
        except Exception:
            # Still best-effort (callers get ""), but the failure is no
            # longer swallowed silently.
            logging.getLogger(__name__).exception(
                "Could not extract text from %r", filename)

        return document
예제 #22
0
def image_opener(key):
    """Handler to locate file based on key.

    .. note::
        If the file is a PDF then only the first page will be
        returned as an image.

    :param key: A key encoded in the format "<recid>:<filename>".
    :returns: A file-like object positioned at its start, or ``None``
        when the lookup raises ``KeyError``.
    """
    key_parts = key.split(":")
    assert len(key_parts) == 2

    recid = key_parts[0]
    filename = key_parts[1]
    service = current_rdm_records.records_service
    try:
        file_item = service.files.get_file_content(g.identity, recid, filename)
    except KeyError:
        return None  # FIXME: throw custom exception `FileNotFound`?

    fp = file_item.get_stream('rb')

    # If ImageMagick with Wand is installed, extract first page
    # for PDF/text.
    pages_mimetypes = ['application/pdf', 'text/plain']
    if HAS_IMAGEMAGICK and file_item.data["mimetype"] in pages_mimetypes:
        tempfile_ = tempfile.TemporaryFile()
        try:
            # The stream must go through the ``file`` keyword: the first
            # positional parameter of Wand's Image is ``image``, which
            # expects another image object.
            with Image(file=fp) as document:
                with Image(document.sequence[0]) as first_page:
                    with first_page.convert(format='png') as converted:
                        converted.save(file=tempfile_)
        except Exception:
            tempfile_.close()
            raise
        finally:
            # The source stream is no longer needed once rendered.
            fp.close()
        # Rewind so that readers start at the beginning of the PNG.
        tempfile_.seek(0)
        return tempfile_

    return fp
예제 #23
0
def pdfocr(location):
    """OCR every page of a PDF, printing progress, and return the texts.

    Uses the first available pyocr tool and its first language; the PDF
    is rendered at 300 DPI.

    :param location: Path of the PDF to read.
    :returns: List with the recognised text of each page.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[0]

    source = Image(filename=location, resolution=300)
    rendered = source.convert('jpeg')

    page_blobs = [
        Image(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]

    page_texts = []
    for ct, blob in enumerate(page_blobs, start=1):
        page_texts.append(
            tool.image_to_string(PI.open(io.BytesIO(blob)),
                                 lang=lang,
                                 builder=pyocr.builders.TextBuilder()))
        print("%3d / %3d" % (ct, len(page_blobs)))

    # Release the per-page frames of the rendered document.
    for frame in rendered.sequence:
        frame.destroy()
    return page_texts
예제 #24
0
def extract_infos_from_pdf(pdf_url):
    """Download a PDF, OCR its first page and extract contact details.

    Uses the module-level ``TOOL`` and ``LANG`` OCR settings.

    :param pdf_url: URL of the PDF to download.
    :returns: Dict with keys ``email``, ``tel`` (digits prefixed with
        ``55``), ``full_name`` (first word after ``Nome:``) and
        ``insurer`` (word after ``Tipo:``).
    :raises TypeError, AttributeError, IndexError: When an expected field
        is not found in the recognised text.
    """
    # Only the first page's text is ever inspected, so only that page is
    # OCR'd. The response and all Wand images are closed once the JPEG
    # bytes exist.
    with urlopen(pdf_url) as response:
        with Image(file=response, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                with Image(image=image_jpeg.sequence[0]) as first_page:
                    blob = first_page.make_blob('jpeg')

    text = TOOL.image_to_string(PI.open(io.BytesIO(blob)),
                                lang=LANG,
                                builder=pyocr.builders.TextBuilder())

    # Raw strings keep the same patterns without invalid-escape warnings.
    email = re.search(r"[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+", text)
    raw_tel = re.search(
        r"\([0-9]{2}\) (?:9[0-9]{1}|[1-5]{1})[0-9]{3}-[0-9]{4}", text)[0]
    # ``tel`` consists of digits only, so no second digit pass is needed.
    tel = '55' + ''.join(re.findall(r'\d+', raw_tel))
    full_name = re.findall(r'Nome: (.+)', text)[0].split()[0]
    insurer = re.findall(r'Tipo: ([^\W\d_]+)', text)[0]
    return {
        'email': email.group(),
        'tel': tel,
        'full_name': full_name,
        'insurer': insurer
    }
예제 #25
0
def image_opener(uuid):
    """ Find a file based on its UUID.

    :param uuid: a UUID in the form bucket:filename
    :returns: a file path or handle to the file or its preview image
    :rtype: string or handle
    """
    # Drop the "version" that comes after the second ":" - we use this version
    # only as key in redis cache
    bucket, _file = uuid.split(':')[:2]

    ret = ObjectVersion.get(bucket, _file).file.uri
    # Open the Image
    opened_image = file_opener_xrootd(ret, 'rb')
    if '.' in _file:
        # Compare case-insensitively so "REPORT.PDF" is previewed too.
        ext = _file.split('.')[-1].lower()
        if ext in ['txt', 'pdf']:
            tempfile_ = tempfile.TemporaryFile()
            try:
                # The stream must go through the ``file`` keyword: the
                # first positional parameter of Wand's Image is ``image``,
                # which expects another image object.
                with Image(file=opened_image) as img:
                    # Get the first page from text and pdf files
                    with Image(img.sequence[0]) as first_page:
                        with first_page.convert(format='png') as converted:
                            converted.save(file=tempfile_)
            except Exception:
                tempfile_.close()
                raise
            finally:
                # The source stream is no longer needed once rendered.
                opened_image.close()
            # Rewind so that readers start at the beginning of the PNG.
            tempfile_.seek(0)
            return tempfile_
    # Return an open file to IIIF
    return opened_image
예제 #26
0
def pdf2ocr(pdffile):
    """
    Optical Character Recognition on PDF files using Python
    see https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/
    :param pdffile: pdffile to be OCR'd
    :return: list with the recognised text of each page
    """
    from wand.image import Image
    from PIL import Image as PI
    import pyocr
    import pyocr.builders
    import io

    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[0]  # [0] for english
    req_image = []
    final_text = []
    # Single-argument ``print(...)`` behaves the same on Python 2 and 3;
    # the former print statements were Python-2-only syntax.
    print("Reading {0}".format(pdffile))
    with Image(filename=pdffile, resolution=300) as image_pdf:
        with image_pdf.convert("jpeg") as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    print("appending image")
                    req_image.append(img_page.make_blob("jpeg"))
    print("Generating text")
    for img in req_image:
        txt = tool.image_to_string(PI.open(io.BytesIO(img)), lang=lang,
                                   builder=pyocr.builders.TextBuilder())
        final_text.append(txt)
    return final_text
예제 #27
0
def pdf2text(pdf_filename):
    """OCR every page of a PDF and return the text per page.

    Uses the first available pyocr tool and the second language in that
    tool's language list; the PDF is rendered at 300 DPI.

    :param pdf_filename: Path of the PDF to read.
    :returns: List with the recognised text of each page.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[1]

    source = Image(filename=pdf_filename, resolution=300)
    rendered = source.convert('jpeg')

    page_blobs = [
        Image(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]

    return [
        tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        for blob in page_blobs
    ]
예제 #28
0
    def pdf_image_to_text(self,
                          filename,
                          lang,
                          encoding='utf-8',
                          resolution=300):
        """OCR an image-based PDF and return its text.

        :param filename: Path of the PDF; encoded with ``encoding`` before
            being handed to Wand.
        :param lang: Language forwarded to ``self.ocr_image``.
        :param encoding: Encoding applied to ``filename``.
        :param resolution: Rendering resolution in DPI.
        :returns: Concatenated non-empty page texts, or ``""`` if anything
            failed (the failure is logged).
        """
        import logging

        super().reset(filename)
        document = ""
        try:
            filename = filename.encode(encoding)
            texts = []
            with Image(filename=filename, resolution=resolution) as image_pdf:
                with image_pdf.convert('jpeg') as image_jpeg:
                    for img in image_jpeg.sequence:
                        with Image(image=img) as img_page:
                            buff = img_page.make_blob('jpeg')
                        txt = self.ocr_image(PI.open(io.BytesIO(buff)),
                                             lang=lang)
                        if txt:
                            texts.append(txt)
            document = ''.join(texts)
        except Exception:
            # Still best-effort (callers get ""), but the failure is no
            # longer swallowed silently.
            logging.getLogger(__name__).exception(
                "Could not extract text from %r", filename)

        return document
예제 #29
0
파일: main.py 프로젝트: hraban/pyocr-docker
def main(argv):
    """OCR the PDF named on the command line and print the text as JSON.

    :param argv: Argument vector; ``argv[1]`` is the input PDF path.

    Exits with status 1 when the module-level ``LANG`` is not available
    in the OCR tool or when no input file is given.
    """
    tool = pyocr.get_available_tools()[0]

    # ``os.exit`` does not exist, so these branches used to fail with an
    # AttributeError; ``sys.exit`` is the intended call.
    if LANG not in tool.get_available_languages():
        print("ocr language '%s' not available" % (LANG, ), file=sys.stderr)
        sys.exit(1)
    if len(argv) <= 1:
        print("Usage: ./main.py <INPUT_PDF>", file=sys.stderr)
        sys.exit(1)
    infile = argv[1]

    req_image = []
    final_text = []
    # Render every page to PNG bytes, releasing the Wand images afterwards.
    with Image(filename=infile) as image_pdf:
        with image_pdf.convert('png') as image_png:
            for img in image_png.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('png'))
    for img in req_image:
        txt = tool.image_to_string(PI.open(io.BytesIO(img)),
                                   lang=LANG,
                                   builder=pyocr.builders.TextBuilder())
        final_text.append(txt)
    # is this really a good idea?
    print(json.dumps(final_text))
예제 #30
0
def pdf_text_extract(file, resolution=300, tool=0, lang=0):
    '''Convert a pdf file to per-page jpegs and OCR each of them.

    :param file: Path of the pdf file.
    :param resolution: Rendering resolution.
    :param tool: Index into the list of available pyocr tools.
    :param lang: Index into the chosen tool's list of languages.
    :returns: List with the extracted text of each page.
    '''
    engine = pyocr.get_available_tools()[tool]
    # The position of a given language may differ between engines.
    lang = engine.get_available_languages()[lang]

    # Open the pdf and convert it to jpeg.
    source = Image(filename=file, resolution=resolution)
    rendered = source.convert('jpeg')

    # One jpeg blob per page.
    page_blobs = [
        Image(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]

    return [
        engine.image_to_string(PI.open(io.BytesIO(blob)),
                               lang=lang,
                               builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
    def mosaic_thumbnail(self):
        """ Use wand to create an effect like that in the imagemagick
        command line montage command. Read through each page in the pdf,
        resize it, and pass it with the background image and its position
        to ``self.composite_gray``.

        The pdf is read from ``self.blob`` when set, otherwise from
        ``self.filename``.

        :returns: The composed mosaic as png bytes.
        """

        back_filename = "%s/../resources/mosaic_background.png" \
                        % self.dir_name

        with Image(filename=back_filename) as back_img:
            # From: # http://stackoverflow.com/questions/18821145/\
            # wand-convert-pdf-to-jpeg-and-storing-pages-in-file-like-objects
            # Choose the source first, so no throwaway image is built
            # from a ``None`` blob.
            if self.blob is None:
                image_pdf = Image(filename=self.filename)
            else:
                image_pdf = Image(blob=self.blob)

            with image_pdf:
                with image_pdf.convert("png") as image_png:
                    for shift, page_img in enumerate(image_png.sequence):
                        page_img.resize(80, 103)
                        self.composite_gray(back_img, page_img, shift)

            back_img.format = "png"
            return back_img.make_blob()
예제 #32
0
    def get_thumbnail_from_pdf(self, file):
        """Create a thumbnail image from the first page of a PDF.

        :param file: Open file object of the PDF; must have a ``name``.
        :returns: The temp file holding the thumbnail, opened with
            ``'rb'``, or ``None`` when the conversion result is falsy or
            any exception occurs (the exception is printed).
        """
        try:
            # NOTE(review): ``filename`` is assigned but not used below.
            filename = file.name
            img = None

            # Convert PDF files
            imgs_pdf = Image(file=file)
            imgs = imgs_pdf.convert('jpeg')

            if imgs:
                # Work on a copy of the first page; the white background
                # with the alpha channel removed presumably flattens
                # transparency - TODO confirm.
                img = Image(image=imgs.sequence[0])
                img.background_color = Color('white')
                img.alpha_channel = 'remove'

                # resized and save the converted file
                img.transform(crop='', resize=THUMBNAIL_SIZE)
                img.thumbnail()

                # delete=False keeps the temp file on disk after closing;
                # nothing in this method removes it.
                temp = NamedTemporaryFile(delete=False)
                temp.flush()
                # presumably a Django ``File`` wrapper; verify the import
                temp0 = File(temp)

                with temp0.open('wb') as f:
                    img.save(file=f)

                return temp0.open('rb')

        except Exception as e:
            # Best-effort: report the problem and fall through to None.
            print(repr(e))

        return None
예제 #33
0
def new_blank_png(width, height, color=None):
    """Create a blank PNG-format image of the given size.

    :param width: Image width.
    :param height: Image height.
    :param color: Optional background; used only when truthy.
    :returns: A PNG-format copy of the blank image.
    """
    options = {'width': width, 'height': height}
    if color:
        options['background'] = color
    blank = Image(**options)

    return blank.convert('png')
예제 #34
0
def extract_text(input_pdf):
    """OCR a PDF and write the text to a timestamped file.

    Uses the first available pyocr tool and its first language; the PDF
    is rendered at 300 DPI. The page texts are written, in order, to a
    file in ``Ouput_dir`` whose name contains the current time.

    :param input_pdf: Path of the PDF to read.
    :returns: ``input_pdf`` unchanged.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[0]

    source = Image(filename=input_pdf, resolution=300)
    rendered = source.convert('jpeg')

    page_blobs = [
        Image(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]
    page_texts = [
        tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        for blob in page_blobs
    ]

    out_path = Ouput_dir + 'output{0:%H_%M_%S.%f}.txt'.format(datetime.now())
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines(page_texts)
    return input_pdf
예제 #35
0
def Parse_PDF(FileName):
    """Extract the text of a PDF, falling back to OCR.

    The embedded text is extracted first. If the extracted output has
    exactly one character, the pages are rendered at 300 DPI and OCR'd
    with the first available pyocr tool instead.

    :param FileName: Path of the PDF file.
    :returns: The extracted text, or ``None`` if parsing failed (an error
        line is printed in that case).
    """
    try:
        # The ``with`` block closes the file even when parsing fails.
        with open(FileName, 'rb') as fp:
            # Create parser object to parse the pdf content
            parser = PDFParser(fp)
            # No password for the pdf file
            password = ""
            document = PDFDocument(parser, password)
            # check out password protection
            if not document.is_extractable:
                print("File Under Password Protection" + str(FileName))
                raise PDFTextExtractionNotAllowed(
                    "File Under Password Protection")
            # Resource manager stores shared resources (fonts, images)
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            # The device translates interpreted content into text and
            # writes it to ``retstr``.
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password=password,
                                          caching=True):
                interpreter.process_page(page)
            device.close()
            output = retstr.getvalue()
            retstr.close()

        # If content is extracted successfully, return the content,
        # otherwise go to pyOCR.
        # NOTE(review): only an output of exactly one character triggers
        # the OCR fallback - confirm this covers multi-page scans.
        if len(output) != 1:
            return output

        # The tools are returned in the recommended order of usage
        tool = pyocr.get_available_tools()[0]
        req_image = []
        with Image(filename=FileName, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        req_image.append(img_page.make_blob('jpeg'))
        return "".join(
            tool.image_to_string(PI.open(io.BytesIO(img)),
                                 builder=pyocr.builders.TextBuilder())
            for img in req_image
        )
    except Exception:
        # Report the path that was passed in; the old handler referenced
        # the undefined name ``file`` and could raise a NameError itself.
        print("ERROR - Could not parse file" + str(FileName))
예제 #36
0
파일: imwand.py 프로젝트: Scrik/sk1-wx
def process_pattern(raw_content):
	"""Convert raw pattern image data to TIFF.

	Returns a ``(stream, flag)`` pair: ``stream`` is a ``StringIO`` holding the
	TIFF data, rewound to its start; ``flag`` is True when the source image
	type is 'bilevel', 'grayscale' or 'grayscalematte'.
	"""
	source = Image(file=StringIO(raw_content))
	tiff = source.convert('tiff')
	# The type is read from the source image, not from the TIFF copy.
	is_gray = source.type in ['bilevel', 'grayscale', 'grayscalematte']
	out = StringIO()
	tiff.save(file=out)
	out.seek(0)
	return out, is_gray
예제 #37
0
    def upload_from_web(self, request, pk=None):
        """Create an Upload for this chatroom from the files posted with *request*.

        The requesting user is looked up as a ChatroomMember; when none exists,
        a past CourseGroupMember and a past ChatroomMember are created first.
        A PDF is converted to JPEG and uploaded page by page (at most 25
        pages); any other file is uploaded as-is. Afterwards an upload activity
        is created, a notification is sent and a summary is posted to Slack.

        NOTE(review): once a PDF has been handled the loop over request.FILES
        stops, so files listed after a PDF are not uploaded - confirm intended.

        :param request: request carrying 'name', 'is_anonymous', 'tag_id' and
            the uploaded files.
        :param pk: not used by this method.
        :returns: ``Response(200)``.
        """
        from wand.image import Image
        from wand.color import Color
        from wand import exceptions as wand_exceptions
        from apps.group.models import CourseGroup, CourseGroupMember

        room = self.get_object()
        try:
            member = ChatroomMember.objects.get(chatroom=room, user=request.user)
        except ChatroomMember.DoesNotExist:
            # Unknown user: register them as a past member of both the course
            # group and the chatroom.
            group = CourseGroup.objects.get(chatroom=room)
            CourseGroupMember.objects.create(course_group=group, student=request.user.student, is_past=True)
            member = ChatroomMember.objects.create(user=request.user, chatroom=room, is_past=True)

        name = request.data.get('name')
        is_anonymous = int(request.data.get('is_anonymous', False))
        tag = Tag.objects.get(pk=int(request.POST.get('tag_id')))

        new_upload = Upload.objects.create(chatroom_member=member, chatroom=room, name=name, tag=tag, is_anonymous=is_anonymous)
        all_urls = ""

        for field in request.FILES:
            uploaded = request.data.get(field)
            if uploaded.content_type != "application/pdf":
                url = new_upload.upload_file(uploaded)
                all_urls = all_urls + url + "\n"
                continue

            pdf = Image(file=uploaded, resolution=250, background=Color("white"))
            pages = pdf.convert('jpeg')
            for page_number, page in enumerate(pages.sequence, 1):
                page_image = Image(image=page, resolution=250)
                scratch = tempfile.TemporaryFile()
                page_image.alpha_channel = False
                page_image.save(file=scratch)
                url = new_upload.upload_file(scratch)
                scratch.close()
                all_urls = all_urls + url + "\n"
                # Hard cap on the number of pages taken from one PDF.
                if page_number >= 25:
                    break
            break

        activity_type = ChatroomActivityType.objects.get_activity_type(ChatroomActivityTypeManager.UPLOAD)
        activity = ChatroomActivity.objects.create(chatroom=room, chatroom_activity_type=activity_type, activity_id=new_upload.pk)
        new_upload.send_created_notification(activity, request, True)

        # post to slack TODO add detail
        message = request.user.email + " uploaded files to " + room.name + ":\n[" + str(new_upload.id) + "] " + all_urls
        slack_utils.send_simple_slack_message(message)

        return Response(200)
예제 #38
0
def make_anim():
    """Assemble the ``*.png`` files of the current directory into a GIF animation.

    The first path returned by ``glob`` becomes the base image and every other
    one is appended as a further frame. The result is cropped to a fixed
    region, an interactive IPython shell is opened, and finally ``save`` is
    called with the animation.

    NOTE(review): ``paths[0]`` raises IndexError when no PNG is present, and
    the frame order is whatever ``glob`` returns.
    """
    paths = glob('*.png')
    image = Image(filename=paths[0])
    anim = image.convert('gif')
    # Append every remaining file as one more GIF frame.
    for p in paths[1:]: anim.sequence.append(
                Image(filename=p).convert('gif'))
    
    # lined up manually for once off capture
    anim.crop(left=80, top=1030, width=585, height=140)
    # Opens an interactive shell that can see the locals above; execution
    # continues with save() once the shell is left.
    import IPython
    IPython.embed()
    save(anim)
예제 #39
0
def preview(pdfpath, outpath, croph):
  """
  Save a cropped screenshot of the first page of a pdf on a white background.

  @arg pdfpath input pdf path
  @arg outpath output screenshot file path
  @arg croph factor the page height is multiplied by to get the crop height
  @return True after the first page was saved, False when an exception was
          caught (it is printed); nothing is returned for a pdf without pages
  """
  try:
    with Image(filename=pdfpath) as pages:
      for page in pages.sequence:
        width = page.width
        height = int(croph * page.height)
        page.crop(0, 0, width=width, height=height)

        canvas = Image(width=width, height=height, background=Color("white"))
        canvas.composite(page, 0, 0)
        canvas.convert("png")
        canvas.save(filename=outpath)
        # Only the first page is ever processed.
        return True
  except Exception as err:
    print(err)
    return False
예제 #40
0
파일: imwand.py 프로젝트: Scrik/sk1-wx
def process_image(raw_content):
	"""Split raw image data into a base image stream and an optional alpha stream.

	When the source has an alpha channel, a PNG copy typed 'truecolormatte' is
	written to the alpha stream and the alpha channel is switched off on the
	source. The base image is saved as TIFF for 'colorseparation' type or
	'lab' colorspace, otherwise as PNG.

	Returns ``(base, alpha)``: rewound ``StringIO`` streams; ``alpha`` is None
	when the source has no alpha channel.
	"""
	source = Image(file=StringIO(raw_content))
	alpha_stream = None

	if source.alpha_channel:
		with_alpha = source.convert('png')
		with_alpha.type = 'truecolormatte'
		source.alpha_channel = False
		alpha_stream = StringIO()
		with_alpha.save(file=alpha_stream)
		alpha_stream.seek(0)

	needs_tiff = source.type == 'colorseparation' or source.colorspace == 'lab'
	target_format = 'tiff' if needs_tiff else 'png'
	converted = source.convert(target_format)

	base_stream = StringIO()
	converted.save(file=base_stream)
	base_stream.seek(0)
	return base_stream, alpha_stream
예제 #41
0
def screenshots(pdfpath, h=200):
  """
  Save one PNG screenshot per pdf page into IMGROOT.

  Each page is composited onto a white canvas and resized to height h, with
  the width scaled from the page's width/height ratio. Files are named
  "<pdf basename>_<two digit page index>.png".

  @arg pdfpath input pdf path
  @arg h height of every screenshot
  @return list of dicts with keys imgpath and imgidx, one per page
  """
  out_dir = IMGROOT
  prefix = os.path.splitext(os.path.basename(pdfpath))[0]
  print("screenshots for %s" % prefix)
  results = []

  with Image(filename=pdfpath) as pages:
    for page_no, page in enumerate(pages.sequence):
      scaled_w = int(h * page.width / page.height)
      canvas = Image(width=page.width, height=page.height, background=Color("white"))
      canvas.composite(page, 0, 0)
      canvas.convert('png')
      canvas.resize(scaled_w, h, filter='lagrange', blur=.35)

      target = os.path.join(out_dir, "%s_%02d.png" % (prefix, page_no))
      canvas.save(filename=target)

      results.append({'imgpath': target, 'imgidx': page_no})
  return results
예제 #42
0
def upload_avatar():
	"""Store the posted 'userFile' as the current user's avatar.

	The upload is saved under UPLOADED_AVATARS_DEST, named by the MD5 hex
	digest of the user's e-mail, then resized to 320x240, converted to PNG and
	handed to ``current_user.load_avatar``.

	Returns a redirect to '/profile' after a POST that carries 'userFile'
	(also when the file cannot be read as an image), otherwise a redirect
	to '/'.
	"""
	#TODO: process situation when request.files don't have needed file
	if request.method != 'POST' or 'userFile' not in request.files:
		#TODO: redirect to previous url
		return redirect('/')

	from wand.exceptions import WandException

	hasher = md5()
	hasher.update(current_user.email.encode('utf-8'))
	filename = path.join(UPLOADED_AVATARS_DEST, hasher.hexdigest())
	request.files['userFile'].save(filename)
	print('new avatar for user ' + current_user.first_name + current_user.second_name + ' was saved as ' + filename)

	# The Image constructor never returns None; an unreadable file raises
	# instead, so that is what has to be handled here.
	try:
		img = Image(filename=filename)
	except WandException:
		print("Can't parse blob from post message as image!")
		return redirect('/profile')

	img.resize(320, 240)
	img = img.convert('png')
	current_user.load_avatar(img)
	return redirect('/profile')
예제 #43
0
def image_opener(key):
    """Handler to locate file based on key.

    Uses ``g.obj`` when it is set, otherwise resolves the object through
    ``protect_api(key)``. For PDF and plain-text objects (and only when
    ImageMagick is available) the first page is converted to PNG and returned
    as a temporary file instead of the stored file.

    :param key: A key encoded in the format "<bucket>:<version>:<object_key>".
    :returns: A file-like object.
    """
    obj = g.obj if hasattr(g, 'obj') else protect_api(key)
    fp = obj.file.storage().open('rb')

    render_first_page = HAS_IMAGEMAGICK and obj.mimetype in ['application/pdf', 'text/plain']
    if not render_first_page:
        return fp

    first_page = Image(Image(fp).sequence[0])
    png_file = tempfile.TemporaryFile()
    with first_page.convert(format='png') as converted:
        converted.save(file=png_file)
    return png_file
예제 #44
0
def image_to_text(image):
    """Run OCR over every page of the image file at *image*.

    The first available pyocr tool is used together with the third language
    it reports. The file is opened with Wand at 300 DPI and each page is
    handed to the OCR tool as a JPEG blob.

    :param image: path of the file to read.
    :returns: list with one recognised text per page; an empty list when the
        file cannot be opened.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[2]
    final_text = []
    try:
        image_obj = Image(filename=image, resolution=300)
    except Exception:
        return final_text
    # Compare by value: an identity test against a string literal is
    # effectively always true, which forced a conversion even for JPEG input.
    if not image.endswith('jpeg'):
        image_obj = image_obj.convert('jpeg')
    page_blobs = [Image(image=page).make_blob('jpeg') for page in image_obj.sequence]
    for blob in page_blobs:
        txt = tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        final_text.append(txt)
    return final_text
예제 #45
0
파일: parser.py 프로젝트: ticalcster/efos
    def __init__(self, page=None):
        """Render *page* to PNG, open it as a PIL image and call get_barcodes().

        :param page: PDF page object; it is added to a fresh PdfFileWriter.
        """
        self._barcodes = None
        self._efos_barcode = None
        self.page = page

        # Write the single page out as its own PDF, held in memory.
        pdf_buffer = StringIO.StringIO()
        writer = PdfFileWriter()
        writer.addPage(self.page)
        writer.write(pdf_buffer)

        # Read that PDF with Wand and re-encode it as PNG, again in memory.
        png_buffer = StringIO.StringIO()
        WandImage(blob=pdf_buffer.getvalue()).convert('png').save(file=png_buffer)

        self.pil_image = PILImage.open(png_buffer)
        self.get_barcodes()
예제 #46
0
class ImportMedia(object):
    """
    Import new photos: create JPEG, XMP and thumbnail files, add to database
    """

    PROXY_FULLSIZE = 0
    PROXY_THUMBNAIL = 1
    PROXY_WEBSIZED = 2

    # outrageously slow, exec()s ufraw-batch in newer versions
    def _create_proxy_wand(self, source, dest, mode):
        # load image if not yet done
        if not self.image:
            try:
                self.image = Image(filename=source)
            except Exception as e:
                logging.error('cannot read {}: {}'.format(source, e.args[0]))
                raise e

        # copy bitmap, we may need it again
        image = self.image.convert('jpeg')

        # resize
        if mode == self.PROXY_FULLSIZE:
            pass
        elif mode == self.PROXY_THUMBNAIL:
            image.transform(resize=settings.THUMBNAILSIZE)
        elif mode == self.PROXY_WEBSIZED:
            image.transform(resize=settings.WEBSIZE)

        try:
            # Wand does auto-rotate, but doesn't fix EXIF data... m(
            image.strip()
            image.save(filename=dest)
        except (Exception, IOError) as e:
            logging.error('Error: cannot write {}: {}'.format(dest, e.args[0]))
            raise e

    def _create_proxy_rawpy(self, source, dest, mode):
        # maybe Pillow supports this file type directly?
        if not self.image:
            try:
                self.image = Image.open(source)
            except IOError:
                pass
            except Exception as e:
                logging.error('cannot read {}: {}'.format(source, e.args[0]))
                raise e

        # obviously not, try decoding as Raw
        if not self.image:
            try:
                raw = rawpy.imread(source)
                rgb = raw.postprocess(use_camera_wb=True, no_auto_bright=True)
                self.image = Image.fromarray(rgb)
            except Exception as e:
                logging.error('cannot read {}: {}'.format(source, e.args[0]))
                raise e

        image = self.image.copy()
        if mode == self.PROXY_FULLSIZE:
            pass
        elif mode == self.PROXY_THUMBNAIL:
            image.thumbnail(settings.THUMBNAILSIZE)
        elif mode == self.PROXY_WEBSIZED:
            image.thumbnail(settings.WEBSIZE)

        try:
            image.save(dest)
        except Exception as e:
            logging.error('cannot write {}: {}'.format(dest, e.args[0]))

    def create_proxy(self, source, dest, mode):
        logging.info('called for {}, {}, {}'.format(source, dest, mode))
        if os.path.isfile(dest) and not self.force:
            return

        # just link fullsize image if JPEG and orientation is normal
        is_jpeg = os.path.splitext(source)[1].lower() in ('.jpg', '.jpeg')
        orientation = self.exif.get_orientation()
        logging.debug('EXIF Orientation: {}'.format(orientation.value_nick))

        if mode == self.PROXY_FULLSIZE and is_jpeg and orientation == orientation.NORMAL:
            try:
                os.unlink(dest)
            except Exception as e:
                pass

            try:
                os.link(source, dest)
            except Exception as e:
                logging.error('cannot link {} to {}: {}'.format(source, dest, e.args[0]))
                raise e

            return

        if settings.IMAGE_LIB == 'wand':
            self._create_proxy_wand(source, dest, mode)
        elif settings.IMAGE_LIB == 'rawpy':
            self._create_proxy_rawpy(source, dest, mode)
        else:
            logging.error('configuration error: unknown or missing IMAGE_LIB {}'.format(settings.IMAGE_LIB))

    def create_video_proxy(self, source, thumbnail, websized):
        logging.info('called for {}, {}, {}'.format(source, thumbnail, websized))
        if os.path.isfile(thumbnail) and os.path.isfile(websized) and not self.force:
            return


        for keyframe in 10, 1:
            ffmpeg_filter="[0:v] select='eq(pict_type,I)*lte(t,30)' [iframes];" \
              "[iframes] select='gte(n,{})',split=2 [sel1][sel2];" \
              "[sel1] scale=128:-1[tns]; [tns][1:v] overlay=main_w-overlay_w-5:main_h-overlay_h-5 [tn];" \
              "[sel2] scale=768:-1[prxs]; [prxs][2:v]overlay=main_w-overlay_w-10:main_h-overlay_h-10 [proxy]".format(keyframe)

            call([settings.FFMPEG_COMMAND, '-i', source,
              '-i', settings.THUMBNAIL_VIDEO_PLAY_BUTTON,
              '-i', settings.PREVIEW_VIDEO_PLAY_BUTTON,
              '-filter_complex', ffmpeg_filter,
              '-y', '-v', 'quiet',
              '-map', '[tn]',
              '-frames:v', '1',
              thumbnail,
              '-map', '[proxy]',
              '-frames:v', '1',
              websized],
             shell=False)

            if os.path.isfile(thumbnail) and os.path.isfile(websized):
                return


    def write_xmp_sidecar(self, sourcefile):
        destxmppath = os.path.splitext(sourcefile)[0] + '.xmp'
        try:
            # how brain-damaged is this?!
            fd = open(destxmppath, mode='w+')
            fd.write('<?xml version="1.0" encoding="UTF-8"?><x:xmpmeta xmlns:x="adobe:ns:meta/"></x:xmpmeta>')
            fd.close()
            self.exif.save_file(destxmppath)
        except Exception as e:
            logging.error('cannot write {}: {}'.format(destxmppath, e))

        return destxmppath

    def get_timestamp(self, source_file, use_exif=True):
        """Return the time of *source_file* as a time struct.

        With *use_exif* the EXIF tag 'Exif.Photo.DateTimeOriginal' is used,
        falling back to 'Exif.Image.DateTime'. When neither yields a value (or
        EXIF is not used) the file's ctime is taken as GMT; a ctime before the
        year 2000 is replaced by the current time.

        :param source_file: path of the file, used for the ctime fallback.
        :param use_exif: whether to consult ``self.exif`` first.
        :returns: a time struct as produced by ``strptime`` / ``gmtime``.
        """
        exif_value = None
        if use_exif:
            exif_value = (self.exif.get_tag_string('Exif.Photo.DateTimeOriginal')
                          or self.exif.get_tag_string('Exif.Image.DateTime'))

        if exif_value:
            return tz.time.strptime(exif_value, '%Y:%m:%d %H:%M:%S')

        # No usable EXIF value: fall back to the timestamp of the file itself.
        ctime = int(os.stat(source_file).st_ctime)
        file_time = tz.time.gmtime(ctime)
        if file_time.tm_year < 2000:
            # camera clock not set, eh?
            file_time = tz.time.gmtime()
        return file_time

    def create_filename(self, source_file, timestruct):
        """Return an export filename that no MediaFile uses yet.

        The first candidate is '<YYYYMMDD>-<lower-cased basename>'. While a
        candidate is taken, '<YYYYMMDD>-<6 digit counter>-<basename>' is tried
        with the counter running from 1.

        :param source_file: path whose basename goes into the filename.
        :param timestruct: time struct the date prefix is formatted from.
        :returns: the unused filename, or None (after logging an error) when
            no free name was found.
        """
        datestring = tz.time.strftime("%Y%m%d", timestruct)
        mediabasename = os.path.basename(source_file).lower()
        filename = datestring + "-" + mediabasename
        count = 1
        while count < 1000000:
            # Ask the database whether the name is taken instead of treating
            # every exception from get() as "free": a name held by several
            # rows counts as taken, and database errors are no longer hidden.
            if not MediaFile.objects.filter(filename=filename).exists():
                return filename
            filename = '{}-{:06d}-{}'.format(datestring, count, mediabasename)
            count += 1

        logging.error('Cannot create unique export filename')
        return None

    def update_image_parameters(self, entry):
        try:
            entry.f_number = Fraction(self.exif.get_tag_string('Exif.Photo.FNumber')) * 1.0
        except Exception:
            entry.f_number = 0

        try:
            entry.exposure_time = Fraction(self.exif.get_tag_string('Exif.Photo.ExposureTime')) * 1.0
        except Exception:
            entry.exposure_time = 0

        try:
            entry.gain_value = float(self.exif.get_tag_string('Exif.Photo.ISOSpeedRatings'))
        except Exception:
            try:
                entry.gain_value = float(self.exif.get_tag_string('Exif.Photo.GainControl'))
            except Exception:
                entry.gain_value = 0

        try:
            entry.focal_length = Fraction(self.exif.get_tag_string('Exif.Photo.FocalLength')) * 1.0
        except Exception as e:
            entry.focal_length = 0

    def update_db(self, source_file, sidecar=None, is_supported_media=True):
        """Return the MediaFile for *source_file*, building it when it is missing.

        An existing entry is returned untouched unless ``self.force`` is set.
        A missing entry is built (not saved) with mime type, catalog, export
        filename and date, plus rating, label and sidecar for supported media.
        For supported media the image parameters are refreshed on the entry
        before it is returned.

        :param source_file: path of the media file.
        :param sidecar: sidecar file name stored on a newly built entry.
        :param is_supported_media: whether EXIF/XMP data should be used.
        :returns: the entry, or None when no export filename could be created.
        """
        directory = self.mediadir
        basename = os.path.basename(source_file)

        try:
            entry = MediaFile.objects.get(media_dir=directory, media_file=basename)
        except MediaFile.DoesNotExist:
            entry = None

        if entry is not None:
            if not self.force:
                return entry
        else:
            timestamp = self.get_timestamp(source_file, use_exif=is_supported_media)
            filename = self.create_filename(source_file, timestamp)
            if not filename:
                return None

            catalog, _ = Catalog.objects.get_or_create(name=settings.DEFAULT_CATALOG)
            mime_type, _ = MimeType.objects.get_or_create(type=self.mimetype)

            try:
                entry = MediaFile(media_dir=directory, media_file=basename)
            except Exception as err:
                logging.error('Cannot get object: {}'.format(err.args[0]))
                raise err

            entry.mime_type = mime_type
            entry.catalog = catalog
            entry.filename = filename
            entry.date = tz.time.strftime('%Y-%m-%dT%H:%M:%SZ', timestamp)

            if is_supported_media:
                # Rating and label are only copied when the metadata has them.
                rating = self.exif.get_tag_long('Xmp.xmp.Rating')
                if rating:
                    entry.rating = rating
                label = self.exif.get_tag_string('Xmp.xmp.Label')
                if label:
                    entry.label = label

                entry.sidecar_file = sidecar

        if is_supported_media:
            self.update_image_parameters(entry)
        return entry

    def import_image(self, source_file):
        """Import one image: write its three proxies, its sidecar and its DB entry.

        Metadata is read from an XMP sidecar next to the source when one
        exists ('.xmp', '.XMP' or '.Xmp'), otherwise from the source itself.
        Full-size, thumbnail and preview proxies are written below
        ``settings.WEB_DIR``; when a later proxy fails, the proxies written so
        far are removed and the error is re-raised. If there was no usable
        sidecar, one is written next to the source. Progress is reported
        through ``self.status``.

        :param source_file: path of the image below ``settings.SOURCE_DIR``.
        :raises Exception: any error from proxy creation or the database.
        """
        self.image = None
        # prefer XMP "sidecar" files to save us from parsing huge RAW files more than necessary
        source_xmp_file = os.path.splitext(source_file)[0]
        havesourcesidecar = False
        # Every candidate needs its leading dot to match a real file name.
        for ext in (".xmp", ".XMP", ".Xmp"):
            if os.path.isfile(source_xmp_file + ext):
                source_xmp_file += ext
                try:
                    havesourcesidecar = self.exif.open_path(source_xmp_file)  # returns True on success
                except Exception:
                    pass

                break

        # No sidecar, oh well...
        if not havesourcesidecar:
            self.exif.open_path(source_file)

        (mediareldir, jpegfilename) = os.path.split(os.path.relpath(source_file, settings.SOURCE_DIR))
        jpegfilename = os.path.splitext(jpegfilename)[0] + ".jpg"

        self.status.update(10, 'Writing Proxy')
        mediadir = settings.WEB_DIR + mediareldir
        jpegfullpath = mediadir + '/' + jpegfilename
        tools.mkdir(mediadir)
        self.create_proxy(source_file, jpegfullpath, self.PROXY_FULLSIZE)

        self.status.update(75, 'Writing Thumbnail')
        try:
            tndir = mediadir + "/" + settings.THUMBNAIL_DIR
            tnfullpath = tndir + '/' + jpegfilename
            tools.mkdir(tndir)
            self.create_proxy(source_file, tnfullpath, self.PROXY_THUMBNAIL)
        except Exception:
            # Do not leave a full-size proxy behind without its thumbnail.
            os.unlink(jpegfullpath)
            raise

        self.status.update(85, 'Writing Preview')
        try:
            webimgdir = mediadir + "/" + settings.PREVIEW_DIR
            webimgfullpath = webimgdir + '/' + jpegfilename
            tools.mkdir(webimgdir)
            self.create_proxy(source_file, webimgfullpath, self.PROXY_WEBSIZED)
        except Exception:
            os.unlink(jpegfullpath)
            os.unlink(tnfullpath)
            raise

        # we need to write the XMP sidecar file here as we definitely do not want to parse
        # RAW files from the web app and exiv2 isn't capable to construct its content without
        # a source file. A sidecar file is required for import of the rating and label tags
        # with lightroom, though.
        #
        # ATTN: We'll write it to SOURCE_DIR. The web app should never touch the proxy dirs
        # by itself

        if not havesourcesidecar:
            source_xmp_file = self.write_xmp_sidecar(source_file)

        entry = self.update_db(source_file, sidecar=os.path.basename(source_xmp_file))
        self.status.update(95, 'Writing Database')
        entry.save()
        self.status.update(100, 'Done')

    def import_video(self, source_file):
        """Import one video: link it, create still proxies, sidecar and DB entry.

        Metadata is read from a thumbnail file next to the source when one
        exists ('.thm', '.THM' or '.Thm'). The source is linked below
        ``settings.WEB_DIR``, thumbnail and preview stills are produced with
        ffmpeg, and an XMP sidecar is written when no thumbnail file could be
        opened. Progress is reported through ``self.status``.

        :param source_file: path of the video below ``settings.SOURCE_DIR``.
        :raises NotImplementedError: when ``settings.FFMPEG_COMMAND`` is unset.
        :raises Exception: any error from linking, directory creation or the
            database.
        """
        if not settings.FFMPEG_COMMAND:
            raise NotImplementedError

        source_thm_file = os.path.splitext(source_file)[0]
        havesourcesidecar = False
        # Every candidate needs its leading dot to match a real file name.
        for ext in (".thm", ".THM", ".Thm"):
            if os.path.isfile(source_thm_file + ext):
                source_thm_file += ext
                try:
                    havesourcesidecar = self.exif.open_path(source_thm_file)  # returns True on success
                except Exception:
                    pass

                break

        (mediareldir, jpegfilename) = os.path.split(os.path.relpath(source_file, settings.SOURCE_DIR))
        mediadir = settings.WEB_DIR + mediareldir
        (basename, extension) = os.path.splitext(jpegfilename)
        jpegfilename = basename + ".jpg"
        mediafilename = basename + extension.lower()

        self.status.update(10, 'Linking Source File')
        mediafullpath = mediadir + '/' + mediafilename
        tools.mkdir(mediadir)
        tools.link(source_file, mediafullpath)

        self.status.update(50, 'Writing Thumbnail and Proxy')
        try:
            tndir = mediadir + "/" + settings.THUMBNAIL_DIR
            tnfullpath = tndir + '/' + jpegfilename
            tools.mkdir(tndir)
        except Exception:
            os.unlink(mediafullpath)
            raise

        try:
            webimgdir = mediadir + "/" + settings.PREVIEW_DIR
            webimgfullpath = webimgdir + '/' + jpegfilename
            tools.mkdir(webimgdir)
        except Exception:
            os.unlink(mediafullpath)
            # The thumbnail has not been written by this call; only remove one
            # that is actually there so the original error is not masked.
            if os.path.isfile(tnfullpath):
                os.unlink(tnfullpath)
            raise

        self.create_video_proxy(source_file, tnfullpath, webimgfullpath)

        self.status.update(90, 'Writing Sidecar')
        if not havesourcesidecar:
            source_sidecar = self.write_xmp_sidecar(source_file)
            self.exif.open_path(source_sidecar)
            sidecar = os.path.basename(source_sidecar)
        else:
            sidecar = os.path.basename(source_thm_file)

        entry = self.update_db(source_file, sidecar=sidecar)
        self.status.update(95, 'Writing Database')
        entry.save()
        self.status.update(100, 'Done')

    def import_other(self, source_file):
        entry = self.update_db(source_file, is_supported_media=False)
        if entry.mime_type.copy:
            self.status.update(50, 'Writing Database')
            entry.save()
        self.status.update(100, 'Done')

    def do_import(self, source_file):
        """Import *source_file* according to its detected mime type.

        A StatusWriter is opened first. The file must lie below
        ``settings.SOURCE_DIR`` and be a regular file, otherwise an error
        status is set and nothing is imported. Images (except '.thm') go to
        ``import_image`` and videos to ``import_video``; when such an import
        raises, the error is logged and processing falls through to the
        remaining checks. '.xmp' files are only marked done; everything else
        goes to ``import_other``.

        :param source_file: path of the file to import.
        """
        self.status = StatusWriter(statusname=settings.PROCESS_STATUS, filename=source_file, text='Start')
        source_file = os.path.abspath(source_file)

        if not source_file.startswith(settings.SOURCE_DIR):
            logging.critical('{} is not below directory {}'.format(source_file, settings.SOURCE_DIR))
            self.status.error('Broken Configuration')
            return

        if not os.path.isfile(source_file):
            logging.critical('{} does not exist or is not a file'.format(source_file))
            self.status.error('Not a File')
            return

        directories = MediaDir.objects
        if self.lock:
            self.mediadir = directories.compare_and_lock(self.mediadir, source_file, prefix=settings.SOURCE_DIR,
                                                         name=self.name)
        else:
            self.mediadir = directories.get_or_create_by_full_path(source_file, prefix=settings.SOURCE_DIR)

        self.status.update(5, 'Examining File Type')
        self.mimetype = magic.from_file(filename=source_file, mime=True)
        extension = os.path.splitext(source_file)[1].lower()

        # image, but skip video thumbnail (handled by do_import_video and exporter)
        if self.mimetype.startswith('image/') and extension != '.thm':
            try:
                self.import_image(source_file)
            except Exception:
                logging.warning('Importer: ', exc_info=True)
                self.status.update(0, 'Script Error')
            else:
                return

        # video, yay.
        if self.mimetype.startswith('video/'):
            try:
                self.import_video(source_file)
            except Exception:
                logging.warning('Importer: ', exc_info=True)
                self.status.update(0, 'Script Error')
            else:
                return

        # sidecar gets handled explicitly by exporter
        if extension == '.xmp':
            self.status.update(100, 'Done')
            return

        self.import_other(source_file)

    def close(self):
        if self.status:
            self.status.close()
            self.status = None
        if self.mediadir and self.lock:
            self.mediadir.unlock()
            self.mediadir = None

    def __del__(self):
        """Release status writer and directory lock via close() on destruction."""
        self.close()

    def __init__(self, force=False, lock=True, name=None):
        self.exif = GExiv2.Metadata()
        self.force = force
        self.lock = lock
        self.name = name
        self.mediadir = None
        self.image = None
        self.status = None
예제 #47
0
	source.seek(0)
		
	instances = -1
	offsets = []
	
	while True:
		instances = contents.find(b'icon', instances+1)
		offsets.append(instances)
		
		if instances == -1:
			break
				
	source.seek(0)
	
	for i in range(0, len(offsets)-1):
		source.seek(offsets[i]+4)
		copi = ((int.from_bytes(source.read(4),byteorder='little'))) # Number of bytes to copy
		
		f_oname = oname + "_" + str(i) +".png"
		
		data = source.read(copi)	# Copy x bytes to this variable
		
		img = Image(format='ico',blob=data)
		
		with img.convert('png') as converted:
			converted.save(filename=f_oname)
			
		print("Converted frame %d in address %s to %s" % (i,hex(offsets[i]),f_oname))
	

	source.close()
예제 #48
0
파일: classify.py 프로젝트: qiaojingy/dm341
	#for col in enumerate(range(read_sheet.ncols)):
	#print "pos2"
	record_id = read_sheet.cell_value(i,0)
	record_url = read_sheet.cell_value(i,2)
	type_contain = [False,False,False,False]
	type_meaning = ['NAB','Invoice','Contracts','Request']
	try:
		#print record_url
		req = requests.get(record_url)
		if req.status_code != 200:
			write_sheet.write(i,0,record_id)
			write_sheet.write(i,1,record_url)
			write_sheet.write(i,2,"CANNOT DOWNLOAD FILE")
			break
		image_pdf = Image(blob=req.content,resolution=450)
		image_bmp = image_pdf.convert('bmp')
		#currently we only detect first page for efficiency
		for img in image_bmp.sequence:
			write_sheet.write(i,0,record_id)
			write_sheet.write(i,1,record_url)
			img_page = Image(image=img)
			img_page =img_page.make_blob('bmp')
			tmp_img = PI.open(io.BytesIO(img_page))
			tmp_img = tmp_img.convert('L')
			tmp_img = tmp_img.point(lambda x: x>190 and 255)
			tmp_img = tmp_img.filter(ImageFilter.GaussianBlur(radius = 1.5))
			txt = tool.image_to_string(tmp_img,lang="eng",builder=pyocr.builders.TextBuilder())
			val = check_format(txt)
			print "round 1"
			if val <4:
				#print type_meaning[val]
    else:
        print "Processing image: "+f
        #Create image objects for small and medium using original large
        li = Image(filename=join(orig_path,f))
        mi = li.clone()
        si = mi.clone()

        print 'Original: '+str(mi.width)+'x'+str(mi.height)

        #Resize to med and small maintaining aspect ratio
        mi.transform(resize=str(med_size)+'x'+str(med_size)+'>')
        print 'Medium: '+str(mi.width)+'x'+str(mi.height)
        si.transform(resize=str(small_size)+'x'+str(small_size)+'>')
        print 'Small: '+str(si.width)+'x'+str(si.height)

        #Convert to JPEG if necessary and save as new file
        lf = join(large_path,f)
        if li.format != 'JPEG':
            li = li.convert('jpeg')
        li.save(filename=lf[:-3]+'jpg')

        mf = join(med_path,f)
        if mi.format != 'JPEG':
            mi = mi.convert('jpeg')
        mi.save(filename=mf[:-3]+'jpg')

        sf = join(small_path,f)
        if si.format != 'JPEG':            
            si = si.convert('jpeg')
        si.save(filename=sf[:-3]+'jpg')
        
예제 #50
0
from PIL import Image as PI
import pyocr as pyocr1
import pyocr.builders
import io

# OCR every page of a PDF and write the recognised text to test.txt.
path = "C:\\upw\\chin\\test.pdf"
tool = pyocr1.get_available_tools()[0]
lang = tool.get_available_languages()[0]

req_image = []
final_text = []

# Read the PDF at 300 DPI and keep each page as a JPEG blob.
image_pdf = Image(filename=path, resolution=300)
image_jpeg = image_pdf.convert('jpeg')

for img in image_jpeg.sequence:
    img_page = Image(image=img)
    req_image.append(img_page.make_blob('jpeg'))

# Recognise the text of every page; results are kept as UTF-8 bytes.
for img in req_image:
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    final_text.append(txt.encode("utf-8"))

# Open the output only once there is something to write, and let the context
# manager flush and close it.
with open("test.txt", "wb") as outfile:
    outfile.writelines(final_text)
print(final_text)
예제 #51
0
def update_patient(patient, form, files):
    """Update a patient record with information from submitted form.

    For every many-to-one section the patient's child collection is grown to
    the number of form rows, and rows without data are deleted from both the
    patient and the form. Newly uploaded document images are read (PDFs are
    first rendered to JPEG) and stored together with a large and a small
    thumbnail; existing document rows keep their stored file data. Finally
    ``form.populate_obj(patient)`` copies everything onto the patient.

    :param patient: the patient object to update.
    :param form: the submitted form.
    :param files: not used by this function.
    """
    for field_name, class_name in [
        ('income_sources', IncomeSource),
        ('phone_numbers', PhoneNumber),
        ('addresses', Address),
        ('emergency_contacts', EmergencyContact),
        ('household_members', HouseholdMember),
        ('employers', Employer),
        ('document_images', DocumentImage)
    ]:
        if form[field_name]:
            # If the last row in a many-to-one section doesn't have any data, don't save it
            remove_blank_rows_helper(form[field_name])

            # Add a new child object for each new item in a many-to-one section
            new_row_count = len(form[field_name].entries) - getattr(patient, field_name).count()
            if new_row_count > 0:
                for _ in range(new_row_count):
                    getattr(patient, field_name).append(class_name())

            # When a user clicks the delete icon on a many-to-one row, it clears
            # all the data in that row. If any existing rows have no data, delete
            # them from patient object and then from the form.
            for row in form[field_name]:
                has_data = any(
                    val != ''
                    and val is not None
                    and key != 'id'
                    and not (key in ['state', 'employee'])
                    for key, val in row.data.iteritems()
                )
                if not has_data:
                    # Entry names end in "-<index>"; take the whole number so
                    # rows 10 and up are addressed correctly as well.
                    row_index = int(row.name.rsplit('-', 1)[-1])
                    # Delete from patient object
                    db.session.delete(getattr(patient, field_name)[row_index])
                    # Deletion from form FieldList requires popping all entries
                    # after the one to be removed, then readding them
                    to_re_add = []
                    for _ in range(len(form[field_name].entries) - row_index):
                        to_re_add.append(form[field_name].pop_entry())
                    to_re_add.pop()
                    for kept in reversed(to_re_add):
                        form[field_name].append_entry(data=kept.data)

    # Get binary data and create resized versions of any new document images
    for index, entry in enumerate(form.document_images):
        if entry.file_name.data and entry.file_name.data.filename:
            # This is a new file
            if entry.file_name.data.content_type == 'application/pdf':
                # PIL can't handle PDFs, so use Wand; make_blob('jpeg') does
                # the conversion, no separate convert() call is needed.
                pdf = WandImage(file=entry.file_name.data.stream, resolution=500)
                entry.file_name.data.stream = io.BytesIO(pdf.make_blob('jpeg'))

            large_image = Image.open(entry.file_name.data.stream)
            small_image = large_image.copy()

            large_image_output, small_image_output = io.BytesIO(), io.BytesIO()
            large_image.thumbnail(
                current_app.config['LARGE_DOCUMENT_IMAGE_SIZE'],
                Image.ANTIALIAS
            )
            large_image.save(large_image_output, format='JPEG')
            small_image.thumbnail(
                current_app.config['SMALL_DOCUMENT_IMAGE_SIZE'],
                Image.ANTIALIAS
            )
            small_image.save(small_image_output, format='JPEG')

            entry.data_full.data = entry.file_name.data.stream.getvalue()
            entry.data_large.data = large_image_output.getvalue()
            entry.data_small.data = small_image_output.getvalue()
            entry.file_name.data = entry.file_name.data.filename
        else:
            # This is an existing entry, so the file can't change, only the description
            # Fill in the fields that aren't inputs from the saved data so
            # that populate_obj doesn't overwrite them.
            entry.file_name.data = patient.document_images[index].file_name
            entry.data_full.data = patient.document_images[index].data_full
            entry.data_large.data = patient.document_images[index].data_large
            entry.data_small.data = patient.document_images[index].data_small

    # Populate the patient object with all the updated info
    form.populate_obj(patient)
예제 #52
0
파일: test_pdf.py 프로젝트: qiaojingy/dm341
#refer to https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/

from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io
import codecs
from PIL import ImageFilter
# Use the first OCR tool pyocr reports as available.
tool = pyocr.get_available_tools()[0]

# One BMP blob per PDF page, filled by the first loop below.
req_image = []
#final_text = []
final_text = "NOT NAB"
# Read the PDF at 450 DPI.
image_pdf = Image(filename="/Users/xiaoshi/Dropbox/Stanford/cs341/final/source/49157Political File2014Non-Candidate Issue Adsnational education assocSCAN-14100711320 (14127116222163)_.pdf", resolution=450)
# NOTE: despite its name this holds the BMP conversion, not a PNG.
image_png = image_pdf.convert('bmp')

# Turn every page into a stand-alone BMP blob.
for img in image_png.sequence:
	img_page = Image(image=img)
	#img_page.type='grayscale'
	#img_page = img_page.point(lambda x:0 if x<143 else 255)
	req_image.append(img_page.make_blob('bmp'))
	#break

count = 0
# Pre-process every page with PIL: greyscale, threshold, then blur.
for img in req_image:
	tmp_img = PI.open(io.BytesIO(img))
	# 'L' is PIL's 8-bit greyscale mode.
	tmp_img = tmp_img.convert('L')
	#tmp_img = tmp_img.point(lambda x: x>190 and 255)
	# Threshold: values above 190 become 255, everything else 0.
	tmp_img = tmp_img.point(lambda x: x>190 and 255)
	tmp_img = tmp_img.filter(ImageFilter.GaussianBlur(radius = 1.5))
예제 #53
0
def new_blank_png(new_width, new_height):
    """Return a new image of the given width and height, converted to PNG."""
    blank = Image(width=new_width, height=new_height)
    as_png = blank.convert('png')
    return as_png