def convert_filetype(filename_ori, filename_res, filetype):
    """Convert an image file to another format and save the result.

    :param filename_ori: Path of the image to read.
    :param filename_res: Path the converted image is written to.
    :param filetype: Target format name passed to ``Image.convert``
        (for example ``'png'``).
    """
    # Both the source image and its converted copy hold native ImageMagick
    # resources, so release each of them deterministically.
    with Image(filename=filename_ori) as original:
        with original.convert(filetype) as converted:
            converted.save(filename=filename_res)
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: A new ``wand.image.Image`` whose format is PNG.
    """
    check_dependencies(__optional_dependencies__[PDF])
    # Import libraries within this function so as to avoid
    # import-time dependence
    import PyPDF2
    # TODO: When we start using this again, document which
    # system-level libraries are required.
    from wand.image import Image

    # Write the single requested page into an in-memory PDF.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # ``convert`` does not modify the image in place; it returns a new
    # image.  Return that one and release the intermediate PDF raster.
    with Image(file=pdf_bytes, resolution=resolution) as img:
        return img.convert("png")
def getannots(pdfannots, pageno, fh):
    """Build ``Annotation`` objects for one page, rasterising Ink/Square areas.

    For every annotation whose subtype is listed in ``ANNOT_SUBTYPES`` an
    ``Annotation`` is appended to the result.  For ``Ink`` and ``Square``
    annotations the page is additionally cropped to the annotation rectangle
    and saved as ``./images/<index>.png``.

    The module-level counter ``index`` is used as the annotation id and the
    image file name, and is incremented once per emitted annotation.

    :param pdfannots: Iterable of annotation dictionaries for the page.
    :param pageno: Zero-based page number inside the PDF read from ``fh``.
    :param fh: Open binary file handle of the PDF.
    :returns: List of ``Annotation`` objects.
    """
    global index

    def getcolour(colour):
        # Map the annotation's 'C' colour triple onto a highlighter name.
        if (colour == [1.0, 0.90196, 0.0]):
            return "yellow"
        elif (colour == [0.26667, 0.78431, 0.96078]):
            return "blue"
        elif (colour == [0.92549, 0.0, 0.54902]):
            return "pink"
        elif (colour == [0.90196, 0.10588, 0.10588]):
            return "red"
        else:
            return "none"

    annots = []
    input1 = PdfFileReader(fh)
    targetPage = input1.getPage(pageno)
    newpath = "./images/"

    for pa in pdfannots:
        subtype = pa.get('Subtype')
        # An annotation without a subtype cannot be classified; skipping it
        # avoids the AttributeError that ``subtype.name`` would raise below.
        if subtype is None or subtype.name not in ANNOT_SUBTYPES:
            continue
        print(subtype)

        if subtype.name in ("Ink", "Square"):
            print("yes")
            print(type(pa.get('Rect')))
            coord = pa.get('Rect')
            # Shrink every page box to the annotation rectangle.
            targetPage.cropBox.lowerLeft = (coord[0], coord[1])
            targetPage.trimBox.lowerLeft = (coord[0], coord[1])
            targetPage.mediaBox.lowerLeft = (coord[0], coord[1])
            targetPage.cropBox.upperRight = (coord[2], coord[3])
            targetPage.trimBox.upperRight = (coord[2], coord[3])
            targetPage.mediaBox.upperRight = (coord[2], coord[3])

            # Use a fresh writer per annotation: a shared writer would gain
            # one more page each time and produce multi-page renderings.
            output = PdfFileWriter()
            output.addPage(targetPage)
            pdf_bytes = io.BytesIO()
            output.write(pdf_bytes)
            pdf_bytes.seek(0)

            if not os.path.exists(newpath):
                os.makedirs(newpath)
            # ``convert`` returns a new image, so save that converted copy.
            with Image(file=pdf_bytes, resolution=300) as img:
                with img.convert("png") as png:
                    png.save(filename=newpath + str(index) + ".png")

        colour = pa.get('C')
        contents = pa.get('Contents')
        if contents is not None:
            contents = str(contents, 'iso8859-15')  # 'utf-8'
            # Normalise line endings to '\n'.
            contents = contents.replace('\r\n', '\n').replace('\r', '\n')

        a = Annotation(index, pageno, subtype.name.lower(),
                       pa.get('QuadPoints'), pa.get('Rect'), contents,
                       getcolour(colour))
        annots.append(a)
        index += 1

    return annots
def pdf_page_to_jpg(src_pdf, pagenum=0, resolution=72):
    """Return one page of a PDF as a JPEG ``WandImage``.

    :param src_pdf: PyPDF2 reader the page is taken from.
    :param int pagenum: Zero-based page number to render.
    :param int resolution: Rendering resolution in DPI.
    :returns: A new ``WandImage`` whose format is JPEG.
    """
    # Copy the requested page into a single-page, in-memory PDF.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # ``convert`` returns a new image instead of changing ``img`` in place,
    # so its result is what has to be returned.
    with WandImage(file=pdf_bytes, resolution=resolution) as img:
        return img.convert("jpg")
def process_page(self, f, page_number):
    """Apply the scan-like effects to one page and return it as a stream.

    The page is read from ``f`` (which is closed afterwards), optionally
    signed, despeckled, sharpened, gamma-corrected and rotated by a small
    random angle.

    :param f: Open binary file object holding the page.
    :param page_number: Page number, checked against
        ``self.pages_with_signatures``.
    :returns: ``io.BytesIO`` with the processed page in the image's own
        format.
    """
    try:
        page = Image(file=f, resolution=130)
    finally:
        # The stream is fully consumed by the constructor; close it even if
        # reading fails.
        f.close()

    try:
        if self.signatures and page_number in self.pages_with_signatures:
            page = self.add_signature(page)
        # page.noise("multiplicative_gaussian", attenuate=-0.15)
        page.despeckle()
        page.sharpen(radius=8, sigma=4)
        page.gamma(1.3)
        page.rotate(randint(-100, 100) / 200,
                    background=Color('rgb(255, 255, 255)'))
        # NOTE(review): the previous ``page.convert('RGB')`` call was dropped.
        # ``convert`` returns a new image and its result was never used, so
        # it only allocated a throw-away copy; the blob below is unchanged.
        blob = page.make_blob()
    finally:
        page.close()
    return io.BytesIO(blob)
def convert_pdf_to_jpg(tablename, nrows):
    """Render ``figures/<tablename>_OUTPUT.pdf`` to a cropped JPEG.

    The PDF is rasterised at 500 DPI, saved as
    ``figures/<tablename>_OUTPUT.jpeg``, cropped to the table area and the
    source PDF is then deleted.

    :param tablename: Prefix of the PDF/JPEG file names.
    :param nrows: Number of table rows; determines the crop height.
    """
    # A bare ``import PIL`` does not load the ``PIL.Image`` submodule, so
    # import the submodule explicitly.
    from PIL import Image as PILImage

    print("Converting")
    # Build the paths portably instead of hard-coding '\\' separators.
    figures_dir = os.path.join(os.getcwd(), 'figures')
    filename = os.path.join(figures_dir, tablename + '_OUTPUT.pdf')
    imgname = os.path.join(figures_dir, tablename + '_OUTPUT.jpeg')

    with Image(filename=filename, resolution=500) as pdf_img:
        with pdf_img.convert('jpeg') as jpeg_img:
            jpeg_img.save(filename=imgname)

    img = PILImage.open(imgname)  # open the rendered image
    box = (360, 490, 3733, 510 + 95 * nrows)  # crop box grows with nrows
    clip = img.crop(box)
    clip.save(imgname)
    os.remove(filename)
def ocr(filename):
    """Run line-level OCR on a PDF or image file.

    Uses the ``tool`` and ``lang`` names from the enclosing scope.

    :param filename: Path of a ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png`` file.
    :returns: Tuple ``(lines, X, Y)`` where ``lines`` are the recognised line
        boxes that pass the size/content filters and ``X``, ``Y`` are the
        image width and height.
    :raises ValueError: If the file extension is not supported.
    """
    _, ext = os.path.splitext(filename)
    ext = ext.lower()

    if ext == '.pdf':
        with Image(filename=filename, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                blob = image_jpeg.make_blob()
        img = PImage.open(io.BytesIO(blob))
    elif ext in ('.jpg', '.jpeg', '.png'):
        img = PImage.open(filename)
    else:
        # Previously ``img`` stayed unbound here and the function failed
        # later with an unrelated-looking UnboundLocalError.
        raise ValueError("unsupported file type for OCR: %r" % (ext,))

    word_boxes = tool.image_to_string(
        img,
        lang=lang,
        builder=pyocr.builders.LineBoxBuilder(tesseract_layout=1)
    )

    X, Y = img.size
    res = []
    for line in word_boxes:
        (left, top), (right, bottom) = line.position[0], line.position[1]
        # Drop boxes that are too narrow ...
        if right - left < 0.01 * X:
            continue
        # ... or too flat.
        # NOTE(review): this threshold is relative to the width X, not the
        # height Y - confirm that is intended.
        if bottom - top < 0.003 * X:
            continue
        if line.content.strip() == '':
            continue
        res.append(line)
    return res, X, Y
def extract_text_from_image(path):
    """OCR every page of the document at ``path`` and return the joined text.

    The first available pyocr tool is used together with the third entry of
    its language list.

    :param path: Path of the document to rasterise (200 DPI).
    :returns: The recognised text of all pages concatenated without separator.
    """
    ocr_tool = pyocr.get_available_tools()[0]
    ocr_lang = ocr_tool.get_available_languages()[2]

    as_jpeg = Image(filename=path, resolution=200).convert('jpeg')

    # First render every page to a JPEG blob, then OCR the blobs.
    page_blobs = [Image(image=frame).make_blob('jpeg')
                  for frame in as_jpeg.sequence]
    page_texts = [
        ocr_tool.image_to_string(Pi.open(io.BytesIO(blob)),
                                 lang=ocr_lang,
                                 builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
    return ''.join(page_texts)
def image_opener(key):
    """Handler to locate file based on key.

    .. note::
       If the file is a PDF then only the first page will be
       returned as an image.

    :param key: A key encoded in the format "<bucket>:<version>:<object_key>".
    :returns: A file-like object positioned at its start.
    """
    if hasattr(g, 'obj'):
        obj = g.obj
    else:
        obj = protect_api(key)

    fp = obj.file.storage().open('rb')

    # If ImageMagick with Wand is installed, extract first page
    # for PDF/text.
    if HAS_IMAGEMAGICK and obj.mimetype in ['application/pdf', 'text/plain']:
        tempfile_ = tempfile.TemporaryFile()
        try:
            # Pass the stream as ``file=``: the first positional parameter
            # of ``wand.image.Image`` is ``image`` (an existing image).
            with Image(file=fp) as document:
                with Image(image=document.sequence[0]) as first_page:
                    with first_page.convert(format='png') as converted:
                        converted.save(file=tempfile_)
        except Exception:
            tempfile_.close()
            raise
        finally:
            # The original stream is no longer needed once rendered.
            fp.close()
        # Rewind so the caller reads the PNG from its first byte.
        tempfile_.seek(0)
        return tempfile_

    return fp
def pdf_to_img(infile, newname):
    """Convert a PDF file into JPEG image file(s).

    The PDF is rasterised at 300 DPI and converted to JPEG; the pages are
    collected with ``join_img_list`` and written out by ``img_to_save``.

    :param infile: Path and name of the PDF file.
    :param newname: Name passed through to ``img_to_save``.
    """
    rendered = Image(filename=infile, resolution=300).convert('jpg')
    img_to_save(join_img_list(rendered), infile, newname)
def main():
    """OCR the bundled sample PDF and print the text of each page.

    Exits with status 1 when pyocr reports no OCR tool.  The PDF is also
    saved as JPEG under ``./pdf2img/``.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        sys.exit(1)

    tool = tools[0]
    print("Will use tool '%s'" % (tool.get_name()))

    langs = tool.get_available_languages()
    print("Available languages: %s" % ", ".join(langs))
    lang = langs[0]
    print("Will use lang '%s'" % (lang))

    rendered = Image(filename="./pdf_file/stackoverflow.pdf",
                     resolution=400).convert('jpeg')
    rendered.save(filename='./pdf2img/stackoverflow.jpeg')

    # Render all pages to JPEG blobs first, then OCR each blob.
    page_blobs = [Image(image=frame).make_blob('jpeg')
                  for frame in rendered.sequence]
    final_text = [
        tool.image_to_string(PI.open(io.BytesIO(blob)),
                             lang=lang,
                             builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
    print(final_text)
def get_text(filepath):
    '''
    Uses OCR to get the text from a pdf.

    Returns a list where each element is the text of a single page of
    that pdf.

    ex:
    final_text = get_text('/Users/matthewwong/dsi-capstone/PDFs/decrypted/
    A1/Certifications/Certificates Flood Cert.pdf')
    '''
    ocr_tool = pyocr.get_available_tools()[0]
    ocr_lang = ocr_tool.get_available_languages()[0]

    pages = Image(filename=filepath, resolution=300).convert('jpeg')

    # Render every page to a JPEG blob before starting OCR.
    blobs = [Image(image=frame).make_blob('jpeg') for frame in pages.sequence]

    return [
        ocr_tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=ocr_lang,
            builder=pyocr.builders.TextBuilder()
        )
        for blob in blobs
    ]
def getPageAsPng(self, pageNumber, width=640):
    """Return the given page as a png wand.image.Image.

    The page is scaled to ``width`` pixels wide, keeping its aspect ratio.
    """
    # Sequence indexes are zero-based while page numbers start from 1.
    page = Image(image=self.pdf.sequence[pageNumber - 1])
    scaled_height = int(width * page.height / float(page.width))
    page.resize(width, scaled_height)
    return page.convert('png')
def convert(trimmed):
    """OCR each PDF in ``trimmed`` and write the text next to it as ``.txt``.

    Files that Wand cannot open are reported and skipped.

    :param trimmed: Sequence of PDF file paths.
    """
    import os  # local import: only needed for the output-name fix below

    print('Converting...')
    tool = pyocr.get_available_tools()[0]  # tesseract
    # chi_tra; I installed all languages
    lang = tool.get_available_languages()[17]
    total = len(trimmed)

    for n, filepath in enumerate(trimmed):
        clean_temp_files()
        print('done cleaning.')
        print('\r' + str(n) + '/' + str(total) + ' ', end='')

        try:
            image_pdf = Image(filename=filepath, resolution=300)
        except Exception as e:
            print('\rException: \n' + repr(e))
            continue

        # Swap only the extension.  ``str.replace('.pdf', '.txt')`` also
        # rewrote '.pdf' inside directory names and, for names without a
        # lowercase '.pdf', made the output path equal to the input file.
        txt_name = os.path.splitext(filepath)[0] + '.txt'

        with image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                req_image = []
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        req_image.append(img_page.make_blob('jpeg'))

        final_text = ''.join(
            tool.image_to_string(PI.open(io.BytesIO(blob)),
                                 lang=lang,
                                 builder=pyocr.builders.TextBuilder())
            for blob in req_image
        )

        with open(txt_name, 'w', encoding='utf-8') as f:
            print(txt_name)
            f.write(final_text)
def single_ocr2txt__(self, src_path, ocr_name, target_path):
    """OCR one PDF with pytesseract and append the text to a ``.txt`` file.

    :param src_path: Path of the PDF to read.
    :param ocr_name: Output file name; ``.pdf`` in it is replaced by ``.txt``.
    :param target_path: Directory the text file is written into.
    """
    # Split the PDF into one JPEG blob per page.
    req_image = []
    with Image(filename=src_path, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpeg'))

    # Extract the text of every page image.
    final_text = [
        pytesseract.image_to_string(PI.open(io.BytesIO(blob)), lang='eng')
        for blob in req_image
    ]

    # The file is created inside ``target_path`` itself, so that directory
    # (not merely its parent) has to exist.
    if target_path and not os.path.exists(target_path):
        os.makedirs(target_path)

    ocr_name = ocr_name.replace('.pdf', '.txt')
    file_path = os.path.join(target_path, ocr_name)
    # Store the text (append mode keeps earlier content).
    with open(file_path, 'a', encoding='utf-8') as f:
        for text in final_text:
            f.write(text)
def pdf_run(self, image_file_name, filename, path):
    """Render every page of a PDF to a levelled grayscale PNG.

    Pages are written as ``<path>/saram_<filename><page>.png`` with page
    numbers starting at 1; per-page and total timings are printed.

    :param image_file_name: Path of the PDF to read.
    :param filename: Name fragment used in the output file names.
    :param path: Output directory.
    """
    image_pdf = Image(filename=image_file_name, resolution=300)
    image_page = image_pdf.convert("png")

    process_start = time.time()
    for page, img in enumerate(image_page.sequence, start=1):
        # Start the page timer before the work; it used to be started after
        # saving, so the reported page time was always about zero.
        page_start = time.time()
        with Image(image=img) as img_per_page:
            img_per_page.type = 'grayscale'
            img_per_page.depth = 8
            img_per_page.density = 300
            try:
                img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                   channel=None)
            except AttributeError as e:
                print("Update Wand library: %s" % e)

            img_buf = path + '/' + "saram_" + filename + str(page) + ".png"
            # NOTE(review): this makes the output directory world-writable
            # on every page; kept because callers may rely on it.
            os.chmod(path, 0o777)
            img_per_page.save(filename=img_buf)

            page_elaboration = time.time() - page_start
            print("page %s - size %s - process %2d sec."
                  % (page, img_per_page.size, page_elaboration))
        img.destroy()

    process_end = time.time() - process_start
    print("Total elaboration time: %s" % process_end)
def process(self, pdf_filename, pdf_resolution, imageformat, do_orientation):
    """OCR a PDF page by page and return the text, one page per line block.

    Each page is turned into a levelled 8-bit grayscale image, saved to
    ``buffer.png`` and passed to ``self.image2txt_pyocr``.

    :param pdf_filename: Path of the PDF.
    :param pdf_resolution: Rendering resolution, also set as page density.
    :param imageformat: Format used for conversion and for the page blobs.
    :param do_orientation: Passed through to ``self.image2txt_pyocr``.
    :returns: Text of all pages, each followed by a newline.
    """
    source_pdf = Image(filename=pdf_filename, resolution=pdf_resolution)
    converted = source_pdf.convert(imageformat)

    chunks = []
    started = time.time()
    for page_no, frame in enumerate(converted.sequence, start=1):
        page_img = Image(image=frame)
        page_img.type = 'grayscale'
        page_img.depth = 8
        page_img.density = pdf_resolution
        try:
            page_img.level(black=0.3, white=1.0, gamma=1.5, channel=None)
        except AttributeError as e:
            print("Update Wand library: %s" % e)
        page_img.save(filename="buffer.png")

        # Only the OCR call itself is timed.
        page_started = time.time()
        txt = self.image2txt_pyocr(page_img.make_blob(imageformat),
                                   do_orientation)
        page_seconds = time.time() - page_started
        print("page %s - size %s - process %2d sec. - text %s"
              % (page_no, page_img.size, page_seconds, len(txt)))

        chunks.append("%s\n" % txt)
        frame.destroy()

    total_seconds = time.time() - started
    print("Total elaboration time: %s" % total_seconds)
    return "".join(chunks)
def pdf_to_text(filename):
    """OCR a whole PDF with the ``tesseract`` command and print the text.

    All pages are stacked vertically into a single PNG which is handed to
    tesseract.  On any failure ``ERROR`` is printed instead.

    :param filename: Path of the PDF to read.
    """
    pdf = IM(filename=filename, resolution=300)
    pages = len(pdf.sequence)

    # One tall canvas holding every page below the previous one.
    image = IM(width=pdf.width, height=pdf.height * pages)
    for i in range(pages):
        image.composite(
            pdf.sequence[i],
            top=pdf.height * i,
            left=0
        )
    img = image.convert('png')

    with tempfile.NamedTemporaryFile(prefix="tess_") as temp_file:
        img.save(filename=temp_file.name)
        temp = None
        try:
            # Only the unique name is needed (tesseract writes
            # ``<name>.txt``), so close the handle right away.
            temp = tempfile.NamedTemporaryFile(delete=False)
            temp.close()
            process = subprocess.Popen(
                ['tesseract', temp_file.name, temp.name],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            process.communicate()
            with open(temp.name + '.txt', 'r') as handle:
                contents = handle.read()
            print(contents)
        except Exception:
            # Best effort, but do not swallow KeyboardInterrupt/SystemExit.
            print("ERROR")
        finally:
            # Remove the scratch files on failure as well.
            if temp is not None:
                for leftover in (temp.name + '.txt', temp.name):
                    if os.path.exists(leftover):
                        os.remove(leftover)
def extract_pages(filename):
    """Return the last two pages of a PDF rendered at 300 DPI as JPEG.

    :param filename: Path of the PDF.
    :returns: Tuple ``(second_to_last_page, last_page)`` of sequence items.
    """
    rendered = Image(filename=filename, resolution=300).convert('jpeg')
    frames = rendered.sequence
    last_page = frames[-1]
    previous_page = frames[-2]
    return previous_page, last_page
def pdf_to_jpg(PDF_FILE_NAME):
    """Split a PDF into one JPEG per page inside a fresh output directory.

    The directory is named after the PDF (without extension) plus a fixed
    suffix; an existing directory of that name is removed first.  Pages are
    saved as ``<base>_<n>.jpg`` with ``n`` starting at 1.

    :param PDF_FILE_NAME: Path of the PDF file.
    """
    # Strip only the extension.  ``split('.')[0]`` cut the path at its first
    # dot, which breaks for names such as ``./docs/report.v2.pdf``.
    base_name = os.path.splitext(PDF_FILE_NAME)[0]
    JPG_FILE_DIR = base_name + u'_图片目录'

    # Start from an empty image directory: delete an existing one first.
    if os.path.exists(JPG_FILE_DIR):
        shutil.rmtree(JPG_FILE_DIR)
    os.makedirs(JPG_FILE_DIR)

    # Read the PDF; a higher resolution makes opening slower, adjust to need.
    req_image = []
    with Image(filename=PDF_FILE_NAME, resolution=300) as image_pdf:
        with image_pdf.convert('jpg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('jpg'))

    # Write every page blob to its own image file and move it into the
    # output directory.
    for jpg_file_num, img in enumerate(req_image, start=1):
        JPG_FILE_NAME = base_name + '_' + str(jpg_file_num) + '.jpg'
        with open(JPG_FILE_NAME, 'wb') as ff:
            ff.write(img)
        shutil.move(JPG_FILE_NAME, JPG_FILE_DIR)
def dt_pdf_image_to_txt(self, filename, codec='utf-8'):
    """Convert an image-based PDF file to text.

    Every page is rendered to JPEG and passed to ``self.dt_image_to_text``;
    the non-empty page texts are concatenated.

    :param filename: Input file name.
    :param codec: Accepted for interface compatibility; currently unused
        (the file name is always encoded as UTF-8).
    :returns: The extracted text, or ``""`` if anything fails.
    """
    import logging

    document = ""
    try:
        filename = filename.encode('utf-8')
        with Image(filename=filename, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                texts = []
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        blob = img_page.make_blob('jpeg')
                    txt = self.dt_image_to_text(None, blob)
                    if txt:
                        texts.append(txt)
        document = ''.join(texts)
    except Exception:
        # Still best effort (an empty document is returned), but leave a
        # trace instead of hiding the failure completely.
        logging.getLogger(__name__).exception(
            "could not extract text from %r", filename)
    return document
def image_opener(key):
    """Handler to locate file based on key.

    .. note::
       If the file is a PDF then only the first page will be
       returned as an image.

    :param key: A key encoded in the format "<recid>:<filename>".
    :returns: A file-like object positioned at its start, or ``None`` when
        the file lookup raises ``KeyError``.
    """
    key_parts = key.split(":")
    assert len(key_parts) == 2
    recid, filename = key_parts

    service = current_rdm_records.records_service
    try:
        file_item = service.files.get_file_content(g.identity, recid, filename)
    except KeyError:
        return None  # FIXME: throw custom exception `FileNotFound`?

    fp = file_item.get_stream('rb')

    # If ImageMagick with Wand is installed, extract first page
    # for PDF/text.
    pages_mimetypes = ['application/pdf', 'text/plain']
    if HAS_IMAGEMAGICK and file_item.data["mimetype"] in pages_mimetypes:
        tempfile_ = tempfile.TemporaryFile()
        try:
            # Pass the stream as ``file=``: the first positional parameter
            # of ``wand.image.Image`` is ``image`` (an existing image).
            with Image(file=fp) as document:
                with Image(image=document.sequence[0]) as first_page:
                    with first_page.convert(format='png') as converted:
                        converted.save(file=tempfile_)
        except Exception:
            tempfile_.close()
            raise
        finally:
            # The original stream is no longer needed once rendered.
            fp.close()
        # Rewind so the caller reads the PNG from its first byte.
        tempfile_.seek(0)
        return tempfile_

    return fp
def pdfocr(location):
    """OCR a PDF page by page, printing progress, and return the page texts.

    :param location: Path of the PDF (rendered at 300 DPI).
    :returns: List with the recognised text of each page.
    """
    # global tool, lang
    ocr_tool = pyocr.get_available_tools()[0]
    ocr_lang = ocr_tool.get_available_languages()[0]

    rendered = Image(filename=location, resolution=300).convert('jpeg')
    blobs = [Image(image=frame).make_blob('jpeg')
             for frame in rendered.sequence]

    page_texts = []
    for done, blob in enumerate(blobs, start=1):
        page_texts.append(
            ocr_tool.image_to_string(PI.open(io.BytesIO(blob)),
                                     lang=ocr_lang,
                                     builder=pyocr.builders.TextBuilder()))
        print("%3d / %3d" % (done, len(blobs)))

    # Release the per-page resources of the rendered document.
    for frame in rendered.sequence:
        frame.destroy()
    return page_texts
def extract_infos_from_pdf(pdf_url):
    """Download a PDF, OCR its first page and pull contact data out of it.

    Uses the module-level ``TOOL`` and ``LANG`` for OCR.

    :param pdf_url: URL of the PDF.
    :returns: Dict with ``email``, ``tel`` (digits prefixed with ``55``),
        ``full_name`` (first word after ``Nome:``) and ``insurer`` (word
        after ``Tipo:``).
    :raises TypeError, IndexError, AttributeError: When the phone number,
        name/insurer or e-mail respectively is not found in the text.
    """
    response = urlopen(pdf_url)
    try:
        with Image(file=response, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                # Only the first page is ever inspected, so only that page
                # needs to be rendered and OCR'd.
                with Image(image=image_jpeg.sequence[0]) as first_page:
                    blob = first_page.make_blob('jpeg')
    finally:
        response.close()

    text = TOOL.image_to_string(PI.open(io.BytesIO(blob)),
                                lang=LANG,
                                builder=pyocr.builders.TextBuilder())

    # Raw strings: '\(' and '\W' are invalid escapes in normal literals.
    email = re.search(r"[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+", text)
    raw_tel = re.search(
        r"\([0-9]{2}\) (?:9[0-9]{1}|[1-5]{1})[0-9]{3}-[0-9]{4}", text)[0]
    # Keep the digits only and prepend the country code.
    tel = '55' + ''.join(re.findall(r'\d+', raw_tel))
    full_name = re.findall(r'Nome: (.+)', text)[0].split()[0]
    insurer = re.findall(r'Tipo: ([^\W\d_]+)', text)[0]

    return {
        'email': email.group(),
        'tel': tel,
        'full_name': full_name,
        'insurer': insurer
    }
def image_opener(uuid):
    """
    Find a file based on its UUID.

    :param uuid: a UUID in the form bucket:filename
    :returns: a file path or handle to the file or its preview image
    :rtype: string or handle
    """
    # Drop the "version" that comes after the second ":" - we use this version
    # only as key in redis cache
    bucket, _file = uuid.split(':')[:2]

    ret = ObjectVersion.get(bucket, _file).file.uri
    # Open the Image
    opened_image = file_opener_xrootd(ret, 'rb')

    if '.' in _file:
        ext = _file.split('.')[-1]
        if ext in ['txt', 'pdf']:
            tempfile_ = tempfile.TemporaryFile()
            try:
                # Pass the stream as ``file=``: the first positional
                # parameter of ``wand.image.Image`` is ``image``.
                with Image(file=opened_image) as img:
                    # Get the first page from text and pdf files
                    with Image(image=img.sequence[0]) as first_page:
                        with first_page.convert(format='png') as converted:
                            converted.save(file=tempfile_)
            except Exception:
                tempfile_.close()
                raise
            finally:
                # The source handle is not returned on this path.
                opened_image.close()
            # Rewind so the caller reads the PNG from its first byte.
            tempfile_.seek(0)
            return tempfile_

    # Return an open file to IIIF
    return opened_image
def pdf2ocr(pdffile):
    """
    Optical Character Recognition on PDF files using Python

    see https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/

    :param pdffile: pdffile to be OCR'd
    :return: list with the recognised text of each page
    """
    from wand.image import Image
    from PIL import Image as PI
    import pyocr
    import pyocr.builders
    import io

    ocr_tool = pyocr.get_available_tools()[0]
    ocr_lang = ocr_tool.get_available_languages()[0]  # [0] for english

    print("Reading {0}".format(pdffile))
    rendered = Image(filename=pdffile, resolution=300).convert("jpeg")

    page_blobs = []
    for frame in rendered.sequence:
        single_page = Image(image=frame)
        print("appending image")
        page_blobs.append(single_page.make_blob("jpeg"))

    print("Generating text")
    return [
        ocr_tool.image_to_string(PI.open(io.BytesIO(blob)),
                                 lang=ocr_lang,
                                 builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
def pdf2text(pdf_filename):
    """OCR a PDF and return the recognised text of each page.

    Uses the first pyocr tool and the second entry of its language list.

    :param pdf_filename: Path of the PDF (rendered at 300 DPI).
    :returns: List of page texts.
    """
    engine = pyocr.get_available_tools()[0]
    language = engine.get_available_languages()[1]

    as_jpeg = Image(filename=pdf_filename, resolution=300).convert('jpeg')

    # Render every page before running OCR.
    blobs = []
    for frame in as_jpeg.sequence:
        blobs.append(Image(image=frame).make_blob('jpeg'))

    texts = []
    for blob in blobs:
        picture = PI.open(io.BytesIO(blob))
        texts.append(engine.image_to_string(
            picture,
            lang=language,
            builder=pyocr.builders.TextBuilder()
        ))
    return texts
def pdf_image_to_text(self, filename, lang, encoding='utf-8', resolution=300):
    """OCR an image-based PDF and return its text.

    :param filename: Path of the PDF; encoded with ``encoding`` before it is
        handed to Wand.
    :param lang: OCR language passed to ``self.ocr_image``.
    :param encoding: Encoding used for the file name.
    :param resolution: Rendering resolution in DPI.
    :returns: Concatenated non-empty page texts, or ``""`` if anything fails.
    """
    import logging

    super().reset(filename)
    document = ""
    try:
        filename = filename.encode(encoding)
        with Image(filename=filename, resolution=resolution) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                texts = []
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        buff = img_page.make_blob('jpeg')
                    txt = self.ocr_image(PI.open(io.BytesIO(buff)), lang=lang)
                    if txt:
                        texts.append(txt)
        document = ''.join(texts)
    except Exception:
        # Still best effort (an empty document is returned), but leave a
        # trace instead of hiding the failure completely.
        logging.getLogger(__name__).exception(
            "could not extract text from %r", filename)
    return document
def main(argv):
    """OCR the PDF named on the command line and print the pages as JSON.

    :param argv: Argument vector; ``argv[1]`` is the input PDF.

    Exits with status 1 when ``LANG`` is not available to the OCR tool or
    when no input file is given.
    """
    tool = pyocr.get_available_tools()[0]

    # ``os.exit`` does not exist; ``sys.exit`` is the call that terminates
    # the program with a status code.
    if LANG not in tool.get_available_languages():
        print("ocr language '%s' not available" % (LANG, ), file=sys.stderr)
        sys.exit(1)

    if len(argv) <= 1:
        print("Usage: ./main.py <INPUT_PDF>", file=sys.stderr)
        sys.exit(1)

    infile = argv[1]

    req_image = []
    with Image(filename=infile) as image_pdf:
        with image_pdf.convert('png') as image_png:
            for img in image_png.sequence:
                with Image(image=img) as img_page:
                    req_image.append(img_page.make_blob('png'))

    final_text = [
        tool.image_to_string(PI.open(io.BytesIO(blob)),
                             lang=LANG,
                             builder=pyocr.builders.TextBuilder())
        for blob in req_image
    ]

    # is this really a good idea?
    print(json.dumps(final_text))
def pdf_text_extract(file, resolution=300, tool=0, lang=0):
    '''Convert the given pdf file to jpeg with wand.image, then extract the
    text of each page image.

    ``tool`` and ``lang`` are indexes into pyocr's available tools and into
    the chosen tool's available languages.

    Returns list of extracted text.'''
    engine = pyocr.get_available_tools()[tool]  # tesseract engine
    # First language is eng for tesseract; may differ for other engines.
    language = engine.get_available_languages()[lang]

    # Open the pdf and convert it to jpeg.
    rendered = Image(filename=file, resolution=resolution).convert('jpeg')

    # One jpeg blob per page of the pdf.
    page_blobs = [Image(image=frame).make_blob('jpeg')
                  for frame in rendered.sequence]

    return [
        engine.image_to_string(PI.open(io.BytesIO(blob)),
                               lang=language,
                               builder=pyocr.builders.TextBuilder())
        for blob in page_blobs
    ]
def mosaic_thumbnail(self):
    """
    Use wand to create an effect like that in the imagemagick command line
    montage command.

    Read through each page in the pdf, resize it, draw an angled rectangle
    in gray for the background, then the resized image on top of that.

    :returns: The composed mosaic as a PNG blob.
    """
    back_filename = "%s/../resources/mosaic_background.png" \
        % self.dir_name
    back_img = Image(filename=back_filename)

    # From:
    # http://stackoverflow.com/questions/18821145/\
    # wand-convert-pdf-to-jpeg-and-storing-pages-in-file-like-objects
    # Decide on the source first; previously an image was always built from
    # ``self.blob`` (even when it was None) and then thrown away.
    if self.blob is None:
        image_pdf = Image(filename=self.filename)
    else:
        image_pdf = Image(blob=self.blob)

    image_png = image_pdf.convert("png")

    for shift, page_img in enumerate(image_png.sequence):
        page_img.resize(80, 103)
        self.composite_gray(back_img, page_img, shift)

    back_img.format = "png"
    return back_img.make_blob()
def get_thumbnail_from_pdf(self, file):
    """Create a thumbnail image from the first page of a PDF.

    :param file: Open file object holding the PDF.
    :returns: The temporary file wrapper reopened for reading, or ``None``
        when any step raises (the exception is printed).
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm which statements belong to the ``if imgs:`` branch.
    try:
        filename = file.name  # not used afterwards
        img = None
        # Convert PDF files
        imgs_pdf = Image(file=file)
        imgs = imgs_pdf.convert('jpeg')
        if imgs:
            # Only the first page is used for the thumbnail.
            img = Image(image=imgs.sequence[0])
            # Flatten transparency onto a white background.
            img.background_color = Color('white')
            img.alpha_channel = 'remove'
            # resized and save the converted file
            img.transform(crop='', resize=THUMBNAIL_SIZE)
            img.thumbnail()
            # delete=False keeps the file on disk after the handle closes.
            temp = NamedTemporaryFile(delete=False)
            temp.flush()
            # presumably a Django-style ``File`` wrapper - verify
            temp0 = File(temp)
            with temp0.open('wb') as f:
                img.save(file=f)
            return temp0.open('rb')
    except Exception as e:
        print(repr(e))
        return None
def new_blank_png(width, height, color=None):
    """Create a blank image of the given size and return it as PNG.

    :param width: Image width in pixels.
    :param height: Image height in pixels.
    :param color: Optional background; only passed on when truthy.
    :returns: A new PNG-format image.
    """
    options = {'width': width, 'height': height}
    if color:
        options['background'] = color
    return Image(**options).convert('png')
def extract_text(input_pdf):
    """OCR a PDF and write the text to a timestamped file in ``Ouput_dir``.

    :param input_pdf: Path of the PDF (rendered at 300 DPI).
    :returns: ``input_pdf`` unchanged.
    """
    ocr_tool = pyocr.get_available_tools()[0]
    ocr_lang = ocr_tool.get_available_languages()[0]

    rendered = Image(filename=input_pdf, resolution=300).convert('jpeg')
    page_blobs = [Image(image=frame).make_blob('jpeg')
                  for frame in rendered.sequence]

    page_texts = [
        ocr_tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=ocr_lang,
            builder=pyocr.builders.TextBuilder()
        )
        for blob in page_blobs
    ]

    # The file name carries the current time down to microseconds.
    out_name = Ouput_dir + 'output{0:%H_%M_%S.%f}.txt'.format(datetime.now())
    with open(out_name, 'w', encoding='utf-8') as out:
        out.writelines(page_texts)
    return input_pdf
def Parse_PDF(FileName):
    """Extract the text of a PDF, falling back to OCR when none is found.

    The text layer is read with pdfminer first.  When that yields exactly
    one character, every page is rendered and OCR'd with pyocr instead.

    :param FileName: Path of the PDF file.
    :returns: The extracted text, or ``None`` when parsing fails (an error
        message is printed in that case).
    """
    try:
        # Open and read the pdf file; ``with`` closes it on errors too.
        with open(FileName, 'rb') as fp:
            # Create parser object to parse the pdf content
            parser = PDFParser(fp)
            # No password for the pdf file
            password = ""
            document = PDFDocument(parser, password)

            # check out password protection
            if not document.is_extractable:
                print("File Under Password Protection" + str(FileName))
                raise PDFTextExtractionNotAllowed(
                    "File Under Password Protection")

            # PDFResourceManager stores shared resources such as fonts or
            # images; the converter device is attached to it and writes the
            # interpreted text into ``retstr``.
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True):
                interpreter.process_page(page)

        device.close()
        output = retstr.getvalue()
        retstr.close()

        # If content is extracted successfully, return the content,
        # otherwise go to pyOCR.
        # NOTE(review): a length of exactly 1 is treated as "no text" -
        # presumably a lone page separator; confirm.
        if len(output) != 1:
            return output

        # The tools are returned in the recommended order of usage
        tool = pyocr.get_available_tools()[0]

        req_image = []
        with Image(filename=FileName, resolution=300) as image_pdf:
            with image_pdf.convert('jpeg') as image_jpeg:
                for img in image_jpeg.sequence:
                    with Image(image=img) as img_page:
                        req_image.append(img_page.make_blob('jpeg'))

        return "".join(
            tool.image_to_string(PI.open(io.BytesIO(blob)),
                                 builder=pyocr.builders.TextBuilder())
            for blob in req_image
        )
    except Exception:
        # Report the path that failed; the old message referenced the
        # undefined name ``file``.
        print("ERROR - Could not parse file" + str(FileName))
def process_pattern(raw_content):
    """Convert raw image data to TIFF and report whether it is gray/bilevel.

    :param raw_content: Raw bytes of the source image.
    :returns: Tuple ``(stream, flag)``: a rewound ``StringIO`` holding the
        TIFF data, and ``True`` when the source image type is bilevel,
        grayscale or grayscalematte.
    """
    source = Image(file=StringIO(raw_content))
    tiff_copy = source.convert('tiff')

    is_gray = source.type in ['bilevel', 'grayscale', 'grayscalematte']

    stream = StringIO()
    tiff_copy.save(file=stream)
    stream.seek(0)
    return stream, is_gray
def upload_from_web(self, request, pk=None):
    """Store the files of a web upload for the current chatroom.

    Creates an ``Upload`` for the requesting user, uploads each file (PDFs
    are rendered to at most 25 JPEG pages), records a chatroom activity,
    sends the created-notification and posts a summary to Slack.

    :param request: Request carrying ``name``, ``is_anonymous``, ``tag_id``
        and the uploaded files.
    :param pk: Not used directly in this method.
    :returns: ``Response(200)``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm the placement of the two ``break`` statements below.
    from wand.image import Image
    from wand.color import Color
    from wand import exceptions as wand_exceptions
    from apps.group.models import CourseGroup, CourseGroupMember

    chatroom = self.get_object()
    try:
        chatroom_member = ChatroomMember.objects.get(chatroom=chatroom, user=request.user)
    except ChatroomMember.DoesNotExist:
        # Create the course group member (is past)
        course_group = CourseGroup.objects.get(chatroom=chatroom)
        course_group_member = CourseGroupMember.objects.create(course_group=course_group, student=request.user.student, is_past=True)
        # Create the chatroom member (is past)
        chatroom_member = ChatroomMember.objects.create(user=request.user, chatroom=chatroom, is_past=True)

    name = request.data.get('name')
    is_anonymous = int(request.data.get('is_anonymous', False))
    tag = Tag.objects.get(pk=int(request.POST.get('tag_id')))
    new_upload = Upload.objects.create(chatroom_member=chatroom_member, chatroom=chatroom, name=name, tag=tag, is_anonymous=is_anonymous)

    # Newline-separated list of the URLs of everything uploaded.
    all_urls = ""
    for fp in request.FILES:
        uploadedFile = request.data.get(fp)
        if uploadedFile.content_type == "application/pdf":
            # Render the PDF on a white background and upload page images.
            image_pdf = Image(file=uploadedFile, resolution=250, background=Color("white"))
            image_jpeg = image_pdf.convert('jpeg')
            count = 0
            for single_img in image_jpeg.sequence:
                img = Image(image=single_img, resolution=250)
                temp = tempfile.TemporaryFile()
                img.alpha_channel = False
                img.save(file=temp)
                url = new_upload.upload_file(temp)
                temp.close()
                all_urls = all_urls + url + "\n"
                count += 1
                # Stop after 25 pages.
                if count >= 25:
                    break
            # Leaves the loop over the uploaded files after a PDF.
            break
        url = new_upload.upload_file(uploadedFile)
        all_urls = all_urls + url + "\n"

    activity_type = ChatroomActivityType.objects.get_activity_type(ChatroomActivityTypeManager.UPLOAD)
    activity = ChatroomActivity.objects.create(chatroom=chatroom, chatroom_activity_type=activity_type, activity_id=new_upload.pk)
    new_upload.send_created_notification(activity, request, True)

    # post to slack TODO add detail
    # NOTE(review): in this view the string literal below is split across a
    # physical line; check the separator character in the real file.
    message = request.user.email + " uploaded files to 
" + chatroom.name + ":\n[" + str(new_upload.id) + "] " + all_urls
    slack_utils.send_simple_slack_message(message)
    return Response(200)
def make_anim():
    """Assemble the PNG files of the current directory into a cropped GIF.

    The first file found becomes the base animation; every other file is
    converted to GIF and appended as a further frame.  After cropping, an
    interactive IPython shell is opened before the result is passed to
    ``save``.
    """
    paths = glob('*.png')
    image = Image(filename=paths[0])
    anim = image.convert('gif')
    for p in paths[1:]:
        anim.sequence.append(
            Image(filename=p).convert('gif'))
    # lined up manually for once off capture
    anim.crop(left=80, top=1030, width=585, height=140)
    # Interactive inspection; the local names above are visible in the shell.
    import IPython
    IPython.embed()
    save(anim)
def preview(pdfpath, outpath, croph):
    """
    Save a top-cropped screenshot of a PDF page on a white background.

    @arg pdfpath input pdf path
    @arg outpath output screenshot file path
    @arg croph percentage of the page height to crop at (multiplied with
               the page height)
    @return True once a page has been saved, False otherwise
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm that ``return True`` belongs inside the page loop.
    try:
        with Image(filename=pdfpath) as imgs:
            for imgidx, img in enumerate(imgs.sequence):
                w, h = img.width, img.height
                # Keep only the top ``croph`` share of the page.
                h = int(croph * h)
                img.crop(0, 0, width=w, height=h)
                # Paste the crop onto a white canvas of the same size.
                newimg = Image(width=w, height=h, background=Color("white"))
                newimg.composite(img, 0, 0)
                # NOTE(review): the result of convert() is not used here.
                newimg.convert("png")
                newimg.save(filename=outpath)
                return True
    except Exception as err:
        print err
    return False
def process_image(raw_content):
    """Split raw image data into a base image stream and an alpha stream.

    When the source has an alpha channel, a truecolormatte PNG copy is
    written to the alpha stream and the channel is switched off on the
    source.  The base image is TIFF for colorseparation type or lab
    colorspace sources and PNG otherwise.

    :param raw_content: Raw bytes of the source image.
    :returns: Tuple ``(base, alpha)`` of rewound ``StringIO`` objects;
        ``alpha`` is ``None`` when the source has no alpha channel.
    """
    source = Image(file=StringIO(raw_content))

    alpha = None
    if source.alpha_channel:
        # Take the copy before the channel is disabled on the source.
        with_alpha = source.convert('png')
        with_alpha.type = 'truecolormatte'
        source.alpha_channel = False
        alpha = StringIO()
        with_alpha.save(file=alpha)
        alpha.seek(0)

    needs_tiff = source.type == 'colorseparation' or source.colorspace == 'lab'
    base_img = source.convert('tiff' if needs_tiff else 'png')
    # base_img = img.convert('tiff')

    base = StringIO()
    base_img.save(file=base)
    base.seek(0)
    return base, alpha
def screenshots(pdfpath, h=200):
    """Save a white-backed PNG thumbnail of every page of a PDF.

    Images are written to ``IMGROOT`` as ``<pdf basename>_<NN>.png``.

    :param pdfpath: Path of the PDF.
    :param h: Height of the thumbnails in pixels; the width follows the
        page's aspect ratio.
    :returns: List of dicts with ``imgpath`` and ``imgidx`` per page.
    """
    # dirpath = os.path.join(os.path.dirname(os.path.dirname(pdfpath)), "imgs")
    dirpath = IMGROOT
    fprefix = os.path.splitext(os.path.basename(pdfpath))[0]
    print("screenshots for %s" % fprefix)

    ret = []
    with Image(filename=pdfpath) as imgs:
        for imgidx, img in enumerate(imgs.sequence):
            w = int(h * img.width / img.height)
            imgname = "%s_%02d.png" % (fprefix, imgidx)
            imgpath = os.path.join(dirpath, imgname)

            # Release each page canvas as soon as it has been written.  The
            # former ``tosave.convert('png')`` only created an unused copy;
            # the ``.png`` file name already selects the output format.
            with Image(width=img.width, height=img.height,
                       background=Color("white")) as tosave:
                tosave.composite(img, 0, 0)
                tosave.resize(w, h, filter='lagrange', blur=.35)
                tosave.save(filename=imgpath)

            ret.append(dict(
                imgpath=imgpath,
                imgidx=imgidx
            ))
    return ret
def upload_avatar():
    """Store an uploaded avatar for the current user.

    The posted ``userFile`` is saved under a name derived from the MD5 of
    the user's e-mail, resized to 320x240, converted to PNG and handed to
    ``current_user.load_avatar``.

    :returns: Redirect to ``/profile`` after a POST with a file, otherwise a
        redirect to ``/``.
    """
    from wand.exceptions import WandException

    # TODO: process situation when request.files don't have needed file
    if request.method != 'POST' or 'userFile' not in request.files:
        # TODO: redirect to previous url
        return redirect('/')

    # MD5 is used only to derive a stable file name, not for security.
    hasher = md5()
    hasher.update(current_user.email.encode('utf-8'))

    f1 = request.files['userFile']
    filename = path.join(UPLOADED_AVATARS_DEST, hasher.hexdigest())
    f1.save(filename)
    print('new avatar for user ' + current_user.first_name +
          current_user.second_name + ' was saved as ' + filename)

    # The constructor raises for unreadable data instead of returning None,
    # so the former ``img is None`` check could never trigger.
    try:
        img = Image(filename=filename)
    except WandException:
        print("Can't parse blob from post message as image!")
        return redirect('/profile')

    img.resize(320, 240)
    img = img.convert('png')
    user = current_user
    user.load_avatar(img)
    return redirect('/profile')
def image_opener(key):
    """Handler to locate file based on key.

    :param key: A key encoded in the format "<bucket>:<version>:<object_key>".
    :returns: A file-like object positioned at its start.
    """
    if hasattr(g, 'obj'):
        obj = g.obj
    else:
        obj = protect_api(key)

    fp = obj.file.storage().open('rb')

    # If ImageMagick with Wand is installed, extract first page
    # for PDF/text.
    if HAS_IMAGEMAGICK and obj.mimetype in ['application/pdf', 'text/plain']:
        tempfile_ = tempfile.TemporaryFile()
        try:
            # Pass the stream as ``file=``: the first positional parameter
            # of ``wand.image.Image`` is ``image`` (an existing image).
            with Image(file=fp) as document:
                with Image(image=document.sequence[0]) as first_page:
                    with first_page.convert(format='png') as converted:
                        converted.save(file=tempfile_)
        except Exception:
            tempfile_.close()
            raise
        finally:
            # The original stream is no longer needed once rendered.
            fp.close()
        # Rewind so the caller reads the PNG from its first byte.
        tempfile_.seek(0)
        return tempfile_

    return fp
def image_to_text(image):
    """OCR an image or PDF file and return the text of each page.

    Uses the first pyocr tool and the third entry of its language list.

    :param image: Path of the file to read (rendered at 300 DPI).
    :returns: List of page texts; an empty list when the file cannot be
        opened.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[2]

    final_text = []
    try:
        image_obj = Image(filename=image, resolution=300)
    except Exception:
        # Unreadable input yields no text rather than an error.
        return final_text

    # Compare by value: ``is not 'jpeg'`` tested object identity, which is
    # not a reliable way to compare strings.
    if not image.lower().endswith(('.jpg', '.jpeg')):
        image_obj = image_obj.convert('jpeg')

    req_image = []
    for img in image_obj.sequence:
        with Image(image=img) as img_page:
            req_image.append(img_page.make_blob('jpeg'))

    for blob in req_image:
        txt = tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            lang=lang,
            builder=pyocr.builders.TextBuilder()
        )
        final_text.append(txt)
    return final_text
def __init__(self, page=None):
    """Render a PDF page to a PIL image and scan it for barcodes.

    :param page: PDF page object that is added to a ``PdfFileWriter``.
    """
    self._barcodes = None
    self._efos_barcode = None
    self.page = page

    # save single page to stream
    page_stream = StringIO.StringIO()
    pdf_file_writer = PdfFileWriter()
    pdf_file_writer.addPage(self.page)
    pdf_file_writer.write(page_stream)

    # Convert pdf file to image
    image_stream = StringIO.StringIO()
    with WandImage(blob=page_stream.getvalue()) as pdf_image:
        with pdf_image.convert('png') as png_image:
            # self.image.type = 'grayscale'
            png_image.save(file=image_stream)

    # Rewind before handing the stream to PIL so that reading starts at the
    # PNG header regardless of how the reader treats the stream position.
    image_stream.seek(0)
    # Open the rendered page with PIL
    self.pil_image = PILImage.open(image_stream)
    self.get_barcodes()
class ImportMedia(object):
    """
    Import new photos: create JPEG, XMP and thumbnail files, add to database
    """
    PROXY_FULLSIZE = 0
    PROXY_THUMBNAIL = 1
    PROXY_WEBSIZED = 2

    @staticmethod
    def _unlink_quietly(path):
        """Remove *path* as best-effort cleanup; a failure is ignored.

        Used in error paths so that a missing file cannot replace the
        exception that is actually being reported.
        """
        try:
            os.unlink(path)
        except OSError:
            pass

    # outrageously slow, exec()s ufraw-batch in newer versions
    def _create_proxy_wand(self, source, dest, mode):
        """Write a JPEG proxy of *source* to *dest* using Wand.

        The decoded source is cached in ``self.image`` for later calls.

        :param source: Path of the original media file.
        :param dest: Path of the JPEG to write.
        :param mode: One of the ``PROXY_*`` constants.
        :raises Exception: Whatever Wand raises on read or write failure
            (logged first).
        """
        # load image if not yet done
        if not self.image:
            try:
                self.image = Image(filename=source)
            except Exception as e:
                # Format the exception itself: ``e.args[0]`` fails on
                # exceptions raised without arguments.
                logging.error('cannot read {}: {}'.format(source, e))
                raise

        # copy bitmap, we may need it again
        with self.image.convert('jpeg') as image:
            # resize (PROXY_FULLSIZE keeps the original dimensions)
            if mode == self.PROXY_THUMBNAIL:
                image.transform(resize=settings.THUMBNAILSIZE)
            elif mode == self.PROXY_WEBSIZED:
                image.transform(resize=settings.WEBSIZE)

            try:
                # Wand does auto-rotate, but doesn't fix EXIF data... m(
                image.strip()
                image.save(filename=dest)
            except Exception as e:
                logging.error('Error: cannot write {}: {}'.format(dest, e))
                raise

    def _create_proxy_rawpy(self, source, dest, mode):
        """Write a proxy of *source* to *dest* using Pillow, or rawpy for raws.

        :param source: Path of the original media file.
        :param dest: Path of the image to write.
        :param mode: One of the ``PROXY_*`` constants.
        :raises Exception: On read or write failure (logged first).  Write
            errors are re-raised like in the Wand variant, so callers can
            clean up instead of continuing without a proxy.
        """
        # maybe Pillow supports this file type directly?
        if not self.image:
            try:
                self.image = Image.open(source)
            except IOError:
                pass
            except Exception as e:
                logging.error('cannot read {}: {}'.format(source, e))
                raise

        # obviously not, try decoding as Raw
        if not self.image:
            try:
                raw = rawpy.imread(source)
                rgb = raw.postprocess(use_camera_wb=True, no_auto_bright=True)
                self.image = Image.fromarray(rgb)
            except Exception as e:
                logging.error('cannot read {}: {}'.format(source, e))
                raise

        image = self.image.copy()
        if mode == self.PROXY_THUMBNAIL:
            image.thumbnail(settings.THUMBNAILSIZE)
        elif mode == self.PROXY_WEBSIZED:
            image.thumbnail(settings.WEBSIZE)

        try:
            image.save(dest)
        except Exception as e:
            logging.error('cannot write {}: {}'.format(dest, e))
            raise

    def create_proxy(self, source, dest, mode):
        """Create the proxy image *dest* for *source*.

        Nothing is done when *dest* already exists and ``self.force`` is
        unset.  A full-size proxy of a JPEG with normal orientation is a
        hard link to the source; everything else is rendered by the library
        selected in ``settings.IMAGE_LIB``.

        :param source: Path of the original media file.
        :param dest: Path of the proxy to create.
        :param mode: One of the ``PROXY_*`` constants.
        """
        logging.info('called for {}, {}, {}'.format(source, dest, mode))
        if os.path.isfile(dest) and not self.force:
            return

        # just link fullsize image if JPEG and orientation is normal
        is_jpeg = os.path.splitext(source)[1].lower() in ('.jpg', '.jpeg')
        orientation = self.exif.get_orientation()
        logging.debug('EXIF Orientation: {}'.format(orientation.value_nick))
        if mode == self.PROXY_FULLSIZE and is_jpeg and orientation == orientation.NORMAL:
            # a stale proxy may or may not be present
            self._unlink_quietly(dest)
            try:
                os.link(source, dest)
            except Exception as e:
                logging.error('cannot link {} to {}: {}'.format(source, dest, e))
                raise
            return

        if settings.IMAGE_LIB == 'wand':
            self._create_proxy_wand(source, dest, mode)
        elif settings.IMAGE_LIB == 'rawpy':
            self._create_proxy_rawpy(source, dest, mode)
        else:
            logging.error('configuration error: unknown or missing IMAGE_LIB {}'.format(settings.IMAGE_LIB))

    def create_video_proxy(self, source, thumbnail, websized):
        """Extract a thumbnail and a web-sized still from a video via ffmpeg.

        ffmpeg is run selecting I-frame number 10 first and, if that did not
        produce both files, I-frame number 1.

        :param source: Path of the video file.
        :param thumbnail: Path of the thumbnail image to write.
        :param websized: Path of the preview image to write.
        """
        logging.info('called for {}, {}, {}'.format(source, thumbnail, websized))
        if os.path.isfile(thumbnail) and os.path.isfile(websized) and not self.force:
            return

        for keyframe in 10, 1:
            ffmpeg_filter = (
                "[0:v] select='eq(pict_type,I)*lte(t,30)' [iframes];"
                "[iframes] select='gte(n,{})',split=2 [sel1][sel2];"
                "[sel1] scale=128:-1[tns]; [tns][1:v] overlay=main_w-overlay_w-5:main_h-overlay_h-5 [tn];"
                "[sel2] scale=768:-1[prxs]; [prxs][2:v]overlay=main_w-overlay_w-10:main_h-overlay_h-10 [proxy]"
            ).format(keyframe)
            call([settings.FFMPEG_COMMAND,
                  '-i', source,
                  '-i', settings.THUMBNAIL_VIDEO_PLAY_BUTTON,
                  '-i', settings.PREVIEW_VIDEO_PLAY_BUTTON,
                  '-filter_complex', ffmpeg_filter,
                  '-y', '-v', 'quiet',
                  '-map', '[tn]', '-frames:v', '1', thumbnail,
                  '-map', '[proxy]', '-frames:v', '1', websized],
                 shell=False)
            if os.path.isfile(thumbnail) and os.path.isfile(websized):
                return

    def write_xmp_sidecar(self, sourcefile):
        """Write an XMP sidecar next to *sourcefile* from ``self.exif``.

        Failures are logged, not raised.

        :param sourcefile: Path of the media file.
        :returns: Path of the sidecar (source path with ``.xmp`` extension).
        """
        destxmppath = os.path.splitext(sourcefile)[0] + '.xmp'
        try:
            # how brain-damaged is this?!
            # An empty XMP skeleton is written first, then save_file() is
            # called on that path.
            with open(destxmppath, mode='w+') as fd:
                fd.write('<?xml version="1.0" encoding="UTF-8"?><x:xmpmeta xmlns:x="adobe:ns:meta/"></x:xmpmeta>')
            self.exif.save_file(destxmppath)
        except Exception as e:
            logging.error('cannot write {}: {}'.format(destxmppath, e))
        return destxmppath

    def get_timestamp(self, source_file, use_exif=True):
        """Determine the capture time of *source_file*.

        :param source_file: Path of the media file.
        :param use_exif: Consult the EXIF tags ``DateTimeOriginal`` and
            ``DateTime`` before falling back to the file's ctime.
        :returns: A time struct as produced by ``tz.time.strptime`` or
            ``tz.time.gmtime``.
        """
        # first try to get it from EXIF data
        timestamp = None
        if use_exif:
            timestamp = self.exif.get_tag_string('Exif.Photo.DateTimeOriginal')
            if not timestamp:
                timestamp = self.exif.get_tag_string('Exif.Image.DateTime')

        if timestamp:
            return tz.time.strptime(timestamp, '%Y:%m:%d %H:%M:%S')

        # try timestamp of file
        st = os.stat(source_file)
        timestruct = tz.time.gmtime(int(st.st_ctime))
        # camera clock not set, eh?
        if timestruct.tm_year < 2000:
            timestruct = tz.time.gmtime()
        return timestruct

    def create_filename(self, source_file, timestruct):
        """Build an export filename that is not yet used by a ``MediaFile``.

        The name is ``<YYYYMMDD>-<lowercased basename>``; on a clash a
        six-digit counter is inserted after the date.

        :param source_file: Path of the media file.
        :param timestruct: Time struct supplying the date prefix.
        :returns: The filename, or ``None`` when the counter is exhausted.
        """
        datestring = tz.time.strftime("%Y%m%d", timestruct)
        mediabasename = os.path.basename(source_file).lower()
        filename = datestring + "-" + mediabasename
        count = 1
        while count < 1000000:
            try:
                MediaFile.objects.get(filename=filename)
            except Exception:
                # any lookup failure is taken to mean "name is free"
                break
            filename = '{}-{:06d}-{}'.format(datestring, count, mediabasename)
            count += 1
        if count == 1000000:
            logging.error('Cannot create unique export filename')
            return None
        return filename

    def update_image_parameters(self, entry):
        """Copy exposure parameters from ``self.exif`` onto *entry*.

        Sets ``f_number``, ``exposure_time``, ``gain_value`` and
        ``focal_length``; a tag that is missing or unparsable yields 0.

        :param entry: Object receiving the attributes.
        """
        try:
            entry.f_number = Fraction(self.exif.get_tag_string('Exif.Photo.FNumber')) * 1.0
        except Exception:
            entry.f_number = 0
        try:
            entry.exposure_time = Fraction(self.exif.get_tag_string('Exif.Photo.ExposureTime')) * 1.0
        except Exception:
            entry.exposure_time = 0
        try:
            entry.gain_value = float(self.exif.get_tag_string('Exif.Photo.ISOSpeedRatings'))
        except Exception:
            # fall back to the gain control tag
            try:
                entry.gain_value = float(self.exif.get_tag_string('Exif.Photo.GainControl'))
            except Exception:
                entry.gain_value = 0
        try:
            entry.focal_length = Fraction(self.exif.get_tag_string('Exif.Photo.FocalLength')) * 1.0
        except Exception:
            entry.focal_length = 0

    def update_db(self, source_file, sidecar=None, is_supported_media=True):
        """Fetch or build the ``MediaFile`` entry for *source_file*.

        The entry is not saved here.  An existing entry is returned
        untouched unless ``self.force`` is set.

        :param source_file: Path of the media file.
        :param sidecar: Sidecar file name stored on the entry.
        :param is_supported_media: Read rating, label and exposure data from
            ``self.exif`` when true.
        :returns: The entry, or ``None`` when no unique filename could be
            created.
        """
        entry = None
        media_dir = self.mediadir
        media_file = os.path.basename(source_file)
        try:
            entry = MediaFile.objects.get(media_dir=media_dir, media_file=media_file)
            if not self.force:
                return entry
        except MediaFile.DoesNotExist:
            timestamp = self.get_timestamp(source_file, use_exif=is_supported_media)
            filename = self.create_filename(source_file, timestamp)
            if not filename:
                return None
            catalog, created = Catalog.objects.get_or_create(name=settings.DEFAULT_CATALOG)
            mime_type, created = MimeType.objects.get_or_create(type=self.mimetype)
            try:
                entry = MediaFile(media_dir=media_dir, media_file=media_file)
            except Exception as e:
                logging.error('Cannot get object: {}'.format(e))
                raise
            entry.mime_type = mime_type
            entry.catalog = catalog
            entry.filename = filename
            entry.date = tz.time.strftime('%Y-%m-%dT%H:%M:%SZ', timestamp)

        if is_supported_media:
            rating = self.exif.get_tag_long('Xmp.xmp.Rating')
            if rating:
                entry.rating = rating
            label = self.exif.get_tag_string('Xmp.xmp.Label')
            if label:
                entry.label = label
        entry.sidecar_file = sidecar
        if is_supported_media:
            self.update_image_parameters(entry)
        return entry

    def import_image(self, source_file):
        """Import one image: full-size, thumbnail and preview proxies, sidecar, DB entry.

        Proxies already written are removed again when a later step fails.

        :param source_file: Absolute path of the image below
            ``settings.SOURCE_DIR``.
        """
        self.image = None

        # prefer XMP "sidecar" files to save us from parsing huge RAW files more than necessary
        source_xmp_file = os.path.splitext(source_file)[0]
        havesourcesidecar = False
        # every variant needs its leading dot ("Xmp" matched "<name>Xmp")
        for ext in (".xmp", ".XMP", ".Xmp"):
            if os.path.isfile(source_xmp_file + ext):
                source_xmp_file += ext
                try:
                    havesourcesidecar = self.exif.open_path(source_xmp_file)  # returns True on success
                except Exception:
                    pass
                break

        # No sidecar, oh well...
        if not havesourcesidecar:
            self.exif.open_path(source_file)

        (mediareldir, jpegfilename) = os.path.split(os.path.relpath(source_file, settings.SOURCE_DIR))
        jpegfilename = os.path.splitext(jpegfilename)[0] + ".jpg"

        self.status.update(10, 'Writing Proxy')
        mediadir = settings.WEB_DIR + mediareldir
        jpegfullpath = mediadir + '/' + jpegfilename
        tools.mkdir(mediadir)
        self.create_proxy(source_file, jpegfullpath, self.PROXY_FULLSIZE)

        self.status.update(75, 'Writing Thumbnail')
        try:
            tndir = mediadir + "/" + settings.THUMBNAIL_DIR
            tnfullpath = tndir + '/' + jpegfilename
            tools.mkdir(tndir)
            self.create_proxy(source_file, tnfullpath, self.PROXY_THUMBNAIL)
        except Exception:
            self._unlink_quietly(jpegfullpath)
            raise

        self.status.update(85, 'Writing Preview')
        try:
            webimgdir = mediadir + "/" + settings.PREVIEW_DIR
            webimgfullpath = webimgdir + '/' + jpegfilename
            tools.mkdir(webimgdir)
            self.create_proxy(source_file, webimgfullpath, self.PROXY_WEBSIZED)
        except Exception:
            self._unlink_quietly(jpegfullpath)
            self._unlink_quietly(tnfullpath)
            raise

        # we need to write the XMP sidecar file here as we definitely do not want to parse
        # RAW files from the web app and exiv2 isn't capable to construct it's content without
        # a source file. A sidecar file is required for import of the rating and label tags
        # with lightroom, though.
        #
        # ATTN: We'll write it to SOURCE_DIR. The web app should never touch the proxy dirs
        # by itself
        if not havesourcesidecar:
            source_xmp_file = self.write_xmp_sidecar(source_file)

        entry = self.update_db(source_file, sidecar=os.path.basename(source_xmp_file))
        self.status.update(95, 'Writing Database')
        entry.save()
        self.status.update(100, 'Done')

    def import_video(self, source_file):
        """Import one video: link it into the web dir, create stills, sidecar, DB entry.

        :param source_file: Absolute path of the video below
            ``settings.SOURCE_DIR``.
        :raises NotImplementedError: When ``settings.FFMPEG_COMMAND`` is unset.
        """
        if not settings.FFMPEG_COMMAND:
            raise NotImplementedError

        source_thm_file = os.path.splitext(source_file)[0]
        havesourcesidecar = False
        # every variant needs its leading dot ("Thm" matched "<name>Thm")
        for ext in (".thm", ".THM", ".Thm"):
            if os.path.isfile(source_thm_file + ext):
                source_thm_file += ext
                try:
                    havesourcesidecar = self.exif.open_path(source_thm_file)  # returns True on success
                except Exception:
                    pass
                break

        (mediareldir, jpegfilename) = os.path.split(os.path.relpath(source_file, settings.SOURCE_DIR))
        mediadir = settings.WEB_DIR + mediareldir
        (basename, extension) = os.path.splitext(jpegfilename)
        jpegfilename = basename + ".jpg"
        mediafilename = basename + extension.lower()

        self.status.update(10, 'Linking Source File')
        mediafullpath = mediadir + '/' + mediafilename
        tools.mkdir(mediadir)
        tools.link(source_file, mediafullpath)

        self.status.update(50, 'Writing Thumbnail and Proxy')
        try:
            tndir = mediadir + "/" + settings.THUMBNAIL_DIR
            tnfullpath = tndir + '/' + jpegfilename
            tools.mkdir(tndir)
        except Exception:
            self._unlink_quietly(mediafullpath)
            raise

        try:
            webimgdir = mediadir + "/" + settings.PREVIEW_DIR
            webimgfullpath = webimgdir + '/' + jpegfilename
            tools.mkdir(webimgdir)
        except Exception:
            self._unlink_quietly(mediafullpath)
            # No thumbnail has been written at this point; a plain unlink
            # would raise here and hide the real error.
            self._unlink_quietly(tnfullpath)
            raise

        self.create_video_proxy(source_file, tnfullpath, webimgfullpath)

        self.status.update(90, 'Writing Sidecar')
        if not havesourcesidecar:
            source_sidecar = self.write_xmp_sidecar(source_file)
            self.exif.open_path(source_sidecar)
            sidecar = os.path.basename(source_sidecar)
        else:
            sidecar = os.path.basename(source_thm_file)

        entry = self.update_db(source_file, sidecar=sidecar)
        self.status.update(95, 'Writing Database')
        entry.save()
        self.status.update(100, 'Done')

    def import_other(self, source_file):
        """Register a file that is neither image nor video.

        The entry is saved only when its mime type has ``copy`` set.

        :param source_file: Absolute path of the file.
        """
        entry = self.update_db(source_file, is_supported_media=False)
        if entry is None:
            # update_db() gives up when no unique export filename exists
            self.status.error('Cannot Create Filename')
            return
        if entry.mime_type.copy:
            self.status.update(50, 'Writing Database')
            entry.save()
        self.status.update(100, 'Done')

    def do_import(self, source_file):
        """Import *source_file*, dispatching on its detected mime type.

        A failing image or video import is logged and the file is then
        handled by :meth:`import_other`.  ``.xmp`` files are only marked as
        done.

        :param source_file: Path of the file; must resolve to a location
            below ``settings.SOURCE_DIR``.
        """
        self.status = StatusWriter(statusname=settings.PROCESS_STATUS, filename=source_file, text='Start')
        source_file = os.path.abspath(source_file)

        if not source_file.startswith(settings.SOURCE_DIR):
            logging.critical('{} is not below directory {}'.format(source_file, settings.SOURCE_DIR))
            self.status.error('Broken Configuration')
            return

        if not os.path.isfile(source_file):
            logging.critical('{} does not exist or is not a file'.format(source_file))
            self.status.error('Not a File')
            return

        if self.lock:
            self.mediadir = MediaDir.objects.compare_and_lock(self.mediadir, source_file, prefix=settings.SOURCE_DIR, name=self.name)
        else:
            self.mediadir = MediaDir.objects.get_or_create_by_full_path(source_file, prefix=settings.SOURCE_DIR)

        self.status.update(5, 'Examining File Type')
        self.mimetype = magic.from_file(filename=source_file, mime=True)
        extension = os.path.splitext(source_file)[1].lower()

        # image, but skip video thumbnail (handled by do_import_video and exporter)
        if self.mimetype.startswith('image/') and extension != '.thm':
            try:
                self.import_image(source_file)
                return
            except Exception:
                logging.warning('Importer: ', exc_info=True)
                self.status.update(0, 'Script Error')

        # video, yay.
        if self.mimetype.startswith('video/'):
            try:
                self.import_video(source_file)
                return
            except Exception:
                logging.warning('Importer: ', exc_info=True)
                self.status.update(0, 'Script Error')

        # sidecar gets handled explicitly by exporter
        if extension == '.xmp':
            self.status.update(100, 'Done')
            return

        self.import_other(source_file)

    def close(self):
        """Close the status writer and release the media directory lock."""
        if self.status:
            self.status.close()
            self.status = None
        if self.mediadir and self.lock:
            self.mediadir.unlock()
            self.mediadir = None

    def __del__(self):
        self.close()

    def __init__(self, force=False, lock=True, name=None):
        """Set up an importer.

        :param force: Recreate proxies and update existing database entries.
        :param lock: Lock the media directory while importing.
        :param name: Passed as ``name`` to ``compare_and_lock``.
        """
        # Plain attributes first, so close()/__del__ find them even if
        # creating the metadata object fails.
        self.force = force
        self.lock = lock
        self.name = name
        self.mediadir = None
        self.image = None
        self.status = None
        self.exif = GExiv2.Metadata()
# Collect the offset of every b'icon' marker found in the file contents.
offsets = []
position = contents.find(b'icon')
while position != -1:
    offsets.append(position)
    position = contents.find(b'icon', position + 1)

try:
    for i, offset in enumerate(offsets):
        # The 4 bytes after the marker hold the little-endian payload size.
        source.seek(offset + 4)
        copi = int.from_bytes(source.read(4), byteorder='little')  # Number of bytes to copy
        f_oname = oname + "_" + str(i) + ".png"
        data = source.read(copi)  # Copy x bytes to this variable
        # Release both Wand images once the PNG has been written.
        with Image(format='ico', blob=data) as img:
            with img.convert('png') as converted:
                converted.save(filename=f_oname)
        print("Converted frame %d in address %s to %s" % (i, hex(offset), f_oname))
finally:
    # Close the input even when a frame fails to decode.
    source.close()
#for col in enumerate(range(read_sheet.ncols)): #print "pos2" record_id = read_sheet.cell_value(i,0) record_url = read_sheet.cell_value(i,2) type_contain = [False,False,False,False] type_meaning = ['NAB','Invoice','Contracts','Request'] try: #print record_url req = requests.get(record_url) if req.status_code != 200: write_sheet.write(i,0,record_id) write_sheet.write(i,1,record_url) write_sheet.write(i,2,"CANNOT DOWNLOAD FILE") break image_pdf = Image(blob=req.content,resolution=450) image_bmp = image_pdf.convert('bmp') #currently we only detect first page for efficiency for img in image_bmp.sequence: write_sheet.write(i,0,record_id) write_sheet.write(i,1,record_url) img_page = Image(image=img) img_page =img_page.make_blob('bmp') tmp_img = PI.open(io.BytesIO(img_page)) tmp_img = tmp_img.convert('L') tmp_img = tmp_img.point(lambda x: x>190 and 255) tmp_img = tmp_img.filter(ImageFilter.GaussianBlur(radius = 1.5)) txt = tool.image_to_string(tmp_img,lang="eng",builder=pyocr.builders.TextBuilder()) val = check_format(txt) print "round 1" if val <4: #print type_meaning[val]
# NOTE(review): fragment -- the ``if`` this ``else`` belongs to and the loop
# that supplies ``f`` are outside the visible source (Python 2 code).
else:
    print "Processing image: "+f
    #Create image objects for small and medium using original large
    li = Image(filename=join(orig_path,f))
    mi = li.clone()
    si = mi.clone()
    print 'Original: '+str(mi.width)+'x'+str(mi.height)
    #Resize to med and small maintaining aspect ratio
    # NOTE(review): in ImageMagick geometry syntax a trailing '>' should mean
    # "only shrink images larger than the box" -- confirm.
    mi.transform(resize=str(med_size)+'x'+str(med_size)+'>')
    print 'Medium: '+str(mi.width)+'x'+str(mi.height)
    si.transform(resize=str(small_size)+'x'+str(small_size)+'>')
    print 'Small: '+str(si.width)+'x'+str(si.height)
    #Convert to JPEG if necessary and save as new file
    # The output name replaces the last three characters of the input name
    # with 'jpg', i.e. it assumes a three-letter extension.
    lf = join(large_path,f)
    if li.format != 'JPEG':
        li = li.convert('jpeg')
    li.save(filename=lf[:-3]+'jpg')
    mf = join(med_path,f)
    if mi.format != 'JPEG':
        mi = mi.convert('jpeg')
    mi.save(filename=mf[:-3]+'jpg')
    sf = join(small_path,f)
    if si.format != 'JPEG':
        si = si.convert('jpeg')
    si.save(filename=sf[:-3]+'jpg')
from PIL import Image as PI
import pyocr as pyocr1
import pyocr.builders
import io

# NOTE(review): ``Image`` below is used with the Wand API (filename=,
# resolution=, .sequence) and must be imported above this fragment -- confirm.

path = "C:\\upw\\chin\\test.pdf"

tool = pyocr1.get_available_tools()[0]
lang = tool.get_available_languages()[0]

req_image = []
final_text = []

# Render the document at 300 DPI and keep one JPEG blob per page; the Wand
# images are released as soon as the blobs exist.
with Image(filename=path, resolution=300) as image_pdf:
    with image_pdf.convert('jpeg') as image_jpeg:
        for img in image_jpeg.sequence:
            with Image(image=img) as img_page:
                req_image.append(img_page.make_blob('jpeg'))

# OCR each page and collect the UTF-8 encoded text.
for img in req_image:
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    final_text.append(txt.encode("utf-8"))
    #outfile.write(txt.encode("utf-8"))

# Open the output only once there is something to write, and make sure it
# is flushed and closed.
with open("test.txt", "wb") as outfile:
    outfile.writelines(final_text)
print(final_text)
def update_patient(patient, form, files):
    """Update a patient record with information from submitted form.

    For every many-to-one section this adds child objects for new rows and
    deletes rows whose data was cleared.  New document images are read into
    memory and stored in full, large and small versions (PDF uploads are
    rendered to JPEG first).  Finally the form is applied to *patient*.

    :param patient: Patient object to update.
    :param form: Submitted form with one field list per many-to-one section.
    :param files: Not used by this function.
    :returns: None.
    """
    for field_name, class_name in [
        ('income_sources', IncomeSource),
        ('phone_numbers', PhoneNumber),
        ('addresses', Address),
        ('emergency_contacts', EmergencyContact),
        ('household_members', HouseholdMember),
        ('employers', Employer),
        ('document_images', DocumentImage)
    ]:
        if not form[field_name]:
            continue

        # If the last row in a many-to-one section doesn't have any data, don't save it
        remove_blank_rows_helper(form[field_name])

        # Add a new child object for each new item in a many-to-one section
        new_row_count = len(form[field_name].entries) - getattr(patient, field_name).count()
        for _ in range(max(new_row_count, 0)):
            getattr(patient, field_name).append(class_name())

        # When a user clicks the delete icon on a many-to-one row, it clears
        # all the data in that row. If any existing rows have no data, delete
        # them from patient object and then from the form.
        for row in form[field_name]:
            has_data = any(
                val != '' and val is not None
                for key, val in row.data.items()
                if key not in ('id', 'state', 'employee')
            )
            if has_data:
                continue

            # Entry names end in "-<index>"; parse the whole number so rows
            # 10 and above are addressed correctly (the last character alone
            # is only right for single-digit indexes).
            row_index = int(row.name.rsplit('-', 1)[-1])

            # Delete from patient object
            db.session.delete(getattr(patient, field_name)[row_index])

            # Deletion from form FieldList requires popping all entries
            # after the one to be removed, then readding them
            to_re_add = []
            for _ in range(len(form[field_name].entries) - row_index):
                to_re_add.append(form[field_name].pop_entry())
            # the last entry popped is the row being deleted
            to_re_add.pop()
            for kept_row in reversed(to_re_add):
                form[field_name].append_entry(data=kept_row.data)

    # Get binary data and create resized versions of any new document images
    for index, entry in enumerate(form.document_images):
        upload = entry.file_name.data
        if upload and upload.filename:
            # This is a new file
            if upload.content_type == 'application/pdf':
                # PIL can't handle PDFs, so use Wand.  make_blob('jpeg')
                # does the conversion; a separate convert() call whose
                # result is thrown away only costs time and memory.
                with WandImage(file=upload.stream, resolution=500) as pdf:
                    upload.stream = io.BytesIO(pdf.make_blob('jpeg'))

            large_image = Image.open(upload.stream)
            small_image = large_image.copy()
            large_image_output, small_image_output = io.BytesIO(), io.BytesIO()

            large_image.thumbnail(
                current_app.config['LARGE_DOCUMENT_IMAGE_SIZE'],
                Image.ANTIALIAS
            )
            large_image.save(large_image_output, format='JPEG')

            small_image.thumbnail(
                current_app.config['SMALL_DOCUMENT_IMAGE_SIZE'],
                Image.ANTIALIAS
            )
            small_image.save(small_image_output, format='JPEG')

            entry.data_full.data = upload.stream.getvalue()
            entry.data_large.data = large_image_output.getvalue()
            entry.data_small.data = small_image_output.getvalue()
            entry.file_name.data = upload.filename
        else:
            # This is an existing entry, so the file can't change, only the description
            # Fill in the fields that aren't inputs from the saved data so
            # that populate_obj doesn't overwrite them.
            saved = patient.document_images[index]
            entry.file_name.data = saved.file_name
            entry.data_full.data = saved.data_full
            entry.data_large.data = saved.data_large
            entry.data_small.data = saved.data_small

    # Populate the patient object with all the updated info
    form.populate_obj(patient)
    return
#refer to https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/ from wand.image import Image from PIL import Image as PI import pyocr import pyocr.builders import io import codecs from PIL import ImageFilter tool = pyocr.get_available_tools()[0] req_image = [] #final_text = [] final_text = "NOT NAB" image_pdf = Image(filename="/Users/xiaoshi/Dropbox/Stanford/cs341/final/source/49157Political File2014Non-Candidate Issue Adsnational education assocSCAN-14100711320 (14127116222163)_.pdf", resolution=450) image_png = image_pdf.convert('bmp') for img in image_png.sequence: img_page = Image(image=img) #img_page.type='grayscale' #img_page = img_page.point(lambda x:0 if x<143 else 255) req_image.append(img_page.make_blob('bmp')) #break count = 0 for img in req_image: tmp_img = PI.open(io.BytesIO(img)) tmp_img = tmp_img.convert('L') #tmp_img = tmp_img.point(lambda x: x>190 and 255) tmp_img = tmp_img.point(lambda x: x>190 and 255) tmp_img = tmp_img.filter(ImageFilter.GaussianBlur(radius = 1.5))
def new_blank_png(new_width, new_height):
    """Create a blank image of the given size in PNG format.

    :param new_width: Width of the image.
    :param new_height: Height of the image.
    :returns: A new image converted to ``'png'``; the caller owns it.
    """
    # convert() hands back a separate image, so the scratch canvas it was
    # made from can be released right away.
    with Image(width=new_width, height=new_height) as blank:
        return blank.convert('png')