def ocr_image(file_name: str):
    """OCR a single image file and print the recognised text.

    The file is loaded through ``wand_img`` with ``resolution=300``,
    re-encoded to an in-memory JPEG blob, opened with ``Image.open`` and
    passed to ``pytesseract.image_to_string`` using the ``eng`` language.

    Args:
        file_name: Path of the image file to read.

    Returns:
        None. The recognised text is written to stdout.
    """
    # The image objects are used as context managers so their underlying
    # resources are released as soon as the blob has been produced instead
    # of lingering until garbage collection.
    with wand_img(filename=file_name, resolution=300) as source, \
            wand_img(image=source) as working_copy:
        jpeg_bytes = working_copy.make_blob("jpg")

    with Image.open(io.BytesIO(jpeg_bytes)) as im:
        text = pytesseract.image_to_string(im, lang='eng')
    print(text)
def ocr_image_file(self, current_img_file: str):
    """OCR one extracted PDF image file and return its cleaned text.

    The image ``current_img_file`` is looked up inside ``self.write_path``,
    loaded through ``wand_img`` with ``resolution=300``, re-encoded to an
    in-memory JPEG and passed to ``pytesseract.image_to_string``.  The
    result has ASCII punctuation removed, is lower-cased, and every match
    of the module-level ``SPACES`` pattern is replaced by a single space.

    Args:
        current_img_file: File name of the image, relative to
            ``self.write_path``.

    Returns:
        The cleaned text, or ``None`` when ``self.write_path`` is not a
        directory or when any error occurred (the error is printed).
    """
    try:
        if not os.path.isdir(self.write_path):
            return None

        img_path = os.path.join(self.write_path, current_img_file)
        # Context managers release the image resources deterministically
        # rather than leaving them open until garbage collection.
        with wand_img(filename=img_path, resolution=300) as source, \
                wand_img(image=source) as working_copy:
            jpeg_bytes = working_copy.make_blob(format='jpeg')
        print("In ocr_image_file")

        with Image.open(io.BytesIO(jpeg_bytes)) as im:
            text = pytesseract.image_to_string(im, lang='eng')

        # extract and clean the text
        text = text.translate(
            str.maketrans('', '', string.punctuation)).lower()
        return SPACES.sub(" ", text)
    except Exception as e:
        # Best-effort by design: report the failure and fall through to
        # the explicit ``None`` below so one bad image does not abort the
        # caller.
        print(e)
        print(
            f"An error has occurred while trying to ocr img file: {current_img_file}"
        )
    return None
def make_gif_app_caller(frame, effect):
    """Run *frame* through the external ``gif`` program with *effect*.

    ``frame`` is first transformed in place with the resize geometry
    ``'400x400>'``.  A PNG encoding of it is then written to the standard
    input of ``gif <effect>`` and whatever the program writes to standard
    output is loaded back through ``wand_img(blob=...)``.

    Args:
        frame: Image object accepted by ``wand_img`` (presumably a Wand
            image -- confirm against callers).  It is modified in place
            by the resize.
        effect: Effect name passed to ``gif`` as its only argument.

    Returns:
        A new ``wand_img`` built from the program's standard output.  The
        exit status of ``gif`` is not checked.
    """
    frame.transform(resize='400x400>')

    # Encode before spawning the child so an encoding failure cannot leave
    # a process blocked on stdin, and close the temporary copy right away.
    with wand_img(frame) as snapshot:
        png_bytes = snapshot.make_blob("png")

    # ``subprocess.run`` feeds stdin, collects stdout and reaps the child.
    completed = subprocess.run(
        ["gif", effect], input=png_bytes, stdout=subprocess.PIPE)
    return wand_img(blob=completed.stdout)
def make_imgtext(frame, text):
    """Run *frame* through the external ``gif text`` command with *text*.

    ``frame`` is first transformed in place with the resize geometry
    ``'400x400>'``.  A PNG encoding of it is then written to the standard
    input of ``gif text <text>`` and whatever the program writes to
    standard output is loaded back through ``wand_img(blob=...)``.

    Args:
        frame: Image object accepted by ``wand_img`` (presumably a Wand
            image -- confirm against callers).  It is modified in place
            by the resize.
        text: String passed to ``gif`` as the argument after ``text``.

    Returns:
        A new ``wand_img`` built from the program's standard output.  The
        exit status of ``gif`` is not checked.
    """
    frame.transform(resize='400x400>')

    # Encode before spawning the child so an encoding failure cannot leave
    # a process blocked on stdin, and close the temporary copy right away.
    with wand_img(frame) as snapshot:
        png_bytes = snapshot.make_blob("png")

    # ``subprocess.run`` feeds stdin, collects stdout and reaps the child.
    completed = subprocess.run(
        ["gif", "text", text], input=png_bytes, stdout=subprocess.PIPE)
    return wand_img(blob=completed.stdout)
def ocr_image_file(self, img_file_path: str):
    """OCR every file in a directory and collect the recognised text.

    Each entry of ``img_file_path`` is loaded through ``wand_img`` with
    ``resolution=300``, re-encoded in memory using the file's own
    extension as the blob format, and passed to
    ``pytesseract.image_to_string``.  The raw text of every successfully
    processed file is appended to ``self.extracted_pdfs``.

    Args:
        img_file_path: Directory containing the image files.

    Returns:
        None.  A message is printed when the path is not a directory, when
        the directory cannot be listed, or when a single file fails; a
        failing file does not stop the remaining files from being
        processed.
    """
    if not os.path.isdir(img_file_path):
        print(f'OSError: Path, {img_file_path} not found.')
        return

    try:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # order of ``self.extracted_pdfs`` reproducible between runs.
        file_names = sorted(os.listdir(img_file_path))
    except OSError as e:
        print(e)
        return

    for f in file_names:
        print(f'Extracting text from file: {f}')
        try:
            blob_format = os.path.splitext(f)[-1].lstrip('.')
            # Context managers release the image resources as soon as the
            # blob exists instead of waiting for garbage collection.
            with wand_img(filename=os.path.join(img_file_path, f),
                          resolution=300) as source, \
                    wand_img(image=source) as working_copy:
                blob = working_copy.make_blob(blob_format)
            with Image.open(io.BytesIO(blob)) as im:
                text = pytesseract.image_to_string(im, lang='eng')
            self.extracted_pdfs.append(text)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed by the per-file guard.
            print(f'An error occurred while ocring image: {f}')
def ocr_image_file(self, img_file_path: str):
    """OCR every file in a directory and record the cleaned text.

    For each entry of ``img_file_path`` the directory's own name (without
    extension) is appended to ``self.pdf_img_content``.  The file is then
    read either directly with ``Image.open(...).convert("RGBA")`` when
    ``self.img_format`` equals ``'tiff'``, or through ``wand_img`` with
    ``resolution=300`` and an in-memory blob in the file's own format
    otherwise, and passed to ``pytesseract.image_to_string``.

    The recognised text has ASCII punctuation removed, is lower-cased, has
    matches of the module-level ``SPACES`` pattern replaced by one space
    and matches of ``NEWLINE`` and ``TABS`` removed, and has every
    non-ASCII character replaced by a space.  The result is appended to
    both ``self.extracted_pdfs`` and ``self.pdf_img_content``.

    Args:
        img_file_path: Directory containing the image files.

    Returns:
        None.  A message is printed when the path is not a directory, when
        the directory cannot be listed, or when a single file fails; a
        failing file does not stop the remaining files from being
        processed.
    """
    if not os.path.isdir(img_file_path):
        print(f'OSError: Path, {img_file_path} not found.')
        return

    try:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # order of the collected text reproducible between runs.
        file_names = sorted(os.listdir(img_file_path))
    except OSError as e:
        print(e)
        return

    # Loop invariants: the directory label and the punctuation table.
    dir_label = os.path.splitext(os.path.split(img_file_path)[1])[0]
    strip_punctuation = str.maketrans('', '', string.punctuation)

    for f in file_names:
        self.pdf_img_content.append(dir_label)

        # Third character from the end of the file stem; guarded so a
        # short file name cannot raise IndexError and abort the whole run.
        # NOTE(review): ``stem[-3:]`` (last three characters) may have
        # been the intent -- confirm against the naming scheme.
        stem = os.path.splitext(f)[0]
        pdf_file_number = stem[-3] if len(stem) >= 3 else ''
        print(f"PDF FILE NUMBER: {pdf_file_number}")

        try:
            img_path = os.path.join(img_file_path, f)
            # ``==`` rather than ``is``: identity comparison against a
            # string literal is implementation-dependent.
            if self.img_format == 'tiff':
                with Image.open(img_path) as raw:
                    rgba = raw.convert("RGBA")
                text = pytesseract.image_to_string(rgba, lang='eng')
            else:
                blob_format = os.path.splitext(f)[-1].lstrip('.')
                with wand_img(filename=img_path, resolution=300) as source, \
                        wand_img(image=source) as working_copy:
                    blob = working_copy.make_blob(blob_format)
                with Image.open(io.BytesIO(blob)) as im:
                    text = pytesseract.image_to_string(im, lang='eng')

            # extract and clean the text -- one shared path for both
            # formats.  The non-tiff branch used to put a space between
            # every character and then delete SPACES matches, which did
            # not preserve word boundaries.
            text = text.translate(strip_punctuation).lower()
            text = SPACES.sub(" ", text)
            text = NEWLINE.sub("", text)
            text = TABS.sub("", text)
            # attempt to remove special characters
            text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

            # update pdf dictionaries
            self.extracted_pdfs.append(text)
            self.pdf_img_content.append(text)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed by the per-file guard.
            print(f'An error occurred while ocring image: {f}')