示例#1
0
def ocr_image(file_name: str):
    """Run OCR on an image file and print the recognized text.

    The file is loaded through ``wand_img`` at a resolution of 300,
    re-encoded as JPEG in memory, opened with ``Image.open`` and handed to
    ``pytesseract.image_to_string`` with the English language data.

    Args:
        file_name: Path of the image file to read.

    Returns:
        None. The recognized text is only printed.
    """
    # NOTE(review): assumes ``wand_img`` is Wand's ``Image`` class, which
    # supports the context-manager protocol -- confirm against the imports.
    # ``make_blob`` is called on the loaded image directly; the previous
    # ``wand_img(image=...)`` copy only duplicated it and was never released.
    with wand_img(filename=file_name, resolution=300) as source:
        jpeg_bytes = source.make_blob("jpg")

    with Image.open(io.BytesIO(jpeg_bytes)) as im:
        text = pytesseract.image_to_string(im, lang='eng')
    print(text)
    def ocr_image_file(self, current_img_file: str):
        """OCR one extracted PDF image file and return its cleaned text.

        The file is looked up inside ``self.write_path``, rendered through
        ``wand_img`` at a resolution of 300, re-encoded as JPEG in memory and
        passed to ``pytesseract.image_to_string``. The recognized text has
        all ``string.punctuation`` characters removed, is lower-cased, and
        every ``SPACES`` match is replaced by a single space.

        Args:
            current_img_file: File name, relative to ``self.write_path``.

        Returns:
            The cleaned text, or ``None`` when ``self.write_path`` is not a
            directory or when any step fails (the error is printed).
        """
        try:
            if not os.path.isdir(self.write_path):
                return None

            # NOTE(review): assumes ``wand_img`` is Wand's ``Image`` class
            # (context-manager capable) -- confirm against the imports.
            # The image is released as soon as the JPEG bytes exist; the
            # earlier intermediate ``wand_img(image=...)`` copy was redundant.
            with wand_img(filename=os.path.join(self.write_path,
                                                current_img_file),
                          resolution=300) as source:
                jpeg_bytes = source.make_blob(format='jpeg')
            print("In ocr_image_file")

            with Image.open(io.BytesIO(jpeg_bytes)) as im:
                text = pytesseract.image_to_string(im, lang='eng')

            # Extract and clean the text. ``''.join(text)`` on a string was
            # a no-op, so the translation is applied to the string directly.
            text = text \
                .translate(str.maketrans('', '', string.punctuation)) \
                .lower()
            return SPACES.sub(" ", text)

        except Exception as e:
            # Best-effort: report the failure and signal it with ``None``.
            print(e)
            print(
                f"An error has occurred while trying to ocr img file: {current_img_file}"
            )
            return None
示例#3
0
def make_gif_app_caller(frame, effect):
    """Run *frame* through the external ``gif`` program using *effect*.

    ``frame`` is first transformed with the geometry ``'400x400>'``. A PNG
    encoding of a ``wand_img`` copy of it is written to the child process'
    standard input, and whatever the process writes to standard output is
    loaded back as a new ``wand_img``.

    Args:
        frame: Image object accepted by ``wand_img``; it is transformed
            by this call.
        effect: Passed as the single command-line argument to ``gif``.

    Returns:
        A ``wand_img`` built from the bytes the ``gif`` process produced.
    """
    frame.transform(resize='400x400>')

    command = ["gif", effect]
    gif_process = subprocess.Popen(command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)

    # Feed the PNG bytes in and wait for the process to finish; stderr is
    # not piped, so the second element of the pair is unused.
    png_bytes = wand_img(frame).make_blob("png")
    produced, _ = gif_process.communicate(png_bytes)

    return wand_img(blob=produced)
示例#4
0
def make_imgtext(frame, text):
    """Run *frame* through the external ``gif text`` command with *text*.

    ``frame`` is first transformed with the geometry ``'400x400>'``. A PNG
    encoding of a ``wand_img`` copy of it is written to the child process'
    standard input, and the bytes the process writes to standard output are
    loaded back as a new ``wand_img``.

    Args:
        frame: Image object accepted by ``wand_img``; it is transformed
            by this call.
        text: Passed to ``gif`` as the argument following ``text``.

    Returns:
        A ``wand_img`` built from the bytes the ``gif`` process produced.
    """
    frame.transform(resize='400x400>')

    command = ["gif", "text", text]
    gif_process = subprocess.Popen(command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)

    # Feed the PNG bytes in and wait for the process to finish; stderr is
    # not piped, so the second element of the pair is unused.
    png_bytes = wand_img(frame).make_blob("png")
    produced, _ = gif_process.communicate(png_bytes)

    return wand_img(blob=produced)
示例#5
0
 def ocr_image_file(self, img_file_path: str):
     """OCR every file in a directory and collect the recognized text.

     Each entry of ``img_file_path`` is rendered through ``wand_img`` at a
     resolution of 300, re-encoded in the format named by its own file
     extension, and passed to ``pytesseract.image_to_string``. The text of
     every file that succeeds is appended to ``self.extracted_pdfs``.

     Args:
         img_file_path: Directory containing the image files.

     Returns:
         None. A missing directory or a failing file is reported with
         ``print`` and does not raise.
     """
     if not os.path.isdir(img_file_path):
         # Same message as before, without raising an OSError only to
         # catch and print it a few lines later.
         print(f'OSError: Path, {img_file_path} not found.')
         return

     try:
         for f in os.listdir(img_file_path):
             print(f'Extracting text from file: {f}')
             try:
                 # NOTE(review): assumes ``wand_img`` is Wand's ``Image``
                 # class (context-manager capable) -- confirm against the
                 # imports. The image is released once the bytes exist.
                 with wand_img(filename=os.path.join(img_file_path, f),
                               resolution=300) as source:
                     blob = source.make_blob(
                         os.path.splitext(f)[-1].lstrip('.'))
                 with Image.open(io.BytesIO(blob)) as im:
                     text = pytesseract.image_to_string(im, lang='eng')
                 self.extracted_pdfs.append(text)
             except Exception:
                 # Best-effort per file, but no longer swallows
                 # KeyboardInterrupt / SystemExit like a bare ``except:``.
                 print(f'An error occurred while ocring image: {f}')
     except OSError as e:
         print(e)
示例#6
0
    def ocr_image_file(self, img_file_path: str):
        """OCR every file in a directory and store the cleaned text.

        For each entry of ``img_file_path`` the directory's own base name
        (without extension) is appended to ``self.pdf_img_content``. The
        file is then OCR'd with ``pytesseract.image_to_string``: directly
        from an RGBA-converted ``Image.open`` result when
        ``self.img_format`` equals ``'tiff'``, otherwise after rendering it
        through ``wand_img`` at a resolution of 300 and re-encoding it in
        the format named by its file extension. The text is stripped of
        ``string.punctuation``, lower-cased, run through the ``SPACES``,
        ``NEWLINE`` and ``TABS`` substitutions, has non-ASCII characters
        replaced by spaces, and is appended to both ``self.extracted_pdfs``
        and ``self.pdf_img_content``.

        Args:
            img_file_path: Directory containing the image files.

        Returns:
            None. A missing directory or a failing file is reported with
            ``print`` and does not raise.
        """
        if not os.path.isdir(img_file_path):
            # Same message as before, without raising an OSError only to
            # catch and print it a few lines later.
            print(f'OSError: Path, {img_file_path} not found.')
            return

        # Loop-invariant values, computed once.
        dir_stem = os.path.splitext(os.path.split(img_file_path)[1])[0]

        try:
            for f in os.listdir(img_file_path):
                self.pdf_img_content.append(dir_stem)
                # Third character from the end of the file stem. The slice
                # yields '' for stems shorter than three characters, where
                # plain ``[-3]`` indexing raised an uncaught IndexError.
                pdf_file_number = os.path.splitext(f)[0][-3:-2]
                print(f"PDF FILE NUMBER: {pdf_file_number}")
                try:
                    # ``==`` instead of ``is``: identity comparison with a
                    # string literal depends on interning and is unreliable.
                    if self.img_format == 'tiff':
                        with Image.open(
                                os.path.join(img_file_path, f)) as source:
                            text = pytesseract.image_to_string(
                                source.convert("RGBA"), lang='eng')
                        joiner, space_fill = '', ' '
                    else:
                        # NOTE(review): assumes ``wand_img`` is Wand's
                        # ``Image`` class (context-manager capable) --
                        # confirm against the imports.
                        with wand_img(filename=os.path.join(
                                img_file_path, f),
                                      resolution=300) as source:
                            blob = source.make_blob(
                                os.path.splitext(f)[-1].lstrip('.'))
                        with Image.open(io.BytesIO(blob)) as im:
                            text = pytesseract.image_to_string(im,
                                                               lang='eng')
                        # NOTE(review): this branch has always joined the
                        # characters with ' ' and replaced SPACES matches
                        # with '' (the TIFF branch does the opposite).
                        # Preserved as-is; it looks unintended -- confirm.
                        joiner, space_fill = ' ', ''

                    # Extract and clean the text.
                    text = joiner.join(text) \
                        .translate(str.maketrans('', '', string.punctuation)) \
                        .lower()
                    text = SPACES.sub(space_fill, text)
                    text = NEWLINE.sub("", text)
                    text = TABS.sub("", text)

                    # Replace every non-ASCII character with a space.
                    text = ''.join(
                        ch if ord(ch) < 128 else ' ' for ch in text)

                    # Record the text in both collections.
                    self.extracted_pdfs.append(text)
                    self.pdf_img_content.append(text)
                except Exception:
                    # Best-effort per file, but no longer swallows
                    # KeyboardInterrupt / SystemExit like a bare ``except:``.
                    print(f'An error occurred while ocring image: {f}')
        except OSError as e:
            print(e)