예제 #1
0
def test_version():
    """Check that the engine reports a dotted, numeric version string.

    The pattern is applied with ``match()``, so it is anchored at the start
    of the string only; any text after the third number group is accepted.
    """
    pattern = re.compile("[0-9][.][0-9]{1,2}[.][0-9]{1,2}")
    engine = ocr.Engine("")

    reported = engine.get_version()
    # Echo the value so it shows up in the captured test output.
    print(reported)

    assert pattern.match(reported)
예제 #2
0
 def set_tess_path(cls, path: str = None) -> None:
     """Build an OCR engine for a tessdata path and store it on the class.

     Args:
         path: Location handed to ``ocr.Engine``. When ``None``, the value
             returned by ``locate_tessdata()`` is used instead.

     Raises:
         AssertionError: If no path could be resolved, the path does not
             exist on disk, or the stored engine ends up being ``None``
             (only while assertions are enabled).
     """
     tessdata = locate_tessdata() if path is None else path
     assert tessdata is not None
     assert os.path.exists(tessdata)

     engine = ocr.Engine(tessdata)
     cls.engine = engine
     assert cls.engine is not None
예제 #3
0
class GenerateOCRFileTask(speedwagon.tasks.Subtask):
    """Subtask that reads the text of one image and writes it to a file.

    The OCR engine is stored on the class, so it is shared by all instances
    and replaced every time ``set_tess_path()`` runs (which includes every
    instantiation).
    """

    # Engine created when the class body executes; set_tess_path() swaps it.
    engine = ocr.Engine(locate_tessdata())

    def __init__(self,
                 source_image,
                 out_text_file,
                 lang="eng",
                 tesseract_path=None) -> None:
        """Remember the job parameters and (re)configure the shared engine.

        Args:
            source_image: Image handed to the OCR reader in ``work()``.
            out_text_file: Path of the text file written by ``work()``.
            lang: Language code passed to ``engine.get_reader()``.
            tesseract_path: Tessdata location. A falsy value falls back to
                ``locate_tessdata()``.
        """
        super().__init__()

        self._source = source_image
        self._output_text_file = out_text_file
        self._lang = lang

        GenerateOCRFileTask.set_tess_path(tesseract_path or locate_tessdata())
        assert self.engine is not None

    @classmethod
    def set_tess_path(cls, path=None):
        """Build an OCR engine for a tessdata path and store it on the class.

        Args:
            path: Location handed to ``ocr.Engine``. When ``None``, the value
                returned by ``locate_tessdata()`` is used instead.

        Raises:
            AssertionError: If no path could be resolved, the path does not
                exist on disk, or the stored engine ends up being ``None``
                (only while assertions are enabled).
        """
        if path is None:
            # Look the location up at call time. A call expression in the
            # signature would run once, when the class is defined, and the
            # result would be frozen for every later call.
            path = locate_tessdata()
        assert path is not None
        assert os.path.exists(path)
        cls.engine = ocr.Engine(path)
        assert cls.engine is not None

    def work(self) -> bool:
        """Read the source image, write its text to disk, record the result.

        Anything written to ``sys.stderr`` by Python code while reading is
        captured and forwarded to ``self.log`` instead.

        Returns:
            Always ``True``; failures surface as exceptions.
        """
        # Get the ocr text reader for the proper language
        reader = self.engine.get_reader(self._lang)
        self.log("Reading {}".format(os.path.normcase(self._source)))

        f = io.StringIO()

        with contextlib.redirect_stderr(f):
            # Capture the warning messages
            resulting_text = reader.read(self._source)

        stderr_messages = f.getvalue()
        if stderr_messages:
            # Log any error messages
            self.log(stderr_messages.strip())

        # Generate a text file from the text data extracted from the image
        self.log("Writing to {}".format(self._output_text_file))
        with open(self._output_text_file, "w", encoding="utf8") as wf:
            wf.write(resulting_text)

        result = {"text": resulting_text, "source": self._source}
        self.set_results(result)
        return True
def test_no_osd_file(tmpdir_factory):
    """Creating a reader must fail for a tessdata folder with English only.

    Only ``eng.traineddata`` is downloaded into a fresh directory (the test
    name indicates the missing piece is the OSD data), and constructing an
    ``ocr.Reader`` on that directory is expected to raise
    ``FileNotFoundError``.
    """
    # NOTE(review): the reported version is not used below (the URL pins
    # "4.0.0"); the call is kept so the engine is exercised as before.
    ocr.Engine("").get_version()
    english_data_url = "{}/{}/{}".format(TESSDATA_SOURCE_URL_BASE, "4.0.0",
                                         "eng.traineddata")

    tessdata_path = tmpdir_factory.mktemp("no_osd_tessdata", numbered=False)
    try:
        os.makedirs(tessdata_path, exist_ok=True)
        download_data(english_data_url, destination=tessdata_path)
        with pytest.raises(FileNotFoundError):
            ocr.Reader(language_code="eng",
                       tesseract_data_path=tessdata_path)
    finally:
        # Remove the downloaded data even if the download or the
        # expectation above fails.
        shutil.rmtree(tessdata_path)
예제 #5
0
 def set_tess_path(cls, path=None):
     """Build an OCR engine for a tessdata path and store it on the class.

     Args:
         path: Location handed to ``ocr.Engine``. When ``None``, the value
             returned by ``locate_tessdata()`` is used instead.

     Raises:
         AssertionError: If no path could be resolved, the path does not
             exist on disk, or the stored engine ends up being ``None``
             (only while assertions are enabled).
     """
     if path is None:
         # Look the location up at call time. A call expression in the
         # signature would run once, when the function is defined, and the
         # result would be frozen for every later call.
         path = locate_tessdata()
     assert path is not None
     assert os.path.exists(path)
     cls.engine = ocr.Engine(path)
     assert cls.engine is not None
예제 #6
0
class GenerateOCRFileTask(speedwagon.tasks.Subtask):
    """Subtask that extracts the text of one image into a text file.

    The OCR engine lives on the class, so it is shared between instances and
    rebuilt whenever ``set_tess_path()`` is called (each instantiation does).
    """

    engine = ocr.Engine(locate_tessdata())

    def __init__(self,
                 source_image: str,
                 out_text_file: str,
                 lang: str = "eng",
                 tesseract_path: str = None) -> None:
        """Store the job parameters and (re)configure the shared engine.

        Args:
            source_image: Image read by ``work()``.
            out_text_file: Path of the text file that ``work()`` writes.
            lang: Language code used to obtain the reader.
            tesseract_path: Tessdata location; a falsy value falls back to
                ``locate_tessdata()`` when configuring the engine.
        """
        super().__init__()

        self._tesseract_path = tesseract_path
        self._lang = lang
        self._output_text_file = out_text_file
        self._source = source_image

        tessdata = tesseract_path or locate_tessdata()
        GenerateOCRFileTask.set_tess_path(tessdata)
        assert self.engine is not None

    @classmethod
    def set_tess_path(cls, path: str = None) -> None:
        """Build an OCR engine for a tessdata path and store it on the class.

        Args:
            path: Location handed to ``ocr.Engine``; ``None`` means use the
                value returned by ``locate_tessdata()``.
        """
        tessdata = locate_tessdata() if path is None else path
        assert tessdata is not None
        assert os.path.exists(tessdata)

        cls.engine = ocr.Engine(tessdata)
        assert cls.engine is not None

    def work(self) -> bool:
        """Read the source image, save its text, and publish the result.

        Returns:
            Always ``True``; failures surface as exceptions.
        """
        text = self.read_image(self._source, self._lang)

        # Persist the extracted text next to reporting it as the result.
        destination = self._output_text_file
        self.log("Writing to {}".format(destination))
        with open(destination, "w", encoding="utf8") as sink:
            sink.write(text)

        self.set_results({"text": text, "source": self._source})
        return True

    def read_image(self, file: str, lang: str) -> str:
        """Return the text the OCR reader extracts from ``file``.

        Anything written to ``sys.stderr`` by Python code during the read is
        captured and forwarded to ``self.log``.

        Raises:
            SpeedwagonException: If the reader raises
                ``TesseractGlueException`` for this file.
        """
        engine = self.engine
        if engine.data_set_path is None:
            engine.data_set_path = self._tesseract_path

        # The reader is specific to the requested language.
        reader = engine.get_reader(lang)
        self.log("Reading {}".format(os.path.normcase(file)))

        captured = io.StringIO()
        with contextlib.redirect_stderr(captured):
            # Warnings emitted while reading end up in ``captured``.
            try:
                text = reader.read(file)
            except ocr.tesseractwrap.TesseractGlueException as error:
                raise SpeedwagonException(f"Unable to read {file}") from error

        captured_output = captured.getvalue()
        if captured_output:
            # Pass the collected messages on to the task log.
            self.log(captured_output.strip())
        return text
예제 #7
0
def ocr_engine(tess_path):
    """Return a new ``ocr.Engine`` created for ``tess_path``."""
    return ocr.Engine(tess_path)