def test_run(self, mock_mkdirs_exists_ok, mock_os_rename, mock_path_isdir, mock_path_isfile):
    """Check that Tesseract.run() launches the expected command line.

    The patched isfile check reports True and the patched isdir check
    reports False; the test then verifies that both checks, the directory
    creation helper and Popen were all invoked, that Popen received the
    expected argument list, and that run() returned a zero-exitcode result.
    """
    cfg = default_settings()
    cfg.emop_home = "/foo"
    job = mock_emop_job(cfg)
    engine = Tesseract(job)

    mock_path_isfile.return_value = True
    mock_path_isdir.return_value = False

    expected_cmd = ["tesseract", job.image_path, engine.output_filename]
    expected_cmd += ["-l", job.font.name, engine.cfg]
    expected_results = mock_results_tuple()(None, None, 0)

    # Make the first element of the mocked communicate() result an empty string.
    self.mock_rv.communicate.return_value[0] = ""

    retval = engine.run()
    popen_args, _popen_kwargs = self.mock_popen.call_args

    for patched in (
        mock_path_isfile,
        mock_path_isdir,
        mock_mkdirs_exists_ok,
        self.mock_popen,
    ):
        self.assertTrue(patched.called)
    self.assertEqual(expected_cmd, popen_args[0])
    # The os.rename assertion is currently disabled:
    # self.assertTrue(mock_os_rename.called)
    self.assertTupleEqual(expected_results, retval)
def test_should_run_true_all_values_missing(self):
    """should_run() is true when both OCR output paths are flagged as missing."""
    job = mock_emop_job(default_settings())
    page_result = job.page_result
    # Text output flag is set first, then the XML one, as separate assignments.
    page_result.ocr_text_path_exists = False
    page_result.ocr_xml_path_exists = False

    engine = Tesseract(job)

    self.assertTrue(engine.should_run())
def test_should_run_false(self):
    """should_run() is false when both OCR output paths are flagged as existing."""
    job = mock_emop_job(default_settings())
    page_result = job.page_result
    # Text output flag is set first, then the XML one, as separate assignments.
    page_result.ocr_text_path_exists = True
    page_result.ocr_xml_path_exists = True

    engine = Tesseract(job)

    self.assertFalse(engine.should_run())
def do_ocr(self, job):
    """Dispatch a job to its OCR engine and report whether it succeeded.

    The engine named by ``job.batch_job.ocr_engine`` selects the OCR class:
    ``"tesseract"`` uses Tesseract and ``"ocular"`` uses OcularTranscribe.
    An unsupported engine, or a non-zero exit code from the engine run, is
    recorded as a failed result through ``self.append_result``.

    Args:
        job (EmopJob): EmopJob object

    Returns:
        bool: True if the OCR run exited with code 0 or was skipped,
            False if the engine is unsupported or the run exited non-zero.
    """
    job_summary = "Got job [%s] - Batch: %s JobType: %s OCR Engine: %s" % (
        job.id,
        job.batch_job.name,
        job.batch_job.job_type,
        job.batch_job.ocr_engine,
    )
    logger.info(job_summary)

    ocr_engine = job.batch_job.ocr_engine

    # Reject unknown engines up front and record the failure.
    if ocr_engine not in ("tesseract", "ocular"):
        unsupported_msg = "OCR with %s not yet supported" % ocr_engine
        self.append_result(job=job, results=unsupported_msg, failed=True)
        return False

    engine_cls = Tesseract if ocr_engine == "tesseract" else OcularTranscribe
    ocr = engine_cls(job=job)

    # should_run() is only consulted when skipping existing output is enabled.
    skip_existing = self.settings.controller_skip_existing
    if skip_existing and not ocr.should_run():
        logger.info("Skipping OCR job [%s]" % job.id)
        return True

    ocr_result = ocr.run()
    if ocr_result.exitcode != 0:
        failure_msg = "%s OCR Failed: %s" % (ocr_engine, ocr_result.stderr)
        self.append_result(job=job, results=failure_msg, failed=True)
        return False

    return True