def test_should_run_false(self):
    """should_run() is False when both OCR output existence flags are True."""
    job = mock_emop_job(default_settings())
    # Flag both the text and the XML output as already present.
    job.page_result.ocr_text_path_exists = job.page_result.ocr_xml_path_exists = True

    self.assertFalse(Tesseract(job).should_run())
def test_should_run_true_all_values_missing(self):
    """should_run() is True when both OCR output existence flags are False."""
    job = mock_emop_job(default_settings())
    # Flag both the text and the XML output as absent.
    job.page_result.ocr_text_path_exists = job.page_result.ocr_xml_path_exists = False

    self.assertTrue(Tesseract(job).should_run())
def do_ocr(self, job):
    """Dispatch a job to its configured OCR engine and report the outcome.

    The engine is chosen from ``job.batch_job.ocr_engine``: ``"tesseract"``
    uses ``Tesseract`` and ``"ocular"`` uses ``OcularTranscribe``. Any other
    value is recorded as a failed result through ``append_result``.

    When ``settings.controller_skip_existing`` is truthy and the engine's
    ``should_run()`` is falsy, the engine is not run and the call counts as
    a success.

    Args:
        job (EmopJob): EmopJob object

    Returns:
        bool: True if the OCR succeeded or was skipped, False if the engine
            is unsupported or exited with a non-zero exit code.
    """
    start_msg = "Got job [%s] - Batch: %s JobType: %s OCR Engine: %s" % (
        job.id,
        job.batch_job.name,
        job.batch_job.job_type,
        job.batch_job.ocr_engine,
    )
    logger.info(start_msg)

    engine_name = job.batch_job.ocr_engine

    # Guard: reject engines that have no implementation.
    if engine_name not in ("tesseract", "ocular"):
        unsupported_msg = "OCR with %s not yet supported" % engine_name
        self.append_result(job=job, results=unsupported_msg, failed=True)
        return False

    # Only the selected engine class is looked up and instantiated.
    engine = (
        Tesseract(job=job)
        if engine_name == "tesseract"
        else OcularTranscribe(job=job)
    )

    # should_run() is consulted only when skipping is enabled.
    if self.settings.controller_skip_existing:
        if not engine.should_run():
            logger.info("Skipping OCR job [%s]" % job.id)
            return True

    outcome = engine.run()
    if outcome.exitcode != 0:
        failure_msg = "%s OCR Failed: %s" % (engine_name, outcome.stderr)
        self.append_result(job=job, results=failure_msg, failed=True)
        return False

    return True