def parse_data(self, data):
    """Populate this job's model objects from a job payload.

    Creates one model object per entity (batch job, font, language model,
    glyph substitution model, page, work and the three result objects),
    fills them from ``data`` and then links the result objects to the
    page, work and batch job IDs.

    Args:
        data (dict): Job payload. Must contain the keys ``"id"``,
            ``"batch_job"``, ``"page"`` and ``"work"``. The keys
            ``"font"``, ``"language_model"`` and
            ``"glyph_substitution_model"`` are read from the top level of
            ``data`` when present, otherwise from ``data["batch_job"]``,
            otherwise an empty dict is used. ``"page_result"`` and
            ``"postproc_result"`` are optional.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    self.id = data["id"]

    self.batch_job = EmopBatchJob(self.settings)
    self.font = EmopFont(self.settings)
    self.language_model = EmopLanguageModel(self.settings)
    self.glyph_substitution_model = EmopGlyphSubstitutionModel(self.settings)
    self.page = EmopPage(self.settings)
    self.work = EmopWork(self.settings)
    self.page_result = EmopPageResult(self.settings)
    self.postproc_result = EmopPostprocResult(self.settings)
    self.font_training_result = EmopFontTrainingResult(self.settings)

    batch_job_data = data["batch_job"]
    self.batch_job.setattrs(batch_job_data)

    # Each of these models may be supplied at the top level of the payload
    # or nested inside the batch job; the top level wins. Every model always
    # receives exactly one setattrs() call on its *own* instance, falling
    # back to an empty dict when the payload does not mention it.
    optional_models = (
        ("font", self.font),
        ("language_model", self.language_model),
        ("glyph_substitution_model", self.glyph_substitution_model),
    )
    for key, model in optional_models:
        if key in data:
            attrs = data[key]
        elif key in batch_job_data:
            attrs = batch_job_data[key]
        else:
            attrs = {}
        model.setattrs(attrs)

    self.page.setattrs(data["page"])
    self.work.setattrs(data["work"])

    # data.get() yields None when the payload has no stored result.
    self.page_result.set_existing_attrs(data.get("page_result"))
    self.postproc_result.set_existing_attrs(data.get("postproc_result"))
    # NOTE(review): no stored attributes are loaded for font_training_result
    # here; confirm whether the payload can carry them.

    # Link the result objects back to the entities they belong to.
    self.page_result.page_id = self.page.id
    self.page_result.batch_id = self.batch_job.id
    self.postproc_result.page_id = self.page.id
    self.postproc_result.batch_job_id = self.batch_job.id
    self.font_training_result.work_id = self.work.id
    self.font_training_result.batch_job_id = self.batch_job.id
def setUp(self):
    """Create the EmopPageResult fixture from the default settings."""
    settings = default_settings()
    self.page_result = EmopPageResult(settings)
class TestEmopPageResult(TestCase):
    """Unit tests for EmopPageResult."""

    def setUp(self):
        """Create a fresh EmopPageResult from the default settings."""
        self.page_result = EmopPageResult(default_settings())

    def test_init(self):
        """A new instance has every attribute unset and no *_exists flag."""
        self.assertIsNone(self.page_result.page_id)
        self.assertIsNone(self.page_result.batch_id)
        self.assertIsNone(self.page_result.ocr_text_path)
        self.assertIsNone(self.page_result.ocr_xml_path)
        self.assertIsNone(self.page_result.corr_ocr_text_path)
        self.assertIsNone(self.page_result.corr_ocr_xml_path)
        self.assertIsNone(self.page_result.juxta_change_index)
        self.assertIsNone(self.page_result.alt_change_index)
        self.assertFalse(self.page_result.page_id_exists)
        self.assertFalse(self.page_result.batch_id_exists)
        self.assertFalse(self.page_result.ocr_text_path_exists)
        self.assertFalse(self.page_result.ocr_xml_path_exists)
        self.assertFalse(self.page_result.corr_ocr_text_path_exists)
        self.assertFalse(self.page_result.corr_ocr_xml_path_exists)
        self.assertFalse(self.page_result.juxta_change_index_exists)
        self.assertFalse(self.page_result.alt_change_index_exists)

    def test_set_existing_attrs_none(self):
        """Passing None to set_existing_attrs leaves every flag False."""
        self.page_result.set_existing_attrs(None)
        self.assertFalse(self.page_result.page_id_exists)
        self.assertFalse(self.page_result.batch_id_exists)
        self.assertFalse(self.page_result.ocr_text_path_exists)
        self.assertFalse(self.page_result.ocr_xml_path_exists)
        self.assertFalse(self.page_result.corr_ocr_text_path_exists)
        self.assertFalse(self.page_result.corr_ocr_xml_path_exists)
        self.assertFalse(self.page_result.juxta_change_index_exists)
        self.assertFalse(self.page_result.alt_change_index_exists)

    def test_set_existing_attrs_juxta_change_index(self):
        """A supplied juxta_change_index flips its *_exists flag."""
        dictionary = {
            "juxta_change_index": 0.001,
        }
        self.page_result.set_existing_attrs(dictionary)
        self.assertTrue(self.page_result.juxta_change_index_exists)

    def test_to_dict(self):
        """to_dict() returns exactly the attributes that were set."""
        self.page_result.page_id = 1
        self.page_result.batch_id = 2
        self.page_result.juxta_change_index = 0.01
        expected_dict = {
            "page_id": 1,
            "batch_id": 2,
            "juxta_change_index": 0.01,
        }
        actual_dict = self.page_result.to_dict()
        # assertEqual compares keys *and* values; an item-based comparison of
        # two dicts would iterate them and therefore only compare the keys.
        self.assertEqual(expected_dict, actual_dict)

    def test_has_data_true(self):
        """has_data() is True once a result value is set besides the IDs."""
        self.page_result.page_id = 1
        self.page_result.batch_id = 2
        self.page_result.juxta_change_index = 0.01
        self.assertTrue(self.page_result.has_data())

    def test_has_data_false(self):
        """has_data() is False when only the IDs are set."""
        self.page_result.page_id = 1
        self.page_result.batch_id = 2
        self.assertFalse(self.page_result.has_data())

    def test_ocr_text_path(self):
        """ocr_text_path reads back without the output path prefix."""
        self.page_result.settings.output_path_prefix = "/foo"
        self.page_result.ocr_text_path = "/foo/path"
        self.assertEqual("/path", self.page_result.ocr_text_path)

    def test_ocr_xml_path(self):
        """ocr_xml_path reads back without the output path prefix."""
        self.page_result.settings.output_path_prefix = "/foo"
        self.page_result.ocr_xml_path = "/foo/path"
        self.assertEqual("/path", self.page_result.ocr_xml_path)

    def test_corr_ocr_text_path(self):
        """corr_ocr_text_path reads back without the output path prefix."""
        self.page_result.settings.output_path_prefix = "/foo"
        self.page_result.corr_ocr_text_path = "/foo/path"
        self.assertEqual("/path", self.page_result.corr_ocr_text_path)

    def test_corr_ocr_xml_path(self):
        """corr_ocr_xml_path reads back without the output path prefix."""
        self.page_result.settings.output_path_prefix = "/foo"
        self.page_result.corr_ocr_xml_path = "/foo/path"
        self.assertEqual("/path", self.page_result.corr_ocr_xml_path)
class EmopJob(object):
    """A single job: parsed payload models plus the file paths derived from them."""

    def __init__(self, job_data, settings, scheduler):
        """Parse ``job_data`` and derive the job's output locations.

        Args:
            job_data (dict): Job payload, passed to :meth:`parse_data`.
            settings: Settings object; ``output_path_prefix`` and
                ``ocr_root`` are read from it here.
            scheduler: Scheduler object, stored as-is.
        """
        self.settings = settings
        self.scheduler = scheduler
        self.extra_transfers = []
        self.parse_data(data=job_data)
        self.output_root_dir = EmopBase.add_prefix(self.settings.output_path_prefix, self.settings.ocr_root)
        self.temp_dir = get_temp_dir()
        self.image_path = self.page.image_path

        # The values below rely on values set above
        self.output_dir = self.get_output_dir(batch_id=self.batch_job.id, work_id=self.work.id)
        self.txt_file = self.output_file("txt")
        self.xml_file = self.output_file("xml")
        self.hocr_file = self.output_file("hocr")
        self.idhmc_txt_file = self.add_filename_suffix(self.txt_file, "IDHMC")
        self.idhmc_xml_file = self.add_filename_suffix(self.xml_file, "IDHMC")
        self.alto_txt_file = self.add_filename_suffix(self.txt_file, "ALTO")
        self.alto_xml_file = self.add_filename_suffix(self.xml_file, "ALTO")

        # Ocular specific items
        if self.batch_job.ocr_engine == "ocular":
            self.input_font_path = self.font.path
            self.input_lm_path = self.language_model.path
            self.input_gsm_path = self.glyph_substitution_model.path
            _base_output_name = "work-%s-batch-%s" % (self.work.id, self.batch_job.id)
            self.output_font_path = os.path.join(self.output_dir, "%s.fontser" % _base_output_name)
            self.output_lm_path = os.path.join(self.output_dir, "%s.lmser" % _base_output_name)
            self.output_gsm_path = os.path.join(self.output_dir, "%s.gsmser" % _base_output_name)
            self.input_doc_list_path = os.path.join(
                self.temp_dir,
                "batch-%s-work-%s-pages-images.txt" % (str(self.batch_job.id), str(self.work.id))
            )

        # Extra command parameters that are passed to OCR application.
        # Only a non-empty string is split into an argument list; anything
        # else results in None. (`basestring` is the Python 2 string base type.)
        _extra_command_parameters = self.batch_job.parameters
        if _extra_command_parameters and isinstance(_extra_command_parameters, basestring):
            self.extra_command_parameters = shlex.split(_extra_command_parameters)
        else:
            self.extra_command_parameters = None

    def parse_data(self, data):
        """Populate this job's model objects from a job payload.

        Creates one model object per entity (batch job, font, language
        model, glyph substitution model, page, work and the three result
        objects), fills them from ``data`` and then links the result
        objects to the page, work and batch job IDs.

        Args:
            data (dict): Job payload. Must contain the keys ``"id"``,
                ``"batch_job"``, ``"page"`` and ``"work"``. The keys
                ``"font"``, ``"language_model"`` and
                ``"glyph_substitution_model"`` are read from the top level
                of ``data`` when present, otherwise from
                ``data["batch_job"]``, otherwise an empty dict is used.
                ``"page_result"`` and ``"postproc_result"`` are optional.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        self.id = data["id"]

        self.batch_job = EmopBatchJob(self.settings)
        self.font = EmopFont(self.settings)
        self.language_model = EmopLanguageModel(self.settings)
        self.glyph_substitution_model = EmopGlyphSubstitutionModel(self.settings)
        self.page = EmopPage(self.settings)
        self.work = EmopWork(self.settings)
        self.page_result = EmopPageResult(self.settings)
        self.postproc_result = EmopPostprocResult(self.settings)
        self.font_training_result = EmopFontTrainingResult(self.settings)

        batch_job_data = data["batch_job"]
        self.batch_job.setattrs(batch_job_data)

        # Each of these models may be supplied at the top level of the
        # payload or nested inside the batch job; the top level wins. Every
        # model always receives exactly one setattrs() call on its *own*
        # instance, falling back to an empty dict when the payload does not
        # mention it.
        optional_models = (
            ("font", self.font),
            ("language_model", self.language_model),
            ("glyph_substitution_model", self.glyph_substitution_model),
        )
        for key, model in optional_models:
            if key in data:
                attrs = data[key]
            elif key in batch_job_data:
                attrs = batch_job_data[key]
            else:
                attrs = {}
            model.setattrs(attrs)

        self.page.setattrs(data["page"])
        self.work.setattrs(data["work"])

        # data.get() yields None when the payload has no stored result.
        self.page_result.set_existing_attrs(data.get("page_result"))
        self.postproc_result.set_existing_attrs(data.get("postproc_result"))
        # NOTE(review): no stored attributes are loaded for
        # font_training_result here; confirm whether the payload can carry them.

        # Link the result objects back to the entities they belong to.
        self.page_result.page_id = self.page.id
        self.page_result.batch_id = self.batch_job.id
        self.postproc_result.page_id = self.page.id
        self.postproc_result.batch_job_id = self.batch_job.id
        self.font_training_result.work_id = self.work.id
        self.font_training_result.batch_job_id = self.batch_job.id

    def get_output_dir(self, batch_id, work_id):
        """ Provide the job output directory

        Format is the following:
            /<config.ini output_path_prefix><config.ini ocr_root>/<batch ID>/<work ID>

        Example:
            /dh/data/shared/text-xml/IDHMC-OCR/<batch.id>/<work.id>

        Args:
            batch_id: Batch ID; converted with str() before joining
            work_id: Work ID; converted with str() before joining

        Returns:
            str: Output directory path
        """
        path = os.path.join(self.output_root_dir, str(batch_id), str(work_id))
        return path

    def output_file(self, fmt):
        """ Provide the job output file name

        Format is the following:
            <output_dir>/<page.number>.<fmt>

        The format is lower-cased before it is used as the extension.

        Args:
            fmt (str): File format (extension) for file path

        Returns:
            str: Output file path
        """
        filename = "%s.%s" % (self.page.number, str(fmt).lower())
        path = os.path.join(self.output_dir, filename)
        return path

    def add_filename_suffix(self, file, suffix):
        """ Add filename suffix

        This function adds a suffix to a filename before the extension

        Example:
            add_filename_suffix('5.xml', 'IDHMC')
            5.xml -> 5_IDHMC.xml

        Args:
            file (str): File name to add suffix
            suffix (str): The suffix to add

        Returns:
            str: The filename with suffix added before extension
        """
        filename, ext = os.path.splitext(file)
        return "%s_%s%s" % (filename, suffix, ext)