Пример #1
0
 def parse_data(self, data):
     """ Populate the job's model objects from a job payload

     Creates one model object per entity (batch job, font, language model,
     glyph substitution model, page, work and the three result objects),
     fills them from ``data`` and then links the result objects to the
     page / work / batch job IDs.

     The font, language model and glyph substitution model are each looked
     up first at the top level of ``data``, then under ``data["batch_job"]``;
     when found in neither place the model receives an empty dict.

     Args:
         data (dict): Job payload. Must contain "id", "batch_job", "page"
             and "work". May contain "font", "language_model",
             "glyph_substitution_model", "page_result" and
             "postproc_result".

     Raises:
         KeyError: If one of the required keys is missing from ``data``.
     """
     self.id = data["id"]
     self.batch_job = EmopBatchJob(self.settings)
     self.font = EmopFont(self.settings)
     self.language_model = EmopLanguageModel(self.settings)
     self.glyph_substitution_model = EmopGlyphSubstitutionModel(self.settings)
     self.page = EmopPage(self.settings)
     self.work = EmopWork(self.settings)
     self.page_result = EmopPageResult(self.settings)
     self.postproc_result = EmopPostprocResult(self.settings)
     self.font_training_result = EmopFontTrainingResult(self.settings)
     self.batch_job.setattrs(data["batch_job"])
     # Each of these models shares the same lookup order. Driving them from
     # one loop guarantees the fallback is applied to the model being
     # processed (the previous hand-copied version reset language_model in
     # the glyph_substitution_model fallback branch).
     for model_name in ("font", "language_model", "glyph_substitution_model"):
         if model_name in data:
             model_attrs = data[model_name]
         elif model_name in data["batch_job"]:
             model_attrs = data["batch_job"][model_name]
         else:
             model_attrs = {}
         getattr(self, model_name).setattrs(model_attrs)
     self.page.setattrs(data["page"])
     self.work.setattrs(data["work"])
     self.page_result.set_existing_attrs(data.get("page_result"))
     self.postproc_result.set_existing_attrs(data.get("postproc_result"))
     # NOTE(review): this second, identical call is kept to preserve existing
     # behavior. It looks redundant, or may have been meant for another
     # result object -- confirm and remove or correct.
     self.postproc_result.set_existing_attrs(data.get("postproc_result"))
     # Link the result objects to the entities they describe.
     self.page_result.page_id = self.page.id
     self.page_result.batch_id = self.batch_job.id
     self.postproc_result.page_id = self.page.id
     self.postproc_result.batch_job_id = self.batch_job.id
     self.font_training_result.work_id = self.work.id
     self.font_training_result.batch_job_id = self.batch_job.id
 def setUp(self):
     """Create a fresh EmopPageResult built from the default settings."""
     settings = default_settings()
     self.page_result = EmopPageResult(settings)
class TestEmopPageResult(TestCase):
    """Unit tests for EmopPageResult."""

    # Attributes checked by the initial-state tests; each one is paired with
    # a "<name>_exists" flag on the object under test.
    _RESULT_FIELDS = (
        "page_id",
        "batch_id",
        "ocr_text_path",
        "ocr_xml_path",
        "corr_ocr_text_path",
        "corr_ocr_xml_path",
        "juxta_change_index",
        "alt_change_index",
    )

    def setUp(self):
        """Build a fresh EmopPageResult from the default settings."""
        settings = default_settings()
        self.page_result = EmopPageResult(settings)

    def _assert_no_existing_flags(self):
        """Assert that every "<field>_exists" flag is falsy."""
        for field in self._RESULT_FIELDS:
            self.assertFalse(getattr(self.page_result, field + "_exists"))

    def _assert_prefix_stripped(self, field):
        """Assign "/foo/path" to ``field`` and expect "/path" to be read back.

        The settings' output_path_prefix is set to "/foo" before assigning.
        """
        self.page_result.settings.output_path_prefix = "/foo"
        setattr(self.page_result, field, "/foo/path")

        self.assertEqual("/path", getattr(self.page_result, field))

    def _assign(self, **values):
        """Assign each keyword argument as an attribute of the page result."""
        for field, value in values.items():
            setattr(self.page_result, field, value)

    def test_init(self):
        """A new result has every field None and every exists-flag falsy."""
        for field in self._RESULT_FIELDS:
            self.assertIsNone(getattr(self.page_result, field))
        self._assert_no_existing_flags()

    def test_set_existing_attrs_none(self):
        """Passing None to set_existing_attrs leaves every flag falsy."""
        self.page_result.set_existing_attrs(None)
        self._assert_no_existing_flags()

    def test_set_existing_attrs_juxta_change_index(self):
        """A supplied juxta_change_index turns on its exists-flag."""
        existing = {"juxta_change_index": 0.001}
        self.page_result.set_existing_attrs(existing)
        self.assertTrue(self.page_result.juxta_change_index_exists)

    def test_to_dict(self):
        """to_dict reports the fields that were assigned."""
        self._assign(page_id=1, batch_id=2, juxta_change_index=0.01)

        expected_dict = dict(page_id=1, batch_id=2, juxta_change_index=0.01)
        actual_dict = self.page_result.to_dict()

        # NOTE(review): assertItemsEqual iterates its arguments, so for two
        # dicts this presumably compares keys only -- confirm whether the
        # values were meant to be checked as well.
        self.assertItemsEqual(expected_dict, actual_dict)

    def test_has_data_true(self):
        """has_data is truthy once juxta_change_index is set alongside the IDs."""
        self._assign(page_id=1, batch_id=2, juxta_change_index=0.01)

        self.assertTrue(self.page_result.has_data())

    def test_has_data_false(self):
        """has_data is falsy when only the IDs are set."""
        self._assign(page_id=1, batch_id=2)

        self.assertFalse(self.page_result.has_data())

    def test_ocr_text_path(self):
        """ocr_text_path is read back without the output path prefix."""
        self._assert_prefix_stripped("ocr_text_path")

    def test_ocr_xml_path(self):
        """ocr_xml_path is read back without the output path prefix."""
        self._assert_prefix_stripped("ocr_xml_path")

    def test_corr_ocr_text_path(self):
        """corr_ocr_text_path is read back without the output path prefix."""
        self._assert_prefix_stripped("corr_ocr_text_path")

    def test_corr_ocr_xml_path(self):
        """corr_ocr_xml_path is read back without the output path prefix."""
        self._assert_prefix_stripped("corr_ocr_xml_path")
Пример #4
0
class EmopJob(object):
    """ A single OCR job together with its derived paths

    Parses the job payload into model objects (batch job, font, language
    model, glyph substitution model, page, work and result objects) and
    computes the output directory and output file names derived from them.
    """

    def __init__(self, job_data, settings, scheduler):
        """ Build the job from its payload

        Args:
            job_data (dict): Job payload, see parse_data for the keys read
            settings: Settings object; output_path_prefix and ocr_root are
                read here
            scheduler: Scheduler object, stored as-is on the job
        """
        self.settings = settings
        self.scheduler = scheduler
        self.extra_transfers = []
        self.parse_data(data=job_data)
        self.output_root_dir = EmopBase.add_prefix(self.settings.output_path_prefix, self.settings.ocr_root)
        self.temp_dir = get_temp_dir()
        self.image_path = self.page.image_path
        # The values below rely on values set above
        self.output_dir = self.get_output_dir(batch_id=self.batch_job.id, work_id=self.work.id)
        self.txt_file = self.output_file("txt")
        self.xml_file = self.output_file("xml")
        self.hocr_file = self.output_file("hocr")
        self.idhmc_txt_file = self.add_filename_suffix(self.txt_file, "IDHMC")
        self.idhmc_xml_file = self.add_filename_suffix(self.xml_file, "IDHMC")
        self.alto_txt_file = self.add_filename_suffix(self.txt_file, "ALTO")
        self.alto_xml_file = self.add_filename_suffix(self.xml_file, "ALTO")
        # Ocular specific items (these attributes are only set for that engine)
        if self.batch_job.ocr_engine == "ocular":
            self.input_font_path = self.font.path
            self.input_lm_path = self.language_model.path
            self.input_gsm_path = self.glyph_substitution_model.path
            _base_output_name = "work-%s-batch-%s" % (self.work.id, self.batch_job.id)
            self.output_font_path = os.path.join(self.output_dir, "%s.fontser" % _base_output_name)
            self.output_lm_path = os.path.join(self.output_dir, "%s.lmser" % _base_output_name)
            self.output_gsm_path = os.path.join(self.output_dir, "%s.gsmser" % _base_output_name)
            self.input_doc_list_path = os.path.join(
                self.temp_dir, "batch-%s-work-%s-pages-images.txt" % (str(self.batch_job.id), str(self.work.id))
            )
        # Extra command parameters that are passed to OCR application.
        # Only a non-empty string is split into an argument list; anything
        # else results in None.
        _extra_command_parameters = self.batch_job.parameters
        if _extra_command_parameters and isinstance(_extra_command_parameters, basestring):
            self.extra_command_parameters = shlex.split(_extra_command_parameters)
        else:
            self.extra_command_parameters = None

    def parse_data(self, data):
        """ Populate the job's model objects from a job payload

        The font, language model and glyph substitution model are each
        looked up first at the top level of ``data``, then under
        ``data["batch_job"]``; when found in neither place the model
        receives an empty dict.

        Args:
            data (dict): Job payload. Must contain "id", "batch_job", "page"
                and "work". May contain "font", "language_model",
                "glyph_substitution_model", "page_result" and
                "postproc_result".

        Raises:
            KeyError: If one of the required keys is missing from ``data``.
        """
        self.id = data["id"]
        self.batch_job = EmopBatchJob(self.settings)
        self.font = EmopFont(self.settings)
        self.language_model = EmopLanguageModel(self.settings)
        self.glyph_substitution_model = EmopGlyphSubstitutionModel(self.settings)
        self.page = EmopPage(self.settings)
        self.work = EmopWork(self.settings)
        self.page_result = EmopPageResult(self.settings)
        self.postproc_result = EmopPostprocResult(self.settings)
        self.font_training_result = EmopFontTrainingResult(self.settings)
        self.batch_job.setattrs(data["batch_job"])
        # Each of these models shares the same lookup order. Driving them
        # from one loop guarantees the fallback is applied to the model being
        # processed (the previous hand-copied version reset language_model in
        # the glyph_substitution_model fallback branch).
        for model_name in ("font", "language_model", "glyph_substitution_model"):
            if model_name in data:
                model_attrs = data[model_name]
            elif model_name in data["batch_job"]:
                model_attrs = data["batch_job"][model_name]
            else:
                model_attrs = {}
            getattr(self, model_name).setattrs(model_attrs)
        self.page.setattrs(data["page"])
        self.work.setattrs(data["work"])
        self.page_result.set_existing_attrs(data.get("page_result"))
        self.postproc_result.set_existing_attrs(data.get("postproc_result"))
        # NOTE(review): this second, identical call is kept to preserve
        # existing behavior. It looks redundant, or may have been meant for
        # another result object -- confirm and remove or correct.
        self.postproc_result.set_existing_attrs(data.get("postproc_result"))
        # Link the result objects to the entities they describe.
        self.page_result.page_id = self.page.id
        self.page_result.batch_id = self.batch_job.id
        self.postproc_result.page_id = self.page.id
        self.postproc_result.batch_job_id = self.batch_job.id
        self.font_training_result.work_id = self.work.id
        self.font_training_result.batch_job_id = self.batch_job.id

    def get_output_dir(self, batch_id, work_id):
        """ Provide the job output directory

        Format is the following:
            /<config.ini output_path_prefix><config.ini ocr_root>/<batch ID>/<work ID>

        Example:
            /dh/data/shared/text-xml/IDHMC-OCR/<batch.id>/<work.id>

        Args:
            batch_id: Batch job ID, converted with str()
            work_id: Work ID, converted with str()

        Returns:
            str: Output directory path
        """
        return os.path.join(self.output_root_dir, str(batch_id), str(work_id))

    def output_file(self, fmt):
        """ Provide the job output file name

        Format is the following:
            <output_dir>/<page.number>.<fmt>

        Example:
            output_file("XML") for page number 5
            <output_dir>/5.xml

        Args:
            fmt (str): File format (extension) for file path, lower-cased
                before use

        Returns:
            str: Output file path
        """
        filename = "%s.%s" % (self.page.number, str(fmt).lower())
        return os.path.join(self.output_dir, filename)

    def add_filename_suffix(self, file, suffix):
        """ Add filename suffix

        This function adds a suffix to a filename before the extension

        Example:
            add_filename_suffix('5.xml', 'IDHMC')
            5.xml -> 5_IDHMC.xml

        Args:
            file (str): File name to add suffix
            suffix (str): The suffix to add

        Returns:
            str: The filename with suffix added before extension
        """
        filename, ext = os.path.splitext(file)
        return "%s_%s%s" % (filename, suffix, ext)