Exemplo n.º 1
0
 def __init__(self, job_data, settings, scheduler):
     """Initialise the job from API data and derive its working paths.

     The assignments are order dependent: ``parse_data`` is called before any
     of ``self.page``, ``self.work`` or ``self.batch_job`` is read, and the
     output file names are derived after the directories they live in.

     Args:
         job_data: Job data handed unchanged to ``self.parse_data``.
         settings: Settings object; ``output_path_prefix`` and ``ocr_root``
             are read from it here.
         scheduler: Scheduler object, stored as ``self.scheduler``.
     """
     self.settings = settings
     self.scheduler = scheduler
     self.extra_transfers = []
     self.parse_data(data=job_data)

     # Locations that do not depend on the derived file names
     self.output_root_dir = EmopBase.add_prefix(self.settings.output_path_prefix, self.settings.ocr_root)
     self.temp_dir = get_temp_dir()
     self.image_path = self.page.image_path

     # Everything from here on builds on the attributes assigned above
     self.output_dir = self.get_output_dir(batch_id=self.batch_job.id, work_id=self.work.id)
     self.txt_file = self.output_file("txt")
     self.xml_file = self.output_file("xml")
     self.hocr_file = self.output_file("hocr")
     # IDHMC and ALTO variants are named after the plain txt/xml outputs
     self.idhmc_txt_file = self.add_filename_suffix(self.txt_file, "IDHMC")
     self.idhmc_xml_file = self.add_filename_suffix(self.xml_file, "IDHMC")
     self.alto_txt_file = self.add_filename_suffix(self.txt_file, "ALTO")
     self.alto_xml_file = self.add_filename_suffix(self.xml_file, "ALTO")

     # Model inputs/outputs that only exist for the ocular engine
     if self.batch_job.ocr_engine == "ocular":
         self.input_font_path = self.font.path
         self.input_lm_path = self.language_model.path
         self.input_gsm_path = self.glyph_substitution_model.path
         _model_stem = "work-%s-batch-%s" % (self.work.id, self.batch_job.id)
         self.output_font_path = os.path.join(self.output_dir, "%s.fontser" % _model_stem)
         self.output_lm_path = os.path.join(self.output_dir, "%s.lmser" % _model_stem)
         self.output_gsm_path = os.path.join(self.output_dir, "%s.gsmser" % _model_stem)
         _doc_list_name = "batch-%s-work-%s-pages-images.txt" % (str(self.batch_job.id), str(self.work.id))
         self.input_doc_list_path = os.path.join(self.temp_dir, _doc_list_name)

     # Extra OCR command-line parameters: only a non-empty string is split,
     # anything else results in None
     _raw_parameters = self.batch_job.parameters
     _is_parameter_string = bool(_raw_parameters) and isinstance(_raw_parameters, basestring)
     self.extra_command_parameters = shlex.split(_raw_parameters) if _is_parameter_string else None
    def _get_stage_out_data(self, data):
        """ Private function - convert stage out files to transfer data

        The output API data produced by controller is checked for absolute file paths in data and 
        then that data is converted to src/dest key/value pairs for transfer.  Currently only the 
        page_results are searched for data.

        Args:
            data (dict): Dictionary containing output API data.

        Returns:
            list: List of dicts that contain necessary src/dest key/value pairs
        """
        _data = []
        _paths = set()
        _page_results = data.get("page_results", [])
        _font_training_results = data.get("font_training_results", [])
        _extra_tranfers = data.get("extra_transfers", [])
        _results = _page_results + _font_training_results
        for _result in _results:
            for _value in _result.values():
                if not isinstance(_value, basestring):
                    continue
                if os.path.isabs(_value):
                    _local_path = EmopBase.add_prefix(prefix=self.settings.output_path_prefix, path=_value)
                    _paths.add(_local_path)

        for _extra in _extra_tranfers:
            for _path in _paths.copy():
                if os.path.isdir(_extra):
                    if _extra in _path:
                        _paths.discard(_path)
            _paths.add(_extra)

        for _path in _paths:
            _d = {}
            _remote_path = EmopBase.remove_prefix(prefix=self.settings.output_path_prefix, path=_path)
            _d['dest'] = _remote_path
            _d['src'] = _path
            if os.path.isdir(_path):
                _d['recursive'] = True
            _data.append(_d)

        return _data
Exemplo n.º 3
0
    def image_path(self, value):
        """Resolve the page image location and store it in ``self._image_path``.

        A path supplied by the API is passed through ``EmopBase.add_prefix`` with
        the input path prefix.  Without one, candidate names are generated and
        probed on disk, each first with a ``.tif`` and then a ``.TIF`` extension.

        ECCO candidate (single):
            <ecco_directory>/<ecco_id><number, 4 digits>0.[tif | TIF]
        EEBO candidates (suffix 000 through 100):
            <eebo_directory>/<number, 5 digits>.000.<suffix, 3 digits>.[tif | TIF]

        The EEBO scan does not stop at the first hit, so when several candidates
        exist the one with the highest suffix is kept.  When no candidate exists
        on disk ``self._image_path`` is not assigned at all.

        Args:
            value (str): Path to page image from API; a falsy value triggers
                the on-disk search.
        """
        def _existing_variant(relative_name):
            # Return the prefixed path in whichever extension case exists, else None
            lower_case = EmopBase.add_prefix(self.settings.input_path_prefix, relative_name)
            upper_case = lower_case.replace(".tif", ".TIF")
            if os.path.isfile(lower_case):
                return lower_case
            if os.path.isfile(upper_case):
                return upper_case
            return None

        if value:
            self._image_path = EmopBase.add_prefix(self.settings.input_path_prefix, value)
            return

        # image path was not provided by API so one will be generated
        if self.work.is_ecco():
            _candidates = ["%s/%s%04d0.tif" % (self.work.ecco_directory, self.work.ecco_id, self.number)]
        else:
            # Built lazily so each name is formatted right before it is probed
            _candidates = (
                "%s/%05d.000.%03d.tif" % (self.work.eebo_directory, self.number, _suffix)
                for _suffix in xrange(101)
            )

        for _name in _candidates:
            _found = _existing_variant(_name)
            if _found is not None:
                self._image_path = _found
    def _get_stage_in_data(self, files):
        """ Private function - convert stage in files to transfer data

        A list of files is turned into src/dest key/value pairs based on controller/input_path_prefix.

        Args:
            files (list): Stage in file list.

        Returns:
            list: List of dicts that contain necessary src/dest key/value pairs
        """
        _data = []
        for f in files:
            _paths = {}
            _paths['src'] = f
            _local_path = EmopBase.add_prefix(prefix=self.settings.input_path_prefix, path=f)
            _paths['dest'] = _local_path
            _data.append(_paths)
        return _data
Exemplo n.º 5
0
 def ground_truth_file(self, value):
     """Store ``value`` in ``_ground_truth_file`` after ``EmopBase.add_prefix``.

     Args:
         value: Ground truth file path; the input path prefix from settings
             is handed to ``add_prefix`` together with it.
     """
     self._ground_truth_file = EmopBase.add_prefix(prefix=self.settings.input_path_prefix, path=value)
Exemplo n.º 6
0
 def path(self, value):
     """Store ``value`` in ``_path`` after ``EmopBase.add_prefix``.

     Args:
         value: Path to store; the input path prefix from settings is handed
             to ``add_prefix`` together with it.
     """
     self._path = EmopBase.add_prefix(prefix=self.settings.input_path_prefix, path=value)
 def glyph_substitution_model_path(self, value):
     """Store ``value`` in ``_glyph_substitution_model_path`` after ``EmopBase.remove_prefix``.

     Args:
         value: Glyph substitution model path; the output path prefix from
             settings is handed to ``remove_prefix`` together with it.
     """
     self._glyph_substitution_model_path = EmopBase.remove_prefix(
         prefix=self.settings.output_path_prefix, path=value
     )
 def language_model_path(self, value):
     """Store ``value`` in ``_language_model_path`` after ``EmopBase.remove_prefix``.

     Args:
         value: Language model path; the output path prefix from settings is
             handed to ``remove_prefix`` together with it.
     """
     self._language_model_path = EmopBase.remove_prefix(
         prefix=self.settings.output_path_prefix, path=value
     )
 def font_path(self, value):
     """Store ``value`` in ``_font_path`` after ``EmopBase.remove_prefix``.

     Args:
         value: Font path; the output path prefix from settings is handed to
             ``remove_prefix`` together with it.
     """
     self._font_path = EmopBase.remove_prefix(prefix=self.settings.output_path_prefix, path=value)
 def corr_ocr_xml_path(self, value):
     """Store ``value`` in ``_corr_ocr_xml_path`` after ``EmopBase.remove_prefix``.

     Args:
         value: Corrected OCR XML path; the output path prefix from settings
             is handed to ``remove_prefix`` together with it.
     """
     self._corr_ocr_xml_path = EmopBase.remove_prefix(prefix=self.settings.output_path_prefix, path=value)