def __init__(self, job_data, settings, scheduler):
    """Build a job from API data and derive the paths the job works with.

    The assignments are order dependent: ``parse_data`` runs first, and the
    output file names are derived from ``output_dir``, which in turn needs
    ``output_root_dir`` and the parsed batch job / work IDs.

    Args:
        job_data: Job data handed to ``parse_data``.
        settings: Settings object; ``output_path_prefix`` and ``ocr_root``
            are read from it here.
        scheduler: Scheduler object, stored unchanged.
    """
    self.settings = settings
    self.scheduler = scheduler
    self.extra_transfers = []
    # Presumably populates self.page, self.work, self.batch_job and the
    # model attributes read below -- verify against parse_data.
    self.parse_data(data=job_data)

    self.output_root_dir = EmopBase.add_prefix(self.settings.output_path_prefix, self.settings.ocr_root)
    self.temp_dir = get_temp_dir()
    self.image_path = self.page.image_path

    # Everything from here on relies on the values assigned above.
    batch_id = self.batch_job.id
    work_id = self.work.id
    self.output_dir = self.get_output_dir(batch_id=batch_id, work_id=work_id)

    self.txt_file = self.output_file("txt")
    self.xml_file = self.output_file("xml")
    self.hocr_file = self.output_file("hocr")
    self.idhmc_txt_file = self.add_filename_suffix(self.txt_file, "IDHMC")
    self.idhmc_xml_file = self.add_filename_suffix(self.xml_file, "IDHMC")
    self.alto_txt_file = self.add_filename_suffix(self.txt_file, "ALTO")
    self.alto_xml_file = self.add_filename_suffix(self.xml_file, "ALTO")

    # Ocular specific items.
    # NOTE(review): the extent of this branch is inferred from the section
    # comments in the source -- confirm that all ocular paths belong in it.
    if self.batch_job.ocr_engine == "ocular":
        self.input_font_path = self.font.path
        self.input_lm_path = self.language_model.path
        self.input_gsm_path = self.glyph_substitution_model.path

        stem = "work-%s-batch-%s" % (work_id, batch_id)
        self.output_font_path = os.path.join(self.output_dir, "%s.fontser" % stem)
        self.output_lm_path = os.path.join(self.output_dir, "%s.lmser" % stem)
        self.output_gsm_path = os.path.join(self.output_dir, "%s.gsmser" % stem)

        doc_list_name = "batch-%s-work-%s-pages-images.txt" % (str(batch_id), str(work_id))
        self.input_doc_list_path = os.path.join(self.temp_dir, doc_list_name)

    # Extra command parameters that are passed to the OCR application:
    # a non-empty string is split shell-style, anything else becomes None.
    raw_parameters = self.batch_job.parameters
    if not raw_parameters or not isinstance(raw_parameters, basestring):
        self.extra_command_parameters = None
    else:
        self.extra_command_parameters = shlex.split(raw_parameters)
def _get_stage_out_data(self, data):
    """Private function - convert stage out files to transfer data

    Every string value in ``page_results`` and ``font_training_results``
    that is an absolute path is collected with the output path prefix
    added. Entries of ``extra_transfers`` are then added as-is; when an
    extra is an existing directory, already collected paths that contain it
    are dropped in its favour.

    Args:
        data (dict): Dictionary containing output API data.

    Returns:
        list: List of dicts with 'src' (collected path) and 'dest' (same
            path with the output path prefix removed); directories also
            carry 'recursive': True.
    """
    results = data.get("page_results", []) + data.get("font_training_results", [])
    extras = data.get("extra_transfers", [])

    local_paths = set()
    for result in results:
        for value in result.values():
            if isinstance(value, basestring) and os.path.isabs(value):
                local_paths.add(
                    EmopBase.add_prefix(prefix=self.settings.output_path_prefix, path=value)
                )

    # NOTE(review): nesting reconstructed -- each extra is assumed to be
    # added unconditionally, after any paths containing it are discarded.
    for extra in extras:
        if local_paths and os.path.isdir(extra):
            covered = [candidate for candidate in local_paths if extra in candidate]
            local_paths.difference_update(covered)
        local_paths.add(extra)

    transfers = []
    for local_path in local_paths:
        entry = {
            'dest': EmopBase.remove_prefix(prefix=self.settings.output_path_prefix, path=local_path),
            'src': local_path,
        }
        if os.path.isdir(local_path):
            entry['recursive'] = True
        transfers.append(entry)
    return transfers
def image_path(self, value):
    """Set the full local path of the page image.

    When the API supplies a path it is stored with the input path prefix
    added. Otherwise candidate file names are generated and probed on disk,
    trying the lowercase ``.tif`` name first and the ``.TIF`` variant second.

    ECCO image path format:
        ecco_directory/<ECCO ID><4 digit page number>0.[tif | TIF]
    EEBO image path format:
        eebo_directory/<5 digit page number>.000.<000-100>.[tif | TIF]

    If no candidate exists on disk ``self._image_path`` is set to None,
    which constitutes an error.

    Args:
        value (str): Path to page image from API; may be empty or None.
    """
    prefix = self.settings.input_path_prefix
    if value:
        self._image_path = EmopBase.add_prefix(prefix, value)
        return

    # Image path was not provided by the API, so one is generated. Start
    # from None so a failed lookup leaves the documented error value instead
    # of an unset (or stale) attribute.
    self._image_path = None

    if self.work.is_ecco():
        names = ["%s/%s%04d0.tif" % (self.work.ecco_directory, self.work.ecco_id, self.number)]
    else:
        # EEBO: the last numeric field is unknown, so 000..100 are probed.
        names = (
            "%s/%05d.000.%03d.tif" % (self.work.eebo_directory, self.number, i)
            for i in xrange(101)
        )

    for name in names:
        lower = EmopBase.add_prefix(prefix, name)
        for candidate in (lower, lower.replace(".tif", ".TIF")):
            if os.path.isfile(candidate):
                # Stop at the first hit; probing further only costs extra
                # stat calls and would let a later match overwrite this one.
                self._image_path = candidate
                return
def _get_stage_in_data(self, files):
    """Private function - convert stage in files to transfer data

    Each file becomes one src/dest pair: 'src' is the file exactly as
    given, 'dest' is the same path with the input path prefix added.

    Args:
        files (list): Stage in file list.

    Returns:
        list: List of dicts that contain the src/dest key/value pairs,
            in the same order as ``files``.
    """
    return [
        {
            'src': source,
            'dest': EmopBase.add_prefix(prefix=self.settings.input_path_prefix, path=source),
        }
        for source in files
    ]
def ground_truth_file(self, value):
    """Store the ground truth file path with the input path prefix added.

    Args:
        value: Path passed through ``EmopBase.add_prefix`` before it is
            stored in ``self._ground_truth_file``.
    """
    self._ground_truth_file = EmopBase.add_prefix(
        prefix=self.settings.input_path_prefix,
        path=value,
    )
def path(self, value):
    """Store the path with the input path prefix added.

    Args:
        value: Path passed through ``EmopBase.add_prefix`` before it is
            stored in ``self._path``.
    """
    self._path = EmopBase.add_prefix(
        prefix=self.settings.input_path_prefix,
        path=value,
    )
def glyph_substitution_model_path(self, value):
    """Store the glyph substitution model path with the output prefix removed.

    Args:
        value: Path passed through ``EmopBase.remove_prefix`` before it is
            stored in ``self._glyph_substitution_model_path``.
    """
    self._glyph_substitution_model_path = EmopBase.remove_prefix(
        prefix=self.settings.output_path_prefix,
        path=value,
    )
def language_model_path(self, value):
    """Store the language model path with the output path prefix removed.

    Args:
        value: Path passed through ``EmopBase.remove_prefix`` before it is
            stored in ``self._language_model_path``.
    """
    self._language_model_path = EmopBase.remove_prefix(
        prefix=self.settings.output_path_prefix,
        path=value,
    )
def font_path(self, value):
    """Store the font path with the output path prefix removed.

    Args:
        value: Path passed through ``EmopBase.remove_prefix`` before it is
            stored in ``self._font_path``.
    """
    self._font_path = EmopBase.remove_prefix(
        prefix=self.settings.output_path_prefix,
        path=value,
    )
def corr_ocr_xml_path(self, value):
    """Store the corrected OCR XML path with the output path prefix removed.

    Args:
        value: Path passed through ``EmopBase.remove_prefix`` before it is
            stored in ``self._corr_ocr_xml_path``.
    """
    self._corr_ocr_xml_path = EmopBase.remove_prefix(
        prefix=self.settings.output_path_prefix,
        path=value,
    )