def stage_in_proc_ids(self, proc_ids, wait=0):
    """Stage in proc_ids

    This function will find the necessary data from the provided
    proc_ids and initiate a Globus transfer.

    Args:
        proc_ids (list): List of proc_ids to stage in
        wait (bool): Whether the stage in should wait for the transfer to complete

    Returns:
        str: Globus Task ID
    """
    stage_in_files = []
    stage_in_data = []
    src = self.remote_endpoint
    dest = self.cluster_endpoint
    label = "emop-stage-in-files"
    for proc_id in proc_ids:
        payload = EmopPayload(self.settings, proc_id)
        if not payload.input_exists():
            logger.error("EmopTransfer: Could not find input payload for proc_id %s", proc_id)
            continue
        data = payload.load_input()
        _files = self._get_stage_in_files_from_data(data)
        stage_in_files = stage_in_files + _files
    _stage_in_files = list(set(stage_in_files))
    stage_in_data = self._get_stage_in_data(_stage_in_files)
    task_id = self.start(src=src, dest=dest, data=stage_in_data, label=label, wait=wait)
    return task_id
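# A minimal usage sketch for the stage-in path, not part of the module.
# The import path, config location, and proc_ids are assumptions;
# EmopTransfer is assumed to take a config file path like the other
# controllers in this codebase.
from emop.lib.transfer.globus import EmopTransfer  # hypothetical module path

transfer = EmopTransfer("/path/to/config.ini")  # hypothetical config location
task_id = transfer.stage_in_proc_ids(proc_ids=["42", "43"], wait=1)
if task_id:
    print("Globus stage-in task submitted: %s" % task_id)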
def test_stage_out_proc_id_3(self):
    payload = EmopPayload(self.transfer.settings, 'output_payload_1')
    payload.completed_output_exists = MagicMock()
    payload.completed_output_exists.return_value = False
    payload.output_exists = MagicMock()
    payload.output_exists.return_value = False
    retval = self.transfer.stage_out_proc_id('output_payload_1')
    self.assertEqual('', retval)
def upload_proc_id(self, proc_id):
    """Upload the payload file for a proc_id

    Prefers the completed output file and falls back to the plain
    output file.

    Args:
        proc_id (str): proc_id whose payload should be uploaded

    Returns:
        bool: True if the upload succeeded, False otherwise
    """
    payload = EmopPayload(self.settings, proc_id)
    if payload.completed_output_exists():
        filename = payload.completed_output_filename
    elif payload.output_exists():
        filename = payload.output_filename
    else:
        logger.error("EmopUpload: Could not find payload file for proc_id %s" % proc_id)
        return False
    upload_status = self.upload_file(filename=filename)
    return upload_status
class TestEmopPayload(TestCase):
    @pytest.fixture(autouse=True)
    def setup_settings(self, tmpdir):
        self.settings = default_settings()
        self.input_path = tmpdir.mkdir("input")
        self.output_path = tmpdir.mkdir("output")
        self.completed_path = self.output_path.mkdir("completed")
        self.uploaded_path = self.output_path.mkdir("uploaded")
        self.settings.payload_input_path = str(self.input_path)
        self.settings.payload_output_path = str(self.output_path)
        self.settings.payload_completed_path = str(self.completed_path)
        self.settings.payload_uploaded_path = str(self.uploaded_path)
        self.payload = EmopPayload(settings=self.settings, proc_id='1')

    def test_input_exists_false(self):
        self.assertEqual(self.payload.input_exists(), False)

    def test_input_exists_true(self):
        self.input_path.join("1.json").write("text")
        self.assertEqual(self.payload.input_exists(), True)

    def test_output_exists_false(self):
        self.assertEqual(self.payload.output_exists(), False)

    def test_output_exists_true(self):
        self.output_path.join("1.json").write("text")
        self.assertEqual(self.payload.output_exists(), True)

    def test_completed_output_exists_false(self):
        self.assertEqual(self.payload.completed_output_exists(), False)

    def test_completed_output_exists_true(self):
        self.completed_path.join("1.json").write("text")
        self.assertEqual(self.payload.completed_output_exists(), True)
def upload_file(self, filename):
    """Upload a single payload file

    The proc_id is derived from the file's basename, so the payload
    file must be named <proc_id>.json.

    Args:
        filename (str): path of the payload file to upload

    Returns:
        bool: True on success, False on failure, or None if the file
            does not exist
    """
    filename_path = os.path.abspath(filename)
    file_basename = os.path.basename(filename_path)
    proc_id, file_ext = os.path.splitext(file_basename)
    payload = EmopPayload(self.settings, proc_id)
    if not os.path.isfile(filename_path):
        logger.error("EmopUpload: Could not find file %s" % filename_path)
        return None
    with open(filename_path) as datafile:
        try:
            data = json.load(datafile)
        except ValueError:
            logger.error("EmopUpload: Invalid JSON file %s" % filename_path)
            return False
    uploaded = self.upload(data)
    if uploaded:
        logger.info("Successfully uploaded payload file %s" % filename_path)
        payload.save_uploaded_output(data)
        return True
    else:
        return False
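# A minimal upload sketch, not part of the module. The import path and
# file locations are assumptions; upload_file derives the proc_id from
# the file's basename, so payload files must be named <proc_id>.json.
from emop.lib.emopupload import EmopUpload  # hypothetical module path

uploader = EmopUpload("/path/to/config.ini")  # hypothetical config location
# Upload an explicit payload file ...
uploader.upload_file(filename="/path/to/output/completed/42.json")
# ... or let upload_proc_id choose completed output over plain output.
uploader.upload_proc_id("42")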
def stage_out_proc_id(self, proc_id, wait=0):
    """Stage out proc_id

    This function will find the necessary data from the provided
    proc_id and initiate a Globus transfer.

    Args:
        proc_id (str): proc_id to stage out
        wait (bool): Whether the stage out should wait for the transfer to complete

    Returns:
        str: Globus Task ID
    """
    payload = EmopPayload(self.settings, proc_id)
    if payload.completed_output_exists():
        filename = payload.completed_output_filename
    elif payload.output_exists():
        filename = payload.output_filename
    elif payload.uploaded_output_exists():
        filename = payload.uploaded_output_filename
    else:
        logger.error("EmopTransfer: Could not find payload file for proc_id %s" % proc_id)
        return ''
    data = payload.load(filename)
    if not data:
        logger.error("EmopTransfer: Unable to load payload data")
        return ''
    stage_out_data = self._get_stage_out_data(data)
    src = self.cluster_endpoint
    dest = self.remote_endpoint
    label = "emop-stage-out-%s" % proc_id
    logger.debug("Stage out files:\n%s", json.dumps(stage_out_data, indent=4, sort_keys=True))
    task_id = self.start(src=src, dest=dest, data=stage_out_data, label=label, wait=wait)
    return task_id
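# A stage-out sketch mirroring the stage-in example above; the import
# path and proc_id are again assumptions. An empty string is returned
# when no payload file can be found, which is what
# test_stage_out_proc_id_3 above asserts.
from emop.lib.transfer.globus import EmopTransfer  # hypothetical module path

transfer = EmopTransfer("/path/to/config.ini")  # hypothetical config location
task_id = transfer.stage_out_proc_id("42", wait=0)
if not task_id:
    print("Stage out failed for proc_id 42")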
class EmopSubmit(EmopBase):
    def __init__(self, config_path):
        """Initialize EmopSubmit object and attributes

        Args:
            config_path (str): path to application config file
        """
        super(self.__class__, self).__init__(config_path)
        self.scheduler = EmopScheduler.get_scheduler_instance(name=self.settings.scheduler, settings=self.settings)

    def optimize_submit(self, page_count, running_job_count, sim=False):
        """Determine optimal job submission

        This function attempts to determine the best number of jobs and
        how many pages per job should be submitted to the scheduler.

        Args:
            page_count (int): Number of pages needing to be processed
            running_job_count (int): Number of active jobs
            sim (bool): Log decisions at INFO level (used for simulated submissions)

        Returns:
            tuple: First value is number of jobs and second value is
                number of pages per job.
        """
        num_jobs = 0
        pages_per_job = 1
        job_slots_available = int(self.settings.max_jobs - running_job_count)
        run_option_a = float(page_count) / float(job_slots_available)
        run_option_b = float(self.settings.max_job_runtime) / float(self.settings.avg_page_runtime)
        run_option_c = float(self.settings.min_job_runtime) / float(self.settings.avg_page_runtime)
        logger.debug("JobSlotsAvailable: %s, PageCount: %s" % (job_slots_available, page_count))
        logger.debug("RunOptA: %s , RunOptB: %s, RunOptC: %s" % (run_option_a, run_option_b, run_option_c))
        # Max pages per job > pages that fit in max runtime
        if run_option_a > run_option_b:
            num_jobs = job_slots_available
            pages_per_job = run_option_b
        # Page count less than pages that fit in min runtime
        elif page_count < run_option_c:
            num_jobs = page_count / run_option_c
            pages_per_job = page_count
        # Max pages per job < pages that fit in min runtime
        elif run_option_a < run_option_c:
            num_jobs = page_count / run_option_c
            pages_per_job = run_option_c
        # Max pages per job
        else:
            # TODO: In some cases num_jobs will exceed max_jobs value
            num_jobs = page_count / run_option_a
            pages_per_job = run_option_a
        # Convert values to integers
        num_jobs = int(num_jobs)
        pages_per_job = int(pages_per_job)
        # In case num_jobs and pages_per_job were truncated to 0
        if not num_jobs:
            num_jobs = 1
        if not pages_per_job:
            pages_per_job = 1
        # Case where num_jobs > page_count
        if num_jobs > page_count or (num_jobs * pages_per_job) > page_count:
            if page_count > job_slots_available:
                num_jobs = job_slots_available
                pages_per_job = int(page_count / num_jobs)
            else:
                num_jobs = page_count
                pages_per_job = 1
        expected_runtime = pages_per_job * self.settings.avg_page_runtime
        expected_runtime_msg = "Expected job runtime: %s seconds" % expected_runtime
        if sim:
            logger.info(expected_runtime_msg)
        else:
            logger.debug(expected_runtime_msg)
        # total_pages_to_run = num_jobs * pages_per_job
        optimal_submit_msg = "Optimal submission is %s jobs with %s pages per job" % (num_jobs, pages_per_job)
        if sim:
            logger.info(optimal_submit_msg)
        else:
            logger.debug(optimal_submit_msg)
        return num_jobs, pages_per_job

    def reserve(self, num_pages, r_filter):
        """Reserve pages for a job

        Reserve page(s) for work by sending PUT request to dashboard API.

        Args:
            num_pages (int): Number of pages to reserve
            r_filter (dict): Optional filter merged into the job_queue request

        Returns:
            str: The reserved work's proc_id.
""" reserve_data = {} if r_filter and isinstance(r_filter, dict): job_queue = r_filter.copy() else: job_queue = {} job_queue["num_pages"] = num_pages if self.settings.operate_on == 'works': job_queue["works"] = '1' reserve_data["job_queue"] = job_queue reserve_request = self.emop_api.put_request("/api/job_queues/reserve", reserve_data) if not reserve_request: return "" requested = reserve_request.get('requested') reserved = reserve_request.get('reserved') proc_id = reserve_request.get('proc_id') results = reserve_request.get('results') logger.debug("Requested %s pages, and %s were reserved with proc_id: %s" % (requested, reserved, proc_id)) logger.debug("Payload: %s" % json.dumps(results, sort_keys=True, indent=4)) if reserved < 1: logger.error("No pages reserved") return "" self.payload = EmopPayload(self.settings, proc_id) self.payload.save_input(results) return proc_id def set_job_id(self, proc_id, job_id): """Sends JobID back to dashboard """ data = { "job_queue": { "proc_id": proc_id, "job_id": job_id, } } set_job_id_request = self.emop_api.put_request('/api/job_queues/set_job_id', data) return True
class EmopRun(EmopBase):
    def __init__(self, config_path, proc_id):
        """Initialize EmopRun object and attributes

        Args:
            config_path (str): path to application config file
            proc_id (str or int): proc-id of this run
        """
        super(self.__class__, self).__init__(config_path)
        self.proc_id = proc_id
        self.payload = EmopPayload(self.settings, proc_id)
        self.scheduler = EmopScheduler.get_scheduler_instance(name=self.settings.scheduler, settings=self.settings)
        self.results = {}
        self.jobs_completed = []
        self.jobs_failed = []
        self.page_results = []
        self.postproc_results = []
        self.font_training_results = []
        self.extra_transfers = []

    def append_result(self, job, results, failed=False):
        """Append a page's results to job's results payload

        The results are saved to the output JSON file so that the status
        of each page is saved upon failure or success.

        Args:
            job (EmopJob): EmopJob object
            results (str): The error output of a particular process
            failed (bool, optional): Sets if the result is a failure
        """
        if failed:
            results_ext = "%s JOB %s: %s" % (self.scheduler.name, self.scheduler.job_id, results)
            logger.error(results_ext)
            if self.settings.operate_on == 'works':
                for j in job.jobs:
                    self.jobs_failed.append({"id": j.id, "results": results_ext})
            else:
                self.jobs_failed.append({"id": job.id, "results": results_ext})
        else:
            if self.settings.operate_on == 'works':
                for j in job.jobs:
                    self.jobs_completed.append(j.id)
            else:
                self.jobs_completed.append(job.id)
        # TODO: Do we need to handle adding page_results and postproc_results differently??
        _jobs = []
        if self.settings.operate_on == 'works':
            for j in job.jobs:
                _jobs.append(j)
        else:
            _jobs = [job]
        for j in _jobs:
            if j.page_result.has_data():
                self.page_results.append(j.page_result.to_dict())
            if j.postproc_result.has_data():
                self.postproc_results.append(j.postproc_result.to_dict())
            if j.font_training_result.has_data():
                self.font_training_results.append(j.font_training_result.to_dict())
            if j.extra_transfers:
                self.extra_transfers = self.extra_transfers + j.extra_transfers
        current_results = self.get_results()
        self.payload.save_output(data=current_results, overwrite=True)

    def get_results(self):
        """Get this object's results

        Returns:
            dict: Results to be used as payload to API
        """
        job_queues_data = {
            "completed": self.jobs_completed,
            "failed": self.jobs_failed,
        }
        data = {
            "job_queues": job_queues_data,
            "page_results": self.page_results,
            "postproc_results": self.postproc_results,
            "font_training_results": self.font_training_results,
            "extra_transfers": self.extra_transfers,
        }
        return data

    @EmopBase.run_timing
    def do_process(self, obj, job, **kwargs):
        """Run a process

        This function is intended to handle calling and getting the
        success or failure of a job's post process. If a process does
        not return an exit code of 0 then a failure has occurred and
        the stderr is added to the job's results.

        Args:
            obj (object): The class of a process
            job (EmopJob): EmopJob object
            **kwargs: Arbitrary keyword arguments.

        Returns:
            bool: True if successful, False otherwise.
        """
        klass = obj.__class__.__name__
        if self.settings.controller_skip_existing and not obj.should_run():
            logger.info("Skipping %s job [%s]" % (klass, job.id))
            return True
        result = obj.run(**kwargs)
        if result.exitcode != 0:
            err = "%s Failed: %s" % (klass, result.stderr)
            # TODO: need to rework so failed doesn't mean done
            self.append_result(job=job, results=err, failed=True)
            return False
        else:
            return True

    @EmopBase.run_timing
    def do_ocr(self, job):
        """Run the OCR

        The actual OCR class is called from here.
        Based on the value of the ocr_engine, a different class will be
        called. The ocr_results returned by the OCR class are used to
        determine if the OCR was successful, and the results are
        appended to the global results.

        Args:
            job (EmopJob): EmopJob object

        Returns:
            bool: True if successful, False otherwise.
        """
        logger.info(
            "Got job [%s] - Batch: %s JobType: %s OCR Engine: %s"
            % (job.id, job.batch_job.name, job.batch_job.job_type, job.batch_job.ocr_engine)
        )
        # OCR #
        ocr_engine = job.batch_job.ocr_engine
        if ocr_engine == "tesseract":
            ocr = Tesseract(job=job)
        elif ocr_engine == "ocular":
            ocr = OcularTranscribe(job=job)
        else:
            ocr_engine_err = "OCR with %s not yet supported" % ocr_engine
            self.append_result(job=job, results=ocr_engine_err, failed=True)
            return False
        if self.settings.controller_skip_existing and not ocr.should_run():
            logger.info("Skipping OCR job [%s]" % job.id)
            return True
        ocr_result = ocr.run()
        if ocr_result.exitcode != 0:
            ocr_err = "%s OCR Failed: %s" % (ocr_engine, ocr_result.stderr)
            self.append_result(job=job, results=ocr_err, failed=True)
            return False
        else:
            return True

    def do_postprocesses(self, job):
        """Run the post processes

        Each post process class is called from here. Currently the steps
        are executed in the following order:

        * Denoise
        * MultiColumnSkew
        * XML_To_Text
        * PageEvaluator
        * PageCorrector
        * JuxtaCompare (postprocess)
        * JuxtaCompare - COMMENTED OUT
        * RetasCompare (postprocess) - COMMENTED OUT
        * RetasCompare - COMMENTED OUT

        If any step fails, the function terminates and returns False.

        Args:
            job (EmopJob): EmopJob object

        Returns:
            bool: True if successful, False otherwise.
        """
        # DeNoise #
        if self.settings.denoise_enabled:
            denoise = Denoise(job=job)
            denoise_proc = self.do_process(obj=denoise, job=job)
            if not denoise_proc:
                return False
        # MultiColumnSkew #
        if self.settings.denoise_enabled and self.settings.multi_column_skew_enabled:
            multi_column_skew = MultiColumnSkew(job=job)
            multi_column_skew_proc = self.do_process(obj=multi_column_skew, job=job)
            if not multi_column_skew_proc:
                return False
        # _IDHMC.xml to _IDHMC.txt #
        if self.settings.denoise_enabled:
            xml_to_text = XML_To_Text(job=job)
            xml_to_text_proc = self.do_process(obj=xml_to_text, job=job)
            if not xml_to_text_proc:
                return False
        # PageEvaluator #
        if self.settings.page_evaluator_enabled:
            page_evaluator = PageEvaluator(job=job)
            page_evaluator_proc = self.do_process(obj=page_evaluator, job=job)
            if not page_evaluator_proc:
                return False
        # PageCorrector #
        if self.settings.page_corrector_enabled:
            page_corrector = PageCorrector(job=job)
            page_corrector_proc = self.do_process(obj=page_corrector, job=job)
            if not page_corrector_proc:
                return False
        # JuxtaCompare postprocess and OCR output #
        juxta_compare = JuxtaCompare(job=job)
        juxta_compare_proc_pp = self.do_process(obj=juxta_compare, job=job, postproc=True)
        if not juxta_compare_proc_pp:
            return False
        # juxta_compare_proc = self.do_process(obj=juxta_compare, job=job, postproc=False)
        # if not juxta_compare_proc:
        #     return False
        # RetasCompare postprocess and OCR output #
        # retas_compare = RetasCompare(job=job)
        # retas_compare_proc_pp = self.do_process(obj=retas_compare, job=job, postproc=True)
        # if not retas_compare_proc_pp:
        #     return False
        # retas_compare_proc = self.do_process(obj=retas_compare, job=job, postproc=False)
        # if not retas_compare_proc:
        #     return False
        return True

    @EmopBase.run_timing
    def do_job(self, job):
        """Execute the parts of a page's job

        Args:
            job (EmopJob): EmopJob object

        Returns:
            bool: True if successful, False otherwise.
""" if not self.do_ocr(job=job): return False if not self.do_postprocesses(job=job): return False return True @EmopBase.run_timing def do_training(self, job): """Execute Training""" logger.info( "Got job [%s] - Batch: %s JobType: %s OCR Engine: %s" % (job.id, job.batch_job.name, job.batch_job.job_type, job.batch_job.ocr_engine) ) # OCR # training_engine = job.batch_job.ocr_engine if training_engine == "ocular": training = OcularFontTraining(job=job) else: training_engine_err = "Training with %s not yet supported" % training_engine self.append_result(job=job, results=training_engine_err, failed=True) return False #if self.settings.controller_skip_existing and not ocr.should_run(): # logger.info("Skipping OCR job [%s]" % job.id) # return True training_result = training.run() if training_result.exitcode != 0: training_err = "%s Training Failed: %s" % (training_engine, training_result.stderr) self.append_result(job=job, results=training_err, failed=True) return False else: return True @EmopBase.run_timing def run(self, force=False): """Run the EmopJob This function is intended to be what's called by external scripts like emopcmd.py to start all work. Based on the payload's data, all pages are iterated over from here. Once the loop of all jobs is complete the final results are saved to a file as completed payload Args: force (bool): Run even if output file exists. Returns: bool: True if successful, False otherwise. """ global instance global job_ids data = self.payload.load_input() logger.debug("Payload: \n%s" % json.dumps(data, sort_keys=True, indent=4)) if not data: logger.error("No payload data to load.") return False if not force: if self.payload.output_exists(): logger.error("Output file %s already exists." % self.payload.output_filename) return False if self.payload.completed_output_exists(): logger.error("Output file %s already exists." % self.payload.completed_output_filename) return False # Assign global variables and respond to signals for job in data: job_ids.append(job["id"]) instance = self signal.signal(signal.SIGUSR1, signal_exit) # Loop over data to create EmopJob records emop_jobs = {} for job in data: emop_job = EmopJob(job_data=job, settings=self.settings, scheduler=self.scheduler) if self.settings.operate_on == 'works': work_id = emop_job.work.id if work_id in emop_jobs: _emop_job = emop_jobs[work_id] _emop_job.jobs.append(emop_job) continue else: emop_job.jobs = [emop_job] emop_jobs[work_id] = emop_job else: page_id = emop_job.page.id emop_jobs[page_id] = emop_job # Loop over jobs to perform actual work for emop_job_id, emop_job in emop_jobs.iteritems(): if emop_job.batch_job.job_type == "ocr": job_succcessful = self.do_job(job=emop_job) if not job_succcessful: continue # Append successful completion of page # self.append_result(job=emop_job, results=None, failed=False) elif emop_job.batch_job.job_type == 'font training': job_successful = self.do_training(job=emop_job) if not job_successful: continue # Append successful completion # self.append_result(job=emop_job, results=None, failed=False) else: logger.error("JobType of %s is not yet supported." % emop_job.batch_job.job_type) return False logger.debug("Payload: \n%s" % json.dumps(self.get_results(), sort_keys=True, indent=4)) self.payload.save_completed_output(data=self.get_results(), overwrite=force) return True