def run(self, postproc):
    """Compare the page's OCR text against its ground truth with RetasCompare.

    Nothing is executed (and success is reported) when the page has no
    ground truth. Otherwise the jar is run on the ground truth file and the
    selected OCR text file, and the last tab-separated field of its output
    is parsed as a float.

    Args:
        postproc (bool): When true, compare ``job.alto_txt_file`` and store
            the score in ``page_result.alt_change_index``. When false,
            compare ``job.idhmc_txt_file``; the score is parsed but not
            stored.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the input file is missing or the output cannot be parsed,
        and the command's own exit code when the command fails.
    """
    if not self.job.page.hasGroundTruth():
        return self.results(stdout=None, stderr=None, exitcode=0)

    if postproc:
        input_file = self.job.alto_txt_file
    else:
        input_file = self.job.idhmc_txt_file
    if not input_file or not os.path.isfile(input_file):
        stderr = "Could not find RetasCompare input file: %s" % input_file
        return self.results(stdout=None, stderr=stderr, exitcode=1)

    cmd = [
        "java", "-Xms128M", "-Xmx128M", "-jar", self.executable,
        self.job.page.ground_truth_file, input_file,
        "-opt", self.cfg,
    ]
    proc = exec_cmd(cmd)
    if proc.exitcode != 0:
        stderr = "RetasCompare of %s failed: %s" % (input_file, proc.stderr)
        return self.results(stdout=proc.stdout, stderr=stderr, exitcode=proc.exitcode)

    out = proc.stdout.strip()
    # The score is the last tab-separated field of the output.
    last_field = out.rsplit("\t", 1)[-1]
    try:
        value = float(last_field)
    except ValueError:
        # Empty or non-numeric output: report a failed result instead of
        # letting the exception escape.
        stderr = "RetasCompare Error: unexpected response format: %s" % out
        return self.results(stdout=None, stderr=stderr, exitcode=1)

    # Only the post-processed comparison is recorded; the value from a
    # non-postproc run is discarded.
    if postproc:
        self.job.page_result.alt_change_index = value
    return self.results(stdout=None, stderr=None, exitcode=0)
def submit_job(self, proc_id, num_pages, dependency=None):
    """Submit a job to SLURM

    Before the job is submitted some environment variables are set
    which are then used by SLURM.

    ``PROC_ID`` tells the SLURM job which JSON file to load.
    ``EMOP_CONFIG_PATH`` tells the SLURM job which INI file should be used.

    Args:
        proc_id (str or int): proc_id to be used by submitted job
        num_pages (int): Number of pages being scheduled
        dependency: Passed through unchanged to ``get_submit_cmd``.

    Returns:
        str: SLURM Job ID, taken from the submit command's stdout.
        ``False`` is returned if ``proc_id`` is falsy or the submit
        command exits non-zero.
    """
    if not proc_id:
        logger.error("EmopSLURM#submit_job(): Must provide valid proc_id.")
        return False

    # os.environ only accepts strings, while proc_id may be an int.
    os.environ['PROC_ID'] = str(proc_id)
    os.environ['EMOP_CONFIG_PATH'] = self.settings.config_path

    cmd = self.get_submit_cmd(num_pages=num_pages, dependency=dependency)
    proc = exec_cmd(cmd, log_level="debug")
    if proc.exitcode != 0:
        logger.error("Failed to submit job to SLURM: %s" % proc.stderr)
        return False

    slurm_job_id = proc.stdout.rstrip()
    logger.info("SLURM job %s submitted for PROC_ID %s" % (slurm_job_id, proc_id))
    return slurm_job_id
def run(self):
    """Score the job's XML file with the PageEvaluator jar.

    The jar is expected to print exactly two comma-separated values. They
    are stored, as strings, in ``postproc_result.pp_ecorr`` and
    ``postproc_result.pp_pg_quality``; a value of ``'NaN'`` is stored as
    ``'-1'``.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the XML file is missing or the output does not have two
        fields, and the command's own exit code when the command fails.
    """
    xml_path = self.job.xml_file
    if not (xml_path and os.path.isfile(xml_path)):
        return self.results(
            stdout=None,
            stderr="Could not find XML file: %s" % xml_path,
            exitcode=1
        )

    proc = exec_cmd(
        ["java", self.java_args, "-jar", self.executable, "-q", xml_path]
    )
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode
        )

    response = proc.stdout.strip()
    fields = response.split(",")
    if len(fields) != 2:
        message = "PageEvaluator Error: unexpected response format: %s" % response
        return self.results(stdout=None, stderr=message, exitcode=1)

    # PageEvaluator may return 'NaN'; record '-1' in its place.
    ecorr, quality = ['-1' if field == 'NaN' else field for field in fields]
    self.job.postproc_result.pp_ecorr = ecorr
    self.job.postproc_result.pp_pg_quality = quality
    return self.results(stdout=None, stderr=None, exitcode=0)
def submit_transfer_job(self, task_id):
    """Submit a transfer job to SLURM

    Before the job is submitted some environment variables are set
    which are then used by SLURM.

    ``TASK_ID`` tells the SLURM job which task ID to monitor.
    ``EMOP_CONFIG_PATH`` tells the SLURM job which INI file should be used.

    Args:
        task_id (str or int): task_id to be used by submitted job

    Returns:
        str: ID of job submitted, taken from the submit command's stdout.
        ``False`` is returned if ``task_id`` is falsy or the submit
        command exits non-zero.
    """
    if not task_id:
        logger.error("EmopSLURM#submit_transfer_job(): Must provide valid task_id.")
        return False

    # os.environ only accepts strings, while task_id may be an int.
    os.environ['TASK_ID'] = str(task_id)
    os.environ['EMOP_CONFIG_PATH'] = self.settings.config_path

    _queue = self.settings.scheduler_transfer_queue
    cmd = self.get_submit_cmd(
        queue=_queue,
        name='emop-transfer',
        mem_per_cpu='2000',
        cpus_per_task='1',
        job_type='transfer'
    )
    proc = exec_cmd(cmd, log_level="debug")
    if proc.exitcode != 0:
        logger.error("Failed to submit transfer job to SLURM: %s" % proc.stderr)
        return False

    slurm_job_id = proc.stdout.rstrip()
    logger.info("SLURM job %s submitted for TASK_ID %s" % (slurm_job_id, task_id))
    return slurm_job_id
def run(self):
    """Run tesseract on the job's page image and record the output paths.

    The output parent directory is created when missing. After a successful
    run, the hOCR file is renamed to the job's XML file unless that XML file
    already exists, and the text/XML paths are stored on ``page_result``.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the image path is unset or not a file, and the command's own
        exit code when tesseract fails.
    """
    job = self.job

    if not job.image_path:
        return self.results(
            stdout=None,
            stderr="No image path could be determined",
            exitcode=1
        )
    if not os.path.isfile(job.image_path):
        return self.results(
            stdout=None,
            stderr="Could not find page image %s" % job.image_path,
            exitcode=1
        )

    # Make sure the directory tesseract writes into exists.
    if not os.path.isdir(self.output_parent_dir):
        mkdirs_exists_ok(self.output_parent_dir)

    proc = exec_cmd([
        "tesseract", job.image_path, self.output_filename,
        "-l", job.font.name, self.cfg,
    ])
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode
        )

    # The hOCR output becomes the XML result; an existing XML file is kept.
    if os.path.isfile(job.hocr_file) and not os.path.isfile(job.xml_file):
        logger.debug("Renaming %s to %s" % (job.hocr_file, job.xml_file))
        os.rename(job.hocr_file, job.xml_file)

    job.page_result.ocr_text_path = job.txt_file
    job.page_result.ocr_xml_path = job.xml_file
    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self):
    """Transcribe the job's pages with Ocular and record the output files.

    The input document list is generated first. After a successful run,
    each page in ``self.job.jobs`` gets its text and ALTO XML paths set on
    its ``page_result`` when the corresponding file exists, and the
    transcription directory is queued as an extra transfer when present.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the input font path is unset or not a file, and the
        command's own exit code when Ocular fails.
    """
    self.generate_input_doc_list()

    font_path = self.input_font_path
    if not font_path:
        return self.results(
            stdout=None,
            stderr="No input font path could be determined",
            exitcode=1
        )
    if not os.path.isfile(font_path):
        return self.results(
            stdout=None,
            stderr="Could not find input font path %s" % font_path,
            exitcode=1
        )

    # Make sure the directory Ocular writes into exists.
    out_dir = self.output_path
    if not os.path.isdir(out_dir):
        mkdirs_exists_ok(out_dir)

    cmd = [
        "java", self.java_max_heap,
        "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.Transcribe",
        "-jar", self.jar,
        "-outputPath", out_dir,
        "-inputDocListPath", self.input_doc_list_path,
        "-inputFontPath", font_path,
        "-inputLmPath", self.input_lm_path,
        "-inputGsmPath", self.input_gsm_path,
        "-allowGlyphSubstitution", "true",
        "-skipAlreadyTranscribedDocs", "true",
        "-emissionEngine", self.job.settings.ocular_emission_engine,
    ]
    extra = self.extra_command_parameters
    if extra:
        cmd = cmd + extra

    proc = exec_cmd(cmd, realtime=True)
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode
        )

    # Derive each page's expected output paths from its image name and
    # record only the files that were actually produced.
    for page_job in self.job.jobs:
        stem = os.path.splitext(os.path.basename(page_job.image_path))[0]
        txt_path = os.path.join(
            self.transcribed_output_path,
            "%s%s" % (stem, self.ocr_text_suffix)
        )
        alto_path = os.path.join(
            self.transcribed_output_path,
            "%s.alto.xml" % stem
        )
        if os.path.isfile(txt_path):
            page_job.page_result.ocr_text_path = txt_path
        if os.path.isfile(alto_path):
            page_job.page_result.ocr_xml_path = alto_path

    # Add extra transfers
    if os.path.isdir(self.transcription_dir):
        self.job.extra_transfers.append(self.transcription_dir)

    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self):
    """Train an Ocular font and record the resulting model files.

    The input document list is generated first. After a successful run,
    the output font, language model and glyph substitution model paths are
    stored on ``self.job.font_training_result`` for each file that exists.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the input font path is unset or not a file, and the
        command's own exit code when Ocular fails.
    """
    self.generate_input_doc_list()

    font_path = self.input_font_path
    if not font_path:
        return self.results(
            stdout=None,
            stderr="No input font path could be determined",
            exitcode=1
        )
    if not os.path.isfile(font_path):
        return self.results(
            stdout=None,
            stderr="Could not find input font path %s" % font_path,
            exitcode=1
        )

    # Make sure the directory Ocular writes into exists.
    out_dir = self.output_path
    if not os.path.isdir(out_dir):
        mkdirs_exists_ok(out_dir)

    cmd = [
        "java", self.java_max_heap,
        "-Done-jar.main.class=edu.berkeley.cs.nlp.ocular.main.TrainFont",
        "-jar", self.jar,
        "-outputPath", out_dir,
        "-inputDocListPath", self.input_doc_list_path,
        "-inputFontPath", font_path,
        "-inputLmPath", self.input_lm_path,
        "-inputGsmPath", self.input_gsm_path,
        "-outputFontPath", self.output_font_path,
        "-outputLmPath", self.output_lm_path,
        "-outputGsmPath", self.output_gsm_path,
        "-continueFromLastCompleteIteration", "true",
        "-allowGlyphSubstitution", "true",
        "-updateLM", "true",
        "-updateGsm", "true",
        "-emissionEngine", self.job.settings.ocular_emission_engine,
    ]
    extra = self.extra_command_parameters
    if extra:
        cmd = cmd + extra

    proc = exec_cmd(cmd, realtime=True)
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode
        )

    # font_training_result is a per-work result, so it is set once on this
    # job rather than on every page.
    trained_outputs = (
        ("font_path", self.output_font_path),
        ("language_model_path", self.output_lm_path),
        ("glyph_substitution_model_path", self.output_gsm_path),
    )
    for attr_name, produced_path in trained_outputs:
        if os.path.isfile(produced_path):
            setattr(self.job.font_training_result, attr_name, produced_path)

    return self.results(stdout=None, stderr=None, exitcode=0)
def current_job_count(self):
    """Get count of this application's active jobs

    The current jobs are those that are Running+Pending.

    Example command used:
        squeue -r --noheader -p idhmc -n emop-controller

    The count is the number of lines squeue prints to stdout; the
    command's exit code is not inspected.

    Returns:
        int: The number of current jobs
    """
    settings = self.settings
    squeue_cmd = [
        "squeue", "-r", "--noheader",
        "-p", settings.scheduler_queue,
        "-n", settings.scheduler_job_name,
    ]
    squeue = exec_cmd(squeue_cmd, log_level="debug")
    return len(squeue.stdout.splitlines())
def run(self):
    """Run the PageCorrector jar on the job's XML file.

    On success the jar's stdout must be valid JSON; it is stored verbatim
    (as a string) in ``postproc_result.pp_health`` and the corrected
    text/XML paths are recorded on ``page_result``.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the XML file is missing or the output is not valid JSON, and
        the command's own exit code when the command fails.
    """
    xml_path = self.job.xml_file
    if not (xml_path and os.path.isfile(xml_path)):
        return self.results(
            stdout=None,
            stderr="Could not find XML file: %s" % xml_path,
            exitcode=1
        )

    # NOTE(review): dict_files is a list nested inside cmd; presumably
    # exec_cmd flattens nested lists - confirm.
    dict_files = glob.glob("%s/*.dict" % self.dicts_dir)
    cmd = [
        "java", self.java_args, "-jar", self.executable,
        "--dbconf", self.cfg,
        "-t", self.rules_file,
        "-o", self.job.output_dir,
        "--stats",
        "--alt", self.alt_arg,
        "--max-transforms", self.max_transforms,
        "--noiseCutoff", self.noise_cutoff,
        "--dict", dict_files
    ]

    # Options that carry a value are only passed when the value is set.
    valued_options = (
        ("--ctx-min-match", self.ctx_min_match),
        ("--ctx-min-vol", self.ctx_min_vol),
    )
    for flag, setting in valued_options:
        if setting:
            cmd.extend([flag, setting])

    # Plain switches.
    for flag, enabled in (("--dump", self.dump), ("--save", self.save)):
        if enabled:
            cmd.append(flag)

    cmd.extend(["--", self.job.xml_file])

    proc = exec_cmd(cmd, timeout=self.timeout)
    if proc.exitcode != 0:
        # TODO: PageCorrector errors are going to stdout not stderr
        use_stderr = not proc.stdout and proc.stderr
        failure_text = proc.stderr if use_stderr else proc.stdout
        return self.results(
            stdout=proc.stdout, stderr=failure_text, exitcode=proc.exitcode
        )

    payload = proc.stdout.strip()
    # The output is only validated here; the raw string is what gets stored.
    try:
        json.loads(payload)
    except ValueError:
        message = "PageCorrector Error: output is not valid JSON: %s" % payload
        return self.results(stdout=None, stderr=message, exitcode=1)

    self.job.postproc_result.pp_health = payload
    self.job.page_result.corr_ocr_text_path = self.job.alto_txt_file
    self.job.page_result.corr_ocr_xml_path = self.job.alto_xml_file
    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self, postproc):
    """Compare the page's OCR text against its ground truth with JuxtaCompare.

    Nothing is executed (and success is reported) when the page has no
    ground truth. Otherwise the jar is run in ``-diff`` mode on the ground
    truth file and the selected OCR text file, and its stripped output is
    parsed as a float.

    Args:
        postproc (bool): When true, compare ``job.alto_txt_file`` and store
            the score in ``page_result.juxta_change_index``. When false,
            compare ``job.idhmc_txt_file``; the score is parsed but not
            stored.

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the input file is missing or the output cannot be parsed,
        and the command's own exit code when the command fails.
    """
    if not self.job.page.hasGroundTruth():
        return self.results(stdout=None, stderr=None, exitcode=0)

    if postproc:
        input_file = self.job.alto_txt_file
    else:
        input_file = self.job.idhmc_txt_file
    if not input_file or not os.path.isfile(input_file):
        stderr = "Could not find JuxtaCompare input file: %s" % input_file
        return self.results(stdout=None, stderr=stderr, exitcode=1)

    cmd = [
        "java", "-Xms128M", "-Xmx128M", "-jar", self.executable,
        "-diff", self.job.page.ground_truth_file, input_file,
        "-algorithm", self.jx_algorithm,
        "-hyphen", "none",
    ]
    proc = exec_cmd(cmd)
    if proc.exitcode != 0:
        # TODO: juxta-cl.jar errors are going to stdout not stderr
        if not proc.stdout and proc.stderr:
            stderr = proc.stderr
        else:
            stderr = proc.stdout
        return self.results(stdout=proc.stdout, stderr=stderr, exitcode=proc.exitcode)

    out = proc.stdout.strip()
    # Handle invalid values returned by Juxta
    # NOTE(review): the sentinel is the string '-1' while valid values are
    # floats - confirm consumers accept both before normalizing.
    if out == 'NaN':
        value = '-1'
    else:
        try:
            value = float(out)
        except ValueError:
            # Empty or non-numeric output: report a failed result instead
            # of letting the exception escape.
            stderr = "JuxtaCompare Error: unexpected response format: %s" % out
            return self.results(stdout=None, stderr=stderr, exitcode=1)

    # Only the post-processed comparison is recorded; the value from a
    # non-postproc run is discarded.
    if postproc:
        self.job.page_result.juxta_change_index = value
    return self.results(stdout=None, stderr=None, exitcode=0)
def run(self):
    """Run the MultiColumnSkew script on the job's IDHMC XML file.

    The script is executed with ``python`` and must print JSON. The
    ``multicol`` and ``skew_idx`` entries of that JSON are stored on
    ``postproc_result`` (``None`` when an entry is absent).

    Returns:
        The value of ``self.results(...)``: exitcode 0 on success, exitcode
        1 when the XML file is missing or the output is not valid JSON, and
        the command's own exit code when the script fails.
    """
    xml_path = self.job.idhmc_xml_file
    if not (xml_path and os.path.isfile(xml_path)):
        return self.results(
            stdout=None,
            stderr="Could not find XML file: %s" % xml_path,
            exitcode=1
        )

    proc = exec_cmd(["python", self.executable, xml_path])
    if proc.exitcode != 0:
        return self.results(
            stdout=proc.stdout, stderr=proc.stderr, exitcode=proc.exitcode
        )

    payload = proc.stdout.strip()
    try:
        parsed = json.loads(payload)
    except ValueError:
        message = "MultiColumnSkew Error: output is not valid JSON: %s" % payload
        return self.results(stdout=None, stderr=message, exitcode=1)

    postproc_result = self.job.postproc_result
    postproc_result.multicol = parsed.get("multicol")
    postproc_result.skew_idx = parsed.get("skew_idx")
    return self.results(stdout=None, stderr=None, exitcode=0)