def check_links_status(self, fail_running=False, fail_pending=False):
    """Check the status of all the jobs run from the `Link` objects
    in this `Chain` and return a status flag that summarizes that.

    Parameters
    ----------
    fail_running : `bool`
        If True, consider running jobs as failed
    fail_pending : `bool`
        If True, consider pending jobs as failed

    Returns
    -------
    status : `JobStatus`
        Job status flag that summarizes the status of all the jobs
    """
    # NOTE(fix): original docstring opened with four quotes (""""), leaving a
    # stray quote character inside the docstring text; corrected to """.
    # Tally each link's top-level job status into a vector of counts.
    status_vector = JobStatusVector()
    for link in self._links.values():
        key = JobDetails.make_fullkey(link.full_linkname)
        link_status = link.check_job_status(key,
                                            fail_running=fail_running,
                                            fail_pending=fail_pending)
        status_vector[link_status] += 1
    # Collapse the per-status counts into a single summary flag.
    return status_vector.get_status()
def run_with_log(self, dry_run=False, stage_files=True, resubmit_failed=False):
    """Runs this link with output sent to a pre-defined logfile

    Parameters
    ----------
    dry_run : bool
        Print command but do not run it.
    stage_files : bool
        Copy files to and from scratch staging area.
    resubmit_failed : bool
        Flag for sub-classes to resubmit failed jobs.
    """
    fullkey = JobDetails.make_fullkey(self.full_linkname)
    job_details = self.jobs[fullkey]
    odir = os.path.dirname(job_details.logfile)
    # FIX: the original wrapped makedirs in a bare `except OSError: pass`,
    # which also hid permission errors; exist_ok=True only tolerates the
    # directory already existing.  Guard against an empty dirname (logfile
    # with no directory component), for which makedirs would raise.
    if odir:
        os.makedirs(odir, exist_ok=True)
    # FIX: the original opened the logfile without ever closing it, leaking
    # the handle if self.run() raised.  A context manager guarantees closure.
    with open(job_details.logfile, 'w') as ostream:
        self.run(ostream, dry_run, stage_files, resubmit_failed)
def _register_self(self, logfile, key=JobDetails.topkey, status=JobStatus.unknown):
    """Runs this link, captures output to logfile, and records
    the job in self.jobs"""
    fullkey = JobDetails.make_fullkey(self.full_linkname, key)
    try:
        # Already registered under this key: just refresh the status.
        self.jobs[fullkey].status = status
    except KeyError:
        # First registration for this key: create a new job record.
        self._register_job(key, self.args, logfile, status)
def _set_status_self(self, key=JobDetails.topkey, status=JobStatus.unknown):
    """Set the status of this job, both in self.jobs and in
    the `JobArchive` if it is present."""
    fullkey = JobDetails.make_fullkey(self.full_linkname, key)
    if fullkey not in self.jobs:
        # Job not yet registered: register it with a placeholder logfile.
        self._register_self('dummy.log', key, status)
        return
    self.jobs[fullkey].status = status
    if self._job_archive:
        # Keep the persistent archive in sync with the in-memory record.
        self._job_archive.register_job(self.jobs[fullkey])
def print_status(self, indent="", recurse=False):
    """Print a summary of the job status for each `Link` in this `Chain`"""
    row_fmt = "%s%30s : %15s : %20s"
    print(row_fmt % (indent, "Linkname", "Link Status", "Jobs Status"))
    for link in self._links.values():
        # Chain-like links expose check_status; simple links do not.
        status_vect = None
        if hasattr(link, 'check_status'):
            status_vect = link.check_status(stream=sys.stdout,
                                            no_wait=True,
                                            do_print=False)
        key = JobDetails.make_fullkey(link.full_linkname)
        link_status = JOB_STATUS_STRINGS[link.check_job_status(key)]
        if status_vect is None:
            jobs_status = JOB_STATUS_STRINGS[link.check_jobs_status()]
        else:
            jobs_status = status_vect
        print(row_fmt % (indent, link.linkname, link_status, jobs_status))
        if recurse and hasattr(link, 'print_status'):
            # Recurse into sub-chains, indenting their status tables.
            print("---------- %30s -----------" % link.linkname)
            link.print_status(indent + " ", recurse=True)
            print("------------------------------------------------")
def preconfigure(self, config_yaml):
    """ Run any links needed to build files
    that are used in _map_arguments """
    if self._preconfigured:
        return

    # Pull the pieces we need out of the YAML configuration.
    config_dict = load_yaml(config_yaml)
    ttype = config_dict.get('ttype')
    self.link_prefix = "%s." % ttype
    config_template = config_dict.get('config_template', None)
    rosters = config_dict.get('rosters')
    alias_dict = config_dict.get('alias_dict', None)
    spatial_models = config_dict.get('spatial_models')
    sims = config_dict.get('sims', {})

    # Collect the names of all requested simulations.
    sim_names = list(sims.keys())
    if 'random' in config_dict:
        sim_names.append('random')

    self._set_link('prepare-targets', PrepareTargets,
                   ttype=ttype,
                   rosters=rosters,
                   spatial_models=spatial_models,
                   alias_dict=alias_dict,
                   sims=sim_names,
                   config=config_template)

    link = self['prepare-targets']
    key = JobDetails.make_fullkey(link.full_linkname)
    if not link.jobs:
        raise ValueError("No Jobs")

    link_status = link.check_job_status(key)
    if link_status == JobStatus.done:
        # Targets already prepared; nothing to run.
        self._preconfigured = True
        return
    if link_status == JobStatus.failed:
        # Clear out the failed jobs so the link can be rerun cleanly.
        link.clean_jobs()
    link.run_with_log()
    self._preconfigured = True
def print_status(self, indent="", recurse=False):
    """Print a summary of the job status for each `Link` in this `Chain`"""
    print("%s%30s : %15s : %20s" %
          (indent, "Linkname", "Link Status", "Jobs Status"))
    for link in self._links.values():
        # Only chain-like links provide check_status; fall back to None.
        status_vect = (link.check_status(stream=sys.stdout,
                                         no_wait=True,
                                         do_print=False)
                       if hasattr(link, 'check_status') else None)
        full_key = JobDetails.make_fullkey(link.full_linkname)
        link_state = JOB_STATUS_STRINGS[link.check_job_status(full_key)]
        if status_vect is None:
            jobs_state = JOB_STATUS_STRINGS[link.check_jobs_status()]
        else:
            jobs_state = status_vect
        print("%s%30s : %15s : %20s" %
              (indent, link.linkname, link_state, jobs_state))
        if hasattr(link, 'print_status') and recurse:
            # Print the sub-chain's table, indented one level deeper.
            print("---------- %30s -----------" % link.linkname)
            link.print_status(indent + " ", recurse=True)
            print("------------------------------------------------")
def _run_chain(self,
               stream=sys.stdout,
               dry_run=False,
               stage_files=True,
               force_run=False,
               resubmit_failed=False):
    """Run all the links in the chain

    Parameters
    ----------
    stream : `file`
        Stream to print to, must have 'write' function
    dry_run : bool
        Print commands but do not run them
    stage_files : bool
        Stage files to and from the scratch area
    force_run : bool
        Run jobs, even if they are marked as done
    resubmit_failed : bool
        Resubmit failed jobs
    """
    self._set_links_job_archive()
    failed = False

    # Set up scratch-area staging if a file stage is configured.
    if self._file_stage is not None:
        input_file_mapping, output_file_mapping = self._map_scratch_files(
            self.sub_files)
        if stage_files:
            self._file_stage.make_scratch_dirs(input_file_mapping, dry_run)
            self._file_stage.make_scratch_dirs(output_file_mapping, dry_run)
            self._stage_input_files(input_file_mapping, dry_run)

    for link in self._links.values():
        logfile = os.path.join('logs', "%s.log" % link.full_linkname)
        link._archive_self(logfile, status=JobStatus.unknown)
        key = JobDetails.make_fullkey(link.full_linkname)
        # Chain-like links need a one-shot status refresh before inspection.
        if hasattr(link, 'check_status'):
            link.check_status(stream, no_wait=True,
                              check_once=True, do_print=False)

        link_status = link.check_job_status(key)
        if link_status == JobStatus.done:
            if not force_run:
                print("Skipping done link", link.full_linkname)
                continue
        elif link_status == JobStatus.running:
            if not force_run and not resubmit_failed:
                print("Skipping running link", link.full_linkname)
                continue
        elif link_status in [JobStatus.failed, JobStatus.partial_failed]:
            if not resubmit_failed:
                print("Skipping failed link", link.full_linkname)
                continue

        print("Running link ", link.full_linkname)
        link.run_with_log(dry_run=dry_run, stage_files=False,
                          resubmit_failed=resubmit_failed)
        link_status = link.check_jobs_status()
        link._set_status_self(status=link_status)
        if link_status in [JobStatus.failed, JobStatus.partial_failed]:
            # FIX: "Stoping" typo corrected in the message below.
            print("Stopping chain execution at failed link %s" %
                  link.full_linkname)
            failed = True
            break
    # (Removed a large block of commented-out resubmission code here.)

    # Copy outputs back from scratch only if nothing failed.
    if self._file_stage is not None and stage_files and not failed:
        self._stage_output_files(output_file_mapping, dry_run)

    chain_status = self.check_links_status()
    print("Chain status: %s" % (JOB_STATUS_STRINGS[chain_status]))
    # FIX: compare against the named constant rather than the magic number 5.
    job_status = 0 if chain_status == JobStatus.done else -1

    self._write_status_to_log(job_status, stream)
    self._set_status_self(status=chain_status)

    if self._job_archive:
        self._job_archive.file_archive.update_file_status()
        self._job_archive.write_table_file()
def _run_chain(self,
               stream=sys.stdout,
               dry_run=False,
               stage_files=True,
               force_run=False,
               resubmit_failed=False):
    """Run all the links in the chain

    Parameters
    ----------
    stream : `file`
        Stream to print to, must have 'write' function
    dry_run : bool
        Print commands but do not run them
    stage_files : bool
        Stage files to and from the scratch area
    force_run : bool
        Run jobs, even if they are marked as done
    resubmit_failed : bool
        Resubmit failed jobs
    """
    self._set_links_job_archive()
    failed = False

    # Prepare the scratch staging area when a file stage is configured.
    if self._file_stage is not None:
        input_file_mapping, output_file_mapping = self._map_scratch_files(
            self.sub_files)
        if stage_files:
            self._file_stage.make_scratch_dirs(input_file_mapping, dry_run)
            self._file_stage.make_scratch_dirs(output_file_mapping, dry_run)
            self._stage_input_files(input_file_mapping, dry_run)

    for link in self._links.values():
        logfile = os.path.join('logs', "%s.log" % link.full_linkname)
        link._archive_self(logfile, status=JobStatus.unknown)
        key = JobDetails.make_fullkey(link.full_linkname)
        # Refresh sub-chain status once before deciding what to do.
        if hasattr(link, 'check_status'):
            link.check_status(stream, no_wait=True,
                              check_once=True, do_print=False)

        link_status = link.check_job_status(key)
        if link_status == JobStatus.done:
            if not force_run:
                print("Skipping done link", link.full_linkname)
                continue
        elif link_status == JobStatus.running:
            if not force_run and not resubmit_failed:
                print("Skipping running link", link.full_linkname)
                continue
        elif link_status in [JobStatus.failed, JobStatus.partial_failed]:
            if not resubmit_failed:
                print("Skipping failed link", link.full_linkname)
                continue

        print("Running link ", link.full_linkname)
        link.run_with_log(dry_run=dry_run, stage_files=False,
                          resubmit_failed=resubmit_failed)
        link_status = link.check_jobs_status()
        link._set_status_self(status=link_status)
        if link_status in [JobStatus.failed, JobStatus.partial_failed]:
            # FIX: "Stoping" typo corrected in the message below.
            print("Stopping chain execution at failed link %s" %
                  link.full_linkname)
            failed = True
            break
    # (Removed a large block of commented-out resubmission code here.)

    # Copy outputs back from scratch only if nothing failed.
    if self._file_stage is not None and stage_files and not failed:
        self._stage_output_files(output_file_mapping, dry_run)

    chain_status = self.check_links_status()
    print("Chain status: %s" % (JOB_STATUS_STRINGS[chain_status]))
    # FIX: compare against the named constant rather than the magic number 5.
    job_status = 0 if chain_status == JobStatus.done else -1

    self._write_status_to_log(job_status, stream)
    self._set_status_self(status=chain_status)

    if self._job_archive:
        self._job_archive.file_archive.update_file_status()
        self._job_archive.write_table_file()