def submit_job(self, job):
    """Hand ``job`` to LSF through ``bsub`` and return its job id.

    The text given to ``bsub`` as ``_in`` is ``limit_wrapper.sh`` followed
    by ``job.model_filename`` and ``job.sig_strength``. The call also
    passes ``q=self.lsf_queue`` and ``o="/dev/null"``.

    Returns:
        The first group captured by ``self.lsf_submit_re`` when matched
        against the ``stdout`` of the ``bsub`` call.

    NOTE(review): if the pattern does not match, ``match`` returns ``None``
    and the attribute access below raises ``AttributeError`` -- confirm
    that callers expect this.
    """
    wrapper_command = "limit_wrapper.sh {0} {1}".format(
        job.model_filename, job.sig_strength)
    submission = bsub(_in=wrapper_command, q=self.lsf_queue, o="/dev/null")

    # Log what was actually handed to bsub, as recorded on the call itself.
    logger.debug("submitting {0}".format(submission.call_args['in']))

    id_match = self.lsf_submit_re.match(submission.stdout)
    return id_match.groups()[0]
def execute(self, job):
    """Submit ``job`` with ``bsub`` and describe the outcome.

    The parent class's ``execute`` is invoked first. Only jobs whose
    ``job.info["queue"]`` equals ``"default"`` are submitted; any other
    value yields a failed result carrying an explanatory error.

    For a submitted job, ``bsub`` is called with ``-o`` pointing at the
    path from ``self.get_report_filepath(job)`` and with
    ``job.info["command"]`` as ``_in``. Its stripped output is searched
    with ``submit_expr``; on a match the result is marked successful and
    groups 1 and 2 are stored as ``details["lsf_id"]`` and
    ``details["lsf_queue"]``.

    Returns:
        A ``JobResult`` whose ``details`` start as a copy of ``job.info``.
        ``has_failed`` stays ``True`` unless the ``bsub`` output matched
        ``submit_expr``.
    """
    super(LsfRunner, self).execute(job)
    result = JobResult(has_failed=True, details=job.info.copy())

    if job.info["queue"] != "default":
        result.error = "Job info has unknown queue."
        # Logger.warn is a deprecated alias; warning() is the supported name.
        logger.warning(result.error)
        return result

    stderr = StringIO()
    report = self.get_report_filepath(job)
    result.output = bsub(
        "-o", report, _in=job.info["command"], _err=stderr).strip()
    result.error = stderr.getvalue().strip()
    # Pass values as logger arguments so formatting only happens when the
    # debug level is enabled.
    logger.debug("bsub output: %s", result.output)
    logger.debug("bsub error: %s", result.error)

    match = submit_expr.search(result.output)
    if match:
        result.has_failed = False
        result.details["lsf_id"] = match.group(1)
        result.details["lsf_queue"] = match.group(2)
    # Without a match the result is returned still flagged as failed.
    return result
def submit_jobs(jobs, args, log, timeout):
    """Submit each job in ``jobs`` to the batch system named by ``args.batch``.

    For every job the root directory is set from ``args.path``, the case
    ``args.case`` is copied to ``job.path``, and the submission is made
    from inside ``job.path``. The original working directory is restored
    after each job, including when submission raises.

    Args:
        jobs: Iterable of job objects; each gets ``jobid`` assigned on a
            successful submission.
        args: Options object read for ``path``, ``case``, ``batch``
            (``"LSF"`` or ``"PBS"``), ``account``, ``queue`` and ``force``.
        log: Mutable mapping whose ``num_active``, ``num_errors``,
            ``num_jobs`` and ``errors`` entries are updated; ``last_time``
            is read to decide when to print status.
        timeout: Forwarded to the submit command as ``_timeout``.

    A ``TimeoutException`` from the submit command is recorded in ``log``
    and the job is skipped; any other exception propagates.
    """
    log_message(1, ["Entering submit_jobs to schedule node tests"])
    main_dir = os.getcwd()

    for job in jobs:
        # Create job directory
        job.set_rootdir(args.path)
        cp(args.case, job.path)
        os.chdir(job.path)

        try:
            if args.batch == "LSF":
                from sh import bsub

                script_path = os.path.join(job.path, "run_case.lsf")
                with open(script_path, 'r') as script:
                    reply = bsub(_in=script, m=' '.join(job.nodes),
                                 P=args.account, q=args.queue,
                                 _timeout=timeout)

                # The job id is the text between the first '<' and the
                # following '>' in the bsub reply.
                job.jobid = reply.split('<')[1].split('>')[0]
                log_message(
                    1, ["Job {} submitted with bsub".format(job.name)])
            elif args.batch == "PBS":
                from sh import qsub

                sel_hosts = "select=" + '+'.join(
                    "ncpus=36:mpiprocs=36:host={}".format(nid)
                    for nid in job.nodes)

                # Both variants share every argument; "-h" is added only
                # when args.force is set (the log line reports it as "hold").
                qsub_args = ["-l", sel_hosts, "-A", args.account,
                             "-q", args.queue]
                if args.force:
                    qsub_args.append("-h")
                qsub_args.append(os.path.join(job.path, "run_case.pbs"))

                reply = qsub(*qsub_args, _timeout=timeout)
                # The job id is everything before the first '.' in the reply.
                job.jobid = reply.split('.')[0]
                log_message(1, [
                    "Job {} submitted with qsub (hold = {})".format(
                        job.name, args.force)
                ])

            # NOTE(review): an args.batch value other than "LSF"/"PBS" also
            # reaches this line without submitting anything -- confirm the
            # value is validated upstream.
            log["num_active"] += 1
        except TimeoutException:
            log_message(1,
                        ["Could not submit job {}, skipping".format(job.name)])
            log["errors"].append("   submit failed - " + job.name)
            log["num_errors"] += 1
        finally:
            # Always return to the starting directory so an unexpected
            # error cannot leave the process inside a job directory.
            os.chdir(main_dir)

        log["num_jobs"] += 1

        # If it's been a while, check status
        if int(time.time() - log["last_time"]) >= 10:
            print_status(jobs, log)

    log_message(1, ["Finished submitting {} jobs".format(log["num_jobs"])])