Пример #1
0
 def submit_job(self, job):
     """Hand ``job`` to ``bsub`` and return the id parsed from its output.

     The command line ``limit_wrapper.sh <model_filename> <sig_strength>``
     is fed to ``bsub`` through ``_in``, targeting ``self.lsf_queue`` with
     the ``o`` option set to ``/dev/null``.

     Returns:
         The first group captured by ``self.lsf_submit_re`` when matched
         against the stdout of the ``bsub`` call.
     """
     # NOTE(review): `_in` and `call_args` look like the `sh` library's
     # API, where `_in` supplies the command's stdin -- confirm against
     # the file's imports.
     wrapper_cmd = "limit_wrapper.sh {0} {1}".format(
         job.model_filename, job.sig_strength)
     res = bsub(_in=wrapper_cmd, q=self.lsf_queue, o="/dev/null")

     submitted_input = res.call_args['in']
     logger.debug("submitting {0}".format(submitted_input))

     # A non-matching stdout makes `match` return None, so the attribute
     # access below raises AttributeError in that case.
     submit_match = self.lsf_submit_re.match(res.stdout)
     captured = submit_match.groups()
     return captured[0]
Пример #2
0
 def execute(self, job):
     """Submit ``job`` through ``bsub`` and report the outcome.

     The result starts out marked as failed and carries a copy of
     ``job.info`` as its details. Only jobs whose ``job.info["queue"]``
     equals ``"default"`` are submitted. The submission is flagged as
     successful when ``submit_expr`` is found in the bsub output; the
     first two captured groups are then stored in the details under
     ``"lsf_id"`` and ``"lsf_queue"``.

     Args:
         job: object exposing an ``info`` mapping with at least the
             ``"queue"`` key and, for the default queue, ``"command"``.

     Returns:
         The ``JobResult``. ``has_failed`` remains True when the queue is
         unknown or when the bsub output does not match ``submit_expr``.
     """
     super(LsfRunner, self).execute(job)
     result = JobResult(has_failed=True, details=job.info.copy())

     if job.info["queue"] != "default":
         result.error = "Job info has unknown queue."
         # Logger.warn is a deprecated alias; warning() is the supported
         # spelling and emits the same record.
         logger.warning(result.error)
         return result

     stderr = StringIO()
     report = self.get_report_filepath(job)
     # The command text is supplied via `_in` and anything bsub writes to
     # stderr is collected in the StringIO buffer via `_err`.
     result.output = bsub(
         "-o", report, _in=job.info["command"], _err=stderr).strip()
     result.error = stderr.getvalue().strip()
     # Pass values as logging arguments so formatting only happens when
     # the debug level is actually enabled.
     logger.debug("bsub output: %s", result.output)
     logger.debug("bsub error: %s", result.error)

     match = submit_expr.search(result.output)
     if match:
         result.has_failed = False
         result.details["lsf_id"] = match.group(1)
         result.details["lsf_queue"] = match.group(2)
     return result
Пример #3
0
def submit_jobs(jobs, args, log, timeout):
    """Create a case directory for each job and submit it to the batch system.

    For every job the case at ``args.case`` is copied to ``job.path``, the
    process changes into that directory, and the job is submitted with
    ``bsub`` (``args.batch == "LSF"``) or ``qsub`` (``args.batch == "PBS"``).
    The scheduler's job id is parsed from the command output and stored in
    ``job.jobid``. The original working directory is restored after every
    job, including when the submission raises.

    Args:
        jobs: iterable of job objects providing ``set_rootdir``, ``path``,
            ``nodes``, ``name`` and a writable ``jobid``.
        args: options object; reads ``path``, ``case``, ``batch``,
            ``account``, ``queue`` and (PBS only) ``force``.
        log: mutable mapping updated in place; uses the keys
            ``"num_active"``, ``"num_jobs"``, ``"num_errors"``,
            ``"errors"`` and ``"last_time"``.
        timeout: forwarded as ``_timeout`` to the submit command.

    A ``TimeoutException`` from the submit command is recorded in ``log``
    and the job is skipped; any other exception propagates to the caller.
    """
    log_message(1, ["Entering submit_jobs to schedule node tests"])
    main_dir = os.getcwd()

    for job in jobs:
        # Create job directory
        job.set_rootdir(args.path)
        cp(args.case, job.path)
        os.chdir(job.path)

        try:
            if args.batch == "LSF":
                # Imported lazily, as in the original, so the command is
                # only looked up when this scheduler is selected.
                from sh import bsub
                with open(os.path.join(job.path, "run_case.lsf"), 'r') as jf:
                    temp = bsub(_in=jf,
                                m=' '.join(job.nodes),
                                P=args.account,
                                q=args.queue,
                                _timeout=timeout)
                    # The id is the text between the first '<' and the
                    # next '>' in the bsub output.
                    job.jobid = temp.split('<')[1].split('>')[0]
                    log_message(
                        1, ["Job {} submitted with bsub".format(job.name)])
            elif args.batch == "PBS":
                from sh import qsub
                sel_hosts = "select=" + '+'.join(
                    "ncpus=36:mpiprocs=36:host={}".format(nid)
                    for nid in job.nodes)

                qsub_args = [
                    "-l", sel_hosts,
                    "-A", args.account,
                    "-q", args.queue,
                ]
                # With args.force set the job is submitted with -h, which
                # the log message below reports as "hold".
                if args.force:
                    qsub_args.append("-h")
                qsub_args.append(os.path.join(job.path, "run_case.pbs"))

                temp = qsub(*qsub_args, _timeout=timeout)

                # The id is everything before the first '.' in the output.
                job.jobid = temp.split('.')[0]
                log_message(1, [
                    "Job {} submitted with qsub (hold = {})".format(
                        job.name, args.force)
                ])

            # NOTE(review): this also runs when args.batch is neither
            # "LSF" nor "PBS", i.e. when nothing was submitted -- confirm
            # whether other values can reach this function.
            log["num_active"] += 1
        except TimeoutException:
            log_message(1,
                        ["Could not submit job {}, skipping".format(job.name)])
            log["errors"].append("   submit failed   - " + job.name)
            log["num_errors"] += 1
        finally:
            # Restore the working directory even if the submission raised
            # something other than TimeoutException, so a failure does not
            # leave the process inside the job directory.
            os.chdir(main_dir)

        log["num_jobs"] += 1

        # If it's been a while, check status
        if int(time.time() - log["last_time"]) >= 10:
            print_status(jobs, log)

        # NOTE(review): emitted once per job with the running count; if a
        # single summary was intended this belongs after the loop.
        log_message(1, ["Finished submitting {} jobs".format(log["num_jobs"])])