Пример #1
0
def kill_job(api, job_id):
    """Cancel a job with ``scancel`` and block until it is no longer alive.

    Args:
        api: Handle forwarded unchanged to ``get_job``.
        job_id: Identifier of the job; interpolated into the ``scancel``
            command line.

    Returns:
        None. Progress is reported with ``print``.
    """
    # States in which the job is neither queued nor running any more.
    dead_states = ("CANCELLED", "COMPLETED", "FAILED", "TIMEOUT")

    job = get_job(api, job_id)
    if job["state"] in dead_states:
        print('%s is already dead' % job_id)
        return

    kill_command = "scancel %s" % job_id
    while True:
        try:
            hu.subprocess_call(kill_command)
            print('%s CANCELLING...' % job_id)
        except Exception as e:
            if "Socket timed out" in str(e):
                print("scancel time out and retry now")
                time.sleep(1)
                continue
            # Other failures are not retried, but they are reported instead
            # of being dropped silently; the state poll below decides
            # whether the job is actually gone.
            print("scancel failed for %s: %s" % (job_id, e))
        break

    # Confirm the job is gone. Any terminal state ends the wait: a job that
    # finishes or fails on its own before the cancel lands never becomes
    # "CANCELLED", and waiting for that exact state would never return.
    job = get_job(api, job_id)
    while job["state"] not in dead_states:
        time.sleep(2)
        job = get_job(api, job_id)

    print('%s now is dead.' % job_id)
Пример #2
0
def kill_job(job_id):
    """Run ``scancel`` for *job_id*, retrying until the call stops raising.

    Every exception raised by the call is treated the same way: a message is
    printed, the function sleeps one second, and the command is issued again.
    Returns None.
    """
    kill_command = "scancel %s" % job_id
    cancelled = False
    while not cancelled:
        try:
            # scancel gives no return message, so the output is ignored
            hu.subprocess_call(kill_command)
        except Exception:
            print("scancel time out and retry now")
            time.sleep(1)
        else:
            cancelled = True
Пример #3
0
    def launch_job(self, exp_dict, savedir, command, job=None):
        """Submit one job and persist its metadata under *savedir*.

        Writes ``exp_dict.json`` into *savedir*, copies ``self.workdir`` to
        ``<savedir>/code``, runs the command built by
        ``hju.get_job_command`` and stores the resulting job dict at the
        path given by ``hju.get_job_fname(savedir)``.

        Args:
            exp_dict: Experiment description saved as ``exp_dict.json``.
            savedir: Directory receiving the json files and the code copy.
            command: Command handed to ``hju.get_job_command``.
            job: Optional job checked with ``self._assert_no_duplicates``.

        Returns:
            dict with the keys ``job_id``, ``started at (Montreal)`` and
            ``command``.
        """
        add_job_utils()
        import haven_jobs_utils as hju

        # Refuse to continue when the given job is a duplicate
        if job is not None:
            assert self._assert_no_duplicates(job)

        exp_dict_path = os.path.join(savedir, "exp_dict.json")
        hu.save_json(exp_dict_path, exp_dict)

        # The job works on a private copy of the code inside savedir
        code_dir = os.path.join(savedir, "code")
        hu.copy_code(self.workdir + "/", code_dir)

        # Build the submission command and run it; the output, with
        # newlines removed, is used as the job id
        submit_cmd = hju.get_job_command(self.job_config,
                                         command,
                                         savedir,
                                         workdir=code_dir)
        raw_output = hu.subprocess_call(submit_cmd)
        job_id = raw_output.replace("\n", "")

        if self.verbose:
            print("Job_id: %s command: %s" % (job_id, command))

        job_dict = {}
        job_dict["job_id"] = job_id
        job_dict["started at (Montreal)"] = hu.time_to_montreal()
        job_dict["command"] = command

        hu.save_json(hju.get_job_fname(savedir), job_dict)
        return job_dict
Пример #4
0
def get_existing_slurm_job_commands(exp_list, savedir_base):
    """Collect the commands of experiments whose job is running or pending.

    For every experiment, ``job_dict.json`` is looked up in
    ``<savedir_base>/<hash of exp_dict>``. Experiments without that file are
    skipped. For the others, the ``JobState`` field of
    ``scontrol show job <job_id>`` decides whether the stored command is
    included.

    Args:
        exp_list: Iterable of experiment dicts.
        savedir_base: Directory containing one sub-directory per experiment.

    Returns:
        list of the ``command`` entries of the matching job dicts.
    """
    active_states = ("RUNNING", "PENDING")
    commands = []
    for exp_dict in exp_list:
        job_file = os.path.join(savedir_base, hu.hash_dict(exp_dict),
                                "job_dict.json")
        if os.path.exists(job_file):
            job_dict = hu.load_json(job_file)
            query = "scontrol show job %s" % job_dict["job_id"]
            scontrol_output = hu.subprocess_call(query)
            # The state is the text between "JobState=" and the next space
            state = scontrol_output.split("JobState=")[1].split(" ")[0]
            if state in active_states:
                commands.append(job_dict["command"])

    return commands
Пример #5
0
def get_jobs(user_name):
    """Return the ``get_job`` record of every job ``squeue`` lists for a user.

    Args:
        user_name: Value passed to ``squeue --user``.

    Returns:
        list with one ``get_job(job_id)`` result per listed job, in the
        order ``squeue`` printed them.

    Any exception raised while listing or querying the jobs triggers a
    one-second sleep and a retry of the whole listing.
    """
    command = "squeue --user=%s" % user_name
    while True:
        try:
            squeue_output = hu.subprocess_call(command)
            # The first line is the squeue header. Blank lines (such as the
            # one produced by a trailing newline) are skipped; otherwise an
            # empty string would be handed to get_job as a job id.
            job_ids = [
                line.lstrip().split(" ")[0]
                for line in squeue_output.split("\n")[1:]
                if line.strip()
            ]
            result = [get_job(job_id) for job_id in job_ids]
        except Exception:
            print("scontrol time out and retry now")
            time.sleep(1)
            continue
        break
    return result
Пример #6
0
def get_job(job_id):
    """Get job information.

    Runs ``scontrol show job <job_id>`` and turns every space-separated
    ``key=value`` token of the output into a dictionary entry.

    Args:
        job_id: Identifier interpolated into the ``scontrol`` command.

    Returns:
        dict mapping each key to its value as a string. A value keeps
        everything after the first ``=``, so nested values such as
        ``TRES=cpu=1,mem=4G`` are preserved in full. If a key occurs more
        than once, the last occurrence wins.

    The command is retried every second for as long as it raises.
    """
    command = "scontrol show job %s" % job_id
    while True:
        try:
            scontrol_output = hu.subprocess_call(command)
        except Exception:
            print("scontrol time out and retry now")
            time.sleep(1)
            continue
        break

    # Parsing happens outside the retry loop: only the command itself is
    # worth retrying.
    job_info = {}
    for token in scontrol_output.replace("\n", "").split(" "):
        if "=" in token:
            # Split at the first "=" only, so values containing "=" stay whole
            key, _, value = token.partition("=")
            job_info[key] = value
    return job_info
Пример #7
0
def get_job(job_id):
    """Get job information.

    Runs ``scontrol show job <job_id>`` and parses the space-separated
    ``key=value`` tokens of its output.

    Args:
        job_id: Identifier interpolated into the ``scontrol`` command.

    Returns:
        dict of field name to field value (both strings). Values are split
        at the first ``=`` only, so a value that itself contains ``=`` is
        returned whole.

    The command is retried every second for as long as it raises an
    ``Exception``.
    """
    command = "scontrol show job %s" % job_id
    job_info = ''
    while True:
        try:
            job_info = hu.subprocess_call(command)
        except Exception:
            # Not a bare "except:": KeyboardInterrupt and SystemExit must be
            # able to leave this otherwise endless retry loop.
            print("scontrol time out and retry now")
            time.sleep(1)
            continue
        break

    pairs = (v.split('=', 1)
             for v in job_info.replace('\n', '').split(' ') if '=' in v)
    return {key: value for key, value in pairs}
Пример #8
0
def get_jobs(api, account_id):
    """List id and state of the jobs ``squeue`` reports for the current user.

    Args:
        api: Not used by this function.
        account_id: Not used by this function.

    Returns:
        list of dicts with the keys ``job_id`` and ``state``, one per output
        line between the header and the final line. The list is empty when
        the command fails with anything other than a socket timeout.
    """
    command = "squeue --user=%s --format=\"%%.18i %%.8T\"" % getpass.getuser()

    squeue_output = ""
    while True:
        try:
            squeue_output = hu.subprocess_call(command)
            break
        except Exception as e:
            # Only socket timeouts are retried; other errors end the loop
            # with whatever output was collected so far (none).
            if "Socket timed out" not in str(e):
                break
            print("squeue time out and retry now")
            time.sleep(1)

    result = []
    # Drop the header row and the piece after the last newline
    for row in squeue_output.split('\n')[1:-1]:
        columns = row.split()
        result.append({"job_id": columns[0], "state": columns[1]})
    return result
Пример #9
0
def get_jobs_dict(api, job_id_list, query_size=20):
    """Query ``sacct`` for a set of jobs and index the result by job id.

    Args:
        api: Not used by this function.
        job_id_list: Sized collection of job ids; each one is converted
            with ``str`` and the results are joined with commas for
            ``sacct --jobs``.
        query_size: Not used by this function.

    Returns:
        dict mapping each job id (as printed by ``sacct``) to a dict with
        the keys ``runs`` (always ``""``), ``cpuTime`` and ``state``. Rows
        whose job id contains a dot are dropped, and any state starting
        with ``CANCELLED`` is reported as ``"CANCELLED"``. An empty
        *job_id_list* yields ``{}`` without running any command.

    Raises:
        Exception: whatever ``hu.subprocess_call`` raises, unless the
            message contains "Socket timed out", in which case the call is
            retried after one second.
    """
    if len(job_id_list) == 0:
        return {}

    # Join the ids explicitly instead of slicing the repr of the list, which
    # depends on the container type and puts quotes around string ids.
    job_ids = ",".join(str(job_id) for job_id in job_id_list)
    command = "sacct --jobs=%s --format=jobid,cputime,state" % job_ids
    while True:
        try:
            job_list = hu.subprocess_call(command)
        except Exception as e:
            if "Socket timed out" in str(e):
                print("sacct time out and retry now")
                time.sleep(1)
                continue
            # Surface the real failure; swallowing it would only lead to an
            # UnboundLocalError on job_list below.
            raise
        break

    lines = job_list.split('\n')
    header = lines[0].split()
    # Skip the header, the line following it, and the piece after the last
    # newline
    lines = [l.split() for l in lines[2:-1]]

    df = pd.DataFrame(data=lines, columns=header)
    df = df[~df["JobID"].str.contains(r"\.")]
    df = df.rename(mapper={
        "State": "state",
        "CPUTime": "cpuTime",
        "JobID": "job_id"
    },
                   axis=1)
    df = df.replace({"state": r"CANCELLED.*"}, {"state": "CANCELLED"},
                    regex=True)
    df.insert(loc=0, column="runs", value="")

    # use job id as key
    new_df = df.drop(labels="job_id", axis=1)
    new_df.index = df["job_id"].to_list()
    return new_df.to_dict(orient="index")
Пример #10
0
def submit_job(api,
               account_id,
               command,
               job_config,
               workdir,
               savedir_logs=None):
    """Write a temporary sbatch script, submit it and return the job id.

    The script is saved as ``<savedir_logs>/bash.sh`` and removed again once
    submission has finished, whether it succeeded or failed.

    Args:
        api: Not used by this function.
        account_id: Value of the ``--account`` directive.
        command: Shell command appended after the ``#SBATCH`` directives.
        job_config: Mapping of sbatch option name to value; each entry
            becomes one ``#SBATCH --<name>=<value>`` line.
        workdir: Not used by this function.
        savedir_logs: Directory for ``logs.txt``, ``err.txt`` and the
            temporary script; ``<savedir_logs>/code`` is used as the job's
            working directory. It is passed to ``os.path.join``, so a real
            path must be supplied despite the ``None`` default.

    Returns:
        The last whitespace-separated token of the ``sbatch`` output.

    Raises:
        Exception: whatever the submission raises, unless the message
            contains "Socket timed out", in which case it is retried after
            one second.
    """
    # read slurm setting
    lines = "#! /bin/bash \n"
    lines += "#SBATCH --account=%s \n" % account_id
    for key in job_config:
        lines += "#SBATCH --%s=%s \n" % (key, job_config[key])
    path_log = os.path.join(savedir_logs, "logs.txt")
    lines += "#SBATCH --output=%s \n" % path_log
    path_err = os.path.join(savedir_logs, "err.txt")
    lines += "#SBATCH --error=%s \n" % path_err
    path_code = os.path.join(savedir_logs, "code")
    lines += "#SBATCH --chdir=%s \n" % path_code

    lines += command

    file_name = os.path.join(savedir_logs, "bash.sh")
    hu.save_txt(file_name, lines)

    # launch the exp
    submit_command = "sbatch %s" % file_name
    try:
        while True:
            try:
                job_id = hu.subprocess_call(submit_command).split()[-1]
            except Exception as e:
                if "Socket timed out" in str(e):
                    print("slurm time out and retry now")
                    time.sleep(1)
                    continue
                # Propagate the real error; swallowing it would only lead
                # to an UnboundLocalError on job_id at the return below.
                raise
            break
    finally:
        # delete the bash.sh, also when the submission failed
        os.remove(file_name)

    return job_id
Пример #11
0
def submit_job(command, savedir):
    """Submit *command* through ``sbatch`` and record the job in *savedir*.

    The sbatch script is assembled from ``job_configs.ACCOUNT_ID`` and
    ``job_configs.JOB_CONFIG``, written to ``<savedir>/bash.sh``, submitted
    (retrying every second while the call raises) and deleted afterwards.
    The command and job id are stored in ``<savedir>/job_dict.json``.

    Args:
        command: Shell command placed after the ``#SBATCH`` directives.
        savedir: Directory for ``logs.txt``, ``err.txt``, the temporary
            script and ``job_dict.json``.

    Returns:
        The last whitespace-separated token of the ``sbatch`` output.
    """
    # Assemble the script: shebang, account, one directive per configured
    # option, the log destinations, and finally the command itself
    script_parts = [
        "#! /bin/bash \n",
        "#SBATCH --account=%s \n" % job_configs.ACCOUNT_ID,
    ]
    for option, value in job_configs.JOB_CONFIG.items():
        script_parts.append("#SBATCH --%s=%s \n" % (option, value))
    script_parts.append("#SBATCH --output=%s \n" %
                        os.path.join(savedir, "logs.txt"))
    script_parts.append("#SBATCH --error=%s \n" %
                        os.path.join(savedir, "err.txt"))
    script_parts.append(command)

    script_path = os.path.join(savedir, "bash.sh")
    hu.save_txt(script_path, "".join(script_parts))

    # Submit, retrying for as long as the call raises
    submit_command = "sbatch %s" % script_path
    while True:
        try:
            job_id = hu.subprocess_call(submit_command).split()[-1]
        except Exception:
            print("slurm time out and retry now")
            time.sleep(1)
        else:
            break

    # Remember which command belongs to which job id
    hu.save_json(os.path.join(savedir, "job_dict.json"),
                 {"command": command, "job_id": job_id})

    # The script is only needed for the submission itself
    os.remove(script_path)

    return job_id
Пример #12
0
    lines = ("#! /bin/bash \n"
             "#SBATCH --account=%s \n"
             "#SBATCH --time=%s \n"
             "#SBATCH --mem-per-cpu=%s \n"
             "%s") % (
                 account,
                 time,
                 mem_cpu,
                 command,
             )
    hu.save_txt(file_name, lines)


if __name__ == "__main__":
    # The slurm batch script to submit is given on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch", required=True)
    args = parser.parse_args()

    # Step 1: submit the script with `hu.subprocess_call`; the job id is
    # taken from the last token of the sbatch output.
    submit_command = "sbatch %s" % args.batch
    sbatch_output = hu.subprocess_call(submit_command)
    job_id = sbatch_output.split()[-1]

    # Step 2: ask squeue for the status of that job.
    get_command = "squeue --job %s" % job_id
    job_status = hu.subprocess_call(get_command)

    # Step 3: cancel the job (scancel gives no return message).
    kill_command = "scancel %s" % job_id
    info = hu.subprocess_call(kill_command)