예제 #1
0
파일: batch_exec.py 프로젝트: HXTP/dpgen
def exec_batch(cmd,
               numb_node,
               work_thread,
               numb_gpu,
               task_dirs,
               task_args=None,
               time_limit="24:0:0",
               mem_limit=32,
               modules=None,
               sources=None):
    """Submit one Slurm job per task directory and block until all finish.

    For every directory in ``task_dirs`` a submission script named ``_sub``
    is written there (its text comes from ``make_slurm_script``) and wrapped
    in a ``SlurmJob``. Jobs are submitted only after every script has been
    written; their status is then polled, sleeping 10 seconds between passes.

    The process working directory is changed while the scripts are written
    and is always restored before returning or raising from that phase.

    Args:
        cmd: Forwarded to ``make_slurm_script``.
        numb_node: Forwarded to ``make_slurm_script``.
        work_thread: Forwarded to ``make_slurm_script``.
        numb_gpu: Forwarded to ``make_slurm_script``.
        task_dirs: Directories in which to create and run one job each.
            Relative paths are resolved against the working directory at
            call time. An empty sequence makes the function return at once.
        task_args: Optional sequence indexed like ``task_dirs``; element
            ``i`` is forwarded to ``make_slurm_script`` for directory ``i``.
            When ``None``, ``None`` is forwarded for every task.
        time_limit: Forwarded to ``make_slurm_script`` (format and units are
            defined by that helper -- presumably H:M:S, confirm there).
        mem_limit: Forwarded to ``make_slurm_script`` (units defined there).
        modules: Forwarded to ``make_slurm_script``.
        sources: Forwarded to ``make_slurm_script``.

    Returns:
        None, once every job reports ``JobStatus.finished``.

    Raises:
        RuntimeError: If any job reports ``JobStatus.terminated``.
    """
    start_dir = os.getcwd()
    fin_tag = 'tag_finished'
    job_list = []

    try:
        for ii, mydir in enumerate(task_dirs):
            os.chdir(mydir)
            myarg = task_args[ii] if task_args is not None else None
            with open('_sub', 'w') as fp:
                fp.write(
                    make_slurm_script(cmd, numb_node, work_thread, numb_gpu,
                                      myarg, time_limit, mem_limit, modules,
                                      sources, fin_tag))
            job_list.append(
                SlurmJob(os.getcwd(), '_sub', job_finish_tag=fin_tag))
            # Go back before the next iteration so relative entries in
            # task_dirs keep resolving against the starting directory.
            os.chdir(start_dir)
    finally:
        # Never leave the caller stranded inside a task directory, even if
        # script generation or job construction fails part-way through.
        os.chdir(start_dir)

    for job in job_list:
        job.submit()

    while True:
        all_finished = True
        # Every job is checked on every pass (no early break) so that a
        # terminated job is detected regardless of its position in the list.
        for job in job_list:
            stat = job.check_status()
            if stat == JobStatus.terminated:
                raise RuntimeError("find terminated job")
            if stat != JobStatus.finished:
                all_finished = False
        if all_finished:
            return
        time.sleep(10)
예제 #2
0
def exec_batch_group(cmd,
                     work_thread,
                     numb_gpu,
                     task_dirs_,
                     group_size=10,
                     task_args=None,
                     time_limit="24:0:0",
                     mem_limit=6,
                     modules=None,
                     sources=None):
    """Run tasks as grouped Slurm jobs and block until all groups finish.

    The task directories are split into consecutive chunks of at most
    ``group_size`` entries. For chunk ``i`` a directory ``group.%06d`` is
    created (if missing) inside the parent of the first task directory, a
    submission script named ``sub`` is written there (its text comes from
    ``make_slurm_script_group``) and wrapped in a ``SlurmJob``. Jobs are
    submitted only after every script has been written; their status is then
    polled, sleeping 10 seconds between passes.

    The process working directory is changed while the scripts are written
    and is always restored before returning or raising from that phase.

    Args:
        cmd: Forwarded to ``make_slurm_script_group``.
        work_thread: Forwarded to ``make_slurm_script_group``.
        numb_gpu: Forwarded to ``make_slurm_script_group``.
        task_dirs_: Task directories; converted to absolute paths before
            being chunked. An empty sequence makes the function return at
            once without creating anything.
        group_size: Maximum number of tasks per group job.
        task_args: Optional sequence with one entry per task directory,
            chunked in step with the directories. When ``None``, an empty
            string is used for every task.
        time_limit: Forwarded to ``make_slurm_script_group`` (format and
            units are defined by that helper -- presumably H:M:S, confirm).
        mem_limit: Forwarded to ``make_slurm_script_group`` (units defined
            there).
        modules: Forwarded to ``make_slurm_script_group``.
        sources: Forwarded to ``make_slurm_script_group``.

    Returns:
        None, once every job reports ``JobStatus.finished``.

    Raises:
        AssertionError: If ``task_args`` is given and its length differs
            from the number of task directories.
        RuntimeError: If any job reports ``JobStatus.terminated``.
    """
    # Nothing to run: mirror exec_batch, which returns for an empty task
    # list, instead of failing on task_dirs_[0] below.
    if len(task_dirs_) == 0:
        return

    start_dir = os.getcwd()
    fin_tag = 'tag_finished'

    task_dirs = [os.path.abspath(ii) for ii in task_dirs_]
    if task_args is None:
        task_args = [""] * len(task_dirs)
    else:
        assert len(task_dirs) == len(task_args), \
            "task_args must have one entry per task directory"

    ntasks = len(task_dirs)
    task_chunks = [
        task_dirs[i:i + group_size] for i in range(0, ntasks, group_size)
    ]
    args_chunks = [
        task_args[i:i + group_size] for i in range(0, ntasks, group_size)
    ]

    job_list = []
    try:
        # The group directories live next to the first task directory.
        # Entering it and stepping up (rather than string manipulation)
        # also verifies that the directory exists.
        os.chdir(task_dirs_[0])
        os.chdir('..')
        working_dir = os.getcwd()

        for ii, (dirs_chunk, args_chunk) in enumerate(
                zip(task_chunks, args_chunks)):
            os.chdir(working_dir)
            group_dir = "group.%06d" % ii
            os.makedirs(group_dir, exist_ok=True)
            os.chdir(group_dir)
            with open('sub', 'w') as fp:
                fp.write(
                    make_slurm_script_group(cmd, dirs_chunk, work_thread,
                                            numb_gpu, args_chunk, time_limit,
                                            mem_limit, modules, sources,
                                            fin_tag))
            # Build the job only after the script file has been closed.
            job_list.append(
                SlurmJob(os.getcwd(), 'sub', job_finish_tag=fin_tag))
    finally:
        # Never leave the caller stranded inside a group directory, even if
        # script generation or job construction fails part-way through.
        os.chdir(start_dir)

    for job in job_list:
        job.submit()

    while True:
        all_finished = True
        # Every job is checked on every pass (no early break) so that a
        # terminated job is detected regardless of its position in the list.
        for job in job_list:
            stat = job.check_status()
            if stat == JobStatus.terminated:
                raise RuntimeError("find terminated job")
            if stat != JobStatus.finished:
                all_finished = False
        if all_finished:
            return
        time.sleep(10)