def submit_hillshade_newest_headwall_line_grid_job(ids_list, idx,
                                                   grid_base_name,
                                                   max_job_count):
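    """Submit a SLURM job that draws the newest headwall lines on hillshade for one grid.

    Wait (via wait_if_reach_max_jobs) until fewer than max_job_count 'dLi' jobs
    are pending, prepare a per-grid working directory containing the grid id
    list and the job scripts, then submit the job (or run the script locally).
    If the directory already exists, skip submission when the job is already in
    the queue or when 'done.txt' marks it as finished.
    """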

    wait_if_reach_max_jobs(max_job_count, 'dLi')  # draw Line on hillshade

    job_name = 'dLi%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx,
                                  'hillshade_newest_headwall_line_',
                                  root=root_dir)
    if not os.path.isdir(work_dir):
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        ids_list = [str(item) for item in ids_list]
        io_function.save_list_to_txt(grid_base_name + '.txt', ids_list)

        # prepare job
        sh_list = [
            'hillshade_headwall_line_grid.sh',
            'job_hillshade_headwall_line_grid.sh'
        ]
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh(
            'job_hillshade_headwall_line_grid.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)
        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print(
                'The folder %s already exists and the job has been submitted, skip submitting a new job'
                % work_dir)
            return

        # job is completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is Done' % work_dir)
            return

    # submit the job
    # Note: sometimes job submission fails with "singularity: command not found"
    # and exits, which is odd; if that happens, try submitting the job from an
    # scompile node.
    submit_job_curc_or_run_script_local('job_hillshade_headwall_line_grid.sh',
                                        'hillshade_headwall_line_grid.sh')

    os.chdir(curr_dir_before_start)

def submit_extract_headwall_job(slope_tifs, idx, max_job_count):
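    """Submit a SLURM job that extracts headwalls from a list of slope GeoTIFFs.

    Wait until fewer than max_job_count 'HW' jobs are pending, prepare a
    working directory containing 'slope_tif_list.txt' and the job scripts,
    then submit the job (or run the script locally). If the directory already
    exists, skip submission when 'done.txt' is present or the job is already
    in the queue.
    """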

    wait_if_reach_max_jobs(max_job_count, 'HW')

    job_name = 'HW%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx, 'extract_headwall_', root=root_dir)
    if not os.path.isdir(work_dir):
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        io_function.save_list_to_txt('slope_tif_list.txt', slope_tifs)

        # run segmentation
        sh_list = ['job_healwall.sh', 'extract_headwall_from_slope.sh']
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh('job_healwall.sh', 'job-name',
                                          job_name)

    else:
        os.chdir(work_dir)

        # job is completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is Done' % work_dir)
            return

        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print(
                'The folder %s already exists and the job has been submitted, skip submitting a new job'
                % work_dir)
            return

    # submit the job
    # Note: sometimes job submission fails with "singularity: command not found"
    # and exits, which is odd; if that happens, try submitting the job from an
    # scompile node.
    submit_job_curc_or_run_script_local('job_healwall.sh',
                                        'extract_headwall_from_slope.sh')

    os.chdir(curr_dir_before_start)

    return
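
# ---------------------------------------------------------------------------
# A minimal, hypothetical driver sketch (not part of the original code): it
# shows how the two submit helpers above might be fanned out over a set of
# grids. The example inputs (example_grids, example_slope_groups) and the
# function name are placeholders for illustration only.
# ---------------------------------------------------------------------------
def example_submit_all_headwall_jobs(max_job_count=10):
    # each entry: (list of grid ids, base name for the saved id list)
    example_grids = [([1001, 1002], 'grid_000'), ([1003], 'grid_001')]
    # each entry: list of slope GeoTIFFs for one extract-headwall job
    example_slope_groups = [['slope_1.tif', 'slope_2.tif'], ['slope_3.tif']]

    for idx, (ids_list, grid_base_name) in enumerate(example_grids):
        submit_hillshade_newest_headwall_line_grid_job(ids_list, idx,
                                                       grid_base_name,
                                                       max_job_count)
    for idx, slope_tifs in enumerate(example_slope_groups):
        submit_extract_headwall_job(slope_tifs, idx, max_job_count)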

# Example #3

def run_evaluation_one_dataset(idx, area_ini, training_root_dir, template_dir):
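    """Run evaluation for one test area described by area_ini, in its own sub-folder.

    Create (or reuse) a run folder named after area_ini and idx, copy and
    adjust the parameter files from template_dir, then either submit a GPU
    job with sbatch (on the CURC cluster) or pick a free GPU with GPUtil and
    run 'exe_eval.sh' in a local sub-process.
    """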

    curr_dir = os.getcwd()

    run_eval_dir = os.path.basename(area_ini)[:-4] + '_%d' % idx
    main_para = 'main_para_eval_on_testData.ini'
    area_ini_name = os.path.basename(area_ini)

    if not os.path.isdir(run_eval_dir):
        io_function.mkdir(run_eval_dir)
        os.chdir(run_eval_dir)

        # copy and modify parameters
        io_function.copy_file_to_dst(os.path.join(template_dir, main_para),
                                     main_para)
        io_function.copy_file_to_dst(area_ini, area_ini_name)
        # set training_data_per=0, then all the data will be input for evaluation
        modify_parameter(main_para, 'training_regions', area_ini_name)
        io_function.copy_file_to_dst(
            os.path.join(template_dir, 'deeplabv3plus_xception65.ini'),
            'deeplabv3plus_xception65.ini')

        if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'exe_curc.sh'), 'exe_curc.sh')
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'run_INsingularity_curc_GPU_tf.sh'),
                'run_INsingularity_curc_GPU_tf.sh')
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'job_tf_GPU.sh'), 'job_tf_GPU.sh')

            job_name = 'eval_%d_area' % idx
            slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name',
                                              job_name)
        else:
            # copy
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'exe_eval.sh'), 'exe_eval.sh')

    else:
        os.chdir(run_eval_dir)

    # if run in curc cluster
    if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:

        while True:
            job_count = slurm_utility.get_submit_job_count(
                curc_username, job_name_substr='eval')
            if job_count >= max_run_jobs:
                print(
                    machine_name, datetime.now(),
                    'You have submitted %d or more jobs, wait ' % max_run_jobs)
                time.sleep(60)  #
                continue
            break

        # submit a job
        res = os.system('sbatch job_tf_GPU.sh')
        if res != 0:
            sys.exit(1)
    else:

        deviceIDs = []
        while True:
            # get available GPUs  # https://github.com/anderskm/gputil
            deviceIDs = GPUtil.getAvailable(order='memory',
                                            limit=100,
                                            maxLoad=0.5,
                                            maxMemory=0.5,
                                            includeNan=False,
                                            excludeID=[],
                                            excludeUUID=[])
            basic.outputlogMessage('deviceIDs: %s' % str(deviceIDs))
            if len(deviceIDs) < 1:
                time.sleep(
                    60)  # wait one minute, then check the available GPUs again
                continue
            break

        while True:
            job_count = basic.alive_process_count(local_tasks)
            if job_count >= max_run_jobs:
                print(
                    machine_name, datetime.now(),
                    '%d (>%d) jobs are running, wait ' %
                    (job_count, max_run_jobs))
                time.sleep(60)  #
                continue
            break

        job_sh = 'exe_eval.sh'
        gpuid = deviceIDs[0]
        # modify gpuid in exe_eval.sh
        with open(job_sh, 'r') as inputfile:
            list_of_all_the_lines = inputfile.readlines()
        for i in range(len(list_of_all_the_lines)):
            if 'CUDA_VISIBLE_DEVICES' in list_of_all_the_lines[i]:
                list_of_all_the_lines[i] = 'export CUDA_VISIBLE_DEVICES=%d\n' % gpuid
                print('Set %s' % list_of_all_the_lines[i])
        # write the new file and overwrite the old one
        with open(job_sh, 'w') as outputfile:
            outputfile.writelines(list_of_all_the_lines)

        # run
        sub_process = Process(target=run_exe_eval)
        sub_process.start()
        local_tasks.append(sub_process)

        # wait until the assigned GPU is actually in use, or until 100 seconds have passed
        t0 = time.time()
        while True:
            gpu_ids = GPUtil.getAvailable(order='memory',
                                          limit=100,
                                          maxLoad=0.5,
                                          maxMemory=0.5,
                                          includeNan=False,
                                          excludeID=[],
                                          excludeUUID=[])
            t1 = time.time()
            # print(gpu_ids, t1-t0)
            if len(gpu_ids) < 1 or gpu_ids[0] != gpuid or (t1 - t0) > 100:
                break
            else:
                time.sleep(0.5)

        if sub_process.exitcode is not None and sub_process.exitcode != 0:
            sys.exit(1)

    os.chdir(curr_dir)

# Example #4

def submit_training_job(idx, lr, iter_num, batch_size, backbone, buffer_size,
                        training_data_per, data_augmentation,
                        data_aug_ignore_classes):
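    """Submit one training job with the given hyper-parameters.

    Wait until fewer than 5 jobs are queued, create (or reuse) a working
    directory, copy the ini and job files, write the hyper-parameters into
    them, then submit the job with sbatch. If the folder already exists and
    the job is queued, stopped early, or already trained to iter_num
    iterations, skip submission.

    Returns the working directory and the path of the main parameter file.
    """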

    while True:
        job_count = slurm_utility.get_submit_job_count(curc_username)
        if job_count >= 5:
            print(machine_name, datetime.now(),
                  'You have submitted 5 or more jobs, wait ')
            time.sleep(60)  #
            continue
        break

    para_file = 'main_para_exp9.ini'
    job_name = 'tune%d' % idx
    work_dir = working_dir_string(idx, root=root_dir)
    if not os.path.isdir(work_dir):
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        # create a training folder
        copy_ini_files(ini_dir, work_dir, para_file, area_ini_list, backbone)

        # change para_file
        modify_parameter(os.path.join(work_dir, para_file),
                         'network_setting_ini', backbone)
        modify_parameter(os.path.join(work_dir, backbone),
                         'base_learning_rate', lr)
        modify_parameter(os.path.join(work_dir, backbone), 'batch_size',
                         batch_size)
        modify_parameter(os.path.join(work_dir, backbone), 'iteration_num',
                         iter_num)

        modify_parameter(os.path.join(work_dir, para_file), 'buffer_size',
                         buffer_size)
        modify_parameter(os.path.join(work_dir, para_file),
                         'training_data_per', training_data_per)
        modify_parameter(os.path.join(work_dir, para_file),
                         'data_augmentation', data_augmentation)
        modify_parameter(os.path.join(work_dir, para_file),
                         'data_aug_ignore_classes', data_aug_ignore_classes)

        # run training
        # whole_procedure.run_whole_procedure(para_file, b_train_only=True)
        # copy job.sh, exe.sh and the other scripts, then submit the job
        copy_curc_job_files(jobsh_dir, work_dir)
        slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name',
                                          job_name)

    else:
        os.chdir(work_dir)

        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print(
                'The folder %s already exists and the job has been submitted, skip submitting a new job'
                % work_dir)
            return work_dir, os.path.join(work_dir, para_file)

        # if results already exist (model fully trained, or early stopping triggered), skip
        early_stop, model_trained_iter = check_early_stopping_trained_iteration(
            work_dir, para_file)
        if early_stop:
            print(
                'Early stopping was triggered in folder %s after training %d iterations, skip submitting a new job'
                % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)
        if model_trained_iter >= iter_num:
            print(
                'The folder %s already has a model trained for %d iterations (>= the required number), skip submitting a new job'
                % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)

    # submit the job
    # Note: sometimes job submission fails with "singularity: command not found"
    # and exits, which is odd; if that happens, try submitting the job from an
    # scompile node.
    res = os.system('sbatch job_tf_GPU.sh')
    if res != 0:
        sys.exit(1)

    os.chdir(curr_dir_before_start)

    return work_dir, os.path.join(work_dir, para_file)
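
# ---------------------------------------------------------------------------
# A minimal, hypothetical sketch (not part of the original code) of how
# submit_training_job might be called over a small grid of hyper-parameters.
# The concrete values and the function name are placeholders; the backbone
# argument is the network-setting ini file name, as used in the function above.
# ---------------------------------------------------------------------------
def example_submit_tuning_jobs():
    learning_rates = [0.007, 0.014]
    batch_sizes = [8, 16]
    idx = 0
    for lr in learning_rates:
        for batch_size in batch_sizes:
            submit_training_job(idx, lr, 30000, batch_size,
                                'deeplabv3plus_xception65.ini', 300, 0.9,
                                'blur,crop,bright,contrast,noise', '')
            idx += 1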