def submit_hillshade_newest_headwall_line_grid_job(ids_list, idx, grid_base_name, max_job_count):
    wait_if_reach_max_jobs(max_job_count, 'dLi')   # dLi: draw Line on hillshade
    job_name = 'dLi%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx, 'hillshade_newest_headwall_line_', root=root_dir)

    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        # save the grid ids this job should process
        ids_list = [str(item) for item in ids_list]
        io_function.save_list_to_txt(grid_base_name + '.txt', ids_list)

        # prepare job scripts
        sh_list = ['hillshade_headwall_line_grid.sh', 'job_hillshade_headwall_line_grid.sh']
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh('job_hillshade_headwall_line_grid.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)

        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return

        # the job is already completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is done' % work_dir)
            return

    # submit the job
    # sometimes submitting a job fails with "singularity: command not found" and exits (odd);
    # if that happens, try submitting the job from an scompile node
    submit_job_curc_or_run_script_local('job_hillshade_headwall_line_grid.sh', 'hillshade_headwall_line_grid.sh')

    os.chdir(curr_dir_before_start)
def submit_extract_headwall_job(slope_tifs, idx, max_job_count):
    wait_if_reach_max_jobs(max_job_count, 'HW')
    job_name = 'HW%d' % idx
    check_length_jobname(job_name)
    work_dir = working_dir_string(idx, 'extract_headwall_', root=root_dir)

    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        # save the list of slope tifs this job should process
        io_function.save_list_to_txt('slope_tif_list.txt', slope_tifs)

        # prepare the scripts that run the headwall extraction (segmentation)
        sh_list = ['job_healwall.sh', 'extract_headwall_from_slope.sh']
        copy_curc_job_files(jobsh_dir, work_dir, sh_list)
        slurm_utility.modify_slurm_job_sh('job_healwall.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)

        # the job is already completed
        if os.path.isfile('done.txt'):
            print('The job in the folder: %s is done' % work_dir)
            return

        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return

    # submit the job
    # sometimes submitting a job fails with "singularity: command not found" and exits (odd);
    # if that happens, try submitting the job from an scompile node
    submit_job_curc_or_run_script_local('job_healwall.sh', 'extract_headwall_from_slope.sh')

    os.chdir(curr_dir_before_start)
    return
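# A minimal, hypothetical driver sketch for the function above (not part of the original
# workflow): gather slope GeoTIFFs, split them into fixed-size batches, and submit one
# extraction job per batch. The glob pattern, batch size, and max_job_count are assumptions
# for illustration only.
def demo_submit_headwall_jobs(slope_dir, max_job_count=10, batch_size=100):
    import glob
    # assumed naming convention for slope rasters
    slope_tifs = sorted(glob.glob(os.path.join(slope_dir, '*_slope.tif')))
    for start in range(0, len(slope_tifs), batch_size):
        batch = slope_tifs[start: start + batch_size]
        submit_extract_headwall_job(batch, start // batch_size, max_job_count)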
def run_evaluation_one_dataset(idx, area_ini, training_root_dir, template_dir):
    curr_dir = os.getcwd()
    run_eval_dir = os.path.basename(area_ini)[:-4] + '_%d' % idx
    main_para = 'main_para_eval_on_testData.ini'
    area_ini_name = os.path.basename(area_ini)

    if os.path.isdir(run_eval_dir) is False:
        io_function.mkdir(run_eval_dir)
        os.chdir(run_eval_dir)

        # copy and modify parameters
        io_function.copy_file_to_dst(os.path.join(template_dir, main_para), main_para)
        io_function.copy_file_to_dst(area_ini, area_ini_name)
        # training_data_per is 0 in the template, so all the data will be used for evaluation
        modify_parameter(main_para, 'training_regions', area_ini_name)
        io_function.copy_file_to_dst(os.path.join(template_dir, 'deeplabv3plus_xception65.ini'),
                                     'deeplabv3plus_xception65.ini')

        if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:
            io_function.copy_file_to_dst(os.path.join(template_dir, 'exe_curc.sh'), 'exe_curc.sh')
            io_function.copy_file_to_dst(os.path.join(template_dir, 'run_INsingularity_curc_GPU_tf.sh'),
                                         'run_INsingularity_curc_GPU_tf.sh')
            io_function.copy_file_to_dst(os.path.join(template_dir, 'job_tf_GPU.sh'), 'job_tf_GPU.sh')
            job_name = 'eval_%d_area' % idx
            slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name', job_name)
        else:
            # copy the local run script
            io_function.copy_file_to_dst(os.path.join(template_dir, 'exe_eval.sh'), 'exe_eval.sh')
    else:
        os.chdir(run_eval_dir)

    # if running in the CURC cluster
    if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:
        while True:
            job_count = slurm_utility.get_submit_job_count(curc_username, job_name_substr='eval')
            if job_count >= max_run_jobs:
                print(machine_name, datetime.now(), 'You have submitted %d or more jobs, wait ' % max_run_jobs)
                time.sleep(60)
                continue
            break

        # submit a job
        res = os.system('sbatch job_tf_GPU.sh')
        if res != 0:
            sys.exit(1)
    else:
        deviceIDs = []
        while True:
            # get available GPUs
            # https://github.com/anderskm/gputil
            deviceIDs = GPUtil.getAvailable(order='memory', limit=100, maxLoad=0.5, maxMemory=0.5,
                                            includeNan=False, excludeID=[], excludeUUID=[])
            basic.outputlogMessage('deviceIDs: %s' % str(deviceIDs))
            if len(deviceIDs) < 1:
                time.sleep(60)  # wait one minute, then check the available GPUs again
                continue
            break

        while True:
            job_count = basic.alive_process_count(local_tasks)
            if job_count >= max_run_jobs:
                print(machine_name, datetime.now(), '%d (>=%d) jobs are running, wait ' % (job_count, max_run_jobs))
                time.sleep(60)
                continue
            break

        job_sh = 'exe_eval.sh'
        gpuid = deviceIDs[0]

        # set the GPU id in exe_eval.sh
        with open(job_sh, 'r') as inputfile:
            list_of_all_the_lines = inputfile.readlines()
        for i in range(len(list_of_all_the_lines)):
            if 'CUDA_VISIBLE_DEVICES' in list_of_all_the_lines[i]:
                list_of_all_the_lines[i] = 'export CUDA_VISIBLE_DEVICES=%d\n' % gpuid
                print('Set %s' % list_of_all_the_lines[i])
        # write the new file, overwriting the old one
        with open(job_sh, 'w') as outputfile:
            outputfile.writelines(list_of_all_the_lines)

        # run the evaluation in a background process
        sub_process = Process(target=run_exe_eval)
        sub_process.start()
        local_tasks.append(sub_process)

        # wait until the assigned GPU is in use, or more than 100 seconds have passed
        t0 = time.time()
        while True:
            gpu_ids = GPUtil.getAvailable(order='memory', limit=100, maxLoad=0.5, maxMemory=0.5,
                                          includeNan=False, excludeID=[], excludeUUID=[])
            t1 = time.time()
            # print(gpu_ids, t1-t0)
            if len(gpu_ids) < 1 or gpu_ids[0] != gpuid or (t1 - t0) > 100:
                break
            else:
                time.sleep(0.5)

        if sub_process.exitcode is not None and sub_process.exitcode != 0:
            sys.exit(1)

    os.chdir(curr_dir)
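# Hypothetical wrap-up sketch (not in the original script): when evaluations run locally they
# are background Process objects collected in 'local_tasks', so a caller would typically wait
# for them to finish before summarizing results. Names mirror the globals used above.
def demo_wait_for_local_evaluations():
    for task in local_tasks:
        task.join()
        if task.exitcode is not None and task.exitcode != 0:
            basic.outputlogMessage('an evaluation process exited with code %s' % str(task.exitcode))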
def submit_training_job(idx, lr, iter_num, batch_size, backbone, buffer_size, training_data_per,
                        data_augmentation, data_aug_ignore_classes):
    while True:
        job_count = slurm_utility.get_submit_job_count(curc_username)
        if job_count >= 5:
            print(machine_name, datetime.now(), 'You have submitted 5 or more jobs, wait ')
            time.sleep(60)
            continue
        break

    para_file = 'main_para_exp9.ini'
    job_name = 'tune%d' % idx
    work_dir = working_dir_string(idx, root=root_dir)

    if os.path.isdir(work_dir) is False:
        io_function.mkdir(work_dir)
        os.chdir(work_dir)

        # create a training folder
        copy_ini_files(ini_dir, work_dir, para_file, area_ini_list, backbone)

        # change parameters in para_file and the backbone ini
        modify_parameter(os.path.join(work_dir, para_file), 'network_setting_ini', backbone)
        modify_parameter(os.path.join(work_dir, backbone), 'base_learning_rate', lr)
        modify_parameter(os.path.join(work_dir, backbone), 'batch_size', batch_size)
        modify_parameter(os.path.join(work_dir, backbone), 'iteration_num', iter_num)
        modify_parameter(os.path.join(work_dir, para_file), 'buffer_size', buffer_size)
        modify_parameter(os.path.join(work_dir, para_file), 'training_data_per', training_data_per)
        modify_parameter(os.path.join(work_dir, para_file), 'data_augmentation', data_augmentation)
        modify_parameter(os.path.join(work_dir, para_file), 'data_aug_ignore_classes', data_aug_ignore_classes)

        # run training
        # whole_procedure.run_whole_procedure(para_file, b_train_only=True)
        # copy job.sh, exe.sh, and the other scripts, then submit the job
        copy_curc_job_files(jobsh_dir, work_dir)
        slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name', job_name)
    else:
        os.chdir(work_dir)

        submit_job_names = slurm_utility.get_submited_job_names(curc_username)
        if job_name in submit_job_names:
            print('The folder: %s already exists and the job has been submitted, skip submitting a new job' % work_dir)
            return work_dir, os.path.join(work_dir, para_file)

        # if a result already exists: either well trained or stopped early
        early_stop, model_trained_iter = check_early_stopping_trained_iteration(work_dir, para_file)
        if early_stop is True:
            print('The folder: %s stopped early with a model trained for %d iterations, skip submitting a new job'
                  % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)
        if model_trained_iter >= iter_num:
            print('The folder: %s has been trained for %d iterations (>= required), skip submitting a new job'
                  % (work_dir, model_trained_iter))
            return work_dir, os.path.join(work_dir, para_file)

    # submit the job
    # sometimes submitting a job fails with "singularity: command not found" and exits (odd);
    # if that happens, try submitting the job from an scompile node
    res = os.system('sbatch job_tf_GPU.sh')
    if res != 0:
        sys.exit(1)

    os.chdir(curr_dir_before_start)
    return work_dir, os.path.join(work_dir, para_file)
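# A minimal sketch (hypothetical, not part of the original script) of how the training jobs
# might be driven as a small grid search over learning rate and batch size. All hyper-parameter
# values and the backbone ini name below are placeholders for illustration only.
def demo_submit_tuning_jobs():
    import itertools
    learning_rates = [0.007, 0.014]
    batch_sizes = [8, 16]
    submitted = []
    for idx, (lr, batch_size) in enumerate(itertools.product(learning_rates, batch_sizes)):
        work_dir, para_path = submit_training_job(idx, lr, 30000, batch_size,
                                                  'deeplabv3plus_xception65.ini',
                                                  300, 0.9, 'flip,blur', '')
        submitted.append((work_dir, para_path))
    return submitted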