def run_test(config_file, training_service, local_gpu=False):
    '''Run one integration test described by a configuration file.

    A derived configuration is generated for ``training_service``, an
    experiment is launched from it with ``nnictl create``, and the experiment
    status is polled every 3 seconds until it reports ``DONE``/``ERROR``, a
    trial job fails, or roughly ``max_duration + 30`` seconds have elapsed.

    Args:
        config_file: Path of the test configuration to run.
        training_service: Name of the training service under test. For
            ``'local'``, configurations requesting GPUs
            (``config['trial']['gpuNum'] > 0``) are skipped unless
            ``local_gpu`` is true.
        local_gpu: Whether GPU trials may be run on the local service.

    Raises:
        AssertionError: If ``nnictl create`` exits with a non-zero code, or
            if the experiment does not end in ``DONE`` with at least
            ``max_trial_num`` succeeded trials.
    '''
    new_config_file, config = gen_new_config(config_file, training_service)

    # Everything after the config is generated runs under try/finally so the
    # generated file is removed on every exit path, including the GPU skip.
    try:
        print(json.dumps(config, sort_keys=True, indent=4))

        needs_gpu = config['trial']['gpuNum'] > 0
        if training_service == 'local' and not local_gpu and needs_gpu:
            print('no gpu, skiping: ', config_file)
            return

        proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
        # Explicit raise instead of `assert`, which is stripped under -O.
        if proc.returncode != 0:
            raise AssertionError(
                '`nnictl create` failed with code %d' % proc.returncode)

        max_duration, max_trial_num = get_max_values(new_config_file)
        sleep_interval = 3

        # Pre-bind so the final check fails cleanly (instead of raising
        # NameError) if the polling range turns out to be empty.
        status = None
        for _ in range(0, max_duration + 30, sleep_interval):
            time.sleep(sleep_interval)
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR'] or get_failed_trial_jobs(TRIAL_JOBS_URL):
                break

        print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL)

        if status != 'DONE' or get_succeeded_trial_num(TRIAL_JOBS_URL) < max_trial_num:
            raise AssertionError('Failed to finish in maxExecDuration')
    finally:
        if os.path.exists(new_config_file):
            os.remove(new_config_file)
def launch_test(config_file, training_service, test_case_config):
    '''Launch a test case's experiment and optionally wait for it to finish.

    Runs the test case's ``launchCommand`` through the shell, optionally
    stores the experiment ID in ``it_variables``, and, when the test case
    enables ``experimentStatusCheck``, polls the experiment once per second
    until it reports ``DONE``/``ERROR``, a trial job fails, or more than
    ``max_duration + 10`` seconds have passed.

    Args:
        config_file: Path of the experiment configuration; used to read the
            maximum duration and maximum trial number.
        training_service: Name of the training service; passed to
            ``print_trial_job_log`` when the experiment fails.
        test_case_config: Mapping describing the test case. Keys read here:
            ``launchCommand`` (via ``get_command``), ``setExperimentIdtoVar``
            and ``experimentStatusCheck``.

    Raises:
        AssertionError: If the launch command exits with a non-zero code, if
            ``setExperimentIdtoVar`` does not start with ``$``, or if the
            experiment does not end in ``DONE`` with at least
            ``max_trial_num`` succeeded or early-stopped trials.
    '''
    launch_command = get_command(test_case_config, 'launchCommand')
    print('launch command: ', launch_command, flush=True)

    # NOTE(review): shell=True runs the command string from the test case
    # config through the shell; presumably that config is trusted — confirm.
    proc = subprocess.run(launch_command, shell=True)
    # Explicit raise instead of `assert`, which is stripped under -O.
    if proc.returncode != 0:
        raise AssertionError(
            'launch command failed with code %d' % proc.returncode)

    # set experiment ID into variable
    exp_var_name = test_case_config.get('setExperimentIdtoVar')
    if exp_var_name is not None:
        if not exp_var_name.startswith('$'):
            raise AssertionError(
                'setExperimentIdtoVar must start with "$": %r' % exp_var_name)
        it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
    print('variables:', it_variables)

    max_duration, max_trial_num = get_max_values(config_file)
    print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)

    if not test_case_config.get('experimentStatusCheck'):
        return

    bg_time = time.time()
    print(str(datetime.datetime.now()), ' waiting ...', flush=True)

    # Pre-bind both names: the except handler reads `experiment_id` and the
    # final check reads `status`, and either could otherwise be unbound
    # (failure before the ID is fetched / timeout on the first iteration).
    experiment_id = None
    status = None
    try:
        # wait restful server to be ready
        time.sleep(3)
        experiment_id = get_experiment_id(EXPERIMENT_URL)
        while True:
            waited_time = time.time() - bg_time
            if waited_time > max_duration + 10:
                print('waited: {}, max_duration: {}'.format(
                    waited_time, max_duration))
                break
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR']:
                print('experiment status:', status)
                break
            num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
            if num_failed > 0:
                print('failed jobs: ', num_failed)
                break
            time.sleep(1)
    except BaseException:
        # Dump the experiment log for diagnosis, then re-raise unchanged.
        # Skip the dump when the ID was never obtained so the original
        # exception is not masked by a secondary failure.
        if experiment_id is not None:
            print_experiment_log(experiment_id=experiment_id)
        raise
    print(str(datetime.datetime.now()), ' waiting done', flush=True)

    if get_experiment_status(STATUS_URL) == 'ERROR':
        print_experiment_log(experiment_id=experiment_id)

    trial_stats = get_trial_stats(TRIAL_JOBS_URL)
    print(json.dumps(trial_stats, indent=4), flush=True)

    finished_trials = trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED']
    if status != 'DONE' or finished_trials < max_trial_num:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise AssertionError('Failed to finish in maxExecDuration')