def launch_test(config_file, training_service, test_case_config):
    launch_command = get_command(test_case_config, 'launchCommand')
    print('launch command: ', launch_command, flush=True)

    proc = subprocess.run(launch_command, shell=True)
    assert proc.returncode == 0, 'launch command failed with code %d' % proc.returncode

    # set experiment ID into variable
    exp_var_name = test_case_config.get('setExperimentIdtoVar')
    if exp_var_name is not None:
        assert exp_var_name.startswith('$')
        it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
    print('variables:', it_variables)

    max_duration, max_trial_num = get_max_values(config_file)
    print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)

    if not test_case_config.get('experimentStatusCheck'):
        return

    bg_time = time.time()
    print(str(datetime.datetime.now()), ' waiting ...', flush=True)
    status = None  # stays None if the loop below times out before the first status poll
    try:
        # wait for the restful server to be ready
        time.sleep(3)
        experiment_id = get_experiment_id(EXPERIMENT_URL)
        while True:
            waited_time = time.time() - bg_time
            if waited_time > max_duration + 10:
                print('waited: {}, max_duration: {}'.format(waited_time, max_duration))
                break
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR']:
                print('experiment status:', status)
                break
            num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
            if num_failed > 0:
                print('failed jobs: ', num_failed)
                break
            time.sleep(1)
    except:
        print_experiment_log(experiment_id=experiment_id)
        raise
    print(str(datetime.datetime.now()), ' waiting done', flush=True)

    if get_experiment_status(STATUS_URL) == 'ERROR':
        print_experiment_log(experiment_id=experiment_id)

    trial_stats = get_trial_stats(TRIAL_JOBS_URL)
    print(json.dumps(trial_stats, indent=4), flush=True)
    if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED'] < max_trial_num:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise AssertionError('Failed to finish in maxExecDuration')
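# Hedged sketch: the REST helpers used above (get_experiment_status, get_failed_trial_jobs,
# get_trial_stats, ...) are assumed to live elsewhere in the test package and are not shown
# here. A minimal version could look like the following; the exact JSON shapes (a
# {'status': ...} object at the status URL, a list of trial-job objects at the trial-jobs URL)
# and the full set of status names are assumptions, not taken from this source.
import requests

def get_experiment_status(status_url):
    # Return the experiment status string, e.g. 'RUNNING', 'DONE' or 'ERROR'.
    return requests.get(status_url, timeout=10).json()['status']

def get_failed_trial_jobs(trial_jobs_url):
    # Return the trial jobs whose status is 'FAILED'.
    trial_jobs = requests.get(trial_jobs_url, timeout=10).json()
    return [job for job in trial_jobs if job['status'] == 'FAILED']

def get_trial_stats(trial_jobs_url):
    # Count trial jobs per status; the statuses referenced by launch_test are pre-seeded with 0.
    stats = {s: 0 for s in ('WAITING', 'RUNNING', 'SUCCEEDED', 'FAILED', 'EARLY_STOPPED')}
    for job in requests.get(trial_jobs_url, timeout=10).json():
        stats[job['status']] = stats.get(job['status'], 0) + 1
    return stats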
def stop_experiment_test(args):
    '''Test the `nnictl stop` command, including `nnictl stop <exp_id>`, `nnictl stop --port`
    and `nnictl stop --all`. Plain `nnictl stop` is not tested here since it is already
    exercised by all the other test code.'''
    config_file = args.config
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8080'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8888'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8989'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8990'], check=True)

    # test cmd `nnictl stop <id>`
    experiment_id = get_experiment_id(EXPERIMENT_URL)
    proc = subprocess.run(['nnictl', 'stop', experiment_id])
    assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
    snooze()
    assert not detect_port(8080), '`nnictl stop %s` failed to stop experiments' % experiment_id

    # test cmd `nnictl stop --port`
    proc = subprocess.run(['nnictl', 'stop', '--port', '8990'])
    assert proc.returncode == 0, '`nnictl stop --port 8990` failed with code %d' % proc.returncode
    snooze()
    assert not detect_port(8990), '`nnictl stop --port 8990` failed to stop experiments'

    # test cmd `nnictl stop --all`
    proc = subprocess.run(['nnictl', 'stop', '--all'])
    assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode
    snooze()
    assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments'
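# Hedged sketch: `snooze` and `detect_port` are helpers assumed by the test above and are not
# defined in this file. A minimal implementation could look like this -- `snooze` simply waits
# long enough for `nnictl stop` to tear the RESTful server down (the duration is an assumption),
# and `detect_port` reports whether anything is still listening on the given local port.
import socket
import time

def snooze(seconds=10):
    # Give the stopped experiment time to release its port.
    time.sleep(seconds)

def detect_port(port, host='127.0.0.1'):
    # Return True if a TCP connection to the port succeeds, i.e. the port is still in use.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex((host, port)) == 0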
def invoke_validator(test_case_config, nni_source_dir, training_service):
    validator_config = test_case_config.get('validator')
    if validator_config is None or validator_config.get('class') is None:
        return

    validator = validators.__dict__[validator_config.get('class')]()
    kwargs = validator_config.get('kwargs', {})
    print('kwargs:', kwargs)

    experiment_id = get_experiment_id(EXPERIMENT_URL)
    try:
        validator(REST_ENDPOINT, get_experiment_dir(EXPERIMENT_URL), nni_source_dir, **kwargs)
    except:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise
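# Hedged sketch: `invoke_validator` looks a validator class up by name in the `validators`
# module and calls the instance with (REST_ENDPOINT, experiment_dir, nni_source_dir, **kwargs),
# so a validator only needs to be a callable class. The class name `ExampleFileValidator` and
# its `files` keyword below are purely illustrative, not part of the source.
import os

class ExampleFileValidator:
    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        # Assert that every relative path listed in kwargs['files'] exists under the experiment dir.
        for rel_path in kwargs.get('files', []):
            path = os.path.join(experiment_dir, rel_path)
            assert os.path.exists(path), 'expected file not found: %s' % path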
def multi_target_training(cfg, X_train, y_trains, X_test, logger):
    model_name = list(cfg["model"].keys())[0]
    targets = cfg["training"]["targets"]
    y_oof = np.zeros((len(X_train), len(targets)))
    y_pred = np.zeros((len(X_test), len(targets)))
    metric = utils.get_metric(cfg)
    metric_name = cfg["metric"]["name"]
    figs = []

    experiment_id = utils.get_experiment_id(cfg)
    run_name = cfg["mlflow"]["run_name"]
    logger.info(f"experiment config: {run_name}")
    logger.info(
        f"CV method: {cfg['split']['name']} {cfg['split']['params']['n_splits']}-Fold"
    )

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        for i, target in enumerate(targets):
            logger.info(f"Training for {target}")
            y_train = y_trains[target]
            trainer = get_trainer(cfg, model_name, X_train, y_train, X_test)
            y_oof_, models, y_pred_ = training_step(trainer)
            y_oof[:, i] = y_oof_
            y_pred[:, i] = y_pred_
            fig = utils.plot_feature_importance(models, X_train, model_name, target)
            figs.append(fig)

        metrics = utils.calc_metrics(cfg, metric, y_trains.values, y_oof)
        logger.info(f"CV score : {metrics[metric_name]}")
        utils.mlflow_logger(cfg, metrics, figs, targets)

    return y_pred
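# Hedged sketch: `get_trainer` and `training_step` are project helpers not shown here. From the
# call sites above, `training_step(trainer)` must return (out-of-fold predictions, fitted models,
# test predictions). The trainer interface below (attributes X_train, y_train, X_test and a
# build_model() factory) is purely an assumption made for this illustration.
import numpy as np
from sklearn.model_selection import KFold

def training_step(trainer, n_splits=5, seed=42):
    y_oof = np.zeros(len(trainer.X_train))
    y_pred = np.zeros(len(trainer.X_test))
    models = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for trn_idx, val_idx in kf.split(trainer.X_train):
        model = trainer.build_model()  # hypothetical factory returning an untrained model
        model.fit(trainer.X_train.iloc[trn_idx], trainer.y_train.iloc[trn_idx])
        y_oof[val_idx] = model.predict(trainer.X_train.iloc[val_idx])
        y_pred += model.predict(trainer.X_test) / n_splits  # average test predictions over folds
        models.append(model)
    return y_oof, models, y_pred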
def single_target_training(cfg, X_train, y_train, X_test, logger):
    model_name = list(cfg["model"].keys())[0]
    metric = utils.get_metric(cfg)
    metric_name = cfg["metric"]["name"]

    experiment_id = utils.get_experiment_id(cfg)
    run_name = cfg["mlflow"]["run_name"]
    logger.info(f"experiment config: {run_name}")
    logger.info(
        f"CV method: {cfg['split']['name']} {cfg['split']['params']['n_splits']}-Fold"
    )

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        trainer = get_trainer(cfg, model_name, X_train, y_train, X_test)
        y_oof, models, y_pred = training_step(trainer)
        metrics = utils.calc_metrics(cfg, metric, y_train.values, y_oof)
        fig = utils.plot_feature_importance(models, X_train, model_name)
        logger.info(f"CV score : {metrics[metric_name]}")
        utils.mlflow_logger(cfg, metrics, fig, targets=None)

    return y_pred
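# Hedged sketch: the config keys read by single_target_training and multi_target_training imply
# a dict roughly shaped like the one below. The concrete values ("lightgbm", "rmse", ...) are
# placeholders for illustration, not values taken from the source.
example_cfg = {
    "model": {"lightgbm": {"params": {}}},               # first key is used as model_name
    "training": {"targets": ["target_a", "target_b"]},   # only read by multi_target_training
    "metric": {"name": "rmse"},
    "mlflow": {"run_name": "baseline"},
    "split": {"name": "KFold", "params": {"n_splits": 5}},
}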
def re_init_data(self):
    self.data = {'experiment_info': get_experiment_id()}