Example #1
def launch_test(config_file, training_service, test_case_config):
    launch_command = get_command(test_case_config, 'launchCommand')
    print('launch command: ', launch_command, flush=True)

    proc = subprocess.run(launch_command, shell=True)

    assert proc.returncode == 0, 'launch command failed with code %d' % proc.returncode

    # store the experiment ID in a variable
    exp_var_name = test_case_config.get('setExperimentIdtoVar')
    if exp_var_name is not None:
        assert exp_var_name.startswith('$')
        it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
    print('variables:', it_variables)

    max_duration, max_trial_num = get_max_values(config_file)
    print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)

    if not test_case_config.get('experimentStatusCheck'):
        return

    bg_time = time.time()
    print(str(datetime.datetime.now()), ' waiting ...', flush=True)
    try:
        # wait restful server to be ready
        time.sleep(3)
        experiment_id = get_experiment_id(EXPERIMENT_URL)
        status = None  # ensure 'status' is defined even if the loop exits on timeout before the first poll
        while True:
            waited_time = time.time() - bg_time
            if waited_time > max_duration + 10:
                print('waited: {}, max_duration: {}'.format(
                    waited_time, max_duration))
                break
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR']:
                print('experiment status:', status)
                break
            num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
            if num_failed > 0:
                print('failed jobs: ', num_failed)
                break
            time.sleep(1)
    except:
        print_experiment_log(experiment_id=experiment_id)
        raise
    print(str(datetime.datetime.now()), ' waiting done', flush=True)
    if get_experiment_status(STATUS_URL) == 'ERROR':
        print_experiment_log(experiment_id=experiment_id)

    trial_stats = get_trial_stats(TRIAL_JOBS_URL)
    print(json.dumps(trial_stats, indent=4), flush=True)
    if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED'] < max_trial_num:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise AssertionError('Failed to finish in maxExecDuration')
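
The helpers used above (get_experiment_id, get_experiment_status, get_failed_trial_jobs, print_experiment_log, ...) come from the project's test utilities. A minimal sketch of the first two, assuming the URLs point at JSON REST endpoints exposing 'id' and 'status' fields (the field names and the use of requests are assumptions, not the project's actual implementation):

import requests

def get_experiment_id(experiment_url):
    # Hypothetical sketch: fetch the experiment profile and return its ID.
    # The 'id' field name is an assumption about the REST response.
    resp = requests.get(experiment_url, timeout=10)
    resp.raise_for_status()
    return resp.json()['id']

def get_experiment_status(status_url):
    # Hypothetical sketch: return the experiment status string (e.g. 'RUNNING', 'DONE').
    resp = requests.get(status_url, timeout=10)
    resp.raise_for_status()
    return resp.json()['status']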
Example #2
def stop_experiment_test(args):
    '''Test the `nnictl stop` command, including `nnictl stop <exp_id>` and `nnictl stop --all`.
    Plain `nnictl stop` is not tested here since it is used in all other test code.'''
    config_file = args.config
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8080'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8888'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8989'], check=True)
    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8990'], check=True)

    # test cmd `nnictl stop <id>`
    experiment_id = get_experiment_id(EXPERIMENT_URL)
    proc = subprocess.run(['nnictl', 'stop', experiment_id])
    assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
    snooze()
    assert not detect_port(8080), '`nnictl stop %s` failed to stop experiments' % experiment_id

    # test cmd `nnictl stop --port`
    proc = subprocess.run(['nnictl', 'stop', '--port', '8990'])
    assert proc.returncode == 0, '`nnictl stop --port 8990` failed with code %d' % proc.returncode
    snooze()
    assert not detect_port(8990), '`nnictl stop --port 8990` failed to stop the experiment'

    # test cmd `nnictl stop --all`
    proc = subprocess.run(['nnictl', 'stop', '--all'])
    assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode
    snooze()
    assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments'
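
snooze() and detect_port() are small local helpers of the test suite. A plausible sketch, assuming snooze simply waits for the manager to shut down and detect_port checks whether anything is still listening on the port (the names follow the call sites above; the bodies are assumptions):

import socket
import time

def snooze(seconds=6):
    # Hypothetical sketch: give the NNI manager a moment to shut down cleanly.
    time.sleep(seconds)

def detect_port(port, host='127.0.0.1'):
    # Hypothetical sketch: return True if something is still listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex((host, port)) == 0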
Example #3
File: run_tests.py  Project: zhyj3038/nni
def invoke_validator(test_case_config, nni_source_dir, training_service):
    validator_config = test_case_config.get('validator')
    if validator_config is None or validator_config.get('class') is None:
        return

    validator = validators.__dict__[validator_config.get('class')]()
    kwargs = validator_config.get('kwargs', {})
    print('kwargs:', kwargs)
    experiment_id = get_experiment_id(EXPERIMENT_URL)
    try:
        validator(REST_ENDPOINT, get_experiment_dir(EXPERIMENT_URL), nni_source_dir, **kwargs)
    except:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise
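
The validator class is resolved by name from the validators module and called with the REST endpoint, the experiment directory, and the NNI source directory. A minimal hypothetical validator that follows the call signature used in invoke_validator (the class name and the check are made up for illustration):

import os

class ExampleDirValidator:
    # Hypothetical validator, not part of the project: verifies that the
    # experiment directory exists. The __call__ signature mirrors the
    # invocation in invoke_validator above.
    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        assert os.path.isdir(experiment_dir), 'missing experiment dir: %s' % experiment_dir

Such a class would be selected by setting the test case's validator.class entry to its name.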
Example #4
def multi_target_training(cfg, X_train, y_trains, X_test, logger):
    model_name = list(cfg["model"].keys())[0]

    targets = cfg["training"]["targets"]
    y_oof = np.zeros((len(X_train), len(targets)))
    y_pred = np.zeros((len(X_test), len(targets)))
    metric = utils.get_metric(cfg)
    metric_name = cfg["metric"]["name"]
    figs = []

    experiment_id = utils.get_experiment_id(cfg)
    run_name = cfg["mlflow"]["run_name"]

    logger.info(f"experiment config: {run_name}")
    logger.info(
        f"CV method: {cfg['split']['name']} {cfg['split']['params']['n_splits']}-Fold"
    )

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        for i, target in enumerate(targets):
            logger.info(f"Training for {target}")
            y_train = y_trains[target]
            trainer = get_trainer(cfg, model_name, X_train, y_train, X_test)
            y_oof_, models, y_pred_ = training_step(trainer)
            y_oof[:, i] = y_oof_
            y_pred[:, i] = y_pred_

            fig = utils.plot_feature_importance(models, X_train, model_name,
                                                target)
            figs.append(fig)

        metrics = utils.calc_metrics(cfg, metric, y_trains.values, y_oof)
        logger.info(f"CV score : {metrics[metric_name]}")
        utils.mlflow_logger(cfg, metrics, figs, targets)

    return y_pred
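
The returned y_pred is a 2-D array with one column per entry of cfg["training"]["targets"] (see the y_pred[:, i] = y_pred_ assignment). A small hypothetical helper, not part of the project, to label those columns:

import numpy as np
import pandas as pd

def predictions_to_frame(y_pred: np.ndarray, targets: list) -> pd.DataFrame:
    # Hypothetical helper: wrap the prediction array so that column i
    # carries the name of targets[i].
    return pd.DataFrame(y_pred, columns=targets)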
Example #5
def single_target_training(cfg, X_train, y_train, X_test, logger):
    model_name = list(cfg["model"].keys())[0]

    metric = utils.get_metric(cfg)
    metric_name = cfg["metric"]["name"]

    experiment_id = utils.get_experiment_id(cfg)
    run_name = cfg["mlflow"]["run_name"]

    logger.info(f"experiment config: {run_name}")
    logger.info(
        f"CV method: {cfg['split']['name']} {cfg['split']['params']['n_splits']}-Fold"
    )

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        trainer = get_trainer(cfg, model_name, X_train, y_train, X_test)
        y_oof, models, y_pred = training_step(trainer)
        metrics = utils.calc_metrics(cfg, metric, y_train.values, y_oof)
        fig = utils.plot_feature_importance(models, X_train, model_name)

        logger.info(f"CV score : {metrics[metric_name]}")
        utils.mlflow_logger(cfg, metrics, fig, targets=None)

    return y_pred
Example #6
def re_init_data(self):
    self.data = {'experiment_info': get_experiment_id()}