Example #1
0
def run_test(config_file, training_service, local_gpu=False):
    '''Run one integration test described by a configuration file.

    Generates a per-training-service config, launches the experiment with
    `nnictl create`, then polls until it finishes, errors out, a trial job
    fails, or maxExecDuration (plus a 30s grace period) elapses.

    Args:
        config_file: path to the original experiment configuration file.
        training_service: training-service platform name (e.g. 'local').
        local_gpu: when True, allow GPU trials on the 'local' service.

    Raises:
        AssertionError: if `nnictl create` fails, or the experiment does not
            reach DONE with at least maxTrialNum succeeded trials in time.
    '''
    new_config_file, config = gen_new_config(config_file, training_service)
    print(json.dumps(config, sort_keys=True, indent=4))

    # Skip GPU-requiring trials on a GPU-less local machine unless overridden.
    if training_service == 'local' and not local_gpu and config['trial'][
            'gpuNum'] > 0:
        print('no gpu, skipping: ', config_file)  # bug fix: typo 'skiping'
        return

    # Defensive: ensure `status` is bound even if the poll loop is skipped,
    # so the final check below cannot raise NameError.
    status = None
    try:
        proc = subprocess.run(
            ['nnictl', 'create', '--config', new_config_file])
        assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

        max_duration, max_trial_num = get_max_values(new_config_file)
        sleep_interval = 3

        # Poll roughly every `sleep_interval` seconds until a terminal
        # condition, for at most maxExecDuration + 30 seconds.
        for _ in range(0, max_duration + 30, sleep_interval):
            time.sleep(sleep_interval)
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR'
                          ] or get_failed_trial_jobs(TRIAL_JOBS_URL):
                break

        print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL)
        if status != 'DONE' or get_succeeded_trial_num(
                TRIAL_JOBS_URL) < max_trial_num:
            raise AssertionError('Failed to finish in maxExecDuration')
    finally:
        # Always remove the generated temporary config file.
        if os.path.exists(new_config_file):
            os.remove(new_config_file)
Example #2
0
File: run_tests.py  Project: microsoft/nni
def launch_test(config_file, training_service, test_case_config):
    '''Launch one test-case experiment and wait for it to complete.

    Runs the test case's launch command via the shell, optionally exports the
    experiment ID into `it_variables`, then (if experimentStatusCheck is set)
    polls the experiment status until DONE/ERROR, a trial fails, or
    maxExecDuration (plus 10s) elapses.

    Args:
        config_file: experiment configuration file, used to read maxExecDuration
            and maxTrialNum.
        training_service: training-service platform name, used when dumping
            trial-job logs on failure.
        test_case_config: test-case dict providing 'launchCommand' and optional
            'setExperimentIdtoVar' / 'experimentStatusCheck' entries.

    Raises:
        AssertionError: if the launch command fails, or the experiment does not
            reach DONE with enough SUCCEEDED + EARLY_STOPPED trials in time.
    '''
    launch_command = get_command(test_case_config, 'launchCommand')
    print('launch command: ', launch_command, flush=True)

    # NOTE(review): launch_command comes from the test-case config (trusted
    # input), so shell=True is acceptable here.
    proc = subprocess.run(launch_command, shell=True)

    assert proc.returncode == 0, 'launch command failed with code %d' % proc.returncode

    # set experiment ID into variable
    exp_var_name = test_case_config.get('setExperimentIdtoVar')
    if exp_var_name is not None:
        assert exp_var_name.startswith('$')
        it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
    print('variables:', it_variables)

    max_duration, max_trial_num = get_max_values(config_file)
    print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)

    if not test_case_config.get('experimentStatusCheck'):
        return

    bg_time = time.time()
    print(str(datetime.datetime.now()), ' waiting ...', flush=True)
    # Bug fixes: both names below were previously assigned only inside the
    # try/loop, so an early exception (before get_experiment_id returned) made
    # the except handler raise NameError, and an immediate timeout break left
    # `status` unbound at the final check.
    experiment_id = None
    status = None
    try:
        # wait restful server to be ready
        time.sleep(3)
        experiment_id = get_experiment_id(EXPERIMENT_URL)
        while True:
            waited_time = time.time() - bg_time
            if waited_time > max_duration + 10:
                print('waited: {}, max_duration: {}'.format(
                    waited_time, max_duration))
                break
            status = get_experiment_status(STATUS_URL)
            if status in ['DONE', 'ERROR']:
                print('experiment status:', status)
                break
            num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
            if num_failed > 0:
                print('failed jobs: ', num_failed)
                break
            time.sleep(1)
    except BaseException:  # same coverage as bare `except:`; we re-raise below
        if experiment_id is not None:
            print_experiment_log(experiment_id=experiment_id)
        raise
    print(str(datetime.datetime.now()), ' waiting done', flush=True)
    if get_experiment_status(STATUS_URL) == 'ERROR':
        print_experiment_log(experiment_id=experiment_id)

    trial_stats = get_trial_stats(TRIAL_JOBS_URL)
    print(json.dumps(trial_stats, indent=4), flush=True)
    if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats[
            'EARLY_STOPPED'] < max_trial_num:
        print_experiment_log(experiment_id=experiment_id)
        print_trial_job_log(training_service, TRIAL_JOBS_URL)
        raise AssertionError('Failed to finish in maxExecDuration')