示例#1
0
def run_test():
    '''Run the metrics test experiment and verify its outcome.

    Picks the platform-specific config under ``metrics_test``, launches it
    with ``nnictl create``, then polls the experiment status every
    ``sleep_interval`` seconds for at most ``max_duration`` (as returned by
    ``get_max_values``; the polling loop treats it as seconds).

    Once the status is ``'DONE'`` the number of succeeded trials must equal
    ``max_trial_num`` and ``check_metrics()`` is invoked.

    Raises:
        AssertionError: if ``nnictl create`` exits non-zero, if the number of
            succeeded trials differs from ``max_trial_num``, or if the
            experiment never reaches ``'DONE'`` within the polling window.
    '''
    if sys.platform == 'win32':
        config_file = osp.join('metrics_test', 'metrics_win32.test.yml')
    else:
        config_file = osp.join('metrics_test', 'metrics.test.yml')

    print('Testing %s...' % config_file)
    proc = subprocess.run(['nnictl', 'create', '--config', config_file])
    assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

    max_duration, max_trial_num = get_max_values(config_file)
    sleep_interval = 3

    # Bind `status` up front: if the range below is empty (max_duration too
    # small) the final assertion must still report a timeout rather than
    # crash with an unbound-name error.
    status = None
    for _ in range(0, max_duration, sleep_interval):
        time.sleep(sleep_interval)
        status = get_experiment_status(STATUS_URL)
        if status == 'DONE':
            num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL)
            print_failed_job_log('local', TRIAL_JOBS_URL)
            if sys.platform == "win32":
                time.sleep(
                    sleep_interval
                )  # Windows seems to have some issues on updating in time
            assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (
                num_succeeded, max_trial_num)
            check_metrics()
            break

    assert status == 'DONE', 'Failed to finish in maxExecDuration'
示例#2
0
def run_test(config_file, training_service, local_gpu=False):
    '''Run one integration test for the given configuration file.

    A new config is derived via ``gen_new_config(config_file,
    training_service)`` and launched with ``nnictl create``. The experiment
    status is then polled every ``sleep_interval`` seconds for up to
    ``max_duration + 30`` until it is ``'DONE'`` or ``'ERROR'``, or until
    ``get_failed_trial_jobs`` reports any failed trial.

    Args:
        config_file: path of the base configuration, passed to
            ``gen_new_config``.
        training_service: training service name, passed to
            ``gen_new_config``; ``'local'`` enables the GPU skip check.
        local_gpu: when false and ``training_service`` is ``'local'``,
            configs whose ``trial.gpuNum`` is greater than 0 are skipped.

    Raises:
        AssertionError: if ``nnictl create`` exits non-zero, if the final
            status is not ``'DONE'``, or if fewer than ``max_trial_num``
            trials succeeded.
    '''

    new_config_file, config = gen_new_config(config_file, training_service)

    # Everything after config generation runs under the cleanup below, so the
    # generated file is removed on every exit path, including the GPU skip.
    # NOTE(review): assumes gen_new_config writes new_config_file to disk,
    # which is what the existing removal logic implies -- confirm.
    try:
        print(json.dumps(config, sort_keys=True, indent=4))

        if training_service == 'local' and not local_gpu and config['trial'][
                'gpuNum'] > 0:
            print('no gpu, skiping: ', config_file)
            return

        proc = subprocess.run(
            ['nnictl', 'create', '--config', new_config_file])
        assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

        max_duration, max_trial_num = get_max_values(new_config_file)
        sleep_interval = 3

        # Bind `status` before polling so an empty range cannot leave it
        # undefined for the check after the loop.
        status = None
        for _ in range(0, max_duration + 30, sleep_interval):
            time.sleep(sleep_interval)
            status = get_experiment_status(STATUS_URL)
            # Stop early on a terminal status or as soon as any trial failed.
            if status in ['DONE', 'ERROR'
                          ] or get_failed_trial_jobs(TRIAL_JOBS_URL):
                break

        print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL)
        if status != 'DONE' or get_succeeded_trial_num(
                TRIAL_JOBS_URL) < max_trial_num:
            raise AssertionError('Failed to finish in maxExecDuration')
    finally:
        if os.path.exists(new_config_file):
            os.remove(new_config_file)