Exemplo n.º 1
0
def create_experiment(args):
    """Create and start an experiment from parsed command-line arguments.

    The config file is read as YAML. When it contains the
    ``trainingServicePlatform`` key it is treated as a legacy (v1) config:
    it is either handed over to ``legacy_launcher`` or converted to the new
    format, echoed to the terminal, and loaded. Otherwise it is loaded
    directly with ``ExperimentConfig.load``.

    Args:
        args: Object exposing the attributes ``config`` (path to the config
            file), ``port``, ``debug``, ``url_prefix`` and ``foreground``.

    Raises:
        SystemExit: With code 1 when the config path is not a file or the
            legacy config cannot be converted; with no code after the
            legacy launcher has been invoked.
        AssertionError: When annotation is enabled and the generated search
            space is empty.
    """
    # `sys.exit` is used instead of the `exit` builtin: the latter is injected
    # by the `site` module and is not available when Python runs with `-S`.
    import sys

    # to make it clear what are inside args
    config_file = Path(args.config)
    port = args.port
    debug = args.debug
    url_prefix = args.url_prefix
    foreground = args.foreground

    # it should finally be done in nnictl main function
    # but for now don't break routines without logging support
    init_logger_for_command_line()
    logging.getLogger('nni').setLevel(logging.INFO)

    if not config_file.is_file():
        _logger.error(f'"{config_file}" is not a valid file.')
        sys.exit(1)

    with config_file.open() as config_stream:
        config_content = yaml.safe_load(config_stream)

    v1_platform = config_content.get('trainingServicePlatform')
    if v1_platform:
        can_convert = True
        if v1_platform == 'adl':
            can_convert = False
        if v1_platform in ['kubeflow', 'frameworkcontroller']:
            reuse = config_content.get(v1_platform + 'Config', {}).get('reuse')
            # if user does not explicitly specify it (reuse is None),
            # convert to reuse mode; only an explicit false opts out
            can_convert = reuse != False  # noqa: E712

        if not can_convert:
            legacy_launcher.create_experiment(args)
            sys.exit()

        try:
            v2_config = convert.to_v2(config_content)
        except Exception:
            _logger.error(
                'You are using legacy config format with incorrect fields or values, '
                'to get more accurate error message please update it to the new format.'
            )
            _logger.error(
                'Reference: https://nni.readthedocs.io/en/stable/reference/experiment_config.html'
            )
            sys.exit(1)
        _logger.warning(
            'You are using legacy config file, please update it to latest format:'
        )
        # use `print` here because logging will add timestamp and make it hard to copy paste
        print(Fore.YELLOW + '=' * 80 + Fore.RESET)
        print(yaml.dump(v2_config, sort_keys=False).strip())
        print(Fore.YELLOW + '=' * 80 + Fore.RESET)
        print(
            Fore.YELLOW +
            'Reference: https://nni.readthedocs.io/en/stable/reference/experiment_config.html'
            + Fore.RESET)

        # The base path is set only while the config object is constructed;
        # make sure it is always unset again, even if construction fails.
        utils.set_base_path(config_file.parent)
        try:
            config = ExperimentConfig(**v2_config)
        finally:
            utils.unset_base_path()

    else:
        config = ExperimentConfig.load(config_file)

    if config.use_annotation:
        # Expand annotated trial code into a fresh per-user temp directory and
        # derive the search space from the expanded code.
        annotation_root = Path(tempfile.gettempdir(), getuser(), 'nni', 'annotation')
        annotation_root.mkdir(parents=True, exist_ok=True)
        annotation_dir = tempfile.mkdtemp(dir=annotation_root)
        code_dir = expand_annotations(config.trial_code_directory, annotation_dir)
        config.trial_code_directory = code_dir
        config.search_space = generate_search_space(code_dir)
        # Raised explicitly so the check still runs under `python -O`.
        if not config.search_space:
            raise AssertionError('ERROR: Generated search space is empty')
        config.use_annotation = False

    exp = Experiment(config)
    exp.url_prefix = url_prefix
    run_mode = RunMode.Foreground if foreground else RunMode.Detach
    exp.start(port, debug, run_mode)

    _logger.info(
        f'To stop experiment run "nnictl stop {exp.id}" or "nnictl stop --all"'
    )
    _logger.info(
        'Reference: https://nni.readthedocs.io/en/stable/Tutorial/Nnictl.html')
Exemplo n.º 2
0
# Overall experiment budget.
experiment.config.max_trial_number = 100
experiment.config.max_experiment_duration = '60d'

experiment.config.nni_manager_ip = '10.221.90.21'
experiment.config.search_space = search_space

# How each trial is prepared and launched.
experiment.config.trial_prepare_command = 'source /home/igor.quintanilha/miniconda3/bin/activate dsc'
experiment.config.trial_command = 'python main.py --gpus 1 data/brtd --vocab data/brtd/b3922f0904f4f1b7b258a9488132f2e6480cf936493be53f74fd7aaa07e14781.8f9337.vocab --batch-size 64 --max_epochs 10 --terminate_on_nan --num-embedding 400 --num-layers 3 --num-hidden 1150 --model awd --bptt 20 --max_steps 150000 --val_check_interval .25'
# Trial code lives two directory levels above this script.
experiment.config.trial_code_directory = Path(__file__).parent.parent
experiment.config.trial_concurrency = 2
experiment.config.trial_gpu_number = 1

# Settings for the first training service entry.
experiment.config.training_service[0].use_active_gpu = True
# An integer count, consistent with the per-machine setting below.
experiment.config.training_service[0].max_trial_number_per_gpu = 1

# Settings for the second training service entry, which receives the
# machine list built below.
experiment.config.training_service[1].reuse_mode = True

# One RemoteMachineConfig per host, all sharing the same credentials and
# GPU policy.
remote_hosts = ('10.221.70.3', '10.221.70.15', '10.221.90.20')
remote_confs = []
for ip in remote_hosts:
    rm_conf = RemoteMachineConfig()
    rm_conf.host = ip
    rm_conf.user = '******'
    rm_conf.ssh_key_file = '/home/igor.quintanilha/.ssh/id_rsa'
    rm_conf.use_active_gpu = True
    rm_conf.max_trial_number_per_gpu = 1
    remote_confs.append(rm_conf)

experiment.config.training_service[1].machine_list = remote_confs

experiment.start(26780, debug=False)