Example No. 1
def get_next_parameter(socket):
    """
    Get the hyper paremeters generated by tuner. For a multiphase experiment, it returns a new group of hyper
    parameters at each call of get_next_parameter. For a non-multiphase (multiPhase is not configured or set to False)
    experiment, it returns hyper parameters only on the first call for each trial job, it returns None since second call.
    This API should be called only once in each trial job of an experiment which is not specified as multiphase.

    Returns
    -------
    dict
        A dict object contains the hyper parameters generated by tuner, the keys of the dict are defined in
        search space. Returns None if no more hyper parameters can be generated by tuner.
    """
    global _params
    # v1.1
    father_id = -1
    start = time.time()
    _params = platform.get_next_parameter()
    end = time.time()

    if _params is None:
        return None
    socket.send_pyobj({"type": "get_next_parameter"})
    message = socket.recv_pyobj()

    tuner = message["tuner"]

    if tuner.history:
        p0 = multiprocessing.Process(target= tuner.generate_parameters, args=(int(get_sequence_id()),))
        p0.start()
        trial_concurrency = os.popen('cat /etc/slurm-llnl/slurm.conf|grep NodeName|wc -l')
        trial_concurrency = int(trial_concurrency.read().strip())
        if get_sequence_id() < trial_concurrency :
            lock.acquire()
            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt", "a+") as f:
                json_and_id = 'json_out=' + str(_params['parameters']) + '+history' + "=False or True?"
                f.write(json_and_id + "\n")
            lock.release()
    else:
        socket.send_pyobj({"type": "generated_parameter"})
        message = socket.recv_pyobj()
        lock.acquire()
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log","a+") as f:
            f.write(" generate=" + str(end-start)+"\n")

        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt","a+") as f:
            json_and_id='json_out='+str(_params['parameters'])+'+history'+"=False"
            f.write(json_and_id+"\n")
        lock.release()
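For reference, the stock single-phase trial flow that this customized get_next_parameter extends looks roughly like the sketch below (a minimal, hedged example; train_one_epoch is a hypothetical placeholder, and only nni.get_next_parameter / report_intermediate_result / report_final_result are real NNI calls):

import nni

def train_one_epoch(params):
    # Placeholder training step; returns a fake metric derived from the sampled parameters.
    return 0.5 + params.get("hidden_num", 0) / 10000.0

def run_trial():
    params = nni.get_next_parameter()  # dict keyed by the search-space names
    if not params:
        return
    metric = 0.0
    for _ in range(3):
        metric = train_one_epoch(params)
        nni.report_intermediate_result(metric)
    nni.report_final_result(metric)

if __name__ == "__main__":
    run_trial()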
Example No. 2
    def output_experiment_detail(self, res_path):
        exp = nni.get_experiment_id()
        trial = nni.get_trial_id()
        trial_path = os.path.join(res_path, exp, trial)
        if not os.path.exists(os.path.join(res_path, exp)):
            os.mkdir(os.path.join(res_path, exp))
        if not os.path.exists(trial_path):
            os.mkdir(trial_path)

        # TODO whatever you want
        p_loss = self.plot_line(LOSS_PLOT, show_plot=False)
        p_acc = self.plot_line(ACCURACY_PLOT, show_plot=False)
        p_auc = self.plot_line(AUC_PLOT, show_plot=False)

        measures_table = [
            ["train_loss_vec"] + [str(x) for x in self.loss_train_vec],
            ["train_acc_vec"] + [str(x) for x in self.accuracy_train_vec],
            ["train_auc_vec"] + [str(x) for x in self.auc_train_vec],
            ["dev_loss_vec"] + [str(x) for x in self.loss_dev_vec],
            ["dev_acc_vec"] + [str(x) for x in self.accuracy_dev_vec],
            ["dev_auc_vec"] + [str(x) for x in self.auc_dev_vec],
            ["test_loss_vec"] + [str(x) for x in self.loss_test_vec],
            ["test_acc_vec"] + [str(x) for x in self.accuracy_test_vec],
            ["test_auc_vec"] + [str(x) for x in self.auc_test_vec]
        ]
        with open(os.path.join(res_path, exp, trial, "measures_by_epochs.csv"), "wt", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(measures_table)
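Since each row written above is a label followed by per-epoch values, the table can be read back with a small helper (a sketch, assuming the same path layout as output_experiment_detail):

import csv
import os

def load_measures(res_path, exp, trial):
    # Rows look like ["train_loss_vec", "0.91", "0.74", ...]; convert the values back to float.
    path = os.path.join(res_path, exp, trial, "measures_by_epochs.csv")
    with open(path, newline="") as f:
        return {row[0]: [float(x) for x in row[1:]] for row in csv.reader(f)}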
Example No. 3
    def __init__(self, model_key, run_id, run_dir):
        self.model_key = model_key
        self.run_id = run_id
        self.run_dir = run_dir
        self.trial_id = nni.get_trial_id()
        self.exp_id = nni.get_experiment_id()
        self.scoring = accuracy_score
Example No. 4
def get_nni_or_mlflow_experiment_and_trial() -> Tuple[Optional[str], Optional[str]]:
    """ Helper function which returns NNI experiment name and trial ID if NNI isn't in Standalone mode or, otherwise, returns MLFlow experiment name and run ID if there is an active MLFlow run. 
    Returns (None, None) if NNI is in standalone mode and there is no active MLFLow run.
    """
    if is_nni_run_standalone():
        exp, run = deepcv.utils.mlflow_get_experiment_run_info()
        return (None, None) if exp is None else (exp.name, str(run.run_id))
    return (nni.get_experiment_id(), nni.get_trial_id())
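A possible call site (hypothetical; it only assumes the helper above is importable) uses the returned pair to build a per-run output directory, falling back to a fixed name when both IDs are None:

from pathlib import Path

def make_run_dir(base: Path) -> Path:
    exp_id, run_id = get_nni_or_mlflow_experiment_and_trial()
    run_dir = base / (f"{exp_id}/{run_id}" if exp_id and run_id else "standalone")
    run_dir.mkdir(parents=True, exist_ok=True)
    return run_dir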
Example No. 5
def setup_experiment(
        runtime_config: RuntimeConfig,
        enable_nni: bool = False,
        logger_blacklist: Optional[List[str]] = None) -> RuntimeConfig:
    if logger_blacklist is None:
        logger_blacklist = ['numba']
    setup_distributed_training()
    seed_everything(runtime_config.seed)

    if runtime_config.output_dir is None:
        if 'PT_OUTPUT_DIR' in os.environ:
            runtime_config.output_dir = Path(os.environ['PT_OUTPUT_DIR'])
        else:
            runtime_config.output_dir = Path('./outputs')

    if enable_nni:
        import nni
        if nni.get_experiment_id() != 'STANDALONE':
            runtime_config.output_dir = (runtime_config.output_dir /
                                         nni.get_experiment_id() /
                                         str(nni.get_sequence_id()))

    # parents=True so the nested <experiment-id>/<sequence-id> path can be created in one go
    runtime_config.output_dir.mkdir(parents=True, exist_ok=True)

    if runtime_config.checkpoint_dir is None:
        runtime_config.checkpoint_dir = runtime_config.output_dir / 'checkpoints'
        runtime_config.checkpoint_dir.mkdir(exist_ok=True)

    if runtime_config.tb_log_dir is None:
        runtime_config.tb_log_dir = runtime_config.output_dir / 'tb'
        runtime_config.tb_log_dir.mkdir(exist_ok=True)

    reset_logger()
    setup_logger(
        '',
        log_file=(runtime_config.output_dir / 'stdout.log').as_posix(),
        log_level=logging.DEBUG if runtime_config.debug else logging.INFO)
    for logger in logger_blacklist:
        mute_logger(logger)

    global _runtime_config
    _runtime_config = runtime_config

    return runtime_config
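Hypothetical usage sketch (RuntimeConfig's constructor is not shown in this excerpt; the fields below simply mirror the attributes that setup_experiment reads):

# Hypothetical call site; field names mirror those accessed in setup_experiment.
config = RuntimeConfig(seed=42, output_dir=None, checkpoint_dir=None,
                       tb_log_dir=None, debug=False)
config = setup_experiment(config, enable_nni=True)
# Under NNI the output directory becomes outputs/<experiment-id>/<sequence-id>;
# in standalone mode it stays at ./outputs.
print(config.output_dir)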
Example No. 6
def prepare_hyper_search(cfg_kwargs: dict,
                         reporthook=None, final_reporthook=None,
                         primary_key=None, max_key=True, reporter_cls=None, with_keys: (list, str, None) = None,
                         final_keys: (list, str, None) = None,
                         dump=False, disable=False):
    """
    Updated in v1.3.18

    Fetch hyperparameters from the nni package and update the configuration parameters. When nni is unavailable or this is not an nni search run, the parameters are left unchanged.

    .. code-block :: python

        cfg_kwargs, reporthook, final_reporthook, tag = prepare_hyper_search(
            cfg_kwargs, Configuration, reporthook, final_reporthook, primary_key="macro_avg:f1"
        )

        _cfg = Configuration(**cfg_kwargs)
        model = Model(_cfg)
        ...

        for epoch in range(_cfg.begin_epoch, _cfg.end_epoch):
            for batch_data in dataset:
                train_model(batch_data)

            data = evaluate_model()
            reporthook(data)

        final_reporthook()

    Parameters
    ----------
    cfg_kwargs: dict
        Parameters to be passed to cfg.
    reporthook
    final_reporthook
    primary_key:
        Primary key used to evaluate the model; it becomes the ``default`` entry of the ``metric``
        passed to ``nni.report_intermediate_result`` and ``nni.report_final_result``.
    max_key: bool
        Whether a larger value of the primary key is better.
    reporter_cls
    with_keys: list or str
        Additional metrics to store; in the final report they default to the values observed when
        primary_key is at its best.
    final_keys: list or str
        Keys in with_keys for which the last reported result is used instead of the value at the
        best primary_key.
    dump: bool
        If True, the workspace parameter of the configuration is changed to
        ``workspace/nni.get_experiment_id()/nni.get_trial_id()`` so that nni's intermediate
        results are persisted.
    disable

    Returns
    -------
    cfg_kwargs: dict
        Configuration parameters with the nni hyperparameters inserted.
    reporthook: function
        Callback invoked after each iteration to report intermediate results.
        Defaults to ``nni.report_intermediate_result``.
    final_reporthook:
        Callback invoked after all iterations to report the final result.
        Defaults to ``nni.report_final_result``.
    dump: bool
        Same as the input argument.

    Examples
    --------
    .. code-block :: python

        class CFG(Configuration):
            hyper_params = {"hidden_num": 100}
            learning_rate = 0.001
            workspace = ""

        cfg_kwargs, reporthook, final_reporthook, dump = prepare_hyper_search(
            {"learning_rate": 0.1}, CFG, primary_key="macro_avg:f1", with_keys="accuracy"
        )
        # cfg_kwargs: {'learning_rate': 0.1}

    When nni is started (e.g., with ``nnictl create --config _config.yml``),
    suppose ``_config.yml`` contains:

    .. code-block :: yaml

        searchSpacePath: _search_space.json

    and ``_search_space.json`` contains

    .. code-block :: json

        {
            "hidden_num": {"_type": "choice", "_value": [500, 600, 700, 835, 900]}
        }

    then one possible returned cfg_kwargs is ``{'hyper_params': {'hidden_num': 500}, 'learning_rate': 0.1}``
    """
    if disable:
        return cfg_kwargs, None, None, None
    try:
        import nni
        from nni import get_next_parameter, report_intermediate_result, report_final_result

        assert primary_key is not None

        def _as_key_list(_keys: (list, str, None)):
            if isinstance(_keys, str):
                if ";" in _keys:
                    _keys = _keys.split(";")
                else:
                    _keys = [_keys]
            elif isinstance(_keys, list):
                pass
            elif _keys is None:
                _keys = []
            return _keys

        with_keys = _as_key_list(with_keys)
        final_keys = _as_key_list(final_keys)

        class Reporter(BaseReporter):
            def __init__(self):
                self.datas = []

            def intermediate(self, data):
                feed_dict = {
                    'default': float(get_by_key(data, key_parser(primary_key))),
                    primary_key: get_by_key(data, key_parser(primary_key))
                }
                for key in with_keys:
                    feed_dict[key] = get_by_key(data, key_parser(key))
                report_intermediate_result(feed_dict)
                self.datas.append(data)

            def final(self):
                best_fn = get_min if max_key is False else get_max
                _with_keys = (with_keys if with_keys else []) + [primary_key]
                _final_keys = set(final_keys if final_keys else [])
                final_result = best_fn(
                    self.datas, primary_key, with_keys=";".join(_with_keys), merge=False
                )
                feed_dict = {
                    'default': float(final_result[0][primary_key])
                }
                appendix_dict = dict(final_result[1][primary_key])
                for key in _with_keys:
                    if key in _final_keys:
                        feed_dict[key] = get_by_key(self.datas[-1], key_parser(key))
                    else:
                        feed_dict[key] = appendix_dict[key]
                report_final_result(feed_dict)

        rc = Reporter() if reporter_cls is None else reporter_cls
        reporthook = reporthook if reporthook is not None else rc.intermediate
        final_reporthook = final_reporthook if final_reporthook is not None else rc.final
        cfg_cls_params = get_params(get_next_parameter())
        using_nni_tag = bool(cfg_cls_params)
        nested_update(cfg_kwargs, cfg_cls_params)
        if using_nni_tag and dump:  # pragma: no cover
            cfg_kwargs["workspace"] = cfg_kwargs.get("workspace", "") + path_append(
                nni.get_experiment_id(), nni.get_trial_id(), to_str=True
            )
        return cfg_kwargs, reporthook, final_reporthook, dump

    except ModuleNotFoundError:  # pragma: no cover
        warnings.warn("nni package not found, skip")
        return cfg_kwargs, reporthook, final_reporthook, dump
Example No. 7
def train_eval(esargs, RCV_CONFIG, seqid):
    """ train and eval the model
    """
    global net
    global best_acc
    global bs_explore
    global gpus
    global hp_path

    best_acc = 0
    parse_rev_args(RCV_CONFIG, esargs)
    # train procedure
    trial_id = nni.get_trial_id()
    available_devices = os.environ["CUDA_VISIBLE_DEVICES"]
    gpus = len(available_devices.split(","))

    is_training = True
    filenames = ds.get_filenames(args.train_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_train = ds.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=bs_explore,
        shuffle_buffer=shuffle_buffer,
        parse_record_fn=ds.parse_record,
        num_epochs=args.epochs,
        npc=args.num_parallel_calls,
        num_gpus=gpus,
        examples_per_epoch=examples_per_epoch if is_training else None,
        dtype=tf.float32)

    is_training = False
    filenames = ds.get_filenames(args.val_data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    ds_val = ds.process_record_dataset(dataset=dataset,
                                       is_training=is_training,
                                       batch_size=bs_explore,
                                       shuffle_buffer=shuffle_buffer,
                                       parse_record_fn=ds.parse_record,
                                       num_epochs=args.epochs,
                                       npc=args.num_parallel_calls,
                                       num_gpus=gpus,
                                       examples_per_epoch=None,
                                       dtype=tf.float32)

    # run epochs and patience
    loopnum = seqid // args.slave
    patience = min(int(6 + (2 * loopnum)), 20)
    if loopnum == 0:
        run_epochs = int(args.warmup_1)
    elif loopnum == 1:
        run_epochs = int(args.warmup_2)
    elif loopnum == 2:
        run_epochs = int(args.warmup_3)
    else:
        run_epochs = int(args.epochs)

    # if loopnum < 4:
    #     patience = int(8 + (2 * loopnum))
    #     run_epochs = int(10 + (20 * loopnum))
    # else:
    #     patience = 16
    #     run_epochs = args.epochs

    # lr strategy

    def scheduler2(epoch):
        lr_max = args.initial_lr
        total_epochs = args.epochs
        lr_each_epoch = lr_max - lr_max * epoch / total_epochs
        return lr_each_epoch

    callback = tf.keras.callbacks.LearningRateScheduler(scheduler2)

    # save weights
    checkpoint_dir = os.environ["HOME"] + "/nni/experiments/" + str(
        nni.get_experiment_id()) + "/checkpoint/" + str(nni.get_trial_id())
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = checkpoint_dir + "/weights." + "epoch." + str(
        run_epochs) + ".hdf5"
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_freq='epoch',
        save_weights_only=True,
    )

    history = net.fit(ds_train,
                      epochs=run_epochs,
                      steps_per_epoch=Ntrain // bs_explore // gpus,
                      validation_data=ds_val,
                      validation_steps=Nvalidation // bs_explore // gpus,
                      verbose=1,
                      shuffle=False,
                      callbacks=[
                          SendMetrics(hp_path), callback,
                          EarlyStopping(min_delta=0.001, patience=patience),
                          model_checkpoint_callback
                      ])

    # report the trial's best validation accuracy to the tuner
    acc_list = history.history['val_accuracy']
    acc = max((float(a) for a in acc_list), default=0.0)
    try:
        # predict acc
        if run_epochs >= 10 and run_epochs < 80:
            epoch_x = range(1, len(acc_list) + 1)
            pacc = utils.predict_acc(trial_id, epoch_x, acc_list, 90, True)
            best_acc = float(pacc)
    except Exception as E:
        print("Predict failed.")
    if acc > best_acc:
        best_acc = acc
    logger.debug("Final result is: %.3f", acc)
    return best_acc, history.epoch[-1]
Example No. 8
    except Exception as E:
        print("Predict failed.")
    if acc > best_acc:
        best_acc = acc
    logger.debug("Final result is: %.3f", acc)
    return best_acc, history.epoch[-1]


if __name__ == "__main__":
    example_start_time = time.time()
    net = None
    args = get_args()
    try:
        experiment_path = (os.environ["HOME"] + "/mountdir/nni/experiments/" +
                           str(nni.get_experiment_id()))
        lock = multiprocessing.Lock()
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        tmpstr = 'tcp://' + args.ip + ':800081'
        socket.connect(tmpstr)
        os.makedirs(experiment_path + "/trials/" + str(nni.get_trial_id()))

        get_next_parameter_start = time.time()
        nni.get_next_parameter(socket)
        get_next_parameter_end = time.time()

        while True:
            lock.acquire()
            with open(experiment_path + "/graph.txt", "a+") as f:
                f.seek(0)
Example No. 9
    def test_get_experiment_id(self):
        self.assertEqual(nni.get_experiment_id(), 'fakeidex')
Example No. 10
class ClassifyParam:
    local_model_path = os.path.join(
        'data', 'cache', 'classify_{}_{}.model'.format(nni.get_experiment_id(),
                                                       nni.get_trial_id()))
    top_n_list = list(range(1, 11)) + [15, 20]
Example No. 11
    def _start_mlflow_run(self, run_params: Dict[str, Any],
                          pipeline: Pipeline):
        """ Log basic informations to MLFlow about pipeline if this pipeline is tagged with 'train' (creates a new MLFLow experiment and/or run named after training pipeline if it doesn't exists yet)
        NOTE: If NNI is in dry run mode (mode used to generate NNI Classic NAS search space JSON file from a model which contains NNI NAS Mutables `LayerChoice` and/or `InputChoice`) we avoid creating any new MLFlow experiment/run nor logging anything else to mlflow during this dry run
        """
        node_tags = functools.reduce(set.union,
                                     [n.tags for n in pipeline.nodes])
        if not deepcv.meta.nni_tools.is_nni_gen_search_space_mode() and (
                'train' in run_params['tags'] or 'train' in node_tags):
            if mlflow.active_run() is None:
                # Create an MLFlow run in an experiment named after the training pipeline and log various pipeline/dataset information to mlflow. When running an NNI hp/nas search, the mlflow experiment and run are named after the NNI experiment and trial IDs for better consistency.
                # TODO: find another way to name the experiment, as the pipeline name is only available when running `kedro run --pipeline=<pipeline_name>` (e.g. a special tag on the node after which the experiment is named)

                # 'STANDALONE' is NNI's default experiment ID when the Python process wasn't started by NNI
                if not deepcv.meta.nni_tools.is_nni_run_standalone():
                    nni_experiment = nni.get_experiment_id()
                    mlflow.set_experiment(nni_experiment)
                    mlflow.start_run(run_name=nni.get_trial_id())
                    # Flag indicating whether we are using NNI HP or Classic NAS API (Hyperparameter and/or Classic Neural Architecture search using NNI)
                    mlflow.set_tag('nni_standalone_mode', False)
                    mlflow.set_tag('nni_experiment_id', nni_experiment)
                    mlflow.set_tag('nni_trial_id', nni.get_trial_id())
                    mlflow.set_tag('nni_sequence_id', nni.get_sequence_id())
                else:
                    pipeline_name = run_params['pipeline_name'].lower(
                    ) if run_params['pipeline_name'] else 'default'
                    mlflow.set_experiment(
                        f'{self.project_ctx.project_name.lower()}_{pipeline_name}'
                    )
                    mlflow.start_run(
                        run_name=
                        f'{pipeline_name.lower()}_run_{run_params["run_id"]}')
                    mlflow.set_tag('nni_standalone_mode', True)

            # Log basic information about the Kedro training pipeline to mlflow
            mlflow.set_tags({
                f'kedro_node_tag_{i}': tag
                for i, tag in enumerate(node_tags)
            })
            mlflow.log_params({n: v for n, v in run_params.items() if v})
            mlflow.log_param('pipeline.json', pipeline.to_json())
            mlflow.log_param('pipeline.describe', pipeline.describe())
            mlflow.log_param('pipeline.pipeline_datasets',
                             pipeline.data_sets())
            """ The following code creates special mlflow tags about current repository infos, which is not done by mlflow when starting an MLFlow run from code instead of from `mlflow run` command
            Code inspired from [`mlflow.projects._create_run`](https://www.mlflow.org/docs/latest/_modules/mlflow/projects.html) which doesn't seems to be called by `mlflow.start_run`
            """
            tags = {
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                self.project_ctx.package_name,
                mlflow.utils.mlflow_tags.MLFLOW_SOURCE_TYPE:
                mlflow.entities.SourceType.to_string(
                    mlflow.entities.SourceType.PROJECT),
                mlflow.utils.mlflow_tags.MLFLOW_PROJECT_ENTRY_POINT:
                inspect.getsourcefile(type(self.project_ctx))
            }
            try:
                repo = git.Repo(self.project_ctx.project_path,
                                search_parent_directories=True)
                git_repo_url = (repo.remote().url if 'origin' in repo.remotes
                                else (repo.remotes[0].url if len(repo.remotes) > 0 else ''))
                # Convert an SSH git URL to HTTPS and drop the ".git" suffix
                git_repo_url = re.sub(r'git@([.\w]+):', r'https://\1/', git_repo_url)
                git_repo_url = re.sub(r'\.git$', '', git_repo_url)
                mlflow.log_param(
                    'commit_url',
                    git_repo_url + f'/commit/{repo.head.commit.hexsha}/')

                # We also set MLFLOW_SOURCE_NAME to the repo URL so that the MLFlow web UI can parse it and render commit and source hyperlinks (MLFlow only supports GitHub URLs for now)
                tags.update({
                    mlflow.utils.mlflow_tags.MLFLOW_SOURCE_NAME:
                    git_repo_url
                    if git_repo_url else self.project_ctx.project_name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_BRANCH:
                    repo.active_branch.name,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_REPO_URL:
                    git_repo_url,
                    mlflow.utils.mlflow_tags.MLFLOW_GIT_COMMIT:
                    repo.head.commit.hexsha
                })

                # Change mlflow user to be git repository user instead of system user (if any git user is specified)
                git_config_reader = repo.config_reader()
                git_config_reader.read()
                user = git_config_reader.get_value('user',
                                                   'name',
                                                   default=None)
                email = git_config_reader.get_value('user',
                                                    'email',
                                                    default=None)
                if user or email:
                    tags[mlflow.utils.mlflow_tags.MLFLOW_USER] = (
                        str(user) + (f' <{email}>' if email else '')
                    ) if user else str(email)
            except (ImportError, OSError, ValueError, IOError, KeyError,
                    git.GitError, configparser.Error) as e:
                logging.warning(
                    f'Failed to import Git or to get repository information. Error: {e}'
                )

            mlflow.set_tags(tags)
Example No. 12
def train_search(config,
                 params=None,
                 warm_start_NN=None,
                 restore_old_checkpoint=False,
                 workers=1,
                 verbosity=0):
    """
    train_search is practically the same as the train function from training_torch, just made for NNI experiments

    :param config:
    :param params:
    :param warm_start_NN:
    :param restore_old_checkpoint:
    :param workers:
    :param verbosity:
    :return:
    """
    if verbosity == 0:
        logger.setLevel(logging.INFO)
    if verbosity >= 1:
        logger.setLevel(logging.DEBUG)
    start = time.time()

    logger.info('Preparing Datasets')

    train_dataset, validation_dataset = prepare_dataset_torch(config)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=params['batch_size'],
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(validation_dataset,
                                              batch_size=params['batch_size'],
                                              shuffle=True)

    logger.info('Initializing Torch Network')

    net = map_model(config, params)

    logger.info('Optimizer Initialize')
    optimizer = map_optimizer(params['optimizer'], net.parameters(),
                              params['learning_rate'])
    loss_func = map_loss_func(params['loss'])
    criterion = torch.nn.MSELoss()

    if config['scheduler']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=config['scheduler_milestones'], gamma=0.1)
    else:
        scheduler = None

    epochs = config['epochs']

    # Track the losses to determine early stopping
    avg_train_loss = []
    avg_valid_loss = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(verbose=True, trace_func=logger.info)

    logger.info('Start Training!')
    for epoch in range(epochs):

        train_loss, validation_loss, RMSE = train_epoch(
            net,
            optimizer,
            loss_func,
            train_loader=train_loader,
            test_loader=test_loader,
            scheduler=scheduler,
            criterion=criterion)

        nni.report_intermediate_result(-math.log10(RMSE))
        if early_stopping is not None:
            early_stopping(validation_loss, net, RMSE)
            RMSE = early_stopping.RMSE

        avg_train_loss.append(train_loss)
        avg_valid_loss.append(validation_loss)

        logger.info(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Best Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        print(
            'Epoch {}; Train Loss: {:.5}; Valid Loss: {:.5}; Validation RMSE: {:.5}'
            .format(epoch, train_loss, validation_loss, RMSE))
        if early_stopping.early_stop:
            logger.info('Early Stopping')
            RMSE = early_stopping.RMSE
            break

    nni.report_final_result(-math.log10(RMSE))
    end = time.time()
    logger.info(
        'Training Completed: Time elapsed: {:.2} Seconds'.format(end - start))
    plot_against_scaling(net,
                         validation_dataset,
                         criterion,
                         trial_id=str(nni.get_trial_id()),
                         exp_id=str(nni.get_experiment_id()))
Example No. 13
def is_nni_run_standalone() -> bool:
    """ Simple helper function which returns whether NNI is in standalone trial run mode """
    return nni.get_experiment_id() == r'STANDALONE' and nni.get_trial_id() == r'STANDALONE' and nni.get_sequence_id() == 0
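A common pattern built on this check (a sketch, not taken from the source) is to skip talking to the NNI manager entirely when the trial script is launched by hand:

import nni

def report(metric: float, final: bool = False) -> None:
    # Only report to the NNI manager when the process was actually launched by NNI.
    if is_nni_run_standalone():
        print(f"[standalone] metric={metric:.4f}")
        return
    if final:
        nni.report_final_result(metric)
    else:
        nni.report_intermediate_result(metric)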
Example No. 14
    def generate_parameters(self, parameter_id, **kwargs):
        """
        Returns a set of trial neural architecture, as a serializable object.

        Parameters
        ----------
        parameter_id : int
        """
        # If there is no history, the slave node will use the fake model.
        if not self.history:
            print("If there is no history, generate_parameters should not be called!")
            exit(1)
        total_start=time.time()
        rate = 1

        if (os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time") and os.path.exists(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/train_time")):
            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time", "r") as f:
                generate_time = float(f.read())
            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/train_time", "r") as f:
                train_time = float(f.read())
            if (generate_time != 0) and (train_time != 0):
                realrate = int(train_time / generate_time)
                if (realrate < 5) and (realrate > 1):
                    rate = int(realrate)
                if (realrate <= 1):
                    rate = 1

        for i in range(rate):
            start=time.time()
            new_father_id = None
            generated_graph = None
            if not self.training_queue:
                new_father_id, generated_graph = self.generate()
                father_id,json_out,new_model_id = self.total_data[parameter_id]
                self.training_queue.append((generated_graph, new_father_id, new_model_id))
                #self.descriptors.append(generated_graph.extract_descriptor())
            else:
                print("training_queue should be an empty list.")
                exit(1)

            graph, father_id, model_id = self.training_queue.pop(0)
            # from graph to json
            json_model_path = os.path.join(self.path, str(model_id) + ".json")
            json_out = graph_to_json(graph, json_model_path)
            end = time.time()
            # self.total_data[parameter_id] = (json_out, father_id, model_id)
            json_and_id = ("json_out=" + str(json_out) + "+father_id=" + str(father_id) +
                           "+parameter_id=" + str(parameter_id) + "+history=True")
            lock.acquire()
            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log","a+") as f:
                f.write("single_generate=" + str(end - start)+"\n")

            with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/graph.txt","a+") as f:
                f.write(json_and_id+"\n")
            lock.release()
        total_end=time.time()
        lock.acquire()
        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/trials/" + str(nni.get_trial_id()) + "/output.log","a+") as f:
            f.write("total_generate=" + str(total_end - total_start)+"\n")
        lock.release()

        totime = abs(total_end - total_start)

        with open(os.environ["HOME"] + "/mountdir/nni/experiments/" + str(nni.get_experiment_id()) + "/generate_time", "w+") as f:
            gt = totime / rate
            f.write(str(gt))
Example No. 15
    def main(self, hp_params):
        model_args = self.model_args
        data_args = self.data_args
        training_args = self.training_args

        # arguments manipulation
        if nni.get_experiment_id() != 'STANDALONE':
            training_args.output_dir = f"{training_args.output_dir}/{nni.get_experiment_id()}-{nni.get_trial_id()}"
        model_args.model_name_or_path = hp_params['backbone']
        training_args.learning_rate = hp_params['learning_rate']
        training_args.seed = hp_params['seed']
        if hp_params["max_seq_length"] > 384:
            training_args.per_device_train_batch_size = 2
            training_args.per_device_eval_batch_size = 2
            training_args.gradient_accumulation_steps = 16
            data_args.max_seq_length = hp_params["max_seq_length"]
        else:
            data_args.max_seq_length = hp_params["max_seq_length"]

        # get token classification task instance
        module = import_module("tasks")
        try:
            token_classification_task_clazz = getattr(module, model_args.task_type)
            token_classification_task: TokenClassificationTask = token_classification_task_clazz()
        except AttributeError:
            raise ValueError(
                f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
                f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
            )

        # label
        labels = token_classification_task.get_labels(data_args.labels)
        label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
        num_labels = len(labels)
        self.label_map = label_map

        # load pretrained model and tokenizer
        config = self._custom_config(
            model_args=model_args,
            num_labels=num_labels,
            id2label=label_map,
            label2id={label: i
                      for i, label in enumerate(labels)})
        tokenizer = self._custom_tokenizer(model_args=model_args)
        model = self._custom_model(model_args=model_args, config=config)

        #         config = AutoConfig.from_pretrained(
        #         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        #         num_labels=num_labels,
        #         id2label=label_map,
        #         label2id={label: i for i, label in enumerate(labels)},
        #         cache_dir=model_args.cache_dir,
        #         )
        #         tokenizer = AutoTokenizer.from_pretrained(
        #             model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        #             cache_dir=model_args.cache_dir,
        #             use_fast=model_args.use_fast,
        #         )
        #         model = AutoModelForTokenClassification.from_pretrained(
        #             model_args.model_name_or_path,
        #             from_tf=bool(".ckpt" in model_args.model_name_or_path),
        #             config=config,
        #             cache_dir=model_args.cache_dir,
        #         )

        # get dataset and data_collator
        train_dataset = (TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        ) if training_args.do_train else None)
        eval_dataset = (TokenClassificationDataset(
            token_classification_task=token_classification_task,
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        ) if training_args.do_eval else None)
        data_collator = DataCollatorForTokenClassification(tokenizer)

        # callbacks
        callbacks = [
            NNiCallback(hp_metric=training_args.metric_for_best_model,
                        greater_is_better=training_args.greater_is_better)
        ]

        # reset logging, eval and save step as EPOCHS explicitly
        steps_per_epoch = int(
            np.ceil(
                len(train_dataset) /
                (training_args.train_batch_size *
                 training_args.gradient_accumulation_steps)))
        training_args.logging_steps = steps_per_epoch
        training_args.save_steps = steps_per_epoch
        training_args.eval_steps = steps_per_epoch

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=callbacks,
            compute_metrics=self.compute_metrics,
        )

        # Training
        if training_args.do_train:
            trainer.train(model_path=model_args.model_name_or_path if os.path.
                          isdir(model_args.model_name_or_path) else None)
            trainer.save_model()
            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            if trainer.is_world_master():
                tokenizer.save_pretrained(training_args.output_dir)

        # Evaluation
        results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")

            result = trainer.evaluate()

            output_eval_file = os.path.join(training_args.output_dir,
                                            "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

                results.update(result)

        # Predict
        if training_args.do_predict:
            test_dataset = TokenClassificationDataset(
                token_classification_task=token_classification_task,
                data_dir=data_args.data_dir,
                tokenizer=tokenizer,
                labels=labels,
                model_type=config.model_type,
                max_seq_length=data_args.max_seq_length,
                overwrite_cache=data_args.overwrite_cache,
                mode=Split.test,
            )

            predictions, label_ids, metrics = trainer.predict(test_dataset)
            preds_list, _ = self.align_predictions(predictions, label_ids)

            output_test_results_file = os.path.join(training_args.output_dir,
                                                    "test_results.txt")
            if trainer.is_world_master():
                with open(output_test_results_file, "w") as writer:
                    for key, value in metrics.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            # Save predictions
            output_test_predictions_file = os.path.join(
                training_args.output_dir, "test_predictions.txt")
            if trainer.is_world_master():
                with open(output_test_predictions_file, "w") as writer:
                    with open(os.path.join(data_args.data_dir, "test.json"),
                              "r") as f:
                        docs = json.load(f)
                    for doc, preds in zip(docs, preds_list):
                        text = doc['_source']['text']
                        labels = doc['_source']['label_list']
                        preds = ' '.join(preds)
                        print(f"{text}\t{labels}\t{preds}", file=writer)

        # nni final result
        if training_args.greater_is_better:
            nni.report_final_result(max(METRICS))
        else:
            nni.report_final_result(min(METRICS))
Example No. 16
def get_experiment_id() -> str:
    return nni.get_experiment_id()
Example No. 17
        # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > epochs "] = 1
        # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > steps_per_epoch "] = 1
        # params["gs_research_workflow.time_series.gs_steps.model_steps:FitStep > validation_steps "] = 1
        # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > depth"] = 5
        # params["gs_research_workflow.time_series.models.inception_time:InceptionTime.HP > use_residual"] = True
        if cfg_alias_cls:
            params = {
                cfg_alias_cls.get_cfg_loc(k): v
                for k, v in params.items()
            }
        trial_uuid = generate_uuid()
        experiment_id = generate_uuid()
    else:
        os.environ[ENV_KEY_TRIAL_IN_NNI] = "1"
        params = nni.get_next_parameter()
        experiment_id = nni.get_experiment_id()
        trial_uuid = nni.get_trial_id()
        if cfg_alias_cls:
            params = {
                cfg_alias_cls.get_cfg_loc(k): v
                for k, v in params.items()
            }
    # unescape each item
    params = {k: unescape_nni_choice_item(v) for k, v in params.items()}

    yml_path = os.path.join(os.path.dirname(__file__), "../../..", args.cfg)
    if not os.path.isfile(yml_path):
        logger.error(f"Default cfg file {yml_path} is not existed!")
        sys.exit(0)

    trial_task = HPOTrialPodSideEnv(args.name, yml_path, params, trial_uuid,
Example No. 18
class NextLocParam:
    local_model_path = os.path.join(
        'data', 'cache', 'next_loc_{}_{}.model'.format(nni.get_experiment_id(),
                                                       nni.get_trial_id()))
    local_result_path = os.path.join('data', 'cache', 'next_loc_result.h5')
    top_n_list = [1, 2, 3, 4, 5, 10, 20]