Example #1
def fn_trainable(config, checkpoint_dir=None):
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.json"), "rt") as fp:
            state = json.load(fp)
    else:
        state = {"internal_iter": 0}

    for i in range(state["internal_iter"], config["max_iterations"]):
        state["internal_iter"] = i
        time.sleep(config["sleep_time"])

        if i % config["checkpoint_freq"] == 0:
            with tune.checkpoint_dir(step=i) as cd:
                with open(os.path.join(cd, "checkpoint.json"), "wt") as fp:
                    json.dump(state, fp)

        tune.report(
            score=i * 10 * config["score_multiplied"],
            internal_iter=state["internal_iter"],
        )
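A minimal launch sketch for the trainable above, assuming the legacy function-based tune.run API used throughout these examples; the config values and num_samples are illustrative.

from ray import tune

# Hypothetical launch call; the config keys mirror those read by fn_trainable above.
analysis = tune.run(
    fn_trainable,
    config={
        "max_iterations": 10,
        "sleep_time": 0.1,
        "checkpoint_freq": 2,
        "score_multiplied": 1,
    },
    num_samples=1,
)
print(analysis.get_best_config(metric="score", mode="max"))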
Example #2
    def _on_step(self):
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env,
                                                           n_eval_episodes=self.n_eval_episodes,
                                                           render=False,
                                                           deterministic=self.deterministic,
                                                           return_episode_rewards=True)

        episode_reward_mean, std_reward = np.mean(
            episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(
            episode_lengths), np.std(episode_lengths)

        tune.report(
            episode_reward_mean=episode_reward_mean,
            std_reward=std_reward,
            mean_ep_length=mean_ep_length,
            std_ep_length=std_ep_length
        )

        # Stable-Baselines callbacks are expected to return a bool; True keeps training going.
        return True
Example #3
    def tune_function(config, checkpoint_dir=None):
        trainer = Trainer(
            backend=backend_config,
            num_workers=num_workers,
            use_gpu=use_gpu,
            resources_per_worker=resources_per_worker,
        )

        trainer.start()

        iterator = trainer.run_iterator(
            train_func, config, dataset=dataset, checkpoint=checkpoint_dir
        )

        for results in iterator:
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        trainer.shutdown()
Example #4
def train(config, checkpoint=None):
    step = 0
    if checkpoint:
        with open(checkpoint) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        if timestep % 3 == 0:
            checkpoint_dir = tune.make_checkpoint_dir(step=timestep)
            path = os.path.join(checkpoint_dir, "checkpoint")
            with open(path, "w") as f:
                f.write(json.dumps({"timestep": timestep}))
            tune.save_checkpoint(path)

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
Example #5
def eval_single_epoch(model: torch.nn.Module, loss_function: torch.nn.Module,
                      data_loader: torch.utils.data.DataLoader):
    # switch to evaluate mode
    model.eval()

    accuracy_total = 0

    for data in data_loader:
        X, y = data
        X, y = X.to(device), y.to(device)
        output = model(X)  #.view(-1, 4096)
        loss = loss_function(output, y)
        accuracy_total += accuracy(y, output)

    accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)

    ####### tune ############
    tune.report(mean_accuracy=accuracy_avg)

    return {'Eval epoch accuracy': accuracy_avg}
Example #6
        def MockTrainingFuncSync(config, checkpoint_dir=None):
            iter = 0

            if checkpoint_dir:
                checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
                with open(checkpoint_path, "rb") as fp:
                    a, iter = pickle.load(fp)

            a = config["a"]  # Use the new hyperparameter if perturbed.

            while True:
                iter += 1
                with tune.checkpoint_dir(step=iter) as checkpoint_dir:
                    checkpoint_path = os.path.join(checkpoint_dir,
                                                   "checkpoint")
                    with open(checkpoint_path, "wb") as fp:
                        pickle.dump((a, iter), fp)
                # Score gets better every iteration.
                time.sleep(1)
                tune.report(mean_accuracy=iter + a, a=a)
Example #7
def train(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across slots, broadcast parameters and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max -
                                x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()

        optimizer.step()
        time.sleep(0.1)
        tune.report(loss=loss.item())
    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
Example #8
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
Example #9
def training_function(config):
    master = A3CMaster()
    master.learning_rate = config["learning_rate"]
    master.beta = config["beta"]
    master.gamma = config["gamma"]
    '''worker = A3CWorker(master.master_model, master.optimizer, 0, master.folder,
                             master.beta, master.gamma, opponent_model_path=master.opponent_model_path)
     worker.run()'''
    """a3c_workers = [A3CWorker(master.master_model, master.optimizer, worker_id, master.folder,
                             master.beta, master.gamma, opponent_model_path=master.opponent_model_path)
                   for worker_id in range(2)]
    for i, worker in enumerate(a3c_workers):
        worker.start()
    [worker.join() for worker in a3c_workers]"""
    master.train()
    agent_test = AgentTest(master, RandomPlayer(), 0)
    reward = agent_test.play()
    # del a3c_workers
    del master

    tune.report(reward=reward)
Example #10
def train(config, checkpoint_dir=None):
    step = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        # Checkpoint the state of the training every 3 steps
        # Note that this is only required for certain schedulers
        if timestep % 3 == 0:
            with tune.checkpoint_dir(step=timestep) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                with open(path, "w") as f:
                    f.write(json.dumps({"timestep": timestep}))

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
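As the comment above notes, periodic checkpoints matter for schedulers that clone trials, such as Population Based Training. A hedged sketch of how this trainable might be paired with PBT (values are illustrative):

import random
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="episode_reward_mean",
    mode="max",
    perturbation_interval=3,
    # Perturb the two hyperparameters the trainable reads from its config.
    hyperparam_mutations={
        "width": lambda: random.uniform(1, 10),
        "height": lambda: random.uniform(1, 100),
    },
)
analysis = tune.run(
    train,
    scheduler=pbt,
    num_samples=4,
    config={"width": 5, "height": 10},
)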
Example #11
def function_trainable(config):
    num_iters = int(config["num_iters"])
    sleep_time = config["sleep_time"]
    score = config["score"]

    checkpoint_iters = config["checkpoint_iters"]
    checkpoint_size_b = config["checkpoint_size_b"]
    checkpoint_num_items = checkpoint_size_b // 8  # np.float64

    for i in range(num_iters):
        if checkpoint_iters >= 0 and checkpoint_size_b > 0 and \
           i % checkpoint_iters == 0:
            with tune.checkpoint_dir(step=i) as checkpoint_dir:
                checkpoint_file = os.path.join(checkpoint_dir, "bogus.ckpt")
                checkpoint_data = np.random.uniform(
                    0, 1, size=checkpoint_num_items)
                with open(checkpoint_file, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

        tune.report(score=i + score)
        time.sleep(sleep_time)
Example #12
def train_figgie(config, checkpoint_dir=None):
    with open(r"/home/jmd6724/Documents/Figgie/ann/training_data.pickle", 'rb') as file:
        all_data = pickle.load(file)
    point = int(9 / 10 * len(all_data))
    train_data = FiggieDataSet(all_data[:point])
    test_data = FiggieDataSet(all_data[point:])
    train_set = DataLoader(train_data, batch_size=64, shuffle=True)
    test_set = DataLoader(test_data, batch_size=64, shuffle=False)

    model = Net(config['l1'], config['l2'], config['l3'])
    loss_function = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=config['lr'])

    for epoch in range(10):
        train(model, optimizer, loss_function, train_set)
        acc = test(model, test_set)
        tune.report(mean_accuracy=acc)

        if epoch % 5 == 0:
            torch.save(model.state_dict(), r"/home/jmd6724/Documents/Figgie/ann/model_{}_{}_{}_{}.pth"
                       .format(config['l1'], config['l2'], config['l3'], acc))
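A hedged sketch of a search space matching the config keys read above; the layer sizes, learning-rate range, and num_samples are illustrative.

from ray import tune

analysis = tune.run(
    train_figgie,
    config={
        "l1": tune.choice([64, 128, 256]),
        "l2": tune.choice([64, 128, 256]),
        "l3": tune.choice([32, 64, 128]),
        "lr": tune.loguniform(1e-4, 1e-1),
    },
    num_samples=8,
)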
Example #13
    def _run_experiment(self, config, hyperopt_dict):
        trial_id = tune.get_trial_id()
        gpus_ids = ray.get_gpu_ids()
        if gpus_ids:
            gpus = ",".join(str(id) for id in gpus_ids)
        else:
            gpus = None
        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config)
        hyperopt_dict["config"] = modified_config
        hyperopt_dict[
            "experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
        hyperopt_dict["gpus"] = gpus

        train_stats, eval_stats = run_experiment(**hyperopt_dict)
        metric_score = self.get_metric_score(train_stats, eval_stats)

        tune.report(parameters=str(config),
                    metric_score=metric_score,
                    training_stats=str(train_stats),
                    eval_stats=str(eval_stats))
Example #14
def worker_function(inner_ex_config, config):
    """
    Combines experiment config and auto-generated Ray config, and runs an iteration of
    inner_ex on that combined config.

    :param inner_ex_config: The current values of inner experiment config, including
    any modifications we might have made in a macro_experiment config update
    :param config: Config generated by Ray tune
    :return:
    """
    from inner_experiment import inner_ex
    # Something that runs inner_ex by combining "base" config and ray experiment config
    inner_ex_dict = dict(inner_ex_config)
    merged_config = update(inner_ex_dict, config)

    # This will create an observer in the Tune trial directory, meaning that
    # inner experiment configs will be saved at <trial.log_dir>/1
    observer = FileStorageObserver.create(tune.get_trial_dir())
    inner_ex.observers.append(observer)
    ret_val = inner_ex.run(config_updates=merged_config)
    tune.report(accuracy=ret_val.result)
Example #15
        def train(config, checkpoint_dir=None):
            start = i = 0
            if checkpoint_dir:
                with open(os.path.join(checkpoint_dir, "checkpoint.json"), "rt") as fp:
                    state = json.load(fp)
                    start = state["step"] + 1

            for i in range(start, start + 10):
                with tune.checkpoint_dir(i) as d:
                    with open(os.path.join(d, "checkpoint.json"), "wt") as fp:
                        json.dump({"step": i}, fp)
                tune.report(step=i)

            # These indicators will tell us if all trials saved their
            # checkpoints.
            with open(f"/cluster/shared/indicator.{tune.get_trial_id()}", "wt") as fp:
                fp.write("")

            # We continue training (without saving checkpoints) to make sure
            # that Tune's result handling is triggered (so that the
            # FailOnIndicator callback is invoked).
            time.sleep(6)
            tune.report(step=i + 1)
            time.sleep(6)

            if start == 0:
                # If this is the first round, we just sleep for some time
                # to make sure that the driver exits first (via the
                # FailOnIndicator)
                tune.report(step=i + 2)
                time.sleep(120)
Example #16
        def train_func(config):
            train_data = ray.get(data_id)
            val_data = ray.get(validation_data_id)
            config = convert_bayes_configs(config).copy()
            if not isinstance(model_builder, ModelBuilder):
                raise ValueError(f"You must input a ModelBuilder instance for model_builder")
            trial_model = model_builder.build(config)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward = None
            for i in range(1, 101):
                result = trial_model.fit_eval(data=train_data,
                                              validation_data=val_data,
                                              mc=mc,
                                              metric=metric,
                                              **config)
                reward = result
                checkpoint_filename = "best.ckpt"

                # Save best reward iteration
                mode = Evaluator.get_metric_mode(metric)
                if mode == "max":
                    has_best_reward = best_reward is None or reward > best_reward
                else:
                    has_best_reward = best_reward is None or reward < best_reward

                if has_best_reward:
                    best_reward = reward
                    trial_model.save(checkpoint_filename)
                    # Save to hdfs
                    if remote_dir is not None:
                        put_ckpt_hdfs(remote_dir, checkpoint_filename)

                report_dict = {"training_iteration": i,
                               metric: reward,
                               "checkpoint": checkpoint_filename,
                               "best_" + metric: best_reward}
                tune.report(**report_dict)
Example #17
        def ray_fit(config):
            val_log_liks = []
            splitter = KFold(n_splits=n_splits)
            for (train_ind, val_ind) in splitter.split(X=train_context,
                                                       y=train_inputs):
                train_inputs_, train_context_ = train_inputs[
                    train_ind], train_context[train_ind]
                val_inputs_, val_context_ = train_inputs[
                    val_ind], train_context[val_ind]

                flow = cls(inputs_size=self.inputs_size,
                           context_size=self.context_size,
                           device=self.device,
                           context_normalization=self.context_normalization,
                           inputs_normalization=self.inputs_normalization,
                           cat_context=self.cat_context,
                           **config)
                flow.fit(train_inputs_, train_context_, False)
                val_log_liks.append(
                    flow.log_prob(val_inputs_, val_context_).mean())

            tune.report(log_lik=np.mean(val_log_liks))
Example #18
        def post_epoch_actions(trainer_instance: Engine):

            # evaluate model on validation set
            evaluator.run(val_loader)
            state_val_metrics = evaluator.state.metrics

            current_epoch: int = trainer_instance.state.epoch

            with tune.checkpoint_dir(current_epoch) as local_checkpoint_dir:
                # save model, optimizer and trainer checkpoints
                path = os.path.join(local_checkpoint_dir, "checkpoint")
                torch.save(
                    (model.state_dict(), optimizer.state_dict(),
                     trainer_instance.state_dict(), evaluator.state_dict()),
                    path)

            # report validation scores to ray-tune
            report_dict: dict = {
                **state_val_metrics, "done": current_epoch == epochs
            }

            tune.report(**report_dict)
Example #19
    def _do_eval(self):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v)
                    )

        # Remove extra memory cache of main process due to evaluation
        torch.cuda.empty_cache()

        self.step += 1

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and will potentially be passed as the `checkpoint_dir`
        # parameter in future iterations.
        with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
            additional_state = {"iteration": int(self.trainer.iter)}
            Checkpointer(
                # Assume you want to save checkpoints together with logs/statistics
                self.trainer.model,
                checkpoint_dir,
                save_to_disk=True,
                optimizer=self.trainer.optimizer,
                scheduler=self.trainer.scheduler,
            ).save(name="checkpoint", **additional_state)

        metrics = dict(r1=results['Rank-1'], map=results['mAP'], score=(results['Rank-1'] + results['mAP']) / 2)
        tune.report(**metrics)
Example #20
def train_breast_cancer(config):
    # Load dataset
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25)
    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier
    bst = xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        callbacks=[XGBCallback])
    # Predict labels for the test set
    preds = bst.predict(test_set)
    pred_labels = np.rint(preds)
    # Return prediction accuracy
    accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
    tune.report(mean_accuracy=accuracy, done=True)
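A hedged sketch of an XGBoost search space for the trainable above; the parameter ranges and num_samples are illustrative.

from ray import tune

config = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    # Tune the tree depth, child weight, and learning rate.
    "max_depth": tune.randint(1, 9),
    "min_child_weight": tune.choice([1, 2, 3]),
    "eta": tune.loguniform(1e-4, 1e-1),
}
analysis = tune.run(
    train_breast_cancer,
    config=config,
    num_samples=10,
)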
Example #21
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
Example #22
def train_mnist(config):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, test_loader = get_data_loaders()

    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(model.parameters(),
                          lr=config["lr"],
                          momentum=config["momentum"])

    for _i in range(10):
        train(model, optimizer, train_loader, device=device)
        acc = test(model, test_loader, device=device)

        # When using WandbLogger, the metrics reported to tune are also logged in the W&B dashboard
        tune.report(mean_accuracy=acc)

        # @wandb_mixin enables logging custom metric using wandb.log()
        error_rate = 100 * (1 - acc)
        wandb.log({"error_rate": error_rate})
Example #23
def nas_report(study, trial):
    best_session = study.best_trials[0]
    print("Trial stats (#{}):    Loss={}    Accuracy={}".format(
        trial.number, *(list(best_session.values))))
    print("Best params so far (#{}):    {}".format(best_session.number,
                                                   best_session.params))

    finished_trials = list(
        filter((lambda trial: trial.state.is_finished()), study.trials))

    model_state = {}
    with tune.checkpoint_dir(step=best_session.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        model_state = torch.load(path)

    with tune.checkpoint_dir(step=trial.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((best_session.params, model_state), path)

    result_zip = zip(["loss", "accuracy"], list(best_session.values))
    results = {p: v for p, v in result_zip}
    tune.report(**results)
Example #24
def run_parameterised_experiment(config):
    # Hyperparameters
    trial_dir = tune.get_trial_dir()
    problem, method, other_config = config["main_params"]
    n_workers = config["n_workers"]

    experiment = CartpoleExperiment()
    experiment.nn_path = other_config[
        "folder"]  # nn_paths_cartpole[other_config["nn_path"]]
    experiment.tau = other_config["tau"]
    if other_config["template"] == 2:  # octagon
        experiment.analysis_template = Experiment.octagon(
            experiment.env_input_size)
    elif other_config["template"] == 0:  # box
        experiment.analysis_template = Experiment.box(
            experiment.env_input_size)
    else:
        _, template = experiment.get_template(1)
        experiment.analysis_template = template  # standard
    experiment.n_workers = n_workers
    experiment.show_progressbar = False
    experiment.show_progress_plot = False
    # experiment.use_rounding = False
    experiment.save_dir = trial_dir
    experiment.update_progress_fn = update_progress
    elapsed_seconds, safe, max_t = experiment.run_experiment()

    if safe is None:
        safe_value = 0
    elif safe:
        safe_value = 1
    else:
        safe_value = -1
    tune.report(elapsed_seconds=elapsed_seconds,
                safe=safe_value,
                max_t=max_t,
                done=True)
Example #25
def ntk_experiment(config=None,
                   checkpoint_dir=None,
                   n_inputs=100,
                   n_inits=100,
                   repetitions=1000,
                   input_chunk_size=50):
    """Compute error in LeCun/NTK initialization empirically."""

    logging.basicConfig(level=logging.INFO)

    n_chunks = max(1, n_inputs // input_chunk_size)
    n_inputs = input_chunk_size * n_chunks

    with np_random_seed():
        data_all = np.random.randn(n_inputs, 1)

    for init in range(n_inits):
        exp = Experiment()
        print("GPUs", tf.config.experimental.list_physical_devices("GPU"))
        print("GPU", tf.test.gpu_device_name())

        for chunk in range(n_chunks):
            data = data_all[chunk * input_chunk_size:(chunk + 1) *
                            input_chunk_size]
            out = exp.model_correct.predict(data)
            delta = exp.compute_error(data, repetitions=repetitions)

            for inp in range(input_chunk_size):
                tune.report(**{
                    'input': data[inp, 0],
                    'out': out[inp, 0],
                    'delta_mean': np.mean(delta[inp]),
                    'delta_std': np.std(delta[inp]),
                    'n_init': init,
                    'n_inp': inp + chunk * input_chunk_size,
                    'inp_chunk': chunk
                })
        del exp
Example #26
def train_cifar_100(config):

    config = fill_config(config)
    
    # Data Setup
    train_loader, val_loader = get_data_loaders(round(config['batch_size']))
    

    # Model Setup
    model = models.resnet18()
    model.fc = nn.Linear(512, 100, bias=True)
    model = model.to(DEVICE)

    # Optimizer
    optimizer = optim.SGD(
        model.parameters(), 
        lr=config["lr"], 
        momentum=config["momentum"],
        weight_decay=config["weight_decay"]
    )
    
    # LR Scheduler
    scheduler = optim.lr_scheduler.StepLR(optimizer, round(config['step']), gamma=cf.SCHEDULER_GAMMA)

    # Loss Criterion
    criterion = nn.CrossEntropyLoss()
    
    while True:
        train_acc = train(model, optimizer, criterion, train_loader)
        val_acc = test(model, val_loader)
        scheduler.step()

        # Send the current training result back to Tune
        print('[log] time: ', time() - START_TIME)
        print('[log] ram: ', psutil.virtual_memory().used / (1024 ** 3) - START_RAM)
        print('[log] val_acc: ', val_acc)
        print('[log] train_acc: ', train_acc)
        tune.report(mean_accuracy=val_acc, train_acc=train_acc)
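Since the model is moved to DEVICE, a GPU can be requested per trial. A hedged launch sketch with illustrative ranges for the config keys read above; the stop criterion is needed because the trainable loops forever.

from ray import tune

analysis = tune.run(
    train_cifar_100,
    resources_per_trial={"cpu": 2, "gpu": 1},
    config={
        "lr": tune.loguniform(1e-3, 1e-1),
        "momentum": tune.uniform(0.8, 0.99),
        "weight_decay": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256]),
        "step": tune.choice([10, 20, 30]),
    },
    num_samples=4,
    stop={"training_iteration": 50},
)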
Example #27
def mnist_pt_objective(config):
    model = NumberNet(config)
    trainer = pl.Trainer(max_epochs=config['epochs'],
                         gpus=1,
                         auto_select_gpus=True)
    trainer.fit(model)
    trainer.test(model)
    tune.report(test_loss=model.test_loss)
    fmodel = fb.PyTorchModel(model, bounds=(0, 1))
    images, labels = fb.utils.samples(fmodel,
                                      dataset='mnist',
                                      batchsize=config['batch_size'])
    clean_accuracy = fb.utils.accuracy(fmodel, images, labels)
    attack = fb.attacks.SaltAndPepperNoiseAttack()
    epsilons = [
        0.0,
        0.0002,
        0.0005,
        0.0008,
        0.001,
        0.0015,
        0.002,
        0.003,
        0.01,
        0.1,
        0.3,
        0.5,
        1.0,
    ]
    raw_advs, clipped_advs, success = attack(fmodel,
                                             images,
                                             labels,
                                             epsilons=epsilons)
    robust_accuracy = 1 - success.cpu().numpy().astype(float).flatten().mean(
        axis=-1)
    # Robust accuracy: fraction of samples the attack fails on, averaged over all epsilons.
    tune.report(robust_acc=robust_accuracy)
    return robust_accuracy
Example #28
def test(net):
    env = gym.make(env_name)
    performance = []

    for _ in range(20):
        obs = env.reset()

        next_obs = None
        reward = 0
        total_reward = 0
        done = False
        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            total_reward += reward
            # env.render()
            if done:
                performance.append(total_reward)
    performance = mean(performance)
    tune.report(reward_avg=performance)
Example #29
    def train(self, epochs, global_step=0):

        for epoch in range(global_step, epochs + global_step):
            self.model.train()
            all_probs = []
            all_labels = []
            running_loss = 0.0

            for data, labels in self.train_loader:
                data, labels = data.to(self.device), labels.to(self.device)
                if self.after_load_cb:
                    data = self.after_load_cb(data)

                self.optimizer.zero_grad()

                outputs = self.model(data)

                loss = self.loss_fn(outputs, labels)
                running_loss += loss.item()

                probs = F.softmax(outputs, dim=1)
                all_probs.append(probs.cpu().detach().numpy())
                all_labels.append(labels.cpu().numpy())

                loss.backward()
                self.optimizer.step()

            all_probs = np.concatenate(all_probs)
            all_labels = np.concatenate(all_labels)
            train_metrics = self.calc_metrics(
                all_probs, all_labels,
                running_loss / len(self.train_loader.dataset), "train")
            val_metrics = self.evaluate()

            self.save(epoch)
            metrics = {**train_metrics, **val_metrics}
            tune.report(**metrics)
Example #30
    def _do_eval(self):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(
                results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v))

        # Remove extra memory cache of main process due to evaluation
        torch.cuda.empty_cache()

        self.step += 1

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and will potentially be passed as the `checkpoint_dir`
        # parameter in future iterations.
        with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
            additional_state = {"epoch": int(self.trainer.epoch)}
            # Change path of save dir where tune can find
            self.trainer.checkpointer.save_dir = checkpoint_dir
            self.trainer.checkpointer.save(name="checkpoint",
                                           **additional_state)

        metrics = dict(r1=results["Rank-1"],
                       map=results["mAP"],
                       score=(results["Rank-1"] + results["mAP"]) / 2)
        tune.report(**metrics)