Example #1
def get_criterion(opt, model, nll_weight):
    """ Return a suitable loss function. """
    if opt.mode == "SVI":
        rlog.info("\nLoss: NLL + KL")
        return SVILoss(model.get_kl_div, nll_weight=nll_weight)
    rlog.info("Loss: NLL \n")
    return nn.NLLLoss()
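For the plain nn.NLLLoss branch the returned criterion expects log-probabilities. A minimal usage sketch under that assumption (the training_step, model, and batch names below are hypothetical, not from the project):

import torch.nn.functional as F

def training_step(model, criterion, batch, targets):
    logits = model(batch)                     # hypothetical forward pass
    log_probs = F.log_softmax(logits, dim=1)  # nn.NLLLoss expects log-probs
    loss = criterion(log_probs, targets)
    loss.backward()
    return loss.item()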
Example #2
def run(opt):
    """ Entry Point. """

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt = game_settings_(opt)
    env, agent = experiment_factory(opt)

    rlog.info(ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    steps = 0
    for ep in range(1, opt.env.episodes + 1):
        steps = train_one_ep(
            env, agent, steps, opt.update_freq, opt.target_update_freq
        )

        if ep % opt.valid_freq == 0:
            rlog.traceAndLog(ep)
            validate(env, agent, opt.valid_episodes)
            rlog.traceAndLog(ep)
Example #3
def experiment_factory(opt, only_env=False):
    env = gym_wrapper.GymFromDMEnv(bsuite.load_from_id(opt.env.name))
    env = TorchWrapper(env, opt.device)
    if only_env:
        return env

    replay = ExperienceReplay(**opt.replay)
    layers = [
        reduce(lambda x, y: x * y, env.observation_space.shape),  # input
        *opt.estimator["layers"],  # hidden
        env.action_space.n,  # output
    ]
    estimator = MLP(layers, spectral=opt.spectral, **opt.estimator)
    estimator.to(opt.device)

    optimizer = getattr(torch.optim, opt.optim.name)(
        estimator.parameters(), **opt.optim.kwargs
    )
    policy_improvement = C51PolicyImprovement(
        estimator, opt.epsilon, env.action_space.n
    )
    policy_evaluation = C51PolicyEvaluation(estimator, optimizer, opt.gamma)
    rlog.info(replay)
    rlog.info(estimator)
    return env, (replay, policy_improvement, policy_evaluation)
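The reduce over env.observation_space.shape simply multiplies the dimensions together to get the flattened input size of the MLP. A tiny equivalence sketch with an illustrative shape (on Python 3.8+ math.prod computes the same thing):

import math
from functools import reduce

shape = (4, 84, 84)  # illustrative observation shape, not from the config
assert reduce(lambda x, y: x * y, shape) == math.prod(shape) == 28224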
Example #4
def get_model(opt, num_labels):
    """ Configure and return a model. """
    if opt.model == "baseline":
        model = Baseline(
            Head(),
            nn.LSTM(44 ** 2, hidden_size=opt.hidden_size),
            nn.Linear(opt.hidden_size, num_labels),
        )
    else:
        lstm_in = opt.window ** 2 * opt.topk
        idx2xy_partial = None
        if opt.use_coords:
            # increase the input for the xy coords
            lstm_in = (opt.window ** 2 + 2) * opt.topk
            idx2xy_partial = partial(idx2xy, k=opt.window, s=4)
        model = Glimpsy(
            partial(unfold, window=opt.window, stride=4),
            SparseAttention(opt.window ** 2, topk=opt.topk),
            nn.LSTM(lstm_in, hidden_size=opt.hidden_size),
            nn.Linear(opt.hidden_size, SyncedMNIST.num_labels),
            head=Head(),
            idx2xy=idx2xy_partial,
        )

    rlog.info(
        summary(
            model,
            torch.zeros((opt.batch_size, 10, 1, 64, 64)),
            show_input=True,
            show_hierarchical=True,
        ),
    )
    return model
Example #5
def get_optimizer(opt, estimator):
    # Create custom param groups
    if hasattr(opt.optim, "div_by_rho") and opt.optim.div_by_rho:
        assert (
            opt.estimator.args["spectral"] is not None
        ), "When dividing by rho you should hook at least one layer."
        assert all(
            s[-1] == "L"
            for s in str(opt.estimator.args["spectral"]).split(",")
        ), "Spectral norm layers should not be active when dividing the optim step."

        param_groups = [
            {
                "params": p,
                "name": n,
                "lr": opt.optim.args["lr"],
                "rho_idx": None,
            }
            for n, p in estimator.named_parameters()
        ]
        param_groups_ = [g for g in param_groups if "weight" in g["name"]]
        for k in estimator.get_spectral_norms().keys():
            param_groups_[int(k)]["rho_idx"] = k
    else:
        param_groups = estimator.parameters()

    optimizer = getattr(O, opt.optim.name)(param_groups, **opt.optim.args)

    if hasattr(opt.optim, "div_by_rho") and opt.optim.div_by_rho:
        rlog.info("Checking the groups are alright, alright, alright...")
        for group in optimizer.param_groups:
            rlog.info("{:<36} rho_idx={}".format(group["name"],
                                                 group["rho_idx"]))

    return optimizer
Example #6
def make_rlog(opt):
    """ Configure logger. """
    rlog.init("pff", path=opt.path, tensorboard=True)
    train_log = rlog.getLogger("pff.train")
    train_log.fmt = (
        "[{gen:03d}/{batch:04d}] acc={acc:2.2f}% | bestFit={bestFit:2.3f}"
        + ", unFit={unFit:2.3f} [μ={attnMean:2.3f}/σ={attnVar:2.3f}]"
    )
    if opt.model == "baseline":
        train_log.fmt = "[{batch:04d}] acc={acc:2.2f}%, loss={loss:2.3f}"
    msg = "Configuration:\n"
    for k, v in vars(opt).items():
        msg += f"   {k:16}:  {v}\n"
    rlog.info(msg)
    return train_log
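The fmt strings above use named fields, so the caller is expected to fill them via str.format with keyword arguments. A hypothetical sketch of emitting one such line (the metric values are placeholders, and whether the project calls it exactly this way is an assumption):

train_log = make_rlog(opt)
train_log.info(
    train_log.fmt.format(
        gen=3, batch=120, acc=91.25, bestFit=0.412,
        unFit=1.873, attnMean=0.051, attnVar=0.009,
    )
)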
Example #7
def load_policy(env, ckpt_path, opt):
    opt.action_cnt = env.action_space.n
    estimator = get_estimator(opt, env)
    agent_args = opt.agent.args
    agent_args["epsilon"] = 0.0  # purely max
    policy = AGENTS[opt.agent.name]["policy_improvement"](
        estimator, opt.action_cnt, **agent_args
    )
    idx = int(ckpt_path.stem.split("_")[1])
    rlog.info(f"Loading {ckpt_path.stem}")
    ckpt = ioutil.load_checkpoint(
        ckpt_path.parent, idx=idx, verbose=False, device=torch.device(opt.device)
    )

    if opt.estimator.args["spectral"] is not None:
        ioutil.special_conv_uv_buffer_fix(policy.estimator, ckpt["estimator_state"])
    policy.estimator.load_state_dict(ckpt["estimator_state"])
    return policy, idx
Example #8
def run(opt):
    """ Entry point of the experiment """

    # no need to run this for all the seeds
    if opt.run_id not in [0, 1, 2]:
        return

    # this is a bit of a hack, it would be nice to change it
    # when launching the experiment. It generally only affects the logger.
    if "JyxNorm" not in opt.experiment:
        opt.experiment += "--JyxNorm"

    rlog.init(opt.experiment, path=opt.out_dir, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("Jyx_norm_avg", metargs=["Jyx_norm", 1]),
        rlog.MaxMetric("Jyx_norm_max", metargs=["Jyx_norm"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt.device = "cuda" if torch.cuda.is_available() else "cpu"

    root = Path(opt.out_dir)
    ckpt_paths = sorted(root.glob("**/checkpoint*"))

    rlog.info("Begin empirical estimation of norm(Jyx).")
    rlog.info("Runing experiment on {}.".format(opt.device))
    rlog.info("Found {:3d} checkpoints.".format(len(ckpt_paths)))

    # Keep only the recorded max checkpoint if available, otherwise subsample.
    if (Path(opt.out_dir) / "max_ckpt").exists():
        ckpt_paths = [
            p
            for p in ckpt_paths
            if int(p.stem.split("_")[1])
            == int((Path(opt.out_dir) / "max_ckpt").read_text())
        ]
        rlog.info("IMPORTANT! Found max_ckpt @{}.".format(ckpt_paths[0]))
    else:
        if "MinAtar" in opt.game:
            ckpt_paths = ckpt_paths[0::3]
            rlog.warning("IMPORTANT! Sampling only every other third checkpoint.")
        else:
            ckpt_paths = ckpt_paths[0::5]
            rlog.warning("IMPORTANT! Sampling only every other fifth checkpoint.")

    for ckpt_path in ckpt_paths:
        env = get_env(opt, mode="testing")
        policy, step = load_policy(env, ckpt_path, deepcopy(opt))

        check_lipschitz_constant(policy, env, opt.valid_step_cnt)
        rlog.traceAndLog(step=step)
Example #9
def run(opt):
    """ Run experiment. This function is being launched by liftoff.
    """
    U.configure_logger(opt)

    # set seed
    opt.seed = (opt.run_id + 1) * opt.base_seed
    torch.manual_seed(opt.seed)

    # configure env
    env = ActionWrapper(TorchWrapper(gym.make(opt.env_name)))
    env.seed(opt.seed)

    # build estimator
    estimator = ActorCriticEstimator(
        env.observation_space.shape[0],
        env.action_space,
        hidden_size=opt.hidden_size,
    )
    # load checkpoint and reset
    rlog.info("Loading model from %s", opt.model_state)
    estimator.load_state_dict(torch.load(opt.model_state)["policy"])
    estimator.reset_policy()
    rlog.info("Policy reset.")
    if opt.freeze_critic:
        estimator.freeze_critic()
        rlog.info("Freezed feature extractor and critic.")

    # build the agent
    policy_improvement, policy_evaluation = build_agent(
        opt, env, estimator=estimator
    )

    # log
    rlog.info(f"\n{U.config_to_string(opt)}")
    rlog.info(policy_improvement)

    # train
    try:
        train(env, policy_improvement, policy_evaluation, opt)
    except Exception as err:
        rlog.error(clr(str(err), "red", attrs=["bold"]))
        raise
Example #10
def valid_stats(opt, model, dset):
    """ Stats on the validation data.
    """
    stats = {}
    stats["loss"], stats["acc"] = validate(
        DataLoader(dset, **vars(opt.val_loader)), model, opt.tst_mcs)

    if hasattr(opt, "log") and opt.log.mle_ish:
        # Use the means of the posterior to set a pseudo-MLE model.
        assert isinstance(
            model, SVIModel
        ), "This stat only makes sense for SVI models."
        model.sync_mle_model()
        rlog.info("Synced MLE model using means from posterior.")
        rlog.info("Compute accuracy with a pseudo-MLE model.")
        stats["lossMLE"], stats["accMLE"] = validate(
            DataLoader(dset, **vars(opt.val_loader)),
            model._mle_model,  # pylint: disable=protected-access
            0,
        )
    return stats
Example #11
def train_stats(opt, model, dset):
    """ Stats on the traning data.
    """
    stats = {}
    if isinstance(model, SVIModel):
        # Stats collected during training use a single sample from the
        # posterior. Therefore we check the accuracy once more, using the
        # same number of samples as on the validation set.
        stats["lossMC"], stats["accMC"] = validate(
            DataLoader(dset, **vars(opt.val_loader)), model, opt.tst_mcs)
    if hasattr(opt, "log") and opt.log.train_no_aug:
        # We also look at the accuracy on un-augmented training data.
        # This is done on both MLE and SVI
        rlog.info("Compute accuracy on un-augmented train data.")
        mc_samples = opt.tst_mcs if isinstance(model, SVIModel) else 0
        stats["lossNoAug"], stats["accNoAug"] = validate(
            DataLoader(get_unaugmented(dset), **vars(opt.val_loader)),
            model,
            mc_samples,
        )
    if hasattr(opt, "log") and opt.log.mle_ish:
        # Use the means of the posterior to set a pseudo-MLE model.
        assert isinstance(
            model, SVIModel
        ), "This stat only makes sense for SVI models."
        model.sync_mle_model()
        rlog.info("Synced MLE model using means from posterior.")
        rlog.info("Compute accuracy with a pseudo-MLE model.")
        stats["lossMLE"], stats["accMLE"] = validate(
            DataLoader(dset, **vars(opt.val_loader)),
            model._mle_model,  # pylint: disable=protected-access
            0,
        )
    return stats
Example #12
def checkpoint_agent(path, crt_step, save_replay=True, **kwargs):
    to_save = {"step": crt_step}
    replay_path = None
    for k, v in kwargs.items():
        if k == "replay":
            if save_replay:
                replay_path = v.save(path, crt_step, save_all=False)
        elif isinstance(v, (torch.nn.Module, torch.optim.Optimizer)):
            to_save[f"{k}_state"] = v.state_dict()
        elif isinstance(v, (Namespace, YamlNamespace)):
            to_save[k] = namespace_to_dict(v)
        else:
            to_save[k] = v

    with open(f"{path}/checkpoint_{crt_step:08d}.gz", "wb") as f:
        with GzipFile(fileobj=f) as outfile:
            torch.save(to_save, outfile)
    if replay_path is not None:
        shutil.copyfile(replay_path, Path(path) / "prev_replay.gz")

    rlog.info(
        "So, I have saved the agent's state"
        f"{'' if replay_path is not None else ' not'} including the experience replay."
    )
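A matching load-side sketch, assuming the same gzip layout written above (the load_checkpoint_sketch helper is hypothetical; the repo's own ioutil.load_checkpoint may behave differently):

from gzip import GzipFile

import torch

def load_checkpoint_sketch(path, crt_step):
    # read back the gzipped payload written by checkpoint_agent
    with open(f"{path}/checkpoint_{crt_step:08d}.gz", "rb") as f:
        with GzipFile(fileobj=f) as infile:
            return torch.load(infile, map_location="cpu")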
Example #13
def run(opt):
    """ Entry point """
    if "sRank" not in opt.experiment:
        opt.experiment += "--sRank"

    rlog.init(opt.experiment, path=opt.out_dir, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("avg_rank", metargs=["rank", 1]),
        # rlog.ValueMetric("rank", metargs=["rank"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt.device = "cuda" if torch.cuda.is_available() else "cpu"

    root = Path(opt.out_dir)
    ckpt_paths = sorted(root.glob("**/checkpoint*"))

    rlog.info("Begin empirical estimation of feature matrix rank.")
    rlog.info("Runing experiment on {}".format(opt.device))
    rlog.info("Found {:3d} checkpoints.".format(len(ckpt_paths)))

    # Subsample checkpoints: every third for MinAtar, every fifth otherwise.
    if "MinAtar" in opt.game:
        ckpt_paths = ckpt_paths[0::3]
        rlog.warning("IMPORTANT! Sampling only every other third checkpoint.")
    else:
        ckpt_paths = ckpt_paths[0::5]
        rlog.warning("IMPORTANT! Sampling only every other fifth checkpoint.")

    sampled_steps = min(opt.valid_step_cnt, opt.train_step_cnt)
    rlog.info(
        "Sampling {:6d} steps from the environment".format(sampled_steps))

    for ckpt_path in ckpt_paths:

        env = get_env(opt, mode="testing")
        policy, step = load_policy(env, ckpt_path, deepcopy(opt))
        check_effective_features_rank(policy, env, sampled_steps)

        rlog.traceAndLog(step=step)
Example #14
def run(opt):
    """ Entry point of the program. """

    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            ))

    ioutil.create_paths(opt)

    sticky_schedule = OrderedDict([(int(s), float(p))
                                   for (s, p) in opt.sticky_schedule])
    assert 1 in sticky_schedule

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    train_loggers = OrderedDict()
    for i, epoch in enumerate(sticky_schedule.keys()):
        train_loggers[epoch] = train_log = rlog.getLogger(
            f"{opt.experiment}.{i:d}")
        train_log.addMetrics(
            rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
            rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
            rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
            rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
            rlog.ValueMetric("trn_sticky_action_prob",
                             metargs=["trn_sticky_action_prob"]),
            rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
            rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
            rlog.SumMetric("val_ep_cnt", metargs=["done"]),
            rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
            rlog.FPSMetric("val_fps", metargs=["val_frames"]),
            rlog.ValueMetric("val_sticky_action_prob",
                             metargs=["val_sticky_action_prob"]),
        )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement,
          policy_evaluation) = experiment_factory(opt)

    rlog.info("\n\n{}\n\n{}\n\n{}".format(env, replay,
                                          policy_evaluation.estimator))
    rlog.info("\n\n{}\n\n{}".format(policy_improvement, policy_evaluation))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # resume if a previous checkpoint exists
    if Path(opt.out_dir).joinpath("replay.gz").is_file():

        # Sometimes the experiment is interrupted while saving the replay
        # buffer and the file gets corrupted, so we attempt to restore
        # from the previous checkpoint and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.estimator,
                                          ckpt["estimator_state"])
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.target_estimator,
                                          ckpt["target_estimator_state"])
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"])
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")
        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1
        # add some hardware and git info, log and save
        opt = ioutil.add_platform_info(opt)

    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training

    last_state = None  # used by train_one_epoch to know how to resume episode.
    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        last_sched_epoch = max(ep for ep in sticky_schedule if ep <= epoch)
        print(f"StickyActProb goes from {env.sticky_action_prob}"
              f" to {sticky_schedule[last_sched_epoch]}")
        env.sticky_action_prob = sticky_schedule[last_sched_epoch]
        crt_logger = train_loggers[last_sched_epoch]

        # train for opt.train_step_cnt steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            crt_logger,
            total_steps=steps,
            last_state=last_state,
        )
        crt_logger.put(trn_sticky_action_prob=env.sticky_action_prob)
        crt_logger.traceAndLog(epoch * opt.train_step_cnt)

        # validate for opt.valid_step_cnt steps
        for sched_epoch, eval_logger in train_loggers.items():
            eval_env = get_env(  # this doesn't work as intended because of the wrappers
                opt,
                mode="testing",
                sticky_action_prob=sticky_schedule[sched_epoch])
            eval_env.sticky_action_prob = sticky_schedule[sched_epoch]
            print(
                f"Evaluating on the env with sticky={eval_env.sticky_action_prob}."
            )
            validate(
                AGENTS[opt.agent.name]["policy_improvement"](
                    policy_improvement.estimator,
                    opt.action_cnt,
                    epsilon=opt.val_epsilon,
                ),
                eval_env,
                opt.valid_step_cnt,
                eval_logger,
            )
            eval_logger.put(
                val_sticky_action_prob=eval_env.sticky_action_prob
            )
            eval_logger.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )
Example #15
def freeze_critic(self):
    for module_name, module in self.named_modules():
        if "policy" not in module_name and module_name != "":
            rlog.info("Freezing %s", module_name)
            module.weight.requires_grad = False
            module.bias.requires_grad = False
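A somewhat more defensive variant, sketched under the same naming convention (policy-head parameters contain "policy" in their name), is to walk named_parameters() instead, which also skips container or activation modules that have no weight/bias attributes:

def freeze_critic(self):
    for param_name, param in self.named_parameters():
        if "policy" not in param_name:
            rlog.info("Freezing %s", param_name)
            param.requires_grad_(False)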
Example #16
def run(opt):
    torch.set_printoptions(precision=8, sci_mode=False)
    opt = augment_options(opt)
    configure_logger(opt)
    check_options_are_valid(opt)

    rlog.info(f"\n{config_to_string(opt)}")

    # configure the environment
    env = wrap_env(gym.make(opt.game), opt)

    # configure estimator and policy
    if hasattr(opt.estimator, 'categorical'):
        _s = opt.estimator.categorical.support
        support = [_s.min, _s.max, _s.bin_no]
        estimator = MiniGridFF(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
            support=support,
        ).cuda()
    elif opt.estimator.ff:
        estimator = MiniGridFF(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
        ).cuda()
    else:
        estimator = MiniGridNet(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
        ).cuda()

    if hasattr(opt.estimator, "ensemble"):
        # Build Bootstrapped Ensembles objects
        estimator = BootstrappedEstimator(estimator,
                                          **opt.estimator.ensemble.__dict__)
        policy_evaluation = BootstrappedPE(estimator,
                                           env.action_space.n,
                                           opt.exploration.__dict__,
                                           vote=True)
        if hasattr(opt.estimator, 'categorical'):
            policy_improvement = BootstrappedPI(
                wt.CategoricalPolicyImprovement(
                    estimator,
                    optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
                    opt.gamma,
                ),
                categorical=True)
        else:
            policy_improvement = BootstrappedPI(
                wt.DQNPolicyImprovement(
                    estimator,
                    optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
                    opt.gamma,
                    is_double=opt.double,
                ))
    elif hasattr(opt.estimator, "dropout"):
        # Build Variational Dropout objects
        estimator = MiniGridDropnet(
            opt.er.hist_len * 3,
            env.action_space.n,
            hidden_size=opt.estimator.lin_size,
            p=opt.estimator.dropout,
            mc_samples=opt.estimator.mc_samples,
        ).cuda()
        policy_evaluation = DropPE(
            estimator,
            env.action_space.n,
            epsilon=opt.exploration.__dict__,
            thompson=opt.estimator.thompson,
        )
        policy_improvement = DropPI(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
            is_double=opt.double,
        )
    elif hasattr(opt.estimator, "categorical"):
        policy_evaluation = wt.EpsilonGreedyPolicy(
            estimator, env.action_space.n, epsilon=opt.exploration.__dict__)
        policy_improvement = wt.CategoricalPolicyImprovement(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
        )
    else:
        policy_evaluation = wt.EpsilonGreedyPolicy(
            estimator, env.action_space.n, epsilon=opt.exploration.__dict__)
        policy_improvement = wt.DQNPolicyImprovement(
            estimator,
            optim.Adam(estimator.parameters(), lr=opt.lr, eps=1e-4),
            opt.gamma,
            is_double=opt.double,
        )

    policy = DQNPolicy(
        policy_evaluation,
        policy_improvement,
        wt.ExperienceReplay(**opt.er.__dict__)(),
        priority=opt.er.priority,
    )

    # additional info
    rlog.info(policy)
    rlog.info(estimator)

    # start training
    policy_iteration(env, policy, opt)
Example #17
        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        rlog.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = rlog.summarize()
            rlog.trace(step=step, **summary)
            # rlog.info(
            #     "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f}  |  rw/ep{R_per_ep:8.2f}.".format(
            #         step, **summary
            #     )
            # )
            # rlog.reset()
            rlog.traceAndLog(step)
            mean += 1

    rlog.trace("But we can continue tracing stuff manually...")
    # including structured entries, as long as we provide a `step` keyword argument
    rlog.trace(step=step, aux_loss=0.23)

    rlog.info("Run `tensorboard --logdir sota_results` to see the results.")


if __name__ == "__main__":
    main()
Example #18
def run(opt):
    """ Entry point of the program. """

    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            ))

    ioutil.create_paths(opt)

    rlog.init(opt.experiment,
              path=opt.out_dir,
              tensorboard=True,
              relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement,
          policy_evaluation) = experiment_factory(opt)

    guts = [
        env,
        replay,
        policy_evaluation.estimator,
        policy_evaluation.optimizer,
        policy_improvement,
        policy_evaluation,
    ]
    rlog.info(("\n\n{}" * len(guts)).format(*guts))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # resume if a previous checkpoint exists
    if Path(opt.out_dir).joinpath("replay.gz").is_file():

        # Sometimes the experiment is interrupted while saving the replay
        # buffer and the file gets corrupted, so we attempt to restore
        # from the previous checkpoint and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.estimator,
                                          ckpt["estimator_state"])
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.target_estimator,
                                          ckpt["target_estimator_state"])
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"])
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")
        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1
        # add some hardware and git info, log and save
        opt = ioutil.add_platform_info(opt)

    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training

    last_state = None  # used by train_one_epoch to know how to resume episode.
    for epoch in range(start_epoch, opt.epoch_cnt + 1):

        # train for opt.train_step_cnt steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            rlog.getRootLogger(),
            total_steps=steps,
            last_state=last_state,
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # validate for opt.valid_step_cnt steps
        validate(
            AGENTS[opt.agent.name]["policy_improvement"](
                policy_improvement.estimator,
                opt.action_cnt,
                epsilon=opt.val_epsilon),
            get_env(opt, mode="testing"),
            opt.valid_step_cnt,
            rlog.getRootLogger(),
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )
Example #19
def main():
    # get the root logger, preconfigured to log to the console,
    # to a text file, a pickle and a tensorboard protobuf.
    experiment_path = get_experiment_path()
    rlog.init("dqn", path=experiment_path, tensorboard=True)
    rlog.info("Logging application level stuff.")
    rlog.info("Log artifacts will be saved in %s", experiment_path)

    for step in range(5):
        # probably not a good idea to call this every step if it is a hot loop?
        # also this will not be logged to the console or to the text file
        # since the default log-level for these two is INFO.
        rlog.trace(step=step, aux_loss=7.23 - step)

    # but we can register metrics that will accumulate traced events
    # and summarize them. Each Metric accepts a name and some metargs
    # that tell it which arguments received by the `put` call below
    # to accumulate and summarize.
    rlog.addMetrics(
        # counts each time it receives a `done=True`, aka counts episodes
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        # sums up all the `reward=value` it receives and divides it
        # by the number of `done=True`, aka mean reward per episode
        rlog.AvgMetric("R_per_ep", metargs=["reward", "done"]),
        # same but keeps a running average instead (experimental).
        rlog.AvgMetric("RunR", eps=0.9, metargs=["reward", "done"]),
        # same as above but now we divide by the number of rewards
        rlog.AvgMetric("R_per_step", metargs=["reward", 1]),
        # same but with clipped rewards (to +- 1)
        rlog.AvgMetric("rw_per_ep", metargs=["clip(reward)", "done"]),
        # computes the no of frames per second
        rlog.FPSMetric("train_fps", metargs=["frame_no"]),
        # caches all the values it receives and inserts them into a
        # tensorboad.summary.histogram every time you call `log.trace`
        rlog.ValueMetric("gaussians", metargs=["sample"], tb_type="histogram"),
    )

    mean = 0
    for step in range(1, 300_001):

        # make a step in the "environment"
        reward, done = reward_following_policy(step)

        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        rlog.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = rlog.summarize()
            rlog.trace(step=step, **summary)
            # rlog.info(
            #     "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f}  |  rw/ep{R_per_ep:8.2f}.".format(
            #         step, **summary
            #     )
            # )
            # rlog.reset()
            rlog.traceAndLog(step)
            mean += 1
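The snippet assumes two helpers that are not shown, get_experiment_path() and reward_following_policy(step). A minimal hypothetical stand-in, just enough to make the demo runnable:

import random
import tempfile

def get_experiment_path():
    # hypothetical: any writable directory works for the demo
    return tempfile.mkdtemp(prefix="rlog_demo_")

def reward_following_policy(step):
    # hypothetical environment stub: noisy reward, episode ends every 100 steps
    reward = random.gauss(1.0, 0.1)
    done = step % 100 == 0
    return reward, done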
Example #20
def run(opt):
    """ Run experiment. This function is being launched by liftoff.
    """
    # logging
    trn_log, val_log = set_logger(opt)

    # model related stuff
    device = torch.device("cuda")
    trn_set, val_set, wmp_set = get_dsets(opt)
    model = get_model(opt, device)
    optimizer = getattr(optim, opt.optim.name)(model.parameters(),
                                               **vars(opt.optim.args))
    # batch_size
    batch_size = opt.trn_loader.batch_size

    rlog.info(U.config_to_string(opt))
    rlog.info("Model: %s", str(model))
    rlog.info("Optimizer: %s \n", str(optimizer))

    # Warm up the model on a partition of the training dataset
    if wmp_set is not None:
        rlog.info("Warming-up on dset of size %d", len(wmp_set))
        for epoch in range(opt.warmup.epochs):
            # train for one epoch
            trn_loss, trn_acc = train(
                DataLoader(wmp_set, **vars(opt.trn_loader)),
                model,
                optimizer,
                get_criterion(opt, model,
                              len(wmp_set) // batch_size),
                mc_samples=opt.trn_mcs,
            )

            val_stats = valid_stats(opt, model, val_set)
            trn_stats = train_stats(opt, model, wmp_set)
            trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

            # to pickle and tensorboard
            val_log.trace(step=epoch, **val_stats)
            trn_log.trace(step=epoch, **trn_stats)

            # to console
            for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
                log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

            # extra logging
            model_stats(opt, epoch, model)

        # maybe reset optimizer after warmup
        if opt.warmup.reset_optim:
            rlog.info("\nWarmup ended. Resetting optimizer.")
            optimizer = getattr(optim, opt.optim.name)(model.parameters(),
                                                       **vars(opt.optim.args))

    # Train on the full training dataset
    if wmp_set is not None:
        epochs = range(opt.warmup.epochs, opt.warmup.epochs + opt.epochs)
    else:
        epochs = range(opt.epochs)

    rlog.info("\nTraining on dset: %s", str(trn_set))
    for epoch in epochs:
        trn_loss, trn_acc = train(
            DataLoader(trn_set, **vars(opt.trn_loader)),
            model,
            optimizer,
            get_criterion(opt, model,
                          len(trn_set) // batch_size),
            mc_samples=opt.trn_mcs,
        )

        val_stats = valid_stats(opt, model, val_set)
        trn_stats = train_stats(opt, model, trn_set)
        trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

        # to pickle and tensorboard
        val_log.trace(step=epoch, **val_stats)
        trn_log.trace(step=epoch, **trn_stats)

        # to console
        for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
            log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

        # extra logging
        model_stats(opt, epoch, model)