Пример #1
0
def configure_logger(opt):
    """ Configures the logger.
    """
    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    train_log = rlog.getLogger(opt.experiment + ".train")
    train_log.addMetrics(
        rlog.AvgMetric("R_ep", metargs=["reward", "done"]),
        rlog.AvgMetric("V_step", metargs=["value", 1]),
        rlog.AvgMetric("v_mse_loss", metargs=["v_mse", 1]),
        rlog.AvgMetric("v_hub_loss", metargs=["v_hub", 1]),
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        rlog.AvgMetric("steps_ep", metargs=["step_no", "done"]),
        rlog.FPSMetric("fps", metargs=["frame_no"]),
    )
    train_log.log_fmt = (
        "[{0:6d}/{ep_cnt:5d}] R/ep={R_ep:8.2f}, V/step={V_step:8.2f}" +
        " | steps/ep={steps_ep:8.2f}, fps={fps:8.2f}.")
    val_log = rlog.getLogger(opt.experiment + ".valid")
    val_log.addMetrics(
        rlog.AvgMetric("R_ep", metargs=["reward", "done"]),
        rlog.AvgMetric("RR_ep",
                       resetable=False,
                       eps=0.8,
                       metargs=["reward", "done"]),
        rlog.AvgMetric("V_step", metargs=["value", 1]),
        rlog.AvgMetric("steps_ep", metargs=["frame_no", "done"]),
        rlog.FPSMetric("fps", metargs=["frame_no"]),
    )
    if hasattr(opt.log, "detailed") and opt.log.detailed:
        val_log.addMetrics(
            rlog.ValueMetric("Vhist", metargs=["value"], tb_type="histogram"))
    val_log.log_fmt = (
        "@{0:6d}        R/ep={R_ep:8.2f}, RunR/ep={RR_ep:8.2f}" +
        " | steps/ep={steps_ep:8.2f}, fps={fps:8.2f}.")
Пример #2
0
def set_logger(opt):
    """ Configure logger.
    """
    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    trn_log = rlog.getLogger(opt.experiment + ".train")
    val_log = rlog.getLogger(opt.experiment + ".valid")
    trn_log.fmt = "[{:03d}][TRN]  acc={:5.2f}%  loss={:5.2f}"
    val_log.fmt = "[{:03d}][VAL]  acc={:5.2f}%  loss={:5.2f}"
    # add histogram support
    if hasattr(opt, "log") and opt.log.detailed:
        mdl_log = rlog.getLogger(opt.experiment + ".model")
        mdl_log.addMetrics(
            rlog.ValueMetric("std", metargs=["std"], tb_type="histogram"),
            rlog.ValueMetric("mu", metargs=["mu"], tb_type="histogram"),
        )
    return trn_log, val_log
Пример #3
0
def configure_logger(opt):
    rlog.init(opt.experiment, path=opt.out_dir)
    train_log = rlog.getLogger(opt.experiment + ".train")
    train_log.addMetrics([
        rlog.AvgMetric("R/ep", metargs=["reward", "done"]),
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        rlog.AvgMetric("steps/ep", metargs=["step_no", "done"]),
        rlog.FPSMetric("learning_fps", metargs=["frame_no"]),
    ])
    test_log = rlog.getLogger(opt.experiment + ".test")
    test_log.addMetrics([
        rlog.AvgMetric("R/ep", metargs=["reward", "done"]),
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        rlog.AvgMetric("steps/ep", metargs=["frame_no", "done"]),
        rlog.FPSMetric("test_fps", metargs=["frame_no"]),
        rlog.MaxMetric("max_q", metargs=["qval"]),
    ])
Пример #4
0
def model_stats(opt, epoch, model):
    """ Log additional model stats.
    """
    log = rlog.getLogger(opt.experiment + ".model")
    if hasattr(opt, "log") and opt.log.detailed:
        # log histogram also
        assert isinstance(
            model, SVIModel), "This stat only makes sense for SVI models."
        for mu, std in zip(model.mu(), model.std()):
            log.put(mu=mu, std=std)
        log.trace(step=epoch, **model.summarize())
        log.reset()
Пример #5
0
def main(cmdl):
    """ Entry point.
    """
    opt = read_config(Path(cmdl.experiment_path) / "cfg.yaml")
    chkpt_paths = sorted(
        Path(cmdl.experiment_path).glob("*.pth"),
        key=lambda path: int(path.stem.split("_")[2]),
    )
    chkpt_paths = [(int(path.stem.split("_")[2]), path)
                   for path in chkpt_paths]

    print(config_to_string(cmdl))
    print(config_to_string(opt))

    if cmdl.build_val_dset:
        perf = [(torch.load(path)["R/ep"], path) for _, path in chkpt_paths]
        best_score, path = max(perf, key=lambda x: x[0])
        print(f"Loading {path} with total return {best_score}.")
        env, policy = configure_eval(cmdl, opt, path)
        achlioptas = _get_achlioptas(8, 4)
        val_dset = build_validation_dset(
            env,
            policy,
            opt.gamma,
            partial(_hash, decimals=cmdl.decimals, rnd_proj=achlioptas),
        )

        val_dset["meta"]["agent"] = path
        val_dset["meta"]["decimals"] = cmdl.decimals
        val_dset["meta"]["rnd_proj"] = achlioptas
        for k, v in val_dset["meta"].items():
            print(f"{k:12}", v)
        torch.save(val_dset, f"./val_dsets/{env.spec.id}.pkl")
    elif cmdl.offline_validation:
        rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
        log = rlog.getLogger(opt.experiment + ".off_valid")
        log.addMetrics([
            rlog.AvgMetric("V_step", metargs=["value", 1]),
            rlog.AvgMetric("off_mse", metargs=["off_mse", 1]),
        ])
        log.info("Loading dataset...")
        dset = torch.load(f"./val_dsets/{opt.env_name}.pkl")
        for step, path in chkpt_paths:
            env, policy = configure_eval(cmdl, opt, path)
            offline_validation(step, policy, dset, opt)
    else:
        for step, path in chkpt_paths:
            env, policy = configure_eval(cmdl, opt, path)
            avg_return = greedy_validation(env, policy, opt.gamma)
            print("[{0:8d}]   R/ep={1:8.2f}.".format(step, avg_return))
Пример #6
0
def make_rlog(opt):
    """ Configure logger. """
    rlog.init("pff", path=opt.path, tensorboard=True)
    train_log = rlog.getLogger("pff.train")
    train_log.fmt = (
        "[{gen:03d}/{batch:04d}] acc={acc:2.2f}% | bestFit={bestFit:2.3f}"
        + ", unFit={unFit:2.3f} [μ={attnMean:2.3f}/σ={attnVar:2.3f}]"
    )
    if opt.model == "baseline":
        train_log.fmt = "[{batch:04d}] acc={acc:2.2f}%, loss={loss:2.3f}"
    msg = "Configuration:\n"
    for k, v in vars(opt).items():
        msg += f"   {k:16}:  {v}\n"
    rlog.info(msg)
    return train_log
Пример #7
0
def offline_validation(crt_step, policy, dset, opt):
    log = rlog.getLogger(opt.experiment + ".off_valid")
    log.info(f"Validating chkp@{crt_step:08d} on offline data...")

    # Iterate through the data
    for step, (state, stats) in enumerate(dset["dset"].items()):
        with torch.no_grad():
            pi = policy(state)
        loss = F.mse_loss(pi.value, torch.tensor([[stats["Gt"]]]))

        log.put(value=pi.value.squeeze().item(), off_mse=loss.squeeze().item())

        if step % 100_000 == 0 and step != 0:
            fstr = "partial@ {0:8d} V/step={V_step:8.2f}, off_mse/ep={off_mse:8.2f}"
            log.info(fstr.format(step, **log.summarize()))
Пример #8
0
def test(opt, estimator, crt_step):
    """ Test the agent's performance.
    """
    test_log = rlog.getLogger(f"{opt.experiment}.test")
    test_log.info("Test agent after %d training steps.", crt_step)
    test_log.reset()  # required for proper timing

    # construct env and policy
    env = wrap_env(gym.make(opt.game), opt)
    if hasattr(opt.estimator, "ensemble"):
        policy = BootstrappedPE(
            estimator,
            env.action_space.n,
            epsilon={
                "name": "constant",
                "start": 0.01
            },
            vote=True,
        )
    else:
        policy = wt.EpsilonGreedyPolicy(
            estimator,
            env.action_space.n,
            epsilon={
                "name": "constant",
                "start": 0.01
            },
        )

    step_cnt = 0
    while step_cnt < opt.test_steps:
        for transition, pi in Episode(env, policy, with_pi=True):
            _, _, reward, _, done = transition
            test_log.put(reward=reward, done=done, frame_no=1, qval=pi.q_value)
            step_cnt += 1
            if opt.test_render:
                env.render()
    env.close()

    # do some logging
    summary = test_log.summarize()
    test_log.info(("[{0:8d}/{ep_cnt:8d}] R/ep={R/ep:6.2f}" +
                   "\n             | steps/ep={steps/ep:6.2f}, " +
                   "fps={test_fps:8.2f}, maxq={max_q:6.2f}.").format(
                       step_cnt, **summary))
    test_log.trace(step=crt_step, **summary)
Пример #9
0
def policy_iteration(env, policy, opt):
    """ Policy improvement routine.

    Args:
        env (gym.env): The game we are training on.
        policy (DQNPolicy): A DQN agent.
        opt (Namespace): Configuration values.
    """

    train_log = rlog.getLogger(f"{opt.experiment}.train")

    while policy.steps < opt.train_steps:
        for _state, _action, reward, state, done in Episode(env, policy):

            # push to memory
            policy.push((_state, _action, reward, state, done))

            # learn
            if policy.steps >= opt.start_learning:
                if policy.steps % opt.update_freq == 0:
                    policy.learn()

                if policy.steps % opt.target_update == 0:
                    policy.policy_improvement.update_target_estimator()

            # log
            train_log.put(reward=reward,
                          done=done,
                          frame_no=opt.er.batch_size,
                          step_no=1)

            if policy.steps % 10_000 == 0:
                summary = train_log.summarize()
                train_log.info(("[{0:8d}/{ep_cnt:8d}] R/ep={R/ep:6.2f}" +
                                "\n             | steps/ep={steps/ep:6.2f}, " +
                                "fps={learning_fps:8.2f}.").format(
                                    policy.steps, **summary))
                train_log.trace(step=policy.steps, **summary)
                train_log.reset()

            if policy.steps % 50_000 == 0 and policy.steps != 0:
                test(opt, deepcopy(policy.estimator), policy.steps)
Пример #10
0
def run(opt):
    """ Entry point of the program. """

    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            ))

    ioutil.create_paths(opt)

    sticky_schedule = OrderedDict([(int(s), float(p))
                                   for (s, p) in opt.sticky_schedule])
    assert 1 in sticky_schedule

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    train_loggers = OrderedDict()
    for i, epoch in enumerate(sticky_schedule.keys()):
        train_loggers[epoch] = train_log = rlog.getLogger(
            f"{opt.experiment}.{i:d}")
        train_log.addMetrics(
            rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
            rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
            rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
            rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
            rlog.ValueMetric("trn_sticky_action_prob",
                             metargs=["trn_sticky_action_prob"]),
            rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
            rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
            rlog.SumMetric("val_ep_cnt", metargs=["done"]),
            rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
            rlog.FPSMetric("val_fps", metargs=["val_frames"]),
            rlog.ValueMetric("val_sticky_action_prob",
                             metargs=["val_sticky_action_prob"]),
        )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement,
          policy_evaluation) = experiment_factory(opt)

    rlog.info("\n\n{}\n\n{}\n\n{}".format(env, replay,
                                          policy_evaluation.estimator))
    rlog.info("\n\n{}\n\n{}".format(policy_improvement, policy_evaluation))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # if we loaded a checkpoint
    if Path(opt.out_dir).joinpath("replay.gz").is_file():

        # sometimes the experiment is intrerupted while saving the replay
        # buffer and it gets corrupted. Therefore we attempt restoring
        # from the previous checkpoint and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.estimator,
                                          ckpt["estimator_state"])
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(policy_evaluation.target_estimator,
                                          ckpt["target_estimator_state"])
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"])
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")
        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1
        # add some hardware and git info, log and save
        opt = ioutil.add_platform_info(opt)

    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training

    last_state = None  # used by train_one_epoch to know how to resume episode.
    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        last_sched_epoch = max(ep for ep in sticky_schedule if ep <= epoch)
        print(f"StickyActProb goes from {env.sticky_action_prob}"
              f" to {sticky_schedule[last_sched_epoch]}")
        env.sticky_action_prob = sticky_schedule[last_sched_epoch]
        crt_logger = train_loggers[last_sched_epoch]

        # train for 250,000 steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            crt_logger,
            total_steps=steps,
            last_state=last_state,
        )
        crt_logger.put(trn_sticky_action_prob=env.sticky_action_prob)
        crt_logger.traceAndLog(epoch * opt.train_step_cnt)

        # validate for 125,000 steps
        for sched_epoch, eval_logger in train_loggers.items():
            eval_env = get_env(  # this doesn't work, fute-m-aș în ele de wrappere
                opt,
                mode="testing",
                sticky_action_prob=sticky_schedule[sched_epoch])
            eval_env.sticky_action_prob = sticky_schedule[sched_epoch]
            print(
                f"Evaluating on the env with sticky={eval_env.sticky_action_prob}."
            )
            validate(
                AGENTS[opt.agent.name]["policy_improvement"](
                    policy_improvement.estimator,
                    opt.action_cnt,
                    epsilon=opt.val_epsilon,
                ),
                eval_env,
                opt.valid_step_cnt,
                eval_logger,
            )
            eval_logger.put(
                val_sticky_action_prob=eval_env.sticky_action_prob, )
            eval_logger.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )
Пример #11
0
def main():
    # get the root logger, preconfigured to log to the console,
    # to a text file, a pickle and a tensorboard protobuf.
    experiment_path = get_experiment_path()
    rlog.init("dqn", path=experiment_path, tensorboard=True)
    rlog.info("Logging application level stuff.")
    rlog.info("Log artifacts will be saved in %s", experiment_path)

    # create a new logger that will log training events
    train_log = rlog.getLogger("dqn.train")
    train_log.info("Starting training... ")

    for step in range(5):
        # probably not a good idea to call this every step if it is a hot loop?
        # also this will not be logged to the console or to the text file
        # since the default log-level for these two is INFO.
        train_log.trace(step=step, aux_loss=7.23 - step)

    # but we can register metrics that will accumulate traced events
    # and summarize them. Each Metric accepts a name and some metargs
    # that tells it which arguments received by the `put` call bellow
    # to accumulate and summarize.
    train_log.addMetrics(
        # counts each time it receives a `done=True`, aka counts episodes
        rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]),
        # sums up all the `reward=value` it receives and divides it
        # by the number of `done=True`, aka mean reward per episode
        rlog.AvgMetric("R_per_ep", metargs=["reward", "done"]),
        # same but keeps a running average instead (experimental).
        rlog.AvgMetric("RunR", eps=0.9, metargs=["reward", "done"]),
        # same as above but now we divide by the number of rewards
        rlog.AvgMetric("R_per_step", metargs=["reward", 1]),
        # same but with clipped rewards (to +- 1)
        rlog.AvgMetric("rw_per_ep", metargs=["clip(reward)", "done"]),
        # computes the no of frames per second
        rlog.FPSMetric("train_fps", metargs=["frame_no"]),
        # caches all the values it receives and inserts them into a
        # tensorboad.summary.histogram every time you call `log.trace`
        rlog.ValueMetric("gaussians", metargs=["sample"], tb_type="histogram"),
    )

    mean = 0
    for step in range(1, 300_001):

        # make a step in the "environment"
        reward, done = reward_following_policy(step)

        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        train_log.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = train_log.summarize()
            train_log.trace(step=step, **summary)
            train_log.info(
                "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f}  |  rw/ep{R_per_ep:8.2f}."
                .format(step, **summary))
            train_log.reset()
            mean += 1
Пример #12
0
        # sample from a gaussian for showcasing the histogram
        sample = random.gauss(mean, 0.1)

        # simply trace all the values you passed as `metargs` above.
        # the logger will know how to dispatch each argument.
        train_log.put(reward=reward, done=done, frame_no=1, sample=sample)

        if step % 10_000 == 0:
            # this is the call that dumps everything to the logger.
            summary = train_log.summarize()
            train_log.trace(step=step, **summary)
            train_log.info(
                "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f}  |  rw/ep{R_per_ep:8.2f}."
                .format(step, **summary))
            train_log.reset()
            mean += 1

    train_log.trace("But we can continue tracing stuff manually...")
    # inlcuding structured stuff as long as we provide a `step` keyarg
    train_log.trace(step=step, aux_loss=0.23)

    # We can also configure an evaluation logger.
    eval_log = rlog.getLogger("dqn.eval")
    eval_log.info("Starting evaluation... ")

    rlog.info("Run `tensorboard --logdir sota_results` to see the results.")


if __name__ == "__main__":
    main()