def configure_logger(opt): """ Configures the logger. """ rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True) train_log = rlog.getLogger(opt.experiment + ".train") train_log.addMetrics( rlog.AvgMetric("R_ep", metargs=["reward", "done"]), rlog.AvgMetric("V_step", metargs=["value", 1]), rlog.AvgMetric("v_mse_loss", metargs=["v_mse", 1]), rlog.AvgMetric("v_hub_loss", metargs=["v_hub", 1]), rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]), rlog.AvgMetric("steps_ep", metargs=["step_no", "done"]), rlog.FPSMetric("fps", metargs=["frame_no"]), ) train_log.log_fmt = ( "[{0:6d}/{ep_cnt:5d}] R/ep={R_ep:8.2f}, V/step={V_step:8.2f}" + " | steps/ep={steps_ep:8.2f}, fps={fps:8.2f}.") val_log = rlog.getLogger(opt.experiment + ".valid") val_log.addMetrics( rlog.AvgMetric("R_ep", metargs=["reward", "done"]), rlog.AvgMetric("RR_ep", resetable=False, eps=0.8, metargs=["reward", "done"]), rlog.AvgMetric("V_step", metargs=["value", 1]), rlog.AvgMetric("steps_ep", metargs=["frame_no", "done"]), rlog.FPSMetric("fps", metargs=["frame_no"]), ) if hasattr(opt.log, "detailed") and opt.log.detailed: val_log.addMetrics( rlog.ValueMetric("Vhist", metargs=["value"], tb_type="histogram")) val_log.log_fmt = ( "@{0:6d} R/ep={R_ep:8.2f}, RunR/ep={RR_ep:8.2f}" + " | steps/ep={steps_ep:8.2f}, fps={fps:8.2f}.")
def set_logger(opt): """ Configure logger. """ rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True) trn_log = rlog.getLogger(opt.experiment + ".train") val_log = rlog.getLogger(opt.experiment + ".valid") trn_log.fmt = "[{:03d}][TRN] acc={:5.2f}% loss={:5.2f}" val_log.fmt = "[{:03d}][VAL] acc={:5.2f}% loss={:5.2f}" # add histogram support if hasattr(opt, "log") and opt.log.detailed: mdl_log = rlog.getLogger(opt.experiment + ".model") mdl_log.addMetrics( rlog.ValueMetric("std", metargs=["std"], tb_type="histogram"), rlog.ValueMetric("mu", metargs=["mu"], tb_type="histogram"), ) return trn_log, val_log
def configure_logger(opt): rlog.init(opt.experiment, path=opt.out_dir) train_log = rlog.getLogger(opt.experiment + ".train") train_log.addMetrics([ rlog.AvgMetric("R/ep", metargs=["reward", "done"]), rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]), rlog.AvgMetric("steps/ep", metargs=["step_no", "done"]), rlog.FPSMetric("learning_fps", metargs=["frame_no"]), ]) test_log = rlog.getLogger(opt.experiment + ".test") test_log.addMetrics([ rlog.AvgMetric("R/ep", metargs=["reward", "done"]), rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]), rlog.AvgMetric("steps/ep", metargs=["frame_no", "done"]), rlog.FPSMetric("test_fps", metargs=["frame_no"]), rlog.MaxMetric("max_q", metargs=["qval"]), ])
def model_stats(opt, epoch, model): """ Log additional model stats. """ log = rlog.getLogger(opt.experiment + ".model") if hasattr(opt, "log") and opt.log.detailed: # log histogram also assert isinstance( model, SVIModel), "This stat only makes sense for SVI models." for mu, std in zip(model.mu(), model.std()): log.put(mu=mu, std=std) log.trace(step=epoch, **model.summarize()) log.reset()
def main(cmdl): """ Entry point. """ opt = read_config(Path(cmdl.experiment_path) / "cfg.yaml") chkpt_paths = sorted( Path(cmdl.experiment_path).glob("*.pth"), key=lambda path: int(path.stem.split("_")[2]), ) chkpt_paths = [(int(path.stem.split("_")[2]), path) for path in chkpt_paths] print(config_to_string(cmdl)) print(config_to_string(opt)) if cmdl.build_val_dset: perf = [(torch.load(path)["R/ep"], path) for _, path in chkpt_paths] best_score, path = max(perf, key=lambda x: x[0]) print(f"Loading {path} with total return {best_score}.") env, policy = configure_eval(cmdl, opt, path) achlioptas = _get_achlioptas(8, 4) val_dset = build_validation_dset( env, policy, opt.gamma, partial(_hash, decimals=cmdl.decimals, rnd_proj=achlioptas), ) val_dset["meta"]["agent"] = path val_dset["meta"]["decimals"] = cmdl.decimals val_dset["meta"]["rnd_proj"] = achlioptas for k, v in val_dset["meta"].items(): print(f"{k:12}", v) torch.save(val_dset, f"./val_dsets/{env.spec.id}.pkl") elif cmdl.offline_validation: rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True) log = rlog.getLogger(opt.experiment + ".off_valid") log.addMetrics([ rlog.AvgMetric("V_step", metargs=["value", 1]), rlog.AvgMetric("off_mse", metargs=["off_mse", 1]), ]) log.info("Loading dataset...") dset = torch.load(f"./val_dsets/{opt.env_name}.pkl") for step, path in chkpt_paths: env, policy = configure_eval(cmdl, opt, path) offline_validation(step, policy, dset, opt) else: for step, path in chkpt_paths: env, policy = configure_eval(cmdl, opt, path) avg_return = greedy_validation(env, policy, opt.gamma) print("[{0:8d}] R/ep={1:8.2f}.".format(step, avg_return))
def make_rlog(opt): """ Configure logger. """ rlog.init("pff", path=opt.path, tensorboard=True) train_log = rlog.getLogger("pff.train") train_log.fmt = ( "[{gen:03d}/{batch:04d}] acc={acc:2.2f}% | bestFit={bestFit:2.3f}" + ", unFit={unFit:2.3f} [μ={attnMean:2.3f}/σ={attnVar:2.3f}]" ) if opt.model == "baseline": train_log.fmt = "[{batch:04d}] acc={acc:2.2f}%, loss={loss:2.3f}" msg = "Configuration:\n" for k, v in vars(opt).items(): msg += f" {k:16}: {v}\n" rlog.info(msg) return train_log
def offline_validation(crt_step, policy, dset, opt): log = rlog.getLogger(opt.experiment + ".off_valid") log.info(f"Validating chkp@{crt_step:08d} on offline data...") # Iterate through the data for step, (state, stats) in enumerate(dset["dset"].items()): with torch.no_grad(): pi = policy(state) loss = F.mse_loss(pi.value, torch.tensor([[stats["Gt"]]])) log.put(value=pi.value.squeeze().item(), off_mse=loss.squeeze().item()) if step % 100_000 == 0 and step != 0: fstr = "partial@ {0:8d} V/step={V_step:8.2f}, off_mse/ep={off_mse:8.2f}" log.info(fstr.format(step, **log.summarize()))
def test(opt, estimator, crt_step): """ Test the agent's performance. """ test_log = rlog.getLogger(f"{opt.experiment}.test") test_log.info("Test agent after %d training steps.", crt_step) test_log.reset() # required for proper timing # construct env and policy env = wrap_env(gym.make(opt.game), opt) if hasattr(opt.estimator, "ensemble"): policy = BootstrappedPE( estimator, env.action_space.n, epsilon={ "name": "constant", "start": 0.01 }, vote=True, ) else: policy = wt.EpsilonGreedyPolicy( estimator, env.action_space.n, epsilon={ "name": "constant", "start": 0.01 }, ) step_cnt = 0 while step_cnt < opt.test_steps: for transition, pi in Episode(env, policy, with_pi=True): _, _, reward, _, done = transition test_log.put(reward=reward, done=done, frame_no=1, qval=pi.q_value) step_cnt += 1 if opt.test_render: env.render() env.close() # do some logging summary = test_log.summarize() test_log.info(("[{0:8d}/{ep_cnt:8d}] R/ep={R/ep:6.2f}" + "\n | steps/ep={steps/ep:6.2f}, " + "fps={test_fps:8.2f}, maxq={max_q:6.2f}.").format( step_cnt, **summary)) test_log.trace(step=crt_step, **summary)
def policy_iteration(env, policy, opt): """ Policy improvement routine. Args: env (gym.env): The game we are training on. policy (DQNPolicy): A DQN agent. opt (Namespace): Configuration values. """ train_log = rlog.getLogger(f"{opt.experiment}.train") while policy.steps < opt.train_steps: for _state, _action, reward, state, done in Episode(env, policy): # push to memory policy.push((_state, _action, reward, state, done)) # learn if policy.steps >= opt.start_learning: if policy.steps % opt.update_freq == 0: policy.learn() if policy.steps % opt.target_update == 0: policy.policy_improvement.update_target_estimator() # log train_log.put(reward=reward, done=done, frame_no=opt.er.batch_size, step_no=1) if policy.steps % 10_000 == 0: summary = train_log.summarize() train_log.info(("[{0:8d}/{ep_cnt:8d}] R/ep={R/ep:6.2f}" + "\n | steps/ep={steps/ep:6.2f}, " + "fps={learning_fps:8.2f}.").format( policy.steps, **summary)) train_log.trace(step=policy.steps, **summary) train_log.reset() if policy.steps % 50_000 == 0 and policy.steps != 0: test(opt, deepcopy(policy.estimator), policy.steps)
def run(opt): """ Entry point of the program. """ if __debug__: print( clr( "Code might have assertions. Use -O in liftoff when running stuff.", color="red", attrs=["bold"], )) ioutil.create_paths(opt) sticky_schedule = OrderedDict([(int(s), float(p)) for (s, p) in opt.sticky_schedule]) assert 1 in sticky_schedule rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True) train_loggers = OrderedDict() for i, epoch in enumerate(sticky_schedule.keys()): train_loggers[epoch] = train_log = rlog.getLogger( f"{opt.experiment}.{i:d}") train_log.addMetrics( rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]), rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]), rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]), rlog.FPSMetric("trn_tps", metargs=["trn_steps"]), rlog.ValueMetric("trn_sticky_action_prob", metargs=["trn_sticky_action_prob"]), rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]), rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]), rlog.SumMetric("val_ep_cnt", metargs=["done"]), rlog.AvgMetric("val_avg_step", metargs=[1, "done"]), rlog.FPSMetric("val_fps", metargs=["val_frames"]), rlog.ValueMetric("val_sticky_action_prob", metargs=["val_sticky_action_prob"]), ) # Initialize the objects we will use during training. env, (replay, policy_improvement, policy_evaluation) = experiment_factory(opt) rlog.info("\n\n{}\n\n{}\n\n{}".format(env, replay, policy_evaluation.estimator)) rlog.info("\n\n{}\n\n{}".format(policy_improvement, policy_evaluation)) if opt.estimator.args.get("spectral", None) is not None: for k in policy_evaluation.estimator.get_spectral_norms().keys(): # k = f"min{str(k)[1:]}" rlog.addMetrics(rlog.ValueMetric(k, metargs=[k])) # if we loaded a checkpoint if Path(opt.out_dir).joinpath("replay.gz").is_file(): # sometimes the experiment is intrerupted while saving the replay # buffer and it gets corrupted. Therefore we attempt restoring # from the previous checkpoint and replay. try: idx = replay.load(Path(opt.out_dir) / "replay.gz") ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx) rlog.info(f"Loaded most recent replay (step {idx}).") except: gc.collect() rlog.info("Last replay gzip is faulty.") idx = replay.load(Path(opt.out_dir) / "prev_replay.gz") ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx) rlog.info(f"Loading a previous snapshot (step {idx}).") # load state dicts # load state dicts ioutil.special_conv_uv_buffer_fix(policy_evaluation.estimator, ckpt["estimator_state"]) policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"]) ioutil.special_conv_uv_buffer_fix(policy_evaluation.target_estimator, ckpt["target_estimator_state"]) policy_evaluation.target_estimator.load_state_dict( ckpt["target_estimator_state"]) policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"]) last_epsilon = None for _ in range(ckpt["step"]): last_epsilon = next(policy_improvement.epsilon) rlog.info(f"Last epsilon: {last_epsilon}.") # some counters last_epoch = ckpt["step"] // opt.train_step_cnt rlog.info(f"Resuming from epoch {last_epoch}.") start_epoch = last_epoch + 1 steps = ckpt["step"] else: steps = 0 start_epoch = 1 # add some hardware and git info, log and save opt = ioutil.add_platform_info(opt) rlog.info("\n" + ioutil.config_to_string(opt)) ioutil.save_config(opt, opt.out_dir) # Start training last_state = None # used by train_one_epoch to know how to resume episode. for epoch in range(start_epoch, opt.epoch_cnt + 1): last_sched_epoch = max(ep for ep in sticky_schedule if ep <= epoch) print(f"StickyActProb goes from {env.sticky_action_prob}" f" to {sticky_schedule[last_sched_epoch]}") env.sticky_action_prob = sticky_schedule[last_sched_epoch] crt_logger = train_loggers[last_sched_epoch] # train for 250,000 steps steps, last_state = train_one_epoch( env, (replay, policy_improvement, policy_evaluation), opt.train_step_cnt, opt.update_freq, opt.target_update_freq, opt, crt_logger, total_steps=steps, last_state=last_state, ) crt_logger.put(trn_sticky_action_prob=env.sticky_action_prob) crt_logger.traceAndLog(epoch * opt.train_step_cnt) # validate for 125,000 steps for sched_epoch, eval_logger in train_loggers.items(): eval_env = get_env( # this doesn't work, fute-m-aș în ele de wrappere opt, mode="testing", sticky_action_prob=sticky_schedule[sched_epoch]) eval_env.sticky_action_prob = sticky_schedule[sched_epoch] print( f"Evaluating on the env with sticky={eval_env.sticky_action_prob}." ) validate( AGENTS[opt.agent.name]["policy_improvement"]( policy_improvement.estimator, opt.action_cnt, epsilon=opt.val_epsilon, ), eval_env, opt.valid_step_cnt, eval_logger, ) eval_logger.put( val_sticky_action_prob=eval_env.sticky_action_prob, ) eval_logger.traceAndLog(epoch * opt.train_step_cnt) # save the checkpoint if opt.agent.save: ioutil.checkpoint_agent( opt.out_dir, steps, estimator=policy_evaluation.estimator, target_estimator=policy_evaluation.target_estimator, optim=policy_evaluation.optimizer, cfg=opt, replay=replay, save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt), )
def main(): # get the root logger, preconfigured to log to the console, # to a text file, a pickle and a tensorboard protobuf. experiment_path = get_experiment_path() rlog.init("dqn", path=experiment_path, tensorboard=True) rlog.info("Logging application level stuff.") rlog.info("Log artifacts will be saved in %s", experiment_path) # create a new logger that will log training events train_log = rlog.getLogger("dqn.train") train_log.info("Starting training... ") for step in range(5): # probably not a good idea to call this every step if it is a hot loop? # also this will not be logged to the console or to the text file # since the default log-level for these two is INFO. train_log.trace(step=step, aux_loss=7.23 - step) # but we can register metrics that will accumulate traced events # and summarize them. Each Metric accepts a name and some metargs # that tells it which arguments received by the `put` call bellow # to accumulate and summarize. train_log.addMetrics( # counts each time it receives a `done=True`, aka counts episodes rlog.SumMetric("ep_cnt", resetable=False, metargs=["done"]), # sums up all the `reward=value` it receives and divides it # by the number of `done=True`, aka mean reward per episode rlog.AvgMetric("R_per_ep", metargs=["reward", "done"]), # same but keeps a running average instead (experimental). rlog.AvgMetric("RunR", eps=0.9, metargs=["reward", "done"]), # same as above but now we divide by the number of rewards rlog.AvgMetric("R_per_step", metargs=["reward", 1]), # same but with clipped rewards (to +- 1) rlog.AvgMetric("rw_per_ep", metargs=["clip(reward)", "done"]), # computes the no of frames per second rlog.FPSMetric("train_fps", metargs=["frame_no"]), # caches all the values it receives and inserts them into a # tensorboad.summary.histogram every time you call `log.trace` rlog.ValueMetric("gaussians", metargs=["sample"], tb_type="histogram"), ) mean = 0 for step in range(1, 300_001): # make a step in the "environment" reward, done = reward_following_policy(step) # sample from a gaussian for showcasing the histogram sample = random.gauss(mean, 0.1) # simply trace all the values you passed as `metargs` above. # the logger will know how to dispatch each argument. train_log.put(reward=reward, done=done, frame_no=1, sample=sample) if step % 10_000 == 0: # this is the call that dumps everything to the logger. summary = train_log.summarize() train_log.trace(step=step, **summary) train_log.info( "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f} | rw/ep{R_per_ep:8.2f}." .format(step, **summary)) train_log.reset() mean += 1
# sample from a gaussian for showcasing the histogram sample = random.gauss(mean, 0.1) # simply trace all the values you passed as `metargs` above. # the logger will know how to dispatch each argument. train_log.put(reward=reward, done=done, frame_no=1, sample=sample) if step % 10_000 == 0: # this is the call that dumps everything to the logger. summary = train_log.summarize() train_log.trace(step=step, **summary) train_log.info( "{0:6d}, ep {ep_cnt:3d}, RunR/ep{RunR:8.2f} | rw/ep{R_per_ep:8.2f}." .format(step, **summary)) train_log.reset() mean += 1 train_log.trace("But we can continue tracing stuff manually...") # inlcuding structured stuff as long as we provide a `step` keyarg train_log.trace(step=step, aux_loss=0.23) # We can also configure an evaluation logger. eval_log = rlog.getLogger("dqn.eval") eval_log.info("Starting evaluation... ") rlog.info("Run `tensorboard --logdir sota_results` to see the results.") if __name__ == "__main__": main()