def main(cmdl):
    base_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "default.yaml"))
    search_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "search.yaml"))

    print(config_to_string(cmdl))
    print(config_to_string(dict_to_namespace(search_cfg)))

    # the search space
    good_init, search_space = get_search_space(search_cfg)

    search_name = "{timestep}_tune_{experiment_name}{dev}".format(
        timestep="{:%Y%b%d-%H%M%S}".format(datetime.now()),
        experiment_name=base_cfg["experiment"],
        dev="_dev" if cmdl.dev else "",
    )

    # search algorithm
    hyperopt_search = HyperOptSearch(
        search_space,
        metric="criterion",
        mode="max",
        max_concurrent=cmdl.workers,
        points_to_evaluate=good_init,
    )

    # early stopping
    scheduler = ASHAScheduler(
        time_attr="train_step",
        metric="criterion",
        mode="max",
        max_t=base_cfg["training_steps"],  # maximum length of a trial
        grace_period=cmdl.grace_steps,  # don't stop a trial before this many logged steps
        brackets=3,  # number of ASHA brackets, each with its own halving rate
    )

    analysis = tune.run(
        lambda x: tune_trial(x, base_cfg=base_cfg, get_objective=None),
        name=search_name,
        # config=search_space,
        search_alg=hyperopt_search,
        scheduler=scheduler,
        local_dir="./results",
        num_samples=cmdl.trials,
        trial_name_creator=trial2string,
        resources_per_trial={"cpu": 3},
    )

    dfs = analysis.trial_dataframes
    for key, df in dfs.items():
        print("saving:", key)
        df.to_pickle(f"{key}/trial_df.pkl")
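# --- Hedged sketch, not part of the original file --------------------------------
# `tune_trial` is referenced above but not defined here. The sketch below is an
# assumption about its contract: whatever it does internally, it has to report a
# `criterion` metric and a `train_step` counter, because HyperOptSearch and
# ASHAScheduler are both configured to key on them. The dummy objective and the
# `lr` key are placeholders; `tune.report` is the functional-API reporting call in
# the Ray versions this script appears to target.
from ray import tune


def tune_trial(search_cfg, base_cfg=None, get_objective=None):
    cfg = {**(base_cfg or {}), **search_cfg}  # sampled hyperparameters override defaults
    for step in range(cfg["training_steps"]):
        # placeholder objective; the real trial would run training here
        criterion = float(step) * cfg.get("lr", 1e-3)
        # ASHA reads `train_step` (its time_attr) and `criterion` (its metric)
        tune.report(train_step=step, criterion=criterion)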
def main(cmdl):
    """ Entry point. """
    opt = read_config(Path(cmdl.experiment_path) / "cfg.yaml")
    chkpt_paths = sorted(
        Path(cmdl.experiment_path).glob("*.pth"),
        key=lambda path: int(path.stem.split("_")[2]),
    )
    chkpt_paths = [(int(path.stem.split("_")[2]), path) for path in chkpt_paths]

    print(config_to_string(cmdl))
    print(config_to_string(opt))

    if cmdl.build_val_dset:
        perf = [(torch.load(path)["R/ep"], path) for _, path in chkpt_paths]
        best_score, path = max(perf, key=lambda x: x[0])
        print(f"Loading {path} with total return {best_score}.")
        env, policy = configure_eval(cmdl, opt, path)
        achlioptas = _get_achlioptas(8, 4)
        val_dset = build_validation_dset(
            env,
            policy,
            opt.gamma,
            partial(_hash, decimals=cmdl.decimals, rnd_proj=achlioptas),
        )
        val_dset["meta"]["agent"] = path
        val_dset["meta"]["decimals"] = cmdl.decimals
        val_dset["meta"]["rnd_proj"] = achlioptas
        for k, v in val_dset["meta"].items():
            print(f"{k:12}", v)
        torch.save(val_dset, f"./val_dsets/{env.spec.id}.pkl")
    elif cmdl.offline_validation:
        rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
        log = rlog.getLogger(opt.experiment + ".off_valid")
        log.addMetrics([
            rlog.AvgMetric("V_step", metargs=["value", 1]),
            rlog.AvgMetric("off_mse", metargs=["off_mse", 1]),
        ])
        log.info("Loading dataset...")
        dset = torch.load(f"./val_dsets/{opt.env_name}.pkl")
        for step, path in chkpt_paths:
            env, policy = configure_eval(cmdl, opt, path)
            offline_validation(step, policy, dset, opt)
    else:
        for step, path in chkpt_paths:
            env, policy = configure_eval(cmdl, opt, path)
            avg_return = greedy_validation(env, policy, opt.gamma)
            print("[{0:8d}] R/ep={1:8.2f}.".format(step, avg_return))
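# --- Hedged sketch, not part of the original file --------------------------------
# `greedy_validation` is called in the fallback branch above but not defined here.
# A minimal sketch, assuming the old gym step API and that `policy` maps an
# observation to a tensor of action values; the real code also receives `gamma`,
# so it may compute discounted rather than plain episodic returns.
def greedy_validation(env, policy, gamma, episodes=10):
    returns = []
    for _ in range(episodes):
        obs, done, ep_return = env.reset(), False, 0.0
        while not done:
            with torch.no_grad():
                action = policy(obs).argmax().item()  # assumed policy interface
            obs, reward, done, _ = env.step(action)
            ep_return += reward
        returns.append(ep_return)
    return sum(returns) / len(returns)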
def run(opt):
    """ Entry Point. """
    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    opt = game_settings_(opt)
    env, agent = experiment_factory(opt)

    rlog.info(ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    steps = 0
    for ep in range(1, opt.env.episodes + 1):
        steps = train_one_ep(env, agent, steps, opt.update_freq, opt.target_update_freq)

        if ep % opt.valid_freq == 0:
            rlog.traceAndLog(ep)
            validate(env, agent, opt.valid_episodes)
            rlog.traceAndLog(ep)
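# --- Hedged sketch, not part of the original file --------------------------------
# `train_one_ep` is not shown in this file. A minimal sketch of what it could look
# like, assuming a DQN-style agent with `act`, `store`, `learn` and `sync_target`
# methods (all hypothetical names) and that rlog exposes `put` at module level,
# matching the metargs declared in `run` above.
def train_one_ep(env, agent, steps, update_freq, target_update_freq):
    obs, done = env.reset(), False
    while not done:
        action = agent.act(obs)                           # hypothetical agent API
        obs_next, reward, done, _ = env.step(action)
        agent.store(obs, action, reward, obs_next, done)  # hypothetical replay interface
        steps += 1
        if steps % update_freq == 0:
            loss = agent.learn()                          # hypothetical: one gradient step
            rlog.put(trn_loss=loss, lrn_steps=1)
        if steps % target_update_freq == 0:
            agent.sync_target()                           # hypothetical target-network sync
        rlog.put(trn_reward=reward, trn_done=done)
        obs = obs_next
    return steps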
def run(opt):
    """ Entry point of the program. """
    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            )
        )

    ioutil.create_paths(opt)

    sticky_schedule = OrderedDict(
        [(int(s), float(p)) for (s, p) in opt.sticky_schedule]
    )
    assert 1 in sticky_schedule

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True)
    train_loggers = OrderedDict()
    for i, epoch in enumerate(sticky_schedule.keys()):
        train_loggers[epoch] = train_log = rlog.getLogger(f"{opt.experiment}.{i:d}")
        train_log.addMetrics(
            rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
            rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
            rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
            rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
            rlog.ValueMetric(
                "trn_sticky_action_prob", metargs=["trn_sticky_action_prob"]
            ),
            rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
            rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
            rlog.SumMetric("val_ep_cnt", metargs=["done"]),
            rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
            rlog.FPSMetric("val_fps", metargs=["val_frames"]),
            rlog.ValueMetric(
                "val_sticky_action_prob", metargs=["val_sticky_action_prob"]
            ),
        )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement, policy_evaluation) = experiment_factory(opt)

    rlog.info("\n\n{}\n\n{}\n\n{}".format(env, replay, policy_evaluation.estimator))
    rlog.info("\n\n{}\n\n{}".format(policy_improvement, policy_evaluation))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # If a replay checkpoint exists, resume from it.
    if Path(opt.out_dir).joinpath("replay.gz").is_file():
        # Sometimes the experiment is interrupted while saving the replay buffer
        # and the file gets corrupted, so we fall back to the previous checkpoint
        # and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.estimator, ckpt["estimator_state"]
        )
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.target_estimator, ckpt["target_estimator_state"]
        )
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"]
        )
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        # advance the epsilon schedule to where the checkpoint left off
        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")

        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1

    # add some hardware and git info, log and save
    opt = ioutil.add_platform_info(opt)
    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training
    last_state = None  # used by train_one_epoch to know how to resume the episode

    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        last_sched_epoch = max(ep for ep in sticky_schedule if ep <= epoch)
        print(
            f"StickyActProb goes from {env.sticky_action_prob}"
            f" to {sticky_schedule[last_sched_epoch]}"
        )
        env.sticky_action_prob = sticky_schedule[last_sched_epoch]
        crt_logger = train_loggers[last_sched_epoch]

        # train for 250,000 steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            crt_logger,
            total_steps=steps,
            last_state=last_state,
        )
        crt_logger.put(trn_sticky_action_prob=env.sticky_action_prob)
        crt_logger.traceAndLog(epoch * opt.train_step_cnt)

        # validate for 125,000 steps
        for sched_epoch, eval_logger in train_loggers.items():
            eval_env = get_env(
                # passing the probability here doesn't work because of the
                # wrappers, hence the explicit assignment below
                opt,
                mode="testing",
                sticky_action_prob=sticky_schedule[sched_epoch],
            )
            eval_env.sticky_action_prob = sticky_schedule[sched_epoch]
            print(f"Evaluating on the env with sticky={eval_env.sticky_action_prob}.")
            validate(
                AGENTS[opt.agent.name]["policy_improvement"](
                    policy_improvement.estimator,
                    opt.action_cnt,
                    epsilon=opt.val_epsilon,
                ),
                eval_env,
                opt.valid_step_cnt,
                eval_logger,
            )
            eval_logger.put(val_sticky_action_prob=eval_env.sticky_action_prob)
            eval_logger.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )
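# --- Hedged sketch, not part of the original file --------------------------------
# `train_one_epoch` is defined elsewhere in the repo. The sketch below only
# illustrates the contract implied by the call above: interact for
# `train_step_cnt` steps, resume from `last_state` when the previous epoch ended
# mid-episode, and return the updated step counter together with the state to
# resume from. The `act`, `push`, `sample`, `update` and `update_target_estimator`
# names are assumptions.
def train_one_epoch(env, learner, train_step_cnt, update_freq, target_update_freq,
                    opt, logger, total_steps=0, last_state=None):
    replay, policy_improvement, policy_evaluation = learner
    state = env.reset() if last_state is None else last_state
    for _ in range(train_step_cnt):
        total_steps += 1
        action = policy_improvement.act(state)              # hypothetical API
        state_, reward, done, _ = env.step(action)
        replay.push((state, action, reward, state_, done))  # hypothetical replay API
        logger.put(trn_reward=reward, trn_done=done, trn_steps=1)
        if total_steps % update_freq == 0:
            loss = policy_evaluation.update(replay.sample())  # hypothetical TD update
            logger.put(trn_loss=loss, lrn_steps=1)
        if total_steps % target_update_freq == 0:
            policy_evaluation.update_target_estimator()     # hypothetical target sync
        state = env.reset() if done else state_
    return total_steps, state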
def run(opt):
    """ Entry point of the program. """
    if __debug__:
        print(
            clr(
                "Code might have assertions. Use -O in liftoff when running stuff.",
                color="red",
                attrs=["bold"],
            )
        )

    ioutil.create_paths(opt)

    rlog.init(opt.experiment, path=opt.out_dir, tensorboard=True, relative_time=True)
    rlog.addMetrics(
        rlog.AvgMetric("trn_R_ep", metargs=["trn_reward", "trn_done"]),
        rlog.SumMetric("trn_ep_cnt", metargs=["trn_done"]),
        rlog.AvgMetric("trn_loss", metargs=["trn_loss", 1]),
        rlog.FPSMetric("trn_tps", metargs=["trn_steps"]),
        rlog.FPSMetric("lrn_tps", metargs=["lrn_steps"]),
        rlog.AvgMetric("val_R_ep", metargs=["reward", "done"]),
        rlog.SumMetric("val_ep_cnt", metargs=["done"]),
        rlog.AvgMetric("val_avg_step", metargs=[1, "done"]),
        rlog.FPSMetric("val_fps", metargs=["val_frames"]),
    )

    # Initialize the objects we will use during training.
    env, (replay, policy_improvement, policy_evaluation) = experiment_factory(opt)

    guts = [
        env,
        replay,
        policy_evaluation.estimator,
        policy_evaluation.optimizer,
        policy_improvement,
        policy_evaluation,
    ]
    rlog.info(("\n\n{}" * len(guts)).format(*guts))

    if opt.estimator.args.get("spectral", None) is not None:
        for k in policy_evaluation.estimator.get_spectral_norms().keys():
            # k = f"min{str(k)[1:]}"
            rlog.addMetrics(rlog.ValueMetric(k, metargs=[k]))

    # If a replay checkpoint exists, resume from it.
    if Path(opt.out_dir).joinpath("replay.gz").is_file():
        # Sometimes the experiment is interrupted while saving the replay buffer
        # and the file gets corrupted, so we fall back to the previous checkpoint
        # and replay.
        try:
            idx = replay.load(Path(opt.out_dir) / "replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loaded most recent replay (step {idx}).")
        except Exception:
            gc.collect()
            rlog.info("Last replay gzip is faulty.")
            idx = replay.load(Path(opt.out_dir) / "prev_replay.gz")
            ckpt = ioutil.load_checkpoint(opt.out_dir, idx=idx)
            rlog.info(f"Loading a previous snapshot (step {idx}).")

        # load state dicts
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.estimator, ckpt["estimator_state"]
        )
        policy_evaluation.estimator.load_state_dict(ckpt["estimator_state"])
        ioutil.special_conv_uv_buffer_fix(
            policy_evaluation.target_estimator, ckpt["target_estimator_state"]
        )
        policy_evaluation.target_estimator.load_state_dict(
            ckpt["target_estimator_state"]
        )
        policy_evaluation.optimizer.load_state_dict(ckpt["optim_state"])

        # advance the epsilon schedule to where the checkpoint left off
        last_epsilon = None
        for _ in range(ckpt["step"]):
            last_epsilon = next(policy_improvement.epsilon)
        rlog.info(f"Last epsilon: {last_epsilon}.")

        # some counters
        last_epoch = ckpt["step"] // opt.train_step_cnt
        rlog.info(f"Resuming from epoch {last_epoch}.")
        start_epoch = last_epoch + 1
        steps = ckpt["step"]
    else:
        steps = 0
        start_epoch = 1

    # add some hardware and git info, log and save
    opt = ioutil.add_platform_info(opt)
    rlog.info("\n" + ioutil.config_to_string(opt))
    ioutil.save_config(opt, opt.out_dir)

    # Start training
    last_state = None  # used by train_one_epoch to know how to resume the episode

    for epoch in range(start_epoch, opt.epoch_cnt + 1):
        # train for 250,000 steps
        steps, last_state = train_one_epoch(
            env,
            (replay, policy_improvement, policy_evaluation),
            opt.train_step_cnt,
            opt.update_freq,
            opt.target_update_freq,
            opt,
            rlog.getRootLogger(),
            total_steps=steps,
            last_state=last_state,
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # validate for 125,000 steps
        validate(
            AGENTS[opt.agent.name]["policy_improvement"](
                policy_improvement.estimator, opt.action_cnt, epsilon=opt.val_epsilon
            ),
            get_env(opt, mode="testing"),
            opt.valid_step_cnt,
            rlog.getRootLogger(),
        )
        rlog.traceAndLog(epoch * opt.train_step_cnt)

        # save the checkpoint
        if opt.agent.save:
            ioutil.checkpoint_agent(
                opt.out_dir,
                steps,
                estimator=policy_evaluation.estimator,
                target_estimator=policy_evaluation.target_estimator,
                optim=policy_evaluation.optimizer,
                cfg=opt,
                replay=replay,
                save_replay=(epoch % 8 == 0 or epoch == opt.epoch_cnt),
            )
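# --- Hedged sketch, not part of the original file --------------------------------
# `validate` is defined elsewhere in the repo; this sketch only mirrors the call
# signature used above (policy, env, step budget, logger) and the metargs the
# validation metrics expect (`reward`, `done`, `val_frames`). The `act` method on
# the epsilon-greedy policy object is an assumption.
def validate(policy, env, steps, logger):
    obs = env.reset()
    for _ in range(steps):
        action = policy.act(obs)                 # hypothetical policy-improvement API
        obs, reward, done, _ = env.step(action)
        logger.put(reward=reward, done=done, val_frames=1)
        if done:
            obs = env.reset()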
def run(opt):
    """ Run experiment. This function is launched by liftoff. """

    # logging
    trn_log, val_log = set_logger(opt)

    # model related stuff
    device = torch.device("cuda")
    trn_set, val_set, wmp_set = get_dsets(opt)
    model = get_model(opt, device)
    optimizer = getattr(optim, opt.optim.name)(
        model.parameters(), **vars(opt.optim.args)
    )
    batch_size = opt.trn_loader.batch_size

    rlog.info(U.config_to_string(opt))
    rlog.info("Model: %s", str(model))
    rlog.info("Optimizer: %s \n", str(optimizer))

    # Warm up the model on a partition of the training dataset
    if wmp_set is not None:
        rlog.info("Warming-up on dset of size %d", len(wmp_set))
        for epoch in range(opt.warmup.epochs):
            # train for one epoch
            trn_loss, trn_acc = train(
                DataLoader(wmp_set, **vars(opt.trn_loader)),
                model,
                optimizer,
                get_criterion(opt, model, len(wmp_set) // batch_size),
                mc_samples=opt.trn_mcs,
            )
            val_stats = valid_stats(opt, model, val_set)
            trn_stats = train_stats(opt, model, wmp_set)
            trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

            # to pickle and tensorboard
            val_log.trace(step=epoch, **val_stats)
            trn_log.trace(step=epoch, **trn_stats)

            # to console
            for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
                log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

            # extra logging
            model_stats(opt, epoch, model)

        # maybe reset the optimizer after warm-up
        if opt.warmup.reset_optim:
            rlog.info("\nWarmup ended. Resetting optimizer.")
            optimizer = getattr(optim, opt.optim.name)(
                model.parameters(), **vars(opt.optim.args)
            )

    # Train on the full training dataset
    if wmp_set is not None:
        epochs = range(opt.warmup.epochs, opt.warmup.epochs + opt.epochs)
    else:
        epochs = range(opt.epochs)

    rlog.info("\nTraining on dset: %s", str(trn_set))
    for epoch in epochs:
        trn_loss, trn_acc = train(
            DataLoader(trn_set, **vars(opt.trn_loader)),
            model,
            optimizer,
            get_criterion(opt, model, len(trn_set) // batch_size),
            mc_samples=opt.trn_mcs,
        )
        val_stats = valid_stats(opt, model, val_set)
        trn_stats = train_stats(opt, model, trn_set)
        trn_stats["loss"], trn_stats["acc"] = trn_loss, trn_acc

        # to pickle and tensorboard
        val_log.trace(step=epoch, **val_stats)
        trn_log.trace(step=epoch, **trn_stats)

        # to console
        for log, stats in zip([trn_log, val_log], [trn_stats, val_stats]):
            log.info(log.fmt.format(epoch, stats["acc"], stats["loss"]))

        # extra logging
        model_stats(opt, epoch, model)
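# --- Hedged sketch, not part of the original file --------------------------------
# `train` is not shown here. A minimal sketch of a compatible implementation,
# assuming a classification setting where the loader yields (input, target)
# batches already placed on the right device, the criterion follows the usual
# (output, target) signature, and `mc_samples` forward passes are averaged
# (e.g. for a stochastic / Bayesian model).
def train(loader, model, optimizer, criterion, mc_samples=1):
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    for data, target in loader:
        optimizer.zero_grad()
        # average the predictions over `mc_samples` stochastic forward passes
        logits = torch.stack([model(data) for _ in range(mc_samples)]).mean(dim=0)
        loss = criterion(logits, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * target.size(0)
        correct += (logits.argmax(dim=1) == target).sum().item()
        seen += target.size(0)
    return total_loss / seen, correct / seen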