def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])
    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    agent = DiscreteLinearAgent(
        n_features=observation_space_d,
        n_actions=action_space_d,
        degree=params["degree"],
    )

    # Optimize the policy with the cross-entropy method and load the resulting
    # mean parameter vector back into the agent's policy.
    mu = rloptim.cross_entropy(env, agent, params)
    utils.vector_to_parameters(mu, agent.policy.parameters())

    # Provide last simulation before testing.
    rlsim.simulate(env, agent, True)

    # Save the final model.
    # model_name = f"{n_epochs}_{batch_size}_{layers}"
    # torch.save(agent.policy, f"cartpole_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
        env,
        agent,
        rleval.CARTPOLE_V0_EPISODES,
        rleval.check_cartpole_v0,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
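# Illustrative params file for the script above, written as a dict for
# reference.  Only the keys are taken from the code (main() and
# rloptim.cross_entropy); the values are placeholders, not tuned settings from
# the original experiments.
EXAMPLE_CEM_CARTPOLE_PARAMS = {
    "env": "CartPole-v0",  # evaluated against rleval.check_cartpole_v0
    "degree": 1,           # polynomial feature degree for DiscreteLinearAgent
    "n_epochs": 20,        # cross-entropy iterations
    "n_samples": 50,       # parameter vectors sampled per iteration
    "eval_samples": 5,     # episodes used to score each sampled vector
    "p": 0.2,              # fraction of samples kept as the elite set
}
# yaml.safe_dump(EXAMPLE_CEM_CARTPOLE_PARAMS) produces a file main() can read.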
def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])
    obs_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    layers = [obs_space_d, *params["hidden_layers"], action_space_d]
    agent = DiscreteMLPAgent(layers)

    # Optimize the policy with the cross-entropy method and load the resulting
    # mean parameter vector back into the agent's policy.
    mu = rloptim.cross_entropy(env, agent, params)
    utils.vector_to_parameters(mu, agent.policy.parameters())

    # Provide last simulation before testing.
    rlsim.simulate(env, agent, True)

    n_epochs = params["n_epochs"]
    eval_samples = params["eval_samples"]

    # Save the final model.
    model_name = f"{n_epochs}_{eval_samples}_{layers}"
    torch.save(agent.policy, f"lunarlander2_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
        env,
        agent,
        rleval.LUNARLANDER_V2_EPISODES,
        rleval.check_lunarlander_v2,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
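# Illustrative only: reloading a policy checkpoint written by the script
# above.  The file name is a placeholder, and the layer sizes must match the
# ones used when the checkpoint was saved (8 observations and 4 actions are
# the LunarLander-v2 dimensions; the hidden sizes here are made up).
def load_saved_agent(checkpoint_path="lunarlander2_example_final.pth"):
    agent = DiscreteMLPAgent([8, 64, 64, 4])
    # torch.save() above pickles the whole module, so torch.load() restores it
    # directly, provided the policy class is importable when unpickling
    # (newer PyTorch versions may additionally require weights_only=False).
    agent.policy = torch.load(checkpoint_path)
    return agent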
def cross_entropy(env, agent, params):
    mu = utils.parameters_to_vector(agent.policy.parameters())
    sigma = torch.ones_like(mu)

    n_epochs = params["n_epochs"]
    n_samples = params["n_samples"]
    eval_samples = params["eval_samples"]
    top_p_per = int(n_samples * params["p"])

    # Create agents.
    agents = [agent.__class__(*agent.params) for _ in range(n_samples)]

    for epoch in range(1, n_epochs + 1):
        # Collect n samples of theta_i from N(mu, diag(sigma)).
        theta = torch.stack(
            [torch.normal(mean=mu, std=sigma) for _ in range(n_samples)]
        )

        # To each agent, assign one of the sampled parameter vectors.
        for sampled_agent, th in zip(agents, theta):
            utils.vector_to_parameters(th, sampled_agent.policy.parameters())

        results = []
        # Evaluate each agent.
        for i, sampled_agent in enumerate(agents):
            batch = [
                rlsim.simulate(env, sampled_agent) for _ in range(eval_samples)
            ]
            average_return = rlstats.calc_average_return(
                rlutils.extract_rewards(batch)
            )
            results.append((i, average_return))

        best_results = sorted(results, key=lambda x: x[1])[-top_p_per:]
        elite_set_indices = [i for i, _ in best_results]
        print(
            f"Best average return for epoch {epoch} is {best_results[-1][1]}."
        )

        elite_set = theta[elite_set_indices]
        # Re-fit the Gaussian to the best results.
        mu = torch.mean(elite_set, dim=0)
        sigma = torch.std(elite_set, dim=0)

    return mu
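# Minimal sketch of the agent interface cross_entropy() relies on.  This stub
# is an illustration only; the real DiscreteLinearAgent / DiscreteMLPAgent are
# assumed to expose the same two attributes (rlsim.simulate additionally needs
# whatever action-selection method those agents provide, which is not shown
# here).
class _StubAgent:
    def __init__(self, n_features, n_actions):
        # `params` must hold the constructor arguments so that cross_entropy()
        # can clone the agent via agent.__class__(*agent.params).
        self.params = (n_features, n_actions)
        # `policy` must be a torch.nn.Module: its parameters are flattened
        # with parameters_to_vector() and overwritten per sample with
        # vector_to_parameters().
        self.policy = torch.nn.Linear(n_features, n_actions)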
def main(params_yaml_path):
    with open(params_yaml_path) as file:
        params = yaml.load(file, Loader=yaml.FullLoader)

    env = gym.make(params["env"])
    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    agent = DiscreteLinearAgent(
        n_features=observation_space_d,
        n_actions=action_space_d,
        degree=params["degree"],
    )
    optimizer = optim.RMSprop(agent.policy.parameters(), lr=params["lr"])

    n_epochs = params["n_epochs"]
    batch_size = params["batch_size"]
    n_dbg = params["n_dbg"]

    log_data = []
    for epoch in range(n_epochs):
        batch = [rlsim.simulate(env, agent) for _ in range(batch_size)]
        rloptim.vpg(batch, agent, optimizer, params["gamma"])

        log_data.append(
            {
                "epoch": epoch + 1,
                "probs": rlutils.extract_action_probs(batch),
                "rewards": rlutils.extract_rewards(batch),
            }
        )

        average_return = rlstats.calc_average_return(
            rlutils.extract_rewards(batch)
        )
        average_entropy = rlstats.calc_average_entropy(
            rlutils.extract_action_probs(batch)
        )
        print(
            f"Average return for epoch {epoch + 1} is {average_return:.2f} "
            f"with average entropy of {average_entropy:.2f}."
        )

        if epoch % n_dbg == 0:
            rlsim.simulate(env, agent, True)

        if average_entropy < params["min_entropy"]:
            print("Your agent's policy achieved the desired entropy.")
            break

    # Provide last simulation before testing.
    rlsim.simulate(env, agent, True)

    # Save logs.
    logs_fn = os.path.join(logs_dir, params["logs"])
    with open(logs_fn, "w") as fp:
        json.dump(log_data, fp, cls=rlutils.CustomEncoder)

    # for param in agent.policy.parameters():
    #     print(param.data)

    # Evaluate the final learned model.
    if rleval.evaluate(
        env,
        agent,
        rleval.CARTPOLE_V0_EPISODES,
        rleval.check_cartpole_v0,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
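# Illustrative params for the VPG variant above.  Again, only the keys are
# taken from the code; every value is a placeholder rather than a reproduced
# hyperparameter.
EXAMPLE_VPG_CARTPOLE_PARAMS = {
    "env": "CartPole-v0",
    "degree": 1,                  # DiscreteLinearAgent feature degree
    "lr": 1e-2,                   # RMSprop learning rate
    "gamma": 0.99,                # discount used by rloptim.vpg
    "n_epochs": 200,
    "batch_size": 16,             # episodes simulated per update
    "n_dbg": 20,                  # render a debug episode every n_dbg epochs
    "min_entropy": 0.1,           # early-stopping threshold on policy entropy
    "logs": "vpg_cartpole.json",  # written into logs_dir
}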
def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])
    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    layers = [observation_space_d, *params["hidden_layers"], action_space_d]
    agent = DiscreteMLPAgent(layers)
    optimizer = optim.RMSprop(agent.policy.parameters(), lr=params["lr"])

    n_epochs = params["n_epochs"]
    batch_size = params["batch_size"]
    episode_dbg = params["n_dbg"]

    log_data = []
    for epoch in range(n_epochs):
        batch = [rlsim.simulate(env, agent) for _ in range(batch_size)]
        rloptim.vpg(batch, agent, optimizer, params["gamma"])

        # Compute per-episode statistics for the log.
        rewards = [
            np.sum(eps_rewards)
            for eps_rewards in rlutils.extract_rewards(batch)
        ]
        entropy = [
            np.mean(list(map(rlstats.entropy, eps_probs)))
            for eps_probs in rlutils.extract_action_probs(batch)
        ]
        log_data.append(
            {"epoch": epoch + 1, "entropy": entropy, "rewards": rewards}
        )

        avg_return = rlstats.calc_average_return(
            rlutils.extract_rewards(batch)
        )
        avg_entropy = rlstats.calc_average_entropy(
            rlutils.extract_action_probs(batch)
        )

        # Stop early when the policy entropy collapses below the threshold
        # (a NaN entropy is treated as collapsed as well).
        if np.isnan(avg_entropy) or avg_entropy < params["min_entropy"]:
            print("Your agent's policy achieved the desired entropy.")
            break

        print(
            f"Average return for epoch {epoch + 1} is {avg_return:.2f} "
            f"with average entropy of {avg_entropy:.2f}."
        )

        if epoch % episode_dbg == 0:
            rlsim.simulate(env, agent, True)

        if epoch % 20 == 0 and epoch > 0:
            # Save intermediate logs.
            logs_fn = os.path.join(logs_dir, params["logs"])
            with open(logs_fn, "w") as fp:
                json.dump(log_data, fp, cls=rlutils.CustomEncoder)

            # Save the learned model.
            model_name = (
                f"{n_epochs}_{batch_size}_{layers}_{epoch}_{avg_return:.2f}"
            )
            torch.save(agent.policy, f"lunarlander2_{model_name}.pth")

    # Provide last simulation before testing.
    rlsim.simulate(env, agent, True)

    # Save the final model.
    model_name = f"{n_epochs}_{batch_size}_{layers}"
    torch.save(agent.policy, f"lunarlander2_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
        env,
        agent,
        rleval.LUNARLANDER_V2_EPISODES,
        rleval.check_lunarlander_v2,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
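# Illustrative: reading the JSON log written above.  Each entry holds the
# epoch number plus per-episode returns and mean entropies, so training curves
# can be recovered directly.  The file name is a placeholder standing in for
# os.path.join(logs_dir, params["logs"]).
with open("logs/vpg_lunarlander.json") as fp:
    log = json.load(fp)

epochs = [entry["epoch"] for entry in log]
mean_return_per_epoch = [np.mean(entry["rewards"]) for entry in log]
mean_entropy_per_epoch = [np.mean(entry["entropy"]) for entry in log]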