def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])

    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n

    agent = DiscreteLinearAgent(
        n_features=observation_space_d,
        n_actions=action_space_d,
        degree=params["degree"],
    )

    mu = rloptim.cross_entropy(env, agent, params)
    utils.vector_to_parameters(mu, agent.policy.parameters())

    # Run one last rendered simulation before evaluation.
    rlsim.simulate(env, agent, True)

    # Save the final model.
    # model_name = f"{n_epochs}_{batch_size}_{layers}"
    # torch.save(agent.policy, f"cartpole_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
        env,
        agent,
        rleval.CARTPOLE_V0_EPISODES,
        rleval.check_cartpole_v0,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
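For reference, here is one way such a script might be driven. It is a minimal sketch: the key names mirror what the code above and the cross_entropy routine in Example No. 3 actually read (env, degree, n_epochs, n_samples, eval_samples, p), but every concrete value and the file name are illustrative assumptions, not the author's configuration.

import yaml

# Hypothetical parameters; every value here is an illustrative assumption.
example_params = {
    "env": "CartPole-v0",   # Gym environment id passed to gym.make.
    "degree": 1,            # Polynomial degree for DiscreteLinearAgent.
    "n_epochs": 50,         # Cross-entropy iterations.
    "n_samples": 64,        # Parameter vectors sampled per iteration.
    "eval_samples": 5,      # Episodes used to score each sampled vector.
    "p": 0.2,               # Fraction of samples kept as the elite set.
}

with open("cartpole_cem_params.yaml", "w") as fp:
    yaml.safe_dump(example_params, fp)

main("cartpole_cem_params.yaml")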
Example No. 2
def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])

    obs_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    hidden_layers = [obs_space_d, *params["hidden_layers"], action_space_d]

    agent = DiscreteMLPAgent(hidden_layers)

    mu = rloptim.cross_entropy(env, agent, params)
    utils.vector_to_parameters(mu, agent.policy.parameters())

    # Run one last rendered simulation before evaluation.
    rlsim.simulate(env, agent, True)

    n_epochs = params["n_epochs"]
    eval_samples = params["eval_samples"]

    # Save the final model.
    model_name = f"{n_epochs}_{eval_samples}_{hidden_layers}"
    torch.save(agent.policy, f"lunarlander2_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
            env,
            agent,
            rleval.LUNARLANDER_V2_EPISODES,
            rleval.check_lunarlander_v2,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
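DiscreteMLPAgent itself is not shown in these examples. Based on how it is used (constructed from a list of layer sizes, exposing a .policy torch module and a .params tuple of constructor arguments that cross_entropy uses to clone it), a minimal sketch could look like the following; the class body and the act method are assumptions, not the project's actual code.

import torch
import torch.nn as nn


class DiscreteMLPAgent:
    """Sketch of an MLP policy over a discrete action space (assumed structure)."""

    def __init__(self, layers):
        # Keep the constructor arguments so cross_entropy can clone the agent
        # via agent.__class__(*agent.params).
        self.params = (layers,)

        modules = []
        for in_d, out_d in zip(layers[:-1], layers[1:]):
            modules.append(nn.Linear(in_d, out_d))
            modules.append(nn.ReLU())
        modules[-1] = nn.Softmax(dim=-1)  # replace the last ReLU with a softmax head
        self.policy = nn.Sequential(*modules)

    def act(self, observation):
        # Hypothetical helper: sample an action from the policy's distribution.
        obs = torch.as_tensor(observation, dtype=torch.float32).flatten()
        probs = self.policy(obs)
        action = torch.multinomial(probs, 1).item()
        return action, probs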
Example No. 3
def cross_entropy(env, agent, params):

    mu = utils.parameters_to_vector(agent.policy.parameters())
    sigma = torch.ones_like(mu)

    n_epochs = params["n_epochs"]
    n_samples = params["n_samples"]
    eval_samples = params["eval_samples"]
    top_p_per = int(n_samples * params["p"])

    # Create agents.
    agents = [agent.__class__(*agent.params) for _ in range(n_samples)]

    for epoch in range(1, n_epochs + 1):
        # Sample n parameter vectors theta_i from N(mu, diag(sigma^2)).
        theta = torch.stack(
            [torch.normal(mean=mu, std=sigma) for _ in range(n_samples)]
        )

        # Assign one of the sampled parameter vectors to each agent.
        for sampled_agent, th in zip(agents, theta):
            utils.vector_to_parameters(th, sampled_agent.policy.parameters())

        results = []
        # Evaluate each agent.
        for i, sampled_agent in enumerate(agents):
            batch = [rlsim.simulate(env, sampled_agent) for _ in range(eval_samples)]
            average_return = rlstats.calc_average_return(
                rlutils.extract_rewards(batch)
            )

            results.append((i, average_return))

        best_results = sorted(results, key=lambda x: x[1])[-top_p_per:]
        elite_set_indices = list(map(lambda x: x[0], best_results))

        print(
            f"Best average return for epoch {epoch} is {best_results[-1][1]}."
        )

        elite_set = theta[elite_set_indices]

        # Re-fit Gaussian to the best results.
        mu = torch.mean(elite_set, dim=0)
        sigma = torch.std(elite_set, dim=0)

    return mu
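rlsim.simulate and the rlutils helpers are used throughout but never shown. Judging from the calls above (simulate(env, agent, render) producing one episode, extract_rewards returning per-episode reward lists, and extract_action_probs returning per-episode lists of action-probability vectors), one plausible sketch is below; the episode layout and the agent.act interface are assumptions.

def simulate(env, agent, render=False):
    """Roll out one episode; returns a list of (action_probs, reward) steps (assumed layout)."""
    episode = []
    obs = env.reset()
    done = False
    while not done:
        if render:
            env.render()
        action, probs = agent.act(obs)           # hypothetical agent interface
        obs, reward, done, _info = env.step(action)
        episode.append((probs, reward))
    return episode


def extract_rewards(batch):
    """Per-episode reward lists: [[r_1, ..., r_T], ...]."""
    return [[reward for _probs, reward in episode] for episode in batch]


def extract_action_probs(batch):
    """Per-episode lists of action-probability vectors."""
    return [[probs for probs, _reward in episode] for episode in batch]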
Example No. 4
def main(params_yaml_path):
    with open(params_yaml_path) as file:
        params = yaml.load(file, Loader=yaml.FullLoader)

    env = gym.make(params["env"])

    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n

    agent = DiscreteLinearAgent(
        n_features=observation_space_d,
        n_actions=action_space_d,
        degree=params["degree"],
    )

    optimizer = optim.RMSprop(agent.policy.parameters(), lr=params["lr"])

    n_epochs = params["n_epochs"]
    batch_size = params["batch_size"]
    n_dbg = params["n_dbg"]

    log_data = []
    for epoch in range(n_epochs):
        batch = [rlsim.simulate(env, agent) for _ in range(batch_size)]
        rloptim.vpg(batch, agent, optimizer, params["gamma"])

        log_data.append(
            {
                "epoch": epoch + 1,
                "probs": rlutils.extract_action_probs(batch),
                "rewards": rlutils.extract_rewards(batch),
            }
        )

        average_return = rlstats.calc_average_return(
            rlutils.extract_rewards(batch)
        )

        average_entropy = rlstats.calc_average_entropy(
            rlutils.extract_action_probs(batch)
        )

        print(
            f"Average return for epoch {epoch + 1} is {average_return:.2f} "
            f"with average entropy of {average_entropy:.2f}."
        )

        if epoch % n_dbg == 0:
            rlsim.simulate(env, agent, True)

        if average_entropy < params["min_entropy"]:
            print("Your agent's policy achieved desired entropy.")
            break

    rlsim.simulate(env, agent, True)

    # Save logs (`logs_dir` is assumed to be defined at module level).
    logs_fn = os.path.join(logs_dir, params["logs"])
    with open(logs_fn, "w") as fp:
        json.dump(log_data, fp, cls=rlutils.CustomEncoder)

    # for param in agent.policy.parameters():
    #     print(param.data)

    if rleval.evaluate(
        env,
        agent,
        rleval.CARTPOLE_V0_EPISODES,
        rleval.check_cartpole_v0,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
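rloptim.vpg is called here but not shown. The name suggests a vanilla policy gradient (REINFORCE) update; a generic sketch of such an update follows, under the assumption that each episode in the batch stores (log_prob, reward) pairs with log_prob still attached to the policy's computation graph. The project's actual implementation may compute returns and losses differently.

import torch


def vpg(batch, agent, optimizer, gamma):
    """Generic REINFORCE-style update (sketch). `agent` is accepted only to match
    the call signature above; the update is driven by the stored log-probabilities."""
    losses = []
    for episode in batch:
        rewards = [reward for _log_prob, reward in episode]

        # Discounted return G_t for every step, computed backwards over the episode.
        returns, g = [], 0.0
        for r in reversed(rewards):
            g = r + gamma * g
            returns.append(g)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        log_probs = torch.stack([log_prob for log_prob, _reward in episode])
        losses.append(-(log_probs * returns).sum())

    optimizer.zero_grad()
    torch.stack(losses).mean().backward()
    optimizer.step()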
def main(params_yaml_path):
    with open(params_yaml_path) as fp:
        params = yaml.load(fp, Loader=yaml.FullLoader)

    env = gym.make(params["env"])

    observation_space_d = int(np.prod(env.observation_space.shape))
    action_space_d = env.action_space.n
    layers = [observation_space_d, *params["hidden_layers"], action_space_d]

    agent = DiscreteMLPAgent(layers)
    optimizer = optim.RMSprop(agent.policy.parameters(), lr=params["lr"])

    n_epochs = params["n_epochs"]
    batch_size = params["batch_size"]
    episode_dbg = params["n_dbg"]

    log_data = []
    for epoch in range(n_epochs):
        batch = [rlsim.simulate(env, agent) for _ in range(batch_size)]
        rloptim.vpg(batch, agent, optimizer, params["gamma"])

        # Compute stuff for the log.
        rewards = [
            np.sum(eps_rewards)
            for eps_rewards in rlutils.extract_rewards(batch)
        ]

        entropy = [
            np.mean(list(map(rlstats.entropy, eps_probs)))
            for eps_probs in rlutils.extract_action_probs(batch)
        ]

        log_data.append({
            "epoch": epoch + 1,
            "entropy": entropy,
            "rewards": rewards
        })

        avg_return = rlstats.calc_average_return(
            rlutils.extract_rewards(batch))

        avg_entropy = rlstats.calc_average_entropy(
            rlutils.extract_action_probs(batch))

        if np.isnan(avg_entropy) or avg_entropy < params["min_entropy"]:
            print("Stopping: average entropy is NaN or below the desired threshold.")
            break

        print(f"Average return for {epoch + 1}th epoch is {avg_return:.2f} \
with average entropy of {avg_entropy:.2f}.")

        if epoch % episode_dbg == 0:
            rlsim.simulate(env, agent, True)

        if epoch % 20 == 0 and epoch > 0:
            # Save logs.
            logs_fn = os.path.join(logs_dir, params["logs"])
            with open(logs_fn, "w") as fp:
                json.dump(log_data, fp, cls=rlutils.CustomEncoder)

            # Save the learned model.
            model_name = (
                f"{n_epochs}_{batch_size}_{layers}_{epoch}_{avg_return:.2f}")
            torch.save(agent.policy, f"lunarlander2_{model_name}.pth")

    # Run one last rendered simulation before evaluation.
    rlsim.simulate(env, agent, True)

    # Save the final model.
    model_name = f"{n_epochs}_{batch_size}_{layers}"
    torch.save(agent.policy, f"lunarlander2_{model_name}_final.pth")

    # Evaluate the final learned model.
    if rleval.evaluate(
            env,
            agent,
            rleval.LUNARLANDER_V2_EPISODES,
            rleval.check_lunarlander_v2,
    ):
        print("You've successfully solved the environment.")
    else:
        print("You did not solve the environment.")

    env.close()
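rleval is the remaining unshown module. Given the call signature evaluate(env, agent, n_episodes, check_fn) and Gym's published solve criteria (CartPole-v0: average return of at least 195 over 100 consecutive episodes; LunarLander-v2: at least 200), a plausible sketch is below. The constants and function bodies are assumptions, not the project's actual code, and evaluate reuses the simulate sketch from above.

import numpy as np

# Episode counts follow Gym's "solved over 100 consecutive episodes" convention (assumption).
CARTPOLE_V0_EPISODES = 100
LUNARLANDER_V2_EPISODES = 100


def check_cartpole_v0(returns):
    # CartPole-v0 is considered solved at an average return of 195 or better.
    return np.mean(returns) >= 195.0


def check_lunarlander_v2(returns):
    # LunarLander-v2 is considered solved at an average return of 200 or better.
    return np.mean(returns) >= 200.0


def evaluate(env, agent, n_episodes, check_fn):
    """Run the current policy for n_episodes and apply the solve criterion."""
    returns = []
    for _ in range(n_episodes):
        episode = simulate(env, agent)   # reuses the simulate sketch above
        returns.append(sum(reward for _probs, reward in episode))
    return check_fn(returns)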