Example #1
import numpy as np
import matplotlib.pyplot as plt


def test_agent(iteration):
    # The excerpt begins inside this function: the signature, the
    # environment instance and the initialisation below are assumptions
    # reconstructed from how the variables are used further down.
    test_env = HillClimbingEnv()
    state = test_env.reset()
    done, step_idx, total_rew = False, 0, 0
    while not done:
        log(test_env, iteration, step_idx, total_rew)
        # Act greedily on the network's policy output (no exploration).
        p, _ = network.step(np.array([state]))
        action = np.argmax(p)
        state, reward, done, _ = test_env.step(action)
        step_idx += 1
        total_rew += reward
    log(test_env, iteration, step_idx, total_rew)  # log the terminal state
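
# The log() helper is not shown in this excerpt. The stand-in below is a
# purely hypothetical sketch consistent with the call sites above; the
# real helper may render the environment or write to a logger instead.
def log(env, iteration, step_idx, total_rew):
    print(f"iter={iteration} step={step_idx} total_reward={total_rew}")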

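# `mem` is created elsewhere in the original script; only its add_all()
# and get_minibatch() calls appear below. This class is a minimal sketch
# consistent with those two calls -- the name, capacity, and batch size
# are assumptions, not the author's implementation.
class ReplayMemory:
    def __init__(self, capacity=10000, batch_size=32):
        self.capacity = capacity      # assumed: evict oldest rows past this
        self.batch_size = batch_size  # assumed fixed minibatch size
        self.rows = []                # one dict per stored step

    def add_all(self, columns):
        # Turn column-wise input ({"ob": [...], "pi": [...], ...}) into
        # per-step rows so the keys stay aligned when sampling.
        keys = list(columns)
        for values in zip(*(columns[k] for k in keys)):
            self.rows.append(dict(zip(keys, values)))
        self.rows = self.rows[-self.capacity:]  # keep only the newest rows

    def get_minibatch(self):
        # Sample rows uniformly (with replacement) and stack them back
        # into column arrays matching batch["ob"], batch["pi"], ...
        idx = np.random.choice(len(self.rows), size=self.batch_size)
        return {k: np.array([self.rows[i][k] for i in idx])
                for k in self.rows[0]}

# Assumption: something like `mem = ReplayMemory()` runs before the
# training loop, and obs, pis, returns are equal-length per-step sequences.
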
value_losses = []
policy_losses = []

for i in range(1000):
    # Every 50 iterations, evaluate the current policy greedily and plot
    # the loss curves collected so far.
    if i % 50 == 0:
        test_agent(i)
        plt.plot(value_losses, label="value loss")
        plt.plot(policy_losses, label="policy loss")
        plt.legend()
        plt.show()

    # Self-play: run one episode, collecting observations, search
    # policies (pi) and returns as training targets.
    obs, pis, returns, total_reward, done_state = execute_episode(
        network, 32, HillClimbingEnv)
    mem.add_all({"ob": obs, "pi": pis, "return": returns})

    # Sample a minibatch from replay memory and take one training step
    # on the value and policy objectives.
    batch = mem.get_minibatch()
    vl, pl = trainer.train(batch["ob"], batch["pi"], batch["return"])
    value_losses.append(vl)
    policy_losses.append(pl)