Example #1
    def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):

        STEPS = 35

        env = DummyEnv()
        agent = A2C.from_environment(env, discount_gamma=0.)
        rollout = Rolling(agent, env)

        rollout.roll(STEPS, verbose=0, push_experience=True)

        data = agent.memory_sampler.sample(-1)

        self.assertEqual(agent.episodes, 3)
        np.testing.assert_array_less(data["state"], 10)
        self.assertEqual(len(data["state"]), STEPS - 4)
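The test builds on a `DummyEnv` test double from the library's own test utilities. Purely as an illustration of the Gym-style interface such a double needs (this is a hypothetical stand-in, not the library's actual `DummyEnv`), a minimal counting environment could look like this:

import numpy as np

class CountingEnvSketch:
    """Hypothetical stand-in: the observation is a step counter that stays below 10."""

    def reset(self):
        self.state = 0
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        self.state += 1
        done = self.state >= 9  # end the episode before the counter reaches 10
        return np.array([self.state], dtype=np.float32), 1.0, done, {}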
Example #2
# `agent` and the training `rollout` (a Rolling instance, as in the other
# examples here) are assumed to be constructed earlier in the full script.
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_actor_loss.append(agent_history["actor_loss"])
        episode_actor_utility.append(agent_history["actor_utility"])
        episode_actor_entropy.append(agent_history["actor_entropy"])
        episode_critic_loss.append(agent_history["critic_loss"])

    test_history = test_rollout.rollout(verbose=0, push_experience=False)

    rewards.append(test_history["reward_sum"])
    actor_loss.append(sum(episode_actor_loss) / len(episode_actor_loss))
    actor_utility.append(
        sum(episode_actor_utility) / len(episode_actor_utility))
    actor_entropy.append(
        sum(episode_actor_entropy) / len(episode_actor_entropy))
    critic_loss.append(sum(episode_critic_loss) / len(episode_critic_loss))
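The per-episode lists collected above can be turned into a quick learning-curve plot. A minimal sketch with matplotlib (not part of the original example; it only uses the lists defined above):

import matplotlib.pyplot as plt

fig, (ax_top, ax_bottom) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_top.plot(rewards)
ax_top.set_ylabel("episode reward")
ax_bottom.plot(actor_loss, label="actor loss")
ax_bottom.plot(critic_loss, label="critic loss")
ax_bottom.plot(actor_entropy, label="actor entropy")
ax_bottom.set_xlabel("episode")
ax_bottom.legend()
plt.show()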
Example #3
                  action_space=2,
                  memory=Experience(max_length=10000),
                  epsilon=1.,
                  discount_factor_gamma=0.98)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []

    for update in range(32):
        roll_history = rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    test_history = test_rollout.rollout(verbose=0,
                                        push_experience=False,
                                        render=False)

    rewards.append(test_history["reward_sum"])
    losses.append(np.mean(episode_losses))

    print("\rEpisode {:>4} RWD {:>3.0f} LOSS {:.4f} EPS {:>6.2%}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:]), agent.epsilon),
          end="")

    agent.epsilon *= 0.992
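The last line anneals exploration multiplicatively, so after n episodes epsilon is roughly 0.992 ** n (assuming no other decay is applied). A quick arithmetic check, independent of the library, shows the resulting schedule:

# epsilon after n episodes of `agent.epsilon *= 0.992`, starting from 1.0
for n in (50, 100, 250, 500):
    print(n, round(0.992 ** n, 3))
# -> 50 0.669, 100 0.448, 250 0.134, 500 0.018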
Example #4
experience = Experience(10000)
agent = DoubleDQN(ann,
                  env.action_space,
                  experience,
                  epsilon=1.,
                  epsilon_decay=1.,
                  epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} 10000/{experience.N}", end="")
print()
agent.epsilon_decay = 0.99995  # start decaying epsilon only once the buffer is warm

logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):

    for update in range(32):
        training_rollout.roll(steps=32, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=1024, verbose=0, polyak_tau=0.1)
        logger.buffer(**agent_history)

    for _ in range(3):
        test_history = testing_rollout.rollout(verbose=0,