    plt.figure(figsize=[10, 4])
    rewards = []

    # Training loop
    for i in range(max_iterations):
        # Play one episode and record its total reward
        # (play_and_train resets the env itself, so no extra reset is needed)
        rewards.append(play_and_train(env, agent))

        # Decay exploration; the discount factor doubles as the epsilon decay rate
        agent.epsilon = agent.epsilon * discount

        if i % 100 == 0:
            print('Iteration {}, Mean reward (last 100) {:.2f}, Epsilon {:.3f}'.format(
                i, np.mean(rewards[-100:]), agent.epsilon))

        if visualize:
            plt.subplot(1, 2, 1)
            plt.plot(rewards, color='r')
            plt.xlabel('Iterations')
            plt.ylabel('Total Reward')

            plt.subplot(1, 2, 2)
            plt.hist(rewards,
                     bins=20,
                     range=[-700, +20])
            plt.xlabel('Total Reward')
            plt.show()
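# Note: both examples rely on a play_and_train helper that is not shown in
# this listing. A minimal sketch, assuming a Gym-style env (old 4-tuple
# step API) and an agent exposing get_action/update (names are assumptions):
def play_and_train(env, agent, t_max=10**4):
    """Run one full episode, updating the agent after every step,
    and return the total (undiscounted) episode reward."""
    total_reward = 0.0
    s = env.reset()
    for t in range(t_max):
        a = agent.get_action(s)            # epsilon-greedy action from the agent
        next_s, r, done, _ = env.step(a)   # old Gym step returns a 4-tuple
        agent.update(s, a, r, next_s)      # TD update on this transition
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward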
Example #2
    discount = 0.99

    agent = QLearningAgent(alpha, epsilon, discount, getActionRange)
    agent2 = SarsaAgent(alpha, epsilon, discount, getActionRange)
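    # getActionRange is assumed to return the legal actions for a given state
    # (e.g. range(env.action_space.n) for a discrete Gym env). The two agents
    # share hyperparameters; Q-learning bootstraps off-policy from the max
    # Q-value while SARSA bootstraps on-policy from the action actually taken,
    # which is the difference this example compares.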

    plt.figure(figsize=[10, 4])
    rewards1 = []
    rewards2 = []
    # Training loop
    for i in range(max_iterations):
        # Play one episode with each agent and record the episode rewards
        rewards1.append(play_and_train(env, agent))
        rewards2.append(play_and_train(env, agent2))
        if (i + 1) % 100 == 0:
            agent.epsilon = max(agent.epsilon * 0.99, 0.00001)
            agent2.epsilon = max(agent2.epsilon * 0.99, 0.00001)
            # agent.alpha = max(agent.alpha * 0.99, 0.00001)
            # agent2.alpha = max(agent2.alpha * 0.99, 0.00001)

        if i % 100 == 0:
            print(
                'Iteration {}, Q-learning avg reward {:.2f}, SARSA avg reward {:.2f}, Epsilon {:.3f}'
                .format(i, np.mean(rewards1[-100:]), np.mean(rewards2[-100:]),
                        agent.epsilon))

        if visualize:
            plt.subplot(1, 2, 1)
            plt.plot(rewards1, color='r', label='Q-learning')
            plt.plot(rewards2, color='b', label='SARSA')
            plt.xlabel('Iterations')
            plt.ylabel('Total Reward')
            plt.legend()
            plt.show()