Example #1
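The experiment loop below compares Q-learning and Expected Sarsa on the 48-state cliff-walking grid using an RL-Glue style harness. It relies on setup code that is not part of this listing; the sketch below shows one minimal version of it. The module and class names (rl_glue.RLGlue, CliffWalkEnvironment, QLearningAgent, ExpectedSarsaAgent), the hyperparameters in agent_info, and the run/episode counts (5 runs, 100 episodes, inferred from the _r5_e100 file names) are assumptions, not taken from the original code.

import time

import numpy as np
import matplotlib.pyplot as plt

from rl_glue import RLGlue                              # RL-Glue style experiment harness (assumed module)
from cliffworld_env import CliffWalkEnvironment         # 4 x 12 grid -> 48 states (assumed module)
from agents import QLearningAgent, ExpectedSarsaAgent   # assumed agent implementations

num_runs = 5        # assumed from the "_r5" part of the result file names
num_episodes = 100  # assumed from the "_e100" part of the result file names

env = CliffWalkEnvironment
agents = {"Q-learning": QLearningAgent, "Expected Sarsa": ExpectedSarsaAgent}

# assumed hyperparameters for the cliff-walking task
agent_info = {"num_actions": 4, "num_states": 48,
              "epsilon": 0.1, "step_size": 0.5, "discount": 1.0}
env_info = {}

all_reward_sums = {}   # algorithm -> per-run lists of per-episode returns
all_state_visits = {}  # algorithm -> per-run state-visit counts (last 10 episodes)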
for algorithm in ["Q-learning", "Expected Sarsa"]:
    all_reward_sums[algorithm] = []
    all_state_visits[algorithm] = []

    for run in range(num_runs):
        # Fresh agent and environment for every run
        rl_glue = RLGlue(env, agents[algorithm])
        rl_glue.rl_init(agent_info, env_info)

        reward_sums = []
        state_visits = np.zeros(48)
        last_episode_total_reward = 0

        for episode in range(num_episodes):
            start_time = time.perf_counter()

            if episode < num_episodes - 10:
                # Runs an episode (capped at 10,000 steps)
                rl_glue.rl_episode(10000)
            else:
                # Runs an episode while keeping track of visited states
                state, action = rl_glue.rl_start()
                state_visits[state] += 1
                is_terminal = False
                while not is_terminal:
                    reward, state, action, is_terminal = rl_glue.rl_step()
                    state_visits[state] += 1

            # Subtract the running total from previous episodes to get
            # this episode's sum of rewards
            reward_sums.append(rl_glue.rl_return() - last_episode_total_reward)
            last_episode_total_reward = rl_glue.rl_return()

            end_time = time.perf_counter()
            print("Time of episode", episode, ":", end_time - start_time)

        print("q_table:", rl_glue.agent.q)
        all_reward_sums[algorithm].append(reward_sums)
        all_state_visits[algorithm].append(state_visits)

    # save the action-value table learned in the last run of this algorithm
    name = 'results/' + algorithm + '_q_table_r5_e100.npy'
    np.save(name, rl_glue.agent.q)

# save results
np.save('results/q_learning_r5_e100.npy', all_reward_sums['Q-learning'])
np.save('results/expected_sarsa_r5_e100.npy',
        all_reward_sums['Expected Sarsa'])
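Once the runs finish, the saved action-value tables can be loaded back and inspected. Here is a minimal sketch of recovering the greedy policy from one of them; the (48, 4) table shape, the row-major 4 x 12 state layout, and the action ordering are assumptions about the agent and environment, not facts from the code above.

# Sketch: greedy policy from a saved q-table (assumes shape (48, 4))
q = np.load('results/Q-learning_q_table_r5_e100.npy')
greedy = np.argmax(q, axis=1).reshape(4, 12)     # assumes row-major 4 x 12 state indexing

action_symbols = np.array(['U', 'L', 'D', 'R'])  # assumed action ordering
print(action_symbols[greedy])                    # one greedy action per grid cell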

# plot results
for algorithm in ["Q-learning", "Expected Sarsa"]:
    plt.plot(np.mean(all_reward_sums[algorithm], axis=0), label=algorithm)
plt.xlabel("Episodes")
plt.ylabel("Sum of\n rewards\n during\n episode", rotation=0, labelpad=40)
plt.xlim(0, 100)
plt.ylim(-30, 0)
plt.legend()
plt.show()
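The state-visit counts gathered during the last 10 episodes end up in all_state_visits but are not used by the plot above. Below is a sketch of one way to look at them, averaging over runs and drawing each algorithm's counts as a heat map; the reshape to a 4 x 12 grid again assumes row-major state indexing.

# Sketch: visualize the state-visit counts from the last 10 episodes of each run
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
for ax, algorithm in zip(axes, ["Q-learning", "Expected Sarsa"]):
    visits = np.mean(all_state_visits[algorithm], axis=0).reshape(4, 12)  # assumed 4 x 12 layout
    im = ax.imshow(visits, cmap='viridis')
    ax.set_title(algorithm)
    fig.colorbar(im, ax=ax)
plt.show()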