Example #1

import numpy as np
import matplotlib.pyplot as plt

import plotting  # project-local plotting helpers (sketched below)

# stats_q_learning / stats_hq_learning are lists of per-run training
# statistics produced by the training script (not shown on this page).
# Mean extrinsic return over the final 5,000 episodes (the exploitative phase).
exploit_returns = []
for stat in stats_q_learning:
    exploit_returns += [np.mean(stat.episode_rewards[20000:25000])]

print('Q-learning average exploitative return:', np.mean(exploit_returns))

exploit_returns = []
for stat in stats_hq_learning:
    exploit_returns += [np.mean(stat.episode_rewards[20000:25000])]

print('Hierarchical Q-learning average exploitative return:',
      np.mean(exploit_returns))

#########################################################

# Learning curves: Q-learning vs. hierarchical Q-learning on one plot.
plt.figure()

plotting.plot_rewards(stats_q_learning, c='g')
plotting.plot_rewards(stats_hq_learning, c='b')
# Alternative colour scheme:
# plotting.plot_rewards(stats_q_learning, c='r')
# plotting.plot_rewards(stats_hq_learning, c='c')

plt.legend(["Q-learning", "Hierarchical Q-learning"])
plt.xlabel("Episode")
plt.ylabel("Extrinsic Reward")
plt.title("Discrete Stochastic Decision Process")

#########################################################

plt.show()  # render the comparison figure
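
Both examples lean on plotting.plot_rewards, a project-local helper that is not shown on this page. Below is a minimal sketch of what it might look like, assuming each stats object exposes an episode_rewards array and that smoothing_window controls a moving average; the signature is taken from the calls above, the internals are guesses.

import numpy as np
import matplotlib.pyplot as plt

def plot_rewards(stats_list, smoothing_window=100, c=None):
    """Plot smoothed per-episode extrinsic rewards for each run in stats_list."""
    for stat in stats_list:
        rewards = np.asarray(stat.episode_rewards, dtype=float)
        # Moving average over `smoothing_window` episodes to tame the noise.
        kernel = np.ones(smoothing_window) / smoothing_window
        plt.plot(np.convolve(rewards, kernel, mode="valid"), c=c)
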
Example #2

                    # One primitive step; f is the extrinsic reward from the
                    # environment, r the intrinsic reward for the current goal.
                    s_next, f, done, _ = self.env.step(action)
                    r = self.intrinsic_reward(s, action, s_next, goal)
                    stats.episode_rewards[i] += f
                    stats.episode_lengths[i] = t
                    stats.visitation_count[s_next, i] += 1

                    # Controller transition: the controller's state is the
                    # (state, goal) pair, and it learns from intrinsic reward.
                    D1 = [((s, goal), action, r, (s_next, goal), done)]
                    Q1 = self.QValueUpdate(Q1, D1)
                    F += f  # accumulate extrinsic reward for the meta-controller
                    s = s_next
                    t += 1
                # Meta-controller transition: one update per finished goal,
                # rewarded with the extrinsic return F gathered under that goal.
                D2 = [(s0, goal, F, s, done)]
                Q2 = self.QValueUpdate(Q2, D2)
                if not done:
                    # Pick the next goal epsilon-greedily under Q2 and anneal
                    # that goal's exploration rate (floored at 0.1, forced to 0
                    # for the final 20% of episodes).
                    goal = self.epsGreedy(s, self.meta_goals, epsilon_meta, Q2)
                    stats.target_count[goal, i] += 1
                    epsilon[goal] = max(epsilon[goal] - self.epsilon_anneal, 0.1) if i < self.num_episodes * 0.8 else 0

            # Anneal the meta-controller's exploration rate the same way.
            epsilon_meta = max(epsilon_meta - self.meta_epsilon_anneal, 0.1) if i < self.num_episodes * 0.8 else 0


        # plotting.plot_episode_stats(stats, smoothing_window=1000)
        return stats



if __name__ == "__main__":
    agent = hierarchicalQLearningAgent(env=hMDP())
    stats = agent.learn()
    plotting.plot_rewards([stats], smoothing_window=1000)
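
The fragment above relies on three agent methods that are not shown on this page: intrinsic_reward, epsGreedy, and QValueUpdate. Here is a minimal tabular sketch of plausible implementations, written as free functions for brevity (inside the agent they would take self). The names and call signatures come from the fragment; the dict-backed Q-tables, the binary intrinsic reward, and the alpha/gamma defaults are assumptions.

import numpy as np

def intrinsic_reward(s, action, s_next, goal):
    # Assumed: binary critic that pays 1 when the chosen goal is reached.
    return 1.0 if s_next == goal else 0.0

def epsGreedy(state, choices, epsilon, Q):
    # With probability epsilon pick a random choice, otherwise the one
    # with the highest Q-value for this state (unseen pairs default to 0).
    if np.random.rand() < epsilon:
        return choices[np.random.randint(len(choices))]
    values = [Q.get((state, c), 0.0) for c in choices]
    return choices[int(np.argmax(values))]

def QValueUpdate(Q, D, alpha=0.1, gamma=0.9):
    # One-step Q-learning over a batch of (s, a, r, s_next, done) tuples,
    # matching the D1/D2 transitions built in the loop above.
    for s, a, r, s_next, done in D:
        nxt = max((v for (st, _), v in Q.items() if st == s_next), default=0.0)
        target = r if done else r + gamma * nxt
        Q[(s, a)] = Q.get((s, a), 0.0) + alpha * (target - Q.get((s, a), 0.0))
    return Q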