def __init__(self, env=None, num_episodes=100000, gamma=0.9, alpha=0.6,
             batch_size=1, epsilon_anneal=1/50000):
    """Configure a tabular Q-learning agent.

    Args:
        env: Environment to train on. Defaults to a fresh ``hMDP()`` built
            per instance. (The original ``env=hMDP()`` default was evaluated
            once at definition time, so every default-constructed agent
            shared a single environment object — a mutable-default bug.)
        num_episodes: Number of training episodes to run.
        gamma: Discount factor applied to future rewards.
        alpha: Learning rate for the Q-value update.
        batch_size: Number of transitions consumed per update step.
        epsilon_anneal: Amount subtracted from epsilon per episode
            (1/50000 anneals epsilon over roughly 50k episodes).
    """
    # Build the default environment lazily so each agent owns its own copy
    # instead of sharing one instance created at class-definition time.
    self.env = hMDP() if env is None else env
    self.num_episodes = num_episodes
    self.gamma = gamma
    self.alpha = alpha
    self.batch_size = batch_size
    self.epsilon_anneal = epsilon_anneal
def __init__(self, env=None, meta_goals=None, num_episodes=20000,
             gamma=0.9, batch_size=32, epsilon_anneal=1/2000,
             meta_epsilon_anneal=1/12000):
    """Configure a hierarchical Q-learning agent (meta-controller + controller).

    Args:
        env: Environment to train on. Defaults to a fresh ``hMDP()`` built
            per instance. (The original ``env=hMDP()`` default was evaluated
            once at definition time, so every default-constructed agent
            shared one environment object — a mutable-default bug.)
        meta_goals: Goal states the meta-controller may select. Defaults to
            ``[0, 1, 2, 3, 4, 5]``, built per instance for the same reason
            (the original shared one default list across all instances).
        num_episodes: Number of training episodes to run.
        gamma: Discount factor applied to future rewards.
        batch_size: Number of transitions consumed per update step.
        epsilon_anneal: Per-episode decrement of the controller's epsilon.
        meta_epsilon_anneal: Per-episode decrement of the meta-controller's
            epsilon (annealed more slowly than the controller's).
    """
    # Lazy defaults: avoid the shared-mutable-default pitfall for both the
    # environment instance and the goal list.
    self.env = hMDP() if env is None else env
    self.meta_goals = [0, 1, 2, 3, 4, 5] if meta_goals is None else meta_goals
    self.num_episodes = num_episodes
    self.gamma = gamma
    self.batch_size = batch_size
    self.epsilon_anneal = epsilon_anneal
    self.meta_epsilon_anneal = meta_epsilon_anneal
#from agents.hDQN import hDQNAgent from envs.hmdp import StochastichMDPEnv as hMDP from envs.mdp import StochasticMDPEnv as MDP import utils.plotting as plotting import pandas as pd import numpy as np from matplotlib import pyplot as plt num_trials = 20 stats_q_learning = [] for i in range(num_trials): q_agent = QLearningAgent(env=hMDP(), num_episodes=25000) episode_stats = q_agent.learn() stats_q_learning.append(episode_stats) stats_hq_learning = [] for i in range(num_trials): hq_agent = hierarchicalQLearningAgent(env=hMDP(), num_episodes=25000) episode_stats = hq_agent.learn() stats_hq_learning.append(episode_stats) ''' stats_dqn = [] for i in range(num_trials): dqn_agent = DQNAgent(env=hMDP()) episode_stats = dqn_agent.learn() stats_dqn.append(episode_stats)
# NOTE(review): interior of hierarchicalQLearningAgent.learn(), flattened onto
# one physical line during extraction. The enclosing `def`, the episode/goal
# loop headers, and the initialization of Q1/Q2/epsilon/epsilon_meta/stats/
# s0/F/goal are not visible in this chunk, so the original loop nesting
# cannot be reconstructed with confidence — the code is left byte-identical.
# What the statements do (structure to be confirmed against the full source):
#   - controller picks a primitive action for the (state, goal) pair via
#     epsilon-greedy on Q1, steps the env, and computes an intrinsic reward;
#   - the (state, goal) transition updates Q1; extrinsic reward f accumulates
#     into F for the meta-controller's transition (s0, goal, F, s), which
#     updates Q2; a new goal is chosen epsilon-greedily when not done;
#   - both epsilons anneal toward 0.1 for the first 80% of episodes, then are
#     forced to 0 (pure exploitation) for the final 20%;
#   - `return stats` ends learn(); the trailing `if __name__ == "__main__"`
#     block is module-level driver code fused onto the same line.
action = self.epsGreedy((s, goal), A, epsilon[goal], Q1) s_next, f, done, _ = self.env.step(action) r = self.intrinsic_reward(s, action, s_next, goal) stats.episode_rewards[i] += f stats.episode_lengths[i] = t stats.visitation_count[s_next, i] += 1 D1 = [((s, goal), action, r, (s_next, goal), done)] Q1 = self.QValueUpdate(Q1, D1) F = F + f s = s_next t += 1 D2 = [(s0, goal, F, s, done)] Q2 = self.QValueUpdate(Q2, D2) if not done: goal = self.epsGreedy(s, self.meta_goals, epsilon_meta, Q2) stats.target_count[goal, i] += 1 epsilon[goal] = max(epsilon[goal] - self.epsilon_anneal, 0.1) if i < self.num_episodes*0.8 else 0 epsilon_meta = max(epsilon_meta - self.meta_epsilon_anneal, 0.1) if i < self.num_episodes*0.8 else 0 return stats #plotting.plot_episode_stats(stats, smoothing_window=1000) if __name__ == "__main__": agent = hierarchicalQLearningAgent(env=hMDP()) stats = agent.learn() plotting.plot_rewards([stats], smoothing_window=1000)
# NOTE(review): interior of QLearningAgent.learn(), flattened onto one
# physical line during extraction. The enclosing `def` and the initialization
# of Q/epsilon/A/stats are outside this chunk, and the per-statement
# indentation is lost, so the code is left byte-identical rather than
# reconstructed. What the statements do (structure to be confirmed):
#   - per episode: reset the env, then epsilon-greedily pick actions from Q,
#     step the env, record reward/length/visitation stats, and update Q from
#     the single transition (s, action, f, s_next, done);
#   - epsilon anneals toward a 0.1 floor for the first 80% of episodes, then
#     is forced to 0 (pure exploitation) for the final 20%;
#   - `return stats` ends learn(); the trailing `if __name__ == "__main__"`
#     block is module-level driver code fused onto the same line.
for i in range(self.num_episodes): if i % 1000 == 0: print('Episode ', i) print(epsilon) s = self.env.reset() done = False t = 0 while not done: action = self.epsGreedy(s, A, epsilon, Q) s_next, f, done, _ = self.env.step(action) stats.episode_rewards[i] += f stats.episode_lengths[i] = t stats.visitation_count[s_next, i] += 1 D = [(s, action, f, s_next, done)] Q = self.QValueUpdate(Q, D) s = s_next t += 1 epsilon = max(epsilon - self.epsilon_anneal, 0.1) if i < self.num_episodes * 0.8 else 0 return stats #plotting.plot_episode_stats(stats, smoothing_window=1000) if __name__ == "__main__": agent = QLearningAgent(env=hMDP()) stats = agent.learn() plotting.plot_rewards([stats], smoothing_window=1000)