class DeepSeaTreasureGraphicalDDQN(object):
    """Train a ``DQNAgent`` on the image-based ``DeepSeaTreasure`` environment.

    The runner owns the environment, the agent and a matplotlib figure that
    is redrawn at the end of every episode. Per-episode statistics are kept
    in the ``episode_*`` lists, one entry per finished episode.

    Args:
        episodes: Number of episodes ``train`` runs.
    """

    # ``selectAction`` may hand back this value instead of Q-values; such
    # steps are left out of the Q average.
    # NOTE(review): presumably this marks a random (exploratory) action --
    # confirm against DQNAgent.
    _NO_Q_VALUE = -100000

    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []  # never filled by this runner
        self.fig, self.ax = plt.subplots(figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)
        self.numRewards = 2
        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False,
                                   frame_stack=2)
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        """Run every episode, then block on the figure window.

        The environment is closed even when an episode raises, so a failed
        run does not leave it open.
        """
        try:
            for _ in range(self.episodes):
                self.episode()
                self.current_episode += 1
            plt.show(block=True)
        finally:
            self.env.close()

    def episode(self):
        """Play one episode, train after every step and record statistics.

        Appends one entry to ``episode_score``, ``episode_height``,
        ``episode_loss`` and ``episode_qs``, redraws the plot and prints a
        short report.
        """
        done = False
        reward_total = 0
        loss_total = 0
        q_sums = [0] * self.numRewards
        # Starts at 1 so the Q average below never divides by zero.
        q_steps = 1
        state = self.env.reset()
        # Height is not tracked by this runner; the constant keeps
        # episode_height the same length as the other statistics.
        max_height = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            # np.all/np.equal accept a scalar as well as a vector, so a
            # vector of Q-values no longer trips the ambiguous-truth error.
            if not np.all(np.equal(qs, self._NO_Q_VALUE)):
                # Explicit element-wise sum: ``list += values`` would extend
                # the list whenever ``qs`` is a plain Python sequence.
                q_sums = np.add(q_sums, qs)
                q_steps += 1

            next_state, reward, done, _ = self.env.step_all(action)
            reward_total = np.add(reward_total, sum(reward))
            # The two reward components are stored as one scalar.
            self.agent.addMemory(
                (state, action, reward[0] + reward[1], next_state, done))
            state = next_state
            loss_total += self.agent.trainDQN()

        # Copy the training weights into the target network on every
        # ``agent.sync``-th episode.
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, reward_total, loss_total,
                     self.current_episode))
        self.episode_score.append(reward_total)
        self.episode_height.append(max_height)
        self.episode_loss.append(loss_total)
        self.episode_qs.append([q_sum / q_steps for q_sum in q_sums])
        self.plot()
        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1], self.episode_height[-1],
            self.episode_loss[-1], self.episode_qs[-1]))

    @staticmethod
    def _group_scores(values):
        """Chunk ``values`` into runs of ``GROUP_NUM`` entries.

        The last chunk is padded with its own mean so every chunk has the
        same length and the result can be reduced along axis 1.

        Returns:
            ``(x_groups, groups)``: the starting episode index of each chunk
            and the list of equally sized chunks. ``values`` must not be
            empty.
        """
        values = np.asarray(values)
        n_groups = (len(values) + GROUP_NUM - 1) // GROUP_NUM
        groups = [
            values[k * GROUP_NUM:(k + 1) * GROUP_NUM] for k in range(n_groups)
        ]
        tail = groups[-1]
        groups[-1] = np.append(tail,
                               [np.mean(tail)] * (GROUP_NUM - len(tail)))
        return [k * GROUP_NUM for k in range(n_groups)], groups

    @staticmethod
    def _mean_std_curves(x_groups, groups, xs):
        """Evaluate cubic splines of the per-chunk mean and std at ``xs``."""
        mean_spline = interp1d(x_groups,
                               np.mean(groups, 1),
                               kind='cubic',
                               fill_value="extrapolate")
        std_spline = interp1d(x_groups,
                              np.std(groups, 1),
                              kind='cubic',
                              fill_value="extrapolate")
        return mean_spline(xs), std_spline(xs)

    def plot(self):
        """Redraw the training-score curve with a +/- one std band.

        Does nothing until at least one episode has been recorded; the curve
        itself is only drawn once there are more than five score groups.
        """
        if not self.episode_score:
            return

        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)
        x_groups, groups = self._group_scores(self.episode_score)

        self.ax.clear()
        if len(x_groups) > 5:
            mean_curve, std_curve = self._mean_std_curves(
                x_groups, groups, spline_x)
            self.ax.plot(spline_x, mean_curve, lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 mean_curve - std_curve,
                                 mean_curve + std_curve,
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)
        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')

        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
class DeepSeaTreasureBaselineDQN(object):
    """Train a ``DQNAgent`` on ``DeepSeaTreasure`` with a 2-value state.

    The environment is created with ``graphical_state=False`` and every
    observation is reshaped to ``(1, 2)`` before it reaches the agent.
    Per-episode statistics are kept in the ``episode_*`` lists, one entry
    per finished episode.

    Args:
        episodes: Number of episodes ``train`` runs.
    """

    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.fig, self.ax = plt.subplots(figsize=(5, 4))
        self.fig.canvas.draw()
        plt.show(block=False)
        self.env = DeepSeaTreasure(width=5,
                                   speed=1000,
                                   graphical_state=False,
                                   render=True,
                                   is_debug=True)
        self.agent = DQNAgent(stateShape=(2, ),
                              actionSpace=self.env.get_action_space(),
                              numPicks=64,
                              memorySize=2000)

    def train(self):
        """Run every episode (replotting after each), then block on the figure.

        The environment is closed even when an episode or a redraw raises.
        """
        try:
            for _ in range(self.episodes):
                self.episode()
                self.plot()
                self.current_episode += 1
            plt.show(block=True)
        finally:
            self.env.close()

    def episode(self):
        """Play one episode, train after every step and record statistics.

        Appends one entry to ``episode_score``, ``episode_qs``,
        ``episode_height`` and ``episode_loss``.
        """
        done = False
        reward_total = 0
        q_total = 0
        # Starts at 1 so the Q average below never divides by zero.
        q_steps = 1
        loss_total = 0
        state = self.env.reset().reshape(1, 2)
        # Height is not tracked; the constant keeps episode_height the same
        # length as the other statistics.
        max_height = -10000

        while not done:
            action, q = self.agent.selectAction(state)
            # -100000 is returned in place of a Q-value; such steps are left
            # out of the average.
            # NOTE(review): presumably marks a random action -- confirm
            # against DQNAgent.
            if q != -100000:
                q_total += q
                q_steps += 1

            obs, reward, done, _ = self.env.step_all(action)
            # Collapse the two reward components into one scalar.
            reward = reward[0] + reward[1]
            next_state = obs.reshape(1, 2)
            reward_total = np.add(reward_total, reward)

            # NOTE(review): training runs before the newest transition is
            # stored, unlike the other runners in this file -- confirm this
            # ordering is intended.
            loss = self.agent.trainDQN()
            self.agent.addMemory((state, action, reward, next_state, done))
            state = next_state
            loss_total += loss

        self.agent.terminal()
        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, reward_total, loss_total,
                     self.current_episode))
        self.episode_score.append(reward_total)
        self.episode_qs.append(q_total / q_steps)
        self.episode_height.append(max_height)
        self.episode_loss.append(loss_total)

    @staticmethod
    def _group_scores(values):
        """Chunk ``values`` into runs of ``GROUP_NUM`` entries.

        The last chunk is padded by repeating its final value so every chunk
        has the same length and the result can be reduced along axis 1.

        Returns:
            ``(x_groups, groups)``: the starting episode index of each chunk
            and the list of equally sized chunks. ``values`` must not be
            empty.
        """
        values = np.asarray(values)
        n_groups = (len(values) + GROUP_NUM - 1) // GROUP_NUM
        groups = [
            values[k * GROUP_NUM:(k + 1) * GROUP_NUM] for k in range(n_groups)
        ]
        tail = groups[-1]
        groups[-1] = np.append(tail, [tail[-1]] * (GROUP_NUM - len(tail)))
        return [k * GROUP_NUM for k in range(n_groups)], groups

    @staticmethod
    def _mean_std_curves(x_groups, groups, xs):
        """Evaluate cubic splines of the per-chunk mean and std at ``xs``."""
        mean_spline = interp1d(x_groups,
                               np.mean(groups, 1),
                               kind='cubic',
                               fill_value="extrapolate")
        std_spline = interp1d(x_groups,
                              np.std(groups, 1),
                              kind='cubic',
                              fill_value="extrapolate")
        return mean_spline(xs), std_spline(xs)

    def plot(self):
        """Redraw the training-score curve with a +/- one std band.

        Does nothing until at least one episode has been recorded; the curve
        itself is only drawn once there are more than five score groups.
        """
        if not self.episode_score:
            return

        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)
        x_groups, groups = self._group_scores(self.episode_score)

        self.ax.clear()
        if len(x_groups) > 5:
            mean_curve, std_curve = self._mean_std_curves(
                x_groups, groups, spline_x)
            self.ax.plot(spline_x, mean_curve, lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 mean_curve - std_curve,
                                 mean_curve + std_curve,
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)
        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')

        plt.show(block=False)
        plt.pause(.001)
class DeepSeaGraphicalWAgent(object):
    """Train a multi-policy ``DQNAgent`` on the image-based ``DeepSeaTreasure``.

    Each step the agent reports which policy chose the action together with
    per-policy Q and W values; these are averaged per episode and the policy
    counts are plotted next to the training score. Per-episode statistics
    are kept in the ``episode_*`` lists, one entry per finished episode.

    Args:
        episodes: Number of episodes ``train`` runs.
    """

    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)
        self.numRewards = 2
        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False)
        self.agent = DQNAgent(stateShape=(64, 64, 1),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        """Run every episode, then block on the figure window.

        The environment is closed even when an episode raises, so a failed
        run does not leave it open.
        """
        try:
            for _ in range(self.episodes):
                self.episode()
                self.current_episode += 1
            plt.show(block=True)
        finally:
            self.env.close()

    def episode(self):
        """Play one episode, train after every step and record statistics.

        Appends one entry to ``episode_score``, ``episode_height``,
        ``episode_loss``, ``episode_policies``, ``episode_qs`` and
        ``episode_ws``, redraws the plots and prints a short report.
        """
        done = False
        reward_total = 0
        loss_sums = [0] * self.numRewards
        policy_counts = [0] * self.numRewards
        q_sums = [0] * self.numRewards
        w_sums = [0] * self.numRewards
        # Starts at 1 so the Q/W averages below never divide by zero.
        greedy_steps = 1
        state = self.process_state(self.env.reset())
        # Height is not tracked by this runner; the constant keeps
        # episode_height the same length as the other statistics.
        max_height = -1

        while not done:
            action, policy, qs, ws, was_random = self.agent.selectAction(state)
            # Only non-random actions count towards the policy/Q/W statistics.
            if not was_random:
                policy_counts[policy] += 1
                q_sums = [total + q for total, q in zip(q_sums, qs)]
                w_sums = [total + w for total, w in zip(w_sums, ws)]
                greedy_steps += 1

            obs, reward, done, _ = self.env.step_all(action)
            # NOTE(review): the next state is the current state minus the new
            # frame, so differences accumulate across steps -- confirm this
            # is intended rather than a frame-to-frame difference.
            next_state = state - self.process_state(obs)
            reward_total = np.add(reward_total, sum(reward))
            self.agent.addMemory(
                (state, action, policy, reward, next_state, done))
            state = next_state

            loss = self.agent.trainDQN()
            # Only the first entry of each per-policy loss is accumulated.
            loss_sums = [
                total + entry[0] for total, entry in zip(loss_sums, loss)
            ]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, reward_total, loss_sums,
                     self.current_episode))
        self.episode_score.append(reward_total)
        self.episode_height.append(max_height)
        self.episode_loss.append(loss_sums)
        self.episode_policies.append(policy_counts)
        self.episode_qs.append([q_sum / greedy_steps for q_sum in q_sums])
        self.episode_ws.append([w_sum / greedy_steps for w_sum in w_sums])
        self.plot()
        print(
            "Report: \nrewardSum:{}\nheight:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}"
            .format(self.episode_score[-1], self.episode_height[-1],
                    self.episode_loss[-1], self.episode_policies[-1],
                    self.episode_qs[-1], self.episode_ws[-1]))

    def process_state(self, state):
        """Resize a frame to 64x64, convert it to grayscale, add a channel axis.

        Returns:
            Array of shape ``(64, 64, 1)``, matching the agent's
            ``stateShape``.
        """
        resized = cv2.resize(state.astype('float32'), (64, 64),
                             interpolation=cv2.INTER_AREA)
        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
        return np.expand_dims(gray, 2)

    @staticmethod
    def _group_series(values):
        """Chunk ``values`` into runs of ``GROUP_NUM`` entries.

        The last chunk is padded with its own mean so every chunk has the
        same length and the result can be reduced along axis 1.

        Returns:
            ``(x_groups, groups)``: the starting episode index of each chunk
            and the list of equally sized chunks. ``values`` must not be
            empty.
        """
        values = np.asarray(values)
        n_groups = (len(values) + GROUP_NUM - 1) // GROUP_NUM
        groups = [
            values[k * GROUP_NUM:(k + 1) * GROUP_NUM] for k in range(n_groups)
        ]
        tail = groups[-1]
        groups[-1] = np.append(tail,
                               [np.mean(tail)] * (GROUP_NUM - len(tail)))
        return [k * GROUP_NUM for k in range(n_groups)], groups

    @staticmethod
    def _mean_std_curves(x_groups, groups, xs):
        """Evaluate cubic splines of the per-chunk mean and std at ``xs``."""
        mean_spline = interp1d(x_groups,
                               np.mean(groups, 1),
                               kind='cubic',
                               fill_value="extrapolate")
        std_spline = interp1d(x_groups,
                              np.std(groups, 1),
                              kind='cubic',
                              fill_value="extrapolate")
        return mean_spline(xs), std_spline(xs)

    def plot(self):
        """Redraw the score panel (left) and the policy-choice panel (right).

        Does nothing until at least one episode has been recorded; curves
        are only drawn once there are more than five score groups.
        """
        if not self.episode_score:
            return

        score_ax, policy_ax = self.ax[0], self.ax[1]
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)
        x_groups, groups = self._group_series(self.episode_score)
        enough_points = len(x_groups) > 5

        score_ax.clear()
        if enough_points:
            mean_curve, std_curve = self._mean_std_curves(
                x_groups, groups, spline_x)
            score_ax.plot(spline_x, mean_curve, lw=0.7, c="blue")
            score_ax.fill_between(spline_x,
                                  mean_curve - std_curve,
                                  mean_curve + std_curve,
                                  alpha=0.5,
                                  facecolor="red",
                                  interpolate=True)
        score_ax.title.set_text('Training Score')
        score_ax.set_xlabel('Episode')
        score_ax.set_ylabel('Score')

        # One row per policy, one column per episode.
        per_policy = np.transpose(self.episode_policies)
        # Twice as many colours as policies: lines take them from the front,
        # bands from the back.
        colors = pl.cm.jet(np.linspace(0, 1, len(per_policy) * 2))
        policy_ax.clear()
        policy_ax.title.set_text('Policy Choices')
        if enough_points:
            for index, counts in enumerate(per_policy):
                policy_x, policy_groups = self._group_series(counts)
                mean_curve, std_curve = self._mean_std_curves(
                    policy_x, policy_groups, spline_x)
                policy_ax.plot(spline_x,
                               mean_curve,
                               lw=0.7,
                               c=colors[index],
                               label="{} policy".format(PolEnum(index).name))
                policy_ax.fill_between(spline_x,
                                       mean_curve - std_curve,
                                       mean_curve + std_curve,
                                       alpha=0.5,
                                       facecolor=colors[-1 - index],
                                       interpolate=True)
            # Only build a legend once labelled curves exist.
            policy_ax.legend()

        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)