# Third-party imports used throughout this module. The project-local names
# (DeepSeaTreasure, DQNAgent, PolEnum, GROUP_NUM) are expected to be provided
# by the surrounding repository; their import paths are not reproduced here.
import gc
from copy import deepcopy

import cv2
import numpy as np
import psutil
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
from scipy.interpolate import interp1d


class DeepSeaTreasureBaselineDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.fig, self.ax = plt.subplots(figsize=(5, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = DeepSeaTreasure(width=5,
                                   speed=1000,
                                   graphical_state=False,
                                   render=True,
                                   is_debug=True)
        self.agent = DQNAgent(stateShape=(2, ),
                              actionSpace=self.env.get_action_space(),
                              numPicks=64,
                              memorySize=2000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1
        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = self.env.reset().reshape(1, 2)
        maxHeight = -10000

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, _ = self.env.step_all(action)
            # env.render()
            reward = reward[0] + reward[1]
            '''
            maxHeight = max(obs[0], maxHeight)
            if obs[0] >= 0.5:
                reward += 10
            '''
            nextState = obs.reshape(1, 2)
            rewardsSum = np.add(rewardsSum, reward)

            loss = self.agent.trainDQN()
            self.agent.addMemory((state, action, reward, nextState, done))
            state = nextState
            lossSum += loss

        self.agent.terminal()
        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [ep_groups[-1][-1]] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')
        plt.show(block=False)
        plt.pause(.001)
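
# Illustrative entry point (not part of the original module): shows how the
# baseline trainer above is typically driven. The function name and the
# default episode count are placeholders.
def run_baseline_dqn(episodes=200):
    trainer = DeepSeaTreasureBaselineDQN(episodes)
    trainer.train()
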
class MultiObjectiveDeepSeaW(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=1e8,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=True,
                                   frame_stack=2)

    def train(self):
        # First run: Adam optimizer.
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards,
                              optim=keras.optimizers.Adam)
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        # Save the Adam run's scores, then reset the per-episode logs so the
        # RMSprop run is recorded separately.
        self.adam_scores = deepcopy(self.episode_score)
        self.episode_score = []
        self.episode_loss = []
        self.episode_policies = []
        self.episode_qs = []
        self.episode_ws = []
        self.current_episode = 0

        # Second run: RMSprop optimizer.
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards,
                              optim=keras.optimizers.RMSprop)
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.rms_scores = deepcopy(self.episode_score)
        self.plot_compare()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSums = [0] * (self.numRewards)
        policies = [0] * (self.numRewards)
        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            if policy != -1:
                policies[policy] += 1
            if not random:
                qSums[policy] += qs
                wSums = [wSums[i] + ws[i] for i in range(len(wSums))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)
            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(state, action, policy, reward, nextState,
                                 done)
            loss = self.agent.trainDQN()
            state = nextState
            lossSums = [lossSums[i] + loss[i] for i in range(len(lossSums))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])
        print(
            "Report: \nrewardSum:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}".
            format(self.episode_score[-1], self.episode_loss[-1],
                   self.episode_policies[-1], self.episode_qs[-1],
                   self.episode_ws[-1]))
        print("memory len:" + str(len(self.agent.replayMemory[0])))
        print("memory used:" + str(psutil.virtual_memory().used // 1e6))
        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="orange")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="green",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the final group to GROUP_NUM entries.
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] *
                    (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()
        self.fig.canvas.draw()
        plt.savefig("dst_w_pddqn_{}.png".format(self.current_episode))

    def plot_compare(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_adam_scores = np.array(self.adam_scores)
        ep_rms_scores = np.array(self.rms_scores)
        ep_adam_groups = [
            ep_adam_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_adam_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        ep_rms_groups = [
            ep_rms_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_rms_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final groups to GROUP_NUM entries.
        ep_adam_groups[-1] = np.append(
            ep_adam_groups[-1],
            [np.mean(ep_adam_groups[-1])] *
            (GROUP_NUM - len(ep_adam_groups[-1])))
        ep_rms_groups[-1] = np.append(
            ep_rms_groups[-1],
            [np.mean(ep_rms_groups[-1])] *
            (GROUP_NUM - len(ep_rms_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_adam_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_adam_avgs = np.mean(ep_adam_groups, 1)
            ep_rms_avgs = np.mean(ep_rms_groups, 1)
            avg_adam_spl = interp1d(x_groups,
                                    ep_adam_avgs,
                                    kind='cubic',
                                    fill_value="extrapolate")
            avg_rms_spl = interp1d(x_groups,
                                   ep_rms_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
            ep_adam_std = np.std(ep_adam_groups, 1)
            ep_rms_std = np.std(ep_rms_groups, 1)
            std_adam_spl = interp1d(x_groups,
                                    ep_adam_std,
                                    kind='cubic',
                                    fill_value="extrapolate")
            std_rms_spl = interp1d(x_groups,
                                   ep_rms_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
            self.ax[0].plot(spline_x,
                            avg_adam_spl(spline_x),
                            lw=0.7,
                            c="blue",
                            label="Adam")
            self.ax[0].fill_between(
                spline_x,
                avg_adam_spl(spline_x) - std_adam_spl(spline_x),
                avg_adam_spl(spline_x) + std_adam_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True)
            self.ax[0].plot(spline_x,
                            avg_rms_spl(spline_x),
                            lw=0.7,
                            c="orange",
                            label="RMSProp")
            self.ax[0].fill_between(
                spline_x,
                avg_rms_spl(spline_x) - std_rms_spl(spline_x),
                avg_rms_spl(spline_x) + std_rms_spl(spline_x),
                alpha=0.5,
                facecolor="green",
                interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')
        self.ax[0].legend()
        plt.show(block=True)
class DeepSeaTreasureGraphicalDDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False,
                                   frame_stack=2)
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSum = 0
        qSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()
        maxHeight = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            if qs != -100000:
                qSums += qs
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)
            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, (reward[0] + reward[1]), nextState, done))
            state = nextState
            loss = self.agent.trainDQN()
            lossSum += loss

        # Sync the target network every `sync` episodes.
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.plot()

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1], self.episode_height[-1],
            self.episode_loss[-1], self.episode_qs[-1]))

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')
        '''
        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the final group to GROUP_NUM entries.
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups, ep_avgs, kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups, ep_std, kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x, avg_spl(spline_x), lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()
        '''
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
class DeepSeaTreasureGraphicalPDDQN(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(figsize=(6, 4))

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000000,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=True,
                                   frame_stack=2,
                                   reshape_reward_weights=[[1, 1]],
                                   seed=1234)
        self.agent = DQNAgent(
            stateShape=(84, 84, 2),
            actionSpace=self.env.get_action_space(),
            numPicks=32,
            memorySize=10000,
            numRewards=self.numRewards,
        )
        self.agent.load()

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        self.plot()
        self.agent.save()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSum = 0
        qSums = 0
        actions = 1

        state = self.env.reset()
        maxHeight = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            if qs != -100000:
                qSums += qs
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)
            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(state, action, (reward[0] + reward[1]),
                                 nextState, done)
            state = nextState
            loss = self.agent.trainDQN()
            lossSum += loss

        # Sync the target network every `sync` episodes.
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append(qSums / actions)

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1],
            self.episode_height[-1],
            self.episode_loss[-1],
            self.episode_qs[-1],
        ))

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")
        self.fig.canvas.draw()
        plt.savefig("dst_pddqn_retrain.png")
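
# Illustrative entry point (not part of the original module): the PDDQN
# retrain trainer above calls self.agent.load() in __init__, so it assumes a
# previously saved checkpoint is available. The function name and default
# episode count are placeholders.
def run_pddqn_retrain(episodes=200):
    trainer = DeepSeaTreasureGraphicalPDDQN(episodes)
    trainer.train()  # saves the score plot and the retrained agent at the end
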
class DeepSeaGraphicalWAgent(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False)
        self.agent = DQNAgent(stateShape=(64, 64, 1),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSums = [0] * (self.numRewards)
        policies = [0] * (self.numRewards)
        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.process_state(self.env.reset())
        maxHeight = -1

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            if not random:
                policies[policy] += 1
                qSums = [qSums[i] + qs[i] for i in range(len(policies))]
                wSums = [wSums[i] + ws[i] for i in range(len(policies))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)
            nextState = state - self.process_state(obs)
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, policy, reward, nextState, done))
            state = nextState
            loss = self.agent.trainDQN()
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(policies))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])
        self.plot()

        print(
            "Report: \nrewardSum:{}\nheight:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}"
            .format(self.episode_score[-1], self.episode_height[-1],
                    self.episode_loss[-1], self.episode_policies[-1],
                    self.episode_qs[-1], self.episode_ws[-1]))

    def process_state(self, state):
        # Downscale to 64x64, convert to grayscale, and add a channel axis.
        state = cv2.resize(state.astype('float32'), (64, 64),
                           interpolation=cv2.INTER_AREA)
        state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
        return np.expand_dims(state, 2)

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the final group to GROUP_NUM entries.
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] *
                    (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
class MultiObjectiveDeepSeaW(object):
    # NOTE: if this class lives in the same module as the optimizer-comparison
    # MultiObjectiveDeepSeaW defined earlier, this later definition shadows it.
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []
        self.fig, self.ax = plt.subplots(figsize=(6, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=50,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=True,
                                   frame_stack=2,
                                   reshape_reward_weights=[[1, 1]],
                                   seed=1234)
        self.agent = DQNAgent(
            stateShape=(84, 84, 2),
            actionSpace=self.env.get_action_space(),
            numPicks=32,
            memorySize=10000,
            numRewards=self.numRewards,
        )
        self.agent.load()

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        self.plot()

    def episode(self):
        done = False
        rewardsSum = 0
        policies = [0] * (self.numRewards + 1)
        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            policies[policy] += 1
            if not random:
                qSums[policy] += qs
                wSums = [wSums[i] + ws[i] for i in range(len(wSums))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)
            nextState = obs
            rewardsSum = np.add(rewardsSum, reward[1])

            self.agent.addMemory(state, action, policy, reward, nextState,
                                 done)
            state = nextState

        print("now epsilon is {}, the reward is {} in episode {}".format(
            self.agent.epsilon, rewardsSum, self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])

        print("Report: \nrewardSum:{}\npolicies:{}\nqAverage:{}\nws:{}".format(
            self.episode_score[-1],
            self.episode_policies[-1],
            self.episode_qs[-1],
            self.episode_ws[-1],
        ))
        print("memory len:" + str(len(self.agent.replayMemory[0])))
        print("memory used:" + str(psutil.virtual_memory().used // 1e6))
        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final group to GROUP_NUM entries so np.mean/np.std see a
        # rectangular array.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        # Reference lines for the five Pareto-optimal time costs.
        self.ax.plot(spline_x, [-3] * len(spline_x), lw=0.7, c="blue",
                     label="Pareto Front 1")
        self.ax.plot(spline_x, [-5] * len(spline_x), lw=0.7, c="green",
                     label="Pareto Front 2")
        self.ax.plot(spline_x, [-7] * len(spline_x), lw=0.7, c="cyan",
                     label="Pareto Front 3")
        self.ax.plot(spline_x, [-9] * len(spline_x), lw=0.7, c="orange",
                     label="Pareto Front 4")
        self.ax.plot(spline_x, [-11] * len(spline_x), lw=0.7, c="purple",
                     label="Pareto Front 5")

        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax.plot(spline_x,
                         avg_spl(spline_x),
                         lw=0.7,
                         c="red",
                         label="W-DQN Time Policy")
            self.ax.fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax.legend(loc="lower left")
        plt.legend()
        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")

        # Leftover from the two-panel policy plot; currently unused here.
        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.fig.canvas.draw()
        plt.savefig("dst_w_pddqn_retrain_081.png")

    def plot_compare(self):
        # Expects self.adam_scores / self.rms_scores to have been populated
        # (see the optimizer-comparison trainer earlier in this module).
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_adam_scores = np.array(self.adam_scores)
        ep_rms_scores = np.array(self.rms_scores)
        ep_adam_groups = [
            ep_adam_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_adam_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        ep_rms_groups = [
            ep_rms_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_rms_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the final groups to GROUP_NUM entries.
        ep_adam_groups[-1] = np.append(
            ep_adam_groups[-1],
            [np.mean(ep_adam_groups[-1])] *
            (GROUP_NUM - len(ep_adam_groups[-1])),
        )
        ep_rms_groups[-1] = np.append(
            ep_rms_groups[-1],
            [np.mean(ep_rms_groups[-1])] *
            (GROUP_NUM - len(ep_rms_groups[-1])),
        )
        x_groups = [i * GROUP_NUM for i in range(len(ep_adam_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_adam_avgs = np.mean(ep_adam_groups, 1)
            ep_rms_avgs = np.mean(ep_rms_groups, 1)
            avg_adam_spl = interp1d(x_groups,
                                    ep_adam_avgs,
                                    kind="cubic",
                                    fill_value="extrapolate")
            avg_rms_spl = interp1d(x_groups,
                                   ep_rms_avgs,
                                   kind="cubic",
                                   fill_value="extrapolate")
            ep_adam_std = np.std(ep_adam_groups, 1)
            ep_rms_std = np.std(ep_rms_groups, 1)
            std_adam_spl = interp1d(x_groups,
                                    ep_adam_std,
                                    kind="cubic",
                                    fill_value="extrapolate")
            std_rms_spl = interp1d(x_groups,
                                   ep_rms_std,
                                   kind="cubic",
                                   fill_value="extrapolate")
            self.ax.plot(spline_x,
                         avg_adam_spl(spline_x),
                         lw=0.7,
                         c="blue",
                         label="Adam")
            self.ax.fill_between(
                spline_x,
                avg_adam_spl(spline_x) - std_adam_spl(spline_x),
                avg_adam_spl(spline_x) + std_adam_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )
            self.ax.plot(spline_x,
                         avg_rms_spl(spline_x),
                         lw=0.7,
                         c="orange",
                         label="RMSProp")
            self.ax.fill_between(
                spline_x,
                avg_rms_spl(spline_x) - std_rms_spl(spline_x),
                avg_rms_spl(spline_x) + std_rms_spl(spline_x),
                alpha=0.5,
                facecolor="green",
                interpolate=True,
            )

        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")
        self.ax.legend()
        plt.show(block=True)
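
# Illustrative entry point (not part of the original module): runs the
# multi-objective W-learning retrain trainer defined last above. The episode
# count is a placeholder; adjust it to match the experiment being reproduced.
if __name__ == "__main__":
    MultiObjectiveDeepSeaW(episodes=500).train()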