# Shared imports for the examples below (a best-effort reconstruction; the
# original snippets omit them). DeepSeaTreasure, DQNAgent, PolEnum and the
# grouping constant GROUP_NUM come from the surrounding project modules.
from copy import deepcopy
import gc

import cv2
import matplotlib.pylab as pl
import matplotlib.pyplot as plt
import numpy as np
import psutil
import tensorflow as tf
from scipy.interpolate import interp1d
from tensorflow import keras


# Example 1
class DeepSeaTreasureBaselineDQN(object):
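    """Baseline DQN on DeepSeaTreasure with a low-dimensional (2,) state.

    The two reward components are summed into one scalar (see episode()),
    so this baseline treats the task as single-objective.
    """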
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(figsize=(5, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = DeepSeaTreasure(width=5,
                                   speed=1000,
                                   graphical_state=False,
                                   render=True,
                                   is_debug=True)
        self.agent = DQNAgent(stateShape=(2, ),
                              actionSpace=self.env.get_action_space(),
                              numPicks=64,
                              memorySize=2000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1
        plt.show(block=True)

        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = self.env.reset().reshape(1, 2)
        maxHeight = -10000

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:  # skip the agent's sentinel for random (exploratory) actions
                qSum += q
                qActions += 1

            obs, reward, done, _ = self.env.step_all(action)
            # env.render()

            reward = reward[0] + reward[1]  # scalarize the two objectives by summing
            '''
            maxHeight = max(obs[0], maxHeight)
            if obs[0] >= 0.5:
                reward += 10
            '''
            nextState = obs.reshape(1, 2)
            rewardsSum = np.add(rewardsSum, reward)

            # store the transition before the learning step so the newest
            # experience can be sampled (order matches the other examples)
            self.agent.addMemory((state, action, reward, nextState, done))
            loss = self.agent.trainDQN()
            state = nextState
            lossSum += loss

        self.agent.terminal()
        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)

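    # Plotting approach shared by all the examples below: scores are bucketed
    # into GROUP_NUM-episode groups, and a cubic spline through the group
    # means (with a +/- one-standard-deviation band) gives a smoothed curve.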
    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [ep_groups[-1][-1]] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')

        plt.show(block=False)
        plt.pause(.001)
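

# A minimal launch sketch (assumed entry point -- the original examples do not
# show one; the episode budget is illustrative):
if __name__ == "__main__":
    baseline = DeepSeaTreasureBaselineDQN(episodes=500)
    baseline.train()


# Example 2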
class MultiObjectiveDeepSeaW(object):
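    """W-learning multi-objective DQN on the graphical DeepSeaTreasure.

    train() runs the episode budget twice, once with Adam and once with
    RMSprop, then plot_compare() overlays the two smoothed score curves.
    """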
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=1e8,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=True,
                                   frame_stack=2)

    def train(self):
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards,
                              optim=keras.optimizers.Adam)

        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        # save the Adam run's scores, then reset so the RMSprop run is
        # recorded separately
        self.adam_scores = deepcopy(self.episode_score)
        self.episode_score = []

        self.current_episode = 0
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards,
                              optim=keras.optimizers.RMSprop)
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.rms_scores = deepcopy(self.episode_score)

        self.plot_compare()

    def episode(self):
        done = False
        rewardsSum = 0
        lossSums = [0] * (self.numRewards)
        policies = [0] * (self.numRewards)

        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            if policy != -1:
                policies[policy] += 1
            if not random:
                qSums[policy] += qs
                wSums = [wSums[i] + ws[i] for i in range(len(wSums))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(state, action, policy, reward, nextState,
                                 done)
            loss = self.agent.trainDQN()
            state = nextState
            lossSums = [lossSums[i] + loss[i] for i in range(len(lossSums))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])

        print(
            "Report: \nrewardSum:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}".
            format(self.episode_score[-1], self.episode_loss[-1],
                   self.episode_policies[-1], self.episode_qs[-1],
                   self.episode_ws[-1]))
        print("memory len:" + str(len(self.agent.replayMemory[0])))
        print("memory used:" + str(psutil.virtual_memory().used // 1e6))
        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="orange")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="green",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the last group to GROUP_NUM entries so np.mean/np.std
                # below see a rectangular array
                ep_groups[-1] = np.append(ep_groups[-1],
                                          [np.mean(ep_groups[-1])] *
                                          (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.savefig("dst_w_pddqn_{}.png".format(self.current_episode))

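    # Overlays the smoothed Adam and RMSprop score curves from the two runs,
    # using the same grouping-and-spline smoothing as plot().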
    def plot_compare(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_adam_scores = np.array(self.adam_scores)
        ep_rms_scores = np.array(self.rms_scores)
        ep_adam_groups = [
            ep_adam_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_adam_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        ep_rms_groups = [
            ep_rms_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_rms_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last groups to GROUP_NUM entries so np.mean/np.std below
        # see rectangular arrays
        ep_adam_groups[-1] = np.append(ep_adam_groups[-1],
                                       [np.mean(ep_adam_groups[-1])] *
                                       (GROUP_NUM - len(ep_adam_groups[-1])))
        ep_rms_groups[-1] = np.append(ep_rms_groups[-1],
                                      [np.mean(ep_rms_groups[-1])] *
                                      (GROUP_NUM - len(ep_rms_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_adam_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_adam_avgs = np.mean(ep_adam_groups, 1)
            ep_rms_avgs = np.mean(ep_rms_groups, 1)
            avg_adam_spl = interp1d(x_groups,
                                    ep_adam_avgs,
                                    kind='cubic',
                                    fill_value="extrapolate")
            avg_rms_spl = interp1d(x_groups,
                                   ep_rms_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
            ep_adam_std = np.std(ep_adam_groups, 1)
            ep_rms_std = np.std(ep_rms_groups, 1)
            std_adam_spl = interp1d(x_groups,
                                    ep_adam_std,
                                    kind='cubic',
                                    fill_value="extrapolate")
            std_rms_spl = interp1d(x_groups,
                                   ep_rms_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
            self.ax[0].plot(spline_x,
                            avg_adam_spl(spline_x),
                            lw=0.7,
                            c="blue",
                            label="Adam")
            self.ax[0].fill_between(
                spline_x,
                avg_adam_spl(spline_x) - std_adam_spl(spline_x),
                avg_adam_spl(spline_x) + std_adam_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True)

            self.ax[0].plot(spline_x,
                            avg_rms_spl(spline_x),
                            lw=0.7,
                            c="orange",
                            label="RMSProp")
            self.ax[0].fill_between(
                spline_x,
                avg_rms_spl(spline_x) - std_rms_spl(spline_x),
                avg_rms_spl(spline_x) + std_rms_spl(spline_x),
                alpha=0.5,
                facecolor="green",
                interpolate=True)
        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')
        self.ax[0].legend()
        plt.show(block=True)


# Example 3
class DeepSeaTreasureGraphicalDDQN(object):
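    """Double DQN on the graphical environment with a scalarized reward.

    The target network is refreshed from the online network every
    agent.sync episodes (see the end of episode()).
    """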
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False,
                                   frame_stack=2)
        self.agent = DQNAgent(stateShape=(84, 84, 2),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0

        lossSum = 0
        qSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()
        maxHeight = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            if qs != -100000:  # skip the agent's sentinel for random actions
                # accumulate the per-objective Q-values elementwise;
                # `qSums += qs` would concatenate the lists instead
                qSums = [qSums[i] + qs[i] for i in range(self.numRewards)]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, (reward[0] + reward[1]), nextState, done))
            state = nextState

            loss = self.agent.trainDQN()
            lossSum += loss

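        # Periodic target-network refresh: copy the online network's weights
        # into the target network every `sync` episodes.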
        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.plot()

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1], self.episode_height[-1],
            self.episode_loss[-1], self.episode_qs[-1]))

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(spline_x,
                                 avg_spl(spline_x) - std_spl(spline_x),
                                 avg_spl(spline_x) + std_spl(spline_x),
                                 alpha=0.5,
                                 facecolor="red",
                                 interpolate=True)

        self.ax.title.set_text('Training Score')
        self.ax.set_xlabel('Episode')
        self.ax.set_ylabel('Score')
        '''
        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies)*2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [policy[i * GROUP_NUM:(i + 1) * GROUP_NUM] for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)]
                # Pad for weird numpy error for now
                ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i*GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups, ep_avgs, kind='cubic', fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups, ep_std, kind='cubic', fill_value="extrapolate")
                self.ax[1].plot(spline_x, avg_spl(spline_x), lw=0.7, c=colors[i], label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x, avg_spl(spline_x)-std_spl(spline_x), avg_spl(spline_x)+std_spl(spline_x), alpha=0.5, facecolor=colors[-1-i], interpolate=True)

        self.ax[1].legend()
        '''
        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)

# Example 4
class DeepSeaTreasureGraphicalPDDQN(object):
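    """Graphical PDDQN trainer that resumes from a saved agent.

    load() restores trained weights before training, save() checkpoints them
    afterwards, and the final curve is written to dst_pddqn_retrain.png.
    """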
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(figsize=(6, 4))

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000000,
                                   graphical_state=True,
                                   render=False,
                                   is_debug=True,
                                   frame_stack=2,
                                   reshape_reward_weights=[[1, 1]],
                                   seed=1234)
        self.agent = DQNAgent(
            stateShape=(84, 84, 2),
            actionSpace=self.env.get_action_space(),
            numPicks=32,
            memorySize=10000,
            numRewards=self.numRewards,
        )
        self.agent.load()

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.plot()
        self.agent.save()

    def episode(self):
        done = False
        rewardsSum = 0

        lossSum = 0
        qSums = 0
        actions = 1

        state = self.env.reset()
        maxHeight = -1

        while not done:
            action, qs = self.agent.selectAction(state)
            if qs != -100000:
                qSums += qs
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(state, action, (reward[0] + reward[1]),
                                 nextState, done)
            state = nextState

            loss = self.agent.trainDQN()
            lossSum += loss

        if self.current_episode % self.agent.sync == 0:
            self.agent.targetNetwork.set_weights(
                self.agent.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSum)
        self.episode_qs.append(qSums / actions)

        print("Report: \nrewardSum:{}\nheight:{}\nloss:{}\nqAverage:{}".format(
            self.episode_score[-1],
            self.episode_height[-1],
            self.episode_loss[-1],
            self.episode_qs[-1],
        ))

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax.plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax.fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")
        self.fig.canvas.draw()
        plt.savefig("dst_pddqn_retrain.png")
class DeepSeaGraphicalWAgent(object):
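    """W-learning agent on graphical DST with manual frame preprocessing.

    Frames are downscaled to 64x64 grayscale in process_state(), and the
    agent's state is the difference between consecutive processed frames.
    """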
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(10, 4))
        self.fig.canvas.draw()
        plt.show(block=False)

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=10000,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=False)
        self.agent = DQNAgent(stateShape=(64, 64, 1),
                              actionSpace=self.env.get_action_space(),
                              numPicks=32,
                              memorySize=10000,
                              numRewards=self.numRewards)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        plt.show(block=True)
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0

        lossSums = [0] * (self.numRewards)
        policies = [0] * (self.numRewards)
        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.process_state(self.env.reset())
        maxHeight = -1

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            if not random:
                policies[policy] += 1
                qSums = [qSums[i] + qs[i] for i in range(len(policies))]
                wSums = [wSums[i] + ws[i] for i in range(len(policies))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            # difference of consecutive processed frames is used as the state
            nextState = state - self.process_state(obs)
            rewardsSum = np.add(rewardsSum, sum(reward))

            self.agent.addMemory(
                (state, action, policy, reward, nextState, done))
            state = nextState

            loss = self.agent.trainDQN()
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(policies))]

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSums,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_height.append(maxHeight)
        self.episode_loss.append(lossSums)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])
        self.plot()

        print(
            "Report: \nrewardSum:{}\nheight:{}\nloss:{}\npolicies:{}\nqAverage:{}\nws:{}"
            .format(self.episode_score[-1], self.episode_height[-1],
                    self.episode_loss[-1], self.episode_policies[-1],
                    self.episode_qs[-1], self.episode_ws[-1]))

    def process_state(self, state):
        # downscale to 64x64 and convert to single-channel grayscale
        state = cv2.resize(state.astype('float32'), (64, 64),
                           interpolation=cv2.INTER_AREA)
        state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
        return np.expand_dims(state, 2)  # shape (64, 64, 1)

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind='cubic',
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind='cubic',
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(spline_x,
                                    avg_spl(spline_x) - std_spl(spline_x),
                                    avg_spl(spline_x) + std_spl(spline_x),
                                    alpha=0.5,
                                    facecolor="red",
                                    interpolate=True)

        self.ax[0].title.set_text('Training Score')
        self.ax[0].set_xlabel('Episode')
        self.ax[0].set_ylabel('Score')

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text('Policy Choices')
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[i * GROUP_NUM:(i + 1) * GROUP_NUM]
                    for i in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the last group to GROUP_NUM entries so np.mean/np.std
                # below see a rectangular array
                ep_groups[-1] = np.append(ep_groups[-1],
                                          [np.mean(ep_groups[-1])] *
                                          (GROUP_NUM - len(ep_groups[-1])))
                x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind='cubic',
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind='cubic',
                                   fill_value="extrapolate")
                self.ax[1].plot(spline_x,
                                avg_spl(spline_x),
                                lw=0.7,
                                c=colors[i],
                                label="{} policy".format(PolEnum(i).name))
                self.ax[1].fill_between(spline_x,
                                        avg_spl(spline_x) - std_spl(spline_x),
                                        avg_spl(spline_x) + std_spl(spline_x),
                                        alpha=0.5,
                                        facecolor=colors[-1 - i],
                                        interpolate=True)

        self.ax[1].legend()

        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)

# Example 6
class MultiObjectiveDeepSeaW(object):
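    """Evaluation run for a saved W-learning agent.

    Restores the agent, tracks only the time objective (reward[1]) as the
    score, and plots it against hand-coded Pareto-front reference lines.
    episode() stores transitions but never calls trainDQN(), so no learning
    happens here.
    """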
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes

        self.episode_score = []
        self.episode_qs = []
        self.episode_height = []
        self.episode_loss = []
        self.episode_ws = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(figsize=(6, 4))
        self.fig.tight_layout()
        self.fig.canvas.draw()

        self.numRewards = 2

        self.env = DeepSeaTreasure(width=5,
                                   speed=50,
                                   graphical_state=True,
                                   render=True,
                                   is_debug=True,
                                   frame_stack=2,
                                   reshape_reward_weights=[[1, 1]],
                                   seed=1234)

        self.agent = DQNAgent(
            stateShape=(84, 84, 2),
            actionSpace=self.env.get_action_space(),
            numPicks=32,
            memorySize=10000,
            numRewards=self.numRewards,
        )
        self.agent.load()

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1

        self.plot()

    def episode(self):
        done = False
        rewardsSum = 0
        policies = [0] * (self.numRewards + 1)

        qSums = [0] * (self.numRewards)
        wSums = [0] * (self.numRewards)
        actions = 1

        state = self.env.reset()

        while not done:
            action, policy, qs, ws, random = self.agent.selectAction(state)
            policies[policy] += 1
            if not random:
                qSums[policy] += qs
                wSums = [wSums[i] + ws[i] for i in range(len(wSums))]
                actions += 1

            obs, reward, done, _ = self.env.step_all(action)

            nextState = obs
            rewardsSum = np.add(rewardsSum, reward[1])

            self.agent.addMemory(state, action, policy, reward, nextState,
                                 done)
            state = nextState

        print("now epsilon is {}, the reward is {} in episode {}".format(
            self.agent.epsilon, rewardsSum, self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_policies.append(policies)
        self.episode_qs.append([qSum / actions for qSum in qSums])
        self.episode_ws.append([wSum / actions for wSum in wSums])

        print("Report: \nrewardSum:{}\npolicies:{}\nqAverage:{}\nws:{}".format(
            self.episode_score[-1],
            self.episode_policies[-1],
            self.episode_qs[-1],
            self.episode_ws[-1],
        ))
        print("memory len:" + str(len(self.agent.replayMemory[0])))
        print("memory used:" + str(psutil.virtual_memory().used // 1e6))
        tf.keras.backend.clear_session()
        gc.collect()

    def plot(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last group to GROUP_NUM entries so np.mean/np.std below
        # see a rectangular array
        ep_groups[-1] = np.append(ep_groups[-1], [np.mean(ep_groups[-1])] *
                                  (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax.clear()

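        # Horizontal reference lines at the hand-coded Pareto-front values
        # for the width-5 map.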
        self.ax.plot(spline_x, [-3] * len(spline_x),
                     lw=0.7,
                     c="blue",
                     label="Pareto Front 1")
        self.ax.plot(spline_x, [-5] * len(spline_x),
                     lw=0.7,
                     c="green",
                     label="Pareto Front 2")
        self.ax.plot(spline_x, [-7] * len(spline_x),
                     lw=0.7,
                     c="cyan",
                     label="Pareto Front 3")
        self.ax.plot(spline_x, [-9] * len(spline_x),
                     lw=0.7,
                     c="orange",
                     label="Pareto Front 4")
        self.ax.plot(spline_x, [-11] * len(spline_x),
                     lw=0.7,
                     c="purple",
                     label="Pareto Front 5")

        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax.plot(spline_x,
                         avg_spl(spline_x),
                         lw=0.7,
                         c="red",
                         label="W-DQN Time Policy")
            self.ax.fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax.legend(loc="lower left")
        plt.legend()
        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")


        self.fig.canvas.draw()
        plt.savefig("dst_w_pddqn_retrain_081.png")

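    # Note: expects self.adam_scores / self.rms_scores from a two-optimizer
    # run (as in the earlier MultiObjectiveDeepSeaW example); train() above
    # never sets them.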
    def plot_compare(self):
        spline_x = np.linspace(0,
                               self.current_episode,
                               num=self.current_episode)

        ep_adam_scores = np.array(self.adam_scores)
        ep_rms_scores = np.array(self.rms_scores)
        ep_adam_groups = [
            ep_adam_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_adam_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        ep_rms_groups = [
            ep_rms_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_rms_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last groups to GROUP_NUM entries so np.mean/np.std below
        # see rectangular arrays
        ep_adam_groups[-1] = np.append(
            ep_adam_groups[-1],
            [np.mean(ep_adam_groups[-1])] *
            (GROUP_NUM - len(ep_adam_groups[-1])),
        )
        ep_rms_groups[-1] = np.append(
            ep_rms_groups[-1],
            [np.mean(ep_rms_groups[-1])] *
            (GROUP_NUM - len(ep_rms_groups[-1])),
        )
        x_groups = [i * GROUP_NUM for i in range(len(ep_adam_groups))]

        self.ax.clear()
        if len(x_groups) > 5:
            ep_adam_avgs = np.mean(ep_adam_groups, 1)
            ep_rms_avgs = np.mean(ep_rms_groups, 1)

            avg_adam_spl = interp1d(x_groups,
                                    ep_adam_avgs,
                                    kind="cubic",
                                    fill_value="extrapolate")
            avg_rms_spl = interp1d(x_groups,
                                   ep_rms_avgs,
                                   kind="cubic",
                                   fill_value="extrapolate")

            ep_adam_std = np.std(ep_adam_groups, 1)
            ep_rms_std = np.std(ep_rms_groups, 1)

            std_adam_spl = interp1d(x_groups,
                                    ep_adam_std,
                                    kind="cubic",
                                    fill_value="extrapolate")
            std_rms_spl = interp1d(x_groups,
                                   ep_rms_std,
                                   kind="cubic",
                                   fill_value="extrapolate")

            self.ax.plot(spline_x,
                         avg_adam_spl(spline_x),
                         lw=0.7,
                         c="blue",
                         label="Adam")
            self.ax.fill_between(
                spline_x,
                avg_adam_spl(spline_x) - std_adam_spl(spline_x),
                avg_adam_spl(spline_x) + std_adam_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

            self.ax.plot(spline_x,
                         avg_rms_spl(spline_x),
                         lw=0.7,
                         c="orange",
                         label="RMSProp")
            self.ax.fill_between(
                spline_x,
                avg_rms_spl(spline_x) - std_rms_spl(spline_x),
                avg_rms_spl(spline_x) + std_rms_spl(spline_x),
                alpha=0.5,
                facecolor="green",
                interpolate=True,
            )

        self.ax.title.set_text("Training Score")
        self.ax.set_xlabel("Episode")
        self.ax.set_ylabel("Score")
        self.ax.legend()
        plt.show(block=True)