Пример #1
0
    def end(self, log=None, writer=None):
        """End-of-episode hook: decay exploration, periodically log a Pareto-front
        plot and hypervolume to tensorboard, and checkpoint the models.

        Args:
            log: training-log object; this method reads ``log.total_steps``,
                ``log.episode`` and ``log.episode_step`` (must not be None
                despite the default — TODO confirm caller always passes it).
            writer: tensorboard-style summary writer with ``add_scalar``,
                ``add_image`` and ``all_writers`` (must not be None either).
        """
        # Multiplicatively anneal epsilon once learning has started,
        # clamped to a floor of 0.1. `epsilon_decrease` is a module-level
        # name not visible in this block.
        if log.total_steps >= self.learn_start:
            self.epsilon *= epsilon_decrease
            self.epsilon = max(self.epsilon, 0.1)
        # Every 100 episodes: evaluate the learned Pareto front on the
        # initial traffic-light state and log diagnostics.
        if (log.episode + 1) % 100 == 0:

            # Reset env bookkeeping to the initial (empty) state/action and
            # force the SUMO traffic light back to its first phase.
            self.env.state = []
            self.env.action = 0
            libsumo.trafficlight.setRedYellowGreenState(self.env.light, self.env.red_yellow_green[0])
            # TODO make this nice
            # Reset the observation wrapper's internal buffer before building
            # the observation for the reset state.
            self.observe._is_empty = True
            obs = self.observe(((self.env.state_to_grid(self.env.state, self.env.action, 0), 1), []))
            # q_front shape is indexed below as [batch, action, point, objective]
            # — presumably (1, nA, n_points, 2); verify against self.q_front.
            q_front = self.q_front(np.expand_dims(obs, axis=0))

            # Hypervolume w.r.t. a fixed reference point; compute_hypervolume
            # raises ValueError when no point dominates the reference, in
            # which case we report 0.
            try:
                ref_point = np.array([-5, -5])
                hypervolume = compute_hypervolume(q_front[0], self.env.nA, ref_point)
            except ValueError:
                hypervolume = 0


            # One subplot per action (2 actions), shared axes.
            fig, axes = plt.subplots(1, 2, sharex='col', sharey='row',)
                                     #subplot_kw={'xlim': [-1, 400], 'ylim': [-120, 1]})

            fig.subplots_adjust(wspace=0, hspace=0)
            for act in range(2):
                ax = axes[act] #np.unravel_index(act, (1, 2))]
                # Un-normalize both objectives before plotting.
                x = q_front[0, act, :, 0]*self.normalize_reward[0]
                y = q_front[0, act, :, 1]*self.normalize_reward[1]
                ax.plot(x, y)

            # plt.show()
            # Render the figure and grab the RGB pixel buffer as an array.
            fig.canvas.draw()
            # Now we can save it to a numpy array.
            data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
            data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
            # Move channels first (HWC -> CHW), the layout add_image expects.
            data = np.rollaxis(data, -1, 0)

            writer.add_scalar('hypervolume', np.amax(hypervolume), log.total_steps)
            writer.add_image('pareto_front', data, log.total_steps)
            writer.add_scalar('epsilon', self.epsilon, log.total_steps)
            # Close the figure to avoid matplotlib accumulating open figures.
            plt.close(fig)
        # Per-episode scalars, logged every call.
        writer.add_scalar('pareto_loss', self.e_loss / log.episode_step, log.episode)
        writer.add_scalar('hypervolume_exceeded', self.hypervolume_exceeded, log.episode)

        # Every 1000 episodes: checkpoint both models next to the writer's
        # log directory (first key of all_writers is the log dir path).
        if (log.episode + 1) % 1000 == 0:
            f = Path(list(writer.all_writers.keys())[0]) / 'checkpoints' / 'reward_est_{}.pt'.format(log.episode)
            f.parents[0].mkdir(parents=True, exist_ok=True)
            torch.save(self.estimate_reward.model, f)

            f = Path(list(writer.all_writers.keys())[0]) / 'checkpoints' / 'pareto_est_{}.pt'.format(log.episode)
            f.parents[0].mkdir(parents=True, exist_ok=True)
            torch.save(self.estimate_objective.model, f)
Пример #2
0
    def end(self, log=None, writer=None):
        """End-of-episode hook: decay exploration and, every 100 episodes,
        evaluate the learned Pareto front on the reset state.

        Args:
            log: training-log object; reads ``log.total_steps`` and
                ``log.episode`` (must not be None despite the default —
                TODO confirm caller always passes it).
            writer: summary writer (unused in the visible portion).
        """
        # Multiplicatively anneal epsilon once learning has started,
        # clamped to a floor of 0.1. `epsilon_decrease` is a module-level
        # name not visible in this block.
        if log.total_steps >= self.learn_start:
            self.epsilon *= epsilon_decrease
            self.epsilon = max(self.epsilon, 0.1)

        if (log.episode + 1) % 100 == 0:

            self.env.state = self.env.reset()
            #self.env.action_space = 0
            # Reset the observation wrapper's internal buffer.
            self.observe._is_empty = True

            # BUG FIX: `obs` was referenced before assignment — the line that
            # computes it was commented out and placed *after* the q_front
            # call, so this branch raised NameError. Compute the observation
            # first, then evaluate the front. (Assumes self.env.action is a
            # valid current action after reset — TODO confirm.)
            obs = self.observe(((self.env.state_to_grid(self.env.state, self.env.action, 0), 1), []))
            q_front = self.q_front(np.expand_dims(obs, axis=0))
            # Hypervolume w.r.t. a fixed reference point; compute_hypervolume
            # raises ValueError when no point dominates the reference, in
            # which case we report 0.
            try:
                ref_point = np.array([-5, -5])
                hypervolume = compute_hypervolume(q_front[0],
                                                  self.action_space.n,
                                                  ref_point)
            except ValueError:
                hypervolume = 0
Пример #3
0
    def end(self, log=None, writer=None):
        """End-of-episode hook for deep-sea-treasure: decay exploration,
        periodically plot estimated vs. true Pareto fronts per state, log
        hypervolume/epsilon/loss scalars, and checkpoint the models.

        Args:
            log: training-log object; reads ``log.total_steps``,
                ``log.episode`` and ``log.episode_step`` (must not be None
                despite the default — TODO confirm caller always passes it).
            writer: tensorboard-style writer with ``add_scalar``,
                ``add_image`` and ``all_writers``.
        """
        # Multiplicatively anneal epsilon once learning has started,
        # clamped to a floor of 0.1. `epsilon_decrease` is a module-level
        # name not visible in this block.
        if log.total_steps >= self.learn_start:
            self.epsilon *= epsilon_decrease
            self.epsilon = max(self.epsilon, 0.1)
        # Every 100 episodes: render a grid of per-state Pareto fronts.
        if (log.episode + 1) % 100 == 0:

            # all states of deep-sea-treasure
            # Hard-coded indices of the reachable cells in the 11x10 grid.
            plot_states = list(range(10)) + \
                          list(range(11, 20))+ \
                          list(range(22, 30))+ \
                          list(range(33, 40)) + \
                          list(range(46, 50))+ \
                          list(range(56, 60))+ \
                          list(range(66, 70))+ \
                          list(range(78, 80))+ \
                          list(range(88, 90))+ \
                          list(range(99, 100))
            # estimate pareto front for all states
            # Stack one observation row per plotted state: (len(plot_states), nS).
            obs = np.array([]).reshape(0, self.env.nS)
            for s in plot_states:
                obs = np.concatenate((obs, np.expand_dims(self.observe(s), 0)))
            # q_fronts indexed as [state, action, point, objective] below —
            # presumably (len(plot_states), nA, n_samples, 2); verify.
            q_fronts = self.q_front(obs, self.n_samples)
            # undo pessimistic bias
            q_fronts += np.array([0, 1]).reshape(1, 1, 1, 2)
            # unnormalize reward
            q_fronts = q_fronts * self.normalize_reward['scale'].reshape(
                1, 1, 1, 2) + self.normalize_reward['min'].reshape(1, 1, 1, 2)

            # Hypervolume of the first plotted state's front; ValueError
            # (no point dominates the reference) is reported as 0.
            try:
                ref_point = np.array([-2, -2])
                hypervolume = compute_hypervolume(q_fronts[0], self.env.nA,
                                                  ref_point)
            except ValueError:
                hypervolume = 0

            # Plot only action 2's front for each state — TODO confirm which
            # action index 2 corresponds to in this env.
            act = 2
            # One subplot per grid cell (11 rows x 10 cols), shared axes.
            fig, axes = plt.subplots(11,
                                     10,
                                     sharex='col',
                                     sharey='row',
                                     subplot_kw={
                                         'xlim': [-1, 150],
                                         'ylim': [-20, 1]
                                     })

            fig.subplots_adjust(wspace=0, hspace=0)
            for s in range(len(plot_states)):
                # Map the flat state index to its (row, col) subplot.
                ax = axes[np.unravel_index(plot_states[s], (11, 10))]
                x = q_fronts[s, act, :, 0]
                y = q_fronts[s, act, :, 1]
                ax.plot(x, y)

                # true pareto front
                # `true_non_dominated` is a module-level lookup not visible
                # in this block; un-normalize it the same way as q_fronts.
                true_xy = true_non_dominated[plot_states[s]][act]
                true_xy = true_xy * self.normalize_reward['scale'].reshape(
                    1, 2) + self.normalize_reward['min'].reshape(1, 2)

                ax.plot(true_xy[:, 0], true_xy[:, 1], '+')
            # Shade unreachable grid cells red for visual reference.
            for s in range(self.env.nS):
                if unreachable(s):
                    ax = axes[np.unravel_index(s, (11, 10))]
                    ax.set_facecolor((1.0, 0.47, 0.42))

            # plt.show()
            # Render the figure and grab the RGB pixel buffer as an array.
            fig.canvas.draw()
            # Now we can save it to a numpy array.
            data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
            data = data.reshape(fig.canvas.get_width_height()[::-1] + (3, ))
            # Move channels first (HWC -> CHW), the layout add_image expects.
            data = np.rollaxis(data, -1, 0)

            writer.add_scalar('hypervolume', np.amax(hypervolume),
                              log.total_steps)
            writer.add_image('pareto_front', data, log.total_steps)
            writer.add_scalar('epsilon', self.epsilon, log.total_steps)
            # Close the figure to avoid matplotlib accumulating open figures.
            plt.close(fig)
        # Per-episode scalars, logged every call.
        writer.add_scalar('pareto_loss', self.e_loss / log.episode_step,
                          log.episode)
        writer.add_scalar('hypervolume_exceeded', self.hypervolume_exceeded,
                          log.episode)

        # Every 1000 episodes: checkpoint both models next to the writer's
        # log directory (first key of all_writers is the log dir path).
        if (log.episode + 1) % 1000 == 0:
            f = Path(list(writer.all_writers.keys())[0]
                     ) / 'checkpoints' / 'reward_est_{}.pt'.format(log.episode)
            f.parents[0].mkdir(parents=True, exist_ok=True)
            torch.save(self.estimate_reward.model, f)

            f = Path(list(writer.all_writers.keys())[0]
                     ) / 'checkpoints' / 'pareto_est_{}.pt'.format(log.episode)
            f.parents[0].mkdir(parents=True, exist_ok=True)
            torch.save(self.estimate_objective.model, f)