def end(self, log=None, writer=None):
    """End-of-episode hook: decay exploration, periodically log an estimated
    Pareto front to TensorBoard, and periodically checkpoint the models.

    Parameters
    ----------
    log : object
        Training log carrying ``total_steps``, ``episode`` and ``episode_step``.
    writer : SummaryWriter
        TensorBoard-style writer used for scalars/images and to locate the
        checkpoint directory (via ``writer.all_writers``).

    NOTE(review): SOURCE arrived with all indentation collapsed; the nesting
    below is reconstructed from syntax — confirm against the original file.
    """
    # Multiplicative epsilon decay once learning has started, floored at 0.1.
    # `epsilon_decrease` is a module-level constant defined outside this view.
    if log.total_steps >= self.learn_start:
        self.epsilon *= epsilon_decrease
        self.epsilon = max(self.epsilon, 0.1)

    # Every 100 episodes: evaluate the current Pareto-front estimate for the
    # initial (reset-like) traffic-light state and log it.
    if (log.episode + 1) % 100 == 0:
        # Manually put the env in a fresh state: empty state history, action 0,
        # and the first traffic-light phase pushed to SUMO.
        self.env.state = []
        self.env.action = 0
        libsumo.trafficlight.setRedYellowGreenState(self.env.light, self.env.red_yellow_green[0])
        # TODO make this nice
        self.observe._is_empty = True
        # Build an observation for (grid-encoded state, action, 0) — the exact
        # tuple structure is dictated by `self.observe`'s implementation,
        # which is outside this view.
        obs = self.observe(((self.env.state_to_grid(self.env.state, self.env.action, 0), 1), []))
        # q_front shape is indexed below as [batch, action, point, objective].
        q_front = self.q_front(np.expand_dims(obs, axis=0))
        try:
            # Fixed reference point for the 2-objective hypervolume metric.
            ref_point = np.array([-5, -5])
            hypervolume = compute_hypervolume(q_front[0], self.env.nA, ref_point)
        except ValueError:
            # Degenerate front (e.g. no points dominating the reference).
            hypervolume = 0
        # One subplot per action (2 actions), sharing axes.
        fig, axes = plt.subplots(1, 2, sharex='col', sharey='row',)  # subplot_kw={'xlim': [-1, 400], 'ylim': [-120, 1]})
        fig.subplots_adjust(wspace=0, hspace=0)
        for act in range(2):
            ax = axes[act]  # np.unravel_index(act, (1, 2))]
            # Rescale normalized rewards back to the original reward scale.
            x = q_front[0, act, :, 0] * self.normalize_reward[0]
            y = q_front[0, act, :, 1] * self.normalize_reward[1]
            ax.plot(x, y)
        # plt.show()
        fig.canvas.draw()
        # Render the figure to an RGB numpy array (CHW for add_image).
        # NOTE(review): `tostring_rgb` is deprecated/removed in newer
        # matplotlib — confirm the pinned matplotlib version supports it.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
        data = np.rollaxis(data, -1, 0)
        writer.add_scalar('hypervolume', np.amax(hypervolume), log.total_steps)
        writer.add_image('pareto_front', data, log.total_steps)
        writer.add_scalar('epsilon', self.epsilon, log.total_steps)
        plt.close(fig)

    # Per-episode scalars. NOTE(review): placement outside the 100-episode
    # branch is reconstructed — confirm; also assumes log.episode_step != 0.
    writer.add_scalar('pareto_loss', self.e_loss / log.episode_step, log.episode)
    writer.add_scalar('hypervolume_exceeded', self.hypervolume_exceeded, log.episode)

    # Every 1000 episodes: checkpoint both models under the writer's log dir.
    if (log.episode + 1) % 1000 == 0:
        f = Path(list(writer.all_writers.keys())[0]) / 'checkpoints' / 'reward_est_{}.pt'.format(log.episode)
        f.parents[0].mkdir(parents=True, exist_ok=True)
        torch.save(self.estimate_reward.model, f)
        f = Path(list(writer.all_writers.keys())[0]) / 'checkpoints' / 'pareto_est_{}.pt'.format(log.episode)
        f.parents[0].mkdir(parents=True, exist_ok=True)
        torch.save(self.estimate_objective.model, f)
def end(self, log=None, writer=None):
    """End-of-episode hook: decay exploration and, every 100 episodes,
    evaluate the estimated Pareto front of the reset state.

    Parameters
    ----------
    log : object
        Training log carrying ``total_steps`` and ``episode``.
    writer : SummaryWriter
        Unused in this variant (kept for interface compatibility with the
        sibling ``end`` implementations).

    NOTE(review): SOURCE arrived with all indentation collapsed; structure
    reconstructed from syntax. This variant computes ``hypervolume`` but
    never logs it — presumably truncated or work-in-progress; confirm.
    """
    # Multiplicative epsilon decay once learning has started, floored at 0.1.
    # `epsilon_decrease` is a module-level constant defined outside this view.
    if log.total_steps >= self.learn_start:
        self.epsilon *= epsilon_decrease
        self.epsilon = max(self.epsilon, 0.1)

    if (log.episode + 1) % 100 == 0:
        self.env.state = self.env.reset()
        # self.env.action_space = 0
        self.observe._is_empty = True
        # BUG FIX: `obs` was referenced before assignment — its defining
        # statement had been commented out and left *after* the use, which
        # raised an UnboundLocalError the first time this branch ran.
        # Restored here before `q_front`, matching the sibling variant.
        # NOTE(review): assumes `self.env.action` is set by reset/step —
        # confirm against the env implementation.
        obs = self.observe(((self.env.state_to_grid(self.env.state, self.env.action, 0), 1), []))
        q_front = self.q_front(np.expand_dims(obs, axis=0))
        try:
            # Fixed reference point for the 2-objective hypervolume metric.
            ref_point = np.array([-5, -5])
            hypervolume = compute_hypervolume(q_front[0], self.action_space.n, ref_point)
        except ValueError:
            # Degenerate front (e.g. no points dominating the reference).
            hypervolume = 0
def end(self, log=None, writer=None):
    """End-of-episode hook for deep-sea-treasure: decay exploration,
    periodically plot estimated vs. true Pareto fronts for all states,
    and periodically checkpoint the models.

    Parameters
    ----------
    log : object
        Training log carrying ``total_steps``, ``episode`` and ``episode_step``.
    writer : SummaryWriter
        TensorBoard-style writer used for scalars/images and to locate the
        checkpoint directory (via ``writer.all_writers``).

    NOTE(review): SOURCE arrived with all indentation collapsed; the nesting
    below is reconstructed from syntax — confirm against the original file.
    """
    # Multiplicative epsilon decay once learning has started, floored at 0.1.
    # `epsilon_decrease` is a module-level constant defined outside this view.
    if log.total_steps >= self.learn_start:
        self.epsilon *= epsilon_decrease
        self.epsilon = max(self.epsilon, 0.1)

    if (log.episode + 1) % 100 == 0:
        # all states of deep-sea-treasure
        # (hand-enumerated reachable cells of the 11x10 grid, row-major)
        plot_states = list(range(10)) + \
            list(range(11, 20)) + \
            list(range(22, 30)) + \
            list(range(33, 40)) + \
            list(range(46, 50)) + \
            list(range(56, 60)) + \
            list(range(66, 70)) + \
            list(range(78, 80)) + \
            list(range(88, 90)) + \
            list(range(99, 100))
        # estimate pareto front for all states: stack one observation row
        # per plotted state into an (n_states, nS) batch.
        obs = np.array([]).reshape(0, self.env.nS)
        for s in plot_states:
            obs = np.concatenate((obs, np.expand_dims(self.observe(s), 0)))
        # q_fronts indexed below as [state, action, point, objective].
        q_fronts = self.q_front(obs, self.n_samples)
        # undo pessimistic bias (shifts the second objective by +1)
        q_fronts += np.array([0, 1]).reshape(1, 1, 1, 2)
        # unnormalize reward back to the environment's original scale
        q_fronts = q_fronts * self.normalize_reward['scale'].reshape(
            1, 1, 1, 2) + self.normalize_reward['min'].reshape(1, 1, 1, 2)
        try:
            # Fixed reference point for the 2-objective hypervolume metric;
            # computed for the first plotted state only.
            ref_point = np.array([-2, -2])
            hypervolume = compute_hypervolume(q_fronts[0], self.env.nA, ref_point)
        except ValueError:
            # Degenerate front (e.g. no points dominating the reference).
            hypervolume = 0
        # Only action index 2 is plotted.
        act = 2
        # One subplot per grid cell of the 11x10 deep-sea-treasure map.
        fig, axes = plt.subplots(11, 10, sharex='col', sharey='row', subplot_kw={
            'xlim': [-1, 150],
            'ylim': [-20, 1]
        })
        fig.subplots_adjust(wspace=0, hspace=0)
        for s in range(len(plot_states)):
            # Map flat state index to its (row, col) subplot.
            ax = axes[np.unravel_index(plot_states[s], (11, 10))]
            x = q_fronts[s, act, :, 0]
            y = q_fronts[s, act, :, 1]
            ax.plot(x, y)
            # true pareto front (module-level lookup, unnormalized the same way)
            true_xy = true_non_dominated[plot_states[s]][act]
            true_xy = true_xy * self.normalize_reward['scale'].reshape(
                1, 2) + self.normalize_reward['min'].reshape(1, 2)
            ax.plot(true_xy[:, 0], true_xy[:, 1], '+')
        # Shade subplots of unreachable states red.
        for s in range(self.env.nS):
            if unreachable(s):
                ax = axes[np.unravel_index(s, (11, 10))]
                ax.set_facecolor((1.0, 0.47, 0.42))
        # plt.show()
        fig.canvas.draw()
        # Now we can save it to a numpy array.
        # Render the figure to an RGB numpy array (CHW for add_image).
        # NOTE(review): `tostring_rgb` is deprecated/removed in newer
        # matplotlib — confirm the pinned matplotlib version supports it.
        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] + (3, ))
        data = np.rollaxis(data, -1, 0)
        writer.add_scalar('hypervolume', np.amax(hypervolume), log.total_steps)
        writer.add_image('pareto_front', data, log.total_steps)
        writer.add_scalar('epsilon', self.epsilon, log.total_steps)
        plt.close(fig)

    # Per-episode scalars. NOTE(review): placement outside the 100-episode
    # branch is reconstructed — confirm; also assumes log.episode_step != 0.
    writer.add_scalar('pareto_loss', self.e_loss / log.episode_step, log.episode)
    writer.add_scalar('hypervolume_exceeded', self.hypervolume_exceeded, log.episode)

    # Every 1000 episodes: checkpoint both models under the writer's log dir.
    if (log.episode + 1) % 1000 == 0:
        f = Path(list(writer.all_writers.keys())[0]
                 ) / 'checkpoints' / 'reward_est_{}.pt'.format(log.episode)
        f.parents[0].mkdir(parents=True, exist_ok=True)
        torch.save(self.estimate_reward.model, f)
        f = Path(list(writer.all_writers.keys())[0]
                 ) / 'checkpoints' / 'pareto_est_{}.pt'.format(log.episode)
        f.parents[0].mkdir(parents=True, exist_ok=True)
        torch.save(self.estimate_objective.model, f)