taskid, random_state=run_id)

# nn with embedding related
nn_with_embedding = NN_with_EntityEmbedding(
    X_train,
    y_train_int,
    categorical_features,
    categorical_names,
    class_names,
    epochs=epochs,
    batch_size=batch_size,
)
nn_with_embedding_loss, nn_with_embedding_score = nn_with_embedding.evaluate(
    X_test, y_test_int)
print("nn_with_embedding prediction score: ", str(nn_with_embedding_score))
logger.log('nn_with_embedding', taskid, run_id, nn_with_embedding_score)

# nn related
nn = NN(
    X_train,
    y_train_int,
    categorical_features,
    categorical_names,
    class_names,
    epochs=epochs,
    batch_size=batch_size,
)
nn_loss, nn_score = nn.evaluate(X_test, y_test_int)
print("nn prediction score: ", str(nn_score))
logger.log('nn', taskid, run_id, nn_score)
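# Hedged illustration, not the NN_with_EntityEmbedding class used above: an
# entity embedding replaces a one-hot categorical input with a small learned
# dense vector trained jointly with the rest of the network. A minimal sketch
# for a single categorical column, assuming tf.keras is available (layer
# sizes and names below are hypothetical):
import tensorflow as tf

n_categories, emb_dim = 12, 4
cat_in = tf.keras.Input(shape=(1,), dtype='int32')
emb = tf.keras.layers.Embedding(n_categories, emb_dim)(cat_in)  # (batch, 1, emb_dim)
emb = tf.keras.layers.Flatten()(emb)                            # (batch, emb_dim)
out = tf.keras.layers.Dense(1, activation='sigmoid')(emb)
example_model = tf.keras.Model(cat_in, out)
example_model.compile(optimizer='adam', loss='binary_crossentropy')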
batch_ac.append(ac)
batch_rcw.append((row, col - w, w - prev_w))
batch_frames.append(frames)

batch_prev_frames = np.array(batch_prev_frames)
batch_ac = np.array(batch_ac)
batch_rcw = np.array(batch_rcw)[:, None, :]
batch_frames = np.array(batch_frames)

q_map._optimize(batch_prev_frames, batch_ac, batch_rcw, batch_frames,
                batch_dones, batch_weights)

if t % args.target == 0:
    q_map.update_target()

if t % 50 == 0:
    losses = []
    all_images = []
    for i_level in range(len(test_levels)):
        pred_qmaps = q_map.compute_q_values(test_obs[i_level])
        true_qmaps = test_qmaps[i_level]
        loss = np.mean((pred_qmaps - true_qmaps)**2)
        losses.append(loss)
        ob_images = np.concatenate(
            test_obs[i_level][image_indexes[i_level]], axis=1)
        pred_images = np.concatenate(
            (color_map(pred_qmaps[image_indexes[i_level]].max(3))[:, :, :, :3]
             * 255).astype(np.uint8), axis=1)
        true_images = np.concatenate(
            (color_map(true_qmaps[image_indexes[i_level]].max(3))[:, :, :, :3]
             * 255).astype(np.uint8), axis=1)
        all_images.append(
            np.concatenate((ob_images, true_images, pred_images), axis=0))
    img = np.concatenate(all_images, axis=0)
    toimage(img, cmin=0, cmax=255).save('{}/images/{}.png'.format(path, t))
    if args.render:
        img = np.repeat(np.repeat(img, 3, 0), 3, 1)
        viewer.imshow(img)
    print(t * args.batch, 'Losses:', *losses)
    loss_logger.log(t, *losses)
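# Hedged aside on the image dump above, assuming `color_map` is a Matplotlib
# colormap callable (e.g. matplotlib.cm.viridis; the actual colormap used is
# not shown here): applied to floats in [0, 1] it returns RGBA values, so
# keeping the first three channels and scaling by 255 yields uint8 RGB frames
# that can be tiled and saved as done above.
import numpy as np
from matplotlib import cm

example_q_max = np.random.rand(20, 32)  # hypothetical (rows, cols) max-Q map
example_rgb = (cm.viridis(example_q_max)[..., :3] * 255).astype(np.uint8)
print(example_rgb.shape)  # (20, 32, 3)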
class Q_Map_DQN_Agent(Agent):
    def __init__(
            self,
            # All
            observation_space,
            n_actions,
            coords_shape,
            double_replay_buffer,
            task_gamma,
            exploration_schedule,
            seed,
            learning_starts=1000,
            train_freq=1,
            print_freq=100,
            env_name='ENV',
            agent_name='AGENT',
            renderer_viewer=True,
            # DQN:
            dqn_q_func=None,
            dqn_lr=5e-4,
            dqn_batch_size=32,
            dqn_optim_iters=1,
            dqn_target_net_update_freq=500,
            dqn_grad_norm_clip=100,
            dqn_double_q=True,
            # Q-Map:
            q_map_model=None,
            q_map_random_schedule=None,
            q_map_greedy_bias=0.5,
            q_map_timer_bonus=0.5,
            q_map_lr=5e-4,
            q_map_gamma=0.9,
            q_map_n_steps=1,
            q_map_batch_size=32,
            q_map_optim_iters=1,
            q_map_target_net_update_freq=500,
            q_map_min_goal_steps=10,
            q_map_max_goal_steps=20,
            q_map_grad_norm_clip=1000,
            q_map_double_q=True):
        # All
        self.observation_space = observation_space
        self.n_actions = n_actions
        self.coords_shape = coords_shape
        self.double_replay_buffer = double_replay_buffer
        self.task_gamma = task_gamma
        self.exploration_schedule = exploration_schedule
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.print_freq = print_freq
        agent_name += '-train' + str(train_freq)

        # DQN
        if dqn_q_func is not None:
            self.use_dqn = True
            agent_name += '-'
            agent_name += 'DQN-lr' + str(dqn_lr) + '-freq-' + str(train_freq)
            self.dqn_target_net_update_freq = dqn_target_net_update_freq
            self.dqn = DQN(model=dqn_q_func,
                           observation_space=observation_space,
                           n_actions=n_actions,
                           gamma=task_gamma,
                           lr=dqn_lr,
                           replay_buffer=double_replay_buffer,
                           batch_size=dqn_batch_size,
                           optim_iters=dqn_optim_iters,
                           grad_norm_clip=dqn_grad_norm_clip,
                           double_q=dqn_double_q)
        else:
            self.use_dqn = False

        # Q-MAP
        if q_map_model is not None:
            agent_name += '-'
            agent_name += 'Q-MAP-' + q_map_model.description + '-' + \
                str(q_map_min_goal_steps) + '-' + str(q_map_max_goal_steps) + \
                '-gamma' + str(q_map_gamma) + '-lr' + str(q_map_lr) + \
                '-bias' + str(q_map_greedy_bias) + '-bonus' + str(q_map_timer_bonus)
            self.use_q_map = True
            self.q_map_timer_bonus = q_map_timer_bonus
            self.using_q_map_starts = 2 * self.learning_starts
            self.q_map_random_schedule = q_map_random_schedule
            self.q_map_greedy_bias = q_map_greedy_bias
            self.q_map_goal_proba = 1  # TODO
            self.q_map_gamma = q_map_gamma
            self.q_map_target_net_update_freq = q_map_target_net_update_freq
            self.q_map_min_goal_steps = q_map_min_goal_steps
            self.q_map_max_goal_steps = q_map_max_goal_steps
            self.q_map_min_q_value = q_map_gamma**(q_map_max_goal_steps - 1)
            self.q_map_max_q_value = q_map_gamma**(q_map_min_goal_steps - 1)
            self.q_map_goal = None
            self.q_map_goal_timer = 0
            self.q_map = Q_Map(model=q_map_model,
                               observation_space=observation_space,
                               coords_shape=coords_shape,
                               n_actions=n_actions,
                               gamma=q_map_gamma,
                               n_steps=q_map_n_steps,
                               lr=q_map_lr,
                               replay_buffer=double_replay_buffer,
                               batch_size=q_map_batch_size,
                               optim_iters=q_map_optim_iters,
                               grad_norm_clip=q_map_grad_norm_clip,
                               double_q=q_map_double_q)
        else:
            self.use_q_map = False

        if not self.use_dqn and not self.use_q_map:
            agent_name += 'random'
        else:
            self.tf_saver = tf.train.Saver()
            agent_name += '-memory' + str(double_replay_buffer._maxsize)

        # All
        home = os.path.expanduser('~')
        sub_name = 'seed-{}_{}'.format(
            seed, datetime.utcnow().strftime('%F_%H-%M-%S-%f'))
        self.path = '{}/results/q-map/{}/{}/{}'.format(home, env_name,
                                                       agent_name, sub_name)

        # log exploration for debugging
        exploration_labels = [
            'steps', 'planned exploration', 'current exploration',
            'random actions', 'goal actions', 'greedy actions'
        ]
        self.exploration_logger = CSVLogger(exploration_labels,
                                            self.path + '/exploration')

        # videos etc.
        self.renderer = Q_Map_Renderer(self.path, viewer=renderer_viewer)

        # path to store
        self.tensorflow_path = self.path + '/tensorflow'
        if not os.path.exists(self.tensorflow_path):
            os.makedirs(self.tensorflow_path)

        U.initialize()

        self.t = 0
        self.episode_rewards = []
        self.random_proba = self.exploration_schedule.value(0)
        self.random_freq = self.exploration_schedule.value(0)
        self.greedy_freq = 1.0 - self.random_freq
        self.goal_freq = 0.0
        if self.use_dqn:
            self.dqn.update_target()
        self.seed(seed)

    def seed(self, seed):
        self.np_random, seed = seeding.np_random(seed)
        if self.use_dqn:
            self.dqn.seed(seed)
        if self.use_q_map:
            self.q_map.seed(seed)
        return [seed]

    def reset(self, ob):
        if self.use_q_map:
            self.q_map_goal_timer = 0
            self.q_map_goal = None
        frames = ob[0]
        ac = self.choose_action(ob)
        self.log()
        self.episode_rewards.append(0.0)
        self.prev_ob = ob
        self.prev_ac = ac
        return ac

    def step(self, ob, rew, done):
        prev_frames, (_, _, prev_w), _, _ = self.prev_ob
        frames, (row, col, w), _, _ = ob
        if self.double_replay_buffer is not None:
            self.double_replay_buffer.add(prev_frames, self.prev_ac, rew,
                                          (row, col - w, w - prev_w), frames,
                                          done)
        self.optimize()
        if not done:
            ac = self.choose_action(ob)
        else:
            ac = None
        self.add_to_renderer(ob)
        self.t += 1
        self.episode_rewards[-1] += rew
        self.prev_ob = ob
        self.prev_ac = ac
        return ac

    def choose_action(self, ob):
        frames, (row, col, w), screen, (full_r, full_c) = ob
        q_map_values = None
        q_map_candidates = []
        q_map_biased_candidates = []

        # render Q-maps all the time even if we do not need them
        if self.use_q_map:
            q_map_values = self.q_map.compute_q_values(
                frames[None])[0]  # (rows, cols, acs)

        if self.np_random.rand() < self.random_proba or (
                not self.use_dqn and self.t <= self.using_q_map_starts):
            ac = self.np_random.randint(self.n_actions)
            action_type = 'random'
        else:
            # Q-Map available and started to train
            if self.use_q_map and self.t > self.using_q_map_starts:
                # reached goal
                if self.q_map_goal_timer > 0 and self.q_map_goal[1] < w:
                    self.q_map_goal_timer = 0
                    self.q_map_goal = None
                # goal unreachable
                if self.q_map_goal_timer > 0 and (row, col) == self.q_map_goal:
                    self.q_map_goal_timer = 0
                    self.q_map_goal = None
                # no more goal
                if self.q_map_goal_timer == 0:
                    if self.np_random.rand() < self.q_map_goal_proba:
                        # find a new goal
                        q_map_max_values = q_map_values.max(2)  # (rows, cols)
                        q_map_candidates_mask = np.logical_and(
                            self.q_map_min_q_value <= q_map_max_values,
                            self.q_map_max_q_value >= q_map_max_values)
                        q_map_candidates = np.where(q_map_candidates_mask)
                        q_map_candidates = np.dstack(
                            q_map_candidates)[0]  # list of (row, col)
                        if len(q_map_candidates) > 0:
                            # goals compatible with greedy action
                            if (self.use_dqn and self.np_random.rand() <
                                    self.q_map_greedy_bias):
                                greedy_ac = self.dqn.choose_action(
                                    frames, stochastic=False)
                                q_map_biased_candidates_mask = np.logical_and(
                                    q_map_candidates_mask,
                                    q_map_values.argmax(2) == greedy_ac)
                                q_map_biased_candidates = np.where(
                                    q_map_biased_candidates_mask)
                                q_map_biased_candidates = np.dstack(
                                    q_map_biased_candidates)[0]  # list of (row, col)

                            # same DQN and Q-Map action
                            if len(q_map_biased_candidates) > 0:
                                goal_idx = self.np_random.randint(
                                    len(q_map_biased_candidates))
                                q_map_goal_row, q_map_goal_col_local = \
                                    q_map_biased_candidates[goal_idx]
                                q_map_expected_steps = math.log(
                                    q_map_max_values[q_map_goal_row,
                                                     q_map_goal_col_local],
                                    self.q_map_gamma) + 1
                                self.q_map_goal_timer = math.ceil(
                                    1.5 * q_map_expected_steps)  # 50% bonus
                                self.q_map_goal = (q_map_goal_row,
                                                   q_map_goal_col_local + w)
                                ac = greedy_ac
                                action_type = 'dqn/qmap'

                            # greedy Q-Map action
                            else:
                                goal_idx = self.np_random.randint(
                                    len(q_map_candidates))
                                q_map_goal_row, q_map_goal_col_local = \
                                    q_map_candidates[goal_idx]
                                q_map_expected_steps = math.log(
                                    q_map_max_values[q_map_goal_row,
                                                     q_map_goal_col_local],
                                    self.q_map_gamma) + 1
                                self.q_map_goal_timer = math.ceil(
                                    (1. + self.q_map_timer_bonus) *
                                    q_map_expected_steps)
                                self.q_map_goal = (q_map_goal_row,
                                                   q_map_goal_col_local + w)
                                # no need to recompute the Q-Map
                                ac, q_map_values = self.q_map.choose_action(
                                    None,
                                    (q_map_goal_row, q_map_goal_col_local),
                                    q_map_values)
                                action_type = 'qmap'

                            self.q_map_goal_timer -= 1
                            if self.q_map_goal_timer == 0:
                                self.q_map_goal = None

                        # random action
                        else:
                            self.q_map_goal_timer = 0
                            self.q_map_goal = None
                            ac = self.np_random.randint(self.n_actions)
                            action_type = 'random'

                    # DQN action
                    else:
                        ac = self.dqn.choose_action(frames, stochastic=False)
                        action_type = 'dqn'

                # Q-Map action
                else:
                    q_map_goal_row, q_map_goal_col = self.q_map_goal
                    q_map_goal_col_local = q_map_goal_col - w
                    ac, q_map_values = self.q_map.choose_action(
                        frames, (q_map_goal_row, q_map_goal_col_local))
                    self.q_map_goal_timer -= 1
                    if self.q_map_goal_timer == 0:
                        self.q_map_goal = None
                    action_type = 'qmap'

            # DQN action
            else:
                ac = self.dqn.choose_action(frames, stochastic=False)
                action_type = 'dqn'

        # rendering
        self.add_to_renderer(ob, q_map_values, ac, action_type,
                             q_map_candidates, q_map_biased_candidates)

        # update exploration
        if action_type == 'dqn/qmap':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (1 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)  # TODO: 1?
        elif action_type == 'dqn':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (1 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)
        elif action_type == 'qmap':
            self.random_freq += 0.01 * (0 - self.random_freq)
            self.greedy_freq += 0.01 * (0 - self.greedy_freq)
            self.goal_freq += 0.01 * (1 - self.goal_freq)
        elif action_type == 'random':
            self.random_freq += 0.01 * (1 - self.random_freq)
            self.greedy_freq += 0.01 * (0 - self.greedy_freq)
            self.goal_freq += 0.01 * (0 - self.goal_freq)
        else:
            raise NotImplementedError(
                'unknown action type {}'.format(action_type))

        target_exploration = self.exploration_schedule.value(self.t)
        current_exploration = 1.0 - self.greedy_freq
        if self.use_q_map and self.t >= self.using_q_map_starts:
            self.random_proba = self.q_map_random_schedule.value(self.t)
            if current_exploration > target_exploration:
                self.q_map_goal_proba -= 0.001
            elif current_exploration < target_exploration:
                self.q_map_goal_proba += 0.001
        else:
            self.random_proba = self.exploration_schedule.value(self.t)

        if (self.t + 1) % 100 == 0:
            self.exploration_logger.log(self.t + 1, target_exploration,
                                        current_exploration, self.random_freq,
                                        self.goal_freq, self.greedy_freq)

        return ac

    def optimize(self):
        if (self.use_dqn or self.use_q_map) and \
                self.t >= self.learning_starts and \
                self.t % self.train_freq == 0:
            if self.use_dqn:
                self.dqn.optimize(self.t)
            if self.use_q_map:
                self.q_map.optimize(self.t)

        if self.use_dqn and self.t >= self.learning_starts and \
                self.t % self.dqn_target_net_update_freq == 0:
            self.dqn.update_target()

        if self.use_q_map and self.t >= self.learning_starts and \
                self.t % self.q_map_target_net_update_freq == 0:
            self.q_map.update_target()

        # save the session
        if (self.use_dqn or self.use_q_map) and (self.t + 1) % 100000 == 0:
            file_name = self.tensorflow_path + '/step_' + str(self.t + 1) + '.ckpt'
            print('saving tensorflow session to', file_name)
            self.tf_saver.save(tf.get_default_session(), file_name)
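    # Hedged aside, not part of the original agent: the action-type statistics
    # updated in choose_action() are exponential moving averages with step
    # size 0.01, i.e. freq += 0.01 * (indicator - freq), so each frequency
    # roughly reflects the last ~100 action choices. Minimal illustration with
    # a hypothetical indicator stream:
    #
    #   freq = 0.0
    #   for was_greedy in [1, 1, 0, 1, 1]:
    #       freq += 0.01 * (was_greedy - freq)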
    def log(self):
        if self.t > 0 and self.print_freq is not None and len(
                self.episode_rewards) % self.print_freq == 0:
            mean_100ep_reward = np.mean(self.episode_rewards[-100:])
            num_episodes = len(self.episode_rewards)
            logger.record_tabular('steps', self.t)
            logger.record_tabular('episodes', num_episodes)
            logger.record_tabular('mean 100 episode reward',
                                  '{:.3f}'.format(mean_100ep_reward))
            logger.record_tabular(
                'exploration (target)', '{:.3f} %'.format(
                    100 * self.exploration_schedule.value(self.t)))
            logger.record_tabular(
                'exploration (current)',
                '{:.3f} %'.format(100 * (1.0 - self.greedy_freq)))
            logger.dump_tabular()

    def load(self, path):
        self.tf_saver.restore(tf.get_default_session(), path)
        print('model restored :)')

    def add_to_renderer(self,
                        ob,
                        q_map_values=None,
                        ac=None,
                        action_type='',
                        q_map_candidates=[],
                        q_map_biased_candidates=[]):
        if self.renderer is not None:
            if self.use_q_map and self.q_map_goal is not None:
                goal = self.q_map_goal
                assert self.q_map_goal_timer > 0
            else:
                goal = None
            self.renderer.add(ob, self.coords_shape, q_map_values, ac,
                              action_type, self.n_actions, q_map_candidates,
                              q_map_biased_candidates, goal)
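# Hedged worked example, not from the original code, of the goal-distance
# bounds used in choose_action(): with the Q-Map discount gamma, a value of
# gamma**(k - 1) corresponds to a goal roughly k steps away, so candidate
# goals are those whose max Q-value lies in
# [gamma**(q_map_max_goal_steps - 1), gamma**(q_map_min_goal_steps - 1)],
# and the expected number of steps is recovered as log_gamma(value) + 1.
import math

gamma, min_goal_steps, max_goal_steps = 0.9, 10, 20  # defaults from __init__
min_q = gamma**(max_goal_steps - 1)                  # farthest accepted goal
max_q = gamma**(min_goal_steps - 1)                  # closest accepted goal
value = gamma**14                                    # a goal about 15 steps away
assert min_q <= value <= max_q
expected_steps = math.log(value, gamma) + 1          # ~15.0
print(round(expected_steps, 1))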