def update_step(self, t, lr, batch_data):
    """
    Performs an update of parameters from a pre-assembled batch

    Args:
        t: number of iteration (episode and move)
        lr: (float) learning rate
        batch_data: DatasetTensors batch with observations, target, mask and seqlen
    Returns:
        loss_eval: loss evaluated on the batch
        grad_norm_eval: global norm of the gradients
    """
    fd = {
        # inputs
        self.s: batch_data.observations,
        self.target_action: batch_data.target,
        self.pred_flag: batch_data.mask,
        self.slen: batch_data.seqlen,
        self.hs: DNC.zero_state(self.config, self.config.batch_size),
        self.lr: lr,
        # extra info
        self.eval_acc_placeholder: self.eval_acc,
    }

    loss_eval, grad_norm_eval, summary, _ = self.sess.run(
        [self.loss, self.grad_norm, self.merged, self.train_op], feed_dict=fd)

    # tensorboard stuff
    self.file_writer.add_summary(summary, t)

    return loss_eval, grad_norm_eval
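# Note on `batch_data`: the update above only assumes a container exposing
# `.observations`, `.target`, `.mask` and `.seqlen`, i.e. padded batch tensors
# plus per-sequence lengths. A minimal sketch of such a container (illustrative
# only; the repo's actual `DatasetTensors` definition lives elsewhere):
#
#     from collections import namedtuple
#
#     DatasetTensors = namedtuple(
#         'DatasetTensors', ['observations', 'target', 'mask', 'seqlen'])
#
#     # observations: float array [batch, max_len, input_dim]
#     # target:       float array [batch, max_len, target_dim]
#     # mask:         0/1 array   [batch, max_len], 1 where a prediction is scored
#     # seqlen:       int32 array [batch], true (unpadded) length of each sequence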
def update_step(self, t, replay_buffer, lr):
    """
    Performs an update of parameters by sampling from replay_buffer

    Args:
        t: number of iteration (episode and move)
        replay_buffer: ReplayBuffer instance; .sample_batch() gives batches
        lr: (float) learning rate
    Returns:
        loss_eval: (Q - Q_target)^2 loss evaluated on the batch
        grad_norm_eval: global norm of the gradients
    """
    (s_batch, slen_batch, a_batch, past_a_batch, r_batch, done_mask_batch,
     seq_mask_batch, sp_batch, splen_batch) = replay_buffer.sample_batch(
         self.config.batch_size)

    fd = {
        # inputs
        self.s: s_batch,
        self.slen: slen_batch,
        self.hs: DNC.zero_state(self.config, self.config.batch_size),
        self.a: a_batch,
        self.past_a: past_a_batch,
        self.r: r_batch,
        self.sp: sp_batch,
        self.splen: splen_batch,
        self.hsp: DNC.zero_state(self.config, self.config.batch_size),
        self.done_mask: done_mask_batch,
        self.seq_mask: seq_mask_batch,
        self.lr: lr,
        # extra info
        self.avg_reward_placeholder: self.avg_reward,
        self.max_reward_placeholder: self.max_reward,
        self.std_reward_placeholder: self.std_reward,
        self.avg_q_placeholder: self.avg_q,
        self.max_q_placeholder: self.max_q,
        self.std_q_placeholder: self.std_q,
        self.eval_reward_placeholder: self.eval_reward,
    }

    loss_eval, grad_norm_eval, summary, _ = self.sess.run(
        [self.loss, self.grad_norm, self.merged, self.train_op], feed_dict=fd)

    # tensorboard stuff
    self.file_writer.add_summary(summary, t)

    return loss_eval, grad_norm_eval
def evaluate(self, model_i, curri_idx=None, env=None, num_episodes=None):
    """
    Evaluation with same procedure as the training
    """
    if curri_idx is None:
        curri_idx = -1

    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    accs = []

    for i in range(num_episodes):
        encoding_batch = []
        predflag_batch = []
        target_action_batch = []
        slen_batch = []
        max_len = 0
        for j in range(self.config.batch_size):
            #config = self.config
            #config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
            #self.env.reset(config) # h x w x c
            encoding, target_action, predflag = model_i.gen_sample_seq(
                self.config.ndigits, self.config.nway)
            encoding_batch.append(encoding[None])
            predflag_batch.append(predflag[None])
            target_action_batch.append(target_action[None])
            slen_batch.append(encoding.shape[0])
            if encoding.shape[0] > max_len:
                max_len = encoding.shape[0]

        # zero-pad every sequence along time to max_len and stack into a batch
        batch_data = DatasetTensors(
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in encoding_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in target_action_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1]])], axis=1)
                for x in predflag_batch], axis=0),
            np.array(slen_batch).astype('int32'))

        h_state = DNC.zero_state(self.config, batch_size=self.config.batch_size)
        pred_action, h_state = self.sess.run(
            [self.q, self.hs_out],
            feed_dict={self.s: batch_data.observations,
                       self.hs: h_state,
                       self.slen: batch_data.seqlen})

        # a sequence counts as correct only if every masked prediction matches the target
        for j in range(self.config.batch_size):
            accs.append(
                (pred_action[j] * np.expand_dims(batch_data.mask[j], 1) ==
                 batch_data.target[j] * np.expand_dims(batch_data.mask[j], 1)).reshape(-1).all())

    avg_acc = np.mean(accs)

    if num_episodes > 1:
        msg = "Average acc: {:04.2f}".format(avg_acc)
        self.logger.info(msg)

    return avg_acc
def update_step(self, t, lr, batch_data):
    """
    Performs an update of parameters from a pre-assembled batch

    Args:
        t: number of iteration (episode and move)
        lr: (float) learning rate
        batch_data: DatasetTensors batch with observations, target, mask and seqlen
    Returns:
        loss_eval: loss evaluated on the batch
        grad_norm_eval: global norm of the gradients
    """
    fd = {
        # inputs
        self.s: batch_data.observations,
        self.target_action: batch_data.target,
        self.pred_flag: batch_data.mask,
        self.slen: batch_data.seqlen,
        self.hs: DNC.zero_state(self.config, self.config.batch_size),
        self.lr: lr,
        # extra info
        self.eval_acc_placeholder: self.eval_acc,
    }

    loss_eval, grad_norm_eval, summary, _ = self.sess.run(
        [self.loss, self.grad_norm, self.merged, self.train_op], feed_dict=fd)

    # optional profiling variant (requires `from tensorflow.python.client import timeline`)
    '''
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    loss_eval, grad_norm_eval, summary, _ = self.sess.run(
        [self.loss, self.grad_norm, self.merged, self.train_op],
        feed_dict=fd, options=run_options, run_metadata=run_metadata)

    # Create the Timeline object, and write it to a json
    tl = timeline.Timeline(run_metadata.step_stats)
    ctf = tl.generate_chrome_trace_format()
    with open('timeline.json', 'w') as f:
        f.write(ctf)
    '''

    # tensorboard stuff
    self.file_writer.add_summary(summary, t)

    return loss_eval, grad_norm_eval
def evaluate(self, env=None, num_episodes=None):
    """
    Evaluation with same procedure as the training
    """
    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env
    env.state.is_render_image = self.config.render_test

    # replay memory to play
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        h_state = DNC.zero_state(self.config, batch_size=1)
        slen = np.ones(1).astype('int32')
        action = 0
        for j in range(50):
            if self.config.render_test:
                env.render()

            #### for replay_buffer
            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            action, action_q, h_state = self.get_action([q_input], h_state, slen, [action])
            #print(action, action_q)

            # perform action in env
            new_state, reward, done = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
        self.logger.info(msg)

    return avg_reward
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset() # h x w x c
        h_state = DNC.zero_state(self.config, batch_size=1)
        slen = np.ones(1).astype('int32')
        action = 0
        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            #### for replay_buffer
            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # choose action according to current Q and exploration
            # (q_vals is kept distinct from the q_values deque to avoid shadowing it)
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer, lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start) and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[("Loss", loss_eval),
                                              ("Avg R", self.avg_reward),
                                              ("Max R", np.max(rewards)),
                                              ("eps", exp_schedule.epsilon),
                                              ("Grads", grad_eval),
                                              ("Max Q", self.max_q),
                                              ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
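# The schedules passed to train() above are only assumed to expose `.epsilon`,
# `.update(t)` and, for the exploration schedule, `.get_action(best_action)`.
# A minimal sketch of such a pair, loosely following the linear-annealing
# pattern common in DQN code (class names and constructor arguments here are
# illustrative, not necessarily the repo's actual classes):
#
#     import numpy as np
#
#     class LinearSchedule(object):
#         def __init__(self, eps_begin, eps_end, nsteps):
#             self.epsilon = eps_begin
#             self.eps_begin, self.eps_end, self.nsteps = eps_begin, eps_end, nsteps
#
#         def update(self, t):
#             # linearly interpolate from eps_begin to eps_end over nsteps, then hold
#             frac = min(float(t) / self.nsteps, 1.0)
#             self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)
#
#     class LinearExploration(LinearSchedule):
#         def __init__(self, num_actions, eps_begin, eps_end, nsteps):
#             self.num_actions = num_actions
#             super(LinearExploration, self).__init__(eps_begin, eps_end, nsteps)
#
#         def get_action(self, best_action):
#             # with probability epsilon take a random action, otherwise the greedy one
#             if np.random.random() < self.epsilon:
#                 return np.random.randint(self.num_actions)
#             return best_action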
def evaluate(self, cr_schedule, curri_idx=None, env=None, num_episodes=None):
    """
    Evaluation with same procedure as the training
    """
    if curri_idx is None:
        curri_idx = -1

    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    accs = []
    gt_len = []

    for i in range(num_episodes):
        config = self.config
        config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
        env.reset(config) # h x w x c
        h_state = DNC.zero_state(config, batch_size=1)

        encoding, predflag, target_action = env.prepare_seq()
        slen = np.array(encoding.shape[0]).astype('int32')

        # describe graph, query and planning
        h_state = self.sess.run(self.hs_out,
                                feed_dict={self.s: encoding[None],
                                           self.hs: h_state,
                                           self.slen: slen})

        past_state = -1
        past_action_onehot = -1
        path_len = 0
        for step in range(config.max_step_len):
            gt_action = env.get_gt_action()
            next_state = env.next_state(gt_action)
            if self.config.use_transition_only_during_answering:
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[-1, -1, past_action_onehot]]).astype('int32'),
                    config.ndigits, config.nway)
            else:
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[env.current_state, next_state, -1]]).astype('int32'),
                    config.ndigits, config.nway)
            #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[env.current_state, env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
            current_encoding = np.concatenate([current_encoding, np.array([[0, 1]])], axis=1)

            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={self.s: current_encoding[None],
                           self.hs: h_state,
                           self.slen: np.ones(1).astype('int32')})

            past_state = env.current_state
            _, done, past_action_onehot = env.step(pred_action.reshape(-1))
            path_len += 1
            if done:
                break

        # correct only if the walk reached the target in exactly the ground-truth path length
        accs.append(len(env.path[env.src_state]) == path_len
                    and env.current_state == env.target_state)
        gt_len.append(len(env.path[env.src_state]))

    avg_acc = np.mean(accs)

    if num_episodes > 1:
        msg = "Average acc: {:04.2f}".format(avg_acc)
        self.logger.info(msg)

    return avg_acc
def train(self, beta_schedule, lr_schedule, cr_schedule):
    """
    Performs training of Q

    Args:
        beta_schedule: Schedule whose .epsilon is passed to self.get_action()
            together with the predicted and ground-truth actions (controls how
            the two are mixed during training)
        lr_schedule: Schedule for learning rate
        cr_schedule: curriculum schedule; cr_schedule[lesson] gives
            (n_node, k_ring, p_rewiring, path_len_limit, planning_len)
    """
    self.init_averages()

    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1

        config = self.config
        config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
        self.env.reset(config) # h x w x c
        h_state = DNC.zero_state(config, batch_size=1)

        encoding, predflag, target_action = self.env.prepare_seq()
        slen = np.array(encoding.shape[0]).astype('int32')

        # describe graph, query and planning
        h_state = self.sess.run(self.hs_out,
                                feed_dict={self.s: encoding[None],
                                           self.hs: h_state,
                                           self.slen: slen})

        past_state = -1
        past_action_onehot = -1
        encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
        predflag_a = np.zeros(config.max_step_len)
        target_action_a = np.zeros([config.max_step_len, target_action.shape[1]])

        for i in range(config.max_step_len):
            if self.config.use_transition_only_during_answering:
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[-1, -1, past_action_onehot]]).astype('int32'),
                    config.ndigits, config.nway)
            else:
                current_encoding = GraphWorld.convert_triplets_to_encoding(
                    np.array([[past_state, self.env.current_state, past_action_onehot]]).astype('int32'),
                    config.ndigits, config.nway)
            #current_encoding = GraphWorld.convert_triplets_to_encoding(np.array([[self.env.current_state, self.env.target_state, past_action_onehot]]).astype('int32'), config.ndigits, config.nway)
            current_encoding = np.concatenate([current_encoding, np.array([[0, 1]])], axis=1)

            gt_action = self.env.get_gt_action()
            encoding_a[i, :] = current_encoding[0]
            predflag_a[i] = 1
            target_action_a[i, :] = gt_action

            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={self.s: current_encoding[None],
                           self.hs: h_state,
                           self.slen: np.ones(1).astype('int32')})

            action = self.get_action(pred_action.reshape(-1), gt_action, beta_schedule.epsilon)

            past_state = self.env.current_state
            _, done, past_action_onehot = self.env.step(action)
            slen += 1
            if done:
                break

        batch_data = DatasetTensors(
            np.concatenate([encoding, encoding_a], axis=0)[None],
            np.concatenate([target_action, target_action_a], axis=0)[None],
            np.concatenate([predflag, predflag_a], axis=0)[None],
            slen)

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if (t % config.log_freq == 0) and (t % config.learning_freq == 0):
            self.update_averages(scores_eval)
            beta_schedule.update(t)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= config.nsteps_train:
            break

        if last_eval >= config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % (t))
            scores_eval += [self.evaluate(cr_schedule, curri_idx)]
            if scores_eval[-1] > 0.8:
                curri_idx += 1
                msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                self.logger.info(msg)
                self.logger.info("----------Start Computing Final Score----------")
                scores_eval += [self.evaluate(cr_schedule)]
                self.logger.info("----------Finish Computing Final Score----------")

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(cr_schedule)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
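# The three-argument `self.get_action(pred_action, gt_action, beta)` used in the
# train() above is assumed to do scheduled-sampling style mixing between the
# network's prediction and the environment's ground-truth action, with the rate
# annealed by beta_schedule. A minimal sketch under that assumption (whether
# beta is the probability of the oracle or of the policy action is a detail of
# the actual implementation, which lives elsewhere in the repo):
#
#     def get_action(self, pred_action, gt_action, beta):
#         # pred_action: per-action scores predicted by the network
#         # gt_action:   ground-truth action vector from the environment
#         # beta:        probability of following the ground truth this step
#         if np.random.random() < beta:
#             return gt_action
#         return pred_action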