def run_epoch(self, sess, epoch, train_set, lr_schedule):
    """Performs an epoch of training"""
    # for logging
    tic = time.time()
    losses = 0
    nbatches = (len(train_set) + self.config.batch_size - 1) // self.config.batch_size
    prog = Progbar(target=nbatches)

    # iterate over minibatches
    for i, (img, formula) in enumerate(minibatches(train_set, self.config.batch_size)):
        # get feed dict
        fd = self.get_feed_dict(img, training=True, formula=formula,
                                lr=lr_schedule.lr, dropout=self.config.dropout)

        # update step
        loss_eval, _, summary = sess.run([self.loss, self.train_op, self.merged],
                                         feed_dict=fd)
        self.file_writer.add_summary(summary, epoch * nbatches + i)
        losses += loss_eval

        # logging
        prog.update(i + 1, values=[("loss", loss_eval),
                                   ("perplexity", np.exp(loss_eval))],
                    exact=[("lr", lr_schedule.lr)])

        # update learning rate
        lr_schedule.update(batch_no=epoch * nbatches + i)

    toc = time.time()
    # average over the number of completed batches (was max(i, 1))
    self.config.logger.info("Epoch {} - time: {:04.2f}, loss: {:04.4f}, lr: {:04.5f}".format(
        epoch, toc - tic, losses / float(i + 1), lr_schedule.lr))
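# The loop above assumes a `minibatches` helper that batches (image, formula)
# pairs. A minimal sketch, assuming `train_set` is a list of pairs; the
# project's real helper may shuffle or bucket by formula length.
def minibatches(dataset, batch_size):
    """Yield (images, formulas) lists of at most batch_size pairs."""
    imgs, formulas = [], []
    for img, formula in dataset:
        if len(imgs) == batch_size:
            yield imgs, formulas
            imgs, formulas = [], []
        imgs.append(img)
        formulas.append(formula)
    if imgs:  # final, possibly smaller batch
        yield imgs, formulas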
def train(self, lr_schedule):
    """Performs training of Q

    Args:
        lr_schedule: Schedule for learning rate
    """
    self.init_averages()
    t = last_eval = 0  # time control of nb of steps
    scores_eval = []   # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)
    rcopy = RepeatCopy(num_bits=self.config.num_bits,
                       batch_size=self.config.batch_size,
                       min_length=self.config.min_length,
                       max_length=self.config.max_length,
                       min_repeats=self.config.min_repeats,
                       max_repeats=self.config.max_repeats)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1
        config = self.config
        batch_data = rcopy()

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if (t % config.log_freq == 0) and (t % config.learning_freq == 0):
            self.update_averages(scores_eval)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= config.nsteps_train:
            break

        if last_eval >= config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def _run_epoch(self, config, train_set, val_set, epoch, lr_schedule):
    """Performs an epoch of training

    Args:
        config: Config instance
        train_set: Dataset instance
        val_set: Dataset instance
        epoch: (int) id of the epoch, starting at 0
        lr_schedule: LRSchedule instance that takes care of learning proc

    Returns:
        score: (float) model will select weights that achieve the highest score
    """
    # logging
    batch_size = config.batch_size
    nbatches = (len(train_set) + batch_size - 1) // batch_size
    prog = Progbar(nbatches)

    # iterate over dataset
    for i, (img, formula) in enumerate(minibatches(train_set, batch_size)):
        # get feed dict
        fd = self._get_feed_dict(img, training=True, formula=formula,
                                 lr=lr_schedule.lr, dropout=config.dropout)

        # update step
        _, loss_eval = self.sess.run([self.train_op, self.loss], feed_dict=fd)
        prog.update(i + 1, [("loss", loss_eval),
                            ("perplexity", np.exp(loss_eval)),
                            ("lr", lr_schedule.lr)])

        # update learning rate
        lr_schedule.update(batch_no=epoch * nbatches + i)

    # logging
    self.logger.info("- Training: {}".format(prog.info))

    # evaluation
    config_eval = Config({"dir_answers": self._dir_output + "formulas_val/",
                          "batch_size": config.batch_size})
    scores = self.evaluate(config_eval, val_set)
    score = scores[config.metric_val]
    lr_schedule.update(score=score)

    return score
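# `lr_schedule.update` is called above with either `batch_no` (time-based
# decay) or `score` (decay when the validation metric stalls). A minimal
# sketch of a schedule that supports both hooks; the constructor arguments
# and the exact decay rules are assumptions, not the project's real class.
class LRSchedule(object):
    """Learning-rate schedule with time-based and score-based decay."""

    def __init__(self, lr_init=1e-3, decay_rate=0.99, start_decay=0):
        self.lr = lr_init
        self.decay_rate = decay_rate
        self.start_decay = start_decay  # batch at which decay kicks in
        self.best_score = None

    def update(self, batch_no=None, score=None):
        # exponential decay once training has run long enough
        if batch_no is not None and batch_no >= self.start_decay:
            self.lr *= self.decay_rate
        # halve the rate whenever the eval score fails to improve
        if score is not None:
            if self.best_score is not None and score <= self.best_score:
                self.lr /= 2.0
            if self.best_score is None or score > self.best_score:
                self.best_score = score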
def train():
    # s, _, loss, y = autoencoder()
    s, _, loss, y, recon_loss, KL = vae()
    train_op, grad_norm = add_optimizer_op(loss)

    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)
    logger = get_logger(config.log_path)

    train_data, eval_data = load_data()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    for i in range(config.epoch_num):  # each epoch
        # train
        prog = Progbar(target=1 + int(len(train_data) / config.batch_size))
        step = 1
        for batch in minibatches(train_data, config.batch_size):
            loss_eval, grad_norm_eval, y_train, _, recon_loss_train, KL_train = sess.run(
                [loss, grad_norm, y, train_op, recon_loss, KL], feed_dict={s: batch})
            prog.update(step, [("train loss", loss_eval),
                               ("grad norm", grad_norm_eval),
                               ("recon loss", recon_loss_train),
                               ("KL", KL_train)])  # was mislabeled 'VLBO'; this is the KL term
            step += 1
        plt.imshow(y_train[0, :, :, 0], cmap='Greys')
        plt.savefig('y.png')

        # eval
        # prog = Progbar(target=1 + int(len(eval_data) / config.batch_size))
        # step = 1
        # losses = []
        # for batch in minibatches(eval_data, config.batch_size):
        #     loss_eval = sess.run(loss, feed_dict={s: batch})
        #     prog.update(step, [("eval loss", loss_eval)])
        #     losses.append(loss_eval)
        #     step += 1
        # avg_loss = np.mean(losses)
        # sigma_loss = np.sqrt(np.var(losses) / len(losses))
        # print("")
        # msg = "Average loss: {:04.2f} +/- {:04.2f}".format(avg_loss, sigma_loss)
        # logger.info(msg)

    save(sess)
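# The `recon_loss` and `KL` tensors fetched above are the two halves of the
# negative ELBO. A minimal sketch of how vae() might assemble them, assuming
# a diagonal-Gaussian encoder with outputs `mu` and `log_sigma_sq` and a
# reconstruction `y` of the input batch `s`; the names are assumptions.
def vae_loss(s, y, mu, log_sigma_sq):
    """Return (loss, recon_loss, KL) as in the training loop above."""
    # per-example squared reconstruction error, summed over H, W, C
    recon_loss = tf.reduce_mean(tf.reduce_sum(tf.square(s - y), axis=[1, 2, 3]))
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior
    kl = -0.5 * tf.reduce_mean(
        tf.reduce_sum(1.0 + log_sigma_sq - tf.square(mu) - tf.exp(log_sigma_sq),
                      axis=1))
    return recon_loss + kl, recon_loss, kl  # total is the negative ELBO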
def train(self, sess, summary_op):
    allBatches = get_batches(self.all_data, self.batch_size, True, toy)
    if toy:
        prog = Progbar(target=(len(self.all_data) // 2) // self.batch_size)
    else:
        prog = Progbar(target=len(self.all_data[0]) // self.batch_size)
    fetches = [self.train_op, self.loss, summary_op]  # list of desired outputs

    for i, batch in enumerate(allBatches):
        if toy:
            questions, answers = batch[0], batch[1]
            enc_seq_len = get_sequence_length(questions)
            dec_seq_len = [self.max_dec_len for sen in answers]  # get_sequence_length(answers)
            seq_len = {"enc": enc_seq_len, "dec": dec_seq_len}
            labels = [[letter.index(1) for letter in word] for word in answers]
            labels = np.asarray(labels)
            feed_dict = self.create_feed_dict(questions, answers, labels, seq_len)
        else:
            questions_labels, answers_labels = batch[0], batch[1]
            seq_len = {"enc": [len(q) for q in questions_labels],
                       "dec": [len(a) for a in answers_labels]}
            # Pad them to be of particular size.
            questions_labels = [self.addPaddingEnc(q) for q in questions_labels]
            answers_labels = [self.addPaddingDec(a) for a in answers_labels]
            feed_dict = self.create_feed_dict_embeddings(questions_labels,
                                                         answers_labels, seq_len)

        _, loss, summary = sess.run(fetches, feed_dict)
        prog.update(i + 1, [("train loss", loss)])
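# The toy branch relies on a `get_sequence_length` helper. A minimal sketch
# under the assumption that inputs are one-hot sequences whose padding steps
# are all-zero vectors; the real helper may compute lengths differently.
def get_sequence_length(batch):
    """Number of non-padding timesteps per example in a one-hot batch."""
    return [sum(1 for step in seq if any(step)) for seq in batch]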
def train(self, model_i, lr_schedule):
    """Performs training of Q

    Args:
        model_i: model that generates the sample sequences
        lr_schedule: Schedule for learning rate
    """
    self.init_averages()
    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1

        encoding_batch = []
        predflag_batch = []
        target_action_batch = []
        slen_batch = []
        max_len = 0
        for i in range(self.config.batch_size):
            # config = self.config
            # config.n_node, config.k_ring, config.p_rewiring, \
            #     config.path_len_limit, config.planning_len = cr_schedule[curri_idx]
            # self.env.reset(config)
            # h x w x c
            encoding, target_action, predflag = model_i.gen_sample_seq(
                self.config.ndigits, self.config.nway)
            encoding_batch.append(encoding[None])
            predflag_batch.append(predflag[None])
            target_action_batch.append(target_action[None])
            slen_batch.append(encoding.shape[0])
            if encoding.shape[0] > max_len:
                max_len = encoding.shape[0]

        # zero-pad every sequence to max_len along the time axis
        batch_data = DatasetTensors(
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in encoding_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1], x.shape[2]])], axis=1)
                for x in target_action_batch], axis=0),
            np.concatenate([np.concatenate(
                [x, np.zeros([1, max_len - x.shape[1]])], axis=1)
                for x in predflag_batch], axis=0),
            np.array(slen_batch).astype('int32'))

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon, batch_data)

        # logging stuff
        if (t % self.config.log_freq == 0) and (t % self.config.learning_freq == 0):
            self.update_averages(scores_eval)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval), ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= self.config.nsteps_train:
            break

        if last_eval >= self.config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate(model_i)]
            # if scores_eval[-1] > 0.8:
            #     curri_idx += 1
            #     msg = "Upgrade to lesson {:d}".format(int(curri_idx))
            #     self.logger.info(msg)
            #     self.logger.info("----------Start Computing Final Score----------")
            #     scores_eval += [self.evaluate(model_i)]
            #     self.logger.info("----------Finish Computing Final Score----------")

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_i)]  # was evaluate(cr_schedule): undefined name
    export_plot(scores_eval, "Scores", self.config.plot_output)
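# The nested np.concatenate padding above is easy to misread. A hypothetical
# `pad_time_axis` helper (the name is mine, not the project's) that expresses
# the same zero-padding of a [1, T, ...] array up to max_len:
def pad_time_axis(x, max_len, axis=1):
    """Zero-pad a [1, T, ...] array to [1, max_len, ...] along `axis`."""
    pad = [(0, 0)] * x.ndim
    pad[axis] = (0, max_len - x.shape[axis])
    return np.pad(x, pad, mode="constant")

# e.g. the first DatasetTensors field above is equivalent to:
# np.concatenate([pad_time_axis(x, max_len) for x in encoding_batch], axis=0)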
def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBufferAC(self.FLAGS.buffer_size, self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0  # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)
    self.prog2 = Progbar(target=self.FLAGS.train_steps)

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
        total_reward = 0
        ep_len = 0
        state = self.env.reset()
        reward = 0
        first = 1
        q_input = None

        # Run for 1 episode and update the buffer
        while True:
            ep_len += 1

            # replay memory stuff
            if first == 1:
                first = 0
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

            # choose action according to current Q and exploration
            best_action, q_vals = self.network.get_best_action(q_input)
            action = exp_schedule.get_action(best_action)
            orig_val = self.network.calcState(q_input)

            # store q values (local `q_vals` avoids shadowing the deque)
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, new_reward, done, info = self.env.step(action)
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            new_val = self.network.calcState(q_input)

            orig_val = orig_val[0][0]
            new_val = new_val[0][0]
            print(orig_val, new_reward, done, new_val, ep_len)

            if not done:  # Non-terminal state.
                target = reward + (self.FLAGS.gamma * new_val)
            else:
                target = reward + (self.FLAGS.gamma * new_reward)
            best_val = max(orig_val, target)
            actor_delta = new_val - orig_val

            replay_buffer.store_effect(idx - 1, action, new_reward, done,
                                       best_val, actor_delta)
            state = new_state
            if done:
                replay_buffer.store_effect(idx, action, 0, done, 0, 0)

            # Count reward
            total_reward += new_reward
            reward = new_reward

            # Stop at end of episode
            if done:
                break

        old_t = t
        temp_ep_len = ep_len
        # Learn the critic from replay
        while True:
            t += 1
            temp_ep_len -= 1
            if (t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0):
                if replay_buffer.can_sample(self.FLAGS.batch_size):
                    loss_eval, grad_eval = self.network.update_critic_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)
            # Update logs if necessary
            if (t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
                self.update_logs2(t, loss_eval, rewards, exp_schedule.epsilon,
                                  grad_eval, lr_schedule.epsilon)
            if temp_ep_len <= 0 or t >= self.FLAGS.train_steps:
                break

        # Update episodic rewards
        rewards.append(total_reward)

        # Learn the actor using replay
        while True:
            t += 1
            ep_len -= 1
            # Make train step if necessary
            if (t > self.FLAGS.learn_start) and (t % self.FLAGS.learn_every == 0):
                if replay_buffer.can_sample(self.FLAGS.batch_size):
                    loss_eval, grad_eval = self.network.update_actor_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
            # Update logs if necessary
            if (t > self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                self.update_logs(t, loss_eval, rewards, exp_schedule.epsilon,
                                 grad_eval, lr_schedule.epsilon)
            elif (t < self.FLAGS.learn_start) and (t % self.FLAGS.log_every == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.FLAGS.learn_start))
                sys.stdout.flush()
            if (t > self.FLAGS.learn_start) and (t % self.FLAGS.check_every == 0):
                # Evaluate current model
                scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
                # Save current model
                self.network.save()
                # Record video of current model
                if self.FLAGS.record:
                    self.record()
            if ep_len <= 0 or t >= self.FLAGS.train_steps:
                break

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
def train(self, model_a, exp_schedule, lr_schedule):
    """Performs training of Q

    Args:
        model_a: agent model that follows the proposed plans
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train
    model_a.env.state.is_render_image = model_a.config.render_train
    orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                       np.array([0, -1]), np.array([1, 0])]

    npath = self.config.npath    # paths to generate in each environment
    nquery = self.config.nquery  # queries to generate in each environment
    max_plan_len = self.config.max_plan_len
    ndigits = self.config.ndigits
    nway = self.config.nway
    num_classes = len(self.env.state.xmap.item_class_id)

    # three steps:
    # 1. sample paths from the teacher environment and pass to dnc
    # 2. get immediate reward from whether agent could reach the subgoal
    # 3. sample query paths and ask agent to follow the plan, get the final big reward
    # -- train one step after each teacher's move

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        self.env.reset()
        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)
        dnc_state = DNC.zero_state(self.config, batch_size=1)
        h_state = (np.zeros([1, self.config.h_size]),
                   np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0

        # sample paths
        for i in range(npath):
            state_seq, path_loc, path_ori = self.env.teacher.gen_sample_seq(self.env.state)
            state_seq_encoding = DRQN_planner.encode_state(state_seq, ndigits, nway)
            goal_state_seq = np.reshape(
                state_seq, [state_seq.shape[0], 4, 3, 3, num_classes + 2]).astype('bool')
            #### missing could be everything ####
            goal_state_seq = np.tile(goal_state_seq[:, :, :, :, [num_classes]],
                                     [1, 1, 1, 1, num_classes + 2]) + goal_state_seq
            #### treat missing observation as correct observation ####
            goal_state_seq[:, :, :, :, num_classes] = True
            #### transpose ####
            goal_state_seq = np.transpose(goal_state_seq, [0, 2, 3, 4, 1])
            path_len = state_seq.shape[0]
            mask_seq = np.logical_not(state_seq[:, :3, :, num_classes])
            flag_seq = np.zeros([path_len])
            flag_seq[-1] = 1
            model_a.env.state.teleport(model_a.env.agent, path_loc[0],
                                       orientation_map[path_ori[0]])

            for j in range(path_len):
                # get agate from dnc
                cur_dnc_in = np.concatenate(
                    [state_seq_encoding[j].reshape(-1), mask_seq[j].reshape(-1),
                     np.array([0, flag_seq[j]])], axis=0)
                agate_dnc_val = self.sess.run(
                    self.agate_dnc,
                    feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})
                agate_dnc_val = agate_dnc_val[0, 0]

                # get q value and sample action
                idx = replay_buffer.store_frame(state_seq[j])
                q_input = replay_buffer.encode_recent_observation()
                best_action, q_vals, h_state = self.get_best_action(
                    [q_input], h_state, slen, [action], [agate_dnc_val])
                action = exp_schedule.get_action(best_action)

                # store q values (local `q_vals` avoids shadowing the deque)
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # take action and update dnc
                cur_dnc_in[-2] = action
                dnc_state = self.sess.run(
                    self.hs_out_dnc,
                    feed_dict={self.s_dnc: cur_dnc_in[None], self.hs_dnc: dnc_state})

                # acquire reward
                reward = 0
                done = False
                if action == 1:
                    h_state_a = (np.zeros([1, model_a.config.h_size]),
                                 np.zeros([1, model_a.config.h_size]))
                    model_a.env.teacher.set_goal(goal_state_seq[j], path_loc[j])
                    reward_a = model_a.navi_goal(h_state_a, goal_state_seq[j])
                    if not model_a.env.teacher.goal_finish:
                        reward += -0.05
                    reward += -0.05
                model_a.env.state.teleport(model_a.env.agent, path_loc[j],
                                           orientation_map[path_ori[j]])

                # acquire final reward
                if i == npath - 1 and j == path_len - 1:
                    done = True
                    reward_list = list()
                    for k in range(nquery):
                        reward_list.append(0)
                        (src_inputs, tgt_inputs, src_loc, tgt_loc,
                         goal_obs_onehot_state) = self.env.teacher.gen_sample_query(
                            self.env.state)
                        src_inputs = DRQN_planner.encode_state(src_inputs, ndigits, nway)
                        tgt_inputs = DRQN_planner.encode_state(tgt_inputs, ndigits, nway)
                        path_dnc_val, target_ldm_dnc_val = self.sess.run(
                            [self.path_dnc, self.target_ldm_dnc],
                            feed_dict={self.hs_dnc: dnc_state,
                                       self.src_inputs_dnc: src_inputs[None],
                                       self.tgt_inputs_dnc: tgt_inputs[None],
                                       self.max_len_dnc: max_plan_len})
                        path_dnc_val = DRQN_planner.decode_state(
                            np.reshape(path_dnc_val[0], [max_plan_len, 3, 3, -1]),
                            ndigits, nway, num_classes + 2)
                        target_ldm_dnc_val = DRQN_planner.decode_state(
                            np.reshape(target_ldm_dnc_val[0], [3, 3, -1]),
                            ndigits, nway, num_classes + 2)
                        path_dnc_val_inner = np.argmax(path_dnc_val, axis=3)
                        target_ldm_dnc_val_inner = np.argmax(target_ldm_dnc_val, axis=2)
                        # truncate the plan at the first step that matches the target
                        cur_len = max_plan_len
                        for l in range(max_plan_len):
                            if (path_dnc_val_inner[l] == target_ldm_dnc_val_inner).all():
                                cur_len = l + 1
                                break
                        path_dnc_val = path_dnc_val[:cur_len]
                        path_dnc_val = np.concatenate(
                            [path_dnc_val, goal_obs_onehot_state[None]], 0)
                        #### modify goal state ####
                        #### missing could be everything ####
                        path_dnc_val = np.tile(path_dnc_val[:, :, :, [num_classes]],
                                               [1, 1, 1, num_classes + 2]) + path_dnc_val
                        #### treat missing observation as correct observation ####
                        path_dnc_val[:, :, :, num_classes] = True
                        model_a.env.state.teleport(model_a.env.agent, src_loc,
                                                   np.array([0, 1]))
                        h_state_a = (np.zeros([1, model_a.config.h_size]),
                                     np.zeros([1, model_a.config.h_size]))
                        for l in range(path_dnc_val.shape[0]):
                            cur_goal_state = path_dnc_val[l]
                            cur_goal_state = np.expand_dims(cur_goal_state, 3)
                            cur_goal_state = np.concatenate(
                                [np.rot90(cur_goal_state, 0), np.rot90(cur_goal_state, 1),
                                 np.rot90(cur_goal_state, 2), np.rot90(cur_goal_state, 3)],
                                3)
                            model_a.env.teacher.set_goal(cur_goal_state, tgt_loc)
                            reward_list[-1] += model_a.navi_goal(h_state_a, cur_goal_state)
                        if model_a.env.teacher.goal_finish:
                            reward_list[-1] += 10
                    reward += sum(reward_list) / len(reward_list)

                # store everything into replay buffer
                replay_buffer.store_effect(idx, action, agate_dnc_val, reward, done)

                t += 1
                last_eval += 1
                last_record += 1

                # perform a training step
                loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                       lr_schedule.epsilon)

                # logging stuff
                if ((t > self.config.learning_start)
                        and (t % self.config.log_freq == 0)
                        and (t % self.config.learning_freq == 0)):
                    self.update_averages(rewards, max_q_values, q_values, scores_eval)
                    exp_schedule.update(t)
                    lr_schedule.update(t)
                    if len(rewards) > 0:
                        prog.update(t + 1, exact=[
                            ("Loss", loss_eval), ("Avg R", self.avg_reward),
                            ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                            ("Grads", grad_eval), ("Max Q", self.max_q),
                            ("lr", lr_schedule.epsilon)])
                elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                    sys.stdout.write("\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start))
                    sys.stdout.flush()

                # count reward
                total_reward += reward
                if done or t >= self.config.nsteps_train:
                    break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def run_epoch(self, session, train, val, logger):
    num_samples = len(train["context"])
    # was int(np.ceil(num_samples) * 1.0 / batch_size): misplaced parenthesis
    num_batches = int(np.ceil(num_samples * 1.0 / self.config.training.batch_size))
    progress = Progbar(target=num_batches)
    best_f1 = 0
    losses = []

    for i, train_batch in enumerate(
            batches(train, is_train=True,
                    batch_size=self.config.training.batch_size,
                    window_size=self.config.training.window_size)):
        _, loss = self.optimize(session, train_batch)
        losses.append(loss)
        progress.update(i + 1, [("training loss", np.mean(losses))])

        # i never reaches num_batches under enumerate, so test the last index
        if i % self.config.training.eval_num == 0 or i == num_batches - 1:
            # Randomly get some samples from the dataset
            train_samples = get_random_samples(
                train, self.config.training.samples_used_for_evaluation)
            val_samples = get_random_samples(
                val, self.config.training.samples_used_for_evaluation)

            # First evaluate on the training set without using the best span
            f1_train, EM_train = self.evaluate_answer(session, train_samples,
                                                      use_best_span=False)
            # Then evaluate on the val set
            f1_val, EM_val = self.evaluate_answer(session, val_samples,
                                                  use_best_span=False)
            logging.info("Not using best span")
            logging.info("F1: {}, EM: {}, for {} training samples".format(
                f1_train, EM_train,
                self.config.training.samples_used_for_evaluation))
            logging.info("F1: {}, EM: {}, for {} validation samples".format(
                f1_val, EM_val,
                self.config.training.samples_used_for_evaluation))

            # Now evaluate using the best span
            f1_train, EM_train = self.evaluate_answer(session, train_samples,
                                                      use_best_span=True)
            f1_val, EM_val = self.evaluate_answer(session, val_samples,
                                                  use_best_span=True)
            logging.info("Using best span")
            logging.info("F1: {}, EM: {}, for {} training samples".format(
                f1_train, EM_train,
                self.config.training.samples_used_for_evaluation))
            logging.info("F1: {}, EM: {}, for {} validation samples".format(
                f1_val, EM_val,
                self.config.training.samples_used_for_evaluation))

            summaries_dict = {
                "f1_train": f1_train,
                "EM_train": EM_train,
                "f1_val": f1_val,
                "EM_val": EM_val,
                "training_loss": np.mean(losses),
            }
            logger.add_scalar_summary(
                self.cur_epoch_tensor.eval(session) * num_batches + i,
                summaries_dict)

            if f1_val > best_f1:
                self.save(session)
                best_f1 = f1_val
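# `get_random_samples` is assumed to subsample the dict-of-lists dataset
# layout used above (train["context"], etc.); a minimal sketch, not the
# project's real helper:
def get_random_samples(dataset, num_samples):
    """Draw num_samples examples at random, keeping the dict-of-lists layout."""
    idx = np.random.choice(len(dataset["context"]), num_samples, replace=False)
    return {key: [values[i] for i in idx] for key, values in dataset.items()}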
def train(self, exp_schedule, lr_schedule):
    """Runs a short greedy rollout (2000 steps) and saves hidden-layer embeddings

    Args:
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    last_frames = deque(maxlen=4)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    embeddings = []
    extractor = PongExtractor()
    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < 2000:
        total_reward = 0
        state = self.env.reset()
        last_frame = state
        last_frames.append(state)
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()
            feats = extractor.extract(np.squeeze(state))

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # choose action greedily from current Q
            best_action, q_vals = self.get_best_action(q_input)
            embedding = self.sess.run(self.hidden, feed_dict={self.s: [q_input]})[0]
            # embedding = self.sess.run(self.q, feed_dict={self.s: [q_input]})[0]
            embeddings.append(embedding)
            action = best_action

            frame = np.squeeze(state)
            scipy.misc.imsave('embeddings/breakout/breakout{}.png'.format(t), frame)

            # store q values (local `q_vals` avoids shadowing the deque)
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state
            total_reward += reward
            if done or t >= 2000:
                print(total_reward, t)
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    # last words
    print('Saving embeddings')
    np.save(open('embeddings/breakout/breakout.npy', 'wb'), np.vstack(embeddings))
class DQNAgent(BaseAgent):
    non_terminal_reward = 0

    def __init__(self, env, config, exp_schedule, lr_schedule, is_training_agent,
                 train_from_scratch=False, reward_after_somebody_died=False,
                 logger=None):
        """
        Initialize Q Network and env

        :param env: Game environment
        :param config: config (hyper-parameters) instance
        :param exp_schedule: exploration strategy for epsilon
        :param lr_schedule: schedule for learning rate
        :param logger: logger instance from logging module
        """
        super(DQNAgent, self).__init__()

        # Variables initialized in _build
        self._states = None
        self._actions = None
        self._rewards = None
        self._next_states = None
        self._done_mask = None
        self._learning_rate = None
        self._q_values = None
        self._target_q_values = None
        self._next_q_values = None
        self._update_target_op = None
        self._loss = None
        self._train_op = None
        self._grad_norm = None

        # Variables initialized in init_agent
        self._session = None
        self._avg_reward_placeholder = None
        self._max_reward_placeholder = None
        self._std_reward_placeholder = None
        self._avg_q_placeholder = None
        self._max_q_placeholder = None
        self._std_q_placeholder = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = None
        self._merged = None
        self._file_writer = None
        self._saver = None
        self._train_replay_buffer = None
        self._train_rewards = None
        self._train_max_q_values = None
        self._train_q_values = None
        self._avg_reward = None
        self._max_reward = None
        self._std_reward = None
        self._avg_q = None
        self._max_q = None
        self._std_q = None
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = None
        self._time_step = None
        self._progress_bar = None
        self._has_episode_started = None

        # Variables initialized in act.
        self._last_action = None
        self._last_idx = None
        self._enemy_count = None

        # Directory for training outputs
        if not os.path.exists(config.output_path):
            os.makedirs(config.output_path)

        self._logger = logger
        if logger is None:
            self._logger = get_logger(config.log_path)
        self._config = config
        self._env = env
        self._exp_schedule = exp_schedule
        self._lr_schedule = lr_schedule
        self._is_training_agent = is_training_agent
        self._train_from_scratch = train_from_scratch
        self._reward_after_somebody_died = reward_after_somebody_died
        self._total_reward = 0

        # Build model.
        self._build()

    def init_agent(self, id_, game_type):
        super(DQNAgent, self).init_agent(id_, game_type)
        # Assume the graph has been constructed.
        # Create a tf Session and run initializer of variables.
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        self._session = tf.Session(config=tf_config)

        # Tensorboard
        self._add_summary()

        # Initialize all variables.
        init = tf.global_variables_initializer()
        self._session.run(init)

        # Synchronise q and target_q networks.
        self._session.run(self._update_target_op)

        # for saving networks weights
        self._saver = tf.train.Saver()

        # Initialize replay buffer and variables.
        self._train_replay_buffer = ReplayBuffer(self._config.buffer_size,
                                                 self._config.state_history)
        self._train_rewards = deque(maxlen=self._config.num_episodes_test)
        self._train_max_q_values = deque(maxlen=1000)
        self._train_q_values = deque(maxlen=1000)
        self._init_averages()

        self._time_step = 0
        self._progress_bar = Progbar(target=self._config.nsteps_train)
        self._has_episode_started = False

        if not self._train_from_scratch:
            self._load()

    def act(self, obs, action_space):
        state = obs['board'][:, :, None]

        if not self._is_training_agent:
            # Act greedily when testing.
            if self._has_episode_started:
                self._train_replay_buffer.store_effect(
                    self._last_idx, self._last_action, 0, done=False)
            self._last_idx = self._train_replay_buffer.store_frame(state)
            q_input = self._train_replay_buffer.encode_recent_observation()
            action = self._get_action(q_input)
            self._last_action = action
            return action

        if self._has_episode_started:
            reward = DQNAgent.non_terminal_reward
            if self._reward_after_somebody_died:
                if len(self._character.enemies) < self._enemy_count:
                    reward = 1
            self._train(reward, done=False)

        self._enemy_count = len(self._character.enemies)
        self._time_step += 1

        # Replay buffer
        idx = self._train_replay_buffer.store_frame(state)
        q_input = self._train_replay_buffer.encode_recent_observation()

        # Choose action according to current Q and exploration
        # (local q_vals keeps the _train_q_values deque from being overwritten)
        best_action, q_vals = self._get_best_action(q_input)
        action = self._exp_schedule.get_action(best_action)
        self._train_max_q_values.append(max(q_vals))
        self._train_q_values += list(q_vals)

        self._last_action = action
        self._last_idx = idx

        if not self._has_episode_started:
            self._has_episode_started = True

        return action

    def episode_end(self, reward):
        """
        Updates to perform at the end of an episode
        """
        # Reset episode.
        self._has_episode_started = False

        if not self._is_training_agent:
            return

        self._train(reward, done=True)
        self._train_rewards.append(self._total_reward)
        # Reset total reward.
        self._total_reward = 0

        # TODO: Commented due to lack of evaluate() and record()
        # if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
        #     # evaluate our policy
        #     last_eval = 0
        #     print("")
        #     scores_eval += [self.evaluate()]
        #
        # if (t > self.config.learning_start) and self.config.record \
        #         and (last_record > self.config.record_freq):
        #     self.logger.info("Recording...")
        #     last_record = 0
        #     self.record()

    def shutdown(self):
        """
        Save trained results
        """
        if not self._is_training_agent:
            return

        self._logger.info("- Training done.")
        self._save()
        # TODO: Commented due to lack of evaluate()
        # scores_eval += [self.evaluate()]
        # DQNAgent.export_plot(scores_eval, "Scores", self.config.plot_output)

    def _train(self, reward, done):
        # Store the transition.
        self._train_replay_buffer.store_effect(
            self._last_idx, self._last_action, reward, done=done)

        # Perform a training step.
        loss_eval, grad_eval = self._train_step(
            self._time_step, self._train_replay_buffer, self._lr_schedule.epsilon)

        # Logging
        if self._time_step > self._config.learning_start \
                and self._time_step % self._config.log_freq == 0 \
                and self._time_step % self._config.learning_freq == 0:
            self._update_averages(self._train_rewards, self._train_max_q_values,
                                  self._train_q_values)
            self._exp_schedule.update(self._time_step)
            self._lr_schedule.update(self._time_step)
            if len(self._train_rewards) > 0:
                self._progress_bar.update(
                    self._time_step + 1,
                    exact=[("Loss", loss_eval),
                           ("Avg R", self._avg_reward),
                           ("Max R", np.max(self._train_rewards)),
                           ("eps", self._exp_schedule.epsilon),
                           ("Grads", grad_eval),
                           ("Max Q", self._max_q),
                           ("lr", self._lr_schedule.epsilon)])
        elif self._time_step < self._config.learning_start \
                and self._time_step % self._config.log_freq == 0:
            sys.stdout.write("\rPopulating the memory {}/{}...".format(
                self._time_step, self._config.learning_start))
            sys.stdout.flush()

        # Accumulate reward
        self._total_reward += reward

    def _build(self):
        """
        Build model by adding all necessary variables.
        """
        # Add placeholders.
        self._add_placeholders_op()

        # Compute Q values of state.
        states = self._process_state(self._states)
        self._q_values = self._get_q_values_op(states, scope='q', reuse=False)

        # Compute Q values of next state.
        next_states = self._process_state(self._next_states)
        self._target_q_values = self._get_q_values_op(next_states,
                                                      scope='target_q', reuse=False)

        # for Double DQN
        self._next_q_values = self._get_q_values_op(next_states, scope='q', reuse=True)

        # Add update operator for target network.
        self._add_update_target_op('q', 'target_q')

        # Add square loss.
        self._add_loss_op(self._q_values, self._target_q_values, self._next_q_values)

        # Add optimizer for the main networks.
        self._add_optimizer_op('q')

    def _add_placeholders_op(self):
        """
        Adds placeholders to the graph

        These placeholders are used as inputs by the rest of the model building
        and will be fed data during training. Note that when "None" is in a
        placeholder's shape, it's flexible (so we can use different batch sizes
        without rebuilding the model).
        """
        state_shape = list(self._env.observation_space.shape)
        self._states = tf.placeholder(tf.uint8,
                                      (None, 11, 11, self._config.state_history))
        self._actions = tf.placeholder(tf.int32, (None,))
        self._rewards = tf.placeholder(tf.float32, (None,))
        self._next_states = tf.placeholder(tf.uint8,
                                           (None, 11, 11, self._config.state_history))
        self._done_mask = tf.placeholder(tf.bool, (None,))
        self._learning_rate = tf.placeholder(tf.float32, ())

    def _process_state(self, state):
        """
        Processing of state

        State placeholders are tf.uint8 for fast transfer to GPU.
        Need to cast it to float32 for the rest of the tf graph.

        :param state: node of tf graph of shape = (batch_size, height, width,
            nchannels) of type tf.uint8; values are rescaled from [0, 255]
            to [0, 1]
        """
        state = tf.cast(state, tf.float32)
        state /= self._config.high
        return state

    def _get_q_values_op(self, state, scope, reuse=False):
        """
        Returns Q values for all actions

        :param state: (tf tensor) shape = (batch_size, img height, img width, nchannels)
        :param scope: (string) scope name, that specifies if target network or not
        :param reuse: (bool) reuse of variables in the scope
        :return out: (tf tensor) of shape = (batch_size, num_actions)
        """
        num_actions = self._env.action_space.n
        out = state
        with tf.variable_scope(scope, reuse=reuse):
            x = layers.conv2d(state, 32, 5, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 4, stride=2, padding='SAME')
            x = layers.conv2d(x, 64, 3, stride=1, padding='SAME')
            x = layers.flatten(x)
            x = layers.fully_connected(x, 512)
            out = layers.fully_connected(x, num_actions, activation_fn=None)
        return out

    def _add_update_target_op(self, q_scope, target_q_scope):
        """
        update_target_op will be called periodically to copy Q network weights
        to target Q network.

        In DQN we maintain two identical Q networks with two different sets of
        weights, distinguished in tensorflow by two variable scopes: one for
        the target network, one for the regular network. (If you're not
        familiar with the scope mechanism, see
        https://www.tensorflow.org/programmers_guide/variable_scope)

        Periodically we assign all weights of the target network the values of
        the corresponding variables of the regular network; this op performs
        that assignment when run.

        :param q_scope: (string) name of the scope of variables for q
        :param target_q_scope: (string) name of the scope of variables for
            the target network
        """
        tar_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=target_q_scope)
        q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=q_scope)
        self._update_target_op = tf.group(
            *[tf.assign(tar_vars[i], q_vars[i]) for i in range(len(tar_vars))])

    def _add_loss_op(self, q, target_q, next_q):
        """
        Sets the loss of a batch; self._loss is a scalar

        :param q: (tf tensor) shape = (batch_size, num_actions), Q(s, a)
        :param target_q: (tf tensor) shape = (batch_size, num_actions), Q_target(s', a')
        :param next_q: Q(s', a') for Double DQN
        """
        num_actions = self._env.action_space.n
        not_done = 1 - tf.cast(self._done_mask, tf.float32)
        # Double DQN: pick the argmax action from the online network's
        # next-state Q values, then read its value off the target network.
        max_a = tf.argmax(next_q, axis=1)
        q_max = tf.reduce_sum(target_q * tf.one_hot(max_a, num_actions), axis=1)
        q_samp = self._rewards + not_done * self._config.gamma * q_max
        # nature DQN
        q_s = tf.reduce_sum(q * tf.one_hot(self._actions, num_actions), axis=1)
        self._loss = tf.reduce_mean(tf.square(q_samp - q_s))

    def _add_optimizer_op(self, scope):
        """
        Set self._train_op and self._grad_norm
        """
        optimizer = tf.train.AdamOptimizer(self._learning_rate)
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        grads_and_vars = optimizer.compute_gradients(self._loss, var_list)
        # Fall back to the unclipped gradients when clipping is disabled
        # (the original passed None to apply_gradients in that case).
        clip_grads_and_vars = grads_and_vars
        if self._config.grad_clip:
            clip_grads_and_vars = [(tf.clip_by_norm(gv[0], self._config.clip_val), gv[1])
                                   for gv in grads_and_vars]
        self._train_op = optimizer.apply_gradients(clip_grads_and_vars)
        # global_norm expects tensors, not (grad, var) pairs
        self._grad_norm = tf.global_norm([gv[0] for gv in clip_grads_and_vars])

    def _add_summary(self):
        """
        Tensorboard summary setup
        """
        # extra placeholders to log stuff from python
        self._avg_reward_placeholder = tf.placeholder(tf.float32, shape=(),
                                                      name="avg_reward")
        self._max_reward_placeholder = tf.placeholder(tf.float32, shape=(),
                                                      name="max_reward")
        self._std_reward_placeholder = tf.placeholder(tf.float32, shape=(),
                                                      name="std_reward")
        self._avg_q_placeholder = tf.placeholder(tf.float32, shape=(), name="avg_q")
        self._max_q_placeholder = tf.placeholder(tf.float32, shape=(), name="max_q")
        self._std_q_placeholder = tf.placeholder(tf.float32, shape=(), name="std_q")
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward_placeholder = tf.placeholder(tf.float32, shape=(),
        #                                                name="eval_reward")

        # add placeholders from the graph
        tf.summary.scalar("loss", self._loss)
        tf.summary.scalar("grads norm", self._grad_norm)

        # extra summaries from python -> placeholders
        tf.summary.scalar("Avg Reward", self._avg_reward_placeholder)
        tf.summary.scalar("Max Reward", self._max_reward_placeholder)
        tf.summary.scalar("Std Reward", self._std_reward_placeholder)
        tf.summary.scalar("Avg Q", self._avg_q_placeholder)
        tf.summary.scalar("Max Q", self._max_q_placeholder)
        tf.summary.scalar("Std Q", self._std_q_placeholder)
        # TODO: Commented due to lack of evaluate()
        # tf.summary.scalar("Eval Reward", self._eval_reward_placeholder)

        # logging
        self._merged = tf.summary.merge_all()
        self._file_writer = tf.summary.FileWriter(self._config.output_path,
                                                  self._session.graph)

    def _init_averages(self):
        """
        Define extra attributes for tensorboard.
        """
        self._avg_reward = -21.
        self._max_reward = -21.
        self._std_reward = 0
        self._avg_q = 0
        self._max_q = 0
        self._std_q = 0
        # TODO: Commented due to lack of evaluate()
        # self._eval_reward = -21.

    def _get_action(self, obs):
        """
        Returns action with some epsilon strategy

        :param obs: observation from gym
        """
        if np.random.random() < self._config.soft_epsilon:
            return self._env.action_space.sample()
        else:
            return self._get_best_action(obs)[0]

    def _get_best_action(self, obs):
        """
        Return best action

        :param obs: 4 consecutive observations from gym
        :return action: (int)
        :return action_values: (np array) q values for all actions
        """
        action_values = self._session.run(self._q_values,
                                          feed_dict={self._states: [obs]})[0]
        return np.argmax(action_values), action_values

    def _train_step(self, t, replay_buffer, lr):
        """
        Perform training step

        :param t: (int) nth step
        :param replay_buffer: buffer for sampling
        :param lr: (float) learning rate
        """
        loss_eval, grad_eval = 0, 0

        # Perform training step
        if t > self._config.learning_start and t % self._config.learning_freq == 0:
            loss_eval, grad_eval = self._update_step(t, replay_buffer, lr)

        # Occasionally update target network with q network
        if t % self._config.target_update_freq == 0:
            self._update_target_params()

        # Occasionally save the weights
        if t % self._config.saving_freq == 0:
            self._save()

        return loss_eval, grad_eval

    def _update_step(self, t, replay_buffer, lr):
        """
        Performs an update of parameters by sampling from replay_buffer

        :param t: number of iteration (episode and move)
        :param replay_buffer: ReplayBuffer instance; .sample() gives batches
        :param lr: (float) learning rate
        :return loss: (Q - Q_target) ^ 2
        """
        s_batch, a_batch, r_batch, sp_batch, done_mask_batch = \
            replay_buffer.sample(self._config.batch_size)

        fd = {
            # Inputs
            self._states: s_batch,
            self._actions: a_batch,
            self._rewards: r_batch,
            self._next_states: sp_batch,
            self._done_mask: done_mask_batch,
            self._learning_rate: lr,
            # Extra info
            self._avg_reward_placeholder: self._avg_reward,
            self._max_reward_placeholder: self._max_reward,
            self._std_reward_placeholder: self._std_reward,
            self._avg_q_placeholder: self._avg_q,
            self._max_q_placeholder: self._max_q,
            self._std_q_placeholder: self._std_q,
            # TODO: Commented due to lack of evaluate()
            # self._eval_reward_placeholder: self.eval_reward,
        }

        loss_eval, grad_norm_eval, summary, _ = self._session.run(
            [self._loss, self._grad_norm, self._merged, self._train_op], feed_dict=fd)

        # Tensorboard
        self._file_writer.add_summary(summary, t)

        return loss_eval, grad_norm_eval

    def _update_target_params(self):
        """
        Update parameters of target Q with parameters of Q
        """
        self._session.run(self._update_target_op)

    def _load(self):
        """
        Loads session
        """
        ckpt = tf.train.get_checkpoint_state(self._config.model_output)
        self._saver.restore(self._session, ckpt.model_checkpoint_path)

    def _save(self):
        """
        Saves session
        """
        if not os.path.exists(self._config.model_output):
            os.makedirs(self._config.model_output)
        model_path = os.path.join(self._config.model_output, 'model.ckpt')
        self._saver.save(self._session, model_path)

    def _update_averages(self, rewards, max_q_values, q_values, scores_eval=None):
        """
        Update the averages

        :param rewards: deque
        :param max_q_values: deque
        :param q_values: deque
        :param scores_eval: list
        """
        self._avg_reward = np.mean(rewards)
        self._max_reward = np.max(rewards)
        self._std_reward = np.sqrt(np.var(rewards) / len(rewards))

        self._max_q = np.mean(max_q_values)
        self._avg_q = np.mean(q_values)
        self._std_q = np.sqrt(np.var(q_values) / len(q_values))

        # TODO: Commented due to lack of evaluate()
        # if len(scores_eval) > 0:
        #     self.eval_reward = scores_eval[-1]

    @staticmethod
    def export_plot(y, y_label, filename):
        """
        Export a plot in filename

        :param y: (list) of float / int to plot
        :param y_label: (string) label for the y axis
        :param filename: (string) output path
        """
        plt.figure()
        plt.plot(range(len(y)), y)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.savefig(filename)
        plt.close()
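# A hypothetical wiring of DQNAgent, to show how the pieces above fit
# together. The schedule class names (LinearExploration, LinearSchedule) and
# the config fields are assumptions inferred from how they are used in the
# class, not the project's confirmed API.
def make_training_agent(env, config):
    """Sketch: build a training agent for a gym-style env with the assumed schedules."""
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end, config.lr_nsteps)
    agent = DQNAgent(env, config, exp_schedule, lr_schedule, is_training_agent=True)
    agent.init_agent(id_=0, game_type=None)  # builds the session, buffers, saver
    return agent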
def train(self, exp_schedule, lr_schedule):
    """Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    last_frames = deque(maxlen=4)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    extractor = PongExtractor()
    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        last_frame = state
        last_frames.append(state)
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()
            feats = extractor.extract(np.squeeze(state))

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # choose action according to current Q and exploration
            best_action, q_vals = self.get_best_action(q_input)
            embedding = self.sess.run(self.hidden, feed_dict={self.s: [q_input]})[0]
            action = exp_schedule.get_action(best_action)

            # store q values (local `q_vals` avoids shadowing the deque)
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # periodically dump the last four frames and extracted features
            if t % 100 == 0:
                frame = np.squeeze(state)
                last_frame = np.squeeze(last_frame)
                pickle.dump(last_frames,
                            open('frames/embedding/atari{}.p'.format(t), 'wb'))
                for i in range(4):
                    f = np.squeeze(last_frames[i])
                    scipy.misc.imsave(
                        'frames/embedding/atari{}.png'.format(t - 3 + i), f)
                np.savetxt('frames/embedding/pong{}.txt'.format(t), feats,
                           fmt='%.2f')

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            last_frame = state
            state = new_state
            last_frames.append(state)

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg R", self.avg_reward),
                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record \
                and (last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule, exp_schedule1, env=None):
    """Performs training of Q only on agent 0

    Args:
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    if env is None:
        env = self.env

    # initialize replay buffer and variables
    rewards = deque(maxlen=self.config.num_episodes_test)
    rewardsB = deque(maxlen=self.config.num_episodes_test)
    self.model_0.rewards = rewards
    self.model_1.rewards = rewardsB
    # self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.model_0.train_init()
    self.model_1.train_init()
    # next_fire_B = False

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        # need_new_ball = False
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                env.render()

            action_0 = self.model_0.train_step_pre(state, exp_schedule)
            # if exp_schedule.epsilon == 1:
            #     action_1 = exp_schedule.get_action(0, 3)  # agent altogether
            # else:
            action_1 = self.model_1.train_step_pre(state[:, ::-1], exp_schedule1)
            cur_action = actions.trans(action_0, action_1)

            # perform action in env
            new_state, reward, done, info = env.step(cur_action)
            # print("Reward", reward)  # Problem
            loss_e0, grad_e0 = self.model_0.train_step_post(reward, done, t,
                                                            lr_schedule, True)
            self.model_1.train_step_post(-reward, done, t, lr_schedule, False)
            state = new_state

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                # self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_e0),
                        ("Avg R", np.mean(rewards)),
                        ("Max R", np.max(rewards)),
                        ("Min R", np.min(rewards)),
                        ("eps", exp_schedule.epsilon),
                        ("Grads", grad_e0),
                        ("Max Q", np.mean(self.model_0.max_q_values)),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)
        rewardsB.append(-total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record \
                and (last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record(exp_schedule)
            self.model_0.save(t)  # save the models
            self.model_1.save(t)  # save the models

    # last words
    self.logger.info("- Training done.")
    self.model_0.save()  # save the models
    self.model_1.save()  # save the models
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    """Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    if self.config.use_memory:
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history,
                                     memory_size=self.config.memory_unit_size)
    else:
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()[0]]

    prog = Progbar(target=self.config.nsteps_train)
    evaluation_result_list = []
    oos_evaluation_result_list = []

    # interact with environment
    prev_time = time.time()
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            if self.config.use_memory:
                prev_memory = replay_buffer.encode_recent_memory()
                best_action, q_vals, _, next_memory = \
                    self.get_best_action_with_memory(q_input, prev_memory)
                next_memory = np.squeeze(next_memory)
            else:
                best_action, q_vals = self.get_best_action(q_input)

            # choose action according to current Q and exploration
            action = exp_schedule.get_action(best_action)

            # store q values (local `q_vals` avoids shadowing the deque)
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            if self.config.use_memory:
                replay_buffer.store_memory(idx, next_memory)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # timing log
            time_log_freq = 1000
            if t % time_log_freq == 0:
                with open(self.config.output_path + 'time_log.txt', 'a') as of:
                    of.write('{}\n'.format(time.time() - prev_time))
                    of.write('\n')
                prev_time = time.time()

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg_R", self.avg_reward),
                        ("Max_R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max_Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            score, complete, length = self.evaluate()
            if complete > 0:
                evaluation_result_list += [(score, complete, length)]
            if score > self.config.extended_eval_threshold:
                self.logger.info('Extended in-sample evaluation...')
                self.evaluate(num_episodes=1000)
                for _ in range(10):
                    self.logger.info('Extended out-of-sample evaluation...')
                    oos_result = self.evaluate(EnvMaze(n=self.config.maze_size),
                                               num_episodes=100)
                    oos_evaluation_result_list += [oos_result]
            scores_eval += [score]

        if (t > self.config.learning_start) and self.config.record \
                and (last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()[0]]
    export_plot(scores_eval, "Scores", self.config.plot_output)
    return evaluation_result_list, oos_evaluation_result_list
def train(self, model_a, exp_schedule, lr_schedule):
    """Performs training of Q

    Args:
        model_a: agent model that navigates to the proposed subgoals
        exp_schedule: Exploration instance s.t. exp_schedule.get_action(best_action)
            returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    # scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train
    model_a.env.state.is_render_image = model_a.config.render_train
    orientation_map = [np.array([0, 1]), np.array([-1, 0]),
                       np.array([0, -1]), np.array([1, 0])]

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0

        # reset until the agent spawns somewhere the teacher can reach
        flag = True
        while flag:
            state = self.env.reset()  # h x w x c
            agent_location = self.env.state.agent_location
            if self.env.teacher.dist_map[agent_location[1],
                                         agent_location[0]] != np.inf:
                flag = False
        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)

        h_state_fw = (np.zeros([1, self.config.h_size]),
                      np.zeros([1, self.config.h_size]))
        h_state_bw = (np.zeros([1, self.config.h_size]),
                      np.zeros([1, self.config.h_size]))
        state_batch = list()
        goal_state_batch = list()
        goal_obs_image_batch = list()
        path_loc = list()
        path_ori = list()
        done_batch = list()
        width, height = self.env.state.xmap.dim['width'], self.env.state.xmap.dim['height']
        side_radius = min(self.config.visible_radius_unit_side,
                          max(width - 1, height - 1))
        block_size = self.env.state.image_block_size

        for i in range(200):
            #### teacher rotate ####
            agent_location = self.env.state.agent_location
            agent_orientation = self.env.state.agent_orientation
            goal_location = agent_location + agent_orientation
            gt_action = self.env.teacher.action_map[agent_location[1],
                                                    agent_location[0]]
            if np.dot(agent_orientation, orientation_map[gt_action]) != 1:
                tmp = np.cross(agent_orientation, orientation_map[gt_action])
                if tmp == 1:
                    state, reward_i, done = self.env.step(3)
                else:
                    state, reward_i, done = self.env.step(2)
                continue
            path_loc.append(copy.deepcopy(goal_location))
            path_ori.append(copy.deepcopy(agent_orientation))
            raw_goal_state, goal_state = self.convert_state_to_goal_state(state)
            state_batch.append(raw_goal_state[None][None])
            goal_state_batch.append(goal_state)
            if self.config.render_train:
                goal_obs_image_batch.append(
                    self.env.state.image[:3 * block_size,
                                         (side_radius - 1) * block_size:
                                         (side_radius + 2) * block_size, :])
            state, reward_i, done = self.env.step(0)
            done_batch.append(done)
            if done:
                break

        slen = np.array([len(state_batch)]).astype('int32')
        state_batch = np.concatenate(state_batch, axis=1)
        best_action_batch, q_values_batch, h_state_fw, h_state_bw = \
            self.get_best_action_batch(state_batch, h_state_fw, h_state_bw, slen)
        action_batch = exp_schedule.get_action_batch(best_action_batch)
        for i in range(q_values_batch.shape[0]):
            max_q_values.append(max(q_values_batch[i]))
            q_values += list(q_values_batch[i])

        reward_batch = list()
        for i, action in enumerate(action_batch):
            if action == 0:
                reward_batch.append(0)
            else:
                if self.config.render_train:
                    model_a.env.teacher.goal_obs_image = goal_obs_image_batch[i]
                h_state_a = (np.zeros([1, model_a.config.h_size]),
                             np.zeros([1, model_a.config.h_size]))
                model_a.env.teacher.set_goal(goal_state_batch[i], path_loc[i])
                reward_a = model_a.navi_goal(h_state_a, goal_state_batch[i])
                if model_a.env.teacher.goal_finish:
                    reward_batch.append(-0.05)
                else:
                    reward_batch.append(-0.1)
                # model_a.env.state.teleport(model_a.env.agent, path_loc[i], path_ori[i])

        if action_batch[-1] == 1 and model_a.env.teacher.goal_finish:
            reward_batch[-1] += 1
        else:
            if self.config.render_train:
                model_a.env.teacher.goal_obs_image = goal_obs_image_batch[-1]
            h_state_a = (np.zeros([1, model_a.config.h_size]),
                         np.zeros([1, model_a.config.h_size]))
            model_a.env.teacher.set_goal(goal_state_batch[-1], path_loc[-1])
            reward_a = model_a.navi_goal(h_state_a, goal_state_batch[-1])
            if model_a.env.teacher.goal_finish:
                reward_batch[-1] += 1

        for i in range(action_batch.shape[0]):
            idx = replay_buffer.store_frame(state_batch[0][i])
            replay_buffer.store_effect(idx, action_batch[i], reward_batch[i],
                                       done_batch[i])

        for i in range(action_batch.shape[0]):
            t += 1
            last_eval += 1
            last_record += 1

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values, scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1, exact=[
                        ("Loss", loss_eval), ("Avg R", self.avg_reward),
                        ("Max R", np.max(rewards)), ("eps", exp_schedule.epsilon),
                        ("Grads", grad_eval), ("Max Q", self.max_q),
                        ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

        # count reward
        total_reward = sum(reward_batch)

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    if not self.config.batch:
        replay_buffer = ReplayBuffer(
            self.config.buffer_size, self.config.state_history
        )
    else:
        self.logger.info(
            'Loading replay buffer from {}'.format(self.config.buffer_path)
        )
        replay_buffer = ReplayBuffer.load(self.config.buffer_path)
        self.logger.info(
            'Loaded buffer with {} observations and {} in buffer'.format(
                len(replay_buffer.obs), replay_buffer.num_in_buffer
            )
        )
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    episode_lengths = deque(maxlen=1000)
    max_episode_length = 0
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        if not self.config.batch:
            state = self.env.reset()
        episode_step = 0
        avg_episode_length = (
            np.nan if len(episode_lengths) == 0
            else np.mean(episode_lengths)
        )
        while True:
            t += 1
            episode_step += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            if not self.config.batch:
                get_action = functools.partial(
                    exp_schedule.get_action,
                    episode_num=len(episode_lengths),
                    episode_step=episode_step,
                    avg_episode_length=avg_episode_length
                )
                state, reward, done, _q_values = self.interact(
                    replay_buffer, state, get_action
                )
            else:
                # batch mode: learn from the loaded buffer only,
                # without interacting with the environment
                reward = 0
                done = True
                _q_values = [0]

            # store q values
            max_q_values.append(max(_q_values))
            q_values.extend(list(_q_values))

            # perform a training step
            loss_eval, grad_eval = self.train_step(
                t, replay_buffer, lr_schedule.epsilon
            )

            # logging stuff
            learning = (t > self.config.learning_start)
            learning_and_logging = (
                learning
                and (t % self.config.log_freq == 0)
                and (t % self.config.learning_freq == 0)
            )
            if learning_and_logging:
                self.update_averages(
                    rewards, max_q_values, q_values, scores_eval,
                    episode_lengths, max_episode_length
                )
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    if self.config.batch:
                        exact = [
                            ("Loss", loss_eval),
                            ("Grads", grad_eval),
                            ("lr", lr_schedule.epsilon),
                        ]
                    else:
                        exact = [
                            ("Loss", loss_eval),
                            ("Avg_R", self.avg_reward),
                            ("Max_R", np.max(rewards)),
                            ("eps", exp_schedule.epsilon),
                            ("Grads", grad_eval),
                            ("Max_Q", self.max_q),
                            ("lr", lr_schedule.epsilon),
                            ("avg_ep_len", avg_episode_length)
                        ]
                    prog.update(t + 1, exact=exact)
            elif not learning and (t % self.config.log_freq == 0):
                sys.stdout.write(
                    "\rPopulating the memory {}/{}...".format(
                        t, self.config.learning_start
                    )
                )
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                episode_lengths.append(episode_step)
                if episode_step > max_episode_length:
                    max_episode_length = episode_step
                    # retrain the clusters every time the max episode
                    # length changes
                    if hasattr(self, 'reset_counts'):
                        self.reset_counts(
                            n_clusters=max_episode_length,
                            states=replay_buffer.get_encoded_states(),
                            actions=replay_buffer.get_actions()
                        )
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        should_evaluate = (
            (t > self.config.learning_start)
            and (last_eval > self.config.eval_freq)
        )
        if should_evaluate:
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval.append(self.evaluate())

        should_record = (
            (t > self.config.learning_start)
            and self.config.record
            and (last_record > self.config.record_freq)
        )
        if should_record:
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval.append(self.evaluate())
    export_plot(scores_eval, "Scores", self.config.plot_output)

    if not self.config.batch:
        # save replay buffer
        self.logger.info(
            'Saving buffer to {}'.format(self.config.buffer_path)
        )
        replay_buffer.save(self.config.buffer_path)
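# The batch-mode branch above assumes a ReplayBuffer.save(path) /
# ReplayBuffer.load(path) pair. A minimal sketch of that persistence layer
# follows; the field names (obs, action, reward, done, num_in_buffer) are
# assumptions based on the attributes the training loop reads, not a
# confirmed API, and np.savez appends '.npz' if the path lacks it.

import numpy as np

class ReplayBufferIO(object):
    """Persistence mixin matching the save/load calls used above."""

    def save(self, path):
        # persist the raw arrays; num_in_buffer marks how much is valid
        np.savez_compressed(path, obs=self.obs, action=self.action,
                            reward=self.reward, done=self.done,
                            num_in_buffer=self.num_in_buffer)

    @classmethod
    def load(cls, path):
        data = np.load(path)
        buf = cls.__new__(cls)  # bypass __init__; restore fields directly
        buf.obs = data['obs']
        buf.action = data['action']
        buf.reward = data['reward']
        buf.done = data['done']
        buf.num_in_buffer = int(data['num_in_buffer'])
        return buf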
def train(self, beta_schedule, lr_schedule, cr_schedule):
    """
    Performs training of Q

    Args:
        beta_schedule: schedule for the probability of following the
            ground-truth action instead of the predicted one
        lr_schedule: Schedule for learning rate
        cr_schedule: curriculum schedule; cr_schedule[i] returns the
            graph parameters of lesson i
    """
    self.init_averages()
    t = last_eval = curri_idx = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    curriculum_batch_size = np.ceil(
        self.config.nsteps_train / cr_schedule.n_curriculum).astype('int32')

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        t += 1
        last_eval += 1
        config = self.config
        config.n_node, config.k_ring, config.p_rewiring, config.path_len_limit = \
            cr_schedule[curri_idx]
        self.env.reset(config)  # h x w x c
        h_state = DNC.zero_state(config, batch_size=1)
        encoding, predflag, target_action = self.env.prepare_seq()
        slen = np.array(encoding.shape[0]).astype('int32')

        # describe graph, query and planning
        h_state = self.sess.run(self.hs_out,
                                feed_dict={
                                    self.s: encoding[None],
                                    self.hs: h_state,
                                    self.slen: slen
                                })

        past_state = -1
        past_action_onehot = -1
        encoding_a = np.zeros([config.max_step_len, encoding.shape[1]])
        predflag_a = np.zeros(config.max_step_len)
        target_action_a = np.zeros(
            [config.max_step_len, target_action.shape[1]])
        for i in range(config.max_step_len):
            current_encoding = GraphWorld.convert_triplets_to_encoding(
                np.array([[
                    past_state, self.env.current_state, past_action_onehot
                ]]).astype('int32'), config.ndigits, config.nway)
            current_encoding = np.concatenate(
                [current_encoding, np.array([[1, 0]])], axis=1)
            pred_action, h_state = self.sess.run(
                [self.q, self.hs_out],
                feed_dict={
                    self.s: current_encoding[None],
                    self.hs: h_state,
                    self.slen: np.ones(1).astype('int32')
                })
            gt_action = self.env.get_gt_action()
            action = self.get_action(pred_action[0], gt_action,
                                     beta_schedule.epsilon)
            past_state = self.env.current_state
            _, done, past_action_onehot = self.env.step(action)

            encoding_a[i, :] = current_encoding[0]
            predflag_a[i] = 1
            target_action_a[i] = gt_action
            slen += 1
            if done:
                break

        batch_data = (np.concatenate([encoding, encoding_a], axis=0)[None],
                      np.concatenate([predflag, predflag_a], axis=0),
                      np.concatenate([target_action, target_action_a], axis=0),
                      slen)

        # perform a training step
        loss_eval, grad_eval = self.train_step(t, lr_schedule.epsilon,
                                               batch_data)

        # logging stuff
        if ((t % config.log_freq == 0)
                and (t % config.learning_freq == 0)):
            self.update_averages(scores_eval)
            beta_schedule.update(t)
            lr_schedule.update(t)
            prog.update(t + 1, exact=[("Loss", loss_eval),
                                      ("Grads", grad_eval),
                                      ("lr", lr_schedule.epsilon)])

        if t >= config.nsteps_train:
            break

        if last_eval >= config.eval_freq:
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate(cr_schedule, curri_idx)]
            if scores_eval[-1] > 0.8:
                # promote to the next curriculum lesson
                curri_idx += 1
                msg = "Upgrade to lesson {:d}".format(int(curri_idx))
                self.logger.info(msg)
                self.logger.info(
                    "----------Start Computing Final Score----------")
                scores_eval += [self.evaluate(cr_schedule)]
                self.logger.info(
                    "----------Finish Computing Final Score----------")

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(cr_schedule)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
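# train() above only assumes two things of cr_schedule: an n_curriculum
# attribute and indexing that returns (n_node, k_ring, p_rewiring,
# path_len_limit) for a lesson. A minimal sketch of such a schedule follows;
# the class name and lesson values are illustrative, not from the source.

class CurriculumSchedule(object):
    """Lesson table implementing the interface train() relies on."""

    def __init__(self, lessons):
        # each lesson: (n_node, k_ring, p_rewiring, path_len_limit)
        self.lessons = lessons
        self.n_curriculum = len(lessons)

    def __getitem__(self, idx):
        # clamp so training keeps the hardest lesson even if curri_idx
        # is incremented past the last one
        return self.lessons[min(idx, self.n_curriculum - 1)]

# e.g. progressively larger ring graphs with more rewiring
cr_schedule = CurriculumSchedule([
    (8, 2, 0.1, 3),
    (16, 2, 0.2, 5),
    (32, 4, 0.3, 8),
])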
def evaluate(self, env=None, num_episodes=None):
    """
    Evaluation with same procedure as the training
    """
    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")
        prog_bar = True
    else:
        prog_bar = False

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    # replay memory to play
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = []
    if prog_bar:
        prog = Progbar(target=num_episodes)

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        while True:
            if self.config.render_test:
                env.render()

            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            action = self.get_action(q_input)

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)
        if prog_bar:
            prog.update(i + 1, exact=[("Reward", total_reward)])

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward,
                                                             sigma_reward)
        self.logger.info(msg)

    return avg_reward
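# In evaluate() above, sigma_reward = sqrt(Var/N) is the standard error of
# the mean, so the logged "avg +/- sigma" is a one-standard-error band.
# A quick self-contained check of the reported numbers:

import numpy as np

rewards = [10.0, 12.0, 8.0, 14.0]  # four hypothetical episode returns
avg_reward = np.mean(rewards)  # 11.0
sigma_reward = np.sqrt(np.var(rewards) / len(rewards))  # sqrt(5/4) ~ 1.12
print("Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward))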
def train(self, exp_schedule, lr_schedule):
    # Initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.FLAGS.buffer_size,
                                 self.FLAGS.state_hist)
    rewards = deque(maxlen=self.FLAGS.num_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = 0  # time control of nb of steps
    loss_eval = grad_eval = 0
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]

    self.prog = Progbar(target=self.FLAGS.train_steps)

    # initialized outside the loop so consecutive crashes accumulate
    # (resetting it at the top of each iteration would defeat the guard)
    continual_crash = 0

    # Train for # of train steps
    while t < self.FLAGS.train_steps:
        try:
            total_reward = 0
            ep_len = 0
            state = self.env.reset()

            # Run for 1 episode and update the buffer
            while True:
                ep_len += 1

                # replay memory stuff
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                # chose action according to current Q and exploration
                # (local name q_vals avoids shadowing the q_values deque)
                best_action, q_vals = self.network.get_best_action(q_input)
                action = exp_schedule.get_action(best_action)

                # store q values
                max_q_values.append(max(q_vals))
                q_values += list(q_vals)

                # perform action in env
                new_state, reward, done, info = self.env.step(action)

                # store the transition
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state

                # Count reward
                total_reward += reward

                # Stop at end of episode
                if done:
                    break

            # Store episodic rewards
            if ep_len > 1:
                rewards.append(total_reward)

            # Learn using replay
            while True:
                t += 1
                ep_len -= 1

                # Make train step if necessary
                if ((t > self.FLAGS.learn_start)
                        and (t % self.FLAGS.learn_every == 0)):
                    loss_eval, grad_eval = self.network.update_step(
                        t, replay_buffer, lr_schedule.epsilon, self.summary)
                    exp_schedule.update(t)
                    lr_schedule.update(t)

                if (t % self.FLAGS.target_every == 0):
                    self.network.update_target_params()

                # Update logs if necessary
                if ((t > self.FLAGS.learn_start)
                        and (t % self.FLAGS.log_every == 0)
                        and (len(rewards) > 0)):
                    self.update_averages(rewards, max_q_values, q_values,
                                         scores_eval)
                    self.update_logs(t, loss_eval, rewards,
                                     exp_schedule.epsilon, grad_eval,
                                     lr_schedule.epsilon)
                # Update logs if necessary
                elif (t < self.FLAGS.learn_start) and (
                        t % self.FLAGS.log_every == 0):
                    sys.stdout.write(
                        "\rPopulating the memory {}/{}...".format(
                            t, self.FLAGS.learn_start))
                    sys.stdout.flush()

                if ((t > self.FLAGS.learn_start)
                        and (t % self.FLAGS.check_every == 0)):
                    # Evaluate current model
                    scores_eval += [
                        self.evaluate(self.env, self.FLAGS.num_test)
                    ]

                    # Save current model
                    self.network.save()

                    # Record video of current model
                    if self.FLAGS.record:
                        self.record()

                if ep_len <= 0 or t >= self.FLAGS.train_steps:
                    break

            continual_crash = 0

        except Exception as e:
            continual_crash += 1
            self.logger.info(e)

            if continual_crash >= 10:
                self.logger.info("Crashed 10 times in a row -- stopping")
                raise e
            else:
                t -= 1
                self.logger.info("Env crash, making new env")
                time.sleep(60)
                self.env = create_slither_env(self.FLAGS.state_type)
                self.env = Unvectorize(self.env)
                self.env.configure(fps=self.FLAGS.fps,
                                   remotes=self.FLAGS.remotes,
                                   start_timeout=15 * 60,
                                   vnc_driver='go',
                                   vnc_kwargs={
                                       'encoding': 'tight',
                                       'compress_level': 0,
                                       'fine_quality_level': 50
                                   })
                time.sleep(60)

    # End of training
    self.logger.info("- Training done.")
    self.network.save()
    scores_eval += [self.evaluate(self.env, self.FLAGS.num_test)]
    export_plot(scores_eval, "Scores", self.FLAGS.plot_path)
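# The try/except block above implements a retry-on-crash pattern for flaky
# remote environments. The same idea factored into a standalone helper, so
# the streak logic is easier to test; step_fn and make_env are placeholder
# names, not identifiers from the source:

import time

def run_with_env_retry(step_fn, make_env, max_crashes=10, cooldown=60):
    """Run step_fn(env) once, rebuilding env on failure; give up after
    max_crashes consecutive failures."""
    env = make_env()
    crashes = 0
    while True:
        try:
            return step_fn(env)
        except Exception:
            crashes += 1  # one more failure in the current streak
            if crashes >= max_crashes:
                raise
            time.sleep(cooldown)  # give the remote env time to recover
            env = make_env()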
def train(self, exp_schedule, lr_schedule, choose_teacher_strategy=None):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
        choose_teacher_strategy: optional strategy for picking a teacher
    """
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    allsteps = []
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        # assumed default so the out-of-bounds branch below has an action
        # to repeat on the very first step of the episode
        action = 0
        while True:
            if self.config.state_subspace is not None:
                out_of_bounds = False
                if self.config.state_subspace in [
                        'ball_top_half', 'ball_bottom_half'
                ]:
                    image = self.env.unwrapped._get_obs()
                    ball_position = ball_half_screen_position(image)
                    # check if ball is in top half but we're restricted
                    # to bottom half
                    if (ball_position == 1 and
                            self.config.state_subspace == 'ball_bottom_half'):
                        out_of_bounds = True
                    # check if ball is in bottom half but we're restricted
                    # to top half
                    elif (ball_position == 0 and
                          self.config.state_subspace == 'ball_top_half'):
                        out_of_bounds = True
                else:
                    raise NotImplementedError

                if out_of_bounds:
                    # current state is outside of this agent's state subspace
                    # perform action in env
                    state, reward, done, info = self.env.step(action)

            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()
            # self.q_inputs.append(q_input)

            # chose action according to current Q and exploration
            # (local name q_vals avoids shadowing the q_values deque)
            best_action, q_vals = self.get_best_action(q_input)
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            if choose_teacher_strategy is not None:
                # store the reward with the teacher choice strategy
                choose_teacher_strategy.store_reward(reward, q_input)

            # perform a training step
            loss_eval, grad_eval = self.train_step(
                t, replay_buffer, lr_schedule.epsilon,
                choose_teacher_strategy)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if choose_teacher_strategy is not None:
                    choose_teacher_strategy.update_schedule(t)
                if len(rewards) > 0:
                    exact = [("Loss", loss_eval),
                             ("Avg R", self.avg_reward),
                             ("Max R", np.max(rewards)),
                             ("eps", exp_schedule.epsilon),
                             ("Grads", grad_eval),
                             ("Max Q", self.max_q),
                             ("lr", lr_schedule.epsilon)]
                    if choose_teacher_strategy is not None and hasattr(
                            choose_teacher_strategy, 'eps_schedule'):
                        exact.append(
                            ("Choose teacher eps",
                             choose_teacher_strategy.eps_schedule.epsilon))
                    prog.update(t + 1, exact=exact)
            elif ((t > self.config.learning_start)
                  and (t % self.config.save_teacher_choice_freq == 0)
                  and (choose_teacher_strategy is not None)):
                choose_teacher_strategy.save(
                    self.config.teacher_choice_output_path)
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)

    if choose_teacher_strategy is not None:
        choose_teacher_strategy.save(
            self.config.teacher_choice_output_path)
def train(self, model_a, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train
    orientation_map = [
        np.array([0, 1]),
        np.array([-1, 0]),
        np.array([0, -1]),
        np.array([1, 0])
    ]

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0

        # resample until the agent spawns at a reachable location
        flag = True
        while flag:
            state = self.env.reset()  # h x w x c
            agent_location = self.env.state.agent_location
            if self.env.teacher.dist_map[agent_location[1],
                                         agent_location[0]] != np.inf:
                flag = False
        model_a.env.reset()
        model_a.env.state.copy_state(model_a.env.agent, self.env.state)

        h_state = (np.zeros([1, self.config.h_size]),
                   np.zeros([1, self.config.h_size]))
        h_state_a = (np.zeros([1, model_a.config.h_size]),
                     np.zeros([1, model_a.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0
        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1

            raw_goal_state, goal_state = self.convert_state_to_goal_state(
                state)

            #### for replay_buffer
            # replay memory stuff
            idx = replay_buffer.store_frame(raw_goal_state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            # (local name q_vals avoids shadowing the q_values deque)
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            reward = 0

            #### perform action in env ####
            #### update goal obs image ####
            if action == 1:
                if self.config.render_train:
                    self.env.teacher.update_goal_obs_image(self.env.state)
            if self.config.render_train:
                self.env.render()

            #### teacher move ####
            agent_location = self.env.state.agent_location
            agent_orientation = self.env.state.agent_orientation
            goal_location = agent_location + agent_orientation
            gt_action = self.env.teacher.action_map[agent_location[1],
                                                    agent_location[0]]
            if np.dot(agent_orientation, orientation_map[gt_action]) == 1:
                new_state, reward_i, done = self.env.step(0)
            else:
                tmp = np.cross(agent_orientation,
                               orientation_map[gt_action])
                if tmp == 1:
                    new_state, reward_i, done = self.env.step(3)
                else:
                    new_state, reward_i, done = self.env.step(2)

            #### issue command ####
            if action == 1:
                model_a.env.teacher.set_goal(goal_state, goal_location)
                reward_a = model_a.navi_goal(h_state_a, goal_state)
                if model_a.env.teacher.goal_finish:
                    reward += reward_i
                reward += reward_a
                reward += -1
                self.env.state.teleport(
                    self.env.agent, model_a.env.state.agent_location,
                    model_a.env.state.agent_orientation)
                new_state = self.env.state.onehot_state

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1,
                                exact=[("Loss", loss_eval),
                                       ("Avg R", self.avg_reward),
                                       ("Max R", np.max(rewards)),
                                       ("eps", exp_schedule.epsilon),
                                       ("Grads", grad_eval),
                                       ("Max Q", self.max_q),
                                       ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate(model_a)]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate(model_a)]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def run_epoch(self, session, train, val, log):
    num_samples = len(train["context"])
    # parenthesis fixed so ceil applies to the ratio, not to num_samples
    num_batches = int(np.ceil(num_samples * 1.0 / self.config.batch_size))
    self.result_saver.save("batch_size", self.config.batch_size)
    progress = Progbar(target=num_batches)
    best_f1 = 0
    for i, train_batch in enumerate(
            batches(train,
                    is_train=True,
                    batch_size=self.config.batch_size,
                    window_size=self.config.window_size)):
        _, loss = self.optimize(session, train_batch)
        progress.update(i + 1, [("training loss", loss)])
        self.result_saver.save("losses", loss)

        # evaluate periodically and on the final batch (i is 0-based,
        # so the last batch has index num_batches - 1)
        if i % self.config.eval_num == 0 or i == num_batches - 1:
            # Randomly get some samples from the dataset
            train_samples = get_random_samples(
                train, self.config.samples_used_for_evaluation)
            val_samples = get_random_samples(
                val, self.config.samples_used_for_evaluation)

            # First evaluate on the training set without using the best span
            f1_train, EM_train = self.evaluate_answer(session,
                                                      train_samples,
                                                      use_best_span=False)
            # Then evaluate on the val set
            f1_val, EM_val = self.evaluate_answer(session,
                                                  val_samples,
                                                  use_best_span=False)
            if log:
                print()
                print("Not using best span")
                logging.info(
                    "F1: {}, EM: {}, for {} training samples".format(
                        f1_train, EM_train,
                        self.config.samples_used_for_evaluation))
                logging.info(
                    "F1: {}, EM: {}, for {} validation samples".format(
                        f1_val, EM_val,
                        self.config.samples_used_for_evaluation))

            # First evaluate on the training set
            f1_train, EM_train = self.evaluate_answer(session,
                                                      train_samples,
                                                      use_best_span=True)
            # Then evaluate on the val set
            f1_val, EM_val = self.evaluate_answer(session,
                                                  val_samples,
                                                  use_best_span=True)
            if log:
                print()
                print("Using best span")
                logging.info(
                    "F1: {}, EM: {}, for {} training samples".format(
                        f1_train, EM_train,
                        self.config.samples_used_for_evaluation))
                logging.info(
                    "F1: {}, EM: {}, for {} validation samples".format(
                        f1_val, EM_val,
                        self.config.samples_used_for_evaluation))

            self.result_saver.save("f1_train", f1_train)
            self.result_saver.save("EM_train", EM_train)
            self.result_saver.save("f1_val", f1_val)
            self.result_saver.save("EM_val", EM_val)

            batches_trained = (
                1 if self.result_saver.is_empty("batch_indices")
                else self.result_saver.get("batch_indices")[-1]
                + min(i + 1, self.config.eval_num))
            self.result_saver.save("batch_indices", batches_trained)
            save_graphs(self.result_saver.data, path=self.config.train_dir)

            if f1_val > best_f1:
                saver = tf.train.Saver()
                saver.save(
                    session,
                    pjoin(self.config.train_dir,
                          "BATCH-{}".format(batches_trained)))
                best_f1 = f1_val
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history, self.config)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration
            # (local name q_vals avoids shadowing the q_values deque)
            best_action, q_vals = self.get_best_action(q_input)
            action, explore = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition (including the exploration flag)
            replay_buffer.store_effect(idx, action, reward, done, explore)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon,
                                                   exp_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1,
                                exact=[("Loss", loss_eval),
                                       ("Avg R", self.avg_reward),
                                       ("Max R", np.max(rewards)),
                                       ("eps", exp_schedule.epsilon),
                                       ("Grads", grad_eval),
                                       ("Max Q", self.max_q),
                                       ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            scores_eval += [self.evaluate()]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """
    # initialize replay buffer and variables
    replay_buffer = ReplayBuffer(self.config.buffer_size,
                                 self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of nb of steps
    scores_eval = []  # list of scores computed at iteration time
    #scores_eval += [self.evaluate()]

    prog = Progbar(target=self.config.nsteps_train)
    self.env.state.is_render_image = self.config.render_train

    # interact with environment
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()  # h x w x c
        goal_state = self.env.teacher.goal_obs_onehot_state  # h x w x c
        h_state = (np.zeros([1, self.config.h_size]),
                   np.zeros([1, self.config.h_size]))
        slen = np.ones(1).astype('int32')
        action = 0
        for i in range(200):
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            #### for replay_buffer
            # replay memory stuff
            idx = replay_buffer.store_frame(state, goal_state)
            q_input = replay_buffer.encode_recent_observation()

            # chose action according to current Q and exploration;
            # the attention mask marks where the goal pattern matches
            # the observation
            curr_attention = np.equal(
                np.sum(np.equal(q_input, goal_state[None][None][None]), 3),
                q_input.shape[3])
            # (local name q_vals avoids shadowing the q_values deque)
            best_action, q_vals, h_state = self.get_best_action(
                [q_input], curr_attention[None], h_state, slen, [action])
            #best_action, q_vals, h_state = self.get_best_action([q_input], goal_state[None][None], h_state, slen, [action])
            action = exp_schedule.get_action(best_action)

            # store q values
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(t + 1,
                                exact=[("Loss", loss_eval),
                                       ("Avg R", self.avg_reward),
                                       ("Max R", np.max(rewards)),
                                       ("eps", exp_schedule.epsilon),
                                       ("Grads", grad_eval),
                                       ("Max Q", self.max_q),
                                       ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            self.logger.info("Global step: %d" % t)
            scores_eval += [self.evaluate()]

    # last words
    self.logger.info("- Training done.")
    self.save(t)
    scores_eval += [self.evaluate()]
    export_plot(scores_eval, "Scores", self.config.plot_output)
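# Every train() variant above reads schedule.epsilon each step and calls
# schedule.update(t) to anneal it. A minimal linear implementation of that
# interface follows; the class name and constants are illustrative, while
# the attribute and method names come from the calls in the code above.

class LinearSchedule(object):
    """Linearly anneal a value from eps_begin to eps_end over nsteps."""

    def __init__(self, eps_begin, eps_end, nsteps):
        self.epsilon = eps_begin
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps

    def update(self, t):
        # interpolate, then hold eps_end once t passes nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

# usage: the loops read lr_schedule.epsilon as the current learning rate
lr_schedule = LinearSchedule(0.00025, 0.00005, 500000)
lr_schedule.update(250000)  # halfway: epsilon == 0.00015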