def train_models(env_name, num_episodes,
                 gamma, lam, kl_targ,
                 coef, use_lr_adjust,
                 ada_kl_penalty, seed,
                 epochs, phi_epochs,
                 max_timesteps, reg_scale,
                 phi_lr, phi_hs,
                 policy_size, phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs,
                    phi_epochs, policy_size=policy_size,
                    phi_hidden_sizes=phi_hs, reg_scale=reg_scale,
                    lr_phi=phi_lr, phi_obj=phi_obj)

    run_policy(env, policy, scaler, num_episodes, max_timesteps=max_timesteps)

    episode = 0
    for _ in range(200):
        trajectories, traj_len_list = run_policy(env, policy, scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps)
        num_traj = len(trajectories)
        episode += len(trajectories)

        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        policy.update(load_model, observes, actions, advantages,
                      use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()
    logger.log("saved model")
def _build_graph(self):
    """ Build and initialize TensorFlow graph """
    self.g = tf.Graph()
    with self.g.as_default():
        self._placeholders()
        self._policy_nn()
        self._logprob()
        self._kl_entropy()
        self._sample()
        self._loss_train_op()
        self.init = tf.global_variables_initializer()

        # Save only policy parameters
        policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='policy_nn')

        var_dict = {}
        for var in policy_vars:
            logger.log(var.name)
            var_dict[var.name] = var

        self._init_session()
        self.saver = tf.train.Saver(var_dict)
def update(self, observes, actions, advantages,
           use_lr_adjust, ada_kl_penalty):
    """ Update policy based on observations, actions and advantages

    Args:
        observes: observations, shape = (N, obs_dim)
        actions: actions, shape = (N, act_dim)
        advantages: advantages, shape = (N,)
        use_lr_adjust: whether to adjust the learning rate based on KL
        ada_kl_penalty: whether to adapt the KL penalty coefficient
    """
    feed_dict = {self.obs_ph: observes,
                 self.act_ph: actions,
                 self.advantages_ph: advantages,
                 self.beta_ph: self.beta,
                 self.eta_ph: self.eta,
                 self.lr_ph: self.lr * self.lr_multiplier,
                 self.lr_phi_ph: self.lr_phi}

    old_means_np, old_log_vars_np = self.sess.run(
        [self.means, self.log_vars], feed_dict)
    feed_dict[self.old_log_vars_ph] = old_log_vars_np
    feed_dict[self.old_means_ph] = old_means_np

    loss, kl, entropy = 0, 0, 0
    if self.c_ph == 1.:
        # Update phi function (Stein control variate network)
        logger.log("Training Phi for %d epochs" % self.phi_epochs)
        for _ in progressbar(range(self.phi_epochs), "Train Phi:", 25):
            self.sess.run(self.phi_train_op, feed_dict)
        phi_loss = self.sess.run(self.phi_loss, feed_dict)
        logger.record_tabular("Phi_loss", phi_loss)

    # Training policy
    logger.log("Training Policy for %d epochs" % self.epochs)
    for _ in progressbar(range(self.epochs), "Train Policy", 25):
        self.sess.run(self.train_op, feed_dict)
        loss, kl, entropy = self.sess.run(
            [self.loss, self.kl, self.entropy], feed_dict)
        if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
            break

    if ada_kl_penalty:
        if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
            self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
            if use_lr_adjust:
                if self.beta > 30 and self.lr_multiplier > 0.1:
                    self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2:
            self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
            if use_lr_adjust:
                if self.beta < (1 / 30) and self.lr_multiplier < 10:
                    self.lr_multiplier *= 1.5

    logger.record_dicts({
        'PolicyLoss': loss,
        'PolicyEntropy': entropy,
        'KL': kl,
        'Beta': self.beta,
        '_lr_multiplier': self.lr_multiplier})
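# --- Illustrative sketch (not part of the original source) -----------------
# The tail of Policy.update() above is the usual PPO "KL servo": when the
# measured KL overshoots the target, the penalty coefficient beta grows (and
# the learning rate optionally shrinks); when it undershoots, the reverse.
# A minimal, self-contained restatement of that rule; the function name and
# default argument are assumptions made for illustration only.
def adapt_kl_penalty(kl, kl_targ, beta, lr_multiplier, use_lr_adjust=True):
    """Return updated (beta, lr_multiplier) after observing a KL value."""
    if kl > kl_targ * 2:                      # KL too large: penalize more
        beta = min(35.0, 1.5 * beta)          # max clip beta
        if use_lr_adjust and beta > 30 and lr_multiplier > 0.1:
            lr_multiplier /= 1.5
    elif kl < kl_targ / 2:                    # KL too small: penalize less
        beta = max(1.0 / 35.0, beta / 1.5)    # min clip beta
        if use_lr_adjust and beta < 1.0 / 30.0 and lr_multiplier < 10:
            lr_multiplier *= 1.5
    return beta, lr_multiplier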
def _loss_train_op(self):
    # get Phi function and its derivatives
    phi_value, phi_act_g = self.phi(self.obs_ph,
                                    self.act_ph, reuse=False)
    self.phi_value = phi_value
    self.phi_act_g = phi_act_g
    self.phi_nn_vars = self.phi.phi_vars

    # score function of the Gaussian policy w.r.t. means and log-variances
    ll_mean_g = 1 / tf.exp(self.log_vars) * (self.act_ph - self.means)
    ll_log_vars_g = -1 / 2 * (1 / tf.exp(self.log_vars)
                              - 1 / tf.exp(self.log_vars) *
                              (self.act_ph - self.means) *
                              (self.act_ph - self.means) *
                              1 / tf.exp(self.log_vars))

    self.phi_value.set_shape((None,))

    log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
        * (ll_log_vars_g * tf.expand_dims(
            self.advantages_ph - self.c_ph * self.phi_value, 1)
           + 1 / 2 * self.c_ph * ll_mean_g * self.phi_act_g)

    means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
        * (ll_mean_g * tf.expand_dims(
            self.advantages_ph - self.c_ph * self.phi_value, 1)
           + self.c_ph * self.phi_act_g)

    loss1_log_vars = -tf.reduce_mean(
        tf.stop_gradient(log_vars_inner) * tf.exp(self.log_vars))
    loss1_mean = -tf.reduce_mean(
        tf.stop_gradient(means_inner) * self.means)

    loss1 = loss1_log_vars + loss1_mean
    loss2 = tf.reduce_mean(self.beta_ph * self.kl)
    loss3 = self.eta_ph * tf.square(
        tf.maximum(0.0, self.kl - 2.0 * self.kl_targ))

    self.loss = loss1 + loss2 + loss3
    optimizer = tf.train.AdamOptimizer(self.lr_ph)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=self.policy_nn_vars)

    if self.reg_scale > 0.:
        reg_variables = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        reg_term = tf.contrib.layers.apply_regularization(
            self.phi.kernel_regularizer, reg_variables)
    else:
        reg_term = 0.

    if self.c_ph == 1.:
        if self.phi_obj == 'FitQ':
            self.phi_loss = tf.reduce_mean(
                tf.square(self.advantages_ph - self.phi_value),
                axis=0) + reg_term
            logger.log('phi with FitQ as objective function')
        elif self.phi_obj == 'MinVar':
            self.means_mse = tf.reduce_sum(
                tf.reduce_mean(
                    tf.square(means_inner -
                              tf.reduce_mean(means_inner, axis=0)),
                    axis=0))
            self.logstd_vars_mse = tf.reduce_sum(
                tf.reduce_mean(
                    tf.square(log_vars_inner -
                              tf.reduce_mean(log_vars_inner, axis=0)),
                    axis=0))
            self.phi_loss = self.means_mse + self.logstd_vars_mse + reg_term
            logger.log('phi with MinVar as objective function')
        else:
            raise NotImplementedError

        phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph)
        self.phi_train_op = phi_optimizer.minimize(
            self.phi_loss, var_list=self.phi_nn_vars)

    elif self.c_ph == 0.:
        logger.log("Training with PPO")
        self.phi_train_op = tf.no_op()  # no phi update in plain PPO mode
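# --- Illustrative sketch (not part of the original source) -----------------
# The surrogate losses built above implement the Stein control-variate policy
# gradient for a Gaussian policy: the per-sample contribution to the gradient
# w.r.t. the mean is
#     rho * ( (a - mu) / var * (A - c * phi(s, a)) + c * d phi / d a ),
# where rho = pi_new / pi_old is the importance ratio. `means_inner` is exactly
# this quantity; multiplying stop_gradient(means_inner) by self.means and
# taking the negative mean yields a loss whose gradient w.r.t. the means
# matches the estimator. A NumPy restatement of the per-sample term (function
# and argument names are assumptions for illustration):
import numpy as np

def stein_mean_grad_term(actions, means, log_vars, advantages,
                         phi_value, phi_act_grad, rho, c=1.0):
    """Per-sample mean-gradient terms, shape (N, act_dim)."""
    var = np.exp(log_vars)                                # (act_dim,)
    score_mean = (actions - means) / var                  # d log pi / d mu
    centered_adv = (advantages - c * phi_value)[:, None]  # (N, 1)
    return rho[:, None] * (score_mean * centered_adv + c * phi_act_grad)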
def _policy_nn(self):
    """ Neural net for policy approximation function """
    with tf.variable_scope("policy_nn"):
        # hidden layer sizes determined by obs_dim
        # and act_dim (hid2 is geometric mean)
        if self.policy_size == 'small':
            logger.log("using small structure")
            hid1_size = self.obs_dim  # * 10
            hid3_size = self.act_dim  # * 10
            hid2_size = int(np.sqrt(hid1_size * hid3_size))
        elif self.policy_size == 'large':
            logger.log('Using large structure')
            hid1_size = self.obs_dim * self.hid1_mult
            hid3_size = self.act_dim * 10
            hid2_size = int(np.sqrt(hid1_size * hid3_size))
        else:
            raise NotImplementedError

        # heuristic to set learning rate based on NN size
        self.lr = 9e-4 / np.sqrt(hid2_size)  # 9e-4 empirically determined

        # 3 hidden layers with tanh activations
        out = tf.layers.dense(self.obs_ph, hid1_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / self.obs_dim)),
                              name="h1")
        out = tf.layers.dense(out, hid2_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / hid1_size)),
                              name="h2")
        out = tf.layers.dense(out, hid3_size, tf.tanh,
                              kernel_initializer=tf.random_normal_initializer(
                                  stddev=np.sqrt(1 / hid2_size)),
                              name="h3")
        self.means = tf.layers.dense(out, self.act_dim,
                                     kernel_initializer=tf.random_normal_initializer(
                                         stddev=np.sqrt(1 / hid3_size)),
                                     name="means")

        logvar_speed = (10 * hid3_size) // 48
        log_vars = tf.get_variable('logvars', (logvar_speed, self.act_dim),
                                   tf.float32,
                                   tf.constant_initializer(0.0))
        self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar

    self.policy_nn_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='policy_nn')

    logger.log('Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, '
               'logvar_speed: {}'.format(hid1_size, hid2_size, hid3_size,
                                         self.lr, logvar_speed))
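# --- Illustrative note (not part of the original source) -------------------
# For the 'small' setting above the sizes reduce to hid1 = obs_dim,
# hid3 = act_dim, hid2 = int(sqrt(hid1 * hid3)), and the policy learning rate
# is 9e-4 / sqrt(hid2). As a worked example (values are an assumption, e.g.
# Hopper with the extra time-step feature): obs_dim = 12, act_dim = 3 gives
# hid1 = 12, hid3 = 3, hid2 = int(sqrt(36)) = 6, lr ≈ 9e-4 / 2.449 ≈ 3.7e-4.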
def eval_models(env_name, num_episodes,
                gamma, lam, kl_targ,
                coef, use_lr_adjust,
                ada_kl_penalty, seed,
                epochs, phi_epochs,
                max_timesteps, reg_scale,
                phi_lr, phi_hs,
                policy_size, phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    # scaler = Scaler(obs_dim)
    logger.log("loading scaler")
    with open('models/scaler/scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs,
                    phi_epochs, policy_size=policy_size,
                    phi_hidden_sizes=phi_hs, reg_scale=reg_scale,
                    lr_phi=phi_lr, phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)

    # whether to load the value function baseline or train it from scratch;
    # no big impact on the Stein estimator
    load_v = False
    if load_v:
        val_func.load_val_model(load_dir)

    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps,
                                             mode=load_model)
    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))
    episode += len(trajectories)

    # Split data into validation and training data
    random.shuffle(trajectories)
    t_trajectories = trajectories[:int(len(trajectories) / 2)]
    v_trajectories = trajectories[int(len(trajectories) / 2):]

    # whether to refit the value function baseline once more before
    # evaluating; no big impact on the Stein estimator
    refit_v = True
    if refit_v:
        tt_trajectories = copy.deepcopy(t_trajectories)
        add_value(tt_trajectories, val_func)
        add_disc_sum_rew(tt_trajectories, gamma)
        add_gae(tt_trajectories, gamma, lam)
        tt_observes, tt_actions, tt_advantages, tt_disc_sum_rew = build_train_set(
            tt_trajectories)
        logger.log("refit value function baseline")
        val_func.fit(tt_observes, tt_disc_sum_rew)  # update value function
        logger.log("done")

    # build training data after refit v
    add_value(t_trajectories, val_func)
    add_disc_sum_rew(t_trajectories, gamma)
    add_gae(t_trajectories, gamma, lam)
    t_observes, t_actions, t_advantages, t_disc_sum_rew = build_train_set(
        t_trajectories)

    # build validation data after refit v
    add_value(v_trajectories, val_func)
    add_disc_sum_rew(v_trajectories, gamma)
    add_gae(v_trajectories, gamma, lam)
    v_observes, v_actions, v_advantages, v_disc_sum_rew = build_train_set(
        v_trajectories)

    sub_folder = "max_timesteps=%s_eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        max_timesteps, env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)  # makedirs: the parent eval_data directory may not exist yet

    # save original (Monte Carlo) gradient
    mc_grad_info = policy.get_batch_gradient(v_observes,
                                             v_actions, v_advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    d = Dataset(dict(ob=t_observes, ac=t_actions,
                     atarg=t_advantages, vtarg=t_disc_sum_rew),
                shuffle=True)
    for _ in range(phi_epochs):  # optim_epochs
        for batch in d.iterate_once(128):  # optim_batchsize
            policy.update(load_model, batch['ob'], batch['ac'],
                          batch['atarg'], use_lr_adjust,
                          ada_kl_penalty, c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(v_observes,
                                                v_actions, v_advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
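# --- Illustrative sketch (not part of the original source) -----------------
# eval_models() writes one pickle per estimator (plain Monte Carlo vs. Stein
# control variate) so that their gradient statistics can be compared offline.
# A minimal loader, assuming only that each file holds the dict returned by
# policy.get_batch_gradient() plus the 'traj_lens' entry added above; the
# file names below are hypothetical examples for num_episodes = 50.
import pickle
import numpy as np

def load_grad_info(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)

mc_info = load_grad_info('mc_num_episode=50.pkl')
stein_info = load_grad_info('stein_num_episode=50.pkl')
print('avg trajectory length:', np.mean(mc_info['traj_lens']))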
def _loss_train_op(self):
    # get Phi function and its derivatives
    if self.type == 'stein':
        phi_value, phi_act_g = self.phi(self.obs_ph, self.act_ph, reuse=False)
    elif self.type == 'state':
        phi_value, phi_act_g = self.phi(self.obs_ph, reuse=False)
    self.phi_value = phi_value
    self.phi_act_g = phi_act_g
    self.phi_nn_vars = self.phi.phi_vars

    ll_mean_g = 1 / tf.exp(self.log_vars) * (self.act_ph - self.means)
    ll_log_vars_g = -1 / 2 * (1 / tf.exp(self.log_vars)
                              - 1 / tf.exp(self.log_vars) *
                              (self.act_ph - self.means) *
                              (self.act_ph - self.means) *
                              1 / tf.exp(self.log_vars))

    self.phi_value.set_shape((None,))

    log_vars_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
        * (ll_log_vars_g * tf.expand_dims(
            self.advantages_ph - self.c_ph * self.phi_value, 1)
           + 1 / 2 * self.c_ph * ll_mean_g * self.phi_act_g)

    means_inner = tf.expand_dims(tf.exp(self.logp - self.logp_old), 1) \
        * (ll_mean_g * tf.expand_dims(
            self.advantages_ph - self.c_ph * self.phi_value, 1)
           + self.c_ph * self.phi_act_g)

    loss1_log_vars = -tf.reduce_mean(
        tf.stop_gradient(log_vars_inner) * tf.exp(self.log_vars))
    loss1_mean = -tf.reduce_mean(
        tf.stop_gradient(means_inner) * self.means)

    loss1 = loss1_log_vars + loss1_mean
    loss2 = tf.reduce_mean(self.beta_ph * self.kl)
    loss3 = self.eta_ph * tf.square(
        tf.maximum(0.0, self.kl - 2.0 * self.kl_targ))

    self.loss = loss1 + loss2 + loss3
    optimizer = tf.train.AdamOptimizer(self.lr_ph)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=self.policy_nn_vars)

    # phi loss train op
    if self.phi_obj == 'MinVar':
        means_mse = tf.reduce_sum(
            tf.reduce_mean(
                tf.square(means_inner -
                          tf.reduce_mean(means_inner, axis=0)),
                axis=0))
        logstd_vars_mse = tf.reduce_sum(
            tf.reduce_mean(
                tf.square(log_vars_inner -
                          tf.reduce_mean(log_vars_inner, axis=0)),
                axis=0))

        gradient = tf.concat([means_inner, log_vars_inner], axis=1)
        est_A = tf.gather(gradient,
                          tf.range(0, tf.shape(gradient)[0] // 2))
        est_B = tf.gather(gradient,
                          tf.range(tf.shape(gradient)[0] // 2,
                                   tf.shape(gradient)[0]))

        # calculate loss
        est_var = tf.reduce_sum(
            tf.square(tf.reduce_mean(est_A, axis=0) -
                      tf.reduce_mean(est_B, axis=0)))

    if self.reg_scale > 0.:
        reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        reg_term = tf.contrib.layers.apply_regularization(
            self.phi.kernel_regularizer, reg_variables)
        for var in reg_variables:
            logger.log("regularized, ", var.name, var.shape)
    else:
        reg_term = 0.

    if self.phi_obj == 'FitQ':
        self.phi_loss = tf.reduce_mean(
            tf.square(self.advantages_ph - self.phi_value),
            axis=0) + reg_term
        logger.log('phi with FitQ as objective function')
    elif self.phi_obj == 'MinVar':
        self.phi_loss = means_mse + logstd_vars_mse + reg_term
        logger.log('phi with MinVar as objective function')
    else:
        raise NotImplementedError

    phi_optimizer = tf.train.AdamOptimizer(self.lr_phi_ph)
    self.phi_train_op = phi_optimizer.minimize(self.phi_loss,
                                               var_list=self.phi_nn_vars)

    self.means_inner = means_inner
    self.log_vars_inner = log_vars_inner
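# --- Illustrative sketch (not part of the original source) -----------------
# The 'MinVar' branch above trains phi to minimize the empirical variance of
# the per-sample gradient terms: for each gradient coordinate it takes the
# variance across the batch and sums over coordinates. A NumPy restatement of
# that objective (the function name is an assumption for illustration):
import numpy as np

def minvar_phi_loss(means_inner, log_vars_inner):
    """Sum over coordinates of the per-coordinate batch variance."""
    g = np.concatenate([means_inner, log_vars_inner], axis=1)  # (N, 2*act_dim)
    return np.sum(np.mean((g - g.mean(axis=0)) ** 2, axis=0))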
def train_models(env_name, num_episodes,
                 gamma, lam, kl_targ,
                 coef, use_lr_adjust,
                 ada_kl_penalty, seed,
                 epochs, phi_epochs,
                 max_timesteps, reg_scale,
                 phi_lr, phi_hs,
                 policy_size, phi_obj, load_model, type):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs,
                    phi_epochs, policy_size=policy_size,
                    phi_hidden_sizes=phi_hs, reg_scale=reg_scale,
                    lr_phi=phi_lr, phi_obj=phi_obj, type=type)

    run_policy(env, policy, scaler, num_episodes,
               max_timesteps=max_timesteps, mode=load_model)  # run a few to init scaler

    episode = 0
    for i in range(2000):
        print("sampling and training at %s iteration\n" % (i))
        trajectories, traj_len_list = run_policy(env, policy, scaler,
                                                 num_episodes,
                                                 max_timesteps=max_timesteps,
                                                 mode=load_model)
        num_traj = len(trajectories)
        episode += len(trajectories)

        add_value(trajectories, val_func)
        add_disc_sum_rew(trajectories, gamma)
        add_gae(trajectories, gamma, lam)

        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

        policy.update(load_model, observes, actions, advantages,
                      use_lr_adjust, ada_kl_penalty, c=0.)  # update policy
        val_func.fit(observes, disc_sum_rew)

    # Save models
    policy.save_policy()
    val_func.save_val_func()

    refine_scaler = False
    if refine_scaler == True:
        run_policy(env, policy, scaler, num_episodes,
                   max_timesteps=max_timesteps,
                   mode=load_model)  # run a few to refine scaler
    with open('models/scaler/scaler.pkl', 'wb') as output:
        pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
    logger.log("saved model")
def eval_models(env_name, num_episodes,
                gamma, lam, kl_targ,
                coef, use_lr_adjust,
                ada_kl_penalty, seed,
                epochs, phi_epochs,
                max_timesteps, reg_scale,
                phi_lr, phi_hs,
                policy_size, phi_obj, load_model):

    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ, epochs,
                    phi_epochs, policy_size=policy_size,
                    phi_hidden_sizes=phi_hs, reg_scale=reg_scale,
                    lr_phi=phi_lr, phi_obj=phi_obj)

    logger.log("loading model")
    load_dir = "models/"
    policy.load_model(load_dir)
    val_func.load_val_model(load_dir)

    run_policy(env, policy, scaler, num_episodes,
               max_timesteps=max_timesteps)

    episode = 0
    trajectories, traj_len_list = run_policy(env, policy, scaler,
                                             num_episodes,
                                             max_timesteps=max_timesteps)
    num_traj = len(trajectories)
    logger.log("Avg Length %d total Length %d" % (
        np.mean(traj_len_list),
        np.sum(traj_len_list)))
    episode += len(trajectories)

    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma)
    add_gae(trajectories, gamma, lam)

    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)

    sub_folder = "eval_data/%s_%s_data_seed=%d_max-steps=%d" % (
        env_name, phi_obj, seed, max_timesteps)
    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)  # makedirs: the parent eval_data directory may not exist yet

    # save original (Monte Carlo) gradient
    mc_grad_info = policy.get_batch_gradient(observes, actions, advantages, c=0.)
    mc_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/mc_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(mc_grad_info, fp)

    policy.update(load_model, observes, actions, advantages,
                  use_lr_adjust, ada_kl_penalty, c=1)  # update policy

    stein_grad_info = policy.get_batch_gradient(observes,
                                                actions, advantages, c=1.)
    stein_grad_info['traj_lens'] = traj_len_list
    with open(sub_folder + '/stein_num_episode=%d.pkl' % (num_episodes), 'wb') as fp:
        pickle.dump(stein_grad_info, fp)
def main(env_name, num_iterations,
         gamma, lam, kl_targ,
         batch_size, hid1_mult,
         policy_logvar, coef,
         use_lr_adjust, ada_kl_penalty, seed,
         epochs, phi_epochs,
         max_timesteps, reg_scale,
         phi_lr, phi_hs,
         policy_size, phi_obj):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_iterations: maximum number of iterations to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimation
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        coef: coefficient of the Stein control variate
        use_lr_adjust: whether to adjust the learning rate based on KL
        ada_kl_penalty: whether to adapt the KL penalty coefficient
        max_timesteps: maximum time steps per trajectory
        reg_scale: regularization coefficient
        policy_size: policy network size
        phi_obj: FitQ or MinVar
    """
    env, obs_dim, act_dim = init_gym(env_name)
    set_global_seeds(seed)
    env.seed(seed)
    env._max_episode_steps = max_timesteps
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
    aigym_path = os.path.join('log-files/', env_name, now)
    env = wrappers.Monitor(env, aigym_path, force=True, video_callable=False)

    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ,
                    hid1_mult, policy_logvar,
                    epochs, phi_epochs,
                    policy_size=policy_size,
                    phi_hidden_sizes=phi_hs,
                    c_ph=coef,
                    reg_scale=reg_scale,
                    lr_phi=phi_lr,
                    phi_obj=phi_obj)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler,
               batch_size=1000, max_timesteps=max_timesteps)

    for _ in range(num_iterations):
        logger.log("\n#Training Iter %d" % (_))
        logger.log("Draw Samples..")

        trajectories = run_policy(env, policy, scaler,
                                  batch_size=batch_size,
                                  max_timesteps=max_timesteps)

        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage

        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(
            trajectories)

        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew)

        logger.log("Starting Training...")
        policy.update(observes, actions, advantages,
                      use_lr_adjust, ada_kl_penalty)  # update policy
        val_func.fit(observes, disc_sum_rew)  # update value function

        logger.log('--------------------------------\n')

    policy.close_sess()
    val_func.close_sess()
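# --- Illustrative sketch (not part of the original source) -----------------
# main() is the combined training entry point (policy, value function, phi).
# A minimal invocation; every hyperparameter value below, and the phi_hs
# format, is an assumption made for illustration rather than the repository's
# actual defaults.
if __name__ == '__main__':
    main(env_name='Hopper-v1', num_iterations=1000, gamma=0.995, lam=0.98,
         kl_targ=0.003, batch_size=20, hid1_mult=10, policy_logvar=-1.0,
         coef=1.0, use_lr_adjust=True, ada_kl_penalty=True, seed=0,
         epochs=20, phi_epochs=500, max_timesteps=1000, reg_scale=0.0,
         phi_lr=1e-4, phi_hs='100x100', policy_size='large', phi_obj='MinVar')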