def run_policy(env, policy, scaler, num_episodes, max_timesteps, mode):
    """ Run the policy for num_episodes episodes and collect trajectories.

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        scaler: scaler object, used to scale/offset each observation dimension
            to a similar range
        num_episodes: number of episodes to run
        max_timesteps: max timesteps per episode to run
        mode: 'save' updates the scaler's running statistics (training);
            any other value leaves the scaler unchanged (evaluation)

    Returns: (list of trajectory dictionaries, list of episode lengths)
    """
    total_steps = 0
    trajectories = []
    traj_len_list = []
    for itr in range(num_episodes):
        observes, actions, rewards, unscaled_obs = run_episode(
            env, policy, scaler, max_timesteps=max_timesteps)
        total_steps += observes.shape[0]
        traj_len_list.append(len(observes))
        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        trajectories.append(trajectory)

    unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
    if mode == 'save':
        # Only update the scaler while training the policy; keeping it fixed
        # during evaluation avoids biasing the observation statistics.
        scaler.update(unscaled)

    logger.record_dicts({
        "_MeanReward": np.mean([t['rewards'].sum() for t in trajectories]),
        'Steps': total_steps,
    })

    return trajectories, traj_len_list
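# Usage sketch (illustrative only; `env`, `policy`, and `scaler` are the objects
# constructed elsewhere in this repo, and the mode string 'restore' is an
# assumed evaluation value, the code above only distinguishes 'save'):
#
#     trajectories, traj_lens = run_policy(env, policy, scaler,
#                                          num_episodes=20, max_timesteps=1000,
#                                          mode='save')     # training: scaler updated
#     eval_trajs, eval_lens = run_policy(env, policy, scaler,
#                                        num_episodes=5, max_timesteps=1000,
#                                        mode='restore')    # evaluation: scaler frozen
#     mean_len = np.mean(traj_lens)   # average episode length in this batch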
def update(self, observes, actions, advantages, use_lr_adjust, ada_kl_penalty):
    """ Update policy based on observations, actions and advantages

    Args:
        observes: observations, shape = (N, obs_dim)
        actions: actions, shape = (N, act_dim)
        advantages: advantages, shape = (N,)
        use_lr_adjust: if True, adapt the learning rate multiplier with beta
        ada_kl_penalty: if True, adapt the KL penalty coefficient beta
    """
    feed_dict = {self.obs_ph: observes,
                 self.act_ph: actions,
                 self.advantages_ph: advantages,
                 self.beta_ph: self.beta,
                 self.eta_ph: self.eta,
                 self.lr_ph: self.lr * self.lr_multiplier,
                 self.lr_phi_ph: self.lr_phi}
    old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars],
                                                  feed_dict)
    feed_dict[self.old_log_vars_ph] = old_log_vars_np
    feed_dict[self.old_means_ph] = old_means_np
    loss, kl, entropy = 0, 0, 0

    if self.c_ph == 1.:
        # Update phi function & policy network
        logger.log("Training Phi for %d epochs" % self.phi_epochs)
        for _ in progressbar(range(self.phi_epochs), "Train Phi:", 25):
            self.sess.run(self.phi_train_op, feed_dict)

        phi_loss = self.sess.run(self.phi_loss, feed_dict)
        logger.record_tabular("Phi_loss", phi_loss)

    # Training policy
    logger.log("Training Policy for %d epochs" % self.epochs)
    for _ in progressbar(range(self.epochs), "Train Policy", 25):
        self.sess.run(self.train_op, feed_dict)
        loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy],
                                          feed_dict)
        if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
            break

    if ada_kl_penalty:
        if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
            self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
            if use_lr_adjust:
                if self.beta > 30 and self.lr_multiplier > 0.1:
                    self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2:
            self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
            if use_lr_adjust:
                if self.beta < (1 / 30) and self.lr_multiplier < 10:
                    self.lr_multiplier *= 1.5

    logger.record_dicts({
        'PolicyLoss': loss,
        'PolicyEntropy': entropy,
        'KL': kl,
        'Beta': self.beta,
        '_lr_multiplier': self.lr_multiplier})
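# A standalone sketch of the adaptive KL-penalty servo used in update() above.
# It is not part of this repo; the function name and return convention are
# illustrative only. It mirrors the logic: if the measured KL overshoots the
# target by 2x, the penalty coefficient beta is increased (capped at 35) and the
# learning-rate multiplier is optionally decreased; if KL undershoots by 2x,
# beta is decreased (floored at 1/35) and the multiplier optionally increased.
import numpy as np

def adapt_kl_penalty(kl, kl_targ, beta, lr_multiplier, use_lr_adjust=True):
    """Return updated (beta, lr_multiplier) after one policy update."""
    if kl > kl_targ * 2:  # KL too large: strengthen the penalty
        beta = np.minimum(35, 1.5 * beta)
        if use_lr_adjust and beta > 30 and lr_multiplier > 0.1:
            lr_multiplier /= 1.5
    elif kl < kl_targ / 2:  # KL too small: relax the penalty
        beta = np.maximum(1 / 35, beta / 1.5)
        if use_lr_adjust and beta < (1 / 30) and lr_multiplier < 10:
            lr_multiplier *= 1.5
    return beta, lr_multiplier

# Example: a KL of 0.02 against a target of 0.003 raises beta and, once beta
# has saturated near its upper clip, lowers the learning-rate multiplier.
# beta, lr_mult = adapt_kl_penalty(kl=0.02, kl_targ=0.003, beta=31.0, lr_multiplier=1.0)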
def update(self, load_policy, observes, actions, advantages,
           use_lr_adjust, ada_kl_penalty, c=1):
    """ Update the phi network and, when training, the policy network.

    Args:
        load_policy: 'save' trains both phi and the policy; any other value
            trains phi only (e.g. when evaluating a restored policy)
        observes: observations, shape = (N, obs_dim)
        actions: actions, shape = (N, act_dim)
        advantages: advantages, shape = (N,)
        use_lr_adjust: if True, adapt the learning rate multiplier with beta
        ada_kl_penalty: if True, adapt the KL penalty coefficient beta
        c: control variate coefficient fed into c_ph
    """
    feed_dict = {self.obs_ph: observes,
                 self.act_ph: actions,
                 self.advantages_ph: advantages,
                 self.beta_ph: self.beta,
                 self.eta_ph: self.eta,
                 self.lr_ph: self.lr * self.lr_multiplier,
                 self.lr_phi_ph: self.lr_phi,
                 self.c_ph: c}
    old_means_np, old_log_vars_np = self.sess.run([self.means, self.log_vars],
                                                  feed_dict)
    feed_dict[self.old_log_vars_ph] = old_log_vars_np
    feed_dict[self.old_means_ph] = old_means_np
    loss, kl, entropy = 0, 0, 0

    # Train the phi (control variate) network
    for _ in range(self.phi_epochs):
        self.sess.run(self.phi_train_op, feed_dict)

    if load_policy == 'save':
        # Train the policy network with early stopping on the KL divergence
        for e in range(self.epochs):
            self.sess.run(self.train_op, feed_dict)
            loss, kl, entropy = self.sess.run([self.loss, self.kl, self.entropy],
                                              feed_dict)
            if kl > self.kl_targ * 4:
                break

    if ada_kl_penalty:
        if kl > self.kl_targ * 2:  # servo beta to reach D_KL target
            self.beta = np.minimum(35, 1.5 * self.beta)  # max clip beta
            if use_lr_adjust:
                if self.beta > 30 and self.lr_multiplier > 0.1:
                    self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2:
            self.beta = np.maximum(1 / 35, self.beta / 1.5)  # min clip beta
            if use_lr_adjust:
                if self.beta < (1 / 30) and self.lr_multiplier < 10:
                    self.lr_multiplier *= 1.5

    logger.record_dicts({
        'PolicyLoss': loss,
        'PolicyEntropy': entropy,
        'KL': kl,
        'Beta': self.beta,
        '_lr_multiplier': self.lr_multiplier})
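# Hypothetical calling pattern for the update() above (the 'restore' string and
# the argument values are assumptions; the code only distinguishes 'save'):
#
#     # Training phase: fit phi, then take policy gradient steps with the
#     # adaptive KL penalty and learning-rate adjustment enabled.
#     policy.update('save', observes, actions, advantages,
#                   use_lr_adjust=True, ada_kl_penalty=True, c=1)
#
#     # Evaluating a restored policy: only the phi network is refit; the
#     # policy weights themselves are left untouched.
#     policy.update('restore', observes, actions, advantages,
#                   use_lr_adjust=False, ada_kl_penalty=False, c=1)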
def run_policy(env, policy, scaler, batch_size, max_timesteps):
    """ Run policy and collect data until at least batch_size timesteps are gathered

    Args:
        env: ai gym environment
        policy: policy object with sample() method
        scaler: scaler object, used to scale/offset each observation dimension
            to a similar range
        batch_size: minimum number of timesteps to collect per batch
        max_timesteps: max timesteps per episode to run

    Returns: list of trajectory dictionaries, list length = number of episodes
        'observes' : NumPy array of states from episode
        'actions' : NumPy array of actions from episode
        'rewards' : NumPy array of (un-discounted) rewards from episode
        'unscaled_obs' : NumPy array of unscaled states from episode
    """
    total_steps = 0
    trajectories = []
    while total_steps < batch_size:
        observes, actions, rewards, unscaled_obs = run_episode(
            env, policy, scaler, max_timesteps=max_timesteps)
        total_steps += observes.shape[0]
        trajectory = {
            'observes': observes,
            'actions': actions,
            'rewards': rewards,
            'unscaled_obs': unscaled_obs
        }
        trajectories.append(trajectory)

    unscaled = np.concatenate([t['unscaled_obs'] for t in trajectories])
    scaler.update(unscaled)  # update running statistics for scaling observations

    logger.record_dicts({
        "_MeanReward": np.mean([t['rewards'].sum() for t in trajectories]),
        'Steps': total_steps,
    })

    return trajectories
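# The scaler passed into run_policy() only needs an update() method that folds a
# batch of unscaled observations into running statistics. Below is a minimal,
# self-contained sketch of such an object (Welford/Chan parallel update); it is
# illustrative only and not the Scaler class actually used by this repo, whose
# scale/offset conventions may differ.
import numpy as np

class RunningObsScaler(object):
    """Tracks per-dimension mean and variance of observations."""

    def __init__(self, obs_dim):
        self.count = 0
        self.mean = np.zeros(obs_dim)
        self.m2 = np.zeros(obs_dim)  # sum of squared deviations from the mean

    def update(self, x):
        """Fold a batch of shape (N, obs_dim) into the running statistics."""
        batch_count = x.shape[0]
        batch_mean = x.mean(axis=0)
        batch_m2 = ((x - batch_mean) ** 2).sum(axis=0)
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean += delta * batch_count / total
        self.m2 += batch_m2 + delta ** 2 * self.count * batch_count / total
        self.count = total

    def get(self):
        """Return (scale, offset) for normalizing observations."""
        var = self.m2 / max(self.count - 1, 1)
        return 1.0 / (np.sqrt(var) + 0.1), self.mean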
def log_batch_stats(observes, actions, advantages, disc_sum_rew, episode):
    """ Log summary statistics of a batch of observations, actions, advantages
    and discounted returns. Note: the '_std_*' entries actually record
    variances (np.var), not standard deviations. """
    logger.record_dicts({
        '_mean_obs': np.mean(observes),
        '_min_obs': np.min(observes),
        '_max_obs': np.max(observes),
        '_mean_act': np.mean(actions),
        '_max_act': np.max(actions),
        '_std_act': np.mean(np.var(actions, axis=0)),
        '_mean_adv': np.mean(advantages),
        '_min_adv': np.min(advantages),
        '_max_adv': np.max(advantages),
        '_std_adv': np.var(advantages),
        '_mean_discrew': np.mean(disc_sum_rew),
        '_min_discrew': np.min(disc_sum_rew),
        '_max_discrew': np.max(disc_sum_rew),
        '_std_discrew': np.var(disc_sum_rew)})

    logger.dump_tabular()
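# Typical call site (a sketch only; the advantage/return preprocessing that
# produces the flat arrays below lives elsewhere in the training loop and is
# assumed here, not defined in this file):
#
#     observes = np.concatenate([t['observes'] for t in trajectories])
#     actions = np.concatenate([t['actions'] for t in trajectories])
#     log_batch_stats(observes, actions, advantages, disc_sum_rew, episode)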
def fit(self, x, y):
    """ Fit model to current data batch + previous data batch

    Args:
        x: features
        y: target
    """
    num_batches = max(x.shape[0] // 256, 1)
    batch_size = x.shape[0] // num_batches
    y_hat = self.predict(x)  # check explained variance prior to update
    old_exp_var = 1 - np.var(y - y_hat) / np.var(y)
    if self.replay_buffer_x is None:
        x_train, y_train = x, y
    else:
        x_train = np.concatenate([x, self.replay_buffer_x])
        y_train = np.concatenate([y, self.replay_buffer_y])
    self.replay_buffer_x = x
    self.replay_buffer_y = y
    for e in range(self.epochs):
        x_train, y_train = shuffle(x_train, y_train)
        for j in range(num_batches):
            start = j * batch_size
            end = (j + 1) * batch_size
            feed_dict = {
                self.obs_ph: x_train[start:end, :],
                self.val_ph: y_train[start:end]
            }
            _, l = self.sess.run([self.train_op, self.loss],
                                 feed_dict=feed_dict)

    y_hat = self.predict(x)
    loss = np.mean(np.square(y_hat - y))
    # explained variance after update; diagnose over-fitting of val func
    exp_var = 1 - np.var(y - y_hat) / np.var(y)

    logger.record_dicts({
        'VarFuncLoss': loss,
        'ExplainedVarNew': exp_var,
        'ExplainedVarOld': old_exp_var
    })
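# The ExplainedVar* diagnostics above follow the usual definition
# 1 - Var[y - y_hat] / Var[y]: 1.0 means the value function predicts the targets
# perfectly, 0.0 means it does no better than predicting the mean, and negative
# values mean it does worse. A tiny standalone check (illustrative only, the
# helper name is not part of this repo):
import numpy as np

def explained_variance(y_hat, y):
    """Fraction of the target variance captured by the predictions."""
    return 1.0 - np.var(y - y_hat) / np.var(y)

# Perfect predictions score exactly 1.0.
y = np.linspace(0.0, 10.0, 100)
assert np.isclose(explained_variance(y, y), 1.0)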