def _forward_policy(self, episodes, ratio=False):
    T = episodes.observations.size(0)
    values, log_probs, entropy = [], [], []
    if not self.use_clstm:
        hx = torch.zeros(self.num_workers, self.lstm_size).to(device=self.device)
    else:
        hx = torch.zeros(self.num_workers, self.lstm_size, 7, 7).to(device=self.device)
    for t in range(T):
        pi, v, hx = self.policy(episodes.observations[t], hx, episodes.embeds[t])
        values.append(v)
        entropy.append(pi.entropy())
        if ratio:
            log_probs.append(pi.log_prob(episodes.actions[t]) - episodes.logprobs[t])
        else:
            log_probs.append(pi.log_prob(episodes.actions[t]))

    log_probs = torch.stack(log_probs)
    values = torch.stack(values)
    entropy = torch.stack(entropy)

    advantages = episodes.gae(values, tau=self.tau)
    advantages = weighted_normalize(advantages, weights=episodes.mask)
    if log_probs.dim() > 2:
        log_probs = torch.sum(log_probs, dim=2)
    return log_probs, advantages, values, entropy
def compute_advantages(self, baseline, gae_lambda=1.0, normalize=True):
    # Compute the values based on the baseline
    values = baseline(self).detach()
    # Add an additional 0 at the end of values for
    # the estimation at the end of the episode
    values = F.pad(values * self.mask, (0, 0, 0, 1))

    # Compute the advantages based on the values
    deltas = self.rewards + self.gamma * values[1:] - values[:-1]
    self._advantages = torch.zeros_like(self.rewards)
    gae = torch.zeros((self.batch_size,), dtype=torch.float32)
    for i in range(len(self) - 1, -1, -1):
        gae = gae * self.gamma * gae_lambda + deltas[i]
        self._advantages[i] = gae

    # Normalize the advantages
    if normalize:
        self._advantages = weighted_normalize(self._advantages,
                                              lengths=self.lengths)

    # Once the advantages are computed, the returns are not necessary
    # anymore (only to compute the parameters of the baseline)
    del self._returns
    del self._mask

    return self.advantages
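# A minimal, self-contained sketch of the GAE recursion used above, on a toy
# 3-step episode (all numbers hypothetical, chosen for illustration):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + gamma * lambda * A_{t+1}, accumulated backwards in time.
import torch

rewards = torch.tensor([1.0, 0.0, 2.0])      # r_0 .. r_2
values = torch.tensor([0.5, 0.4, 0.3, 0.0])  # V(s_0) .. V(s_3), 0 padded at the end
gamma, lam = 0.99, 0.95

deltas = rewards + gamma * values[1:] - values[:-1]
advantages = torch.zeros(3)
gae = 0.0
for t in reversed(range(3)):
    gae = gae * gamma * lam + deltas[t]
    advantages[t] = gae
print(advantages)  # approx. tensor([2.3028, 1.4959, 1.7000])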
def test_weighted_normalize():
    lengths = [2, 3, 7, 5, 11]

    # Inputs
    inputs_np = np.random.rand(13, 5).astype(np.float32)

    # PyTorch
    inputs_th = torch.as_tensor(inputs_np)
    normalized_th = weighted_normalize(inputs_th, lengths=lengths)

    for i, length in enumerate(lengths):
        assert (normalized_th[length:, i] == 0.).all()
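# A minimal sketch of the helpers used throughout these snippets, assuming the
# conventions visible above: tensors are (T, B, ...), `weights` is a 0/1 mask
# of shape (T, B), and `lengths` gives per-episode lengths from which such a
# mask can be built. This is a sketch consistent with the test above, not the
# repository's exact implementation.
import torch

def lengths_to_mask(lengths, max_length):
    # mask[t, i] = 1 while t < lengths[i], else 0
    steps = torch.arange(max_length).unsqueeze(1)
    return (steps < torch.as_tensor(lengths).unsqueeze(0)).float()

def weighted_mean(tensor, dim=None, weights=None):
    if weights is None:
        return tensor.mean() if dim is None else tensor.mean(dim=dim)
    if dim is None:
        return (tensor * weights).sum() / weights.sum()
    return (tensor * weights).sum(dim=dim) / weights.sum(dim=dim)

def weighted_normalize(tensor, dim=None, weights=None, lengths=None, epsilon=1e-8):
    if weights is None and lengths is not None:
        weights = lengths_to_mask(lengths, tensor.size(0))
    mean = weighted_mean(tensor, dim=dim, weights=weights)
    # masking after centering leaves padded timesteps exactly zero
    out = (tensor - mean) * (1 if weights is None else weights)
    std = torch.sqrt(weighted_mean(out ** 2, dim=dim, weights=weights))
    return out / (std + epsilon)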
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        if self.usePPO:
            params, grad_norm = self.adapt_ppo(train_episodes)
        else:
            params = self.adapt(train_episodes)
        self.logger.info("in surrogate_loss")

        with torch.set_grad_enabled(old_pi is None):
            # the shared-critic head returns (pi, value); otherwise the policy
            # returns the distribution alone
            if self.baseline_type == 'critic shared':
                pi, _ = self.policy(valid_episodes.observations, params=params)
            else:
                pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            if self.baseline_type == 'linear':
                values = self.baseline(valid_episodes)
            elif self.baseline_type == 'critic separate':
                values = self.baseline(valid_episodes.observations)
            elif self.baseline_type == 'critic shared':
                _, values = self.policy(valid_episodes.observations, params=params)

            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages, weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
def val(args, sampler_val, policy, baseline, batch):
    start_time = time.time()
    from maml_rl.utils.torch_utils import weighted_normalize, weighted_mean

    tasks_val = sampler_val.sample_tasks()
    task_to_episodes = dict()
    for task in tasks_val:
        task_episodes = []
        sampler_val.reset_task(task)
        for i_episode in range(args.num_adapt_val + 1):
            if i_episode == 0:
                params = None
            episodes = sampler_val.sample(policy, params=params,
                                          gamma=args.gamma, device=args.device)

            # compute inner loss
            baseline.fit(episodes)
            values = baseline(episodes)
            advantages = episodes.gae(values, tau=args.tau)
            advantages = weighted_normalize(advantages, weights=episodes.mask)

            pi = policy(episodes.observations, params=params)
            log_probs = pi.log_prob(episodes.actions)
            if log_probs.dim() > 2:
                log_probs = torch.sum(log_probs, dim=2)
            entropy = pi.entropy().mean()
            loss = -weighted_mean(log_probs * advantages, dim=0,
                                  weights=episodes.mask) - args.entropy_coef_val * entropy

            fast_lr = args.fast_lr if i_episode == 0 else args.fast_lr_val_after_one
            # i_episode ranges over 0..num_adapt_val, so this is always true
            if i_episode <= args.num_adapt_val:
                params = policy.update_params(loss, step_size=fast_lr,
                                              first_order=True)

            task_episodes.append(episodes)
        task_to_episodes[str(task)] = task_episodes

    for i_episode in range(args.num_adapt_val + 1):
        returns = calculate_returns([
            task_episodes[i_episode].rewards
            for task_episodes in task_to_episodes.values()
        ])
        logger.logkv(f'val_return_avg_adapt{i_episode}', returns.mean().item())
        logger.logkv(f'val_return_std_adapt{i_episode}', returns.std().item())
    logger.logkv('val_time', time.time() - start_time)

    save_dir = os.path.join(args.log_dir, 'val')
    os.makedirs(save_dir, exist_ok=True)
    pickle.dump(task_to_episodes,
                open(os.path.join(save_dir, f'val_{batch}.pkl'), 'wb'))
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, action_dists, critic_losses = [], [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        policy_params, critic_params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            action_dist = self.policy(valid_episodes.observations,
                                      params=policy_params)
            action_dists.append(detach_distribution(action_dist))

            if old_pi is None:
                old_pi = detach_distribution(action_dist)

            values = self.critic(valid_episodes.observations,
                                 params=critic_params)
            advantages = valid_episodes.gae(values, tau=self.tau)
            value_loss = weighted_mean(advantages.pow(2), dim=0,
                                       weights=valid_episodes.mask)
            critic_losses.append(value_loss)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask,
                                            epsilon=1e-5)

            log_ratio = (action_dist.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages.detach(), dim=0,
                                  weights=valid_episodes.mask)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(action_dist, old_pi), dim=0,
                               weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            action_dists,
            torch.mean(torch.stack(critic_losses, dim=0)))
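# Note on the critic loss above: since the GAE advantage satisfies
# A_t = (lambda-return target) - V(s_t), the masked mean of A_t^2 is exactly a
# regression of V toward the lambda-return target. The same `advantages`
# tensor therefore serves both heads: squared and with gradients for the
# critic, normalized and detached for the actor.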
def surrogate_loss(self, episodes, old_pis=None): """ Using TRPO. old_pis are not None only when doing line search? How are old_pis used? Like the behavior policy in TRPO? How? """ losses, kls, pis = [], [], [] if old_pis is None: old_pis = [None] * len(episodes) for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis): # adapt our policy network to a new task params = self.adapt(train_episodes) # doing learning only when old_pi is None? with torch.set_grad_enabled(old_pi is None): pi = self.policy(valid_episodes.observations, params=params) # the set of policies adapted to each task pis.append(detach_distribution(pi)) if old_pi is None: old_pi = detach_distribution(pi) values = self.baseline(valid_episodes) advantages = valid_episodes.gae(values, tau=self.tau) advantages = weighted_normalize(advantages, weights=valid_episodes.mask) log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions)) if log_ratio.dim() > 2: log_ratio = torch.sum(log_ratio, dim=2) ratio = torch.exp(log_ratio) loss = -weighted_mean( ratio * advantages, dim=0, weights=valid_episodes.mask) losses.append(loss) mask = valid_episodes.mask if valid_episodes.actions.dim() > 2: mask = mask.unsqueeze(2) kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask) kls.append(kl) return (torch.mean(torch.stack(losses, dim=0)), torch.mean(torch.stack(kls, dim=0)), pis)
def inner_loss(self, episodes, params=None): """Compute the inner loss for the one-step gradient update. The inner loss is REINFORCE with baseline [2], computed on advantages estimated with Generalized Advantage Estimation (GAE, [3]). """ values = self.baseline(episodes) advantages = episodes.gae(values, tau=self.tau) advantages = weighted_normalize(advantages, weights=episodes.mask) pi = self.policy(episodes.observations, params=params) log_probs = pi.log_prob(episodes.actions) if log_probs.dim() > 2: log_probs = torch.sum(log_probs, dim=2) loss = -weighted_mean(log_probs * advantages, dim=0) return loss
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(old_pi is None):
            pi = self.policy(valid_episodes.observations, params=params)
            # detach the mu, scale parameters of distribution pi:
            # no gradients, no update to them
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            # initially 0; changes during the line search as pi changes
            # while old_pi stays fixed
            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            loss = -weighted_mean(ratio * advantages, dim=0,
                                  weights=valid_episodes.mask)
            # the weighted_mean loss is very small, on the order of 1e-8
            print('loss: ', loss)
            losses.append(loss)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask)
            kls.append(kl)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
def surrogate_loss(self, episodes, old_pis=None):
    losses, kls, pis = [], [], []
    if old_pis is None:
        old_pis = [None] * len(episodes)

    for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis):
        params = self.adapt(train_episodes)

        with torch.set_grad_enabled(True):
            pi = self.policy(valid_episodes.observations, params=params)
            pis.append(detach_distribution(pi))

            if old_pi is None:
                old_pi = detach_distribution(pi)

            values = self.baseline(valid_episodes)
            advantages = valid_episodes.gae(values, tau=self.tau)
            advantages = weighted_normalize(advantages,
                                            weights=valid_episodes.mask)

            log_ratio = (pi.log_prob(valid_episodes.actions)
                         - old_pi.log_prob(valid_episodes.actions))
            if log_ratio.dim() > 2:
                log_ratio = torch.sum(log_ratio, dim=2)
            ratio = torch.exp(log_ratio)

            # PPO clipped objective
            loss_clipped = ratio.clamp(1.0 - self.ppo_ratio,
                                       1.0 + self.ppo_ratio) * advantages
            loss = ratio * advantages
            loss = -torch.min(loss, loss_clipped)
            loss = weighted_mean(loss, dim=0, weights=valid_episodes.mask)

            mask = valid_episodes.mask
            if valid_episodes.actions.dim() > 2:
                mask = mask.unsqueeze(2)
            kl = weighted_mean(kl_divergence(old_pi, pi), dim=0, weights=mask)
            kls.append(kl)
            # small KL penalty keeps the update close to the sampling policy
            losses.append(loss + kl * 0.0005)

    return (torch.mean(torch.stack(losses, dim=0)),
            torch.mean(torch.stack(kls, dim=0)),
            pis)
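# A self-contained toy illustration (hypothetical numbers) of the clipping
# above: -min(ratio * A, clip(ratio) * A) removes the incentive to push the
# ratio outside [1 - eps, 1 + eps] for positive advantages.
import torch

eps = 0.2
ratio = torch.tensor([0.5, 1.0, 1.5])
adv = torch.tensor([1.0, 1.0, 1.0])

unclipped = ratio * adv
clipped = ratio.clamp(1.0 - eps, 1.0 + eps) * adv
loss = -torch.min(unclipped, clipped)
print(loss)  # tensor([-0.5000, -1.0000, -1.2000]): the 1.5 ratio is capped at 1.2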
def inner_loss(self, episodes, params=None): """Compute the inner loss for the one-step gradient update. The inner loss is REINFORCE with baseline [2], computed on advantages estimated with Generalized Advantage Estimation (GAE, [3]). The baseline is subtracted from the empirical return to reduce variance of the optimization. In here, a linear function as the baseline with a time-varying feature vector is used. """ values = self.baseline(episodes) advantages = episodes.gae(values, tau=self.tau) advantages = weighted_normalize(advantages, weights=episodes.mask) pi = self.policy(episodes.observations, params=params) log_probs = pi.log_prob(episodes.actions) if log_probs.dim() > 2: log_probs = torch.sum(log_probs, dim=2) loss = -weighted_mean(log_probs * advantages, dim=0, weights=episodes.mask) return loss
def inner_loss(self, episodes, l_params=None, h_params=None):
    """Compute the inner loss for the one-step gradient update. The inner
    loss is REINFORCE with baseline [2], computed on advantages estimated
    with Generalized Advantage Estimation (GAE, [3]).
    """
    values = self.baseline(episodes)
    advantages = episodes.gae(values, tau=self.tau)
    advantages = weighted_normalize(advantages, weights=episodes.mask)

    # The higher-level policy produces the latent-space actions (stored in
    # the episodes), from which the lower-level actions are derived. The
    # inner loss here is computed on the higher-level actions.
    pi_higher = self.h_policy(episodes.observations, params=h_params)

    # Calculate the log probability
    log_probs = pi_higher.log_prob(episodes.higher_level_actions)
    if log_probs.dim() > 2:
        log_probs = torch.sum(log_probs, dim=2)
    loss = -weighted_mean(log_probs * advantages, dim=0,
                          weights=episodes.mask)

    return loss
def surrogate_loss(self, episodes, old_pis=None): """Computes the surrogate loss in TRPO: (pi(a|s) / q(a|s)) * Q(s,a) in Eqn 14 Because the meta-loss tried to find theta that minimizes loss with phi, the loss is computed with valid episodes """ losses, kls, pis = [], [], [] if old_pis is None: old_pis = [None] * len(episodes) for (train_episodes, valid_episodes), old_pi in zip(episodes, old_pis): params = self.adapt(train_episodes) with torch.set_grad_enabled(old_pi is None): pi = self.policy(valid_episodes.observations, params=params) pis.append(detach_distribution(pi)) if old_pi is None: old_pi = detach_distribution(pi) values = self.baseline(valid_episodes) advantages = valid_episodes.gae(values, tau=self.tau) advantages = weighted_normalize(advantages, weights=valid_episodes.mask) log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions)) if log_ratio.dim() > 2: log_ratio = torch.sum(log_ratio, dim=2) ratio = torch.exp(log_ratio) # Convert back to ratio from log loss = -weighted_mean(ratio * advantages, dim=0, weights=valid_episodes.mask) losses.append(loss) mask = valid_episodes.mask if valid_episodes.actions.dim() > 2: mask = mask.unsqueeze(2) kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask) kls.append(kl) return ( torch.mean(torch.stack(losses, dim=0)), torch.mean(torch.stack(kls, dim=0)), pis)
def inner_loss(self, episodes, params=None): """Compute the inner loss for the one-step gradient update. The inner loss is REINFORCE with baseline [2], computed on advantages estimated with Generalized Advantage Estimation (GAE, [3]). https://pytorch.org/docs/0.3.1/distributions.html (except using advantag instead of rewards.) Implements eq 4. """ vf_loss = -1 loss = 0 if self.baseline_type == 'linear': values = self.baseline(episodes) elif self.baseline_type == 'critic separate': values = self.baseline(episodes.observations) # find value loss sum [(R-V(s))^2] R = episodes.returns.view([200, 20, 1]) vf_loss = (((values - R)**2).mean())**(1 / 2) #else: # pi,values = self.policy(episodes.observations) # pi,vi = self.policy(episodes.observations,params=params) # log_probs = pi.log_prob(values.size()) # loss = (((values - R) ** 2).mean()) ** (1 / 2) advantages = episodes.gae(values, tau=self.tau) advantages_unnorm = advantages sum_adv = torch.sum(advantages_unnorm).numpy() logging.info("unnormalized advantages: " + str(sum_adv)) logging.info("sum of returns:" + str(torch.sum(episodes.returns))) advantages = weighted_normalize(advantages, weights=episodes.mask) pi = self.policy(episodes.observations, params=params) log_probs = pi.log_prob(episodes.actions) if log_probs.dim() > 2: # sum over all the workers. log_probs = torch.sum(log_probs, dim=2) loss = loss - weighted_mean( log_probs * advantages, dim=0, weights=episodes.mask) logging.info("inner loss: " + str(loss)) return loss, vf_loss
def surrogate_loss(self, episodes, old_pis=None): """ Surrogate objective: E_r SmoothReLU( V_r^{adapted self.policy} - \max_{\pi \in self.policies[0:policy_idx - 1} V_r^\pi) V_r^{adapted self.policy} can be evaluated by valid_episodes in episodes \max_{\pi \in self.policies[0:policy_idx - 1} V_r^\pi is computed in self.values_of_optimized_policies :param episodes: [(episodes before adapting, episodes after adapting) for task in sampled tasks] :param old_pis: dummy parameter derived from super :return: mean of losses, mean of kls, pis """ losses, kls, pis = [], [], [] if old_pis is None: old_pis = [None] * len(episodes) for episode_index in range(len(episodes)): (train_episodes, valid_episodes) = episodes[episode_index] old_pi = old_pis[episode_index] if self.current_policy_idx == 0: dominance_correction = 1 else: difference_from_best_value = total_rewards( valid_episodes.rewards ) - self.values_of_optimized_policies[episode_index] dominance_correction = 1 - 1 / ( 1 + math.exp(difference_from_best_value)) params = self.adapt(train_episodes) with torch.set_grad_enabled(old_pi is None): pi = self.policy(valid_episodes.observations, params=params) pis.append(detach_distribution(pi)) if old_pi is None: old_pi = detach_distribution(pi) values = self.baseline(valid_episodes) advantages = valid_episodes.gae(values, tau=self.tau) advantages = weighted_normalize(advantages, weights=valid_episodes.mask) log_ratio = (pi.log_prob(valid_episodes.actions) - old_pi.log_prob(valid_episodes.actions)) if log_ratio.dim() > 2: log_ratio = torch.sum(log_ratio, dim=2) ratio = torch.exp(log_ratio) loss = -dominance_correction * weighted_mean( ratio * advantages, dim=0, weights=valid_episodes.mask) losses.append(loss) mask = valid_episodes.mask if valid_episodes.actions.dim() > 2: mask = mask.unsqueeze(2) kl = weighted_mean(kl_divergence(pi, old_pi), dim=0, weights=mask) kls.append(kl) if len(losses) == 0 or len(kls) == 0: # signal outside that no losses. avoiding taking mean of empty tensors.. return (None, None, pis) else: return (torch.mean(torch.stack(losses, dim=0)), torch.mean(torch.stack(kls, dim=0)), pis)
def main(args):
    np.random.seed(RANDOM_SEED)

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sampler = MrcBatchSampler(args.env_name, batch_size=args.fast_batch_size,
                              train_folder=TRAIN_TRACES)
    policy = ActorNet(input_size=[S_INFO, S_LEN], output_size=A_DIM,
                      learning_rate=ACTOR_LR_RATE)
    baseline = CriticNet(input_size=[S_INFO, S_LEN], output_size=A_DIM,
                         learning_rate=CRITIC_LR_RATE)
    # metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
    #                           fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        print("===================================================================")
        print("=====================Now epoch: ", batch, "========================")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        sampler.reset_task(0.5)
        episodes = sampler.sample(policy, gamma=args.gamma, device=args.device)

        rewards = np.array(episodes.rewards)
        rewards = rewards.sum(0)
        mean_reward = rewards.mean()
        entropys = np.array(episodes.entropys).sum(0)
        mean_entropy = entropys.mean()

        values = baseline(episodes.observations)
        advantages = episodes.gae(values, tau=1)
        advantages = weighted_normalize(advantages, weights=episodes.mask)
        advantages = np.array(advantages).sum(0)
        mean_ad = advantages.mean()
        print(" mean Ad: ", mean_ad, " mean reward:", mean_reward,
              " mean_entropy: ", mean_entropy)

        # episodes = metalearner.sample(tasks, first_order=args.first_order)
        # metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
        #                  cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
        #                  ls_backtrack_ratio=args.ls_backtrack_ratio)

        baseline.fit(episodes, CRITIC_LR_RATE)
        policy.fit(episodes, baseline, ACTOR_LR_RATE)

        if not batch % 4:
            noMetaTest(policy, baseline, batch)
        # metaTest(policy, baseline, batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
        with open(os.path.join(save_folder, 'baseline-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(baseline.state_dict(), f)
def compute_ng_gradient(self, episodes, max_kl=1e-3, cg_iters=20,
                        cg_damping=1e-2, ls_max_steps=10,
                        ls_backtrack_ratio=0.5):
    ng_grads = []
    for train_episodes, valid_episodes in episodes:
        params, step_size, step = self.adapt(train_episodes)

        # compute grad = \nabla_x J^{lvc}(x) at x = \theta - \eta U(\theta)
        pi = self.policy(valid_episodes.observations, params=params)
        pi_detach = detach_distribution(pi)
        values = self.baseline(valid_episodes)
        advantages = valid_episodes.gae(values, tau=self.tau)
        advantages = weighted_normalize(advantages,
                                        weights=valid_episodes.mask)
        log_ratio = (pi.log_prob(valid_episodes.actions)
                     - pi_detach.log_prob(valid_episodes.actions))
        if log_ratio.dim() > 2:
            log_ratio = torch.sum(log_ratio, dim=2)
        ratio = torch.exp(log_ratio)
        loss = -weighted_mean(ratio * advantages, dim=0,
                              weights=valid_episodes.mask)
        ng_grad_0 = torch.autograd.grad(loss, self.policy.parameters())  # no create_graph
        ng_grad_0 = parameters_to_vector(ng_grad_0)

        # compute the inverse of the Fisher matrix at x = \theta times grad
        # with conjugate gradient
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_inv_grad = conjugate_gradient(hessian_vector_product, ng_grad_0,
                                        cg_iters=cg_iters)

        # compute ng_grad_1 = \nabla^2 J^{lvc}(x) at x = \theta times F_inv_grad;
        # create the graph for the higher-order derivative
        # self.baseline.fit(train_episodes)
        loss = self.inner_loss(train_episodes)
        grad = torch.autograd.grad(loss, self.policy.parameters(),
                                   create_graph=True)
        grad = parameters_to_vector(grad)
        grad_F_inv_grad = torch.dot(grad, F_inv_grad.detach())
        ng_grad_1 = torch.autograd.grad(grad_F_inv_grad,
                                        self.policy.parameters())
        ng_grad_1 = parameters_to_vector(ng_grad_1)

        # compute ng_grad_2 = the Jacobian of {F(x) U(\theta)} at x = \theta
        # times F_inv_grad
        hessian_vector_product = self.hessian_vector_product_ng(
            train_episodes, damping=cg_damping)
        F_U = hessian_vector_product(step)
        ng_grad_2 = torch.autograd.grad(torch.dot(F_U, F_inv_grad.detach()),
                                        self.policy.parameters())
        ng_grad_2 = parameters_to_vector(ng_grad_2)

        ng_grad = ng_grad_0 - step_size * (ng_grad_1 + ng_grad_2)
        ng_grad = parameters_to_vector(ng_grad)
        ng_grads.append(ng_grad.view(len(ng_grad), 1))

    return torch.mean(torch.stack(ng_grads, dim=1), dim=[1, 2])
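# A minimal sketch of the conjugate_gradient helper used above: it solves
# F x = b for a positive-definite F given only a function computing F @ v
# (here, the damped Fisher-vector product). Standard CG; not necessarily the
# repository's exact implementation.
import torch

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()  # residual b - A x, with x = 0
    p = b.clone()  # search direction
    rdotr = torch.dot(r, r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x.detach()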
def inner_loss_ppo_noUpdate(self, episodes, first_order, params=None,
                            ent_coef=0, vf_coef=0, nenvs=1):
    """Compute the inner loss for the one-step gradient update.

    The inner loss is PPO with clipped ratio new_pi / old_pi. The clip range
    could be made adaptive. nenvs = number of workers; nsteps is defined by
    the environment.

    episodes has shape [num of steps, num of episodes, obs_space].
    TODO: change the advantage calculation to use the critic.
    """
    losses = []
    self.logger.info("cliprange: " + str(self.cliprange) +
                     "; noptepochs: " + str(self.noptepochs) +
                     "; nminibatches: " + str(self.nminibatches) +
                     "; ppo_lr: " + str(self.ppo_lr))

    # Save the old parameters
    old_policy = copy.deepcopy(self.policy)
    old_params = parameters_to_vector(old_policy.parameters())

    # Take mini-batches of the sampled examples to evaluate the loss
    # several times.
    nepisodes = episodes.observations.shape[1]
    nsteps = episodes.observations.shape[0]
    nbatch = nenvs * nsteps * nepisodes
    nbatch_train = nbatch // self.nminibatches
    mblossvals = []

    # Flatten the episodes to [steps, observations]
    episodes_flat = BatchEpisodes(batch_size=nbatch)
    i = 0
    for ep in range(nepisodes):
        for step in range(nsteps):
            episodes_flat.append([episodes.observations[step][ep].numpy()],
                                 [episodes.actions[step][ep].numpy()],
                                 [episodes.returns[step][ep].numpy()],
                                 (i, ))
            i += 1

    inds = np.arange(nbatch)
    # For the case with a linear baseline.
    vf_loss = -1
    for epoch in range(self.noptepochs):
        # Randomize the indexes
        # np.random.shuffle(inds)
        mb_vf_loss = torch.zeros(1)
        grad_norm = []
        # 0 to batch_size with batch_train_size step
        for start in range(0, nbatch, nbatch_train):
            mb_obs, mb_returns, mb_masks, mb_actions = [], [], [], []
            mb_episodes = BatchEpisodes(batch_size=nbatch_train)
            end = start + nbatch_train
            mbinds = inds[start:end]
            for i in range(len(mbinds)):
                mb_obs.append(episodes_flat.observations[0][mbinds[i]].numpy())
                mb_returns.append(episodes_flat.returns[0][mbinds[i]].numpy())
                mb_masks.append(episodes_flat.mask[0][mbinds[i]].numpy())
                mb_actions.append(episodes_flat.actions[0][mbinds[i]].numpy())
                mb_episodes.append([mb_obs[i]], [mb_actions[i]],
                                   [mb_returns[i]], (i, ))

            if self.baseline_type == 'linear':
                values = self.baseline(mb_episodes)
            elif self.baseline_type == 'critic separate':
                values = self.baseline(mb_episodes.observations)
                # value loss: mean squared error between returns and values
                R = torch.FloatTensor(np.array(mb_returns))
                mb_vf_loss = (((values - R) ** 2).mean()) + mb_vf_loss

            advantages = mb_episodes.gae(values, tau=self.tau)
            advantages_unnorm = advantages
            advantages = weighted_normalize(advantages.type(torch.float32),
                                            weights=torch.ones(1, advantages.shape[1]))

            mb_returns_sum = np.sum(mb_returns)
            self.logger.info("iter: " + "epoch:" + str(epoch) +
                             "; mb:" + str(start / nbatch_train))
            self.logger.info("mb returns: " + str(mb_returns_sum))

            pi = self.policy(mb_episodes.observations)
            log_probs = pi.log_prob(mb_episodes.actions)

            # Reload the old policy.
            vector_to_parameters(old_params, old_policy.parameters())
            pi_old = old_policy(mb_episodes.observations)
            log_probs_old = pi_old.log_prob(mb_episodes.actions)
            if log_probs.dim() > 2:
                log_probs_old = torch.sum(log_probs_old, dim=2)
                log_probs = torch.sum(log_probs, dim=2)
            ratio = torch.exp(log_probs - log_probs_old)

            self.logger.info("max pi: ")
            self.logger.info(torch.max(pi.mean))
            for x in ratio[0][:10]:
                if x > 1E5 or x < 1E-5:
                    self.logger.info("ratio too large or too small.")
                    self.logger.info(ratio[0][:10])
            self.logger.info("policy ratio: ")
            self.logger.info(ratio[0][:10])

            # Loss function
            pg_losses = -advantages * ratio
            pg_losses2 = -advantages * torch.clamp(ratio,
                                                   1.0 - self.cliprange,
                                                   1.0 + self.cliprange)
            # Final PG loss
            pg_loss = weighted_mean(torch.max(pg_losses, pg_losses2),
                                    weights=torch.ones(1, advantages.shape[1]))
            self.logger.debug("policy mu weights: ")
            self.logger.debug(self.policy.mu.weight)

            sum_adv = torch.sum(advantages_unnorm).numpy()
            self.logger.info("unnormalized advantages: " + str(sum_adv))

            # Total loss
            loss = pg_loss
            self.logger.info("max_action: " + str(np.max(mb_actions)))
            self.logger.info("max_action index: " + str(np.argmax(mb_actions)))

            # Save the old parameters
            old_params = parameters_to_vector(self.policy.parameters())
            losses.append(loss)

    self.logger.info("inner loss for each mb and epoch: ")
    self.logger.info(mblossvals)
    return torch.mean(torch.stack(losses, dim=0))