def update_params(batch):
    rewards = torch.Tensor(batch.reward).to(device)
    masks = torch.Tensor(batch.mask).to(device)
    actions = torch.Tensor(np.concatenate(batch.action, 0)).to(device)
    states = torch.Tensor(batch.state).to(device)
    values = value_net(states)

    returns = torch.Tensor(actions.size(0), 1).to(device)
    deltas = torch.Tensor(actions.size(0), 1).to(device)
    advantages = torch.Tensor(actions.size(0), 1).to(device)

    # Generalized Advantage Estimation, computed backwards over the rollout;
    # masks zero out bootstrapping across episode boundaries.
    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = returns

    # Fit the value function with minibatch optimizer steps over a shuffled
    # permutation of the batch, one minibatch per vf_iters iteration.
    batch_size = math.ceil(states.shape[0] / args.vf_iters)
    idx = np.random.permutation(states.shape[0])
    for i in range(args.vf_iters):
        smp_idx = idx[i * batch_size:(i + 1) * batch_size]
        smp_states = states[smp_idx, :]
        smp_targets = targets[smp_idx, :]
        value_optimizer.zero_grad()
        value_loss = value_criterion(value_net(smp_states), smp_targets)
        value_loss.backward()
        value_optimizer.step()

    # Standardize advantages for a better-conditioned policy update.
    advantages = (advantages - advantages.mean()) / advantages.std()

    action_means, action_log_stds, action_stds = policy_net(states.cpu())
    fixed_log_prob = normal_log_density(actions.cpu(), action_means,
                                        action_log_stds, action_stds).data.clone()

    def get_loss():
        # Surrogate objective: importance-weighted advantages under the
        # current policy, relative to the fixed pre-update log-probabilities.
        action_means, action_log_stds, action_stds = policy_net(states.cpu())
        log_prob = normal_log_density(actions.cpu(), action_means,
                                      action_log_stds, action_stds)
        action_loss = -advantages.cpu() * torch.exp(log_prob - fixed_log_prob)
        return action_loss.mean()

    def get_kl():
        # KL divergence between the detached (old) and current Gaussian policies.
        mean1, log_std1, std1 = policy_net(states.cpu())

        mean0 = mean1.data
        log_std0 = log_std1.data
        std0 = std1.data
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)
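# All of the Gaussian-policy variants in this section call a
# normal_log_density helper that is not defined here. A minimal sketch,
# assuming the policy returns per-dimension means, log-stds, and stds of a
# diagonal Gaussian:

import math

import torch


def normal_log_density(x, mean, log_std, std):
    # Log-density of x under N(mean, std**2), summed over action dimensions.
    var = std.pow(2)
    log_density = (-(x - mean).pow(2) / (2 * var)
                   - 0.5 * math.log(2 * math.pi) - log_std)
    return log_density.sum(1, keepdim=True)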
def update_policy(batch):
    advantages = batch["advantages"]
    states = batch["states"]
    actions = batch["actions"]
    fixed_log_prob = policy_net.getLogProbabilityDensity(
        Variable(states), actions).detach()
    trpo_step(policy_net, states, actions, advantages, fixed_log_prob,
              args.max_kl, args.damping)
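# getLogProbabilityDensity is a method of this variant's policy class whose
# definition is not shown. A hypothetical free-function equivalent for a
# diagonal-Gaussian policy whose forward pass returns (mean, log_std, std),
# written with torch.distributions:

import torch
from torch.distributions import Normal


def get_log_probability_density(policy_net, states, actions):
    # Log-probability of the taken actions under the current policy.
    mean, log_std, std = policy_net(states)
    return Normal(mean, std).log_prob(actions).sum(1, keepdim=True)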
def update_policy(batch): """ Get advantage , states and action and calls trpo step Parameters: batch (dict of arrays of numpy) : TODO (batch is different than prepare_data by structure) Returns: """ advantages = batch["advantages"] states = batch["states"] actions = batch["actions"] trpo_step(policy_net, states,actions,advantages , args.max_kl, args.damping)
def update_params(batch):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state)
    values = value_net(Variable(states))

    returns = torch.Tensor(actions.size(0), 1)
    deltas = torch.Tensor(actions.size(0), 1)
    advantages = torch.Tensor(actions.size(0), 1)

    # Generalized Advantage Estimation, computed backwards over the rollout.
    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = Variable(returns)

    # Original code uses the same L-BFGS routine to optimize the value loss.
    def get_value_loss(flat_params):
        set_flat_params_to(value_net, torch.Tensor(flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_ = value_net(Variable(states))

        value_loss = (values_ - targets).pow(2).mean()

        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * args.l2_reg
        value_loss.backward()
        # Note: value_loss is 0-dim, so .numpy() already yields a scalar array;
        # indexing it with [0] (as the original did) raises an IndexError.
        return (value_loss.data.double().numpy(),
                get_flat_grad_from(value_net).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        get_flat_params_from(value_net).double().numpy(),
        maxiter=25)
    set_flat_params_to(value_net, torch.Tensor(flat_params))

    advantages = (advantages - advantages.mean()) / advantages.std()

    action_means, action_log_stds, action_stds = policy_net(Variable(states))
    fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                        action_log_stds, action_stds).data.clone()

    def get_loss(volatile=False):
        # The volatile flag was removed in PyTorch 0.4; torch.no_grad() is its
        # replacement for gradient-free evaluation during the line search.
        if volatile:
            with torch.no_grad():
                action_means, action_log_stds, action_stds = policy_net(
                    Variable(states))
        else:
            action_means, action_log_stds, action_stds = policy_net(
                Variable(states))

        log_prob = normal_log_density(Variable(actions), action_means,
                                      action_log_stds, action_stds)
        action_loss = -Variable(advantages) * torch.exp(
            log_prob - Variable(fixed_log_prob))
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(Variable(states))

        mean0 = Variable(mean1.data)
        log_std0 = Variable(log_std1.data)
        std0 = Variable(std1.data)
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)
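# The L-BFGS value fit above moves between the network's parameters and a
# single flat vector through three helpers that are not defined in this
# section. A minimal sketch consistent with how they are called here:

import torch


def get_flat_params_from(model):
    # Concatenate every parameter tensor into one flat vector.
    return torch.cat([param.data.view(-1) for param in model.parameters()])


def set_flat_params_to(model, flat_params):
    # Copy slices of a flat vector back into each parameter tensor in order.
    prev_ind = 0
    for param in model.parameters():
        flat_size = param.numel()
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view_as(param))
        prev_ind += flat_size


def get_flat_grad_from(model):
    # Concatenate every parameter gradient into one flat vector.
    return torch.cat([param.grad.view(-1) for param in model.parameters()])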
def update_params(batch):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state)
    n = actions.size(0)
    values = value_net(states)

    ############## GAE ###############
    returns = torch.Tensor(n, 1)
    deltas = torch.Tensor(n, 1)
    advantages = torch.Tensor(n, 1)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + args.gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + args.gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + args.gamma * args.tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]
    ##################################

    ###################### Sever ############################
    if args.sever == 1:
        add_hooks(policy_net)
        clear_backprops(policy_net)
        policy_net.zero_grad()
        action_means, action_log_stds, action_stds = policy_net(states)
        log_policy = normal_log_density(actions, action_means,
                                        action_log_stds, action_stds)
        # This backward pass only fires the per-sample hooks; its result is
        # discarded.
        torch.autograd.grad(log_policy.mean(), policy_net.parameters())
        # Compute the gradient of the log policy for every single data point;
        # the trick only works for linear and conv layers.
        compute_grad1(policy_net, loss_type='mean')
        actor_grad_logp = []
        for param in policy_net.parameters():
            actor_grad_logp.append(param.grad1.view(param.grad1.shape[0], -1))
        actor_grad_logp = torch.cat(actor_grad_logp, 1)
        remove_hooks(policy_net)
        policy_net.zero_grad()

        # Standardize the advantage estimate for stable training. Anticipating
        # outliers, use Huber's robust estimate of mean and std instead of the
        # vanilla sample mean and std; fall back to the sample statistics if
        # the robust estimator fails to converge.
        h = huber
        h.maxiter = 100
        try:
            mean, std = h(advantages)
        except Exception:
            print("huber failed; falling back to sample mean/std.")
            mean = advantages.mean()
            std = advantages.std()
        advantages = (advantages - mean) / std

        actor_loss_grad = actor_grad_logp * advantages  # vanilla policy gradient
        start_time = time.time()
        # Robust CG procedure: also returns the indices judged to be inliers.
        search_dir, indices = Sever_CG(actor_loss_grad,
                                       actor_grad_logp,
                                       n,
                                       nsteps=10,
                                       r=4,
                                       p=args.eps / 2)
    else:
        advantages = (advantages - advantages.mean()) / advantages.std()
        indices = list(range(n))
        search_dir = None
    #########################################################

    # Use the same L-BFGS routine to optimize the value loss, restricted to
    # the retained (inlier) samples.
    def get_value_loss(flat_params):
        set_flat_params_to(value_net, torch.Tensor(flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_ = value_net(states[indices])

        value_loss = (values_ - returns[indices]).pow(2).mean()

        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * args.l2_reg
        value_loss.backward()
        return (value_loss.data.double().numpy(),
                get_flat_grad_from(value_net).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        get_flat_params_from(value_net).double().numpy(),
        maxiter=25)
    set_flat_params_to(value_net, torch.Tensor(flat_params))

    action_means, action_log_stds, action_stds = policy_net(states[indices])
    fixed_log_prob = normal_log_density(actions[indices], action_means,
                                        action_log_stds, action_stds).data.clone()

    # Policy loss
    def get_loss(volatile=False):
        if volatile:
            with torch.no_grad():
                action_means, action_log_stds, action_stds = policy_net(
                    states[indices])
        else:
            action_means, action_log_stds, action_stds = policy_net(
                states[indices])

        log_prob = normal_log_density(actions[indices], action_means,
                                      action_log_stds, action_stds)
        action_loss = -advantages[indices] * torch.exp(log_prob - fixed_log_prob)
        return action_loss.mean()

    def get_kl():
        mean1, log_std1, std1 = policy_net(states[indices])

        mean0 = mean1.data
        log_std0 = log_std1.data
        std0 = std1.data
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping,
              xinit=search_dir)

    # The first num_attack samples of the batch are the corrupted ones, so the
    # share of them that survives filtering measures missed outliers.
    num_attack = args.batch_size * args.eps
    return 1 - sum(1 for i in indices if i < num_attack) / num_attack  # fraction of outliers detected
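# Sever_CG above is a robust conjugate-gradient variant that additionally
# returns the inlier indices; its definition is not shown here. For reference,
# a standard conjugate-gradient solver for F x = b (the non-robust baseline
# such a routine extends), where Fvp computes Fisher-vector products:

import torch


def conjugate_gradient(Fvp, b, nsteps, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Avp = Fvp(p)
        alpha = rdotr / torch.dot(p, Avp)
        x += alpha * p
        r -= alpha * Avp
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x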
def update_params(batch, gamma, tau, l2_reg, max_kl, damping):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state).squeeze()
    values = value_net(Variable(states))
    x_poses = torch.Tensor(batch.x_pos)

    returns = torch.Tensor(actions.size(0), 1)
    deltas = torch.Tensor(actions.size(0), 1)
    advantages = torch.Tensor(actions.size(0), 1)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + gamma * prev_return * masks[i]
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    targets = Variable(returns)

    # Original code uses the same L-BFGS routine to optimize the value loss.
    def get_value_loss(flat_params):
        global val_loss
        set_flat_params_to(value_net, torch.Tensor(flat_params))
        for param in value_net.parameters():
            if param.grad is not None:
                param.grad.data.fill_(0)

        values_ = value_net(Variable(states))

        value_loss = (values_ - targets).pow(2).mean()

        # weight decay
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        value_loss.backward()
        val_loss = value_loss.item()
        return (value_loss.data.double().item(),
                get_flat_grad_from(value_net).data.double().numpy())

    flat_params, _, opt_info = scipy.optimize.fmin_l_bfgs_b(
        get_value_loss,
        get_flat_params_from(value_net).double().numpy(),
        maxiter=25)
    set_flat_params_to(value_net, torch.Tensor(flat_params))

    advantages = (advantages - advantages.mean()) / advantages.std()

    # Discrete policy: the network outputs action probabilities and takes the
    # (states, x_poses) tuple as input.
    probs = policy_net((states, x_poses)).squeeze()
    # Log-probability of the taken (one-hot) actions under the old policy.
    fixed_log_prob = (torch.log(probs) * Variable(actions)).sum(1).data.clone()

    def get_loss(volatile=False):
        # Bug fix: the policy expects the (states, x_poses) tuple here as
        # well; the original passed only the states.
        if volatile:
            with torch.no_grad():
                probs = policy_net((Variable(states), Variable(x_poses))).squeeze()
        else:
            probs = policy_net((Variable(states), Variable(x_poses))).squeeze()

        log_prob = (torch.log(probs) * Variable(actions)).sum(1)
        action_loss = -Variable(advantages).squeeze() * torch.exp(
            log_prob - Variable(fixed_log_prob))
        return action_loss.mean()

    def get_kl():
        # Categorical KL between the detached (old) and current action
        # distributions; 1e-8 guards the logarithm.
        actprobs = policy_net((Variable(states), Variable(x_poses))) + 1e-8
        old_actprobs = Variable(actprobs.data)
        kl = old_actprobs * torch.log(old_actprobs / actprobs)
        return kl.sum(1, keepdim=True)

    probs_old = policy_net((Variable(states), Variable(x_poses)))
    loss = trpo_step(policy_net, get_loss, get_kl, max_kl, damping)
    probs_new = policy_net((Variable(states), Variable(x_poses))) + 1e-8
    # Report the realized KL between the pre- and post-update policies.
    kl = torch.sum(probs_old * torch.log(probs_old / probs_new), 1)
    return loss, kl.mean()
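# Every variant above delegates the constrained policy update to trpo_step. A
# compact sketch following the standard conjugate-gradient plus backtracking
# line-search recipe, reusing the conjugate_gradient and flat-parameter
# helpers sketched earlier; the actual trpo_step in each repository may differ
# (e.g. the extra arguments taken by the Sever and update_policy variants, or
# a get_loss that accepts volatile=True to skip autograd during the search):

import torch


def trpo_step(model, get_loss, get_kl, max_kl, damping):
    def Fvp(v):
        # Fisher-vector product via double backprop through the mean KL.
        kl = get_kl().mean()
        grads = torch.autograd.grad(kl, model.parameters(), create_graph=True)
        flat_grad_kl = torch.cat([g.view(-1) for g in grads])
        kl_v = (flat_grad_kl * v).sum()
        grads = torch.autograd.grad(kl_v, model.parameters())
        return torch.cat([g.contiguous().view(-1) for g in grads]) + v * damping

    loss = get_loss()
    grads = torch.autograd.grad(loss, model.parameters())
    loss_grad = torch.cat([g.view(-1) for g in grads])

    # Search direction: approximately solve F x = -g with conjugate gradient.
    stepdir = conjugate_gradient(Fvp, -loss_grad, nsteps=10)

    # Rescale the step so that it sits on the KL trust-region boundary.
    shs = 0.5 * (stepdir * Fvp(stepdir)).sum()
    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm

    # Backtracking line search on the surrogate loss.
    prev_params = get_flat_params_from(model)
    expected_improve = (-loss_grad * fullstep).sum()
    for stepfrac in [0.5 ** k for k in range(10)]:
        set_flat_params_to(model, prev_params + stepfrac * fullstep)
        with torch.no_grad():
            new_loss = get_loss()
        if loss - new_loss > 0.1 * stepfrac * expected_improve:
            return new_loss
    set_flat_params_to(model, prev_params)
    return loss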