# Imports assumed by the functions in this section; `explained_variance`, `p2v`
# and `conjugate_gradient_method` are project helpers (sketched below).
import numpy as np

import torch
import torch.autograd as autograd
import torch.nn.functional as F


def calculate_gradient(self, batch_info, device, model, rollout):
    """ Calculate loss of the supplied rollout """
    evaluator = model.evaluate(rollout)

    # Use the evaluator interface to get what we are interested in from the model
    advantages = evaluator.get('rollout:advantages')
    returns = evaluator.get('rollout:returns')
    rollout_values = evaluator.get('rollout:values')

    logprobs = evaluator.get('model:action:logprobs')
    values = evaluator.get('model:values')
    entropy = evaluator.get('model:entropy')

    # Actual calculations. Pretty trivial
    policy_loss = -torch.mean(advantages * logprobs)
    value_loss = 0.5 * F.mse_loss(values, returns)
    policy_entropy = torch.mean(entropy)

    loss_value = (
        policy_loss - self.entropy_coefficient * policy_entropy
        + self.value_coefficient * value_loss
    )

    loss_value.backward()

    return {
        'policy_loss': policy_loss.item(),
        'value_loss': value_loss.item(),
        'policy_entropy': policy_entropy.item(),
        'advantage_norm': torch.norm(advantages).item(),
        'explained_variance': explained_variance(returns, rollout_values)
    }
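# `explained_variance` is a project helper used in every metrics dict in this
# section but not defined here. A minimal sketch of what such a diagnostic
# usually computes -- 1 - Var[returns - values] / Var[returns], i.e. how much of
# the variance in the returns the value function accounts for; the exact
# implementation in the source may differ:
def explained_variance(returns, values):
    """ Fraction of the return variance explained by the value function """
    with torch.no_grad():
        return (1.0 - torch.var(returns - values) / torch.var(returns)).item()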
def calculate_gradient(self, batch_info, device, model, rollout):
    """ Calculate loss of the supplied rollout """
    observations = rollout['observations']
    returns = rollout['returns']
    advantages = rollout['advantages']
    actions = rollout['actions']
    values = rollout['values']

    action_pd_params, value_outputs = model(observations)

    log_prob = model.logprob(actions, action_pd_params)

    policy_loss = -torch.mean(advantages * log_prob)
    value_loss = 0.5 * F.mse_loss(value_outputs, returns)
    policy_entropy = torch.mean(model.entropy(action_pd_params))

    loss_value = (
        policy_loss - self.entropy_coefficient * policy_entropy
        + self.value_coefficient * value_loss
    )

    loss_value.backward()

    return {
        'policy_loss': policy_loss.item(),
        'value_loss': value_loss.item(),
        'policy_entropy': policy_entropy.item(),
        'advantage_norm': torch.norm(advantages).item(),
        'explained_variance': explained_variance(returns, values)
    }
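# For context, a hypothetical outer step that would drive `calculate_gradient`
# above: zero the gradients, accumulate them via backward(), optionally clip,
# then apply the optimizer. `batch_info.optimizer` and `self.max_grad_norm`
# mirror attributes used elsewhere in this section; the real training loop may
# differ.
def optimizer_step(self, batch_info, device, model, rollout):
    """ Single optimization step driven by calculate_gradient """
    batch_info.optimizer.zero_grad()
    metrics = self.calculate_gradient(batch_info, device, model, rollout)

    # Gradient clipping, mirroring the TRPO value-function loop below
    if self.max_grad_norm is not None:
        metrics['grad_norm'] = torch.nn.utils.clip_grad_norm_(
            filter(lambda p: p.requires_grad, model.parameters()),
            max_norm=self.max_grad_norm
        )

    batch_info.optimizer.step()
    return metrics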
def optimizer_step(self, batch_info, device, model, rollout):
    """ Single optimization step for a model """
    observations = rollout['observations']
    returns = rollout['returns']

    # Evaluate model on the observations
    action_pd_params = model.policy(observations)
    policy_entropy = torch.mean(model.entropy(action_pd_params))
    policy_loss = self.calc_policy_loss(model, action_pd_params, policy_entropy, rollout)

    policy_grad = p2v(
        autograd.grad(policy_loss, model.policy_parameters(), retain_graph=True)
    ).detach()

    # Calculate gradient of the KL divergence between the model and a fixed copy of itself.
    # The value of the KL divergence is 0 at this point, but what we need is its gradient
    # (and, through it, the second derivative)
    kl_divergence = torch.mean(model.kl_divergence(action_pd_params.detach(), action_pd_params))
    kl_divergence_gradient = p2v(
        torch.autograd.grad(kl_divergence, model.policy_parameters(), create_graph=True)
    )

    step_direction = conjugate_gradient_method(
        matrix_vector_operator=lambda x: self.fisher_vector_product(x, kl_divergence_gradient, model),
        # Because we want to decrease the loss, we go in the direction of -gradient
        loss_gradient=-policy_grad,
        nsteps=self.cg_iters
    )

    shs = 0.5 * step_direction @ self.fisher_vector_product(step_direction, kl_divergence_gradient, model)
    lm = torch.sqrt(shs / self.max_kl)
    full_step = step_direction / lm

    # Because we want to decrease the loss, we go in the direction of -gradient
    expected_improvement = (-policy_grad) @ full_step

    original_parameter_vec = p2v(model.policy_parameters()).detach_()

    policy_optimization_success, ratio, policy_loss_improvement, new_policy_loss, kl_divergence_step = self.line_search(
        model, rollout, policy_loss, action_pd_params, original_parameter_vec, full_step, expected_improvement
    )

    gradient_norms = []

    for i in range(self.vf_iters):
        batch_info.optimizer.zero_grad()
        value_loss = self.value_loss(model, observations, returns)

        value_loss.backward()

        # Gradient clipping
        if self.max_grad_norm is not None:
            grad_norm = torch.nn.utils.clip_grad_norm_(
                filter(lambda p: p.requires_grad, model.parameters()),
                max_norm=self.max_grad_norm
            )

            gradient_norms.append(grad_norm)

        batch_info.optimizer.step(closure=None)

    if gradient_norms:
        gradient_norm = np.mean(gradient_norms)
    else:
        gradient_norm = 0.0

    # noinspection PyUnboundLocalVariable
    return {
        'new_policy_loss': new_policy_loss.item(),
        'policy_entropy': policy_entropy.item(),
        'value_loss': value_loss.item(),
        'policy_optimization_success': float(policy_optimization_success),
        'policy_improvement_ratio': ratio.item(),
        'kl_divergence_step': kl_divergence_step.item(),
        'policy_loss_improvement': policy_loss_improvement.item(),
        'grad_norm': gradient_norm,
        'advantage_norm': torch.norm(rollout['advantages']).item(),
        'explained_variance': explained_variance(returns, rollout['values'])
    }
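# `p2v`, `conjugate_gradient_method` and `fisher_vector_product` are helpers the
# TRPO step above relies on but which are not shown in this section. Sketches of
# plausible implementations follow; `p2v` is taken to be PyTorch's
# parameters_to_vector, and `self.cg_damping` and the residual tolerance are
# assumed hyperparameters -- the source's actual versions may differ.
from torch.nn.utils import parameters_to_vector as p2v


def conjugate_gradient_method(matrix_vector_operator, loss_gradient, nsteps, residual_tolerance=1e-10):
    """ Solve A x = g for x without materializing A, using only A @ v products """
    x = torch.zeros_like(loss_gradient)
    residual = loss_gradient.clone()
    direction = loss_gradient.clone()
    residual_norm = residual @ residual

    for _ in range(nsteps):
        a_dot_d = matrix_vector_operator(direction)
        step_size = residual_norm / (direction @ a_dot_d)

        x += step_size * direction
        residual -= step_size * a_dot_d

        new_residual_norm = residual @ residual
        direction = residual + (new_residual_norm / residual_norm) * direction
        residual_norm = new_residual_norm

        if residual_norm < residual_tolerance:
            break

    return x


# Would live on the same class as optimizer_step above
def fisher_vector_product(self, vector, kl_divergence_gradient, model):
    """ Fisher(-Hessian) vector product via double backpropagation """
    # d/dtheta [(dKL/dtheta) . v] = H v; damping keeps the operator well-conditioned
    dot_product = kl_divergence_gradient @ vector
    hessian_vector = torch.autograd.grad(dot_product, model.policy_parameters(), retain_graph=True)
    return p2v(hessian_vector).detach() + self.cg_damping * vector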
def calculate_gradient(self, batch_info, device, model, rollout):
    """ Calculate loss of the supplied rollout """
    observations = rollout['observations']
    returns = rollout['returns']
    advantages = rollout['advantages']
    rollout_values = rollout['values']
    rollout_actions = rollout['actions']
    rollout_logprobs = rollout['logprobs']

    # Select the cliprange
    current_cliprange = self.cliprange.value(batch_info['progress'])

    # Normalize the advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # PART 0 - model evaluation
    eval_action_pd_params, eval_value_outputs = model(observations)

    # PART 1 - policy entropy
    policy_entropy = torch.mean(model.entropy(eval_action_pd_params))

    # PART 2 - value function
    value_output_clipped = rollout_values + torch.clamp(
        eval_value_outputs - rollout_values, -current_cliprange, current_cliprange
    )
    value_loss_part1 = (eval_value_outputs - returns).pow(2)
    value_loss_part2 = (value_output_clipped - returns).pow(2)
    value_loss = 0.5 * torch.mean(torch.max(value_loss_part1, value_loss_part2))

    # PART 3 - policy gradient loss
    eval_logprobs = model.logprob(rollout_actions, eval_action_pd_params)
    ratio = torch.exp(eval_logprobs - rollout_logprobs)

    pg_loss_part1 = -advantages * ratio
    pg_loss_part2 = -advantages * torch.clamp(ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
    policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

    loss_value = (
        policy_loss - self.entropy_coefficient * policy_entropy
        + self.value_coefficient * value_loss
    )

    loss_value.backward()

    with torch.no_grad():
        approx_kl_divergence = 0.5 * torch.mean((eval_logprobs - rollout_logprobs) ** 2)
        clip_fraction = torch.mean((torch.abs(ratio - 1.0) > current_cliprange).to(dtype=torch.float))

    return {
        'policy_loss': policy_loss.item(),
        'value_loss': value_loss.item(),
        'policy_entropy': policy_entropy.item(),
        'approx_kl_divergence': approx_kl_divergence.item(),
        'clip_fraction': clip_fraction.item(),
        'advantage_norm': torch.norm(advantages).item(),
        'explained_variance': explained_variance(returns, rollout_values)
    }
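# `self.cliprange` above is a schedule object queried with the training
# progress. A hypothetical minimal implementation of that interface, assuming
# `progress` runs from 0.0 at the start of training to 1.0 at the end (the real
# schedule class may anneal differently or simply return a constant):
class LinearSchedule:
    """ Linearly interpolate a value over the course of training """
    def __init__(self, initial_value, final_value=0.0):
        self.initial_value = initial_value
        self.final_value = final_value

    def value(self, progress):
        return self.initial_value + (self.final_value - self.initial_value) * progress


# Example: start with the usual PPO cliprange of 0.2 and anneal it to zero
# self.cliprange = LinearSchedule(0.2)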
def calculate_gradient(self, batch_info, device, model, rollout):
    """ Calculate loss of the supplied rollout """
    evaluator = model.evaluate(rollout)

    # PART 0.0 - rollout values
    advantages = evaluator.get('rollout:estimated_advantages')
    rollout_values = evaluator.get('rollout:estimated_values')
    rollout_action_logprobs = evaluator.get('rollout:action:logprobs')
    returns = evaluator.get('rollout:estimated_returns')

    # PART 0.1 - model evaluation
    entropy = evaluator.get('model:entropy')
    model_values = evaluator.get('model:estimated_values')
    model_action_logprobs = evaluator.get('model:action:logprobs')

    # Select the cliprange
    current_cliprange = self.cliprange.value(batch_info['progress'])

    # Normalize the advantages?
    if self.normalize_advantage:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # PART 1 - policy entropy
    policy_entropy = torch.mean(entropy)

    # PART 2 - value function
    value_output_clipped = rollout_values + torch.clamp(
        model_values - rollout_values, -current_cliprange, current_cliprange
    )
    value_loss_part1 = (model_values - returns).pow(2)
    value_loss_part2 = (value_output_clipped - returns).pow(2)
    value_loss = 0.5 * torch.mean(torch.max(value_loss_part1, value_loss_part2))

    # PART 3 - policy gradient loss
    ratio = torch.exp(model_action_logprobs - rollout_action_logprobs)

    pg_loss_part1 = -advantages * ratio
    pg_loss_part2 = -advantages * torch.clamp(ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
    policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

    loss_value = (
        policy_loss - self.entropy_coefficient * policy_entropy
        + self.value_coefficient * value_loss
    )

    loss_value.backward()

    with torch.no_grad():
        approx_kl_divergence = 0.5 * torch.mean((model_action_logprobs - rollout_action_logprobs).pow(2))
        clip_fraction = torch.mean((torch.abs(ratio - 1.0) > current_cliprange).to(dtype=torch.float))

    return {
        'policy_loss': policy_loss.item(),
        'value_loss': value_loss.item(),
        'policy_entropy': policy_entropy.item(),
        'approx_kl_divergence': approx_kl_divergence.item(),
        'clip_fraction': clip_fraction.item(),
        'advantage_norm': torch.norm(advantages).item(),
        'explained_variance': explained_variance(returns, rollout_values)
    }
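# The `model.evaluate(rollout)` interface used in the first and last variants is
# not defined in this section. A minimal sketch of what such an evaluator could
# look like: 'rollout:*' keys are served straight from stored rollout tensors,
# while 'model:*' keys are computed lazily from a single cached forward pass.
# Class name, key scheme and helper methods here are illustrative assumptions,
# not the library's actual API.
class Evaluator:
    """ Serve 'rollout:*' tensors directly; compute 'model:*' tensors on demand """
    def __init__(self, model, rollout):
        self.model = model
        self.rollout = rollout
        self._cache = {}

    def get(self, key):
        if key in self._cache:
            return self._cache[key]

        namespace, _, name = key.partition(':')

        if namespace == 'rollout':
            # e.g. 'rollout:estimated_advantages' -> rollout['estimated_advantages']
            value = self.rollout[name]
        else:
            # 'model:*' quantities all come from one cached forward pass
            value = self._model_outputs()[name]

        self._cache[key] = value
        return value

    def _model_outputs(self):
        if 'outputs' not in self._cache:
            pd_params, values = self.model(self.rollout['observations'])
            self._cache['outputs'] = {
                'entropy': self.model.entropy(pd_params),
                'estimated_values': values,
                'action:logprobs': self.model.logprob(self.rollout['actions'], pd_params),
            }
        return self._cache['outputs']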