Example #1
File: a2c.py Project: yulkang/vel
    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        evaluator = model.evaluate(rollout)

        # Use the evaluator interface to get what we are interested in from the model
        advantages = evaluator.get('rollout:advantages')
        returns = evaluator.get('rollout:returns')
        rollout_values = evaluator.get('rollout:values')

        logprobs = evaluator.get('model:action:logprobs')
        values = evaluator.get('model:values')
        entropy = evaluator.get('model:entropy')

        # Actual loss calculations
        policy_loss = -torch.mean(advantages * logprobs)
        value_loss = 0.5 * F.mse_loss(values, returns)
        policy_entropy = torch.mean(entropy)

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)

        loss_value.backward()

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }
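
Each example reports an explained_variance diagnostic computed by a helper that is not shown on this page. Below is a minimal sketch of the usual definition, 1 - Var(returns - values) / Var(returns), assuming PyTorch tensors of matching shape; the name explained_variance_sketch and the handling of constant returns are assumptions, not the project's implementation.

import torch


def explained_variance_sketch(returns: torch.Tensor, values: torch.Tensor) -> float:
    """Fraction of the variance in the returns explained by the value estimates.

    1.0 means a perfect fit, 0.0 means no better than predicting the mean,
    and negative values mean the predictions are worse than the mean.
    """
    var_returns = torch.var(returns)
    if var_returns.item() == 0.0:
        return float('nan')  # Degenerate case: constant returns
    return (1.0 - torch.var(returns - values) / var_returns).item()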
Example #2
    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        observations = rollout['observations']
        returns = rollout['returns']
        advantages = rollout['advantages']
        actions = rollout['actions']
        values = rollout['values']

        action_pd_params, value_outputs = model(observations)

        log_prob = model.logprob(actions, action_pd_params)

        policy_loss = -torch.mean(advantages * log_prob)
        value_loss = 0.5 * F.mse_loss(value_outputs, returns)
        policy_entropy = torch.mean(model.entropy(action_pd_params))

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)
        loss_value.backward()

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, values)
        }
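
Note that calculate_gradient only accumulates gradients through loss_value.backward(); the parameter update happens outside the method. A hedged sketch of how an outer training step might drive it, where training_step, optimizer and max_grad_norm are placeholder names and not part of the project's API:

import torch


def training_step(algo, batch_info, device, model, rollout, optimizer, max_grad_norm=None):
    """Hypothetical outer loop: accumulate gradients, optionally clip, then update."""
    optimizer.zero_grad()

    # Fills the model parameter gradients and returns a dict of diagnostics
    metrics = algo.calculate_gradient(batch_info, device, model, rollout)

    # Optional gradient clipping before the parameter update
    if max_grad_norm is not None:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

    optimizer.step()
    return metrics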
Example #3
    def optimizer_step(self, batch_info, device, model, rollout):
        """ Single optimization step for a model """
        observations = rollout['observations']
        returns = rollout['returns']

        # Evaluate model on the observations
        action_pd_params = model.policy(observations)
        policy_entropy = torch.mean(model.entropy(action_pd_params))

        policy_loss = self.calc_policy_loss(model, action_pd_params, policy_entropy, rollout)
        policy_grad = p2v(autograd.grad(policy_loss, model.policy_parameters(), retain_graph=True)).detach()

        # Calculate the gradient of the KL divergence between the model and a frozen copy of itself
        # The value of kl_divergence is 0 here, but what we need is its gradient - effectively the 2nd derivative
        kl_divergence = torch.mean(model.kl_divergence(action_pd_params.detach(), action_pd_params))
        kl_divergence_gradient = p2v(torch.autograd.grad(kl_divergence, model.policy_parameters(), create_graph=True))

        step_direction = conjugate_gradient_method(
            matrix_vector_operator=lambda x: self.fisher_vector_product(x, kl_divergence_gradient, model),
            # Because we want to decrease the loss, we step in the direction of -gradient
            loss_gradient=-policy_grad,
            nsteps=self.cg_iters
        )

        shs = 0.5 * step_direction @ self.fisher_vector_product(step_direction, kl_divergence_gradient, model)
        lm = torch.sqrt(shs / self.mak_kl)
        full_step = step_direction / lm

        # Because we want to decrease the loss, we step in the direction of -gradient
        expected_improvement = (-policy_grad) @ full_step
        original_parameter_vec = p2v(model.policy_parameters()).detach_()

        policy_optimization_success, ratio, policy_loss_improvement, new_policy_loss, kl_divergence_step = self.line_search(
            model, rollout, policy_loss, action_pd_params, original_parameter_vec, full_step, expected_improvement
        )

        gradient_norms = []

        for i in range(self.vf_iters):
            batch_info.optimizer.zero_grad()
            value_loss = self.value_loss(model, observations, returns)

            value_loss.backward()

            # Gradient clipping
            if self.max_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    filter(lambda p: p.requires_grad, model.parameters()),
                    max_norm=self.max_grad_norm
                )

                gradient_norms.append(grad_norm)

            batch_info.optimizer.step(closure=None)

        if gradient_norms:
            gradient_norm = np.mean(gradient_norms)
        else:
            gradient_norm = 0.0

        # noinspection PyUnboundLocalVariable
        return {
            'new_policy_loss': new_policy_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'value_loss': value_loss.item(),
            'policy_optimization_success': float(policy_optimization_success),
            'policy_improvement_ratio': ratio.item(),
            'kl_divergence_step': kl_divergence_step.item(),
            'policy_loss_improvement': policy_loss_improvement.item(),
            'grad_norm': gradient_norm,
            'advantage_norm': torch.norm(rollout['advantages']).item(),
            'explained_variance': explained_variance(returns, rollout['values'])
        }
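
The step above relies on a conjugate_gradient_method helper to solve F x = g for the natural-gradient step direction, where the Fisher matrix F is touched only through Fisher-vector products. The project's implementation is not shown here; the following is a textbook sketch of conjugate gradients against a matrix-vector operator, with keyword names taken from the call site and the residual tolerance being an assumed default:

import torch


def conjugate_gradient_sketch(matrix_vector_operator, loss_gradient, nsteps, residual_tol=1e-10):
    """Solve A x = b for x, where A is available only as matrix_vector_operator(v) = A v."""
    x = torch.zeros_like(loss_gradient)
    r = loss_gradient.clone()  # Residual b - A x, with x initialized to zero
    p = loss_gradient.clone()  # Search direction
    rdotr = r @ r

    for _ in range(nsteps):
        avp = matrix_vector_operator(p)
        alpha = rdotr / (p @ avp)
        x += alpha * p
        r -= alpha * avp
        new_rdotr = r @ r
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr

    return x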
Example #4
    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        observations = rollout['observations']
        returns = rollout['returns']
        advantages = rollout['advantages']
        rollout_values = rollout['values']
        rollout_actions = rollout['actions']
        rollout_logprobs = rollout['logprobs']

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Normalize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-8)

        # PART 0 - model evaluation
        eval_action_pd_params, eval_value_outputs = model(observations)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(model.entropy(eval_action_pd_params))

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            eval_value_outputs - rollout_values, -current_cliprange,
            current_cliprange)
        value_loss_part1 = (eval_value_outputs - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(
            torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        eval_logprobs = model.logprob(rollout_actions, eval_action_pd_params)
        ratio = torch.exp(eval_logprobs - rollout_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)

        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean(
                (eval_logprobs - rollout_logprobs)**2)
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(
                    dtype=torch.float))

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }
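
The policy term above is the clipped surrogate used in PPO-style updates: with probability ratio r = exp(logp_new - logp_old), the per-sample loss is max(-A * r, -A * clip(r, 1 - eps, 1 + eps)). A standalone sketch of just that term, with hypothetical argument names:

import torch


def ppo_policy_loss_sketch(advantages, new_logprobs, old_logprobs, cliprange):
    """Clipped surrogate: mean of max(-A * r, -A * clip(r, 1 - eps, 1 + eps))."""
    ratio = torch.exp(new_logprobs - old_logprobs)
    unclipped = -advantages * ratio
    clipped = -advantages * torch.clamp(ratio, 1.0 - cliprange, 1.0 + cliprange)
    return torch.mean(torch.max(unclipped, clipped))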
Example #5
    def calculate_gradient(self, batch_info, device, model, rollout):
        """ Calculate loss of the supplied rollout """
        evaluator = model.evaluate(rollout)

        # PART 0.0 - Rollout values
        advantages = evaluator.get('rollout:estimated_advantages')
        rollout_values = evaluator.get('rollout:estimated_values')
        rollout_action_logprobs = evaluator.get('rollout:action:logprobs')
        returns = evaluator.get('rollout:estimated_returns')

        # PART 0.1 - Model evaluation
        entropy = evaluator.get('model:entropy')
        model_values = evaluator.get('model:estimated_values')
        model_action_logprobs = evaluator.get('model:action:logprobs')

        # Select the cliprange
        current_cliprange = self.cliprange.value(batch_info['progress'])

        # Optionally normalize the advantages
        if self.normalize_advantage:
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-8)

        # PART 1 - policy entropy
        policy_entropy = torch.mean(entropy)

        # PART 2 - value function
        value_output_clipped = rollout_values + torch.clamp(
            model_values - rollout_values, -current_cliprange,
            current_cliprange)
        value_loss_part1 = (model_values - returns).pow(2)
        value_loss_part2 = (value_output_clipped - returns).pow(2)
        value_loss = 0.5 * torch.mean(
            torch.max(value_loss_part1, value_loss_part2))

        # PART 3 - policy gradient loss
        ratio = torch.exp(model_action_logprobs - rollout_action_logprobs)

        pg_loss_part1 = -advantages * ratio
        pg_loss_part2 = -advantages * torch.clamp(
            ratio, 1.0 - current_cliprange, 1.0 + current_cliprange)
        policy_loss = torch.mean(torch.max(pg_loss_part1, pg_loss_part2))

        loss_value = (policy_loss - self.entropy_coefficient * policy_entropy +
                      self.value_coefficient * value_loss)

        loss_value.backward()

        with torch.no_grad():
            approx_kl_divergence = 0.5 * torch.mean(
                (model_action_logprobs - rollout_action_logprobs).pow(2))
            clip_fraction = torch.mean(
                (torch.abs(ratio - 1.0) > current_cliprange).to(
                    dtype=torch.float))

        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'policy_entropy': policy_entropy.item(),
            'approx_kl_divergence': approx_kl_divergence.item(),
            'clip_fraction': clip_fraction.item(),
            'advantage_norm': torch.norm(advantages).item(),
            'explained_variance': explained_variance(returns, rollout_values)
        }
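
Example #5 computes the same clipped loss as Example #4, but pulls its inputs through the model.evaluate(rollout) evaluator interface (as in Example #1) instead of indexing the rollout dictionary directly, and only normalizes the advantages when the normalize_advantage flag is set.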