Example no. 1
    def learn(self, D, **kwargs):
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As.detach()
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')
        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])

        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
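
Example no. 1 relies on `bootstrapped_returns` and `gae` helpers to turn raw rewards and value predictions into return targets and advantages. As a rough reference, here is a self-contained sketch of what such helpers typically compute for a single trajectory, ignoring termination handling and the exact lagom signatures (the function names and arguments below are illustrative, not the library's API):

import torch

def bootstrapped_returns_sketch(gamma, rewards, last_value):
    # Discounted return targets R_t = r_t + gamma * R_{t+1},
    # bootstrapped from the value estimate of the final observation.
    R = last_value.clone()
    returns = torch.zeros_like(rewards)
    for t in reversed(range(len(rewards))):
        R = rewards[t] + gamma * R
        returns[t] = R
    return returns

def gae_sketch(gamma, lam, rewards, values, last_value):
    # Generalized Advantage Estimation: exponentially weighted sum of
    # TD errors delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    next_values = torch.cat([values[1:], last_value.view(1)])
    deltas = rewards + gamma * next_values - values
    advantages = torch.zeros_like(rewards)
    acc = torch.zeros(())
    for t in reversed(range(len(rewards))):
        acc = deltas[t] + gamma * lam * acc
        advantages[t] = acc
    return advantages

With, say, rewards = torch.tensor([1., 1., 1.]), values = torch.tensor([0.5, 0.5, 0.5]) and last_value = torch.tensor(0.5), both functions return 1-D tensors of length 3, matching the shapes that the assertion in the example expects.
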
Example no. 2
    def learn_one_update(self, data):
        data = [d.detach().to(self.device) for d in data]
        observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

        action_dist = self.policy(observations)
        logprobs = action_dist.log_prob(old_actions).squeeze()
        entropies = action_dist.entropy().squeeze()
        Vs = self.value(observations).squeeze()
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs]])

        ratio = torch.exp(logprobs - old_logprobs)
        eps = self.config['agent.clip_range']
        policy_loss = -torch.min(
            ratio * old_As,
            torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * old_As)
        policy_loss = policy_loss.mean(0)

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_grad_norm = nn.utils.clip_grad_norm_(
            self.policy.parameters(), self.config['agent.max_grad_norm'])
        self.policy_optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.policy_lr_scheduler.step(self.total_timestep)

        clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
        value_loss = torch.max(
            F.mse_loss(Vs, old_Qs, reduction='none'),
            F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
        value_loss = value_loss.mean(0)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        value_grad_norm = nn.utils.clip_grad_norm_(
            self.value.parameters(), self.config['agent.max_grad_norm'])
        self.value_optimizer.step()

        out = {}
        out['policy_grad_norm'] = policy_grad_norm
        out['value_grad_norm'] = value_grad_norm
        out['policy_loss'] = policy_loss.item()
        out['policy_entropy'] = entropies.mean().item()
        out['value_loss'] = value_loss.item()
        out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
        out['clip_frac'] = ((ratio < 1.0 - eps) |
                            (ratio > 1.0 + eps)).float().mean(0).item()
        return out
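
The heart of Example no. 2 is the PPO clipped surrogate objective together with the `approx_kl` and `clip_frac` diagnostics. A minimal sketch with made-up tensors (none of these values come from the example) shows the computation in isolation:

import torch

old_logprobs = torch.tensor([-1.2, -0.7, -2.0])   # log pi_old(a|s) from the rollout
new_logprobs = torch.tensor([-1.0, -0.9, -1.5])   # log pi(a|s) under the current policy
advantages = torch.tensor([0.5, -0.3, 1.2])
eps = 0.2

ratio = torch.exp(new_logprobs - old_logprobs)
unclipped = ratio * advantages
clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantages
policy_loss = -torch.min(unclipped, clipped).mean()

# Same diagnostics as logged in the example above.
approx_kl = (old_logprobs - new_logprobs).mean()
clip_frac = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean()

Taking the elementwise minimum of the unclipped and clipped terms (before negation) keeps the update pessimistic: the objective never rewards pushing the ratio further outside [1 - eps, 1 + eps].
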
Example no. 3
    def learn_one_update(self, data):
        data = [d.to(self.device) for d in data]
        observations, old_actions, old_logprobs, old_entropies, old_Vs, old_Qs, old_As = data

        out = self.choose_action(observations)
        logprobs = out['action_dist'].log_prob(old_actions).squeeze()
        entropies = out['entropy'].squeeze()
        Vs = out['V'].squeeze()

        ratio = torch.exp(logprobs - old_logprobs)
        eps = self.config['agent.clip_range']
        policy_loss = -torch.min(
            ratio * old_As,
            torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * old_As)
        entropy_loss = -entropies
        clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
        value_loss = torch.max(
            F.mse_loss(Vs, old_Qs, reduction='none'),
            F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        out = {}
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['explained_variance'] = ev(y_true=old_Qs.detach().cpu().numpy(),
                                       y_pred=Vs.detach().cpu().numpy())
        out['approx_kl'] = torch.mean(old_logprobs - logprobs).item()
        out['clip_frac'] = ((ratio < 1.0 - eps) |
                            (ratio > 1.0 + eps)).float().mean().item()
        return out
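
Examples no. 2 and no. 3 both use a clipped value loss: the new value prediction is only trusted within `eps` of the prediction recorded at rollout time, and the larger of the clipped and unclipped squared errors is taken, so clipping can only make the loss more conservative. A standalone sketch with placeholder tensors:

import torch
import torch.nn.functional as F

old_Vs = torch.tensor([1.0, 2.0, 0.5])   # value predictions recorded at rollout time
Vs = torch.tensor([1.5, 1.8, 0.9])       # current value predictions
old_Qs = torch.tensor([1.2, 2.5, 0.4])   # return targets
eps = 0.2

clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
value_loss = torch.max(
    F.mse_loss(Vs, old_Qs, reduction='none'),
    F.mse_loss(clipped_Vs, old_Qs, reduction='none')).mean()
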
Example no. 4
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        Ts = [len(traj) for traj in D]
        behavior_logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        out_agent = self.choose_action(
            np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
        logprobs = out_agent['action_logprob'].squeeze()
        entropies = out_agent['entropy'].squeeze()
        Vs = out_agent['V'].squeeze()
        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)

        vs, As = [], []
        for traj, behavior_logprob, logprob, V, last_V in zip(
                D, behavior_logprobs,
                logprobs.detach().cpu().split(Ts),
                Vs.detach().cpu().split(Ts), last_Vs):
            v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards,
                          V, last_V, traj.reach_terminal, self.clip_rho,
                          self.clip_pg_rho)
            vs.append(v)
            As.append(A)

        # Metrics -> Tensor, device
        vs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [vs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, vs, reduction='none')

        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(vs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
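
Example no. 4 delegates the off-policy correction to a `vtrace` helper. The sketch below shows what a V-trace computation along the lines of the IMPALA paper roughly looks like for a single non-terminated trajectory; the function name, argument order, and the omission of termination handling are simplifications, not the library's actual API:

import torch

def vtrace_sketch(behavior_logprobs, target_logprobs, gamma, rewards,
                  values, last_value, clip_rho=1.0, clip_pg_rho=1.0):
    # Importance ratios pi/mu between the learner and behaviour policies.
    ratios = torch.exp(target_logprobs - behavior_logprobs)
    rhos = ratios.clamp(max=clip_rho)   # clipped rho_t for the value targets
    cs = ratios.clamp(max=1.0)          # trace coefficients c_t

    next_values = torch.cat([values[1:], last_value.view(1)])
    deltas = rhos * (rewards + gamma * next_values - values)

    # Backward recursion: v_s - V(x_s) = delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1}))
    vs_minus_v = torch.zeros_like(values)
    acc = torch.zeros(())
    for t in reversed(range(len(rewards))):
        acc = deltas[t] + gamma * cs[t] * acc
        vs_minus_v[t] = acc
    vs = values + vs_minus_v

    # Policy-gradient advantage bootstraps from v_{s+1}, with its own rho clip.
    vs_next = torch.cat([vs[1:], last_value.view(1)])
    pg_rhos = ratios.clamp(max=clip_pg_rho)
    advantages = pg_rhos * (rewards + gamma * vs_next - values)
    return vs, advantages

Here `vs` plays the role that `Qs` plays in the on-policy examples (the regression target for the value head), while `advantages` replaces the GAE advantages in the policy-gradient term.
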
Example no. 5
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')

        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
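
All five examples log `explained_variance` through an `ev` helper. Assuming it follows the usual definition (as in scikit-learn's `explained_variance_score`), a minimal sketch looks like this:

import numpy as np

def explained_variance_sketch(y_true, y_pred):
    # 1 - Var[y_true - y_pred] / Var[y_true]: 1.0 means the value head matches
    # the return targets exactly, 0.0 means it does no better than a constant.
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else float(1.0 - np.var(y_true - y_pred) / var_y)
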