Example #1
import numpy as np
import torch

# Function under test; the import path below is an assumption based on the
# lagom project layout and may need adjusting.
from lagom.metric import bootstrapped_returns


# gamma and last_V are presumably supplied by pytest parametrization in the
# original test module.
def test_bootstrapped_returns(gamma, last_V):
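    # Case 1: 4-step rollout that does not reach a terminal state, so the
    # final return bootstraps from last_V.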
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V)),
        0.3 + gamma * (0.4 + gamma * last_V),
        0.4 + gamma * last_V,
    ]
    reach_terminal = False
    rewards = [0.1, 0.2, 0.3, 0.4]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

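    # Case 2: same rewards, but the rollout ends in a terminal state, so the
    # bootstrap term is masked to zero.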
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V * 0.0))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * last_V * 0.0)),
        0.3 + gamma * (0.4 + gamma * last_V * 0.0),
        0.4 + gamma * last_V * 0.0,
    ]
    reach_terminal = True
    rewards = [0.1, 0.2, 0.3, 0.4]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

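    # Case 3: the same non-terminal check with a 5-step reward sequence.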
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V)))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V))),
        0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V)),
        0.4 + gamma * (0.5 + gamma * last_V),
        0.5 + gamma * last_V,
    ]
    reach_terminal = False
    rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)

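    # Case 4: 5-step rewards with a terminal ending.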
    y = [
        0.1 + gamma * (0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0)))),
        0.2 + gamma * (0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0))),
        0.3 + gamma * (0.4 + gamma * (0.5 + gamma * last_V * 0.0)),
        0.4 + gamma * (0.5 + gamma * last_V * 0.0),
        0.5 + gamma * last_V * 0.0,
    ]
    reach_terminal = True
    rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, last_V, reach_terminal), y)
    assert np.allclose(
        bootstrapped_returns(gamma, rewards, torch.tensor(last_V),
                             reach_terminal), y)
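The test above pins down the expected behavior: each entry y[t] is the discounted sum of the remaining rewards, and the final step bootstraps from last_V unless the trajectory reached a terminal state, in which case the bootstrap term is zeroed. A minimal sketch of that semantics, written here purely for illustration (it is not the library's actual implementation):

import numpy as np

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Backward recursion R_t = r_t + gamma * R_{t+1}, seeded with last_V
    # (masked to zero for terminal trajectories).
    R = float(last_V) * (0.0 if reach_terminal else 1.0)
    out = []
    for r in reversed(rewards):
        R = r + gamma * R
        out.append(R)
    return np.asarray(out[::-1])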
Example #2
    def learn(self, D, **kwargs):
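        # Compute per-trajectory metrics; D is a list of Trajectory objects.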
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As.detach()
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')
        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])

        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
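Besides bootstrapped_returns, the learn method above relies on gae(gamma, lam, rewards, V, last_V, reach_terminal) for advantage estimation. The sketch below shows the standard GAE(lambda) recursion that this call signature suggests; it is an assumption for illustration, not the project's implementation:

import numpy as np

def gae_sketch(gamma, lam, rewards, Vs, last_V, reach_terminal):
    # TD errors delta_t = r_t + gamma * V_{t+1} - V_t, where the bootstrap
    # value V_T = last_V is masked to zero on terminal trajectories.
    Vs = [float(v) for v in Vs]
    Vs = np.asarray(Vs + [float(last_V) * (0.0 if reach_terminal else 1.0)])
    deltas = np.asarray(rewards) + gamma * Vs[1:] - Vs[:-1]
    # Advantages are discounted (gamma * lam) cumulative sums of the TD errors.
    out, A = [], 0.0
    for delta in reversed(deltas):
        A = delta + gamma * lam * A
        out.append(A)
    return np.asarray(out[::-1])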
Example #3
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        last_observations = torch.from_numpy(
            np.concatenate([traj.last_observation for traj in D], 0)).float()
        with torch.no_grad():
            last_Vs = self.V_head(
                self.feature_network(last_observations.to(
                    self.device))).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: torch.from_numpy(np.concatenate(x).copy()).to(
                self.device), [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
        dataloader = DataLoader(dataset,
                                self.config['train.batch_size'],
                                shuffle=True)
        for epoch in range(self.config['train.num_epochs']):
            logs = [self.learn_one_update(data) for data in dataloader]
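        # Note: logs keeps only the mini-batch records from the final epoch,
        # so the averages reported below are over that epoch.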

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = np.mean([item['loss'] for item in logs])
        out['grad_norm'] = np.mean([item['grad_norm'] for item in logs])
        out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
        out['entropy_loss'] = np.mean([item['entropy_loss'] for item in logs])
        out['policy_entropy'] = np.mean(
            [item['policy_entropy'] for item in logs])
        out['value_loss'] = np.mean([item['value_loss'] for item in logs])
        out['explained_variance'] = np.mean(
            [item['explained_variance'] for item in logs])
        out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
        out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
        return out
Example #4
    def learn(self, D, **kwargs):
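        # Compute per-trajectory metrics; D is a list of Trajectory objects.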
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        with torch.no_grad():
            last_observations = tensorify([traj[-1].observation for traj in D],
                                          self.device)
            last_Vs = self.value(last_observations).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
        dataloader = DataLoader(dataset,
                                self.config['train.batch_size'],
                                shuffle=True)
        for epoch in range(self.config['train.num_epochs']):
            logs = [self.learn_one_update(data) for data in dataloader]

        self.total_timestep += sum([traj.T for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.policy_lr_scheduler.get_lr()
        out['policy_grad_norm'] = np.mean(
            [item['policy_grad_norm'] for item in logs])
        out['value_grad_norm'] = np.mean(
            [item['value_grad_norm'] for item in logs])
        out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
        out['policy_entropy'] = np.mean(
            [item['policy_entropy'] for item in logs])
        out['value_loss'] = np.mean([item['value_loss'] for item in logs])
        out['explained_variance'] = np.mean(
            [item['explained_variance'] for item in logs])
        out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
        out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
        return out
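The per-minibatch work in Examples #3 and #4 is delegated to learn_one_update, which is not shown here; the reported approx_kl and clip_frac metrics are characteristic of a PPO-style clipped surrogate objective. A rough, hypothetical sketch of how such metrics are typically computed (names and structure assumed, not taken from the project):

import torch

def clipped_policy_loss(new_logprobs, old_logprobs, As, clip_range):
    # Probability ratio between the current policy and the data-collecting policy.
    ratio = torch.exp(new_logprobs - old_logprobs)
    surr1 = ratio * As
    surr2 = torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range) * As
    policy_loss = -torch.min(surr1, surr2).mean()
    # Diagnostics: a simple KL estimate and the fraction of clipped ratios.
    approx_kl = (old_logprobs - new_logprobs).mean().item()
    clip_frac = ((ratio - 1.0).abs() > clip_range).float().mean().item()
    return policy_loss, approx_kl, clip_frac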
Example #5
File: agent.py  Project: StanfordVL/lagom
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')

        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.optimizer.step()

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out