def test_kl_divergence(dist_type):
    """
    Check the analytical KL divergence for every distribution type:

    1. KL(p || p) == 0 for identical distributions.
    2. The analytical KL divergence agrees with the unbiased Monte-Carlo
       estimate ``E_p[log p(a) - log q(a)]``.
    3. A hand-computed sanity value for an easy Bernoulli case.
    """
    set_random_seed(8)
    # Test 1: same distribution should have KL Div = 0
    dist1 = dist_type
    dist2 = dist_type
    # PyTorch implementation of kl_divergence doesn't sum across dimensions
    assert th.allclose(kl_divergence(dist1, dist2).sum(), th.tensor(0.0))

    # Test 2: KL Div = E(Unbiased approx KL Div)
    if isinstance(dist_type, CategoricalDistribution):
        dist1 = dist_type.proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
        # deepcopy needed to assign new memory to new distribution instance
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(N_ACTIONS).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, (DiagGaussianDistribution, SquashedDiagGaussianDistribution)):
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        log_std2 = th.rand(1).repeat(N_SAMPLES, 1)
        dist1 = dist_type.proba_distribution(mean_actions1, log_std1)
        dist2 = deepcopy(dist_type).proba_distribution(mean_actions2, log_std2)
    elif isinstance(dist_type, BernoulliDistribution):
        dist1 = dist_type.proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, MultiCategoricalDistribution):
        # Two categorical sub-spaces of N_ACTIONS logits each
        dist1 = dist_type.proba_distribution(th.rand(1, N_ACTIONS + N_ACTIONS).repeat(N_SAMPLES, 1))
        dist2 = deepcopy(dist_type).proba_distribution(th.rand(1, N_ACTIONS + N_ACTIONS).repeat(N_SAMPLES, 1))
    elif isinstance(dist_type, StateDependentNoiseDistribution):
        dist1 = StateDependentNoiseDistribution(1)
        dist2 = deepcopy(dist1)
        state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)
        mean_actions1 = th.rand(1).repeat(N_SAMPLES, 1)
        mean_actions2 = th.rand(1).repeat(N_SAMPLES, 1)
        _, log_std = dist1.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2)))
        # Both distributions share log_std but sample independent exploration matrices
        dist1.sample_weights(log_std, batch_size=N_SAMPLES)
        dist2.sample_weights(log_std, batch_size=N_SAMPLES)
        dist1 = dist1.proba_distribution(mean_actions1, log_std, state)
        dist2 = dist2.proba_distribution(mean_actions2, log_std, state)

    full_kl_div = kl_divergence(dist1, dist2).mean(dim=0)
    actions = dist1.get_actions()
    # Unbiased estimator of KL(p || q): average of log p(a) - log q(a) for a ~ p
    approx_kl_div = (dist1.log_prob(actions) - dist2.log_prob(actions)).mean(dim=0)
    assert th.allclose(full_kl_div, approx_kl_div, rtol=5e-2)

    # Test 3: Sanity test with easy Bernoulli distribution
    if isinstance(dist_type, BernoulliDistribution):
        dist1 = BernoulliDistribution(1).proba_distribution(th.tensor([0.3]))
        dist2 = BernoulliDistribution(1).proba_distribution(th.tensor([0.65]))
        full_kl_div = kl_divergence(dist1, dist2)
        actions = th.tensor([0.0, 1.0])
        # KL by definition: sum over outcomes of p(a) * (log p(a) - log q(a))
        ad_hoc_kl = th.sum(
            th.exp(dist1.distribution.log_prob(actions))
            * (dist1.distribution.log_prob(actions) - dist2.distribution.log_prob(actions))
        )
        assert th.allclose(full_kl_div, ad_hoc_kl)
def __init__(self, observation_space: gym.spaces.Space, action_space: gym.spaces.Space,
             net_arch: List[int], features_extractor: nn.Module, features_dim: int,
             activation_fn: Type[nn.Module] = nn.ReLU, use_sde: bool = False,
             log_std_init: float = -3, full_std: bool = True,
             sde_net_arch: Optional[List[int]] = None, use_expln: bool = False,
             clip_mean: float = 2.0, normalize_images: bool = True,
             device: Union[th.device, str] = 'auto'):
    """
    Actor network (policy) producing squashed actions.

    :param observation_space: Observation space
    :param action_space: Action space
    :param net_arch: Network architecture (sizes of the hidden layers)
    :param features_extractor: Network to extract features from observations
    :param features_dim: Number of features extracted by ``features_extractor``
    :param activation_fn: Activation function for the hidden layers
    :param use_sde: Whether to use generalized State Dependent Exploration (gSDE)
        instead of a plain squashed diagonal Gaussian
    :param log_std_init: Initial value for the log standard deviation
    :param full_std: (gSDE only) Use a full std matrix instead of one value per action
    :param sde_net_arch: (gSDE only) Architecture of a separate feature extractor
        for the exploration noise; ``None`` to reuse the policy features
    :param use_expln: (gSDE only) Use ``expln`` instead of ``exp`` to ensure
        a positive std
    :param clip_mean: Clip the mean output to ``[-clip_mean, clip_mean]`` when
        positive (avoids numerical issues with the squashing); 0 disables clipping
    :param normalize_images: Whether to normalize images (divide by 255)
    :param device: Device on which the network lives
    """
    super().__init__(observation_space, action_space,
                     features_extractor=features_extractor,
                     normalize_images=normalize_images,
                     device=device,
                     squash_output=True)

    # Save arguments to re-create object at loading
    self.use_sde = use_sde
    self.sde_features_extractor = None
    # NOTE: the original assigned sde_net_arch twice; a single assignment suffices
    self.sde_net_arch = sde_net_arch
    self.net_arch = net_arch
    self.features_dim = features_dim
    self.activation_fn = activation_fn
    self.log_std_init = log_std_init
    self.use_expln = use_expln
    self.full_std = full_std
    self.clip_mean = clip_mean

    action_dim = get_action_dim(self.action_space)
    # Shared latent network for the policy (output dim -1: last layer of net_arch)
    latent_pi_net = create_mlp(features_dim, -1, net_arch, activation_fn)
    self.latent_pi = nn.Sequential(*latent_pi_net)
    last_layer_dim = net_arch[-1] if len(net_arch) > 0 else features_dim

    if self.use_sde:
        latent_sde_dim = last_layer_dim
        # Separate feature extractor for gSDE
        if sde_net_arch is not None:
            self.sde_features_extractor, latent_sde_dim = create_sde_features_extractor(
                features_dim, sde_net_arch, activation_fn)

        self.action_dist = StateDependentNoiseDistribution(action_dim, full_std=full_std,
                                                           use_expln=use_expln,
                                                           learn_features=True,
                                                           squash_output=True)
        self.mu, self.log_std = self.action_dist.proba_distribution_net(
            latent_dim=last_layer_dim, latent_sde_dim=latent_sde_dim,
            log_std_init=log_std_init)
        # Avoid numerical issues by limiting the mean of the Gaussian
        # to be in [-clip_mean, clip_mean]
        if clip_mean > 0.0:
            self.mu = nn.Sequential(self.mu, nn.Hardtanh(min_val=-clip_mean, max_val=clip_mean))
    else:
        self.action_dist = SquashedDiagGaussianDistribution(action_dim)
        self.mu = nn.Linear(last_layer_dim, action_dim)
        self.log_std = nn.Linear(last_layer_dim, action_dim)
def test_sde_distribution():
    """gSDE sampled actions should match the distribution's own mean/scale statistics."""
    action_dim = 1
    sde_dist = StateDependentNoiseDistribution(action_dim, full_std=True, squash_output=False)
    set_random_seed(1)
    _, log_std = sde_dist.proba_distribution_net(N_FEATURES)
    sde_dist.sample_weights(log_std, batch_size=N_SAMPLES)
    # Constant deterministic inputs so the empirical statistics are easy to compare
    mean_actions = 0.1 * th.ones(N_SAMPLES, action_dim)
    latent_state = 0.3 * th.ones(N_SAMPLES, N_FEATURES)
    sde_dist = sde_dist.proba_distribution(mean_actions, log_std, latent_state)
    sampled_actions = sde_dist.get_actions()
    # Empirical moments of the samples should be close to the analytical ones
    assert th.allclose(sampled_actions.mean(), sde_dist.distribution.mean.mean(), rtol=2e-3)
    assert th.allclose(sampled_actions.std(), sde_dist.distribution.scale.mean(), rtol=2e-3)
set_random_seed(1) _, log_std = dist.proba_distribution_net(N_FEATURES) dist.sample_weights(log_std, batch_size=N_SAMPLES) dist = dist.proba_distribution(deterministic_actions, log_std, state) actions = dist.get_actions() assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=2e-3) assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=2e-3) # TODO: analytical form for squashed Gaussian? @pytest.mark.parametrize("dist", [ DiagGaussianDistribution(N_ACTIONS), StateDependentNoiseDistribution(N_ACTIONS, squash_output=False), ]) def test_entropy(dist): # The entropy can be approximated by averaging the negative log likelihood # mean negative log likelihood == differential entropy set_random_seed(1) state = th.rand(N_SAMPLES, N_FEATURES) deterministic_actions = th.rand(N_SAMPLES, N_ACTIONS) _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log(th.tensor(0.2))) if isinstance(dist, DiagGaussianDistribution): dist = dist.proba_distribution(deterministic_actions, log_std) else: dist.sample_weights(log_std, batch_size=N_SAMPLES) dist = dist.proba_distribution(deterministic_actions, log_std, state)
actions = dist.get_actions() assert th.allclose(actions.mean(), dist.distribution.mean.mean(), rtol=2e-3) assert th.allclose(actions.std(), dist.distribution.scale.mean(), rtol=2e-3) # TODO: analytical form for squashed Gaussian? @pytest.mark.parametrize( "dist", [ DiagGaussianDistribution(N_ACTIONS), StateDependentNoiseDistribution(N_ACTIONS, squash_output=False), ], ) def test_entropy(dist): # The entropy can be approximated by averaging the negative log likelihood # mean negative log likelihood == differential entropy set_random_seed(1) deterministic_actions = th.rand(1, N_ACTIONS).repeat(N_SAMPLES, 1) _, log_std = dist.proba_distribution_net(N_FEATURES, log_std_init=th.log( th.tensor(0.2))) if isinstance(dist, DiagGaussianDistribution): dist = dist.proba_distribution(deterministic_actions, log_std) else: state = th.rand(1, N_FEATURES).repeat(N_SAMPLES, 1)