def test_output_properties(self, out_dim, num_heads, batch_size, deterministic):
    in_dim = (4,)
    net = Ensemble(in_dim, out_dim, num_heads=num_heads, deterministic=deterministic)
    if batch_size is None:
        t = torch.randn(in_dim)
    else:
        t = torch.randn((batch_size, 2) + in_dim)
    o = tensor_to_distribution(net(t))
    assert isinstance(o, torch.distributions.MultivariateNormal)
    assert o.has_rsample
    assert not o.has_enumerate_support
    assert o.batch_shape == torch.Size(
        (batch_size, 2) if batch_size is not None else ()
    )

    net.set_prediction_strategy("set_head")
    net.set_head(0)
    o = tensor_to_distribution(net(t))
    if deterministic:
        assert isinstance(o, Delta)
    else:
        assert isinstance(o, torch.distributions.MultivariateNormal)
    assert o.batch_shape == torch.Size(
        (batch_size, 2) if batch_size is not None else ()
    )
    assert o.has_rsample
    assert not o.has_enumerate_support

def get_log_p_and_ope_weight(self, state, action):
    """Get log_p of a state-action pair and the off-policy weight w.r.t. the old policy."""
    pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
    pi_o = tensor_to_distribution(self.old_policy(state), **self.policy.dist_params)
    _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
    _, log_p_old = get_entropy_and_log_p(pi_o, action, self.policy.action_scale)
    ratio = torch.exp(log_p - log_p_old)
    return log_p, ratio

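# Sanity check for the importance ratio above (illustrative numbers, not from
# the source): for log-probabilities log_p and log_p_old, exp(log_p - log_p_old)
# equals pi(a|s) / pi_old(a|s).
def _check_importance_ratio():
    import torch

    log_p = torch.log(torch.tensor(0.3))
    log_p_old = torch.log(torch.tensor(0.6))
    ratio = torch.exp(log_p - log_p_old)
    assert torch.isclose(ratio, torch.tensor(0.5))  # 0.3 / 0.6
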
def test_output_shape(self, net, in_dim, out_dim, batch_size):
    net = torch.jit.script(net(in_dim, out_dim))
    if batch_size is None:
        t = torch.randn(in_dim)
        o = tensor_to_distribution(net(t)).sample()
        assert o.shape == torch.Size(out_dim)
    else:
        t = torch.randn((batch_size,) + in_dim)
        o = tensor_to_distribution(net(t)).sample()
        assert o.shape == torch.Size((batch_size,) + out_dim)

def actor_loss(self, observation):
    """Use the model to compute the gradient loss."""
    state, action = observation.state, observation.action
    next_state, done = observation.next_state, observation.done

    # Infer eta.
    action_mean, action_chol = self.policy(state)
    with torch.no_grad():
        eta = torch.inverse(action_chol) @ ((action - action_mean).unsqueeze(-1))

    # Compute entropy and log_probability.
    pi = tensor_to_distribution((action_mean, action_chol))
    _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)

    # Compute off-policy weight.
    with torch.no_grad():
        weight = self.get_ope_weight(state, action, observation.log_prob_action)

    with DisableGradient(
        self.dynamical_model,
        self.reward_model,
        self.termination_model,
        self.critic_target,
    ):
        # Compute re-parameterized policy sample.
        action = (action_mean + (action_chol @ eta).squeeze(-1)).clamp(-1, 1)

        # Infer xi.
        ns_mean, ns_chol = self.dynamical_model(state, action)
        with torch.no_grad():
            xi = torch.inverse(ns_chol) @ ((next_state - ns_mean).unsqueeze(-1))

        # Compute re-parameterized next-state sample.
        ns = ns_mean + (ns_chol @ xi).squeeze(-1)

        # Compute reward.
        r = tensor_to_distribution(self.reward_model(state, action, ns)).rsample()
        r = r[..., 0]

        next_v = self.value_function(ns)
        if isinstance(self.critic, (NNEnsembleValueFunction, NNEnsembleQFunction)):
            next_v = next_v[..., 0]

        v = r + self.gamma * next_v * (1 - done)

    return Loss(policy_loss=-(weight * v)).reduce(self.criterion.reduction)

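# Illustration of the re-parameterization used in actor_loss above (pure torch,
# hypothetical numbers): solving eta = L^{-1} (a - mu) under no_grad and then
# rebuilding a = mu + L @ eta reproduces the stored sample while making it
# differentiable with respect to mu and L.
def _example_reparameterize_fixed_sample():
    import torch

    mu = torch.zeros(2, requires_grad=True)
    chol = torch.eye(2, requires_grad=True)
    action = torch.tensor([0.5, -1.0])  # a previously drawn action, treated as fixed

    with torch.no_grad():
        eta = torch.inverse(chol) @ (action - mu)
    action_reparam = mu + chol @ eta  # numerically equal to `action`, but carries gradients
    action_reparam.sum().backward()
    assert mu.grad is not None and chol.grad is not None
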
def test_goal(self, batch_size):
    goal = random_tensor(False, 3, None)
    self.init(False, False, 4, 2, goal=goal)
    state = random_tensor(False, 4, batch_size)
    pi = tensor_to_distribution(self.policy(state))
    action = pi.sample()
    assert action.shape == torch.Size([batch_size, 2] if batch_size else [2])
    assert action.dtype is torch.get_default_dtype()

    other_goal = random_tensor(False, 3, None)
    self.policy.set_goal(other_goal)
    other_pi = tensor_to_distribution(self.policy(state))
    assert not torch.any(other_pi.mean == pi.mean)

def test_output_shape(self, out_dim, batch_size, num_heads, deterministic):
    in_dim = (4,)
    net = Ensemble(in_dim, out_dim, num_heads=num_heads, deterministic=deterministic)
    if batch_size is None:
        t = torch.randn(in_dim)
        o = tensor_to_distribution(net(t)).sample()
        assert o.shape == torch.Size(out_dim)
    else:
        t = torch.randn((batch_size,) + in_dim)
        o = tensor_to_distribution(net(t)).sample()
        assert o.shape == torch.Size((batch_size,) + out_dim)

def test_class_method(self, net, batch_size, out_dim, num_heads):
    layers = [64, 64]
    in_dim = (4,)
    try:
        n1 = net(in_dim, out_dim, layers=layers, num_heads=num_heads)
    except TypeError:
        # `base_net` only exists on this path, so the deterministic check
        # belongs inside the except block.
        base_net = net(in_dim, out_dim, layers=layers)
        n1 = Ensemble.from_feedforward(base_net, num_heads=num_heads)
        if isinstance(base_net, DeterministicNN):
            assert n1.deterministic
        else:
            assert not n1.deterministic

    _test_from_other(n1, Ensemble)
    _test_from_other_with_copy(n1, Ensemble)

    # Test layers
    layers = layers or list()

    # Check nn.parameters (+2: mean and scale heads).
    assert 2 * (len(layers) + 2) == len([*n1.parameters()])

    # Check shapes
    layers.append(out_dim[0] * num_heads)
    layers.append(out_dim[0] * num_heads)
    i = 0
    for name, param in n1.named_parameters():
        if name.startswith("_scale"):
            assert param.shape[0] == out_dim[0] * num_heads
        else:
            assert param.shape[0] == layers[i // 2]
        i += 1

    # Check output
    if batch_size is None:
        t = torch.randn(in_dim)
        o = tensor_to_distribution(n1(t))
        assert o.sample().shape == torch.Size(out_dim)
        assert o.batch_shape == torch.Size([])
    else:
        t = torch.randn((batch_size, 2) + in_dim)
        o = tensor_to_distribution(n1(t))
        assert o.sample().shape == torch.Size((batch_size, 2) + out_dim)
        assert o.batch_shape == torch.Size((batch_size, 2))
    assert isinstance(o, torch.distributions.MultivariateNormal)
    assert o.has_rsample
    assert not o.has_enumerate_support

def act(self, state):
    """Ask the agent for an action to interact with the environment."""
    if self.total_steps < self.exploration_steps or (
        self.total_episodes < self.exploration_episodes
    ):
        policy = self.policy.random()
    else:
        if not isinstance(state, torch.Tensor):
            state = torch.tensor(
                state, dtype=torch.get_default_dtype(), device=self.device
            )
        policy = self.policy(state)

    self.pi = tensor_to_distribution(policy, **self.policy.dist_params)
    if self.training:
        action = self.pi.sample()
    elif self.pi.has_enumerate_support:
        action = torch.argmax(self.pi.probs)
    else:
        try:
            action = self.pi.mean
        except NotImplementedError:
            action = self.pi.sample((100,)).mean(dim=0)

    if not self.policy.discrete_action:
        action = action.clamp(-1.0, 1.0)
        action = self.policy.action_scale * action
    return action.detach().to("cpu").numpy()

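# Illustration of the evaluation fallback in `act` above: distributions without
# a closed-form mean (e.g. a tanh-squashed Gaussian) raise NotImplementedError,
# so the mean is approximated by averaging many samples. Hypothetical example,
# not library code.
def _example_mean_fallback():
    import torch
    from torch.distributions import Normal, TransformedDistribution
    from torch.distributions.transforms import TanhTransform

    squashed = TransformedDistribution(Normal(0.0, 1.0), [TanhTransform()])
    try:
        action = squashed.mean
    except NotImplementedError:
        action = squashed.sample((100,)).mean(dim=0)  # same trick as in `act`
    return action
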
def get_ope_weight(self, state, action, log_prob_action):
    """Get off-policy weight of a given transition."""
    pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
    _, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
    weight = off_policy_weight(log_p, log_prob_action, full_trajectory=False)
    return weight

def mdp2mrp(transitions, rewards, policy, terminal_states=None):
    """Transform an MDP and a policy into an MRP.

    Parameters
    ----------
    transitions: Tensor.
        Transition kernel P(s' | s, a).
    rewards: Tensor.
        Reward table r(s, a).
    policy: AbstractPolicy.
        Policy that fixes the action distribution at each state.
    terminal_states: list, optional.
        States that transition to themselves with zero reward.

    Returns
    -------
    mrp_kernel: Tensor.
        Markov-chain kernel P_pi(s' | s) under the policy.
    mrp_reward: Tensor.
        Expected reward r_pi(s) under the policy.
    """
    num_states, num_actions = rewards.shape
    mrp_kernel = torch.zeros((num_states, 1, num_states))
    mrp_reward = torch.zeros((num_states, 1))

    if terminal_states is None:
        terminal_states = []

    for state in range(num_states):
        if state in terminal_states:
            mrp_kernel[state, 0, state] = 1
            mrp_reward[state] = 0
            continue
        state = torch.tensor(state).long()
        policy_ = tensor_to_distribution(policy(state), **policy.dist_params)
        for action, p_action in enumerate(policy_.probs):
            for next_state, p_next_state in enumerate(transitions[state, action]):
                mrp_reward[state, 0] += p_action * p_next_state * rewards[state, action]
                mrp_kernel[state, 0, next_state] += p_action * p_next_state
    return mrp_kernel, mrp_reward

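# Worked example of the MRP reduction performed by mdp2mrp (pure torch,
# illustrative 2-state / 2-action numbers, not from the source): under a fixed
# policy pi, the nested loops above accumulate
#   P_pi(s' | s) = sum_a pi(a | s) * P(s' | s, a)
#   r_pi(s)      = sum_a pi(a | s) * r(s, a)
def _example_mrp_reduction():
    import torch

    transitions = torch.tensor(
        [[[0.9, 0.1], [0.2, 0.8]],  # P(s' | s=0, a)
         [[0.5, 0.5], [0.0, 1.0]]]  # P(s' | s=1, a)
    )
    rewards = torch.tensor([[1.0, 0.0], [0.0, 2.0]])  # r(s, a)
    pi = torch.tensor([[0.5, 0.5], [0.25, 0.75]])  # pi(a | s)

    mrp_kernel = torch.einsum("sa,san->sn", pi, transitions)
    mrp_reward = (pi * rewards).sum(dim=-1)
    assert torch.allclose(mrp_kernel.sum(dim=-1), torch.ones(2))  # rows are distributions
    return mrp_kernel, mrp_reward
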
def test_forward(self, dim_state, dim_action, batch_size, deterministic):
    self.init(False, False, dim_state, dim_action, deterministic)
    state = random_tensor(False, dim_state, batch_size)
    distribution = tensor_to_distribution(self.policy(state))
    sample = distribution.sample()

    if deterministic:
        assert isinstance(distribution, Delta)
    else:
        assert isinstance(distribution, MultivariateNormal)

    if batch_size:
        assert distribution.mean.shape == (batch_size,) + self.dim_action
        if not deterministic:
            assert distribution.covariance_matrix.shape == (
                batch_size,
                self.dim_action[0],
                self.dim_action[0],
            )
        assert sample.shape == (batch_size, dim_action)
    else:
        assert distribution.mean.shape == self.dim_action
        if not deterministic:
            assert distribution.covariance_matrix.shape == (
                self.dim_action[0],
                self.dim_action[0],
            )
        assert sample.shape == torch.Size((dim_action,))

def test_call(self, discrete_state, discrete_action, dim_state, dim_action, batch_size):
    self.init(discrete_state, discrete_action, dim_state, dim_action)
    state = random_tensor(discrete_state, dim_state, batch_size)
    distribution = tensor_to_distribution(self.policy(state))
    sample = distribution.sample()

    if distribution.has_enumerate_support:  # Discrete
        assert isinstance(distribution, Categorical)
        if batch_size:
            assert distribution.logits.shape == (batch_size, self.num_actions)
            assert sample.shape == (batch_size,)
        else:
            assert distribution.logits.shape == (self.num_actions,)
            assert sample.shape == ()
    else:  # Continuous
        assert isinstance(distribution, MultivariateNormal)
        if batch_size:
            assert distribution.mean.shape == (batch_size,) + self.dim_action
            assert distribution.covariance_matrix.shape == (
                batch_size,
                self.dim_action[0],
                self.dim_action[0],
            )
            assert sample.shape == (batch_size, dim_action)
        else:
            assert distribution.mean.shape == self.dim_action
            assert distribution.covariance_matrix.shape == (
                self.dim_action[0],
                self.dim_action[0],
            )
            assert sample.shape == (dim_action,)

def test_from_nn(self, discrete_state, dim_state, dim_action, batch_size):
    self.init(discrete_state, False, dim_state, dim_action)
    policy = NNPolicy.from_nn(
        HomoGaussianNN(
            self.policy.nn.kwargs["in_dim"],
            self.policy.nn.kwargs["out_dim"],
            layers=[20, 20],
            biased_head=False,
        ),
        self.dim_state,
        self.dim_action,
        num_states=self.num_states,
        num_actions=self.num_actions,
    )

    state = random_tensor(discrete_state, dim_state, batch_size)
    action = tensor_to_distribution(policy(state)).sample()
    embeddings = policy.embeddings(state)

    assert action.shape == torch.Size([batch_size, dim_action] if batch_size else [dim_action])
    assert embeddings.shape == torch.Size([batch_size, 20] if batch_size else [20])
    assert action.dtype is torch.get_default_dtype()
    assert embeddings.dtype is torch.get_default_dtype()

def test_discrete(t_start, q_function):
    policy = SoftMax(q_function, t_start)
    for _ in range(100):
        state = torch.randint(4, ())
        logits = q_function(state)
        probs = torch.softmax(logits / t_start, dim=0)
        torch.testing.assert_allclose(tensor_to_distribution(policy(state)).probs, probs)

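# Temperature behaviour of the soft-max policy tested above (illustrative
# numbers): dividing the logits by a lower temperature sharpens the
# distribution toward the greedy action.
def _example_softmax_temperature():
    import torch

    logits = torch.tensor([1.0, 2.0])
    sharp = torch.softmax(logits / 0.1, dim=0)
    flat = torch.softmax(logits / 10.0, dim=0)
    assert sharp[1] > flat[1]  # low temperature concentrates mass on the argmax
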
def test_random_action(self, dim_state, dim_action):
    self.init(False, False, dim_state, dim_action)
    distribution = tensor_to_distribution(self.policy.random())
    sample = distribution.sample()
    assert distribution.mean.shape == self.dim_action
    assert sample.shape == (dim_action,)

def actor_loss(self, observation):
    """Compute actor loss."""
    state = repeat_along_dimension(observation.state, number=self.num_samples, dim=0)
    pi = tensor_to_distribution(self.old_policy(state), **self.policy.dist_params)
    action = self.policy.action_scale * pi.sample().clamp(-1.0, 1.0)
    return self.compute_mpo_loss(state, action)

def step(self, action):
    """See `AbstractSystem.step'."""
    if not isinstance(action, torch.Tensor):
        action = torch.tensor(action, dtype=torch.get_default_dtype())
    state = torch.tensor(self.state, dtype=torch.get_default_dtype())
    self.state = tensor_to_distribution(self.dynamical_model(state, action)).sample().numpy()
    return self.state

def test_goal(self, batch_size):
    goal = random_tensor(False, 3, None)
    policy = NNPolicy(dim_state=(4,), dim_action=(2,), layers=[32, 32], goal=goal)
    state = random_tensor(False, 4, batch_size)
    pi = tensor_to_distribution(policy(state))
    action = pi.sample()
    assert action.shape == torch.Size([batch_size, 2] if batch_size else [2])
    assert action.dtype is torch.get_default_dtype()

    other_goal = random_tensor(False, 3, None)
    policy.set_goal(other_goal)
    other_pi = tensor_to_distribution(policy(state))
    assert not torch.any(other_pi.mean == pi.mean)

def test_discrete(eps_start, q_function):
    policy = EpsGreedy(q_function, eps_start)
    for _ in range(100):
        state = torch.randint(4, ())
        action = q_function(state).argmax(dim=-1)
        probs = eps_start / 2 * torch.ones(2)
        probs[action] += 1 - eps_start
        assert (tensor_to_distribution(policy(state)).probs == probs).all()

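# The epsilon-greedy probabilities checked above, spelled out (hypothetical
# numbers): each of the two actions receives eps / 2 exploration mass, and the
# greedy action receives the remaining 1 - eps.
def _example_eps_greedy_probs():
    import torch

    eps = 0.1
    q_values = torch.tensor([0.2, 1.0])
    probs = eps / 2 * torch.ones(2)
    probs[q_values.argmax()] += 1 - eps
    assert torch.isclose(probs.sum(), torch.tensor(1.0))
    return probs  # tensor([0.05, 0.95])
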
def step(self, action):
    """See `AbstractEnvironment.step'."""
    self._time += 1
    state = self.system.state  # this might be noisy.

    reward = float("nan")
    if self.reward is not None:
        reward = tensor_to_distribution(self.reward(state, action, None)).sample().squeeze(-1)

    next_state = self.system.step(action)
    if self.termination_model is not None:
        done = (
            tensor_to_distribution(self.termination_model(state, action, next_state))
            .sample()
            .squeeze(-1)
        )
    else:
        done = False
    return next_state, reward, done, {}

def get_kl_entropy(self, state):
    """Get the KL divergence and the current policy entropy at a given state.

    Compute the separated KL divergence between the current and the old policy.
    When the policy is a MultivariateNormal distribution, it computes the
    divergences that correspond to the mean and the covariance separately.
    When the policy is a Categorical distribution, it computes the divergence
    and assigns it to the mean component; the variance component is set to zero.

    Parameters
    ----------
    state: torch.Tensor
        Empirical state distribution.

    Returns
    -------
    kl_mean: torch.Tensor
        KL divergence due to the change in the mean between current and previous policy.
    kl_var: torch.Tensor
        KL divergence due to the change in the variance between current and previous policy.
    entropy: torch.Tensor
        Entropy of the current policy at the given state.
    """
    pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
    pi_old = tensor_to_distribution(self.old_policy(state), **self.policy.dist_params)

    try:
        action = pi.rsample()
    except NotImplementedError:
        action = pi.sample()
    if not self.policy.discrete_action:
        action = self.policy.action_scale * action.clamp(-1.0, 1.0)

    entropy, log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
    _, log_p_old = get_entropy_and_log_p(pi_old, action, self.policy.action_scale)

    kl_mean, kl_var = separated_kl(p=pi_old, q=pi, log_p=log_p_old, log_q=log_p)
    return kl_mean, kl_var, entropy

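# Illustration of the idea behind the separated KL for Gaussians (pure torch,
# hypothetical numbers; `separated_kl` itself is a library function and may
# differ in detail): the mean term holds the covariance fixed at p's, and the
# covariance term holds the mean fixed at p's, isolating the two sources of
# change between the distributions.
def _example_separated_kl():
    import torch
    from torch.distributions import MultivariateNormal
    from torch.distributions.kl import kl_divergence

    p = MultivariateNormal(torch.zeros(2), torch.eye(2))
    q = MultivariateNormal(torch.ones(2), 2.0 * torch.eye(2))

    kl_mean = kl_divergence(p, MultivariateNormal(q.mean, p.covariance_matrix))
    kl_var = kl_divergence(p, MultivariateNormal(p.mean, q.covariance_matrix))
    return kl_mean, kl_var
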
def collect_model_transitions(state_dist, policy, dynamical_model, reward_model, num_samples):
    """Collect transitions by sampling from learned dynamics and reward models.

    Parameters
    ----------
    state_dist: Distribution.
        State distribution.
    policy: AbstractPolicy or Distribution.
        Policy used to select actions.
    dynamical_model: AbstractModel.
        Model with which to interact.
    reward_model: AbstractReward.
        Reward model with which to interact.
    num_samples: int.
        Number of transitions.

    Returns
    -------
    transitions: List[Observation]
        List of 1-step transitions.
    """
    state = state_dist.sample((num_samples,))
    if isinstance(policy, AbstractPolicy):
        action_dist = tensor_to_distribution(policy(state), **policy.dist_params)
        action = action_dist.sample()
    else:  # action distribution
        action_dist = policy
        action = action_dist.sample((num_samples,))

    next_state = tensor_to_distribution(dynamical_model(state, action)).sample()
    reward = tensor_to_distribution(reward_model(state, action, next_state)).sample()

    transitions = []
    for state_, action_, reward_, next_state_ in zip(state, action, reward, next_state):
        transitions.append(Observation(state_, action_, reward_, next_state_).to_torch())
    return transitions

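# Shape conventions used by the sampling above (pure torch, illustrative):
# Distribution.sample((n,)) prepends the sample shape, so drawing num_samples
# states from a dim-3 state distribution yields a (num_samples, 3) batch.
def _example_batched_sampling():
    import torch

    state_dist = torch.distributions.MultivariateNormal(torch.zeros(3), torch.eye(3))
    states = state_dist.sample((5,))
    assert states.shape == torch.Size([5, 3])  # num_samples x dim_state
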
def _policy_weighted_nll(self, state, action, weights):
    """Return weighted policy negative log-likelihood."""
    pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
    _, action_log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
    weighted_log_p = weights.detach() * action_log_p

    # Clamping is crucial for stability so that it does not converge to a delta.
    log_likelihood = torch.mean(weighted_log_p.clamp_max(1e-3))
    return -log_likelihood

def build_empirical_y0(observation, support, policy=None):
    """Build empirical distribution over samples."""
    state = observation.state
    num_states_ = state.shape[0]
    if support == "state-action":
        y0 = torch.ones(num_states_) / float(num_states_)
    elif support == "state":
        pi = tensor_to_distribution(policy(state).detach())
        y0 = pi.probs / float(num_states_)
    else:
        raise NotImplementedError(f"{support} not implemented.")
    return y0

def rollout_policy(environment, policy, num_episodes=1, max_steps=1000, render=False):
    """Conduct a rollout of a policy in an environment.

    Parameters
    ----------
    environment: AbstractEnvironment
        Environment with which the policy interacts.
    policy: AbstractPolicy
        Policy that interacts with the environment.
    num_episodes: int, optional (default=1)
        Number of episodes.
    max_steps: int.
        Maximum number of steps per episode.
    render: bool.
        Flag that indicates whether to render the environment or not.

    Returns
    -------
    trajectories: List[Trajectory] = List[List[Observation]]
        A list of trajectories.
    """
    trajectories = []
    for _ in tqdm(range(num_episodes)):
        state = environment.reset()
        done = False
        trajectory = []
        with torch.no_grad():
            time_step = 0
            while not done:
                pi = tensor_to_distribution(
                    policy(torch.tensor(state, dtype=torch.get_default_dtype())),
                    **policy.dist_params,
                )
                action = pi.sample()
                if not policy.discrete_action:
                    action = policy.action_scale * action.clamp_(-1.0, 1.0)
                obs, state, done, info = step_env(
                    environment=environment,
                    state=state,
                    action=action.detach().numpy(),
                    action_scale=policy.action_scale,
                    pi=pi,
                    render=render,
                )
                trajectory.append(obs)

                time_step += 1
                if max_steps <= time_step:
                    break
        trajectories.append(trajectory)
    return trajectories

def compute_mpo_loss(self, state, action):
    """Compute mpo loss for a given set of state/action pairs."""
    pi_dist = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
    log_p = pi_dist.log_prob(action)

    q_values = self.critic_target(state, action)
    mpo_loss = self.mpo_loss(q_values=q_values, action_log_p=log_p).reduce(
        self.criterion.reduction
    )
    self._info.update(mpo_eta=self.mpo_loss.eta)
    return mpo_loss

def train_exact_gp_type2mll_step(model, observation, optimizer):
    """Train a GP using type-2 marginal log-likelihood optimization."""
    optimizer.zero_grad()
    output = tensor_to_distribution(model(observation.state[:, 0], observation.action[:, 0]))
    with gpytorch.settings.fast_pred_var():
        val = torch.stack(tuple(gp.train_targets for gp in model.gp), 0)
        loss = exact_mll(output, val, model.gp)
    loss.backward()

    optimizer.step()
    model.eval()
    return loss

def test_input_transform(self, batch_size):
    policy = NNPolicy(
        dim_state=(2,),
        dim_action=(4,),
        layers=[64, 64],
        input_transform=StateTransform(),
    )
    out = tensor_to_distribution(policy(random_tensor(False, 2, batch_size)))
    action = out.sample()
    assert action.shape == torch.Size([batch_size, 4] if batch_size else [4])
    assert action.dtype is torch.get_default_dtype()

def test_random_action(self, discrete_state, discrete_action, dim_state, dim_action):
    self.init(discrete_state, discrete_action, dim_state, dim_action)
    distribution = tensor_to_distribution(self.policy.random())
    sample = distribution.sample()

    if distribution.has_enumerate_support:  # Discrete
        assert distribution.logits.shape == (self.num_actions,)
        assert sample.shape == ()
    else:  # Continuous
        assert distribution.mean.shape == self.dim_action
        assert sample.shape == (dim_action,)

def test_output_properties(self, net, in_dim, out_dim, batch_size):
    net = torch.jit.script(net(in_dim, out_dim))
    if batch_size is None:
        t = torch.randn(in_dim)
    else:
        t = torch.randn((batch_size, 2) + in_dim)
    o = tensor_to_distribution(net(t))
    assert isinstance(o, torch.distributions.MultivariateNormal)
    assert o.has_rsample
    assert not o.has_enumerate_support
    assert o.batch_shape == torch.Size(
        (batch_size, 2) if batch_size is not None else ()
    )