def get_default_policy(environment, function_approximation):
    """Get default policy."""
    if function_approximation == "tabular":
        policy = TabularPolicy.default(environment)
    elif function_approximation == "linear":
        # "linear": a single wide hidden layer whose weights are frozen, so only
        # the linear output head is trained (i.e. linear in random features).
        policy = NNPolicy.default(environment, layers=[200])
        freeze_hidden_layers(policy)
    else:
        policy = NNPolicy.default(environment)
    return policy

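# Minimal usage sketch for get_default_policy(). The GymEnvironment wrapper,
# its import path, and the environment id are assumptions about the
# surrounding library and are not part of this excerpt; any value of
# function_approximation other than "tabular" or "linear" falls through to the
# default neural-network policy.
from rllib.environment import GymEnvironment  # assumed import path

environment = GymEnvironment("Pendulum-v1")  # illustrative environment id
linear_policy = get_default_policy(environment, function_approximation="linear")
nn_policy = get_default_policy(environment, function_approximation="nn")
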
def default(cls, environment, *args, **kwargs): """See AbstractPolicy.default().""" true_policy = NNPolicy.default(environment, *args, **kwargs) hallucination_policy = NNPolicy.default( environment, dim_action=environment.dim_state, *args, **kwargs) hallucination_policy.action_scale = torch.ones(environment.dim_state) return cls( true_policy=true_policy, hallucination_policy=hallucination_policy, *args, **kwargs, )
@classmethod
def default(
    cls,
    environment,
    critic=None,
    policy=None,
    lr=3e-4,
    deterministic=True,
    exploration_noise=None,
    policy_update_frequency=2,
    clip_gradient_val=10,
    *args,
    **kwargs,
):
    """See `AbstractAgent.default'."""
    if critic is None:
        critic = NNQFunction.default(environment)
    if policy is None:
        policy = NNPolicy.default(environment, deterministic=deterministic)
    optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)
    if exploration_noise is None:
        exploration_noise = OUNoise(dim=environment.dim_action)
    return super().default(
        environment=environment,
        critic=critic,
        policy=policy,
        optimizer=optimizer,
        exploration_noise=exploration_noise,
        policy_update_frequency=policy_update_frequency,
        clip_gradient_val=clip_gradient_val,
        *args,
        **kwargs,
    )

def default(cls, environment, *args, **kwargs): """See AbstractValueFunction.default.""" q_function = NNQFunction.default(environment, *args, **kwargs) policy = NNPolicy.default(environment, *args, **kwargs) return super().default(environment, q_function=q_function, policy=policy)
@classmethod
def default(
    cls,
    environment,
    policy=None,
    critic=None,
    critic_lr=1e-3,
    actor_lr=3e-4,
    *args,
    **kwargs,
):
    """See `AbstractAgent.default'."""
    if policy is None:
        policy = NNPolicy.default(environment)
    if critic is None:
        critic = NNQFunction.default(environment)
    optimizer = Adam(
        [
            {"params": policy.parameters(), "lr": actor_lr},
            {"params": critic.parameters(), "lr": critic_lr},
        ]
    )
    return super().default(
        environment=environment,
        policy=policy,
        critic=critic,
        optimizer=optimizer,
        *args,
        **kwargs,
    )

@classmethod
def default(
    cls,
    environment,
    critic=None,
    policy=None,
    lr=3e-4,
    policy_update_frequency=2,
    clip_gradient_val=10,
    *args,
    **kwargs,
):
    """See `AbstractAgent.default'."""
    if critic is None:
        critic = NNEnsembleQFunction.default(environment)
    if policy is None:
        policy = NNPolicy.default(environment)
    optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)
    return super().default(
        environment,
        critic=critic,
        policy=policy,
        optimizer=optimizer,
        policy_update_frequency=policy_update_frequency,
        clip_gradient_val=clip_gradient_val,
        *args,
        **kwargs,
    )

def init(
    self,
    discrete_state,
    discrete_action,
    dim_state,
    dim_action,
    num_heads,
    num_samples=1,
    layers=None,
    biased_head=True,
):
    self.num_states, self.dim_state = (
        (dim_state, ()) if discrete_state else (-1, (dim_state,))
    )
    self.num_actions, self.dim_action = (
        (dim_action, ()) if discrete_action else (-1, (dim_action,))
    )
    layers = layers if layers is not None else [32, 32]
    if num_heads is None:
        self.q_function = NNQFunction(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=layers,
            biased_head=biased_head,
        )
    else:
        self.q_function = NNEnsembleQFunction(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            num_heads=num_heads,
            layers=layers,
            biased_head=biased_head,
        )
    self.policy = NNPolicy(
        dim_state=self.dim_state,
        dim_action=self.dim_action,
        num_states=self.num_states,
        num_actions=self.num_actions,
        layers=layers,
        biased_head=biased_head,
    )
    self.value_function = IntegrateQValueFunction(
        q_function=self.q_function, policy=self.policy, num_samples=num_samples
    )

@classmethod
def default(cls, environment, policy=None, critic=None, lr=3e-4, *args, **kwargs):
    """See `AbstractAgent.default'."""
    if critic is None:
        critic = NNEnsembleQFunction.default(environment, jit_compile=False)
    if policy is None:
        policy = NNPolicy.default(environment, jit_compile=False)
    optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)
    return super().default(
        environment,
        critic=critic,
        policy=policy,
        optimizer=optimizer,
        *args,
        **kwargs,
    )

def init(
    self,
    discrete_state,
    discrete_action,
    dim_state,
    dim_action,
    deterministic=False,
    goal=None,
):
    self.num_states, self.dim_state = (
        (dim_state, ()) if discrete_state else (-1, (dim_state,))
    )
    self.num_actions, self.dim_action = (
        (dim_action, ()) if discrete_action else (-1, (dim_action,))
    )
    if discrete_state:
        base_dim = 1
    else:
        base_dim = self.dim_state[0]
    if discrete_action:
        base_dim += 1
    else:
        base_dim += self.dim_action[0]
    base_policy = NNPolicy(
        dim_state=self.dim_state,
        dim_action=(base_dim,),
        num_states=self.num_states,
        num_actions=self.num_actions,
        layers=[32, 32],
        deterministic=deterministic,
        goal=goal,
    )
    self.policy = DerivedPolicy(base_policy, self.dim_action)

def _get_nn_policy(dim_state, dim_action, params, action_scale, input_transform=None):
    if params.exploration == "optimistic":
        # Optimistic (hallucinated) exploration augments the action space with
        # one hallucinated action per state dimension.
        dim_action = (dim_action[0] + dim_state[0],)
    policy = NNPolicy(
        dim_state=dim_state,
        dim_action=dim_action,
        layers=params.policy_layers,
        biased_head=not params.policy_unbiased_head,
        non_linearity=params.policy_non_linearity,
        squashed_output=True,
        input_transform=input_transform,
        action_scale=action_scale,
        deterministic=params.policy_deterministic,
        tau=params.policy_tau,
    )
    params.update({"policy": policy.__class__.__name__})
    # policy = torch.jit.script(policy)
    return policy

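# A sketch of the configuration object that _get_nn_policy() expects: the
# function reads the attributes below and then calls params.update(...), so any
# dict-like container with attribute access works. The HyperParams class and
# all values here are illustrative stand-ins, not the repository's actual
# configuration.
import torch


class HyperParams(dict):
    """Hypothetical params container with both attribute and dict access."""

    __getattr__ = dict.__getitem__


params = HyperParams(
    exploration="expected",  # anything but "optimistic" keeps dim_action unchanged
    policy_layers=[64, 64],
    policy_unbiased_head=False,
    policy_non_linearity="Tanh",
    policy_deterministic=False,
    policy_tau=5e-3,
)
policy = _get_nn_policy(
    dim_state=(3,), dim_action=(1,), params=params, action_scale=torch.ones(1)
)
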
@classmethod
def default(cls, environment, critic=None, policy=None, lr=5e-3, *args, **kwargs):
    """See `AbstractAgent.default'."""
    if critic is None:
        critic = NNValueFunction.default(environment)
    if policy is None:
        policy = NNPolicy.default(environment)
    optimizer = Adam(critic.parameters(), lr=lr)
    return super().default(
        environment,
        policy=policy,
        critic=critic,
        optimizer=optimizer,
        *args,
        **kwargs,
    )

@classmethod
def default(cls, environment, policy=None, critic=None, lr=5e-4, *args, **kwargs):
    """See `AbstractAgent.default'."""
    if critic is None:
        critic = NNQFunction.default(environment)
    if policy is None:
        policy = NNPolicy.default(environment, layers=[100, 100])
    optimizer = Adam(chain(policy.parameters(), critic.parameters()), lr=lr)
    return super().default(
        environment,
        policy=policy,
        critic=critic,
        optimizer=optimizer,
        *args,
        **kwargs,
    )

class TestMLPPolicy(object):
    def init(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        deterministic=False,
    ):
        self.num_states, self.dim_state = (
            (dim_state, ()) if discrete_state else (-1, (dim_state,))
        )
        self.num_actions, self.dim_action = (
            (dim_action, ()) if discrete_action else (-1, (dim_action,))
        )
        self.policy = NNPolicy(
            dim_state=self.dim_state,
            dim_action=self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
            layers=[32, 32],
            deterministic=deterministic,
        )

    def test_property_values(
        self, discrete_state, discrete_action, dim_state, dim_action
    ):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        assert (
            self.num_states if self.num_states is not None else -1
        ) == self.policy.num_states
        assert (
            self.num_actions if self.num_actions is not None else -1
        ) == self.policy.num_actions
        assert discrete_state == self.policy.discrete_state
        assert discrete_action == self.policy.discrete_action

    def test_random_action(
        self, discrete_state, discrete_action, dim_state, dim_action
    ):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        distribution = tensor_to_distribution(self.policy.random())
        sample = distribution.sample()
        if distribution.has_enumerate_support:  # Discrete
            assert distribution.logits.shape == (self.num_actions,)
            assert sample.shape == ()
        else:  # Continuous
            assert distribution.mean.shape == self.dim_action
            assert sample.shape == (dim_action,)

    def test_forward(
        self,
        discrete_state,
        discrete_action,
        dim_state,
        dim_action,
        batch_size,
        deterministic,
    ):
        self.init(discrete_state, discrete_action, dim_state, dim_action, deterministic)
        state = random_tensor(discrete_state, dim_state, batch_size)
        distribution = tensor_to_distribution(self.policy(state))
        sample = distribution.sample()
        if distribution.has_enumerate_support:  # Discrete
            assert isinstance(distribution, Categorical)
            if batch_size:
                assert distribution.logits.shape == (batch_size, self.num_actions)
                assert sample.shape == (batch_size,)
            else:
                assert distribution.logits.shape == (self.num_actions,)
                assert sample.shape == ()
        else:  # Continuous
            if deterministic:
                assert isinstance(distribution, Delta)
            else:
                assert isinstance(distribution, MultivariateNormal)
            if batch_size:
                assert distribution.mean.shape == (batch_size,) + self.dim_action
                if not deterministic:
                    assert distribution.covariance_matrix.shape == (
                        batch_size,
                        self.dim_action[0],
                        self.dim_action[0],
                    )
                assert sample.shape == (batch_size, dim_action)
            else:
                assert distribution.mean.shape == self.dim_action
                if not deterministic:
                    assert distribution.covariance_matrix.shape == (
                        self.dim_action[0],
                        self.dim_action[0],
                    )
                assert sample.shape == (dim_action,)

    def test_embeddings(
        self, discrete_state, discrete_action, dim_state, dim_action, batch_size
    ):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        state = random_tensor(discrete_state, dim_state, batch_size)
        embeddings = self.policy.embeddings(state)
        assert embeddings.shape == torch.Size(
            [batch_size, 33] if batch_size else [33]
        )
        assert embeddings.dtype is torch.get_default_dtype()

    def test_input_transform(self, batch_size):
        policy = NNPolicy(
            dim_state=(2,),
            dim_action=(4,),
            layers=[64, 64],
            input_transform=StateTransform(),
        )
        out = tensor_to_distribution(policy(random_tensor(False, 2, batch_size)))
        action = out.sample()
        assert action.shape == torch.Size([batch_size, 4] if batch_size else [4])
        assert action.dtype is torch.get_default_dtype()

    def test_goal(self, batch_size):
        goal = random_tensor(False, 3, None)
        policy = NNPolicy(
            dim_state=(4,), dim_action=(2,), layers=[32, 32], goal=goal
        )
        state = random_tensor(False, 4, batch_size)
        pi = tensor_to_distribution(policy(state))
        action = pi.sample()
        assert action.shape == torch.Size([batch_size, 2] if batch_size else [2])
        assert action.dtype is torch.get_default_dtype()

        other_goal = random_tensor(False, 3, None)
        policy.set_goal(other_goal)
        other_pi = tensor_to_distribution(policy(state))
        assert not torch.any(other_pi.mean == pi.mean)

    def test_from_other(self, discrete_state, discrete_action, dim_state, dim_action):
        self.init(discrete_state, discrete_action, dim_state, dim_action)
        _test_from_other(self.policy, NNPolicy)
        _test_from_other_with_copy(self.policy, NNPolicy)

    def test_from_nn(self, discrete_state, dim_state, dim_action, batch_size):
        self.init(discrete_state, False, dim_state, dim_action)
        policy = NNPolicy.from_nn(
            HomoGaussianNN(
                self.policy.nn.kwargs["in_dim"],
                self.policy.nn.kwargs["out_dim"],
                layers=[20, 20],
                biased_head=False,
            ),
            self.dim_state,
            self.dim_action,
            num_states=self.num_states,
            num_actions=self.num_actions,
        )
        state = random_tensor(discrete_state, dim_state, batch_size)
        action = tensor_to_distribution(policy(state)).sample()
        embeddings = policy.embeddings(state)
        assert action.shape == torch.Size(
            [batch_size, dim_action] if batch_size else [dim_action]
        )
        assert embeddings.shape == torch.Size(
            [batch_size, 20] if batch_size else [20]
        )
        assert action.dtype is torch.get_default_dtype()
        assert embeddings.dtype is torch.get_default_dtype()

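# The tests in TestMLPPolicy rely on parametrized pytest fixtures
# (discrete_state, discrete_action, dim_state, dim_action, batch_size,
# deterministic) that are not shown in this excerpt. A minimal conftest-style
# sketch of what they could look like; the parameter values are illustrative,
# not the repository's actual configuration.
import pytest


@pytest.fixture(params=[True, False])
def discrete_state(request):
    return request.param


@pytest.fixture(params=[True, False])
def discrete_action(request):
    return request.param


@pytest.fixture(params=[2, 4])
def dim_state(request):
    return request.param


@pytest.fixture(params=[2, 4])
def dim_action(request):
    return request.param


@pytest.fixture(params=[None, 1, 16])
def batch_size(request):
    return request.param


@pytest.fixture(params=[True, False])
def deterministic(request):
    return request.param
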
@classmethod
def default(
    cls,
    environment,
    gamma=0.99,
    exploration_steps=0,
    exploration_episodes=0,
    tensorboard=False,
    test=False,
):
    """See `AbstractAgent.default'."""
    model = EnsembleModel(
        dim_state=environment.dim_state,
        dim_action=environment.dim_action,
        num_heads=5,
        layers=[200, 200],
        biased_head=False,
        non_linearity="ReLU",
        input_transform=None,
        deterministic=False,
    )
    dynamical_model = TransformedModel(model, list())
    model_optimizer = Adam(dynamical_model.parameters(), lr=5e-4)
    reward_model = QuadraticReward(
        torch.eye(environment.dim_state[0]),
        torch.eye(environment.dim_action[0]),
        goal=environment.goal,
    )
    policy = NNPolicy(
        dim_state=environment.dim_state,
        dim_action=environment.dim_action,
        layers=[100, 100],
        biased_head=True,
        non_linearity="ReLU",
        squashed_output=True,
        input_transform=None,
        action_scale=environment.action_scale,
        goal=environment.goal,
        deterministic=False,
        tau=5e-3,
    )
    value_function = NNValueFunction(
        dim_state=environment.dim_state,
        layers=[200, 200],
        biased_head=True,
        non_linearity="ReLU",
        input_transform=None,
        tau=5e-3,
    )
    optimizer = Adam(
        chain(policy.parameters(), value_function.parameters()), lr=5e-3
    )
    return cls(
        model_optimizer,
        policy,
        value_function,
        dynamical_model,
        reward_model,
        optimizer,
        mpo_value_learning_criterion=loss.MSELoss,
        termination_model=None,
        initial_distribution=None,
        plan_horizon=1,
        plan_samples=8,
        plan_elites=1,
        max_memory=10000,
        model_learn_batch_size=64,
        model_learn_num_iter=4 if test else 30,
        bootstrap=True,
        mpo_epsilon=0.1,
        mpo_epsilon_mean=0.1,
        mpo_epsilon_var=0.0001,
        mpo_regularization=False,
        mpo_num_iter=5 if test else 200,
        mpo_gradient_steps=50,
        mpo_batch_size=None,
        mpo_num_action_samples=15,
        mpo_target_update_frequency=4,
        sim_num_steps=5 if test else 200,
        sim_initial_states_num_trajectories=8,
        sim_initial_dist_num_trajectories=0,
        sim_memory_num_trajectories=0,
        sim_max_memory=100000,
        sim_num_subsample=1,
        sim_refresh_interval=1,
        thompson_sampling=False,
        gamma=gamma,
        exploration_steps=exploration_steps,
        exploration_episodes=exploration_episodes,
        tensorboard=tensorboard,
        comment=environment.name,
    )

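# The default() constructor above only reads a few attributes from the
# environment (dim_state, dim_action, action_scale, goal, name). A hypothetical
# stand-in exposing exactly those attributes, useful for smoke-testing agent
# construction without a simulator; the agent class name is a placeholder for
# the (unnamed) class this method belongs to, and all values are illustrative.
from types import SimpleNamespace

import torch

environment = SimpleNamespace(
    dim_state=(3,),
    dim_action=(1,),
    action_scale=torch.ones(1),
    goal=torch.zeros(3),
    name="StandInEnvironment",
)
agent = ModelBasedMPOAgent.default(environment, test=True)  # hypothetical class name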