def test_basic(self):
    state_dim = 8
    action_dim = 4
    model = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sizes=[7, 6],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.float_features.shape)
    # Using batch norm requires more than 1 example in training, avoid that
    model.eval()
    action = model(input)
    self.assertEqual((1, action_dim), action.action.shape)
def test_get_log_prob(self):
    torch.manual_seed(0)
    state_dim = 8
    action_dim = 4
    model = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sizes=[7, 6],
        activations=["relu", "relu"],
        use_batch_norm=False,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.float_features.shape)
    action = model(input)
    squashed_action = action.action.detach()
    action_log_prob = model.get_log_prob(input, squashed_action).detach()
    npt.assert_allclose(action.log_prob.detach(), action_log_prob, rtol=1e-4)
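# --- Illustrative sketch (not part of ReAgent) -------------------------------
# test_get_log_prob above checks that the log-prob returned alongside a sampled
# action matches the log-prob recomputed from the same squashed action via
# get_log_prob(). The self-contained snippet below shows that consistency
# property for a generic tanh-squashed Gaussian using the usual
# change-of-variables correction. All names here are hypothetical; this is not
# GaussianFullyConnectedActor's actual implementation.
import numpy.testing as npt
import torch
from torch.distributions import Normal


def squashed_log_prob(mean, log_std, squashed_action, eps=1e-6):
    # Invert the tanh squashing, then subtract the Jacobian correction
    # log(1 - a^2) for the change of variables a = tanh(u).
    raw = torch.atanh(squashed_action.clamp(-1 + eps, 1 - eps))
    base = Normal(mean, log_std.exp()).log_prob(raw)
    return (base - torch.log(1 - squashed_action.pow(2) + eps)).sum(dim=-1)


torch.manual_seed(0)
mean, log_std = torch.zeros(1, 4), torch.zeros(1, 4)
raw_sample = Normal(mean, log_std.exp()).sample()
action = torch.tanh(raw_sample)
# Log-prob computed at sampling time, from the pre-squash sample.
log_prob_at_sampling = (
    Normal(mean, log_std.exp()).log_prob(raw_sample)
    - torch.log(1 - action.pow(2) + 1e-6)
).sum(dim=-1)
# Recomputing from the squashed action alone agrees up to numerical error.
npt.assert_allclose(
    log_prob_at_sampling.numpy(),
    squashed_log_prob(mean, log_std, action).numpy(),
    rtol=1e-4,
)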
def build_actor(
    self,
    state_normalization_data: NormalizationData,
    action_normalization_data: NormalizationData,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    action_dim = get_num_output_features(
        action_normalization_data.dense_normalization_parameters
    )
    return GaussianFullyConnectedActor(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=self.sizes,
        activations=self.activations,
        use_batch_norm=self.use_batch_norm,
        use_layer_norm=self.use_layer_norm,
    )
def test_save_load(self):
    state_dim = 8
    action_dim = 4
    model = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sizes=[7, 6],
        activations=["relu", "relu"],
        use_batch_norm=False,
    )
    expected_num_params, expected_num_inputs, expected_num_outputs = 6, 1, 1
    # Actor output is stochastic and won't match between PyTorch & Caffe2
    check_save_load(
        self,
        model,
        expected_num_params,
        expected_num_inputs,
        expected_num_outputs,
        check_equality=False,
    )
def build_actor(
    self,
    state_feature_config: rlt.ModelFeatureConfig,
    state_normalization_data: NormalizationData,
    action_normalization_data: NormalizationData,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    action_dim = get_num_output_features(
        action_normalization_data.dense_normalization_parameters
    )
    input_dim = state_dim
    embedding_dim = self.embedding_dim
    embedding_concat = None
    if embedding_dim is not None:
        embedding_concat = models.EmbeddingBagConcat(
            state_dim=state_dim,
            model_feature_config=state_feature_config,
            embedding_dim=embedding_dim,
        )
        input_dim = embedding_concat.output_dim
    gaussian_fc_actor = GaussianFullyConnectedActor(
        state_dim=input_dim,
        action_dim=action_dim,
        sizes=self.sizes,
        activations=self.activations,
        use_batch_norm=self.use_batch_norm,
        use_layer_norm=self.use_layer_norm,
        use_l2_normalization=self.use_l2_normalization,
    )
    if not embedding_dim:
        return gaussian_fc_actor
    assert embedding_concat is not None
    return models.Sequential(  # type: ignore
        embedding_concat,
        rlt.TensorFeatureData(),
        gaussian_fc_actor,
    )
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    assert rl_parameters == trainer_parameters.rl
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, critic_training.layers, critic_training.activations
    )
    q2_network = None
    # TODO:
    # if trainer_parameters.use_2_q_functions:
    #     q2_network = FullyConnectedParametricDQN(
    #         state_dim,
    #         action_dim,
    #         critic_training.layers,
    #         critic_training.activations,
    #     )
    value_network = None
    if sac_value_training:
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(
        state_dim, action_dim, actor_training.layers, actor_training.activations
    )
    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(dim=0)  # type: ignore
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()
    return SACTrainer(
        q1_network,
        actor_network,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
        min_action_range_tensor_training=min_action_range_tensor_training,
        max_action_range_tensor_training=max_action_range_tensor_training,
        min_action_range_tensor_serving=min_action_range_tensor_serving,
        max_action_range_tensor_serving=max_action_range_tensor_serving,
    )
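# --- Illustrative sketch (not part of ReAgent) -------------------------------
# get_sac_trainer above passes SACTrainer two action ranges: a training range
# clamped just inside (-1, 1) (the image of the tanh squashing) and a serving
# range taken from the Gym action space. The hypothetical helper below shows
# one way such ranges can be used to rescale a squashed action for the
# environment; it is an assumption for illustration, not SACTrainer's internal
# code.
import torch


def rescale_action(
    action: torch.Tensor,
    min_train: torch.Tensor,
    max_train: torch.Tensor,
    min_serve: torch.Tensor,
    max_serve: torch.Tensor,
) -> torch.Tensor:
    # Affine map from [min_train, max_train] to [min_serve, max_serve].
    scale = (max_serve - min_serve) / (max_train - min_train)
    return min_serve + (action - min_train) * scale


# Example: map tanh-squashed actions in (-1, 1) to an env with bounds [0, 2].
action = torch.tensor([[-1 + 1e-6, -0.5, 0.0, 1 - 1e-6]])
print(
    rescale_action(
        action,
        torch.full((1, 4), -1 + 1e-6),
        torch.full((1, 4), 1 - 1e-6),
        torch.zeros(1, 4),
        torch.full((1, 4), 2.0),
    )
)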
def get_sac_trainer(
    self,
    env,
    use_gpu,
    use_2_q_functions=False,
    logged_action_uniform_prior=True,
    constrain_action_sum=False,
    use_value_network=True,
    use_alpha_optimizer=True,
    entropy_temperature=None,
):
    q_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    value_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    actor_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, q_network_params.layers, q_network_params.activations
    )
    q2_network = None
    if use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            q_network_params.layers,
            q_network_params.activations,
        )
    if constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    value_network = None
    if use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + value_network_params.layers + [1],
            value_network_params.activations + ["linear"],
        )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()
    parameters = SACTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
        minibatch_size=self.minibatch_size,
        q_network_optimizer=OptimizerParameters(),
        value_network_optimizer=OptimizerParameters(),
        actor_network_optimizer=OptimizerParameters(),
        alpha_optimizer=OptimizerParameters() if use_alpha_optimizer else None,
        entropy_temperature=entropy_temperature,
        logged_action_uniform_prior=logged_action_uniform_prior,
    )
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )