def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    """Assemble a ``ParametricDQNTrainer`` for ``environment``.

    Args:
        environment: Object exposing ``normalization`` and
            ``normalization_action``; both are passed to
            ``get_num_output_features`` to size the networks.
        parameters: Trainer parameters. When falsy, the result of
            ``self.get_sarsa_parameters()`` is used instead.
        use_gpu: When true, the Q-network and the reward network are moved
            with ``.cuda()``.
        use_all_avail_gpus: When true, both networks are replaced by the
            result of ``get_distributed_data_parallel_model()``.

    Returns:
        The constructed ``ParametricDQNTrainer``.
    """
    if not parameters:
        parameters = self.get_sarsa_parameters()

    def build_network():
        # The Q-network and the reward network use the same constructor
        # arguments; every call constructs a separate instance.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    q_net = build_network()
    reward_net = build_network()

    if use_gpu:
        q_net, reward_net = q_net.cuda(), reward_net.cuda()
    if use_all_avail_gpus:
        q_net = q_net.get_distributed_data_parallel_model()
        reward_net = reward_net.get_distributed_data_parallel_model()

    # The target network is requested only after the optional GPU move and
    # data-parallel wrapping of the Q-network.
    target_net = q_net.get_target_network()
    return ParametricDQNTrainer(q_net, target_net, reward_net, parameters)
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    """Create a ``ParametricDQNTrainer`` from model and normalization parameters.

    Args:
        model: Model parameters; ``model.training.layers`` and
            ``model.training.activations`` define the network architecture.
        state_normalization_parameters: Passed to ``get_num_output_features``
            to obtain the state dimension.
        action_normalization_parameters: Passed to ``get_num_output_features``
            to obtain the action dimension.
        use_gpu: When true and ``torch.cuda.is_available()``, the three
            networks are moved with ``.cuda()``. The flag itself is forwarded
            to the trainer unchanged.
        use_all_avail_gpus: When true, the three networks are replaced by the
            result of ``get_distributed_data_parallel_model()``.

    Returns:
        The constructed ``ParametricDQNTrainer``.
    """

    def build_network():
        # Shared constructor arguments for the Q-network and reward network.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(state_normalization_parameters),
            action_dim=get_num_output_features(action_normalization_parameters),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
        )

    q_net = build_network()
    reward_net = build_network()
    # Here the target network is taken before any device move or wrapping,
    # so it is moved/wrapped explicitly below alongside the other networks.
    target_net = q_net.get_target_network()

    if use_gpu:
        if torch.cuda.is_available():
            q_net = q_net.cuda()
            target_net = target_net.cuda()
            reward_net = reward_net.cuda()

    if use_all_avail_gpus:
        q_net = q_net.get_distributed_data_parallel_model()
        target_net = target_net.get_distributed_data_parallel_model()
        reward_net = reward_net.get_distributed_data_parallel_model()

    return ParametricDQNTrainer(q_net, target_net, reward_net, model, use_gpu)
def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    """Build the networks and action-range tensors returned as trainer inputs.

    Args:
        env: Environment exposing ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high``.
        sac_model_params: Parameters providing ``q_network``,
            ``value_network`` and ``actor_network`` (each with ``layers`` and
            ``activations``) plus ``training.use_2_q_functions``.
        use_gpu: When true, every network built here is moved with ``.cuda()``.

    Returns:
        A ``(trainer_args, trainer_kwargs)`` pair: a list of positional
        arguments and a dict of keyword arguments.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    q_spec = sac_model_params.q_network
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, q_spec.layers, q_spec.activations
    )
    # The second Q-network is optional; it stays ``None`` unless requested.
    q2_network = (
        FullyConnectedParametricDQN(
            state_dim, action_dim, q_spec.layers, q_spec.activations
        )
        if sac_model_params.training.use_2_q_functions
        else None
    )

    value_spec = sac_model_params.value_network
    value_network = FullyConnectedNetwork(
        [state_dim] + value_spec.layers + [1],
        value_spec.activations + ["linear"],
    )

    actor_spec = sac_model_params.actor_network
    actor_network = GaussianFullyConnectedActor(
        state_dim, action_dim, actor_spec.layers, actor_spec.activations
    )

    if use_gpu:
        cuda_modules = [q1_network]
        if q2_network:
            cuda_modules.append(q2_network)
        cuda_modules.extend([value_network, actor_network])
        for module in cuda_modules:
            module.cuda()

    # Copied after the optional ``.cuda()`` call on the value network.
    value_network_target = deepcopy(value_network)

    # NOTE(review): the action-range tensors below are not moved to the GPU
    # in this function, even when ``use_gpu`` is true; confirm the consumer
    # handles device placement.
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": torch.full(
            (1, action_dim), -1 + 1e-6
        ),
        "max_action_range_tensor_training": torch.full(
            (1, action_dim), 1 - 1e-6
        ),
        "min_action_range_tensor_serving": torch.from_numpy(
            env.action_space.low.astype(np.float32)
        ).unsqueeze(dim=0),
        "max_action_range_tensor_serving": torch.from_numpy(
            env.action_space.high.astype(np.float32)
        ).unsqueeze(dim=0),
    }
    trainer_args = [
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        sac_model_params,
    ]
    return trainer_args, trainer_kwargs
def get_trainer(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    """Assemble a ``ParametricDQNTrainer`` with a fixed two-layer architecture.

    The hidden sizes ``[256, 128]`` with ``"relu"`` activations are hard-coded
    here rather than read from ``parameters``.

    Args:
        environment: Object exposing ``normalization`` and
            ``normalization_action``; both are passed to
            ``get_num_output_features`` to size the networks.
        parameters: Trainer parameters. When falsy, the result of
            ``self.get_sarsa_parameters()`` is used. Its ``asdict()`` output
            is expanded into keyword arguments for the trainer.
        use_gpu: When true, both networks are moved with ``.cuda()``.
        use_all_avail_gpus: When true, both networks are replaced by the
            result of ``get_distributed_data_parallel_model()``.

    Returns:
        The constructed ``ParametricDQNTrainer``.
    """
    layers = [256, 128]
    activations = ["relu", "relu"]

    if not parameters:
        parameters = self.get_sarsa_parameters()

    def build_network():
        # Same constructor arguments for the Q-network and reward network.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=layers,
            activations=activations,
        )

    q_net = build_network()
    reward_net = build_network()

    if use_gpu:
        q_net, reward_net = q_net.cuda(), reward_net.cuda()
    if use_all_avail_gpus:
        q_net = q_net.get_distributed_data_parallel_model()
        reward_net = reward_net.get_distributed_data_parallel_model()

    # Requested after the optional GPU move / data-parallel wrapping.
    target_net = q_net.get_target_network()

    trainer_kwargs = parameters.asdict()  # type: ignore
    return ParametricDQNTrainer(q_net, target_net, reward_net, **trainer_kwargs)
def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    """Create networks and action-range tensors and return them as trainer inputs.

    Args:
        env: Environment exposing ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high``.
        sac_model_params: Parameters providing ``q_network``,
            ``value_network`` and ``actor_network`` (each with ``layers`` and
            ``activations``) plus ``training.use_2_q_functions``.
        use_gpu: When true, every network built here is moved with ``.cuda()``.

    Returns:
        ``(trainer_args, trainer_kwargs)``: a list of positional arguments
        and a dict of keyword arguments.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    def make_q_network():
        # Both Q-networks are built from the same ``q_network`` settings.
        return FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )

    def serving_bound(bound):
        # float32 copy of the bound, as a tensor with a new leading axis.
        return torch.from_numpy(bound.astype(np.float32)).unsqueeze(dim=0)

    q1 = make_q_network()
    q2 = None
    if sac_model_params.training.use_2_q_functions:
        q2 = make_q_network()

    value_net = FullyConnectedNetwork(
        [state_dim] + sac_model_params.value_network.layers + [1],
        sac_model_params.value_network.activations + ["linear"],
    )
    actor_net = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        value_net.cuda()
        actor_net.cuda()

    # The target copy is taken after the optional ``.cuda()`` call.
    value_net_target = deepcopy(value_net)

    # NOTE(review): these range tensors are not moved to the GPU here even
    # when ``use_gpu`` is true; confirm the consumer handles placement.
    train_low = torch.full((1, action_dim), -1 + 1e-6)
    train_high = torch.full((1, action_dim), 1 - 1e-6)
    serve_low = serving_bound(env.action_space.low)
    serve_high = serving_bound(env.action_space.high)

    trainer_args = [q1, value_net, value_net_target, actor_net, sac_model_params]
    trainer_kwargs = {
        "q2_network": q2,
        "min_action_range_tensor_training": train_low,
        "max_action_range_tensor_training": train_high,
        "min_action_range_tensor_serving": serve_low,
        "max_action_range_tensor_serving": serve_high,
    }
    return trainer_args, trainer_kwargs
def get_sac_trainer(self, env, parameters, use_gpu):
    """Build a ``SACTrainer`` from ``parameters`` for ``env``.

    Args:
        env: Environment exposing ``normalization`` and
            ``normalization_continuous_action``.
        parameters: Provides ``q_network``, ``actor_network`` and
            ``value_network`` settings (``layers`` / ``activations``),
            ``constrain_action_sum``, and the ``training`` flags
            ``use_2_q_functions`` and ``use_value_network``.
        use_gpu: When true, every network that was built is moved with
            ``.cuda()``; the flag is also forwarded to the trainer.

    Returns:
        The constructed ``SACTrainer``.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)

    q_spec = parameters.q_network
    q1 = FullyConnectedParametricDQN(
        state_dim, action_dim, q_spec.layers, q_spec.activations
    )
    q2 = (
        FullyConnectedParametricDQN(
            state_dim, action_dim, q_spec.layers, q_spec.activations
        )
        if parameters.training.use_2_q_functions
        else None
    )

    # The actor class depends on whether the action sum is constrained.
    actor_cls = (
        DirichletFullyConnectedActor
        if parameters.constrain_action_sum
        else GaussianFullyConnectedActor
    )
    actor = actor_cls(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

    # The value network is optional.
    value_net = None
    if parameters.training.use_value_network:
        value_net = FullyConnectedNetwork(
            [state_dim] + parameters.value_network.layers + [1],
            parameters.value_network.activations + ["linear"],
        )

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        if value_net:
            value_net.cuda()
        actor.cuda()

    return SACTrainer(
        q1,
        actor,
        parameters,
        use_gpu=use_gpu,
        value_network=value_net,
        q2_network=q2,
    )
def get_td3_trainer(env, parameters, use_gpu):
    """Build a ``TD3Trainer`` together with its action-range tensors.

    Args:
        env: Environment exposing ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high``.
        parameters: Provides ``q_network`` and ``actor_network`` settings
            (``layers`` / ``activations``) and
            ``training.use_2_q_functions``.
        use_gpu: When true, the networks and all four action-range tensors
            are moved with ``.cuda()``; the flag is also forwarded to the
            trainer.

    Returns:
        The constructed ``TD3Trainer``.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    # The second Q-network is optional; it stays ``None`` unless requested.
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

    # Float fill values are required here: with an integer fill value,
    # ``torch.full`` infers an integer dtype on current PyTorch releases,
    # which would not match the float32 serving-range tensors below.
    min_action_range_tensor_training = torch.full((1, action_dim), -1.0)
    max_action_range_tensor_training = torch.full((1, action_dim), 1.0)
    min_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.low
    ).unsqueeze(dim=0)
    max_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.high
    ).unsqueeze(dim=0)

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()

        # Tensor.cuda() returns a new tensor, so the results are rebound.
        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    trainer_args = [q1_network, actor_network, parameters]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return TD3Trainer(*trainer_args, use_gpu=use_gpu, **trainer_kwargs)
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    """Create a ``ParametricDQNTrainer`` from model and normalization parameters.

    Trainer settings are collected into a ``ParametricDQNTrainerParameters``
    object whose ``asdict()`` output is expanded into keyword arguments.

    Args:
        model: Model parameters; reads ``model.training`` (layers,
            activations, minibatch size, optimizer settings), ``model.rl``
            and ``model.rainbow.double_q_learning``.
        state_normalization_parameters: Passed to ``get_num_output_features``
            to obtain the state dimension.
        action_normalization_parameters: Passed to ``get_num_output_features``
            to obtain the action dimension.
        use_gpu: When true, the three networks are moved with ``.cuda()``;
            the flag is also forwarded to the trainer.
        use_all_avail_gpus: When true, the three networks are replaced by the
            result of ``get_distributed_data_parallel_model()``.

    Returns:
        The constructed ``ParametricDQNTrainer``.
    """

    def build_network():
        # Shared constructor arguments for the Q-network and reward network.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(state_normalization_parameters),
            action_dim=get_num_output_features(action_normalization_parameters),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
        )

    q_net = build_network()
    reward_net = build_network()
    # Taken before any device move or wrapping; handled explicitly below.
    target_net = q_net.get_target_network()

    if use_gpu:
        q_net = q_net.cuda()
        target_net = target_net.cuda()
        reward_net = reward_net.cuda()

    if use_all_avail_gpus:
        q_net = q_net.get_distributed_data_parallel_model()
        target_net = target_net.get_distributed_data_parallel_model()
        reward_net = reward_net.get_distributed_data_parallel_model()

    training = model.training
    optimizer_parameters = OptimizerParameters(
        optimizer=training.optimizer,
        learning_rate=training.learning_rate,
        l2_decay=training.l2_decay,
    )
    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=training.minibatch_size,
        optimizer=optimizer_parameters,
    )

    trainer_kwargs = trainer_parameters.asdict()  # type: ignore
    return ParametricDQNTrainer(
        q_net, target_net, reward_net, use_gpu=use_gpu, **trainer_kwargs
    )
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    """Build a ``_ParametricDQNTrainer`` and a matching ``ParametricDQNExporter``.

    Args:
        environment: Object exposing ``normalization`` and
            ``normalization_action``; used to size the networks and to
            configure the preprocessors and feature extractor.
        parameters: Trainer parameters. When falsy, the result of
            ``self.get_sarsa_parameters()`` is used instead.
        use_gpu: When true, both networks are moved with ``.cuda()``.
        use_all_avail_gpus: When true, both networks are replaced by the
            result of ``get_data_parallel_model()``.

    Returns:
        A ``(trainer, exporter)`` tuple. The exporter receives the same
        Q-network object that is passed to the trainer.
    """
    if not parameters:
        parameters = self.get_sarsa_parameters()

    def build_network():
        # Same constructor arguments for the Q-network and reward network.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    q_net = build_network()
    reward_net = build_network()

    if use_gpu:
        q_net, reward_net = q_net.cuda(), reward_net.cuda()
    if use_all_avail_gpus:
        q_net = q_net.get_data_parallel_model()
        reward_net = reward_net.get_data_parallel_model()

    # Requested after the optional GPU move / data-parallel wrapping.
    target_net = q_net.get_target_network()
    trainer = _ParametricDQNTrainer(q_net, target_net, reward_net, parameters)

    state_preprocessor = Preprocessor(environment.normalization, False, True)
    action_preprocessor = Preprocessor(
        environment.normalization_action, False, True
    )
    exporter = ParametricDQNExporter(
        q_net,
        PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        ),
        ParametricActionOutputTransformer(),
        state_preprocessor,
        action_preprocessor,
    )
    return (trainer, exporter)
def get_sac_trainer(self, env, parameters, use_gpu):
    """Build a ``SACTrainer`` with a value network and a copied target network.

    Args:
        env: Environment exposing ``normalization`` and
            ``normalization_action``.
        parameters: Provides ``q_network``, ``value_network`` and
            ``actor_network`` settings (``layers`` / ``activations``) and
            ``training.use_2_q_functions``.
        use_gpu: When true, every network built here is moved with
            ``.cuda()``. The flag is not forwarded to the trainer.

    Returns:
        The constructed ``SACTrainer``.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    q_spec = parameters.q_network
    q1 = FullyConnectedParametricDQN(
        state_dim, action_dim, q_spec.layers, q_spec.activations
    )
    # The second Q-network is optional; it stays ``None`` unless requested.
    q2 = (
        FullyConnectedParametricDQN(
            state_dim, action_dim, q_spec.layers, q_spec.activations
        )
        if parameters.training.use_2_q_functions
        else None
    )

    value_net = FullyConnectedNetwork(
        [state_dim] + parameters.value_network.layers + [1],
        parameters.value_network.activations + ["linear"],
    )
    actor = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

    if use_gpu:
        cuda_modules = [q1]
        if q2:
            cuda_modules.append(q2)
        cuda_modules.extend([value_net, actor])
        for module in cuda_modules:
            module.cuda()

    # The target copy is taken after the optional ``.cuda()`` call.
    value_net_target = deepcopy(value_net)

    return SACTrainer(
        q1, value_net, value_net_target, actor, parameters, q2_network=q2
    )
def get_td3_trainer(self, env, parameters, use_gpu):
    """Build a ``TD3Trainer`` from ``parameters`` for ``env``.

    Args:
        env: Environment exposing ``normalization`` and
            ``normalization_action``.
        parameters: Provides ``q_network`` and ``actor_network`` settings
            (``layers`` / ``activations``) and
            ``training.use_2_q_functions``.
        use_gpu: When true, every network built here is moved with
            ``.cuda()``; the flag is also forwarded to the trainer.

    Returns:
        The constructed ``TD3Trainer``.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    def make_q_network():
        # Both Q-networks are built from the same ``q_network`` settings.
        return FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )

    q1 = make_q_network()
    q2 = make_q_network() if parameters.training.use_2_q_functions else None

    actor = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        actor.cuda()

    return TD3Trainer(q1, actor, parameters, q2_network=q2, use_gpu=use_gpu)
def get_modular_sarsa_trainer_exporter(
    self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
):
    """Build a ``_ParametricDQNTrainer`` and a matching ``ParametricDQNExporter``.

    Args:
        environment: Object exposing ``normalization`` and
            ``normalization_action``; used to size the networks and to
            configure the feature extractor.
        parameters: Trainer parameters. When falsy, the result of
            ``self.get_sarsa_parameters()`` is used instead.
        use_gpu: When true, both networks are moved with ``.cuda()``.
        use_all_avail_gpus: When true, both networks are replaced by the
            result of ``get_data_parallel_model()``.

    Returns:
        A ``(trainer, exporter)`` tuple. The exporter receives the same
        Q-network object that is passed to the trainer.
    """
    if not parameters:
        parameters = self.get_sarsa_parameters()

    def build_network():
        # Same constructor arguments for the Q-network and reward network.
        return FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )

    q_net = build_network()
    reward_net = build_network()

    if use_gpu:
        q_net, reward_net = q_net.cuda(), reward_net.cuda()
    if use_all_avail_gpus:
        q_net = q_net.get_data_parallel_model()
        reward_net = reward_net.get_data_parallel_model()

    # Requested after the optional GPU move / data-parallel wrapping.
    target_net = q_net.get_target_network()
    trainer = _ParametricDQNTrainer(q_net, target_net, reward_net, parameters)

    exporter = ParametricDQNExporter(
        q_net,
        PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        ),
        ParametricActionOutputTransformer(),
    )
    return (trainer, exporter)
def get_sac_trainer(
    self,
    env,
    use_gpu,
    use_2_q_functions=False,
    logged_action_uniform_prior=True,
    constrain_action_sum=False,
    use_value_network=True,
):
    """Build a ``SACTrainer`` using fixed ``[128, 64]`` ReLU network settings.

    Args:
        env: Environment exposing ``normalization`` and
            ``normalization_continuous_action``.
        use_gpu: When true, every network that was built is moved with
            ``.cuda()``; the flag is also forwarded to the trainer.
        use_2_q_functions: When true, a second Q-network is built.
        logged_action_uniform_prior: Forwarded to ``SACTrainerParameters``.
        constrain_action_sum: Selects ``DirichletFullyConnectedActor`` when
            true, ``GaussianFullyConnectedActor`` otherwise.
        use_value_network: When true, a value network is built.

    Returns:
        The constructed ``SACTrainer``.
    """

    def default_feed_forward():
        # All three networks use the same hard-coded layer configuration.
        return FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        )

    q_spec = default_feed_forward()
    value_spec = default_feed_forward()
    actor_spec = default_feed_forward()

    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)

    q1 = FullyConnectedParametricDQN(
        state_dim, action_dim, q_spec.layers, q_spec.activations
    )
    q2 = (
        FullyConnectedParametricDQN(
            state_dim, action_dim, q_spec.layers, q_spec.activations
        )
        if use_2_q_functions
        else None
    )

    actor_cls = (
        DirichletFullyConnectedActor
        if constrain_action_sum
        else GaussianFullyConnectedActor
    )
    actor = actor_cls(
        state_dim, action_dim, actor_spec.layers, actor_spec.activations
    )

    value_net = None
    if use_value_network:
        value_net = FullyConnectedNetwork(
            [state_dim] + value_spec.layers + [1],
            value_spec.activations + ["linear"],
        )

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        if value_net:
            value_net.cuda()
        actor.cuda()

    # Each optimizer slot receives its own default ``OptimizerParameters``.
    parameters = SACTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
        minibatch_size=self.minibatch_size,
        q_network_optimizer=OptimizerParameters(),
        value_network_optimizer=OptimizerParameters(),
        actor_network_optimizer=OptimizerParameters(),
        alpha_optimizer=OptimizerParameters(),
        logged_action_uniform_prior=logged_action_uniform_prior,
    )

    return SACTrainer(
        q1,
        actor,
        parameters,
        use_gpu=use_gpu,
        value_network=value_net,
        q2_network=q2,
    )
def _get_sac_trainer_params(env: OpenAIGymEnvironment,
                            sac_model_params: SACModelParameters,
                            use_gpu: bool):
    """Build the networks and action-range tensors returned as trainer inputs.

    Args:
        env: Environment exposing ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high``.
        sac_model_params: Provides ``q_network``, ``actor_network`` and
            (optionally) ``value_network`` settings, plus the ``training``
            flags ``use_2_q_functions`` and ``use_value_network``.
        use_gpu: When true, every network that was built and all four
            action-range tensors are moved with ``.cuda()``.

    Returns:
        ``(trainer_args, trainer_kwargs)``: a list of positional arguments
        and a dict of keyword arguments.

    Raises:
        AssertionError: If ``training.use_value_network`` is set but
            ``sac_model_params.value_network`` is ``None``.
    """
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    q_spec = sac_model_params.q_network
    q1 = FullyConnectedParametricDQN(
        state_dim, action_dim, q_spec.layers, q_spec.activations
    )
    q2 = (
        FullyConnectedParametricDQN(
            state_dim, action_dim, q_spec.layers, q_spec.activations
        )
        if sac_model_params.training.use_2_q_functions
        else None
    )

    # The value network is optional.
    value_net = None
    if sac_model_params.training.use_value_network:
        assert sac_model_params.value_network is not None
        value_net = FullyConnectedNetwork(
            [state_dim] + sac_model_params.value_network.layers + [1],
            sac_model_params.value_network.activations + ["linear"],
        )

    actor = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )

    # Keyed by the keyword names they are returned under; insertion order
    # matches the order of the keys in the returned dict.
    action_ranges = {
        "min_action_range_tensor_training": torch.full(
            (1, action_dim), -1 + 1e-6
        ),
        "max_action_range_tensor_training": torch.full(
            (1, action_dim), 1 - 1e-6
        ),
        "min_action_range_tensor_serving": (
            torch.from_numpy(env.action_space.low)
            .float()
            .unsqueeze(dim=0)  # type: ignore
        ),
        "max_action_range_tensor_serving": (
            torch.from_numpy(env.action_space.high)
            .float()
            .unsqueeze(dim=0)  # type: ignore
        ),
    }

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        if value_net:
            value_net.cuda()
        actor.cuda()
        # Tensor.cuda() returns a new tensor, so the mapping is rebuilt.
        action_ranges = {
            name: bound.cuda() for name, bound in action_ranges.items()
        }

    trainer_args = [q1, actor, sac_model_params]
    trainer_kwargs = {"value_network": value_net, "q2_network": q2}
    trainer_kwargs.update(action_ranges)
    return trainer_args, trainer_kwargs
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    """Build a ``SACTrainer`` from separately supplied network settings.

    Args:
        env: Environment exposing ``normalization``, ``normalization_action``
            and ``action_space.low`` / ``action_space.high``.
        rl_parameters: Must compare equal to ``trainer_parameters.rl``.
        trainer_parameters: Passed to the trainer as its parameters.
        critic_training: ``layers`` / ``activations`` for the Q-network.
        actor_training: ``layers`` / ``activations`` for the actor network.
        sac_value_training: ``layers`` / ``activations`` for the value
            network; when falsy, no value network is built.
        use_gpu: When true, every network that was built and all four
            action-range tensors are moved with ``.cuda()``; the flag is
            also forwarded to the trainer.

    Returns:
        The constructed ``SACTrainer``.

    Raises:
        AssertionError: If ``rl_parameters != trainer_parameters.rl``.
    """
    assert rl_parameters == trainer_parameters.rl

    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)

    q1 = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        critic_training.layers,
        critic_training.activations,
    )

    # TODO: build a second FullyConnectedParametricDQN from ``critic_training``
    # when ``trainer_parameters.use_2_q_functions`` is set. Until then the
    # second Q-network is always ``None``.
    q2 = None

    value_net = None
    if sac_value_training:
        value_net = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )

    actor = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        actor_training.layers,
        actor_training.activations,
    )

    # Keyed by the trainer keyword names they are passed under.
    action_ranges = {
        "min_action_range_tensor_training": torch.full(
            (1, action_dim), -1 + 1e-6
        ),
        "max_action_range_tensor_training": torch.full(
            (1, action_dim), 1 - 1e-6
        ),
        "min_action_range_tensor_serving": (
            torch.from_numpy(env.action_space.low)
            .float()
            .unsqueeze(dim=0)  # type: ignore
        ),
        "max_action_range_tensor_serving": (
            torch.from_numpy(env.action_space.high)
            .float()
            .unsqueeze(dim=0)  # type: ignore
        ),
    }

    if use_gpu:
        q1.cuda()
        if q2:
            q2.cuda()
        if value_net:
            value_net.cuda()
        actor.cuda()
        # Tensor.cuda() returns a new tensor, so the mapping is rebuilt.
        action_ranges = {
            name: bound.cuda() for name, bound in action_ranges.items()
        }

    return SACTrainer(
        q1,
        actor,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_net,
        q2_network=q2,
        **action_ranges
    )