Example #1
    def get_trainer(
        self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
    ):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
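        # Optionally move both networks to GPU and, if requested, wrap them for multi-GPU data-parallel training.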
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = ParametricDQNTrainer(
            q_network, q_network_target, reward_network, parameters
        )
        return trainer
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    ) -> None:
        self._additional_feature_types = additional_feature_types
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        # ensure state and action IDs have no intersection
        overlapping_features = set(
            state_normalization_parameters.keys()) & set(
                action_normalization_parameters.keys())
        assert len(overlapping_features) == 0, (
            "There are some overlapping state and action features: " +
            str(overlapping_features))

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)

        self._create_internal_policy_net()
    def get_trainer(self,
                    environment,
                    parameters=None,
                    use_gpu=False,
                    use_all_avail_gpus=False):
        layers = [256, 128]
        activations = ["relu", "relu"]
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=layers,
            activations=activations,
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()

        q_network_target = q_network.get_target_network()
        param_dict = parameters.asdict()  # type: ignore
        trainer = ParametricDQNTrainer(q_network, q_network_target,
                                       reward_network, **param_dict)
        return trainer
Example #4
def _get_sac_trainer_params(env, sac_model_params, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sac_model_params.q_network.layers,
        sac_model_params.q_network.activations,
    )
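    # Optional second Q-network (twin critic), built only when use_2_q_functions is set.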
    q2_network = None
    if sac_model_params.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            sac_model_params.q_network.layers,
            sac_model_params.q_network.activations,
        )
    value_network = FullyConnectedNetwork(
        [state_dim] + sac_model_params.value_network.layers + [1],
        sac_model_params.value_network.activations + ["linear"],
    )
    actor_network = GaussianFullyConnectedActor(
        state_dim,
        action_dim,
        sac_model_params.actor_network.layers,
        sac_model_params.actor_network.activations,
    )
    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        value_network.cuda()
        actor_network.cuda()
    value_network_target = deepcopy(value_network)
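    # Training-time action bounds span the open interval (-1, 1); serving-time bounds come from the env's real action space.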
    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    action_range_low = env.action_space.low.astype(np.float32)
    action_range_high = env.action_space.high.astype(np.float32)
    min_action_range_tensor_serving = torch.from_numpy(action_range_low).unsqueeze(
        dim=0
    )
    max_action_range_tensor_serving = torch.from_numpy(action_range_high).unsqueeze(
        dim=0
    )

    trainer_args = [
        q1_network,
        value_network,
        value_network_target,
        actor_network,
        sac_model_params,
    ]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return trainer_args, trainer_kwargs
    def __init__(self,
                 state_normalization_parameters: Dict[str, NormalizationParameters],
                 action_normalization_parameters: Dict[str, NormalizationParameters],
                 parameters: ContinuousActionModelParameters,
                 skip_normalization: Optional[bool] = False) -> None:
        self._action_features = list(action_normalization_parameters.keys())
        self.num_unprocessed_action_features = len(self._action_features)
        self.num_processed_action_features = get_num_output_features(
            action_normalization_parameters)

        self.num_processed_state_features = get_num_output_features(
            state_normalization_parameters)

        if parameters.training.layers[0] is None or\
           parameters.training.layers[0] == -1:
            parameters.training.layers[0] = self.num_state_features +\
                self.num_action_features

        assert parameters.training.layers[-1] == 1, "Set layers[-1] to 1"

        self._action_normalization_parameters = action_normalization_parameters
        RLTrainer.__init__(self, state_normalization_parameters, parameters,
                           skip_normalization)
        print(action_normalization_parameters)

        self._prepare_action_normalization()
Example #7
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

    return ParametricDQNTrainer(q_network, q_network_target, reward_network,
                                model, use_gpu)
Example #8
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        parameters = self.get_sarsa_parameters(environment, reward_shape,
                                               dueling, clip_grad_norm)
        q_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=len(environment.ACTIONS),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _DQNTrainer(q_network, q_network_target, reward_network,
                              parameters, use_gpu)
        return trainer
Example #9
    def get_sac_trainer(self, env, parameters, use_gpu):
        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(
            env.normalization_continuous_action)
        q1_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
        q2_network = None
        if parameters.training.use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                parameters.q_network.layers,
                parameters.q_network.activations,
            )
        if parameters.constrain_action_sum:
            actor_network = DirichletFullyConnectedActor(
                state_dim,
                action_dim,
                parameters.actor_network.layers,
                parameters.actor_network.activations,
            )
        else:
            actor_network = GaussianFullyConnectedActor(
                state_dim,
                action_dim,
                parameters.actor_network.layers,
                parameters.actor_network.activations,
            )

        value_network = None
        if parameters.training.use_value_network:
            value_network = FullyConnectedNetwork(
                [state_dim] + parameters.value_network.layers + [1],
                parameters.value_network.activations + ["linear"],
            )

        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            if value_network:
                value_network.cuda()
            actor_network.cuda()

        return SACTrainer(
            q1_network,
            actor_network,
            parameters,
            use_gpu=use_gpu,
            value_network=value_network,
            q2_network=q2_network,
        )
Example #10
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        use_gpu=False,
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    ) -> None:

        self.warm_start_model_path = parameters.training.warm_start_model_path
        self.minibatch_size = parameters.training.minibatch_size
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        self.num_state_features = get_num_output_features(
            state_normalization_parameters)
        self.num_action_features = get_num_output_features(
            action_normalization_parameters)
        self.num_features = self.num_state_features + self.num_action_features

        # ensure state and action IDs have no intersection
        overlapping_features = set(
            state_normalization_parameters.keys()) & set(
                action_normalization_parameters.keys())
        assert len(overlapping_features) == 0, (
            "There are some overlapping state and action features: " +
            str(overlapping_features))

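        # Without factorization, the network consumes concatenated state+action features and outputs a single Q-value;
        # with factorization, the state and action towers are sized to their own feature widths.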
        if parameters.training.factorization_parameters is None:
            parameters.training.layers[0] = self.num_features
            parameters.training.layers[-1] = 1
        else:
            parameters.training.factorization_parameters.state.layers[0] = (
                self.num_state_features
            )
            parameters.training.factorization_parameters.action.layers[0] = (
                self.num_action_features
            )

        RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types,
                           None)

        self.q_network = self._get_model(parameters.training)

        self.q_network_target = deepcopy(self.q_network)
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(), lr=parameters.training.learning_rate)

        self.reward_network = self._get_model(parameters.training)
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=parameters.training.learning_rate)

        if self.use_gpu:
            self.q_network.cuda()
            self.q_network_target.cuda()
            self.reward_network.cuda()
Example #11
def get_td3_trainer(env, parameters, use_gpu):
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        parameters.q_network.layers,
        parameters.q_network.activations,
    )
    q2_network = None
    if parameters.training.use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
    actor_network = FullyConnectedActor(
        state_dim,
        action_dim,
        parameters.actor_network.layers,
        parameters.actor_network.activations,
    )

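    # The actor's tanh output lies in [-1, 1] during training; serving ranges are taken from the env's action space.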
    min_action_range_tensor_training = torch.full((1, action_dim), -1)
    max_action_range_tensor_training = torch.full((1, action_dim), 1)
    min_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.low).unsqueeze(dim=0)
    max_action_range_tensor_serving = torch.FloatTensor(
        env.action_space.high).unsqueeze(dim=0)

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        actor_network.cuda()

        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    trainer_args = [q1_network, actor_network, parameters]
    trainer_kwargs = {
        "q2_network": q2_network,
        "min_action_range_tensor_training": min_action_range_tensor_training,
        "max_action_range_tensor_training": max_action_range_tensor_training,
        "min_action_range_tensor_serving": min_action_range_tensor_serving,
        "max_action_range_tensor_serving": max_action_range_tensor_serving,
    }
    return TD3Trainer(*trainer_args, use_gpu=use_gpu, **trainer_kwargs)
Example #12
    def __init__(
        self,
        state_preprocessor: Preprocessor,
        action_preprocessor: Preprocessor,
        seq_len: int,
    ):
        super().__init__(state_preprocessor, action_preprocessor)
        self.state_dim = get_num_output_features(
            state_preprocessor.normalization_parameters
        )
        self.action_dim = get_num_output_features(
            action_preprocessor.normalization_parameters
        )
        self.seq_len = seq_len
Example #13
def create_parametric_dqn_trainer_from_params(
    model: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
):
    q_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    reward_network = FullyConnectedParametricDQN(
        state_dim=get_num_output_features(state_normalization_parameters),
        action_dim=get_num_output_features(action_normalization_parameters),
        sizes=model.training.layers[1:-1],
        activations=model.training.activations[:-1],
    )
    q_network_target = q_network.get_target_network()

    if use_gpu:
        q_network = q_network.cuda()
        q_network_target = q_network_target.cuda()
        reward_network = reward_network.cuda()

    if use_all_avail_gpus:
        q_network = q_network.get_distributed_data_parallel_model()
        q_network_target = q_network_target.get_distributed_data_parallel_model()
        reward_network = reward_network.get_distributed_data_parallel_model()

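    # Collect the RL, double-Q, minibatch, and optimizer settings into the trainer's parameter struct.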
    trainer_parameters = ParametricDQNTrainerParameters(  # type: ignore
        rl=model.rl,
        double_q_learning=model.rainbow.double_q_learning,
        minibatch_size=model.training.minibatch_size,
        optimizer=OptimizerParameters(
            optimizer=model.training.optimizer,
            learning_rate=model.training.learning_rate,
            l2_decay=model.training.l2_decay,
        ),
    )

    return ParametricDQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        use_gpu=use_gpu,
        **trainer_parameters.asdict()  # type: ignore
    )
Example #14
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    ) -> None:
        self._additional_feature_types = additional_feature_types
        self._actions = parameters.actions if parameters.actions is not None else []
        self.reward_shape = {}  # type: Dict[int, float]
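        # Translate per-action reward boosts (keyed by action name) into an index-keyed map.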
        if parameters.rl.reward_boost is not None and self._actions is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_shape[i] = parameters.rl.reward_boost[k]
        if parameters.training.cnn_parameters is None:
            self.state_normalization_parameters: Optional[Dict[
                int, NormalizationParameters]] = normalization_parameters
            num_features = get_num_output_features(normalization_parameters)
            parameters.training.layers[0] = num_features
        else:
            self.state_normalization_parameters = None
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)

        self._create_all_q_score_net()
        self._create_internal_policy_net()
Example #15
    def __init__(
        self,
        state_normalization_parameters: Dict[str, NormalizationParameters],
        parameters: DiscreteActionModelParameters,
        skip_normalization: Optional[bool] = False
    ) -> None:
        self._actions = parameters.actions

        self.num_processed_state_features = get_num_output_features(
            state_normalization_parameters
        )

        if parameters.training.layers[0] in [None, -1, 1]:
            parameters.training.layers[0] = self.num_state_features

        # There is a logical 1-dimensional output for each state/action pair,
        # but the underlying network computes num_actions-dimensional outputs
        if parameters.training.layers[-1] in [None, -1, 1]:
            parameters.training.layers[-1] = self.num_actions

        assert parameters.training.layers[-1] == self.num_actions,\
            "Set layers[-1] to a the number of actions or a default placeholder value"

        RLTrainer.__init__(
            self, state_normalization_parameters, parameters, skip_normalization
        )
Example #16
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)
Example #17
    def build_actor(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: NormalizationData,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters
        )
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters
        )
        return DirichletFullyConnectedActor(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.config.sizes,
            activations=self.config.activations,
            use_batch_norm=self.config.use_batch_norm,
        )
Example #18
    def get_modular_sarsa_trainer_exporter(self,
                                           environment,
                                           parameters=None,
                                           use_gpu=False,
                                           use_all_avail_gpus=False):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(
                environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _ParametricDQNTrainer(q_network, q_network_target,
                                        reward_network, parameters)
        state_preprocessor = Preprocessor(environment.normalization, False,
                                          True)
        action_preprocessor = Preprocessor(environment.normalization_action,
                                           False, True)
        feature_extractor = PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        )
        output_transformer = ParametricActionOutputTransformer()
        exporter = ParametricDQNExporter(
            q_network,
            feature_extractor,
            output_transformer,
            state_preprocessor,
            action_preprocessor,
        )
        return (trainer, exporter)
Example #19
    def build_q_network(
        self,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        output_dim: int = 1,
    ) -> ModelBase:
        state_dim = get_num_output_features(state_normalization_parameters)
        action_dim = get_num_output_features(action_normalization_parameters)
        return FullyConnectedParametricDQN(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.config.sizes,
            activations=self.config.activations,
            use_batch_norm=self.config.use_batch_norm,
            use_layer_norm=self.config.use_layer_norm,
            output_dim=output_dim,
        )
Example #20
def normalize_dense_matrix(
    inputs: np.ndarray,
    features: List[str],
    normalization_params: Dict[str, NormalizationParameters],
    norm_blob_map: Dict[int, str],
    norm_net: core.Net,
    blobname_template: str,
    num_output_features: Optional[int] = None,
) -> np.ndarray:
    """
    Normalizes inputs according to parameters. Expects a dense matrix whose ith
    column corresponds to feature i.

    Note that the Caffe2 BatchBoxCox operator isn't implemented on CUDA GPU so
    we need to use a CPU context.

    :param inputs: Numpy array with inputs to normalize. Should be of
        shape (any, num_features).
    :param features: Array of feature names.
    :param normalization_params: Mapping from feature names to
        NormalizationParameters.
    :param norm_blob_map: Dictionary that stores a mapping from feature index
        to input normalization blob name.
    :param norm_net: Caffe2 net for normalization.
    :param blobname_template: String template for input blobs to norm_net.
    :param num_output_features: The number of features in an output processed
        datapoint. If set to None, this function will compute it.
    """
    num_input_features = len(features)

    num_output_features = \
        num_output_features or get_num_output_features(normalization_params)

    assert inputs.shape[1] == num_input_features
    outputs = np.zeros((inputs.shape[0], num_output_features), dtype=np.float32)

    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        for idx in range(num_input_features):
            input_blob = blobname_template.format(idx)
            workspace.FeedBlob(input_blob, inputs[:, idx])
        workspace.RunNet(norm_net)

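        # Stitch the per-feature normalized blobs back into one dense output matrix; ENUM features expand to one column per possible value.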
        output_col = 0
        for idx, feature in enumerate(features):
            normalized_input_blob = norm_blob_map[idx]
            normalized_inputs = workspace.FetchBlob(normalized_input_blob)
            normalization_param = normalization_params[feature]
            if normalization_param.feature_type == identify_types.ENUM:
                next_output_col = output_col + len(
                    normalization_param.possible_values
                )
                outputs[:, output_col:next_output_col] = normalized_inputs
            else:
                next_output_col = output_col + 1
                outputs[:, output_col] = normalized_inputs
            output_col = next_output_col
    return outputs
Example #21
    def build_value_network(
        self, state_normalization_data: NormalizationData
    ) -> torch.nn.Module:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters
        )
        return FullyConnectedNetwork(
            [state_dim] + self.config.sizes + [1],
            self.config.activations + ["linear"],
            use_layer_norm=self.config.use_layer_norm,
        )
Example #22
    def get_sac_trainer(self, env, parameters, use_gpu):
        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(env.normalization_action)
        q1_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
        q2_network = None
        if parameters.training.use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                parameters.q_network.layers,
                parameters.q_network.activations,
            )
        value_network = FullyConnectedNetwork(
            [state_dim] + parameters.value_network.layers + [1],
            parameters.value_network.activations + ["linear"],
        )
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )
        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            value_network.cuda()
            actor_network.cuda()

        value_network_target = deepcopy(value_network)
        return SACTrainer(
            q1_network,
            value_network,
            value_network_target,
            actor_network,
            parameters,
            q2_network=q2_network,
        )
Example #23
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        use_gpu=False,
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
        gradient_handler=None,
    ) -> None:

        self.double_q_learning = parameters.rainbow.double_q_learning
        self.warm_start_model_path = parameters.training.warm_start_model_path
        self.minibatch_size = parameters.training.minibatch_size
        self._actions = parameters.actions if parameters.actions is not None else []

        self.reward_shape = {}  # type: Dict[int, float]
        if parameters.rl.reward_boost is not None and self._actions is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_shape[i] = parameters.rl.reward_boost[k]

        if parameters.training.cnn_parameters is None:
            self.state_normalization_parameters: Optional[Dict[
                int, NormalizationParameters]] = state_normalization_parameters
            self.num_features = get_num_output_features(
                state_normalization_parameters)
            parameters.training.layers[0] = self.num_features
        else:
            self.state_normalization_parameters = None
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types,
                           gradient_handler)

        if parameters.rainbow.dueling_architecture:
            self.q_network = DuelingArchitectureQNetwork(
                parameters.training.layers, parameters.training.activations)
        else:
            self.q_network = GenericFeedForwardNetwork(
                parameters.training.layers, parameters.training.activations)
        self.q_network_target = deepcopy(self.q_network)
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(), lr=parameters.training.learning_rate)

        self.reward_network = GenericFeedForwardNetwork(
            parameters.training.layers, parameters.training.activations)
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=parameters.training.learning_rate)

        if self.use_gpu:
            self.q_network.cuda()
            self.q_network_target.cuda()
            self.reward_network.cuda()
Example #24
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self._actions = parameters.actions if parameters.actions is not None else []

        self.state_normalization_parameters = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)
Example #25
    def get_td3_trainer(self, env, parameters, use_gpu):
        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(env.normalization_action)
        q1_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            parameters.q_network.layers,
            parameters.q_network.activations,
        )
        q2_network = None
        if parameters.training.use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                parameters.q_network.layers,
                parameters.q_network.activations,
            )
        actor_network = FullyConnectedActor(
            state_dim,
            action_dim,
            parameters.actor_network.layers,
            parameters.actor_network.activations,
        )

        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            actor_network.cuda()

        return TD3Trainer(
            q1_network,
            actor_network,
            parameters,
            q2_network=q2_network,
            use_gpu=use_gpu,
        )
Example #26
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        # ensure state and action IDs have no intersection
        overlapping_features = (set(state_normalization_parameters.keys())
                                & set(action_normalization_parameters.keys()))
        assert (
            len(overlapping_features) == 0
        ), "There are some overlapping state and action features: " + str(
            overlapping_features)

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)
Example #27
    def get_modular_sarsa_trainer_exporter(
        self, environment, parameters=None, use_gpu=False, use_all_avail_gpus=False
    ):
        parameters = parameters or self.get_sarsa_parameters()
        q_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        reward_network = FullyConnectedParametricDQN(
            state_dim=get_num_output_features(environment.normalization),
            action_dim=get_num_output_features(environment.normalization_action),
            sizes=parameters.training.layers[1:-1],
            activations=parameters.training.activations[:-1],
        )
        if use_gpu:
            q_network = q_network.cuda()
            reward_network = reward_network.cuda()
            if use_all_avail_gpus:
                q_network = q_network.get_data_parallel_model()
                reward_network = reward_network.get_data_parallel_model()

        q_network_target = q_network.get_target_network()
        trainer = _ParametricDQNTrainer(
            q_network, q_network_target, reward_network, parameters
        )
        feature_extractor = PredictorFeatureExtractor(
            state_normalization_parameters=environment.normalization,
            action_normalization_parameters=environment.normalization_action,
        )
        output_transformer = ParametricActionOutputTransformer()
        exporter = ParametricDQNExporter(
            q_network, feature_extractor, output_transformer
        )
        return (trainer, exporter)
Example #28
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self._actions = parameters.actions if parameters.actions is not None else []
        self.reward_shape = {}  # type: Dict[int, float]
        if parameters.rl.reward_boost is not None and self._actions is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_shape[i] = parameters.rl.reward_boost[k]
        self.state_normalization_parameters = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)

        self._create_all_q_score_net()
Example #29
    def __init__(
        self,
        normalization_parameters: Dict[int, NormalizationParameters],
        use_gpu: bool,
    ) -> None:
        super().__init__()
        self.num_output_features = get_num_output_features(
            normalization_parameters)

        feature_types = {
            norm_param.feature_type
            for norm_param in normalization_parameters.values()
        }
        assert (
            len(feature_types) == 1
        ), "All dimensions of actions should have the same preprocessing"
        self.feature_type = list(feature_types)[0]
        assert self.feature_type in {
            CONTINUOUS_ACTION,
            DO_NOT_PREPROCESS,
        }, "Only support CONTINUOUS_ACTION & DO_NOT_PREPROCESS"

        self.device = torch.device("cuda" if use_gpu else "cpu")  # type: ignore

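        # For CONTINUOUS_ACTION features, precompute the per-dimension offset and scale relating the serving action range to the (-1, 1) training range.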
        if self.feature_type == CONTINUOUS_ACTION:
            sorted_features = sorted(normalization_parameters.keys())
            self.min_serving_value = torch.tensor(
                [
                    normalization_parameters[f].min_value
                    for f in sorted_features
                ],
                device=self.device,
            )
            self.scaling_factor = torch.tensor(
                [
                    (
                        normalization_parameters[f].max_value  # type: ignore
                        - normalization_parameters[f].min_value  # type: ignore
                    ) / (2 * (1 - EPS)) for f in sorted_features
                ],
                device=self.device,
            )
Example #30
def create_dqn_trainer_from_params(
    model: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
    metrics_to_score=None,
):
    metrics_to_score = metrics_to_score or []

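    # Choose the Q-network architecture from the Rainbow options: quantile, categorical (C51), dueling, or plain fully connected.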
    if model.rainbow.quantile:
        q_network = QuantileDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
    elif model.rainbow.categorical:
        q_network = CategoricalDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            num_atoms=model.rainbow.num_atoms,
            qmin=model.rainbow.qmin,
            qmax=model.rainbow.qmax,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
            use_gpu=use_gpu,
        )
    elif model.rainbow.dueling_architecture:
        q_network = DuelingQNetwork(  # type: ignore
            layers=[get_num_output_features(normalization_parameters)] +
            model.training.layers[1:-1] + [len(model.actions)],
            activations=model.training.activations,
        )
    else:
        q_network = FullyConnectedDQN(  # type: ignore
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=len(model.actions),
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

    if use_gpu and torch.cuda.is_available():
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

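    # Counterfactual policy evaluation (CPE) uses extra networks with one output per metric (plus reward) per action.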
    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if model.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(metrics_to_score) + 1) * len(model.actions)
        reward_network = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )
        q_network_cpe = FullyConnectedDQN(
            state_dim=get_num_output_features(normalization_parameters),
            action_dim=num_output_nodes,
            sizes=model.training.layers[1:-1],
            activations=model.training.activations[:-1],
            dropout_ratio=model.training.dropout_ratio,
        )

        if use_gpu and torch.cuda.is_available():
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    if (use_all_avail_gpus and not model.rainbow.categorical
            and not model.rainbow.quantile):
        q_network = q_network.get_distributed_data_parallel_model()
        reward_network = (reward_network.get_distributed_data_parallel_model()
                          if reward_network else None)
        q_network_cpe = (q_network_cpe.get_distributed_data_parallel_model()
                         if q_network_cpe else None)

    if model.rainbow.quantile:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return QRDQNTrainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    elif model.rainbow.categorical:
        assert (not use_all_avail_gpus
                ), "use_all_avail_gpus not implemented for distributional RL"
        return C51Trainer(
            q_network,
            q_network_target,
            model,
            use_gpu,
            metrics_to_score=metrics_to_score,
        )

    else:
        return DQNTrainer(
            q_network,
            q_network_target,
            reward_network,
            model,
            use_gpu,
            q_network_cpe=q_network_cpe,
            q_network_cpe_target=q_network_cpe_target,
            metrics_to_score=metrics_to_score,
        )
Example #31
    def __init__(
        self,
        parameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        min_action_range_tensor_serving: torch.Tensor,
        max_action_range_tensor_serving: torch.Tensor,
        use_gpu: bool = False,
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
        use_all_avail_gpus: bool = False,
    ) -> None:

        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters

        for param in self.action_normalization_parameters.values():
            assert param.feature_type == CONTINUOUS, (
                "DDPG Actor features must be set to continuous (set to "
                + param.feature_type
                + ")"
            )

        self.state_dim = get_num_output_features(state_normalization_parameters)
        self.action_dim = min_action_range_tensor_serving.shape[1]
        self.num_features = self.state_dim + self.action_dim

        # Actor generates actions between -1 and 1 due to tanh output layer so
        # convert actions to range [-1, 1] before training.
        self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
        self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
        self.min_action_range_tensor_serving = min_action_range_tensor_serving
        self.max_action_range_tensor_serving = max_action_range_tensor_serving

        # Shared params
        self.warm_start_model_path = parameters.shared_training.warm_start_model_path
        self.minibatch_size = parameters.shared_training.minibatch_size
        self.final_layer_init = parameters.shared_training.final_layer_init
        self._set_optimizer(parameters.shared_training.optimizer)

        # Actor params
        self.actor_params = parameters.actor_training
        assert (
            self.actor_params.activations[-1] == "tanh"
        ), "Actor final layer activation must be tanh"
        self.actor_params.layers[0] = self.state_dim
        self.actor_params.layers[-1] = self.action_dim
        self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
        self.actor = ActorNet(
            self.actor_params.layers,
            self.actor_params.activations,
            self.final_layer_init,
        )
        self.actor_target = deepcopy(self.actor)
        self.actor_optimizer = self.optimizer_func(
            self.actor.parameters(),
            lr=self.actor_params.learning_rate,
            weight_decay=self.actor_params.l2_decay,
        )
        self.noise = self.noise_generator

        # Critic params
        self.critic_params = parameters.critic_training
        self.critic_params.layers[0] = self.state_dim
        self.critic_params.layers[-1] = 1
        self.critic = self.q_network = CriticNet(
            self.critic_params.layers,
            self.critic_params.activations,
            self.final_layer_init,
            self.action_dim,
        )
        self.critic_target = deepcopy(self.critic)
        self.critic_optimizer = self.optimizer_func(
            self.critic.parameters(),
            lr=self.critic_params.learning_rate,
            weight_decay=self.critic_params.l2_decay,
        )

        # ensure state and action IDs have no intersection
        overlapping_features = set(state_normalization_parameters.keys()) & set(
            action_normalization_parameters.keys()
        )
        assert len(overlapping_features) == 0, (
            "There are some overlapping state and action features: "
            + str(overlapping_features)
        )

        RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

        self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
            self.dtype
        )
        self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
            self.dtype
        )
        self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
            self.dtype
        )
        self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
            self.dtype
        )

        if self.use_gpu:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

            if use_all_avail_gpus:
                self.actor = nn.DataParallel(self.actor)
                self.actor_target = nn.DataParallel(self.actor_target)
                self.critic = nn.DataParallel(self.critic)
                self.critic_target = nn.DataParallel(self.critic_target)
Example #32
    def get_modular_sarsa_trainer_reward_boost(
        self,
        environment,
        reward_shape,
        dueling,
        categorical,
        quantile,
        use_gpu=False,
        use_all_avail_gpus=False,
        clip_grad_norm=None,
    ):
        assert not quantile or not categorical
        parameters = self.get_sarsa_parameters(environment, reward_shape,
                                               dueling, categorical, quantile,
                                               clip_grad_norm)

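        # Pick the Q-network variant matching the requested distributional / dueling configuration.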
        if quantile:
            if dueling:
                q_network = DuelingQuantileDQN(
                    layers=[
                        get_num_output_features(environment.normalization)
                    ] + parameters.training.layers[1:-1] +
                    [len(environment.ACTIONS)],
                    activations=parameters.training.activations,
                    num_atoms=parameters.rainbow.num_atoms,
                )
            else:
                q_network = QuantileDQN(
                    state_dim=get_num_output_features(
                        environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    num_atoms=parameters.rainbow.num_atoms,
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )
        elif categorical:
            assert not dueling
            q_network = CategoricalDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                num_atoms=parameters.rainbow.num_atoms,
                qmin=-100,
                qmax=200,
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
        else:
            if dueling:
                q_network = DuelingQNetwork(
                    layers=[
                        get_num_output_features(environment.normalization)
                    ] + parameters.training.layers[1:-1] +
                    [len(environment.ACTIONS)],
                    activations=parameters.training.activations,
                )
            else:
                q_network = FullyConnectedDQN(
                    state_dim=get_num_output_features(
                        environment.normalization),
                    action_dim=len(environment.ACTIONS),
                    sizes=parameters.training.layers[1:-1],
                    activations=parameters.training.activations[:-1],
                )

        q_network_cpe, q_network_cpe_target, reward_network = None, None, None

        if parameters.evaluation and parameters.evaluation.calc_cpe_in_training:
            q_network_cpe = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )
            q_network_cpe_target = q_network_cpe.get_target_network()
            reward_network = FullyConnectedDQN(
                state_dim=get_num_output_features(environment.normalization),
                action_dim=len(environment.ACTIONS),
                sizes=parameters.training.layers[1:-1],
                activations=parameters.training.activations[:-1],
            )

        if use_gpu:
            q_network = q_network.cuda()
            if parameters.evaluation.calc_cpe_in_training:
                reward_network = reward_network.cuda()
                q_network_cpe = q_network_cpe.cuda()
                q_network_cpe_target = q_network_cpe_target.cuda()
            if use_all_avail_gpus and not categorical:
                q_network = q_network.get_distributed_data_parallel_model()
                reward_network = reward_network.get_distributed_data_parallel_model()
                q_network_cpe = q_network_cpe.get_distributed_data_parallel_model()
                q_network_cpe_target = (
                    q_network_cpe_target.get_distributed_data_parallel_model()
                )

        if quantile:
            trainer = QRDQNTrainer(
                q_network,
                q_network.get_target_network(),
                parameters,
                use_gpu,
                reward_network=reward_network,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        elif categorical:
            trainer = C51Trainer(
                q_network, q_network.get_target_network(), parameters, use_gpu
            )
        else:
            parameters = DQNTrainerParameters.from_discrete_action_model_parameters(
                parameters
            )
            trainer = DQNTrainer(
                q_network,
                q_network.get_target_network(),
                reward_network,
                parameters,
                use_gpu,
                q_network_cpe=q_network_cpe,
                q_network_cpe_target=q_network_cpe_target,
            )
        return trainer
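A side note on the categorical branch above: `qmin`, `qmax`, and `num_atoms` parameterize a C51-style distributional value head. The sketch below is illustrative only (it is not code from this repository and assumes a typical atom count of 51); it shows how such a head commonly spreads its atoms over [qmin, qmax] and recovers a scalar Q-value as an expectation.

import torch

# Illustrative values mirroring the CategoricalDQN arguments above; num_atoms
# would normally come from parameters.rainbow.num_atoms.
qmin, qmax, num_atoms = -100.0, 200.0, 51

support = torch.linspace(qmin, qmax, num_atoms)   # atom locations z_i
delta_z = (qmax - qmin) / (num_atoms - 1)         # spacing between atoms

# Given per-atom probabilities from the network head, the scalar Q-value is
# the expectation of the support under that distribution.
probs = torch.softmax(torch.zeros(num_atoms), dim=0)   # dummy uniform distribution
q_value = (probs * support).sum()                      # == (qmin + qmax) / 2 here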
Example no. 33
    def __init__(
        self,
        parameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        min_action_range_tensor_serving: torch.Tensor,
        max_action_range_tensor_serving: torch.Tensor,
        use_gpu: bool = False,
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
        use_all_avail_gpus: bool = False,
    ) -> None:

        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters

        self.state_dim = get_num_output_features(state_normalization_parameters)
        self.action_dim = min_action_range_tensor_serving.shape[1]

        # The actor's tanh output layer produces actions in [-1, 1], so actions
        # are rescaled into that range for training; the serving tensors record
        # the environment's native action range.
        self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
        self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
        self.min_action_range_tensor_serving = min_action_range_tensor_serving
        self.max_action_range_tensor_serving = max_action_range_tensor_serving

        # Shared params
        self.warm_start_model_path = parameters.shared_training.warm_start_model_path
        self.minibatch_size = parameters.shared_training.minibatch_size
        self.final_layer_init = parameters.shared_training.final_layer_init
        self._set_optimizer(parameters.shared_training.optimizer)

        # Actor params
        self.actor_params = parameters.actor_training
        assert (
            self.actor_params.activations[-1] == "tanh"
        ), "Actor final layer activation must be tanh"
        self.actor_params.layers[0] = self.state_dim
        self.actor_params.layers[-1] = self.action_dim
        self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
        self.actor = ActorNet(
            self.actor_params.layers,
            self.actor_params.activations,
            self.final_layer_init,
        )
        self.actor_target = deepcopy(self.actor)
        self.actor_optimizer = self.optimizer_func(
            self.actor.parameters(),
            lr=self.actor_params.learning_rate,
            weight_decay=self.actor_params.l2_decay,
        )
        self.noise = self.noise_generator

        # Critic params
        self.critic_params = parameters.critic_training
        self.critic_params.layers[0] = self.state_dim
        self.critic_params.layers[-1] = 1
        self.critic = CriticNet(
            self.critic_params.layers,
            self.critic_params.activations,
            self.final_layer_init,
            self.action_dim,
        )
        self.critic_target = deepcopy(self.critic)
        self.critic_optimizer = self.optimizer_func(
            self.critic.parameters(),
            lr=self.critic_params.learning_rate,
            weight_decay=self.critic_params.l2_decay,
        )

        RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

        self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
            self.dtype
        )
        self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
            self.dtype
        )
        self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
            self.dtype
        )
        self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
            self.dtype
        )

        if self.use_gpu:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

            if use_all_avail_gpus:
                self.actor = nn.DataParallel(self.actor)
                self.actor_target = nn.DataParallel(self.actor_target)
                self.critic = nn.DataParallel(self.critic)
                self.critic_target = nn.DataParallel(self.critic_target)
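The four action-range tensors above exist so that actions can be mapped between the environment's native (serving) range and the actor's tanh range of [-1, 1]. Below is a minimal sketch of that linear rescaling, using a hypothetical rescale_actions helper (the name and signature are illustrative, not taken from this trainer).

import torch

def rescale_actions(actions, old_min, old_max, new_min, new_max):
    # Linearly map `actions` from [old_min, old_max] onto [new_min, new_max].
    old_range = old_max - old_min
    new_range = new_max - new_min
    return ((actions - old_min) / old_range) * new_range + new_min

# e.g. serving actions in [0, 10] mapped into the actor's training range [-1, 1]
serving_actions = torch.tensor([[0.0, 5.0, 10.0]])
training_actions = rescale_actions(
    serving_actions,
    old_min=torch.zeros(1, 3),
    old_max=torch.full((1, 3), 10.0),
    new_min=torch.full((1, 3), -1.0),
    new_max=torch.ones(1, 3),
)
# training_actions == tensor([[-1., 0., 1.]])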
Example no. 34
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        use_gpu: bool = False,
        additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
        metrics_to_score=None,
        gradient_handler=None,
        use_all_avail_gpus: bool = False,
    ) -> None:

        self.double_q_learning = parameters.rainbow.double_q_learning
        self.warm_start_model_path = parameters.training.warm_start_model_path
        self.minibatch_size = parameters.training.minibatch_size
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        self.num_state_features = get_num_output_features(
            state_normalization_parameters
        )
        self.num_action_features = get_num_output_features(
            action_normalization_parameters
        )
        self.num_features = self.num_state_features + self.num_action_features

        # ensure state and action IDs have no intersection
        overlapping_features = set(state_normalization_parameters.keys()) & set(
            action_normalization_parameters.keys()
        )
        assert len(overlapping_features) == 0, (
            "There are some overlapping state and action features: "
            + str(overlapping_features)
        )

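        # The reward network copies the Q-network's hidden-layer sizes but maps
        # the concatenated (state, action) features to a single scalar reward.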
        reward_network_layers = deepcopy(parameters.training.layers)
        reward_network_layers[0] = self.num_features
        reward_network_layers[-1] = 1

        if parameters.rainbow.dueling_architecture:
            parameters.training.layers[0] = self.num_state_features
            parameters.training.layers[-1] = 1
        elif parameters.training.factorization_parameters is None:
            parameters.training.layers[0] = self.num_features
            parameters.training.layers[-1] = 1
        else:
            parameters.training.factorization_parameters.state.layers[
                0
            ] = self.num_state_features
            parameters.training.factorization_parameters.action.layers[
                0
            ] = self.num_action_features

        RLTrainer.__init__(
            self,
            parameters,
            use_gpu,
            additional_feature_types,
            metrics_to_score,
            gradient_handler,
        )

        self.q_network = self._get_model(
            parameters.training, parameters.rainbow.dueling_architecture
        )

        self.q_network_target = deepcopy(self.q_network)
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        self.reward_network = FullyConnectedNetwork(
            reward_network_layers, parameters.training.activations
        )
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(), lr=parameters.training.learning_rate
        )

        if self.use_gpu:
            self.q_network.cuda()
            self.q_network_target.cuda()
            self.reward_network.cuda()

            if use_all_avail_gpus:
                self.q_network = torch.nn.DataParallel(self.q_network)
                self.q_network_target = torch.nn.DataParallel(self.q_network_target)
                self.reward_network = torch.nn.DataParallel(self.reward_network)
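To make the in-place layer rewiring above concrete, here is a small worked example with made-up feature counts (10 state features, 4 action features) and hidden sizes of 256 and 128; the real values come from the normalization parameters and the model config.

from copy import deepcopy

num_state_features, num_action_features = 10, 4
num_features = num_state_features + num_action_features   # 14

training_layers = [-1, 256, 128, -1]   # placeholders for input/output sizes

# Reward network: concatenated (state, action) features in, scalar reward out.
reward_network_layers = deepcopy(training_layers)
reward_network_layers[0] = num_features
reward_network_layers[-1] = 1          # -> [14, 256, 128, 1]

# A non-dueling, non-factorized Q-network gets the same rewiring.
training_layers[0] = num_features
training_layers[-1] = 1                # -> [14, 256, 128, 1]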