Example No. 1
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
        additional_feature_types:
        AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    ) -> None:
        self._additional_feature_types = additional_feature_types
        self._actions = parameters.actions if parameters.actions is not None else []
        self.reward_shape = {}  # type: Dict[int, float]
        if parameters.rl.reward_boost is not None and self._actions is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_shape[i] = parameters.rl.reward_boost[k]
        if parameters.training.cnn_parameters is None:
            self.state_normalization_parameters: Optional[Dict[
                int, NormalizationParameters]] = normalization_parameters
            num_features = get_num_output_features(normalization_parameters)
            parameters.training.layers[0] = num_features
        else:
            self.state_normalization_parameters = None
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)

        self._create_all_q_score_net()
        self._create_internal_policy_net()
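
Below is a minimal, standalone sketch of the reward-shaping bookkeeping in the constructor above. The actions list and reward_boost dict are hypothetical stand-ins for parameters.actions and parameters.rl.reward_boost; the real trainer stores the same mapping in self.reward_shape.

# Hypothetical stand-ins for parameters.actions and parameters.rl.reward_boost.
actions = ["left", "right", "noop"]
reward_boost = {"right": 0.5, "noop": -0.1}

# Same idea as the loop in __init__: action name -> action index -> boost value.
reward_shape = {}
for name, boost in reward_boost.items():
    reward_shape[actions.index(name)] = boost

# reward_shape is now {1: 0.5, 2: -0.1}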
Example No. 2
    def train_numpy(self, tdp, evaluator):
        self._quantile_states.extendleft(tdp.states)
        if self._update_counter % self._quantile_update_frequency == \
                self._quantile_update_frequency - 1:
            self._update_quantile()
        self._update_counter += 1

        if self._max_q:
            workspace.FeedBlob('states', tdp.states)
            workspace.FeedBlob('actions', tdp.possible_next_actions)
            workspace.RunNetOnce(self.q_score_model.net)
            q_values = workspace.FetchBlob(self.q_score_output)
            q_next_actions = np.argmax(q_values, axis=1).reshape(-1, 1)
            q_next_actions_mask = np.zeros(
                [q_next_actions.shape[0], self.num_actions], dtype=np.float32)
            # One-hot encode the greedy next action for each row of the batch.
            for x in range(q_next_actions.shape[0]):
                q_next_actions_mask[x, q_next_actions[x, 0]] = 1.0
            q_next_actions = q_next_actions_mask
        else:
            q_next_actions = tdp.next_actions
        penalty = self._reward_penalty(tdp.actions, q_next_actions,
                                       tdp.not_terminals)
        assert penalty.shape == tdp.rewards.shape, "Penalty shape " + str(
            penalty.shape) + " does not match rewards shape " + str(tdp.rewards.shape)
        tdp.rewards = tdp.rewards - penalty
        RLTrainer.train_numpy(
            self,
            tdp,
            evaluator,
        )
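
For reference, a small self-contained NumPy sketch of the max-Q branch above: take the row-wise argmax of the Q-values and turn it into a one-hot action mask with the same shape as the Q-value matrix. The toy numbers are illustrative only.

import numpy as np

# Toy Q-values for a batch of 3 states over 4 actions.
q_values = np.array([[0.1, 0.9, 0.2, 0.0],
                     [0.5, 0.4, 0.3, 0.2],
                     [0.0, 0.1, 0.2, 0.7]], dtype=np.float32)

greedy = np.argmax(q_values, axis=1)              # [1, 0, 3]
mask = np.zeros_like(q_values)
mask[np.arange(q_values.shape[0]), greedy] = 1.0  # one-hot per row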
Example No. 3
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
        additional_feature_types:
        AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    ) -> None:
        self._additional_feature_types = additional_feature_types
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        # ensure state and action IDs have no intersection
        overlapping_features = set(
            state_normalization_parameters.keys()) & set(
                action_normalization_parameters.keys())
        assert len(overlapping_features) == 0, (
            "There are some overlapping state and action features: " +
            str(overlapping_features))

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)

        self._create_internal_policy_net()
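
A quick sketch of the feature bookkeeping done in the constructor above, using hypothetical normalization dicts keyed by feature ID and assuming one network input per feature (get_num_output_features may return more for some feature types): state and action IDs must be disjoint, the first layer is sized to the combined feature count, and the last layer emits a single Q-value per state/action pair.

# Hypothetical normalization parameter dicts keyed by feature ID.
state_norm = {1001: "state_norm_a", 1002: "state_norm_b"}
action_norm = {2001: "action_norm_a"}

# Same disjointness check as in __init__.
overlap = set(state_norm.keys()) & set(action_norm.keys())
assert len(overlap) == 0, "Overlapping state and action features: " + str(overlap)

# Assuming one output per feature: layers[0] = 2 + 1 = 3 and layers[-1] = 1.
num_features = len(state_norm) + len(action_norm)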
Example No. 4
    def __init__(
        self,
        state_normalization_parameters: Dict[str, NormalizationParameters],
        parameters: DiscreteActionModelParameters,
        skip_normalization: Optional[bool] = False
    ) -> None:
        self._actions = parameters.actions

        self.num_processed_state_features = get_num_output_features(
            state_normalization_parameters
        )

        if parameters.training.layers[0] in [None, -1, 1]:
            parameters.training.layers[0] = self.num_state_features

        # There is a logical 1-dimensional output for each state/action pair,
        # but the underlying network computes num_actions-dimensional outputs
        if parameters.training.layers[-1] in [None, -1, 1]:
            parameters.training.layers[-1] = self.num_actions

        assert parameters.training.layers[-1] == self.num_actions,\
            "Set layers[-1] to the number of actions or a default placeholder value"

        RLTrainer.__init__(
            self, state_normalization_parameters, parameters, skip_normalization
        )
Example No. 5
    def stream(
        self, states, actions, rewards, next_states, next_actions, is_terminals,
        possible_next_actions, reward_timelines, evaluator
    ):
        self._quantile_states.extendleft(states)
        if self._update_counter % self._quantile_update_frequency == \
                self._quantile_update_frequency - 1:
            self._update_quantile()
        self._update_counter += 1

        if self._max_q:
            q_next_actions = self.get_maxq_actions(
                next_states, possible_next_actions
            )
        else:
            q_next_actions = next_actions
        penalty = self._reward_penalty(actions, q_next_actions, is_terminals)
        assert penalty.shape == rewards.shape, "Penalty shape " + str(
            penalty.shape
        ) + " does not match rewards shape " + str(rewards.shape)
        RLTrainer.stream(
            self,
            states,
            actions,
            rewards - penalty,
            next_states,
            next_actions,
            is_terminals,
            possible_next_actions,
            None,
            evaluator,
        )
Example No. 6
    def __init__(self,
                 state_normalization_parameters: Dict[str,
                                                      NormalizationParameters],
                 action_normalization_parameters: Dict[
                     str, NormalizationParameters],
                 parameters: ContinuousActionModelParameters,
                 skip_normalization: Optional[bool] = False) -> None:
        self._action_features = list(action_normalization_parameters.keys())
        self.num_unprocessed_action_features = len(self._action_features)
        self.num_processed_action_features = get_num_output_features(
            action_normalization_parameters)

        self.num_processed_state_features = get_num_output_features(
            state_normalization_parameters)

        if parameters.training.layers[0] is None or\
           parameters.training.layers[0] == -1:
            parameters.training.layers[0] = self.num_state_features +\
                self.num_action_features

        assert parameters.training.layers[-1] == 1, "Set layers[-1] to 1"

        self._action_normalization_parameters = action_normalization_parameters
        RLTrainer.__init__(self, state_normalization_parameters, parameters,
                           skip_normalization)
        print(action_normalization_parameters)

        self._prepare_action_normalization()
Example No. 7
    def train(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        next_states: np.ndarray,
        next_actions: Optional[np.ndarray],
        not_terminals: np.ndarray,
        possible_next_actions: np.ndarray,
    ) -> None:
        """
        Takes in a batch of transitions. For transition i, calculates target qval:
            next_q_values_i = {
                max_{pna_i} Q(next_state_i, pna_i), self.maxq_learning
                Q(next_state_i, next_action_i), self.sarsa
            }
            q_val_target_i = {
                r_i + gamma * next_q_values_i, not_terminals_i
                r_i, !not_terminals_i
            }
        Trains Q Network on the q_val_targets as labels.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row contains the one-hotted representation of the ith transition's
            action: actions[i][j] = 1 if action_i == j else 0.
        :param rewards: Numpy array with shape (batch_size, 1). The ith entry is
            the reward experienced at the ith transition.
        :param not_terminals: Numpy array with shape (batch_size, 1). The ith entry
            is equal to 1 iff the ith transition's state is not terminal.
        :param next_states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's next state.
        :param next_actions: Numpy array with shape (batch_size, action_dim). The
            ith row contains the one-hotted representation of the ith transition's
            next action: next_actions[i][j] = 1 if next_action_i == j else 0.
        :param possible_next_actions: Numpy array with shape (batch_size, action_dim).
            possible_next_actions[i][j] = 1 iff the agent can take action j from
            state i.
        """
        batch_size = states.shape[0]
        assert actions.shape == (batch_size, self.num_actions)
        assert next_states.shape == (batch_size, self.num_state_features)
        assert not_terminals.shape == (batch_size, 1)
        if next_actions is not None:
            assert next_actions.shape == (batch_size, self.num_actions)
        if possible_next_actions is not None:
            assert possible_next_actions.shape == (batch_size, self.num_actions)
        RLTrainer.train(
            self,
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            not_terminals,
            possible_next_actions,
        )
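
A worked NumPy sketch of the target described in the docstring above: next-state Q-values come either from the max over possible next actions (max-Q learning) or from the taken next action (SARSA), and the discounted bootstrap term is applied only where the transition is not terminal. The shapes, discount factor, and numbers here are illustrative, not taken from the trainer.

import numpy as np

gamma = 0.9
rewards = np.array([[1.0], [0.0]], dtype=np.float32)
not_terminals = np.array([[1.0], [0.0]], dtype=np.float32)

# Q(next_state, a) for 2 transitions over 3 actions, plus a mask of legal actions.
next_q_all = np.array([[0.2, 0.8, 0.5],
                       [0.4, 0.1, 0.9]], dtype=np.float32)
possible_next_actions = np.array([[1, 1, 0],
                                  [1, 1, 1]], dtype=np.float32)

# Max-Q learning: max over the *possible* next actions only.
masked = np.where(possible_next_actions > 0, next_q_all, -np.inf)
next_q_values = masked.max(axis=1, keepdims=True)

# q_val_target_i = r_i + gamma * next_q_values_i if not terminal, else r_i.
q_val_targets = rewards + not_terminals * gamma * next_q_values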
Example No. 8
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self._actions = parameters.actions if parameters.actions is not None else []

        self.state_normalization_parameters = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)
Example No. 9
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)
Example No. 10
    def __init__(
        self,
        parameters: DiscreteActionModelParameters,
        normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self._actions = parameters.actions if parameters.actions is not None else []
        self.reward_shape = {}  # type: Dict[int, float]
        if parameters.rl.reward_boost is not None and self._actions is not None:
            for k in parameters.rl.reward_boost.keys():
                i = self._actions.index(k)
                self.reward_shape[i] = parameters.rl.reward_boost[k]
        self.state_normalization_parameters = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = self.num_actions

        RLTrainer.__init__(self, parameters)

        self._create_all_q_score_net()
Example No. 11
    def __init__(
        self,
        parameters: ContinuousActionModelParameters,
        state_normalization_parameters: Dict[int, NormalizationParameters],
        action_normalization_parameters: Dict[int, NormalizationParameters],
    ) -> None:
        self.state_normalization_parameters = state_normalization_parameters
        self.action_normalization_parameters = action_normalization_parameters
        num_features = get_num_output_features(
            state_normalization_parameters) + get_num_output_features(
                action_normalization_parameters)

        # ensure state and action IDs have no intersection
        overlapping_features = (set(state_normalization_parameters.keys())
                                & set(action_normalization_parameters.keys()))
        assert (
            len(overlapping_features) == 0
        ), "There are some overlapping state and action features: " + str(
            overlapping_features)

        parameters.training.layers[0] = num_features
        parameters.training.layers[-1] = 1

        RLTrainer.__init__(self, parameters)