def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self._additional_feature_types = additional_feature_types
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    if parameters.training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = normalization_parameters
        num_features = get_num_output_features(normalization_parameters)
        parameters.training.layers[0] = num_features
    else:
        self.state_normalization_parameters = None
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)

    self._create_all_q_score_net()
    self._create_internal_policy_net()
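# Hedged illustration (standalone, not part of the class above): the
# reward_boost loop maps action names from the config onto their indices in
# self._actions, producing an index -> boost dict. Action names and boost
# values below are made up for the example.
example_actions = ["no_op", "buy", "sell"]
example_reward_boost = {"buy": 0.5, "sell": -0.2}
example_reward_shape = {
    example_actions.index(name): boost
    for name, boost in example_reward_boost.items()
}
assert example_reward_shape == {1: 0.5, 2: -0.2}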
def train_numpy(self, tdp, evaluator):
    self._quantile_states.extendleft(tdp.states)
    if self._update_counter % self._quantile_update_frequency == \
            self._quantile_update_frequency - 1:
        self._update_quantile()
    self._update_counter += 1

    if self._max_q:
        workspace.FeedBlob('states', tdp.states)
        workspace.FeedBlob('actions', tdp.possible_next_actions)
        workspace.RunNetOnce(self.q_score_model.net)
        q_values = workspace.FetchBlob(self.q_score_output)
        q_next_actions = np.argmax(q_values, axis=1).reshape(-1, 1)
        # Convert the per-row argmax indices into a one-hot
        # (batch_size, num_actions) mask.
        q_next_actions_mask = np.zeros(
            [q_next_actions.shape[0], self.num_actions], dtype=np.float32
        )
        for x in range(q_next_actions.shape[0]):
            q_next_actions_mask[x, q_next_actions[x, 0]] = 1.0
        q_next_actions = q_next_actions_mask
    else:
        q_next_actions = tdp.next_actions

    penalty = self._reward_penalty(tdp.actions, q_next_actions, tdp.not_terminals)
    assert penalty.shape == tdp.rewards.shape, (
        "Penalty shape " + str(penalty.shape) +
        " does not match rewards shape " + str(tdp.rewards.shape)
    )
    tdp.rewards = tdp.rewards - penalty

    RLTrainer.train_numpy(self, tdp, evaluator)
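# Hedged sketch (standalone helper, not taken from this codebase): the loop
# above builds a one-hot action mask from per-row argmax indices. The same
# conversion can be vectorized with NumPy fancy indexing.
import numpy as np

def one_hot_from_argmax(q_values: np.ndarray, num_actions: int) -> np.ndarray:
    """Return a float32 mask with a 1 at each row's argmax column."""
    best = np.argmax(q_values, axis=1)
    mask = np.zeros((q_values.shape[0], num_actions), dtype=np.float32)
    mask[np.arange(q_values.shape[0]), best] = 1.0
    return mask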
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self._additional_feature_types = additional_feature_types
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)

    # Ensure that state and action feature IDs have no intersection.
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)

    self._create_internal_policy_net()
def __init__(
    self,
    state_normalization_parameters: Dict[str, NormalizationParameters],
    parameters: DiscreteActionModelParameters,
    skip_normalization: Optional[bool] = False,
) -> None:
    self._actions = parameters.actions
    self.num_processed_state_features = get_num_output_features(
        state_normalization_parameters
    )

    if parameters.training.layers[0] in [None, -1, 1]:
        parameters.training.layers[0] = self.num_state_features

    # There is a logical 1-dimensional output for each state/action pair,
    # but the underlying network computes num_actions-dimensional outputs.
    if parameters.training.layers[-1] in [None, -1, 1]:
        parameters.training.layers[-1] = self.num_actions
    assert parameters.training.layers[-1] == self.num_actions, \
        "Set layers[-1] to the number of actions or to a default placeholder value"

    RLTrainer.__init__(
        self, state_normalization_parameters, parameters, skip_normalization
    )
def stream(
    self,
    states,
    actions,
    rewards,
    next_states,
    next_actions,
    is_terminals,
    possible_next_actions,
    reward_timelines,
    evaluator,
):
    self._quantile_states.extendleft(states)
    if self._update_counter % self._quantile_update_frequency == \
            self._quantile_update_frequency - 1:
        self._update_quantile()
    self._update_counter += 1

    if self._max_q:
        q_next_actions = self.get_maxq_actions(
            next_states, possible_next_actions
        )
    else:
        q_next_actions = next_actions

    penalty = self._reward_penalty(actions, q_next_actions, is_terminals)
    assert penalty.shape == rewards.shape, (
        "Penalty shape " + str(penalty.shape) +
        " does not match rewards shape " + str(rewards.shape)
    )

    RLTrainer.stream(
        self,
        states,
        actions,
        rewards - penalty,
        next_states,
        next_actions,
        is_terminals,
        possible_next_actions,
        None,
        evaluator,
    )
def __init__(
    self,
    state_normalization_parameters: Dict[str, NormalizationParameters],
    action_normalization_parameters: Dict[str, NormalizationParameters],
    parameters: ContinuousActionModelParameters,
    skip_normalization: Optional[bool] = False,
) -> None:
    self._action_features = list(action_normalization_parameters.keys())
    self.num_unprocessed_action_features = len(self._action_features)
    self.num_processed_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_processed_state_features = get_num_output_features(
        state_normalization_parameters
    )

    if parameters.training.layers[0] is None or \
            parameters.training.layers[0] == -1:
        parameters.training.layers[0] = (
            self.num_state_features + self.num_action_features
        )

    assert parameters.training.layers[-1] == 1, "Set layers[-1] to 1"

    self._action_normalization_parameters = action_normalization_parameters
    RLTrainer.__init__(
        self, state_normalization_parameters, parameters, skip_normalization
    )
    print(action_normalization_parameters)
    self._prepare_action_normalization()
def train(
    self,
    states: np.ndarray,
    actions: np.ndarray,
    rewards: np.ndarray,
    next_states: np.ndarray,
    next_actions: Optional[np.ndarray],
    not_terminals: np.ndarray,
    possible_next_actions: np.ndarray,
) -> None:
    """
    Takes in a batch of transitions. For transition i, calculates the target
    q-value:

        next_q_values_i = {
            max_{pna_i} Q(next_state_i, pna_i),   self.maxq_learning
            Q(next_state_i, next_action_i),       self.sarsa
        }
        q_val_target_i = {
            r_i + gamma * next_q_values_i,        not_terminals_i
            r_i,                                  !not_terminals_i
        }

    Trains the Q-network using the q_val_targets as labels.

    :param states: Numpy array with shape (batch_size, state_dim). The ith
        row is a representation of the ith transition's state.
    :param actions: Numpy array with shape (batch_size, action_dim). The ith
        row contains the one-hotted representation of the ith transition's
        action: actions[i][j] = 1 if action_i == j else 0.
    :param rewards: Numpy array with shape (batch_size, 1). The ith entry
        is the reward experienced at the ith transition.
    :param next_states: Numpy array with shape (batch_size, state_dim). The
        ith row is a representation of the ith transition's next state.
    :param next_actions: Numpy array with shape (batch_size, action_dim). The
        ith row contains the one-hotted representation of the ith transition's
        next action: next_actions[i][j] = 1 if next_action_i == j else 0.
    :param not_terminals: Numpy array with shape (batch_size, 1). The ith
        entry is equal to 1 iff the ith transition's state is not terminal.
    :param possible_next_actions: Numpy array with shape
        (batch_size, action_dim). possible_next_actions[i][j] = 1 iff the
        agent can take action j from state i.
    """
    batch_size = states.shape[0]
    assert actions.shape == (batch_size, self.num_actions)
    assert next_states.shape == (batch_size, self.num_state_features)
    assert not_terminals.shape == (batch_size, 1)
    if next_actions is not None:
        assert next_actions.shape == (batch_size, self.num_actions)
    if possible_next_actions is not None:
        assert possible_next_actions.shape == (batch_size, self.num_actions)

    RLTrainer.train(
        self,
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        not_terminals,
        possible_next_actions,
    )
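# Hedged usage sketch for the train() signature above. Only the array shapes
# follow the docstring; the trainer construction is hypothetical (class and
# parameter names are placeholders, not from this codebase).
import numpy as np

batch_size, state_dim, num_actions = 32, 10, 4
states = np.random.rand(batch_size, state_dim).astype(np.float32)
actions = np.eye(num_actions, dtype=np.float32)[
    np.random.randint(num_actions, size=batch_size)
]  # one-hot actions
rewards = np.random.rand(batch_size, 1).astype(np.float32)
next_states = np.random.rand(batch_size, state_dim).astype(np.float32)
next_actions = np.eye(num_actions, dtype=np.float32)[
    np.random.randint(num_actions, size=batch_size)
]
not_terminals = np.ones((batch_size, 1), dtype=np.float32)
possible_next_actions = np.ones((batch_size, num_actions), dtype=np.float32)

# trainer = SomeDiscreteActionTrainer(parameters, normalization_parameters)  # hypothetical setup
# trainer.train(states, actions, rewards, next_states, next_actions,
#               not_terminals, possible_next_actions)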
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self._actions = parameters.actions if parameters.actions is not None else []
    self.state_normalization_parameters = normalization_parameters
    num_features = get_num_output_features(normalization_parameters)
    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)
    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    self.state_normalization_parameters = normalization_parameters
    num_features = get_num_output_features(normalization_parameters)
    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(self, parameters)

    self._create_all_q_score_net()
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    num_features = get_num_output_features(
        state_normalization_parameters
    ) + get_num_output_features(action_normalization_parameters)

    # Ensure that state and action feature IDs have no intersection.
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    parameters.training.layers[0] = num_features
    parameters.training.layers[-1] = 1

    RLTrainer.__init__(self, parameters)