def __init__(
    self,
    state_normalization_parameters: Dict[str, NormalizationParameters],
    parameters: Union[DiscreteActionModelParameters,
                      ContinuousActionModelParameters],
    skip_normalization: Optional[bool] = False,
) -> None:
    print(state_normalization_parameters)
    print(parameters)

    self._state_normalization_parameters = state_normalization_parameters
    MLTrainer.__init__(self, "rl_trainer", parameters.training)

    self.target_network = TargetNetwork(self, parameters.rl.target_update_rate)

    self.reward_burnin = parameters.rl.reward_burnin
    self.maxq_learning = parameters.rl.maxq_learning
    self.rl_discount_rate = parameters.rl.gamma
    self.training_iteration = 0
    self._buffers = None
    self.minibatch_size = parameters.training.minibatch_size

    self.skip_normalization = skip_normalization
    self._prepare_state_normalization()
def _setup_initial_blobs(self):
    MLTrainer._setup_initial_blobs(self)

    self.output_conv_blob = "Conv_output_{}".format(self.model_id)
    workspace.FeedBlob(self.output_conv_blob, np.zeros(1, dtype=np.float32))

    self.conv_weights: List[str] = []
    self.conv_biases: List[str] = []

    for x in six.moves.range(len(self.dims) - 1):
        dim_in = self.dims[x]
        dim_out = self.dims[x + 1]
        kernel_h = self.conv_height_kernels[x]
        kernel_w = self.conv_width_kernels[x]

        weight_shape = [dim_out, kernel_h, kernel_w, dim_in]
        bias_shape = [dim_out]

        conv_weight_name = "ConvWeights_" + str(x) + "_" + self.model_id
        bias_name = "ConvBiases_" + str(x) + "_" + self.model_id
        self.conv_weights.append(conv_weight_name)
        self.conv_biases.append(bias_name)

        conv_bias = np.zeros(shape=bias_shape, dtype=np.float32)
        workspace.FeedBlob(bias_name, conv_bias)

        conv_weights = scipy.stats.norm(0, np.sqrt(1 / dim_in)).rvs(
            size=weight_shape).astype(np.float32)
        workspace.FeedBlob(conv_weight_name, conv_weights)
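# Note: the conv weights above are drawn from a zero-mean Gaussian with
# std = sqrt(1 / dim_in), i.e. a fan-in-scaled (LeCun-style) initialization.
# Below is a minimal numpy-only sketch of that initialization, assuming the
# [dim_out, kernel_h, kernel_w, dim_in] layout used above; `init_conv_weights`
# is a hypothetical helper for illustration, not part of the trainer.
import numpy as np


def init_conv_weights(dim_in, dim_out, kernel_h, kernel_w, rng=None):
    # Same distribution as scipy.stats.norm(0, np.sqrt(1 / dim_in)).rvs(...)
    rng = rng or np.random.default_rng()
    std = np.sqrt(1.0 / dim_in)
    shape = (dim_out, kernel_h, kernel_w, dim_in)
    return rng.normal(0.0, std, size=shape).astype(np.float32)


# Example: a 5x5 conv taking 3 input channels to 16 output channels.
assert init_conv_weights(3, 16, 5, 5).shape == (16, 5, 5, 3)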
def _setup_initial_blobs(self):
    self.input_dim = self.num_state_features
    self.output_dim = self.num_actions

    self.action_blob = "action"
    workspace.FeedBlob(self.action_blob, np.zeros(1, dtype=np.float32))

    MLTrainer._setup_initial_blobs(self)
def __init__(self, name, parameters, scaled_output=True):
    """
    :param name: A unique name for this trainer used to create the data on the
        caffe2 workspace
    :param parameters: TrainingParameters describing the layer sizes and
        activation functions
    :param scaled_output: Whether to set up the scaled-output extension blobs
    """
    self.scaled_output = scaled_output
    MLTrainer.__init__(self, name, parameters)
def __init__(
    self,
    name: str,
    fc_parameters: TrainingParameters,
    cnn_parameters: CNNModelParameters,
    img_height: int,
    img_width: int,
) -> None:
    self.init_height = img_height
    self.init_width = img_width
    self.dims = cnn_parameters.conv_dims
    self.conv_height_kernels = cnn_parameters.conv_height_kernels
    self.conv_width_kernels = cnn_parameters.conv_width_kernels
    self.pool_kernels_strides = cnn_parameters.pool_kernels_strides
    self.pool_types = cnn_parameters.pool_types

    MLTrainer.__init__(self, name, fc_parameters)
def __init__(
    self,
    parameters: Union[DiscreteActionModelParameters,
                      ContinuousActionModelParameters],
) -> None:
    logger.info(str(parameters))

    assert parameters.training.layers[0] >= 0, \
        "Set layers[0] to the number of features"
    self.num_features = parameters.training.layers[0]

    MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, parameters.training)

    self.target_network = TargetNetwork(
        self, parameters.rl.target_update_rate
    )

    self.reward_burnin = parameters.rl.reward_burnin
    self.maxq_learning = parameters.rl.maxq_learning
    self.rl_discount_rate = parameters.rl.gamma
    self.rl_temperature = parameters.rl.temperature
    self.training_iteration = 0
    self.minibatch_size = parameters.training.minibatch_size
    self.parameters = parameters
    self.loss_blob: Optional[str] = None

    workspace.FeedBlob('states', np.array([0], dtype=np.float32))
    workspace.FeedBlob('actions', np.array([0], dtype=np.float32))
    workspace.FeedBlob('rewards', np.array([0], dtype=np.float32))
    workspace.FeedBlob('next_states', np.array([0], dtype=np.float32))
    workspace.FeedBlob('not_terminals', np.array([0], dtype=np.float32))
    workspace.FeedBlob('next_actions', np.array([0], dtype=np.float32))
    workspace.FeedBlob(
        'possible_next_actions', np.array([0], dtype=np.float32)
    )
    workspace.FeedBlob(
        'possible_next_actions_lengths', np.array([0], dtype=np.float32)
    )

    self.rl_train_model: Optional[ModelHelper] = None
    self.reward_train_model: Optional[ModelHelper] = None
    self.q_score_model: Optional[ModelHelper] = None
    self._create_reward_train_net()
    self._create_rl_train_net()
    self._create_q_score_net()
    assert self.rl_train_model is not None
    assert self.reward_train_model is not None
    assert self.q_score_model is not None
def test_input_validation(self):
    with self.assertRaises(Exception):
        # layers and activations sizes incompatible
        MLTrainer(
            "Test model",
            TrainingParameters([2, 1], ['linear', 'relu'], 100, 0.001, 'ADAM'))

    with self.assertRaises(Exception):
        # All values in layers should be positive integers
        MLTrainer(
            "Test model",
            TrainingParameters([-1, 1], ['linear'], 100, 0.001, 'ADAM'))

    with self.assertRaises(Exception):
        # Non-integer layer sizes are also rejected
        MLTrainer(
            "Test model",
            TrainingParameters([1.3, 1], ['linear'], 100, 0.001, 'ADAM'))
def _validate_inputs(self):
    conv_dim_len = len(self.dims) - 1
    if (conv_dim_len != len(self.conv_height_kernels)
            or conv_dim_len != len(self.conv_width_kernels)
            or conv_dim_len != len(self.pool_kernels_strides)
            or conv_dim_len != len(self.pool_types)):
        raise Exception(
            "Ensure that `conv_dims`, `conv_height_kernels`, "
            "`conv_width_kernels`, `pool_kernels_strides`, and `pool_types` "
            "have one entry per conv layer.")

    for pool_type in self.pool_types:
        if pool_type not in ['max', 'avg']:
            raise Exception("Unsupported pool type: {}".format(pool_type))

    self._set_conv_dimensions()
    MLTrainer._validate_inputs(self)
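# To make the length constraint above concrete: conv_dims has one entry per
# layer boundary (N + 1 entries for N conv layers), while the kernel, stride,
# and pool-type lists each have exactly N entries. Illustrative values only;
# how they are packed into CNNModelParameters is not shown in this section.
conv_dims = [3, 32, 64]          # input channels, then output channels per conv layer
conv_height_kernels = [5, 3]
conv_width_kernels = [5, 3]
pool_kernels_strides = [2, 2]
pool_types = ['max', 'max']      # each entry must be 'max' or 'avg'

assert (len(conv_dims) - 1 == len(conv_height_kernels)
        == len(conv_width_kernels) == len(pool_kernels_strides)
        == len(pool_types))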
def __init__(
    self,
    parameters: Union[DiscreteActionModelParameters,
                      ContinuousActionModelParameters],
) -> None:
    logger.info(str(parameters))

    assert parameters.training.layers[0] >= 0, \
        "Set layers[0] to the number of features"
    self.num_features = parameters.training.layers[0]

    MLTrainer.__init__(self, "rl_trainer", parameters.training)

    self.target_network = TargetNetwork(self, parameters.rl.target_update_rate)

    self.reward_burnin = parameters.rl.reward_burnin
    self.maxq_learning = parameters.rl.maxq_learning
    self.rl_discount_rate = parameters.rl.gamma
    self.training_iteration = 0
    self.minibatch_size = parameters.training.minibatch_size
def test_adam_weights(self):
    num_features = 4
    num_outputs = 1

    trainer = MLTrainer(
        "Linear Regression",
        TrainingParameters(layers=[num_features, num_outputs],
                           activations=['linear'],
                           minibatch_size=100,
                           learning_rate=0.1,
                           optimizer='ADAM'))

    dist = get_weight_dist(trainer,
                           num_features=num_features,
                           num_outputs=num_outputs)
    self.assertLess(dist, 0.1)
def test_sgd_weights(self):
    num_features = 4
    num_outputs = 1

    trainer = MLTrainer(
        "Linear Regression",
        TrainingParameters(layers=[num_features, num_outputs],
                           activations=['linear'],
                           minibatch_size=100,
                           learning_rate=0.001,
                           optimizer='SGD',
                           gamma=0.9999,
                           lr_policy='step'))

    dist = get_weight_dist(trainer,
                           num_features=num_features,
                           num_outputs=num_outputs)
    self.assertLess(dist, 0.1)
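# `get_weight_dist` is defined elsewhere in the test module. Conceptually it
# fits the single linear layer to synthetic data y = x @ true_w and measures
# how far the learned weights end up from true_w; the tests above bound that
# distance by 0.1. The function below is a trainer-free numpy sketch of that
# check, not the helper's actual implementation.
import numpy as np


def weight_dist_reference(num_features=4, num_outputs=1, lr=0.1,
                          steps=1000, batch_size=100, seed=0):
    rng = np.random.RandomState(seed)
    true_w = rng.randn(num_features, num_outputs)
    w = np.zeros((num_features, num_outputs))
    for _ in range(steps):
        x = rng.randn(batch_size, num_features)
        y = x.dot(true_w)
        grad = x.T.dot(x.dot(w) - y) / batch_size   # gradient of mean squared error / 2
        w -= lr * grad
    return float(np.linalg.norm(w - true_w))


# With these defaults the distance converges well below the 0.1 threshold.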
def _setup_initial_blobs(self):
    MLTrainer._setup_initial_blobs(self)
    if self.extension_mltrainer is not None or self.scaled_output:
        self._setup_initial_extension_blobs()
class RLTrainer:
    num_trainers = 0
    DEFAULT_TRAINING_NUM_WORKERS = 4

    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[0] = \
                self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert parameters.training.layers[0] >= 0, \
            "Set layers[0] to the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = \
            parameters.rl.use_seq_num_diff_as_time_diff
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions", np.array([0], dtype=np.float32))
        # Setting to 1 serves as a 1 unit time_diff if not set by user
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(self, next_states: str, possible_next_actions,
                         use_target_network: bool) -> str:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). Uses the target network
        for the Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim).
            Each row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values`
            documentation.
        """
        raise NotImplementedError()

    def get_q_values(self, states: str, actions: str,
                     use_target_network: bool) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state_i, action_i). Returns
        these q values in a Numpy array of shape (batch_size, 1).

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: str, actions: str,
                     q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of
                Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to
                q_vals_target.
            Updates the Q network's weights according to the loss and
                optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The
            ith row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith
            transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values("states", "actions", True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        self.q_score_model.net.Proto().num_workers = (
            RLTrainer.DEFAULT_TRAINING_NUM_WORKERS)
        self.q_score_model.net.Proto().type = "async_scheduling"
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(self, tdp: TrainingDataPage,
                    evaluator: Optional[Evaluator]):
        workspace.FeedBlob("states", tdp.states)
        workspace.FeedBlob("actions", tdp.actions)
        workspace.FeedBlob("rewards", tdp.rewards)
        workspace.FeedBlob("next_states", tdp.next_states)
        workspace.FeedBlob("not_terminals", tdp.not_terminals)
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions.values)
                workspace.FeedBlob("possible_next_actions_lengths",
                                   tdp.possible_next_actions.lengths)
            else:
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions)
        else:
            workspace.FeedBlob("next_actions", tdp.next_actions)

        self.train()
        if evaluator is not None:
            self.evaluate(evaluator, tdp.actions, tdp.propensities,
                          tdp.episode_values)

    def train(self) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
                self.target_network.enable_slow_updates()
                if self.conv_target_network:
                    self.conv_target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        workspace.RunNet(self.target_network._update_model.net)
        if self.conv_target_network:
            workspace.RunNet(self.conv_target_network._update_model.net)
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)

    def evaluate(
        self,
        evaluator: Optional[Evaluator],
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        raise NotImplementedError()

    def build_predictor(self, model, input_blob, output_blob) -> List[str]:
        retval: List[str] = []
        if self.conv_ml_trainer is not None:
            conv_output = model.net.NextBlob("conv_output")
            retval = self.conv_ml_trainer.build_predictor(
                model, input_blob, conv_output)
            conv_output_flat = model.net.NextBlob("conv_output_flat")
            model.net.Flatten([conv_output], [conv_output_flat])
            input_blob = conv_output_flat
        retval += self.ml_trainer.build_predictor(model, input_blob,
                                                  output_blob)
        return retval
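# Taken together, train() implements the burn-in schedule: for the first
# `reward_burnin` minibatches only the reward network is trained; after that,
# RL (TD) updates run with slow target-network updates enabled. The driver
# below is a hypothetical sketch of the calling pattern; `trainer` is a
# concrete RLTrainer subclass and `training_pages` is an iterable of
# TrainingDataPage minibatches produced by whatever data pipeline feeds it.
def run_training(trainer, training_pages, evaluator=None, num_epochs=1):
    for _ in range(num_epochs):
        for tdp in training_pages:
            # train_numpy feeds the minibatch blobs, then runs either the
            # reward-only net or the RL net (plus the target-network update
            # nets) depending on how many iterations have elapsed.
            trainer.train_numpy(tdp, evaluator)
    return trainer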
def _setup_initial_blobs(self):
    self.input_dim = self.num_features
    self.output_dim = 1

    MLTrainer._setup_initial_blobs(self)
def _setup_initial_blobs(self):
    MLTrainer._setup_initial_blobs(self)
    if self.scaled_output:
        self._setup_initial_extension_blobs()
def _setup_initial_blobs(self):
    self.input_dim = self.num_state_features + self.num_action_features
    self.output_dim = 1

    MLTrainer._setup_initial_blobs(self)