Example #1
    def __init__(self,
                 state_normalization_parameters: Dict[str,
                                                      NormalizationParameters],
                 parameters: Union[DiscreteActionModelParameters,
                                   ContinuousActionModelParameters],
                 skip_normalization: Optional[bool] = False) -> None:
        print(state_normalization_parameters)
        print(parameters)

        self._state_normalization_parameters = state_normalization_parameters
        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = parameters.training.minibatch_size

        self.skip_normalization = skip_normalization
        self._prepare_state_normalization()
Example #2
    def _setup_initial_blobs(self):
        MLTrainer._setup_initial_blobs(self)

        self.output_conv_blob = "Conv_output_{}".format(self.model_id)
        workspace.FeedBlob(self.output_conv_blob, np.zeros(1,
                                                           dtype=np.float32))

        self.conv_weights: List[str] = []
        self.conv_biases: List[str] = []

        for x in six.moves.range(len(self.dims) - 1):
            dim_in = self.dims[x]
            dim_out = self.dims[x + 1]
            kernel_h = self.conv_height_kernels[x]
            kernel_w = self.conv_width_kernels[x]

            weight_shape = [dim_out, kernel_h, kernel_w, dim_in]
            bias_shape = [
                dim_out,
            ]

            conv_weight_name = "ConvWeights_" + str(x) + "_" + self.model_id
            bias_name = "ConvBiases_" + str(x) + "_" + self.model_id
            self.conv_weights.append(conv_weight_name)
            self.conv_biases.append(bias_name)

            conv_bias = np.zeros(shape=bias_shape, dtype=np.float32)
            workspace.FeedBlob(bias_name, conv_bias)

            conv_weights = scipy.stats.norm(0, np.sqrt(
                1 / dim_in)).rvs(size=weight_shape).astype(np.float32)
            workspace.FeedBlob(conv_weight_name, conv_weights)
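The weight initialization above draws each conv weight from a zero-mean normal with standard deviation sqrt(1/dim_in), i.e. a fan-in style initializer. A minimal NumPy-only sketch of the same idea, with hypothetical layer sizes and no Caffe2 workspace involved:

import numpy as np

# Hypothetical single conv layer: 3 input channels, 16 output channels, 5x5 kernel.
dim_in, dim_out = 3, 16
kernel_h, kernel_w = 5, 5

weight_shape = [dim_out, kernel_h, kernel_w, dim_in]
std = np.sqrt(1.0 / dim_in)  # scale by the fan-in of the layer

conv_weights = np.random.normal(0.0, std, size=weight_shape).astype(np.float32)
conv_bias = np.zeros([dim_out], dtype=np.float32)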
Example #3
    def _setup_initial_blobs(self):
        self.input_dim = self.num_state_features
        self.output_dim = self.num_actions

        self.action_blob = "action"
        workspace.FeedBlob(self.action_blob, np.zeros(1, dtype=np.float32))

        MLTrainer._setup_initial_blobs(self)
Example #4
    def __init__(self, name, parameters, scaled_output=True):
        """

        :param name: A unique name for this trainer used to create the data on the
            caffe2 workspace
        :param layers: A list of integers describing the layer sizes
        :param activations: A list of strings describing the activation functions
        """
        self.scaled_output = scaled_output
        MLTrainer.__init__(self, name, parameters)
Example #5
    def __init__(self, name: str, fc_parameters: TrainingParameters,
                 cnn_parameters: CNNModelParameters, img_height: int,
                 img_width: int) -> None:
        self.init_height = img_height
        self.init_width = img_width
        self.dims = cnn_parameters.conv_dims
        self.conv_height_kernels = cnn_parameters.conv_height_kernels
        self.conv_width_kernels = cnn_parameters.conv_width_kernels
        self.pool_kernels_strides = cnn_parameters.pool_kernels_strides
        self.pool_types = cnn_parameters.pool_types

        MLTrainer.__init__(self, name, fc_parameters)
Example #6
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, parameters.training)

        self.target_network = TargetNetwork(
            self, parameters.rl.target_update_rate
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob('states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob('rewards', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('not_terminals', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob(
            'possible_next_actions', np.array([0], dtype=np.float32)
        )
        workspace.FeedBlob(
            'possible_next_actions_lengths', np.array([0], dtype=np.float32)
        )

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None
Example #7
    def test_input_validation(self):
        with self.assertRaises(Exception):
            # layers and activations sizes incompatible
            MLTrainer(
                "Test model",
                TrainingParameters([2, 1], ['linear', 'relu'], 100, 0.001,
                                   'ADAM'))

        with self.assertRaises(Exception):
            # All values in layers should be positive integers
            MLTrainer(
                "Test model",
                TrainingParameters([-1, 1], ['linear'], 100, 0.001, 'ADAM'))

        with self.assertRaises(Exception):
            # Non-integer layer sizes should also be rejected
            MLTrainer(
                "Test model",
                TrainingParameters([1.3, 1], ['linear'], 100, 0.001, 'ADAM'))
Example #8
    def _validate_inputs(self):
        conv_dim_len = len(self.dims) - 1
        if (conv_dim_len != len(self.conv_height_kernels)
                or conv_dim_len != len(self.conv_width_kernels)
                or conv_dim_len != len(self.pool_kernels_strides)
                or conv_dim_len != len(self.pool_types)):
            raise Exception(
                "Ensure that `conv_dims`, `conv_height_kernels`, `conv_width_kernels`"
                + ", `pool_kernels`, and `pool_types` are the same length.")

        for pool_type in self.pool_types:
            if pool_type not in ['max', 'avg']:
                raise Exception("Unsupported pool type: {}".format(pool_type))

        self._set_conv_dimensions()
        MLTrainer._validate_inputs(self)
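A configuration passes these checks only when `conv_height_kernels`, `conv_width_kernels`, `pool_kernels_strides`, and `pool_types` each have len(conv_dims) - 1 entries, one per conv layer. A hedged sketch of an internally consistent configuration (field names taken from the CNNModelParameters usage above, values hypothetical):

# Two conv layers (conv_dims has length 3), so every other list has length 2.
cnn_config = dict(
    conv_dims=[3, 32, 64],           # input channels, then per-layer output channels
    conv_height_kernels=[5, 3],
    conv_width_kernels=[5, 3],
    pool_kernels_strides=[2, 2],
    pool_types=['max', 'max'],       # only 'max' and 'avg' are accepted
)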
Example #9
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to a the number of features"

        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
Example #10
    def test_adam_weights(self):
        num_features = 4
        num_outputs = 1

        trainer = MLTrainer(
            "Linear Regression",
            TrainingParameters(layers=[num_features, num_outputs],
                               activations=['linear'],
                               minibatch_size=100,
                               learning_rate=0.1,
                               optimizer='ADAM'))

        dist = get_weight_dist(trainer,
                               num_features=num_features,
                               num_outputs=num_outputs)

        self.assertLess(dist, 0.1)
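get_weight_dist is a test helper that is not shown in these snippets; conceptually, the test fits a linear model and asserts that the learned weights land within a small L2 distance of the true weights. A rough NumPy-only illustration of that property (not the helper's actual implementation):

import numpy as np

# Generate noiseless data from known weights, fit by least squares, and check
# that the recovered weights are close to the true ones.
rng = np.random.RandomState(0)
num_features, num_outputs = 4, 1

true_weights = rng.randn(num_features, num_outputs)
X = rng.randn(1024, num_features)
y = X @ true_weights

learned_weights, *_ = np.linalg.lstsq(X, y, rcond=None)
dist = np.linalg.norm(learned_weights - true_weights)
assert dist < 0.1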
Example #11
    def test_sgd_weights(self):
        num_features = 4
        num_outputs = 1

        trainer = MLTrainer(
            "Linear Regression",
            TrainingParameters(layers=[num_features, num_outputs],
                               activations=['linear'],
                               minibatch_size=100,
                               learning_rate=0.001,
                               optimizer='SGD',
                               gamma=0.9999,
                               lr_policy='step'))

        dist = get_weight_dist(trainer,
                               num_features=num_features,
                               num_outputs=num_outputs)

        self.assertLess(dist, 0.1)
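With optimizer='SGD', lr_policy='step', and gamma=0.9999, the learning rate decays geometrically over iterations. Assuming the common step-policy definition lr = base_lr * gamma ** (iteration // stepsize) (the stepsize used by MLTrainer is not visible in these snippets), a small sketch of the schedule:

# Step learning-rate schedule; stepsize=1 is purely illustrative.
def step_lr(base_lr, gamma, iteration, stepsize=1):
    return base_lr * gamma ** (iteration // stepsize)

for it in (0, 1000, 10000):
    print(it, step_lr(0.001, 0.9999, it))  # the rate shrinks as iterations grow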
Example #12
    def _setup_initial_blobs(self):
        MLTrainer._setup_initial_blobs(self)
        if self.extension_mltrainer is not None or self.scaled_output:
            self._setup_initial_extension_blobs()
Example #13
class RLTrainer:
    num_trainers = 0
    DEFAULT_TRAINING_NUM_WORKERS = 4

    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))
        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[
                0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert (parameters.training.layers[0] >=
                0), "Set layers[0] to a the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = parameters.rl.use_seq_num_diff_as_time_diff
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions", np.array([0], dtype=np.float32))
        # Defaults to 1 so time_diff is one unit if the user does not set it
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(self, next_states: str, possible_next_actions,
                         use_target_network: bool) -> str:
        """
        Takes in an array of next_states and outputs an array of the same shape
        whose ith entry = max_{pna} Q(state_i, pna). If `use_target_network` is
        True, the target network is used to approximate Q(state_i, pna).

        :param next_states: Numpy array with shape (batch_size, state_dim). Each
            row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values` documentation.
        """
        raise NotImplementedError()

    def get_q_values(self, states: str, actions: str,
                     use_target_network: bool) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state_i, action_i). Returns
        these q values in a Numpy array of shape (batch_size, 1).

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: str, actions: str,
                     q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to q_vals_target.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The ith
            row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The ith
            row is a representation of the ith transition's action.
        :param q_vals_target: Numpy array with shape (batch_size, 1). The ith
            row is the label to train against for the data from the ith transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values("states", "actions", True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        self.q_score_model.net.Proto().num_workers = (
            RLTrainer.DEFAULT_TRAINING_NUM_WORKERS)
        self.q_score_model.net.Proto().type = "async_scheduling"
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(self, tdp: TrainingDataPage,
                    evaluator: Optional[Evaluator]):
        workspace.FeedBlob("states", tdp.states)
        workspace.FeedBlob("actions", tdp.actions)
        workspace.FeedBlob("rewards", tdp.rewards)
        workspace.FeedBlob("next_states", tdp.next_states)
        workspace.FeedBlob("not_terminals", tdp.not_terminals)
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions.values)
                workspace.FeedBlob("possible_next_actions_lengths",
                                   tdp.possible_next_actions.lengths)
            else:
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions)
        else:
            workspace.FeedBlob("next_actions", tdp.next_actions)
        self.train()
        if evaluator is not None:
            self.evaluate(evaluator, tdp.actions, tdp.propensities,
                          tdp.episode_values)

    def train(self) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
                self.target_network.enable_slow_updates()
                if self.conv_target_network:
                    self.conv_target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        workspace.RunNet(self.target_network._update_model.net)
        if self.conv_target_network:
            workspace.RunNet(self.conv_target_network._update_model.net)
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)

    def evaluate(
        self,
        evaluator: Optional[Evaluator],
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        raise NotImplementedError()

    def build_predictor(self, model, input_blob, output_blob) -> List[str]:
        retval: List[str] = []
        if self.conv_ml_trainer is not None:
            conv_output = model.net.NextBlob("conv_output")
            retval = self.conv_ml_trainer.build_predictor(
                model, input_blob, conv_output)
            conv_output_flat = model.net.NextBlob("conv_output_flat")
            model.net.Flatten([conv_output], [conv_output_flat])
            input_blob = conv_output_flat
        retval += self.ml_trainer.build_predictor(model, input_blob,
                                                  output_blob)
        return retval
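When maxq_learning is enabled, `possible_next_actions` can arrive as a StackedArray: a flat values buffer plus a lengths vector saying how many candidate actions each state has (see train_numpy above). A NumPy sketch of that layout, computing the per-state maximum over hypothetical, precomputed Q-values, which is the quantity get_max_q_values describes:

import numpy as np

# Batch of 3 states with 2, 3, and 1 possible next actions; the Q-values here
# are made up purely to illustrate the flat values + lengths layout.
q_values = np.array([0.1, 0.7, 0.2, 0.4, 0.3, 0.9], dtype=np.float32)
lengths = np.array([2, 3, 1], dtype=np.int32)

offsets = np.concatenate([[0], np.cumsum(lengths)])
max_q = np.array([
    q_values[offsets[i]:offsets[i + 1]].max() for i in range(len(lengths))
])
print(max_q)  # -> [0.7 0.4 0.9]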
Example #14
    def _setup_initial_blobs(self):
        self.input_dim = self.num_features
        self.output_dim = 1

        MLTrainer._setup_initial_blobs(self)
Example #15
    def _setup_initial_blobs(self):
        MLTrainer._setup_initial_blobs(self)
        if self.scaled_output:
            self._setup_initial_extension_blobs()
Example #16
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))
        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[
                0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert (parameters.training.layers[0] >=
                0), "Set layers[0] to a the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = parameters.rl.use_seq_num_diff_as_time_diff
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions", np.array([0], dtype=np.float32))
        # Defaults to 1 so time_diff is one unit if the user does not set it
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def _setup_initial_blobs(self):
        self.input_dim = self.num_state_features + self.num_action_features
        self.output_dim = 1

        MLTrainer._setup_initial_blobs(self)