예제 #1
0
    def export(cls, trainer, actions, normalization_parameters):
        """ Creates DiscreteActionPredictor from a list of action trainers

        :param trainer DiscreteActionTrainer
        :param features list of state feature names
        :param actions list of action names
        """

        model = model_helper.ModelHelper(name="predictor")
        net = model.net

        workspace.FeedBlob('input/float_features.lengths',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.keys',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.values',
                           np.zeros(1, dtype=np.float32))

        preprocessor = PreprocessorNet(net, True)
        parameters = []
        parameters.extend(preprocessor.parameters)
        normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            'input/float_features.lengths',
            'input/float_features.keys',
            'input/float_features.values',
            normalization_parameters,
            'state_norm',
        )
        parameters.extend(new_parameters)

        new_parameters = RLPredictor._forward_pass(
            model,
            trainer,
            normalized_dense_matrix,
            actions,
        )
        parameters.extend(new_parameters)

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        return DiscreteActionPredictor(net, parameters)
    def export(cls,
               trainer,
               actions,
               state_normalization_parameters,
               int_features=False):
        """ Creates a DiscreteActionPredictor from a DiscreteActionTrainer.

        :param trainer DiscreteActionTrainer
        :param actions list of action names
        :param state_normalization_parameters state NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        """

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob('input/image', np.zeros([1, 1, 1, 1],
                                                   dtype=np.int32))
        workspace.FeedBlob('input/float_features.lengths',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.keys',
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob('input/float_features.values',
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = 'input_feature_lengths'
        input_feature_keys = 'input_feature_keys'
        input_feature_values = 'input_feature_values'

        if int_features:
            workspace.FeedBlob('input/int_features.lengths',
                               np.zeros(1, dtype=np.int32))
            workspace.FeedBlob('input/int_features.keys',
                               np.zeros(1, dtype=np.int64))
            workspace.FeedBlob('input/int_features.values',
                               np.zeros(1, dtype=np.int32))
            C2.net().Cast(['input/int_features.values'],
                          ['input/int_features.values_float'],
                          dtype=caffe2_pb2.TensorProto.FLOAT)
            C2.net().MergeMultiScalarFeatureTensors([
                'input/float_features.lengths', 'input/float_features.keys',
                'input/float_features.values', 'input/int_features.lengths',
                'input/int_features.keys', 'input/int_features.values_float'
            ], [
                input_feature_lengths, input_feature_keys, input_feature_values
            ])
        else:
            C2.net().Copy(['input/float_features.lengths'],
                          [input_feature_lengths])
            C2.net().Copy(['input/float_features.keys'], [input_feature_keys])
            C2.net().Copy(['input/float_features.values'],
                          [input_feature_values])

        parameters = []
        if state_normalization_parameters is not None:
            preprocessor = PreprocessorNet(net, True)
            parameters.extend(preprocessor.parameters)
            normalized_dense_matrix, new_parameters = \
                preprocessor.normalize_sparse_matrix(
                    input_feature_lengths,
                    input_feature_keys,
                    input_feature_values,
                    state_normalization_parameters,
                    'state_norm',
                )
            parameters.extend(new_parameters)
        else:
            # Image input.  Note: Currently this does the wrong thing if
            #   more than one image is passed at a time.
            normalized_dense_matrix = 'input/image'

        new_parameters, q_values = RLPredictor._forward_pass(
            model,
            trainer,
            normalized_dense_matrix,
            actions,
        )
        parameters.extend(new_parameters)

        # Get 1 x n action index tensor under the max_q policy
        max_q_act_idxs = 'max_q_policy_actions'
        C2.net().Flatten([C2.ArgMax(q_values)], [max_q_act_idxs], axis=0)
        shape_of_num_of_states = 'num_states_shape'
        C2.net().FlattenToVec([max_q_act_idxs], [shape_of_num_of_states])
        num_states, _ = C2.Reshape(C2.Size(shape_of_num_of_states), shape=[1])

        # Get 1 x n action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32))
        tempered_q_values = C2.Div(q_values, "temperature", broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = 'softmax_act_idxs_nested'
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_idxs = 'softmax_policy_actions'
        C2.net().Flatten([softmax_act_idxs_nested], [softmax_act_idxs], axis=0)

        # Concat action index tensors to get 2 x n tensor - [[max_q], [softmax]]
        # transpose & flatten to get [a1_maxq, a1_softmax, a2_maxq, a2_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_idxs,
                                 to=caffe2_pb2.TensorProto.INT32)
        softmax_act_blob = C2.Cast(softmax_act_idxs,
                                   to=caffe2_pb2.TensorProto.INT32)
        C2.net().Append([max_q_act_blob, softmax_act_blob], [max_q_act_blob])
        transposed_action_idxs = C2.Transpose(max_q_act_blob)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        output_values = 'output/string_single_categorical_features.values'
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
        C2.net().Gather(["action_names", flat_transposed_action_idxs],
                        [output_values])

        output_lengths = 'output/string_single_categorical_features.lengths'
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill([shape_of_num_of_states], [output_lengths],
                              value=2,
                              dtype=caffe2_pb2.TensorProto.INT32)

        output_keys = 'output/string_single_categorical_features.keys'
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1],
                            value=0,
                            dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1],
                            value=1,
                            dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_states, axis=0)
        C2.net().FlattenToVec([output_key_tile], [output_keys])

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        return DiscreteActionPredictor(net, parameters, int_features)
예제 #3
0
    def preprocess_samples_discrete(
            self, samples: Samples,
            minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
        )
        workspace.RunNetOnce(net)
        actions_one_hot = np.zeros(
            [len(samples.actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(samples.actions):
            actions_one_hot[i, self.action_to_index(action)] = 1
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        next_actions_one_hot = np.zeros(
            [len(samples.next_actions),
             len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(samples.next_actions):
            if action == "":
                continue
            next_actions_one_hot[i, self.action_to_index(action)] = 1
        possible_next_actions_mask = []
        for pna in samples.possible_next_actions:
            pna_mask = [0] * self.num_actions
            for action in pna:
                pna_mask[self.action_to_index(action)] = 1
            possible_next_actions_mask.append(pna_mask)
        possible_next_actions_mask = np.array(possible_next_actions_mask,
                                              dtype=np.float32)
        is_terminals = np.array(samples.is_terminal,
                                dtype=np.bool).reshape(-1, 1)
        not_terminals = np.logical_not(is_terminals)
        if samples.reward_timelines is not None:
            reward_timelines = np.array(samples.reward_timelines,
                                        dtype=np.object)

        states_ndarray = workspace.FetchBlob(state_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_one_hot[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    not_terminals=not_terminals[start:end],
                    next_actions=next_actions_one_hot[start:end],
                    possible_next_actions=possible_next_actions_mask[
                        start:end],
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines is not None else None,
                ))
        return tdps
예제 #4
0
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
        )
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")
        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
        )
        workspace.RunNetOnce(net)

        states_ndarray = workspace.FetchBlob(state_matrix)
        actions_ndarray = workspace.FetchBlob(action_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        possible_next_actions_ndarray = workspace.FetchBlob(
            possible_next_actions_matrix)
        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    reward_timelines=samples.reward_timelines[start:end]
                    if samples.reward_timelines else None,
                ))
        return tdps
예제 #5
0
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        samples.shuffle()

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
            False,
        )
        propensities = np.array(samples.propensities,
                                dtype=np.float32).reshape(-1, 1)
        rewards = np.array(samples.rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")

        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        pna_lens_blob = "pna_lens_blob"
        workspace.FeedBlob(pna_lens_blob, pnas_lengths)

        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
            False,
        )

        state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob)

        workspace.RunNetOnce(net)

        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = state_preprocessor.forward(states_ndarray).numpy()

        actions_ndarray = workspace.FetchBlob(action_matrix)
        actions_ndarray = action_preprocessor.forward(actions_ndarray).numpy()

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = state_preprocessor.forward(
            next_states_ndarray).numpy()

        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        next_actions_ndarray = action_preprocessor.forward(
            next_actions_ndarray).numpy()

        logged_possible_next_actions = action_preprocessor.forward(
            workspace.FetchBlob(possible_next_actions_matrix))

        state_pnas_tile = state_preprocessor.forward(
            workspace.FetchBlob(state_pnas_tile_blob))
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1)

        possible_next_actions_ndarray = logged_possible_next_actions.cpu(
        ).numpy()
        next_state_pnas_concat = logged_possible_next_state_actions.cpu(
        ).numpy()
        time_diffs = np.ones(len(states_ndarray))
        episode_values = None
        if samples.reward_timelines is not None:
            episode_values = np.zeros(rewards.shape, dtype=np.float32)
            for i, reward_timeline in enumerate(samples.reward_timelines):
                for time_diff, reward in reward_timeline.items():
                    episode_values[i, 0] += reward * (DISCOUNT**time_diff)

        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_concat = next_state_pnas_concat[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    propensities=propensities[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    episode_values=episode_values[start:end]
                    if episode_values is not None else None,
                    time_diffs=time_diffs[start:end],
                    possible_next_actions_lengths=pnas_lengths[start:end],
                    next_state_pnas_concat=pnas_concat,
                ))
        return tdps
예제 #6
0
    def export(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
    ):
        """ Creates ContinuousActionDQNPredictor from a list of action trainers

        :param trainer ContinuousActionDQNPredictor
        :param state_features list of state feature names
        :param action_features list of action feature names
        """
        # ensure state and action IDs have no intersection
        assert (len(
            set(state_normalization_parameters.keys())
            & set(action_normalization_parameters.keys())) == 0)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob('input/float_features.lengths',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.keys',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.values',
                           np.zeros(1, dtype=np.float32))

        preprocessor = PreprocessorNet(net, True)
        parameters = []
        parameters.extend(preprocessor.parameters)
        state_normalized_dense_matrix, new_parameters = \
            preprocessor.normalize_sparse_matrix(
                'input/float_features.lengths',
                'input/float_features.keys',
                'input/float_features.values',
                state_normalization_parameters,
                'state_norm',
            )
        parameters.extend(new_parameters)
        action_normalized_dense_matrix, new_parameters = \
            preprocessor.normalize_sparse_matrix(
                'input/float_features.lengths',
                'input/float_features.keys',
                'input/float_features.values',
                action_normalization_parameters,
                'action_norm',
            )
        parameters.extend(new_parameters)
        state_action_normalized = 'state_action_normalized'
        state_action_normalized_dim = 'state_action_normalized_dim'
        net.Concat(
            [state_normalized_dense_matrix, action_normalized_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1)
        new_parameters, q_values = RLPredictor._forward_pass(
            model,
            trainer,
            state_action_normalized,
            ['Q'],
        )
        parameters.extend(new_parameters)

        flat_q_values_key = \
            'output/string_weighted_multi_categorical_features.values.values'
        num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1])
        q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1])

        # Get 1 x n (number of examples) action index tensor under the max_q policy
        max_q_act_idxs = 'max_q_policy_actions'
        C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs])
        max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0)

        # Get 1 x n (number of examples) action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32))
        tempered_q_values = C2.Div(q_value_blob, "temperature", broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = 'softmax_act_idxs_nested'
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_blob = C2.Tile(C2.FlattenToVec(softmax_act_idxs_nested),
                                   num_examples,
                                   axis=0)

        # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]]
        # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_blob,
                                 to=caffe2_pb2.TensorProto.INT64)
        softmax_act_blob = C2.Cast(softmax_act_blob,
                                   to=caffe2_pb2.TensorProto.INT64)
        max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1])
        softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob,
                                                shape=[1, -1])
        C2.net().Append([max_q_act_blob_nested, softmax_act_blob_nested],
                        [max_q_act_blob_nested])
        transposed_action_idxs = C2.Transpose(max_q_act_blob_nested)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        output_values = 'output/int_single_categorical_features.values'
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
        C2.net().Copy([flat_transposed_action_idxs], [output_values])

        output_lengths = 'output/int_single_categorical_features.lengths'
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill([flat_q_values_key], [output_lengths],
                              value=2,
                              dtype=caffe2_pb2.TensorProto.INT32)

        output_keys = 'output/int_single_categorical_features.keys'
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1],
                            value=0,
                            dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1],
                            value=1,
                            dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0)
        C2.net().FlattenToVec([output_key_tile], [output_keys])

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        return ContinuousActionDQNPredictor(net, parameters)
예제 #7
0
    def preprocess_samples(self, samples: Samples,
                           minibatch_size: int) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples.shuffle()

        logger.info("Sparse2Dense...")
        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.actions, "action")
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "action_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_actions,
                                                     "next_action")
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "next_action_norm",
            False,
            False,
            False,
        )
        action_probabilities = torch.tensor(samples.action_probabilities,
                                            dtype=torch.float32).reshape(
                                                -1, 1)
        rewards = torch.tensor(samples.rewards,
                               dtype=torch.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat: List[List[str]] = []
        for pnas in samples.possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     "possible_next_actions")

        pnas_lengths = torch.tensor(pnas_lengths_list, dtype=torch.int32)
        pna_lens_blob = "pna_lens_blob"
        workspace.FeedBlob(pna_lens_blob, pnas_lengths.numpy())

        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            "possible_next_action_norm",
            False,
            False,
            False,
        )

        state_pnas_tile_blob = C2.LengthsTile(next_state_matrix, pna_lens_blob)

        workspace.RunNetOnce(net)

        logger.info("Preprocessing...")
        state_preprocessor = Preprocessor(self.normalization, False)
        action_preprocessor = Preprocessor(self.normalization_action, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = state_preprocessor.forward(states_ndarray)

        actions_ndarray = workspace.FetchBlob(action_matrix)
        actions_ndarray = action_preprocessor.forward(actions_ndarray)

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = state_preprocessor.forward(next_states_ndarray)

        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        next_actions_ndarray = action_preprocessor.forward(
            next_actions_ndarray)

        logged_possible_next_actions = action_preprocessor.forward(
            workspace.FetchBlob(possible_next_actions_matrix))

        state_pnas_tile = state_preprocessor.forward(
            workspace.FetchBlob(state_pnas_tile_blob))
        logged_possible_next_state_actions = torch.cat(
            (state_pnas_tile, logged_possible_next_actions), dim=1)

        logger.info("Reward Timeline to Torch...")
        possible_next_actions_ndarray = logged_possible_next_actions
        possible_next_actions_state_concat = logged_possible_next_state_actions
        time_diffs = torch.ones([len(samples.states), 1])
        episode_values = torch.tensor(samples.episode_values,
                                      dtype=torch.float32).reshape(-1, 1)

        tdps = []
        pnas_start = 0
        logger.info("Batching...")
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + torch.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_concat = possible_next_actions_state_concat[
                pnas_start:pnas_end]
            pnas_start = pnas_end
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_ndarray[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                next_actions=next_actions_ndarray[start:end],
                possible_next_actions=None,
                not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                episode_values=episode_values[start:end]
                if episode_values is not None else None,
                time_diffs=time_diffs[start:end],
                possible_next_actions_lengths=pnas_lengths[start:end],
                possible_next_actions_state_concat=pnas_concat,
            )
            tdp.set_type(torch.FloatTensor)
            tdps.append(tdp)
        return tdps
예제 #8
0
    def export(
        cls,
        trainer,
        actions,
        state_normalization_parameters,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch DQN forward pass as one
        caffe2 net.

        :param trainer DQNTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """

        input_dim = trainer.num_features

        if isinstance(trainer.q_network, DataParallel):
            trainer.q_network = trainer.q_network.module

        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            PreprocesserAndForwardPassContainer(
                Preprocessor(state_normalization_parameters, model_on_gpu),
                trainer.q_network,
            ),
            input_dim,
            model_on_gpu,
        )
        qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer)
        torch_workspace = caffe2_netdef.workspace

        parameters = torch_workspace.Blobs()
        for blob_str in parameters:
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)
        # While converting to metanetdef, the external_input of predict_net
        # will be recomputed. Add the real output of init_net to parameters
        # to make sure they will be counted.
        parameters.extend(
            set(caffe2_netdef.init_net.external_output) -
            set(caffe2_netdef.init_net.external_input))

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/image", np.zeros([1, 1, 1, 1],
                                                   dtype=np.int32))
        workspace.FeedBlob("input/float_features.lengths",
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys",
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values",
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob("input/int_features.lengths",
                               np.zeros(1, dtype=np.int32))
            workspace.FeedBlob("input/int_features.keys",
                               np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values",
                               np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [
                    input_feature_lengths, input_feature_keys,
                    input_feature_values
                ],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"],
                          [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"],
                          [input_feature_values])

        if state_normalization_parameters is not None:
            preprocessor = PreprocessorNet(clip_anomalies=True)
            state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
                input_feature_lengths,
                input_feature_keys,
                input_feature_values,
                state_normalization_parameters,
                blobname_prefix="state_norm",
                split_sparse_to_dense=False,
                split_expensive_feature_groups=False,
                normalize=False,
            )
            parameters.extend(new_parameters)
        else:
            # Image input.  Note: Currently this does the wrong thing if
            #   more than one image is passed at a time.
            state_normalized_dense_matrix = "input/image"

        net.Copy([state_normalized_dense_matrix], [qnet_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        new_parameters, q_values = RLPredictor._forward_pass(
            model, trainer, state_normalized_dense_matrix, actions,
            qnet_output_blob)
        parameters.extend(new_parameters)

        # Get 1 x n action index tensor under the max_q policy
        max_q_act_idxs = "max_q_policy_actions"
        C2.net().Flatten([C2.ArgMax(q_values)], [max_q_act_idxs], axis=0)
        shape_of_num_of_states = "num_states_shape"
        C2.net().FlattenToVec([max_q_act_idxs], [shape_of_num_of_states])
        num_states, _ = C2.Reshape(C2.Size(shape_of_num_of_states), shape=[1])

        # Get 1 x n action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32))
        tempered_q_values = C2.Div(q_values, temperature, broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = "softmax_act_idxs_nested"
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_idxs = "softmax_policy_actions"
        C2.net().Flatten([softmax_act_idxs_nested], [softmax_act_idxs], axis=0)

        action_names = C2.NextBlob("action_names")
        parameters.append(action_names)
        workspace.FeedBlob(action_names, np.array(actions))

        # Concat action index tensors to get 2 x n tensor - [[max_q], [softmax]]
        # transpose & flatten to get [a1_maxq, a1_softmax, a2_maxq, a2_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_idxs,
                                 to=caffe2_pb2.TensorProto.INT32)
        softmax_act_blob = C2.Cast(softmax_act_idxs,
                                   to=caffe2_pb2.TensorProto.INT32)
        C2.net().Append([max_q_act_blob, softmax_act_blob], [max_q_act_blob])
        transposed_action_idxs = C2.Transpose(max_q_act_blob)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        workspace.FeedBlob(OUTPUT_SINGLE_CAT_VALS_NAME,
                           np.zeros(1, dtype=np.int64))
        C2.net().Gather([action_names, flat_transposed_action_idxs],
                        [OUTPUT_SINGLE_CAT_VALS_NAME])

        workspace.FeedBlob(OUTPUT_SINGLE_CAT_LENGTHS_NAME,
                           np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [shape_of_num_of_states],
            [OUTPUT_SINGLE_CAT_LENGTHS_NAME],
            value=2,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        workspace.FeedBlob(OUTPUT_SINGLE_CAT_KEYS_NAME,
                           np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1],
                            value=0,
                            dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1],
                            value=1,
                            dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_states, axis=0)
        C2.net().FlattenToVec([output_key_tile], [OUTPUT_SINGLE_CAT_KEYS_NAME])

        workspace.CreateNet(net)
        return DQNPredictor(net, torch_init_net, parameters, int_features)
예제 #9
0
    def export_actor(
        cls,
        trainer,
        state_normalization_parameters,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch actor forward pass as one
        caffe2 net.

        :param trainer DDPGTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param min_action_range_tensor_serving pytorch tensor that specifies
            min action value for each dimension
        :param max_action_range_tensor_serving pytorch tensor that specifies
            min action value for each dimension
        :param state_normalization_parameters state NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """
        input_dim = trainer.state_dim
        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.actor, input_dim, model_on_gpu
        )
        actor_input_blob, actor_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer
        )
        torch_workspace = caffe2_netdef.workspace

        parameters = torch_workspace.Blobs()
        for blob_str in parameters:
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        # Feed action scaling tensors for serving
        min_action_serving_blob = C2.NextBlob("min_action_range_tensor_serving")
        workspace.FeedBlob(
            min_action_serving_blob, min_action_range_tensor_serving.cpu().data.numpy()
        )
        parameters.append(str(min_action_serving_blob))

        max_action_serving_blob = C2.NextBlob("max_action_range_tensor_serving")
        workspace.FeedBlob(
            max_action_serving_blob, max_action_range_tensor_serving.cpu().data.numpy()
        )
        parameters.append(str(max_action_serving_blob))

        # Feed action scaling tensors for training [-1, 1] due to tanh actor
        min_vals_training = trainer.min_action_range_tensor_training.cpu().data.numpy()
        min_action_training_blob = C2.NextBlob("min_action_range_tensor_training")
        workspace.FeedBlob(min_action_training_blob, min_vals_training)
        parameters.append(str(min_action_training_blob))

        max_vals_training = trainer.max_action_range_tensor_training.cpu().data.numpy()
        max_action_training_blob = C2.NextBlob("max_action_range_tensor_training")
        workspace.FeedBlob(max_action_training_blob, max_vals_training)
        parameters.append(str(max_action_training_blob))

        workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob(
                "input/int_features.lengths", np.zeros(1, dtype=np.int32)
            )
            workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [input_feature_lengths, input_feature_keys, input_feature_values],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"], [input_feature_values])

        preprocessor = PreprocessorNet(True)
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            "state_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        net.Copy([state_normalized_dense_matrix], [actor_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        C2.FlattenToVec(C2.ArgMax(actor_output_blob))
        output_lengths = "output/float_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
            [output_lengths],
            value=trainer.actor.layers[-1].out_features,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        output_keys = "output/float_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int32))
        C2.net().LengthsRangeFill([output_lengths], [output_keys])

        output_values = "output/float_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
        # Scale actors actions from [-1, 1] to serving range
        prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
        new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
        subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
        div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
        scaled_for_serving_actions = C2.Add(
            C2.Mul(div_by_prev_range, new_range), min_action_serving_blob
        )
        C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

        workspace.CreateNet(net)
        return DDPGPredictor(net, parameters, int_features)
예제 #10
0
    def export_critic(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch critic forward pass as one
        caffe2 net.

        :param trainer DDPGTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param action_normalization_parameters action NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        """
        input_dim = trainer.state_dim + trainer.action_dim
        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.critic, input_dim, model_on_gpu
        )
        critic_input_blob, critic_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer
        )
        torch_workspace = caffe2_netdef.workspace

        parameters = []
        for blob_str in torch_workspace.Blobs():
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))
            parameters.append(blob_str)

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob(
                "input/int_features.lengths", np.zeros(1, dtype=np.int32)
            )
            workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [input_feature_lengths, input_feature_keys, input_feature_values],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"], [input_feature_values])

        preprocessor = PreprocessorNet(True)
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            "state_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)

        # Don't normalize actions, just go from sparse -> dense
        action_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            action_normalization_parameters,
            "action_norm",
            False,
            False,
            normalize=False,
        )
        parameters.extend(new_parameters)
        state_action_normalized = "state_action_normalized"
        state_action_normalized_dim = "state_action_normalized_dim"
        net.Concat(
            [state_normalized_dense_matrix, action_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1,
        )
        net.Copy([state_action_normalized], [critic_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_init_net)
        net.AppendNet(torch_predict_net)

        C2.FlattenToVec(C2.ArgMax(critic_output_blob))
        output_lengths = "output/float_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [C2.FlattenToVec(C2.ArgMax(critic_output_blob))],
            [output_lengths],
            value=trainer.critic.layers[-1].out_features,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        output_keys = "output/float_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int32))
        C2.net().LengthsRangeFill([output_lengths], [output_keys])

        output_values = "output/float_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
        C2.net().FlattenToVec([critic_output_blob], [output_values])

        workspace.CreateNet(net)
        return DDPGPredictor(net, parameters, int_features)
    def export(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
        int_features=False,
    ):
        """ Creates a ContinuousActionDQNPredictor from a ContinuousActionDQNTrainer.

        :param trainer ContinuousActionDQNTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param action_normalization_parameters action NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        """
        # ensure state and action IDs have no intersection
        assert (len(
            set(state_normalization_parameters.keys())
            & set(action_normalization_parameters.keys())) == 0)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/float_features.lengths",
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys",
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values",
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob("input/int_features.lengths",
                               np.zeros(1, dtype=np.int32))
            workspace.FeedBlob("input/int_features.keys",
                               np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values",
                               np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [
                    input_feature_lengths, input_feature_keys,
                    input_feature_values
                ],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"],
                          [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"],
                          [input_feature_values])

        preprocessor = PreprocessorNet(True)
        parameters = []
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            "state_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        action_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            action_normalization_parameters,
            "action_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        state_action_normalized = "state_action_normalized"
        state_action_normalized_dim = "state_action_normalized_dim"
        net.Concat(
            [state_normalized_dense_matrix, action_normalized_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1,
        )
        new_parameters, q_values = RLPredictor._forward_pass(
            model, trainer, state_action_normalized, ["Q"])
        parameters.extend(new_parameters)

        flat_q_values_key = (
            "output/string_weighted_multi_categorical_features.values.values")
        num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1])
        q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1])

        # Get 1 x n (number of examples) action index tensor under the max_q policy
        max_q_act_idxs = "max_q_policy_actions"
        C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs])
        max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0)

        # Get 1 x n (number of examples) action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32))
        tempered_q_values = C2.Div(q_value_blob, temperature, broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = "softmax_act_idxs_nested"
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_blob = C2.Tile(C2.FlattenToVec(softmax_act_idxs_nested),
                                   num_examples,
                                   axis=0)

        # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]]
        # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_blob,
                                 to=caffe2_pb2.TensorProto.INT64)
        softmax_act_blob = C2.Cast(softmax_act_blob,
                                   to=caffe2_pb2.TensorProto.INT64)
        max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1])
        softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob,
                                                shape=[1, -1])
        C2.net().Append([max_q_act_blob_nested, softmax_act_blob_nested],
                        [max_q_act_blob_nested])
        transposed_action_idxs = C2.Transpose(max_q_act_blob_nested)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        output_values = "output/int_single_categorical_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
        C2.net().Copy([flat_transposed_action_idxs], [output_values])

        output_lengths = "output/int_single_categorical_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [flat_q_values_key],
            [output_lengths],
            value=2,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        output_keys = "output/int_single_categorical_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1],
                            value=0,
                            dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1],
                            value=1,
                            dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0)
        C2.net().FlattenToVec([output_key_tile], [output_keys])

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        return ContinuousActionDQNPredictor(net, parameters, int_features)
예제 #12
0
    def preprocess_samples(
        self,
        states: List[Dict[int, float]],
        actions: List[Dict[int, float]],
        rewards: List[float],
        next_states: List[Dict[int, float]],
        next_actions: List[Dict[int, float]],
        is_terminals: List[bool],
        possible_next_actions: List[List[Dict[int, float]]],
        reward_timelines: List[Dict[int, float]],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        merged = list(
            zip(states, actions, rewards, next_states, next_actions,
                is_terminals, possible_next_actions, reward_timelines))
        random.shuffle(merged)
        states, actions, rewards, next_states, next_actions, is_terminals, \
            possible_next_actions, reward_timelines = zip(*merged)

        net = core.Net('gridworld_preprocessing')
        C2.set_net(net)
        preprocessor = PreprocessorNet(net, True)
        saa = StackedAssociativeArray.from_dict_list(states, 'states')
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_states,
                                                     'next_states')
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'next_state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(actions, 'action')
        action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'action_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_actions,
                                                     'next_action')
        next_action_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'next_action_norm',
        )
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)

        pnas_lengths_list = []
        pnas_flat = []
        for pnas in possible_next_actions:
            pnas_lengths_list.append(len(pnas))
            pnas_flat.extend(pnas)
        saa = StackedAssociativeArray.from_dict_list(pnas_flat,
                                                     'possible_next_actions')
        pnas_lengths = np.array(pnas_lengths_list, dtype=np.int32)
        possible_next_actions_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization_action,
            'possible_next_action_norm',
        )
        workspace.RunNetOnce(net)

        states_ndarray = workspace.FetchBlob(state_matrix)
        actions_ndarray = workspace.FetchBlob(action_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_actions_ndarray = workspace.FetchBlob(next_action_matrix)
        possible_next_actions_ndarray = workspace.FetchBlob(
            possible_next_actions_matrix)
        tdps = []
        pnas_start = 0
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            pnas_end = pnas_start + np.sum(pnas_lengths[start:end])
            pnas = possible_next_actions_ndarray[pnas_start:pnas_end]
            pnas_start = pnas_end
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_ndarray[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    next_actions=next_actions_ndarray[start:end],
                    possible_next_actions=StackedArray(pnas_lengths[start:end],
                                                       pnas),
                    not_terminals=(pnas_lengths[start:end] > 0).reshape(-1, 1),
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines else None,
                ))
        return tdps
    def export(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
    ):
        """ Creates ContinuousActionDQNPredictor from a list of action trainers

        :param trainer ContinuousActionDQNPredictor
        :param state_features list of state feature names
        :param action_features list of action feature names
        """
        # ensure state and action IDs have no intersection
        assert (
            len(
                set(state_normalization_parameters.keys()) &
                set(action_normalization_parameters.keys())
            ) == 0
        )

        model = model_helper.ModelHelper(name="predictor")
        net = model.net

        workspace.FeedBlob(
            'input/float_features.lengths', np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob(
            'input/float_features.keys', np.zeros(1, dtype=np.int32)
        )
        workspace.FeedBlob(
            'input/float_features.values', np.zeros(1, dtype=np.float32)
        )

        preprocessor = PreprocessorNet(net, True)
        parameters = []
        parameters.extend(preprocessor.parameters)
        state_normalized_dense_matrix, new_parameters = \
            preprocessor.normalize_sparse_matrix(
                'input/float_features.lengths',
                'input/float_features.keys',
                'input/float_features.values',
                state_normalization_parameters,
                'state_norm',
            )
        parameters.extend(new_parameters)
        action_normalized_dense_matrix, new_parameters = \
            preprocessor.normalize_sparse_matrix(
                'input/float_features.lengths',
                'input/float_features.keys',
                'input/float_features.values',
                action_normalization_parameters,
                'action_norm',
            )
        parameters.extend(new_parameters)
        state_action_normalized = 'state_action_normalized'
        state_action_normalized_dim = 'state_action_normalized_dim'
        net.Concat(
            [state_normalized_dense_matrix, action_normalized_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1
        )
        new_parameters = RLPredictor._forward_pass(
            model,
            trainer,
            state_action_normalized,
            ['Q'],
        )
        parameters.extend(new_parameters)

        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(net)
        return ContinuousActionDQNPredictor(net, parameters)
예제 #14
0
    def preprocess_samples_discrete(
        self,
        states: List[Dict[int, float]],
        actions: List[str],
        rewards: List[float],
        next_states: List[Dict[int, float]],
        next_actions: List[str],
        is_terminals: List[bool],
        possible_next_actions: List[List[str]],
        reward_timelines: Optional[List[Dict[int, float]]],
        minibatch_size: int,
    ) -> List[TrainingDataPage]:
        # Shuffle
        if reward_timelines is None:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions = zip(*merged)
        else:
            merged = list(
                zip(states, actions, rewards, next_states, next_actions,
                    is_terminals, possible_next_actions, reward_timelines))
            random.shuffle(merged)
            states, actions, rewards, next_states, next_actions, \
                is_terminals, possible_next_actions, reward_timelines = zip(*merged)

        net = core.Net('gridworld_preprocessing')
        C2.set_net(net)
        preprocessor = PreprocessorNet(net, True)
        saa = StackedAssociativeArray.from_dict_list(states, 'states')
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'state_norm',
        )
        saa = StackedAssociativeArray.from_dict_list(next_states,
                                                     'next_states')
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            'next_state_norm',
        )
        workspace.RunNetOnce(net)
        actions_one_hot = np.zeros(
            [len(actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(actions):
            actions_one_hot[i, self.ACTIONS.index(action)] = 1
        rewards = np.array(rewards, dtype=np.float32).reshape(-1, 1)
        next_actions_one_hot = np.zeros(
            [len(next_actions), len(self.ACTIONS)], dtype=np.float32)
        for i, action in enumerate(next_actions):
            if action == '':
                continue
            next_actions_one_hot[i, self.ACTIONS.index(action)] = 1
        possible_next_actions_mask = []
        for pna in possible_next_actions:
            pna_mask = [0] * self.num_actions
            for action in pna:
                pna_mask[self.ACTIONS.index(action)] = 1
            possible_next_actions_mask.append(pna_mask)
        possible_next_actions_mask = np.array(possible_next_actions_mask,
                                              dtype=np.float32)
        is_terminals = np.array(is_terminals, dtype=np.bool).reshape(-1, 1)
        not_terminals = np.logical_not(is_terminals)
        if reward_timelines is not None:
            reward_timelines = np.array(reward_timelines, dtype=np.object)

        states_ndarray = workspace.FetchBlob(state_matrix)
        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdps.append(
                TrainingDataPage(
                    states=states_ndarray[start:end],
                    actions=actions_one_hot[start:end],
                    rewards=rewards[start:end],
                    next_states=next_states_ndarray[start:end],
                    not_terminals=not_terminals[start:end],
                    next_actions=next_actions_one_hot[start:end],
                    possible_next_actions=possible_next_actions_mask[
                        start:end],
                    reward_timelines=reward_timelines[start:end]
                    if reward_timelines is not None else None,
                ))
        return tdps
예제 #15
0
    def export(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch DQN forward pass as one
        caffe2 net.

        :param trainer ParametricDQNTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param action_normalization_parameters action NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """

        input_dim = trainer.num_features
        if isinstance(trainer.q_network, DataParallel):
            trainer.q_network = trainer.q_network.module

        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.q_network, input_dim, model_on_gpu)
        qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer)
        torch_workspace = caffe2_netdef.workspace

        parameters = torch_workspace.Blobs()
        for blob_str in parameters:
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)
        # While converting to metanetdef, the external_input of predict_net
        # will be recomputed. Add the real output of init_net to parameters
        # to make sure they will be counted.
        parameters.extend(
            set(caffe2_netdef.init_net.external_output) -
            set(caffe2_netdef.init_net.external_input))

        # ensure state and action IDs have no intersection
        assert (len(
            set(state_normalization_parameters.keys())
            & set(action_normalization_parameters.keys())) == 0)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/float_features.lengths",
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys",
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values",
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob("input/int_features.lengths",
                               np.zeros(1, dtype=np.int32))
            workspace.FeedBlob("input/int_features.keys",
                               np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values",
                               np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [
                    input_feature_lengths, input_feature_keys,
                    input_feature_values
                ],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"],
                          [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"],
                          [input_feature_values])

        preprocessor = PreprocessorNet(clip_anomalies=True)

        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            "state_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        action_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            action_normalization_parameters,
            "action_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        state_action_normalized = "state_action_normalized"
        state_action_normalized_dim = "state_action_normalized_dim"
        net.Concat(
            [state_normalized_dense_matrix, action_normalized_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1,
        )

        net.Copy([state_action_normalized], [qnet_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        new_parameters, q_values = RLPredictor._forward_pass(
            model, trainer, state_action_normalized, ["Q"], qnet_output_blob)
        parameters.extend(new_parameters)

        flat_q_values_key = (
            "output/string_weighted_multi_categorical_features.values.values")
        num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1])
        q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1])

        # Get 1 x n (number of examples) action index tensor under the max_q policy
        max_q_act_idxs = "max_q_policy_actions"
        C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs])
        max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0)

        # Get 1 x n (number of examples) action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32))
        tempered_q_values = C2.Div(q_value_blob, temperature, broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = "softmax_act_idxs_nested"
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_blob = C2.Tile(C2.FlattenToVec(softmax_act_idxs_nested),
                                   num_examples,
                                   axis=0)

        # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]]
        # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_blob,
                                 to=caffe2_pb2.TensorProto.INT64)
        softmax_act_blob = C2.Cast(softmax_act_blob,
                                   to=caffe2_pb2.TensorProto.INT64)
        max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1])
        softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob,
                                                shape=[1, -1])
        C2.net().Append([max_q_act_blob_nested, softmax_act_blob_nested],
                        [max_q_act_blob_nested])
        transposed_action_idxs = C2.Transpose(max_q_act_blob_nested)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        output_values = "output/int_single_categorical_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
        C2.net().Copy([flat_transposed_action_idxs], [output_values])

        output_lengths = "output/int_single_categorical_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [flat_q_values_key],
            [output_lengths],
            value=2,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        output_keys = "output/int_single_categorical_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1],
                            value=0,
                            dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1],
                            value=1,
                            dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0)
        C2.net().FlattenToVec([output_key_tile], [output_keys])

        workspace.CreateNet(net)
        return ParametricDQNPredictor(net, torch_init_net, parameters,
                                      int_features)
예제 #16
0
    def preprocess_samples_discrete(
            self,
            samples: Samples,
            minibatch_size: int,
            one_hot_action: bool = True) -> List[TrainingDataPage]:
        logger.info("Shuffling...")
        samples.shuffle()
        logger.info("Preprocessing...")

        net = core.Net("gridworld_preprocessing")
        C2.set_net(net)
        preprocessor = PreprocessorNet(True)
        saa = StackedAssociativeArray.from_dict_list(samples.states, "states")
        state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "state_norm",
            False,
            False,
            False,
        )
        saa = StackedAssociativeArray.from_dict_list(samples.next_states,
                                                     "next_states")
        next_state_matrix, _ = preprocessor.normalize_sparse_matrix(
            saa.lengths,
            saa.keys,
            saa.values,
            self.normalization,
            "next_state_norm",
            False,
            False,
            False,
        )
        workspace.RunNetOnce(net)

        logger.info("Converting to Torch...")
        actions_one_hot = torch.tensor((np.array(samples.actions).reshape(
            -1, 1) == np.array(self.ACTIONS)).astype(np.int64))
        actions = actions_one_hot.argmax(dim=1, keepdim=True)
        rewards = torch.tensor(samples.rewards,
                               dtype=torch.float32).reshape(-1, 1)
        action_probabilities = torch.tensor(samples.action_probabilities,
                                            dtype=torch.float32).reshape(
                                                -1, 1)
        next_actions_one_hot = torch.tensor(
            (np.array(samples.next_actions).reshape(-1, 1) == np.array(
                self.ACTIONS)).astype(np.int64))
        logger.info("Converting PNA to Torch...")
        possible_next_action_strings = np.array(
            list(
                itertools.zip_longest(*samples.possible_next_actions,
                                      fillvalue=""))).T
        possible_next_actions_mask = torch.zeros(
            [len(samples.next_actions),
             len(self.ACTIONS)])
        for i, action in enumerate(self.ACTIONS):
            possible_next_actions_mask[:, i] = torch.tensor(
                np.max(possible_next_action_strings == action,
                       axis=1).astype(np.int64))
        terminals = torch.tensor(samples.terminals,
                                 dtype=torch.int32).reshape(-1, 1)
        not_terminals = 1 - terminals
        episode_values = None
        logger.info("Converting RT to Torch...")
        episode_values = torch.tensor(samples.episode_values,
                                      dtype=torch.float32).reshape(-1, 1)

        time_diffs = torch.ones([len(samples.states), 1])

        logger.info("Preprocessing...")
        preprocessor = Preprocessor(self.normalization, False)

        states_ndarray = workspace.FetchBlob(state_matrix)
        states_ndarray = preprocessor.forward(states_ndarray)

        next_states_ndarray = workspace.FetchBlob(next_state_matrix)
        next_states_ndarray = preprocessor.forward(next_states_ndarray)

        logger.info("Batching...")
        tdps = []
        for start in range(0, states_ndarray.shape[0], minibatch_size):
            end = start + minibatch_size
            if end > states_ndarray.shape[0]:
                break
            tdp = TrainingDataPage(
                states=states_ndarray[start:end],
                actions=actions_one_hot[start:end]
                if one_hot_action else actions[start:end],
                propensities=action_probabilities[start:end],
                rewards=rewards[start:end],
                next_states=next_states_ndarray[start:end],
                not_terminals=not_terminals[start:end],
                next_actions=next_actions_one_hot[start:end],
                possible_next_actions=possible_next_actions_mask[start:end],
                episode_values=episode_values[start:end]
                if episode_values is not None else None,
                time_diffs=time_diffs[start:end],
            )
            tdp.set_type(torch.FloatTensor)
            tdps.append(tdp)
        return tdps
예제 #17
0
    def export_actor(cls,
                     trainer,
                     state_normalization_parameters,
                     int_features=False):
        """Export caffe2 preprocessor net and pytorch actor forward pass as one
        caffe2 net.

        :param trainer DDPGTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        """
        input_dim = len(state_normalization_parameters)
        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.actor, input_dim)
        actor_input_blob, actor_output_blob, caffe2_netdef =\
            PytorchCaffe2Converter.buffer_to_caffe2_netdef(buffer)
        torch_workspace = caffe2_netdef.workspace

        parameters = []
        for blob_str in torch_workspace.Blobs():
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))
            parameters.append(blob_str)

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob('input/float_features.lengths',
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob('input/float_features.keys',
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob('input/float_features.values',
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = 'input_feature_lengths'
        input_feature_keys = 'input_feature_keys'
        input_feature_values = 'input_feature_values'

        if int_features:
            workspace.FeedBlob('input/int_features.lengths',
                               np.zeros(1, dtype=np.int32))
            workspace.FeedBlob('input/int_features.keys',
                               np.zeros(1, dtype=np.int64))
            workspace.FeedBlob('input/int_features.values',
                               np.zeros(1, dtype=np.int32))
            C2.net().Cast(['input/int_features.values'],
                          ['input/int_features.values_float'],
                          dtype=caffe2_pb2.TensorProto.FLOAT)
            C2.net().MergeMultiScalarFeatureTensors([
                'input/float_features.lengths', 'input/float_features.keys',
                'input/float_features.values', 'input/int_features.lengths',
                'input/int_features.keys', 'input/int_features.values_float'
            ], [
                input_feature_lengths, input_feature_keys, input_feature_values
            ])
        else:
            C2.net().Copy(['input/float_features.lengths'],
                          [input_feature_lengths])
            C2.net().Copy(['input/float_features.keys'], [input_feature_keys])
            C2.net().Copy(['input/float_features.values'],
                          [input_feature_values])

        preprocessor = PreprocessorNet(net, True)
        parameters.extend(preprocessor.parameters)
        state_normalized_dense_matrix, new_parameters = \
            preprocessor.normalize_sparse_matrix(
                input_feature_lengths,
                input_feature_keys,
                input_feature_values,
                state_normalization_parameters,
                'state_norm',
            )
        parameters.extend(new_parameters)
        net.Copy([state_normalized_dense_matrix], [actor_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_init_net)
        net.AppendNet(torch_predict_net)

        C2.FlattenToVec(C2.ArgMax(actor_output_blob))
        output_lengths = 'output/float_features.lengths'
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill([C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
                              [output_lengths],
                              value=trainer.actor.layers[-1].out_features,
                              dtype=caffe2_pb2.TensorProto.INT32)

        output_keys = 'output/float_features.keys'
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int32))
        C2.net().LengthsRangeFill([output_lengths], [output_keys])

        output_values = 'output/float_features.values'
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
        C2.net().FlattenToVec([actor_output_blob], [output_values])

        workspace.CreateNet(net)
        return DDPGPredictor(net, parameters, int_features)