示例#1
0
    def process(
        self,
        sorted_features: List[int],
        sparse_data: StackedAssociativeArray,
        set_missing_value_to_zero: bool = False,
    ) -> Tuple[str, List[str]]:
        """Turn a stacked sparse feature array into a dense matrix blob.

        Args:
            sorted_features: Feature ids handed to ``SparseToDenseMask`` as
                its ``mask``. Must be non-empty.
            sparse_data: Object whose ``lengths``, ``keys`` and ``values``
                attributes are used as the sparse inputs.
            set_missing_value_to_zero: When true the fill blob holds ``0.0``;
                otherwise it holds ``MISSING_VALUE``.

        Returns:
            ``(dense_input, parameters)`` where ``dense_input`` is the first
            output of ``SparseToDenseMask`` and ``parameters`` is a
            one-element list containing the fill blob created here.

        Raises:
            AssertionError: If ``sorted_features`` is empty.
        """
        # Validate first: failing after the calls below would leave a stray
        # MISSING_SCALAR blob in the workspace and a fill op on the net.
        assert len(sorted_features) > 0, "Sorted features is empty"

        lengths_blob = sparse_data.lengths
        keys_blob = sparse_data.keys
        values_blob = sparse_data.values

        MISSING_SCALAR = C2.NextBlob("MISSING_SCALAR")
        missing_value = 0.0 if set_missing_value_to_zero else MISSING_VALUE
        # The fill value is both fed into the workspace and registered as a
        # GivenTensorFill op on the net.
        # NOTE(review): the fed array is 1-D with one element while the fill
        # op declares shape=[] -- confirm the difference is intentional.
        workspace.FeedBlob(MISSING_SCALAR,
                           np.array([missing_value], dtype=np.float32))
        C2.net().GivenTensorFill([], [MISSING_SCALAR],
                                 shape=[],
                                 values=[missing_value])

        parameters: List[str] = [MISSING_SCALAR]

        # MISSING_SCALAR is passed as the third input of SparseToDenseMask
        # (presumably the default for features absent from a row -- see the
        # operator documentation).
        dense_input = C2.SparseToDenseMask(keys_blob,
                                           values_blob,
                                           MISSING_SCALAR,
                                           lengths_blob,
                                           mask=sorted_features)[0]

        return dense_input, parameters
    def normalize_sparse_matrix(
        self,
        lengths_blob: str,
        keys_blob: str,
        values_blob: str,
        normalization_parameters: Dict[str, NormalizationParameters],
        blobname_prefix: str,
        split_expensive_feature_groups: bool = False,
    ) -> Tuple[str, List[str]]:
        """Densify a sparse feature matrix and normalize the result.

        The feature order comes from ``sort_features_by_normalization``; the
        same order (converted to ints) is used as the ``SparseToDenseMask``
        mask, with ``self.MISSING_SCALAR`` as its third input.

        Args:
            lengths_blob: Lengths input for ``SparseToDenseMask``.
            keys_blob: Keys input for ``SparseToDenseMask``.
            values_blob: Values input for ``SparseToDenseMask``.
            normalization_parameters: Per-feature normalization settings,
                forwarded to ``normalize_dense_matrix``.
            blobname_prefix: Forwarded to ``normalize_dense_matrix``.
            split_expensive_feature_groups: Forwarded to
                ``normalize_dense_matrix``.

        Returns:
            Whatever ``self.normalize_dense_matrix`` returns for the dense
            blob.
        """
        feature_order, _ = sort_features_by_normalization(
            normalization_parameters
        )
        # The mask is built from the integer form of each feature id.
        mask_ids = list(map(int, feature_order))

        dense_blob, _ = C2.SparseToDenseMask(
            keys_blob,
            values_blob,
            self.MISSING_SCALAR,
            lengths_blob,
            mask=mask_ids,
        )

        normalized = self.normalize_dense_matrix(
            dense_blob,
            feature_order,
            normalization_parameters,
            blobname_prefix,
            split_expensive_feature_groups,
        )
        return normalized
示例#3
0
def sparse_to_dense(lengths_blob: str, keys_blob: str, values_blob: str,
                    sorted_features: List[int]) -> Tuple[str, List[str]]:
    """Turn sparse (lengths, keys, values) blobs into a dense matrix blob.

    Args:
        lengths_blob: Lengths input for ``SparseToDenseMask``.
        keys_blob: Keys input for ``SparseToDenseMask``.
        values_blob: Values input for ``SparseToDenseMask``.
        sorted_features: Feature ids used as the ``mask``. Must be non-empty.

    Returns:
        ``(dense_input, parameters)`` where ``dense_input`` is the first
        output of ``SparseToDenseMask`` and ``parameters`` is a one-element
        list containing the fill blob created here.

    Raises:
        AssertionError: If ``sorted_features`` is empty.
    """
    # Validate first: failing after the calls below would leave a stray
    # MISSING_SCALAR blob in the workspace and a fill op on the net.
    assert len(sorted_features) > 0, "Sorted features is empty"

    MISSING_SCALAR = C2.NextBlob("MISSING_SCALAR")
    # The fill value is both fed into the workspace and registered as a
    # GivenTensorFill op on the net.
    workspace.FeedBlob(MISSING_SCALAR,
                       np.array([MISSING_VALUE], dtype=np.float32))
    C2.net().GivenTensorFill([], [MISSING_SCALAR],
                             shape=[],
                             values=[MISSING_VALUE])

    parameters: List[str] = [MISSING_SCALAR]

    dense_input = C2.SparseToDenseMask(keys_blob,
                                       values_blob,
                                       MISSING_SCALAR,
                                       lengths_blob,
                                       mask=sorted_features)[0]

    return dense_input, parameters
示例#4
0
    def normalize_sparse_matrix(
        self,
        lengths_blob: str,
        keys_blob: str,
        values_blob: str,
        normalization_parameters: Dict[int, NormalizationParameters],
        blobname_prefix: str,
        split_sparse_to_dense: bool,
        split_expensive_feature_groups: bool,
        normalize: bool = True,
        sorted_features_override: List[int] = None,
    ) -> Tuple[str, List[str]]:
        """Densify a sparse matrix in batches and optionally normalize it.

        The sparse inputs are split along axis 0 into 8 batches when
        ``split_sparse_to_dense`` is true, otherwise into a single batch.
        Each batch goes through ``SparseToDenseMask`` and, when ``normalize``
        is true, through ``self.normalize_dense_matrix``. The per-batch
        results are concatenated along axis 0.

        Args:
            lengths_blob: Lengths blob of the sparse input.
            keys_blob: Keys blob of the sparse input.
            values_blob: Values blob of the sparse input.
            normalization_parameters: Per-feature normalization settings.
            blobname_prefix: Prefix for the blobs created here; also
                forwarded to ``normalize_dense_matrix``.
            split_sparse_to_dense: Selects 8 batches instead of 1.
            split_expensive_feature_groups: Forwarded to
                ``normalize_dense_matrix``.
            normalize: When false, the dense fragments are concatenated
                without normalization.
            sorted_features_override: If truthy, used as the feature order
                instead of ``sort_features_by_normalization``.

        Returns:
            ``(dense_input, parameters)``: the concatenated blob and the
            parameter list. The list is populated by ``_store_parameter`` and
            extended with the parameters returned by every
            ``normalize_dense_matrix`` call (one call per batch).
        """
        if not sorted_features_override:
            feature_order, _ = sort_features_by_normalization(
                normalization_parameters
            )
        else:
            feature_order = sorted_features_override
        mask_ids = list(map(int, feature_order))

        if split_sparse_to_dense:
            num_chunks = 8
        else:
            num_chunks = 1

        # Allocate the (lengths, keys, values) output blobs one batch at a
        # time; tuple elements are evaluated left to right, so the NextBlob
        # call order per batch is lengths, keys, values.
        chunk_blobs = [
            (
                C2.NextBlob(blobname_prefix + "_length_batch"),
                C2.NextBlob(blobname_prefix + "_key_batch"),
                C2.NextBlob(blobname_prefix + "_value_batch"),
            )
            for _ in range(num_chunks)
        ]
        lengths_batch, keys_batch, values_batch = map(list, zip(*chunk_blobs))

        C2.net().Split([lengths_blob], lengths_batch, axis=0)

        # One single-element total per lengths batch; their concatenation is
        # passed as the second input to the keys/values Split ops below.
        chunk_totals = [
            C2.Reshape(
                C2.ReduceBackSum(chunk_lengths, num_reduce_dims=1),
                shape=[1],
            )[0]
            for chunk_lengths in lengths_batch
        ]
        totals_concat, _ = C2.Concat(*chunk_totals, axis=0)
        C2.net().Split([keys_blob, totals_concat], keys_batch, axis=0)
        C2.net().Split([values_blob, totals_concat], values_batch, axis=0)

        fragments = []
        parameters: List[str] = []

        # The fill value is stored as a parameter and also registered as a
        # GivenTensorFill op on the net.
        missing_blob = self._store_parameter(
            parameters,
            "MISSING_SCALAR",
            np.array([MISSING_VALUE], dtype=np.float32),
        )
        C2.net().GivenTensorFill(
            [], [missing_blob], shape=[], values=[MISSING_VALUE]
        )

        for chunk_keys, chunk_values, chunk_lengths in zip(
            keys_batch, values_batch, lengths_batch
        ):
            fragment = C2.SparseToDenseMask(
                chunk_keys,
                chunk_values,
                missing_blob,
                chunk_lengths,
                mask=mask_ids,
            )[0]

            if not normalize:
                fragments.append(fragment)
                continue

            normalized_fragment, fragment_params = self.normalize_dense_matrix(
                fragment,
                feature_order,
                normalization_parameters,
                blobname_prefix,
                split_expensive_feature_groups,
            )
            fragments.append(normalized_fragment)
            parameters.extend(fragment_params)

        dense_input = C2.NextBlob(blobname_prefix + "_dense_input")
        dense_input_dims = C2.NextBlob(blobname_prefix + "_dense_input_dims")
        C2.net().Concat(fragments, [dense_input, dense_input_dims], axis=0)

        return dense_input, parameters