Пример #1
0
    def test_preprocessing_network(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name, values, feature_type=self._feature_type_override(name))
        test_features = NumpyFeatureProcessor.preprocess(
            feature_value_map, normalization_parameters)

        net = core.Net("PreprocessingTestNet")
        C2.set_net(net)
        preprocessor = PreprocessorNet()
        name_preprocessed_blob_map = {}
        for feature_name in feature_value_map:
            workspace.FeedBlob(str(feature_name), np.array([0],
                                                           dtype=np.int32))
            preprocessed_blob, _ = preprocessor.preprocess_blob(
                str(feature_name), [normalization_parameters[feature_name]])
            name_preprocessed_blob_map[feature_name] = preprocessed_blob

        workspace.CreateNet(net)

        for feature_name, feature_value in six.iteritems(feature_value_map):
            feature_value = np.expand_dims(feature_value, -1)
            workspace.FeedBlob(str(feature_name), feature_value)
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            normalized_features = workspace.FetchBlob(
                name_preprocessed_blob_map[feature_name])
            if feature_name != ENUM_FEATURE_ID:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX_FEATURE_ID:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )))
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )),
                "{} does not match: {} {}".format(
                    feature_name,
                    normalized_features[non_matching].tolist(),
                    test_features[feature_name][non_matching].tolist(),
                ),
            )
Пример #2
0
    def test_preprocessing_network(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        name_preprocessed_blob_map = {}

        for feature_name, feature_values in feature_value_map.items():
            normalization_parameters[feature_name] = normalization.identify_parameter(
                feature_values, feature_type=self._feature_type_override(feature_name)
            )
            feature_values[
                0
            ] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

            preprocessor = Preprocessor(
                {feature_name: normalization_parameters[feature_name]}, False
            )
            preprocessor.clamp = False
            feature_values_matrix = np.expand_dims(feature_values, -1)
            normalized_feature_values = preprocessor.forward(feature_values_matrix)
            name_preprocessed_blob_map[feature_name] = normalized_feature_values.numpy()

        test_features = self.preprocess(feature_value_map, normalization_parameters)

        for feature_name in feature_value_map:
            normalized_features = name_preprocessed_blob_map[feature_name]
            if feature_name != ENUM_FEATURE_ID:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX_FEATURE_ID:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ),
                "{} does not match: {} \n!=\n {}".format(
                    feature_name,
                    normalized_features.flatten()[non_matching],
                    test_features[feature_name].flatten()[non_matching],
                ),
            )
Пример #3
0
    def test_type_override(self):
        # Take a feature that should be identified as probability
        _, feature_value_map = preprocessing_util.read_data()
        probability_values = feature_value_map[identify_types.PROBABILITY]

        # And ask for a binary anyways
        parameter = normalization.identify_parameter(
            probability_values, feature_type=identify_types.BINARY)
        self.assertEqual(parameter.feature_type, "BINARY")
Пример #4
0
    def test_type_override(self):
        # Take a feature that should be identified as probability
        feature_value_map = read_data()
        probability_values = feature_value_map[PROBABILITY_FEATURE_ID]

        # And ask for a binary anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.BINARY)
        self.assertEqual(parameter.feature_type, "BINARY")
Пример #5
0
    def test_persistency(self):
        _, feature_value_map = preprocessing_util.read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(values)

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        self.assertEqual(read_parameters, normalization_parameters)
Пример #6
0
    def test_identification(self):
        feature_value_map = preprocessing_util.read_data()

        types = identify_types.identify_types(feature_value_map)

        # Examples through manual inspection
        self.assertEqual(types['179'], identify_types.BINARY)
        self.assertEqual(types['124'], identify_types.CONTINUOUS)
        self.assertEqual(types['74'], identify_types.PROBABILITY)
    def test_preprocessing_network(self):
        feature_value_map = preprocessing_util.read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values
            )
        test_features = self.preprocess(
            feature_value_map, normalization_parameters
        )

        net = core.Net("PreprocessingTestNet")
        preprocessor = PreprocessorNet(net, False)
        for feature_name in feature_value_map:
            workspace.FeedBlob(feature_name, np.array([0], dtype=np.int32))
            preprocessor.preprocess_blob(
                feature_name, normalization_parameters[feature_name]
            )

        workspace.CreateNet(net)

        for feature_name in feature_value_map:
            workspace.FeedBlob(feature_name, feature_value_map[feature_name])
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            normalized_features = workspace.FetchBlob(
                feature_name + "_preprocessed"
            )
            tolerance = 0.01
            if feature_name == 'boxcox':
                # At the limit, boxcox has some numerical instability
                tolerance = 0.1
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ), '{} does not match: {} {}'.format(
                    feature_name, normalized_features[non_matching].tolist(),
                    test_features[feature_name][non_matching].tolist()
                )
            )
Пример #8
0
    def test_persistency(self):
        feature_value_map = preprocessing_util.read_data()

        normalization_parameters = normalization.identify_parameters(
            feature_value_map
        )

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        self.assertEqual(read_parameters, normalization_parameters)
Пример #9
0
    def test_persistency(self):
        feature_value_map = read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values, feature_type=self._feature_type_override(name))

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        self.assertEqual(read_parameters, normalization_parameters)
Пример #10
0
    def test_type_override(self):
        # Take a feature that should be identified as probability
        feature_value_map = read_data()
        probability_values = feature_value_map[PROBABILITY_FEATURE_ID]

        # And ask for a binary anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.BINARY
        )
        self.assertEqual(parameter.feature_type, "BINARY")
Пример #11
0
    def test_preprocessing_network(self):
        feature_value_map = preprocessing_util.read_data()
        types = identify_types.identify_types_dict(feature_value_map)
        normalization_parameters = normalization.identify_parameters(
            feature_value_map, types)
        test_features = self.preprocess(feature_value_map,
                                        normalization_parameters)
        test_features[u'186'] = 0

        net = core.Net("PreprocessingTestNet")
        preprocessor = PreprocessorNet(net, False)
        for feature_name in feature_value_map:
            workspace.FeedBlob(feature_name, np.array([0], dtype=np.int32))
            preprocessor.preprocess_blob(
                feature_name, normalization_parameters[feature_name])

        workspace.CreateNet(net)

        for feature_name in feature_value_map:
            if feature_name != u'186':
                workspace.FeedBlob(
                    feature_name,
                    feature_value_map[feature_name].astype(np.float32))
            else:
                workspace.FeedBlob(
                    feature_name,
                    normalization.MISSING_VALUE * np.ones(1, dtype=np.float32))
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            normalized_features = workspace.FetchBlob(feature_name +
                                                      "_preprocessed")
            self.assertTrue(
                np.all(
                    np.isclose(normalized_features,
                               test_features[feature_name])))
        for feature_name in feature_value_map:
            if feature_name != u'186':
                workspace.FeedBlob(
                    feature_name,
                    feature_value_map[feature_name].astype(np.float32))
            else:
                workspace.FeedBlob(
                    feature_name,
                    normalization.MISSING_VALUE * np.ones(1, dtype=np.float32))
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            normalized_features = workspace.FetchBlob(feature_name +
                                                      "_preprocessed")

            self.assertTrue(
                np.all(
                    np.isclose(normalized_features,
                               test_features[feature_name])))
Пример #12
0
    def test_preprocessing_network_onnx(self):
        feature_value_map = read_data()

        for feature_name, feature_values in feature_value_map.items():
            normalization_parameters = normalization.identify_parameter(
                feature_name,
                feature_values,
                feature_type=self._feature_type_override(feature_name),
            )
            feature_values[
                0
            ] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that
            feature_values_matrix = np.expand_dims(feature_values, -1)

            preprocessor = Preprocessor({feature_name: normalization_parameters}, False)
            normalized_feature_values = preprocessor.forward(feature_values_matrix)

            input_blob, output_blob, netdef = PytorchCaffe2Converter.pytorch_net_to_caffe2_netdef(
                preprocessor, 1, False, float_input=True
            )

            preproc_workspace = netdef.workspace

            preproc_workspace.FeedBlob(input_blob, feature_values_matrix)

            preproc_workspace.RunNetOnce(core.Net(netdef.init_net))
            preproc_workspace.RunNetOnce(core.Net(netdef.predict_net))

            normalized_feature_values_onnx = netdef.workspace.FetchBlob(output_blob)
            tolerance = 0.0001
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_feature_values,
                        normalized_feature_values_onnx,
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_feature_values,
                        normalized_feature_values_onnx,
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ),
                "{} does not match: {} \n!=\n {}".format(
                    feature_name,
                    normalized_feature_values[non_matching].tolist()[0:10],
                    normalized_feature_values_onnx[non_matching].tolist()[0:10],
                ),
            )
Пример #13
0
    def test_preprocessing_network(self):
        features, feature_value_map = preprocessing_util.read_data()
        normalization_parameters = {}
        name_preprocessed_blob_map = {}

        for feature_name, feature_values in feature_value_map.items():
            normalization_parameters[feature_name] = normalization.identify_parameter(
                feature_values
            )

            preprocessor = Preprocessor(
                {feature_name: normalization_parameters[feature_name]}, False
            )
            preprocessor.clamp = False
            feature_values_matrix = np.expand_dims(feature_values, -1)
            normalized_feature_values = preprocessor.forward(feature_values_matrix)
            name_preprocessed_blob_map[feature_name] = normalized_feature_values.numpy()

        test_features = self.preprocess(feature_value_map, normalization_parameters)

        for feature_name in feature_value_map:
            normalized_features = name_preprocessed_blob_map[feature_name]
            if feature_name != identify_types.ENUM:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ),
                "{} does not match: {} {}".format(
                    feature_name,
                    normalized_features[non_matching].tolist()[0:10],
                    test_features[feature_name][non_matching].tolist()[0:10],
                ),
            )
Пример #14
0
    def test_persistency(self):
        feature_value_map = read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values)
            values[
                0] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        self.assertEqual(read_parameters, normalization_parameters)
Пример #15
0
    def test_persistency(self):
        feature_value_map = preprocessing_util.read_data()

        types = identify_types.identify_types_dict(feature_value_map)
        normalization_parameters = normalization.identify_parameters(
            feature_value_map, types)

        with io.StringIO() as f:
            normalization.write_parameters(f, normalization_parameters)
            f.seek(0)
            read_parameters = normalization.load_parameters(f)

        self.assertEqual(read_parameters, normalization_parameters)
Пример #16
0
    def test_feature_parsing(self):
        feature_value_map = preprocessing_util.read_data()

        # There are features and we have mapped all them all
        self.assertTrue(len(feature_value_map) > 3)

        # A few samples based on manual inspection
        self.assertFalse(
            (feature_value_map['413'] -
             [473.763927022, 65.0, 65.0, 1.0, 50.0, 50.0, 2.0, 2.0, 1.0, 23.0
              ]).any())
        self.assertFalse(
            (feature_value_map['186'] -
             [0.0, 0.0, 0.0, 11.0, 1.0, 1.0, 7.0, 7.0, 14.0, 0.0]).any())
Пример #17
0
    def test_identification(self):
        feature_value_map = preprocessing_util.read_data()

        types = identify_types.identify_types(feature_value_map)

        # Examples through manual inspection
        self.assertEqual(types[identify_types.BINARY], identify_types.BINARY)
        self.assertEqual(types['normal'], identify_types.CONTINUOUS)
        self.assertEqual(types['boxcox'], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[identify_types.QUANTILE],
                         identify_types.CONTINUOUS)
        self.assertEqual(types[identify_types.ENUM], identify_types.ENUM)
        self.assertEqual(types[identify_types.PROBABILITY],
                         identify_types.PROBABILITY)
Пример #18
0
    def test_identification(self):
        feature_value_map = read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[BINARY_FEATURE_ID], identify_types.BINARY)
        self.assertEqual(types[CONTINUOUS_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the boxcox type
        self.assertEqual(types[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS)
        self.assertEqual(types[ENUM_FEATURE_ID], identify_types.ENUM)
        self.assertEqual(types[PROBABILITY_FEATURE_ID], identify_types.PROBABILITY)
Пример #19
0
    def test_persistency(self):
        feature_value_map = read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values, feature_type=self._feature_type_override(name))
            values[
                0] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        # Unfortunately, Thrift serializatin seems to lose a bit of precision.
        # Using `==` will be false.
        self.assertEqual(read_parameters.keys(),
                         normalization_parameters.keys())
        for k in normalization_parameters:
            self.assertEqual(
                read_parameters[k].feature_type,
                normalization_parameters[k].feature_type,
            )
            self.assertEqual(
                read_parameters[k].possible_values,
                normalization_parameters[k].possible_values,
            )
            for field in [
                    "boxcox_lambda",
                    "boxcox_shift",
                    "mean",
                    "stddev",
                    "quantiles",
                    "min_value",
                    "max_value",
            ]:
                if getattr(normalization_parameters[k], field) is None:
                    self.assertEqual(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
                else:
                    npt.assert_allclose(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
Пример #20
0
    def test_persistency(self):
        feature_value_map = read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name, values, feature_type=self._feature_type_override(name)
            )
            values[0] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        # Unfortunately, Thrift serializatin seems to lose a bit of precision.
        # Using `==` will be false.
        self.assertEqual(read_parameters.keys(), normalization_parameters.keys())
        for k in normalization_parameters:
            self.assertEqual(
                read_parameters[k].feature_type,
                normalization_parameters[k].feature_type,
            )
            self.assertEqual(
                read_parameters[k].possible_values,
                normalization_parameters[k].possible_values,
            )
            for field in [
                "boxcox_lambda",
                "boxcox_shift",
                "mean",
                "stddev",
                "quantiles",
                "min_value",
                "max_value",
            ]:
                if getattr(normalization_parameters[k], field) is None:
                    self.assertEqual(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
                else:
                    npt.assert_allclose(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
Пример #21
0
    def test_identification(self):
        feature_value_map = read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[BINARY_FEATURE_ID], identify_types.BINARY)
        self.assertEqual(types[CONTINUOUS_FEATURE_ID],
                         identify_types.CONTINUOUS)

        # We don't yet know the boxcox type
        self.assertEqual(types[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS)
        self.assertEqual(types[ENUM_FEATURE_ID], identify_types.ENUM)
        self.assertEqual(types[PROBABILITY_FEATURE_ID],
                         identify_types.PROBABILITY)
Пример #22
0
    def test_identification(self):
        _, feature_value_map = preprocessing_util.read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[identify_types.BINARY], identify_types.BINARY)
        self.assertEqual(types[identify_types.CONTINUOUS],
                         identify_types.CONTINUOUS)

        # We don't yet know the boxcox type
        self.assertEqual(types[identify_types.BOXCOX],
                         identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[identify_types.QUANTILE],
                         identify_types.CONTINUOUS)
        self.assertEqual(types[identify_types.ENUM], identify_types.ENUM)
        self.assertEqual(types[identify_types.PROBABILITY],
                         identify_types.PROBABILITY)
Пример #23
0
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name,
                values,
                10,
                feature_type=self._feature_type_override(name))
        for k, v in normalization_parameters.items():
            if id_to_type(k) == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif id_to_type(k) == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                assert v.feature_type == id_to_type(k)
        sorted_features, _ = sort_features_by_normalization(
            normalization_parameters)

        norm_net = core.Net("net")
        C2.set_net(norm_net)
        preprocessor = PreprocessorNet()
        input_matrix = np.zeros([10000, len(sorted_features)],
                                dtype=np.float32)
        for i, feature in enumerate(sorted_features):
            input_matrix[:, i] = feature_value_map[feature]
        input_matrix_blob = "input_matrix_blob"
        workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
        output_blob, _ = preprocessor.normalize_dense_matrix(
            input_matrix_blob, sorted_features, normalization_parameters, "",
            False)
        workspace.FeedBlob(input_matrix_blob, input_matrix)
        workspace.RunNetOnce(norm_net)
        normalized_feature_matrix = workspace.FetchBlob(output_blob)

        normalized_features = {}
        on_column = 0
        for feature in sorted_features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[
                feature] = normalized_feature_matrix[:,
                                                     on_column:(on_column +
                                                                column_size)]
            on_column += column_size

        self.assertTrue(
            all([
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]))
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(possible_value_map[original_feature],
                                     np.where(row == 1)[0][0])
            elif feature_type == identify_types.QUANTILE:
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = NumpyFeatureProcessor.value_to_quantile(
                        original_feature,
                        normalization_parameters[k].quantiles)
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif (feature_type == identify_types.CONTINUOUS
                  or feature_type == identify_types.BOXCOX):
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    "mean of feature {} is {}, not 0".format(k, np.mean(v)),
                )
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            elif feature_type == identify_types.CONTINUOUS_ACTION:
                less_than_max = v < 1
                more_than_min = v > -1
                self.assertTrue(
                    np.all(less_than_max),
                    "values are not less than 1: {}".format(
                        v[less_than_max == False]),
                )
                self.assertTrue(
                    np.all(more_than_min),
                    "values are not more than -1: {}".format(
                        v[more_than_min == False]),
                )
            else:
                raise NotImplementedError()
Пример #24
0
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values, 10)
        for k, v in normalization_parameters.items():
            if id_to_type(k) == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif id_to_type(k) == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                assert v.feature_type == id_to_type(k)

        preprocessor = Preprocessor(normalization_parameters, False)
        sorted_features, _ = sort_features_by_normalization(
            normalization_parameters)
        preprocessor.clamp = False
        input_matrix = np.zeros([10000, len(sorted_features)],
                                dtype=np.float32)
        for i, feature in enumerate(sorted_features):
            input_matrix[:, i] = feature_value_map[feature]
        normalized_feature_matrix = preprocessor.forward(input_matrix)

        normalized_features = {}
        on_column = 0
        for feature in sorted_features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[
                feature] = normalized_feature_matrix[:,
                                                     on_column:(on_column +
                                                                column_size)]
            on_column += column_size

        self.assertTrue(
            all([
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]))
        for k, v in six.iteritems(normalized_features):
            v = v.numpy()
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    if abs(original_feature - MISSING_VALUE) < 0.01:
                        self.assertEqual(0.0, np.sum(row))
                    else:
                        self.assertEqual(
                            possible_value_map[original_feature],
                            np.where(row == 1)[0][0],
                        )
            elif feature_type == identify_types.QUANTILE:
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = self._value_to_quantile(
                        original_feature,
                        normalization_parameters[k].quantiles)
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif (feature_type == identify_types.CONTINUOUS
                  or feature_type == identify_types.BOXCOX):
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    "mean of feature {} is {}, not 0".format(k, np.mean(v)),
                )
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            else:
                raise NotImplementedError()
Пример #25
0
    def test_preprocessing_network(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        name_preprocessed_blob_map = {}

        for feature_name, feature_values in feature_value_map.items():
            normalization_parameters[feature_name] = normalization.identify_parameter(
                feature_name,
                feature_values,
                feature_type=self._feature_type_override(feature_name),
            )
            feature_values[
                0
            ] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

            preprocessor = Preprocessor(
                {feature_name: normalization_parameters[feature_name]}, False
            )
            feature_values_matrix = np.expand_dims(feature_values, -1)
            normalized_feature_values = preprocessor.forward(feature_values_matrix)
            name_preprocessed_blob_map[feature_name] = normalized_feature_values.numpy()

        test_features = NumpyFeatureProcessor.preprocess(
            feature_value_map, normalization_parameters
        )

        for feature_name in feature_value_map:
            normalized_features = name_preprocessed_blob_map[feature_name]
            if feature_name != ENUM_FEATURE_ID:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX_FEATURE_ID:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ),
                "{} does not match: {} \n!=\n {}".format(
                    feature_name,
                    normalized_features.flatten()[non_matching],
                    test_features[feature_name].flatten()[non_matching],
                ),
            )
Пример #26
0
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = preprocessing_util.read_data()

        types = identify_types.identify_types(feature_value_map)
        types_dict = identify_types.identify_types_dict(feature_value_map)
        normalization_parameters = normalization.identify_parameters(
            feature_value_map, types_dict)

        features = list(feature_value_map.keys())
        norm_net = core.Net("net")
        blobname_template = '{}_blob'
        blob_map = prepare_normalization(norm_net, normalization_parameters,
                                         features, blobname_template, False)

        normalized_features = normalize_feature_map(feature_value_map,
                                                    norm_net, features,
                                                    blob_map,
                                                    blobname_template)

        self.assertTrue(
            all([
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]))
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(possible_value_map[original_feature],
                                     np.where(row == 1)[0][0])
            else:
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.00001)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.00001)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.00001)
                is_binary = types[k] == identify_types.BINARY
                self.assertTrue(np.all(np.logical_or(zero_mean, is_binary)))
                self.assertTrue(
                    np.all(
                        np.logical_or(np.logical_or(one_stddev, zero_stddev),
                                      is_binary)))

                has_boxcox = normalization_parameters[
                    k].boxcox_lambda is not None
                is_ctd = types[k] == identify_types.CONTINUOUS
                # This should be true at the moment
                self.assertTrue(is_ctd == has_boxcox)
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = preprocessing_util.read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values, 10
            )
        for k, v in normalization_parameters.items():
            if k == 'normal':
                self.assertEqual(v.feature_type, 'CONTINUOUS')
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif k == 'boxcox':
                self.assertEqual(v.feature_type, 'CONTINUOUS')
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                self.assertEqual(v.feature_type, k)

        features = list(feature_value_map.keys())
        norm_net = core.Net("net")
        blobname_template = '{}_blob'
        blob_map = prepare_normalization(
            norm_net, normalization_parameters, features, blobname_template,
            False
        )

        normalized_features = normalize_feature_map(
            feature_value_map, norm_net, features, blob_map, blobname_template
        )

        self.assertTrue(
            all(
                [
                    np.isfinite(parameter.stddev) and
                    np.isfinite(parameter.mean)
                    for parameter in normalization_parameters.values()
                ]
            )
        )
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(
                            np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
                        )
                    )
                )
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(
                        possible_value_map[original_feature],
                        np.where(row == 1)[0][0]
                    )
            elif feature_type == identify_types.QUANTILE:
                quantiles = normalization_parameters[k].quantiles
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    count = 0
                    for quantile in quantiles:
                        if original_feature >= quantile:
                            count += 1
                    count /= float(len(quantiles))
                    self.assertAlmostEqual(feature, count, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif feature_type == identify_types.CONTINUOUS:
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    'mean of feature {} is {}, not 0'.format(k, np.mean(v))
                )
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            else:
                raise NotImplementedError()
Пример #28
0
    def test_prepare_normalization_and_normalize(self):
        features, feature_value_map = preprocessing_util.read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                values, 10)
        for k, v in normalization_parameters.items():
            if k == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif k == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                assert v.feature_type == k or v.feature_type + "_2" + k

        norm_net = core.Net("net")
        preprocessor = PreprocessorNet(norm_net, False)
        input_matrix = np.zeros([10000, len(features)], dtype=np.float32)
        for i, feature in enumerate(features):
            input_matrix[:, i] = feature_value_map[feature]
        input_matrix_blob = 'input_matrix_blob'
        workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
        output_blob, _ = preprocessor.normalize_dense_matrix(
            input_matrix_blob, features, normalization_parameters, '')
        workspace.FeedBlob(input_matrix_blob, input_matrix)
        workspace.RunNetOnce(norm_net)
        normalized_feature_matrix = workspace.FetchBlob(output_blob)

        normalized_features = {}
        on_column = 0
        for feature in features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[feature] = \
                normalized_feature_matrix[:, on_column:(
                    on_column + column_size
                )]
            on_column += column_size

        self.assertTrue(
            all([
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]))
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(possible_value_map[original_feature],
                                     np.where(row == 1)[0][0])
            elif feature_type == identify_types.QUANTILE:
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = self._value_to_quantile(
                        original_feature,
                        normalization_parameters[k].quantiles)
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif feature_type == identify_types.CONTINUOUS or \
                    feature_type == identify_types.BOXCOX:
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    'mean of feature {} is {}, not 0'.format(k, np.mean(v)))
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            else:
                raise NotImplementedError()
Пример #29
0
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name, values, 10, feature_type=self._feature_type_override(name)
            )
        for k, v in normalization_parameters.items():
            if id_to_type(k) == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif id_to_type(k) == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                assert v.feature_type == id_to_type(k)
        sorted_features, _ = sort_features_by_normalization(normalization_parameters)

        norm_net = core.Net("net")
        C2.set_net(norm_net)
        preprocessor = PreprocessorNet()
        input_matrix = np.zeros([10000, len(sorted_features)], dtype=np.float32)
        for i, feature in enumerate(sorted_features):
            input_matrix[:, i] = feature_value_map[feature]
        input_matrix_blob = "input_matrix_blob"
        workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
        output_blob, _ = preprocessor.normalize_dense_matrix(
            input_matrix_blob, sorted_features, normalization_parameters, "", False
        )
        workspace.FeedBlob(input_matrix_blob, input_matrix)
        workspace.RunNetOnce(norm_net)
        normalized_feature_matrix = workspace.FetchBlob(output_blob)

        normalized_features = {}
        on_column = 0
        for feature in sorted_features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[feature] = normalized_feature_matrix[
                :, on_column : (on_column + column_size)
            ]
            on_column += column_size

        self.assertTrue(
            all(
                [
                    np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                    for parameter in normalization_parameters.values()
                ]
            )
        )
        for k, v in six.iteritems(normalized_features):
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0), np.less(sigmoidv, 1))
                    )
                )
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    self.assertEqual(
                        possible_value_map[original_feature], np.where(row == 1)[0][0]
                    )
            elif feature_type == identify_types.QUANTILE:
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = NumpyFeatureProcessor.value_to_quantile(
                        original_feature, normalization_parameters[k].quantiles
                    )
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif (
                feature_type == identify_types.CONTINUOUS
                or feature_type == identify_types.BOXCOX
            ):
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    "mean of feature {} is {}, not 0".format(k, np.mean(v)),
                )
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            elif feature_type == identify_types.CONTINUOUS_ACTION:
                less_than_max = v < 1
                more_than_min = v > -1
                self.assertTrue(
                    np.all(less_than_max),
                    "values are not less than 1: {}".format(v[less_than_max == False]),
                )
                self.assertTrue(
                    np.all(more_than_min),
                    "values are not more than -1: {}".format(v[more_than_min == False]),
                )
            else:
                raise NotImplementedError()
Пример #30
0
    def test_preprocessing_network(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name, values, feature_type=self._feature_type_override(name)
            )
        test_features = NumpyFeatureProcessor.preprocess(
            feature_value_map, normalization_parameters
        )

        net = core.Net("PreprocessingTestNet")
        C2.set_net(net)
        preprocessor = PreprocessorNet()
        name_preprocessed_blob_map = {}
        for feature_name in feature_value_map:
            workspace.FeedBlob(str(feature_name), np.array([0], dtype=np.int32))
            preprocessed_blob, _ = preprocessor.preprocess_blob(
                str(feature_name), [normalization_parameters[feature_name]]
            )
            name_preprocessed_blob_map[feature_name] = preprocessed_blob

        workspace.CreateNet(net)

        for feature_name, feature_value in six.iteritems(feature_value_map):
            feature_value = np.expand_dims(feature_value, -1)
            workspace.FeedBlob(str(feature_name), feature_value)
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            normalized_features = workspace.FetchBlob(
                name_preprocessed_blob_map[feature_name]
            )
            if feature_name != ENUM_FEATURE_ID:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX_FEATURE_ID:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                )
            )
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features,
                        test_features[feature_name],
                        rtol=tolerance,
                        atol=tolerance,
                    )
                ),
                "{} does not match: {} {}".format(
                    feature_name,
                    normalized_features[non_matching].tolist(),
                    test_features[feature_name][non_matching].tolist(),
                ),
            )