Пример #1
0
def get_norm_metadata(dataset, norm_params, norm_col):
    """Collect per-feature samples from ``dataset`` and build norm metadata.

    Batches are read until either the dataset yields no more data or
    ``check_samples_per_feature`` reports that every feature has reached
    ``norm_params["num_samples"]``.

    Args:
        dataset: Object exposing ``read_batch(astype=...)``; each batch is
            indexed by ``norm_col``.
        norm_params: Mapping that provides ``"num_samples"`` and is forwarded
            unchanged to ``get_feature_norm_metadata``.
        norm_col: Name of the batch column holding the per-row feature values
            (each entry is expanded with ``pd.Series``).

    Returns:
        The result of ``serialize`` applied to a dict mapping each feature to
        the value returned by ``get_feature_norm_metadata``.
    """
    done = False
    # Request a DataFrame for the first batch as well: the loop body relies on
    # DataFrame/Series operations and every subsequent read already passes
    # astype="df".
    batch = dataset.read_batch(astype="df")
    samples_per_feature, samples = defaultdict(int), defaultdict(list)

    while not done:
        if batch is None or len(batch[norm_col]) == 0:
            logger.info("No more data in training data. Breaking.")
            break

        # Expand each row's feature container into one column per feature.
        feature_df = batch[norm_col].apply(pd.Series)
        for feature in feature_df:
            # Rows lacking this feature show up as NaN after expansion.
            values = feature_df[feature].dropna().values
            samples_per_feature[feature] += len(values)
            samples[feature].extend(values)

        done = check_samples_per_feature(samples_per_feature,
                                         norm_params["num_samples"])
        logger.info("Samples per feature: {}".format(samples_per_feature))
        if done:
            logger.info(
                "Collected sufficient sample size for all features. Breaking.")

        batch = dataset.read_batch(astype="df")

    output = {}
    for feature, values in samples.items():
        output[feature] = get_feature_norm_metadata(feature, values,
                                                    norm_params)
    return serialize(output)
def get_norm_metadata(dataset, norm_params, norm_col):
    """Gather feature samples batch by batch and serialize their norm metadata.

    Reading stops when the dataset returns ``None`` or an empty ``norm_col``
    column, or once ``check_samples_per_feature`` reports that the counts
    satisfy ``norm_params["num_samples"]``.

    Args:
        dataset: Source of batches, read via ``read_batch(astype="df")``.
        norm_params: Mapping with a ``"num_samples"`` entry; also passed on to
            ``get_feature_norm_metadata``.
        norm_col: Column of the batch whose entries are expanded into
            per-feature columns.

    Returns:
        ``serialize`` applied to a ``{feature: metadata}`` dict.
    """
    current = dataset.read_batch(astype="df")
    counts = defaultdict(int)
    collected = defaultdict(list)
    enough = False

    while not enough:
        exhausted = current is None or len(current[norm_col]) == 0
        if exhausted:
            logger.info("No more data in training data. Breaking.")
            break

        # One column per feature; missing entries become NaN and are dropped.
        expanded = current[norm_col].apply(pd.Series)
        for column in expanded:
            present = expanded[column].dropna().values
            counts[column] += len(present)
            collected[column].extend(present)

        enough = check_samples_per_feature(counts, norm_params["num_samples"])
        logger.info("Samples per feature: {}".format(counts))
        if enough:
            logger.info("Collected sufficient sample size for all features. Breaking.")

        # The next batch is fetched even when enough samples were collected,
        # exactly as the loop has always consumed the dataset.
        current = dataset.read_batch(astype="df")

    metadata = {
        name: get_feature_norm_metadata(name, observed, norm_params)
        for name, observed in collected.items()
    }
    return serialize(metadata)
Пример #3
0
    def test_persistency(self):
        """Parameters survive a serialize/deserialize round trip unchanged."""
        # read_data() returns a pair; only the second element is used here.
        _, samples_by_feature = preprocessing_util.read_data()
        expected = {
            feature: normalization.identify_parameter(samples)
            for feature, samples in samples_by_feature.items()
        }

        serialized = normalization.serialize(expected)
        restored = normalization.deserialize(serialized)
        self.assertEqual(restored, expected)
Пример #4
0
    def test_persistency(self):
        """Parameters survive a serialize/deserialize round trip unchanged."""
        samples_by_feature = read_data()
        # Each feature is identified with the type override this test class
        # supplies for its name.
        expected = {
            feature: normalization.identify_parameter(
                samples, feature_type=self._feature_type_override(feature))
            for feature, samples in samples_by_feature.items()
        }

        restored = normalization.deserialize(normalization.serialize(expected))
        self.assertEqual(restored, expected)
Пример #5
0
    def test_persistency(self):
        """Parameters survive a serialize/deserialize round trip unchanged."""
        expected = normalization.identify_parameters(
            preprocessing_util.read_data()
        )

        serialized = normalization.serialize(expected)
        restored = normalization.deserialize(serialized)
        self.assertEqual(restored, expected)
Пример #6
0
    def test_persistency(self):
        """Parameters survive a serialize/deserialize round trip unchanged."""
        samples_by_feature = read_data()
        expected = {}
        for feature, samples in samples_by_feature.items():
            expected[feature] = normalization.identify_parameter(samples)
            # The first sample is overwritten with MISSING_VALUE after the
            # parameters have been identified.
            # NOTE(review): the original comment was cut off mid-sentence, so
            # the purpose of this mutation cannot be confirmed from here.
            samples[0] = MISSING_VALUE

        restored = normalization.deserialize(normalization.serialize(expected))
        self.assertEqual(restored, expected)
Пример #7
0
    def test_persistency(self):
        """Round-trip parameters through serialize/deserialize and compare.

        Feature type and possible values are compared exactly; the numeric
        fields are compared with ``npt.assert_allclose`` unless the expected
        value is ``None``.
        """
        samples_by_feature = read_data()
        expected_params = {}
        for feature, samples in samples_by_feature.items():
            override = self._feature_type_override(feature)
            expected_params[feature] = normalization.identify_parameter(
                feature, samples, feature_type=override)
            # The first sample is overwritten with MISSING_VALUE after the
            # parameters have been identified.
            # NOTE(review): the original comment was cut off mid-sentence, so
            # the purpose of this mutation cannot be confirmed from here.
            samples[0] = MISSING_VALUE

        restored_params = normalization.deserialize(
            normalization.serialize(expected_params))

        # Thrift serialization seems to lose a bit of precision, so a plain
        # `==` on the whole mapping would be false; compare field by field.
        self.assertEqual(restored_params.keys(), expected_params.keys())

        numeric_fields = [
            "boxcox_lambda",
            "boxcox_shift",
            "mean",
            "stddev",
            "quantiles",
            "min_value",
            "max_value",
        ]
        for feature, expected in expected_params.items():
            restored = restored_params[feature]
            self.assertEqual(restored.feature_type, expected.feature_type)
            self.assertEqual(restored.possible_values,
                             expected.possible_values)
            for field in numeric_fields:
                want = getattr(expected, field)
                got = getattr(restored, field)
                if want is None:
                    # Unset fields must come back exactly as they went in.
                    self.assertEqual(got, want)
                else:
                    npt.assert_allclose(got, want)
Пример #8
0
    def test_persistency(self):
        """Check that serialized parameters deserialize to matching values.

        Exact equality is required for the key set, ``feature_type`` and
        ``possible_values``; the remaining numeric fields only need to be
        close (or equal when the original value is ``None``).
        """
        data = read_data()
        original = {}
        for key, column in data.items():
            original[key] = normalization.identify_parameter(
                key, column, feature_type=self._feature_type_override(key)
            )
            # The first sample is overwritten with MISSING_VALUE after the
            # parameters have been identified.
            # NOTE(review): the original comment was cut off mid-sentence, so
            # the purpose of this mutation cannot be confirmed from here.
            column[0] = MISSING_VALUE

        payload = normalization.serialize(original)
        loaded = normalization.deserialize(payload)

        # Thrift serialization seems to lose a bit of precision, so using
        # `==` on the whole structure will be false.
        self.assertEqual(loaded.keys(), original.keys())

        tolerant_fields = (
            "boxcox_lambda",
            "boxcox_shift",
            "mean",
            "stddev",
            "quantiles",
            "min_value",
            "max_value",
        )
        for key in original:
            before, after = original[key], loaded[key]
            self.assertEqual(after.feature_type, before.feature_type)
            self.assertEqual(after.possible_values, before.possible_values)
            for attr in tolerant_fields:
                reference = getattr(before, attr)
                candidate = getattr(after, attr)
                if reference is None:
                    self.assertEqual(candidate, reference)
                    continue
                npt.assert_allclose(candidate, reference)