Пример #1
0
    def fit(self, data):
        """
        Fit the configured scaler on the input data and return the scaled data.

        The scaler class is selected from ``self.model_param.method``. When the
        method is neither MinMaxScale nor StandardScale, a warning is logged and
        ``self.scale_obj`` is left as it was before the call.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        fit_data: data_instance, the scaled data (carrying the input schema),
            or the input data itself when no scaler object is available
        """
        LOGGER.info("Start scale data fit ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
        else:
            LOGGER.warning("Scale method is {}, do nothing and return!".format(method))

        # Without a scaler there is nothing to fit: hand the input back untouched.
        if not self.scale_obj:
            LOGGER.info("End fit data ...")
            return data

        scaled = self.scale_obj.fit(data)
        # The scaler's output takes over the schema of the input table.
        scaled.schema = data.schema

        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=MetricMeta(
                name="scale",
                metric_type="SCALE",
                extra_metas={"method": self.model_param.method},
            ),
        )

        LOGGER.info("End fit data ...")
        return scaled
Пример #2
0
    def test_cols_select_fit_and_transform_repeat(self):
        """
        Fit with a scale_column_idx list that contains repeated indices.

        The expected table is built from the SSL reference scaler
        (presumably scikit-learn's StandardScaler -- confirm against the
        imports) for the columns reported by the fitted scaler, and from the
        raw test data for every other column. Both fit() and a following
        transform() must produce that table.
        """
        param = self.get_scale_param()
        param.scale_column_idx = [1, 1, 2, 2, 4, 5, 5]
        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        selected_cols = fate_scaler.scale_column_idx

        reference = SSL(with_mean=True, with_std=True)
        reference.fit(self.test_data)
        expected = np.around(reference.transform(self.test_data), 4).tolist()

        # Columns the scaler did not select must keep their raw values.
        for row_no, row in enumerate(expected):
            for col_no in range(len(row)):
                if col_no not in selected_cols:
                    row[col_no] = self.test_data[row_no][col_no]

        self.assertListEqual(
            self.get_table_instance_feature(fitted_table), expected)

        transformed_table = fate_scaler.transform(self.table_instance)
        self.assertListEqual(
            self.get_table_instance_feature(transformed_table), expected)
Пример #3
0
    def test_fit8(self):
        """
        Fit with per-column bounds [1, 2], no centering and no std scaling.

        The expected output is the test data clipped into [1, 2]; the clipping
        is applied to ``self.test_data`` in place. Mean/std are expected to be
        all 0 / all 1, and the recorded column ranges are compared against
        fixed lists.
        """
        param = self.get_scale_param()
        param.scale_column_idx = []
        param.feat_upper = [2, 2, 2, 2, 2, 2]
        param.feat_lower = [1, 1, 1, 1, 1, 1]
        param.with_mean = False
        param.with_std = False

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std
        column_max_value = fate_scaler.column_max_value
        column_min_value = fate_scaler.column_min_value

        # Clip the fixture in place so it can serve as the expected output.
        for row in self.test_data:
            for col_no, value in enumerate(row):
                if value > 2:
                    row[col_no] = 2
                elif value < 1:
                    row[col_no] = 1

        width = len(self.test_data[0])
        self.assertListEqual(self.get_table_instance_feature(fitted_table),
                             np.around(self.test_data, 4).tolist())
        self.assertEqual(mean, [0] * width)
        self.assertEqual(std, [1] * width)
        self.assertEqual(column_max_value, [1, 2, 2, 2, 2, 2])
        self.assertEqual(column_min_value, [1, 1, 1, 2, 2, 1])
Пример #4
0
    def export_model(self):
        """
        Export the model held by the current scaler object.

        If no scaler object exists yet, one is created first: MinMaxScale when
        the configured method is MinMaxScale, StandardScale for any other
        method.

        Returns
        ----------
        Whatever ``self.scale_obj.export_model(self.need_run)`` returns.
        """
        if not self.scale_obj:
            use_min_max = self.model_param.method == consts.MINMAXSCALE
            scaler_cls = MinMaxScale if use_min_max else StandardScale
            self.scale_obj = scaler_cls(self.model_param)

        return self.scale_obj.export_model(self.need_run)
Пример #5
0
    def test_transform1(self):
        """
        With default parameters, transform() on the fitted table must yield
        the same features as fit() did.
        """
        fate_scaler = StandardScale(self.get_scale_param())
        fitted_table = fate_scaler.fit(self.table_instance)
        transformed_table = fate_scaler.transform(self.table_instance)

        transformed_features = self.get_table_instance_feature(transformed_table)
        fitted_features = self.get_table_instance_feature(fitted_table)
        self.assertListEqual(transformed_features, fitted_features)
Пример #6
0
    def test_fit10(self):
        """
        Fit in "cap" mode on columns 1, 2 and 4 with area "col".

        The selected columns of ``self.test_data`` are capped in place to the
        hard-coded per-column limits below, then scaled with the SSL reference
        scaler (presumably scikit-learn's StandardScaler -- confirm against
        the imports). Unselected columns are expected to keep their raw
        values. Also checks the recorded column ranges, mean and std, and that
        transform() reproduces the fit() output.
        """
        picked_cols = [1, 2, 4]

        param = self.get_scale_param()
        # This empty list is overwritten a few lines below; kept as in the
        # original setup sequence.
        param.scale_column_idx = []
        param.feat_upper = 0.8
        param.feat_lower = 0.2
        param.with_mean = True
        param.with_std = True
        param.mode = "cap"
        param.scale_column_idx = picked_cols
        param.area = "col"

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std
        column_max_value = fate_scaler.column_max_value
        column_min_value = fate_scaler.column_min_value

        expected_lower = [0, 2, 2, 2, 3, 1]
        expected_upper = [1, 8, 8, 8, 7, 8]
        # Keep an untouched copy: the fixture itself is capped in place below.
        untouched = copy.deepcopy(self.test_data)
        for row in self.test_data:
            for col_no, value in enumerate(row):
                if col_no not in picked_cols:
                    continue
                if value > expected_upper[col_no]:
                    row[col_no] = expected_upper[col_no]
                elif value < expected_lower[col_no]:
                    row[col_no] = expected_lower[col_no]

        reference = SSL(with_mean=True, with_std=True)
        reference.fit(self.test_data)
        expected = np.around(reference.transform(self.test_data), 4).tolist()

        # Unselected columns are compared against the raw, uncapped values.
        for row_no, row in enumerate(expected):
            for col_no in range(len(row)):
                if col_no not in picked_cols:
                    row[col_no] = untouched[row_no][col_no]

        self.assertListEqual(self.get_table_instance_feature(fitted_table),
                             expected)
        self.assertEqual(column_max_value, expected_upper)
        self.assertEqual(column_min_value, expected_lower)

        self.assertListEqual(list(np.around(mean, 6)),
                             list(np.around(reference.mean_, 6)))
        self.assertListEqual(list(np.around(std, 6)),
                             list(np.around(reference.scale_, 6)))

        transformed_table = fate_scaler.transform(self.table_instance)
        self.assertListEqual(
            self.get_table_instance_feature(fitted_table),
            self.get_table_instance_feature(transformed_table))
Пример #7
0
    def test_transform6(self):
        """
        With centering and std scaling disabled and an empty
        scale_column_idx, transform() must yield the same features as fit().
        """
        param = self.get_scale_param()
        param.with_mean = False
        param.with_std = False
        param.scale_column_idx = []

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        transformed_table = fate_scaler.transform(self.table_instance)

        transformed_features = self.get_table_instance_feature(transformed_table)
        fitted_features = self.get_table_instance_feature(fitted_table)
        self.assertListEqual(transformed_features, fitted_features)
Пример #8
0
    def test_fit9(self):
        """
        Fit on columns 1, 2 and 4 (area "col") with per-column bounds [1, 2].

        The selected columns of ``self.test_data`` are clipped in place into
        [1, 2], then scaled with the SSL reference scaler (presumably
        scikit-learn's StandardScaler -- confirm against the imports).
        Unselected columns are expected to keep their raw values. Also checks
        mean, std, the recorded column ranges, and that transform() reproduces
        the fit() output.
        """
        picked_cols = [1, 2, 4]

        param = self.get_scale_param()
        # This empty list is overwritten a few lines below; kept as in the
        # original setup sequence.
        param.scale_column_idx = []
        param.feat_upper = [2, 2, 2, 2, 2, 2]
        param.feat_lower = [1, 1, 1, 1, 1, 1]
        param.with_mean = True
        param.with_std = True
        param.scale_column_idx = picked_cols
        param.area = "col"

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std
        column_max_value = fate_scaler.column_max_value
        column_min_value = fate_scaler.column_min_value

        # Keep an untouched copy: the fixture itself is clipped in place below.
        untouched = copy.deepcopy(self.test_data)
        for row in self.test_data:
            for col_no, value in enumerate(row):
                if col_no not in picked_cols:
                    continue
                if value > 2:
                    row[col_no] = 2
                elif value < 1:
                    row[col_no] = 1

        reference = SSL(with_mean=True, with_std=True)
        reference.fit(self.test_data)
        expected = np.around(reference.transform(self.test_data), 4).tolist()

        # Unselected columns are compared against the raw, unclipped values.
        for row_no, row in enumerate(expected):
            for col_no in range(len(row)):
                if col_no not in picked_cols:
                    row[col_no] = untouched[row_no][col_no]

        self.assertListEqual(self.get_table_instance_feature(fitted_table),
                             expected)
        self.assertListEqual(list(np.around(mean, 6)),
                             list(np.around(reference.mean_, 6)))
        self.assertListEqual(list(np.around(std, 6)),
                             list(np.around(reference.scale_, 6)))
        self.assertEqual(column_max_value, [1, 2, 2, 10, 2, 10])
        self.assertEqual(column_min_value, [0, 1, 1, 2, 2, -100])

        transformed_table = fate_scaler.transform(self.table_instance)
        self.assertListEqual(
            self.get_table_instance_feature(fitted_table),
            self.get_table_instance_feature(transformed_table))
Пример #9
0
    def test_fit1(self):
        """
        Fit with default parameters and compare the output, mean and std
        against the SSL reference scaler (presumably scikit-learn's
        StandardScaler -- confirm against the imports), rounded to 4 decimals.
        """
        fate_scaler = StandardScale(self.get_scale_param())
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std

        reference = SSL()
        reference.fit(self.test_data)

        actual_features = self.get_table_instance_feature(fitted_table)
        expected_features = np.around(
            reference.transform(self.test_data), 4).tolist()
        self.assertListEqual(actual_features, expected_features)

        self.assertListEqual(list(np.around(mean, 4)),
                             list(np.around(reference.mean_, 4)))
        self.assertListEqual(list(np.around(std, 4)),
                             list(np.around(reference.scale_, 4)))
Пример #10
0
    def transform(self, data, fit_config=None):
        """
        Scale the input data with a freshly built scaler that is primed with
        the fit results stored on this object.

        The scaler class is selected from ``self.model_param.method``. When the
        method is neither MinMaxScale nor StandardScale, a message is logged
        and ``self.scale_obj`` is left as it was before the call.

        Parameters
        ----------
        data: data_instance, input data
        fit_config: list, the fit results information of scale
            (not read anywhere in this method)

        Returns
        ----------
        transform_data: data_instance, the transformed data (carrying the
            input schema), or the input data itself when no scaler object is
            available
        """
        LOGGER.info("Start scale data transform ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
            # Only the standard scaler is handed the stored mean / std.
            self.scale_obj.set_param(self.mean, self.std)
        else:
            LOGGER.info(
                "DataTransform method is {}, do nothing and return!".format(
                    method))

        scaler = self.scale_obj
        # Without a scaler there is nothing to apply: return the input as is.
        if not scaler:
            LOGGER.info("End transform data.")
            return data

        # Copy the stored header, selected columns and column ranges over.
        scaler.header = self.header
        scaler.scale_column_idx = self.scale_column_idx
        scaler.set_column_range(self.column_max_value, self.column_min_value)

        scaled = scaler.transform(data)
        scaled.schema = data.schema

        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=MetricMeta(
                name="scale",
                metric_type="SCALE",
                extra_metas={"method": self.model_param.method},
            ),
        )

        LOGGER.info("End transform data.")
        return scaled
Пример #11
0
    def test_fit4(self):
        """
        Fit with centering and std scaling both disabled.

        The output must match the SSL reference scaler (presumably
        scikit-learn's StandardScaler -- confirm against the imports) built
        with the same flags, and mean / std must be all 0 / all 1.
        """
        param = self.get_scale_param()
        param.with_std = False
        param.with_mean = False

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std

        reference = SSL(with_mean=False, with_std=False)
        reference.fit(self.test_data)

        actual_features = self.get_table_instance_feature(fitted_table)
        expected_features = np.around(
            reference.transform(self.test_data), 4).tolist()
        self.assertListEqual(actual_features, expected_features)

        width = len(self.test_data[0])
        self.assertEqual(mean, [0] * width)
        self.assertEqual(std, [1] * width)
Пример #12
0
    def test_fit6(self):
        """
        Fit with area "col" and an empty scale_column_idx.

        The fitted features are expected to equal the (rounded) raw test data,
        while mean and std are still compared against the SSL reference scaler
        (presumably scikit-learn's StandardScaler -- confirm against the
        imports).
        """
        param = self.get_scale_param()
        param.scale_column_idx = []
        param.area = "col"

        fate_scaler = StandardScale(param)
        fitted_table = fate_scaler.fit(self.table_instance)
        mean = fate_scaler.mean
        std = fate_scaler.std

        reference = SSL()
        reference.fit(self.test_data)

        actual_features = self.get_table_instance_feature(fitted_table)
        raw_features = np.around(self.test_data, 4).tolist()
        self.assertListEqual(actual_features, raw_features)

        self.assertListEqual(list(np.around(mean, 4)),
                             list(np.around(reference.mean_, 4)))
        self.assertListEqual(list(np.around(std, 4)),
                             list(np.around(reference.scale_, 4)))
Пример #13
0
class Scale(ModelBase):
    """
    Data-scaling model component. MinMaxScale and StandardScale are the two
    supported scalers; which one is used is decided by ``model_param.method``.
    """
    def __init__(self):
        super().__init__()
        self.model_name = None
        # Keys under which load_model looks up the param / meta objects.
        self.model_param_name = 'ScaleParam'
        self.model_meta_name = 'ScaleMeta'
        self.model_param = ScaleParam()

        self.scale_param_obj = None
        # The concrete scaler; created by fit / transform / export_model.
        self.scale_obj = None

        # Fit results; populated by load_model and read by transform.
        self.header = None
        self.column_max_value = None
        self.column_min_value = None
        self.mean = None
        self.std = None
        self.scale_column_idx = None

    def fit(self, data):
        """
        Fit the configured scaler on the input data and return the scaled data.

        When the configured method is neither MinMaxScale nor StandardScale, a
        warning is logged and ``self.scale_obj`` is left as it was before the
        call. After a successful fit the scaler's model summary is stored via
        ``set_summary``.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        fit_data: data_instance, the scaled data (carrying the input schema),
            or the input data itself when no scaler object is available
        """
        LOGGER.info("Start scale data fit ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
        else:
            LOGGER.warning("Scale method is {}, do nothing and return!".format(
                method))

        # Without a scaler there is nothing to fit: hand the input back.
        if not self.scale_obj:
            LOGGER.info("End fit data ...")
            return data

        scaled = self.scale_obj.fit(data)
        scaled.schema = data.schema

        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=MetricMeta(
                name="scale",
                metric_type="SCALE",
                extra_metas={"method": self.model_param.method},
            ),
        )

        LOGGER.info("start to get model summary ...")
        self.set_summary(self.scale_obj.get_model_summary())
        LOGGER.info("Finish getting model summary.")

        LOGGER.info("End fit data ...")
        return scaled

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data, fit_config=None):
        """
        Scale the input data with a freshly built scaler that is primed with
        the fit results stored on this object.

        When the configured method is neither MinMaxScale nor StandardScale, a
        message is logged and ``self.scale_obj`` is left as it was before the
        call.

        Parameters
        ----------
        data: data_instance, input data
        fit_config: list, the fit results information of scale
            (not read anywhere in this method)

        Returns
        ----------
        transform_data: data_instance, the transformed data (carrying the
            input schema), or the input data itself when no scaler object is
            available
        """
        LOGGER.info("Start scale data transform ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
            # Only the standard scaler is handed the stored mean / std.
            self.scale_obj.set_param(self.mean, self.std)
        else:
            LOGGER.info(
                "DataTransform method is {}, do nothing and return!".format(
                    method))

        scaler = self.scale_obj
        # Without a scaler there is nothing to apply: return the input as is.
        if not scaler:
            LOGGER.info("End transform data.")
            return data

        # Copy the stored header, selected columns and column ranges over.
        scaler.header = self.header
        scaler.scale_column_idx = self.scale_column_idx
        scaler.set_column_range(self.column_max_value, self.column_min_value)

        scaled = scaler.transform(data)
        scaled.schema = data.schema

        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=MetricMeta(
                name="scale",
                metric_type="SCALE",
                extra_metas={"method": self.model_param.method},
            ),
        )

        LOGGER.info("End transform data.")
        return scaled

    def load_model(self, model_dict):
        """
        Restore header, need_run flag and per-column fit results from a
        serialized model.

        Parameters
        ----------
        model_dict: dict-like; the first value under its 'model' entry is
            queried for the param object (``self.model_param_name``) and the
            meta object (``self.model_meta_name``)
        """
        first_entry = list(model_dict.get('model').values())[0]
        model_obj = first_entry.get(self.model_param_name)
        meta_obj = first_entry.get(self.model_meta_name)

        self.header = list(model_obj.header)
        self.need_run = meta_obj.need_run

        # Defaults for columns that have no stored scale parameters:
        # range 0..0, mean 0, std 1.
        width = len(self.header)
        self.column_max_value = [0] * width
        self.column_min_value = [0] * width
        self.mean = [0] * width
        self.std = [1] * width
        self.scale_column_idx = []

        for col_name, col_param in dict(model_obj.col_scale_param).items():
            # Map the stored column name back to its position in the header.
            position = self.header.index(col_name)
            self.scale_column_idx.append(position)

            self.column_max_value[position] = col_param.column_upper
            self.column_min_value[position] = col_param.column_lower
            self.mean[position] = col_param.mean
            self.std[position] = col_param.std

        self.scale_column_idx.sort()

    def export_model(self):
        """
        Export the model held by the current scaler object.

        If no scaler object exists yet, one is created first: MinMaxScale when
        the configured method is MinMaxScale, StandardScale for any other
        method.

        Returns
        ----------
        Whatever ``self.scale_obj.export_model(self.need_run)`` returns.
        """
        if not self.scale_obj:
            use_min_max = self.model_param.method == consts.MINMAXSCALE
            scaler_cls = MinMaxScale if use_min_max else StandardScale
            self.scale_obj = scaler_cls(self.model_param)

        return self.scale_obj.export_model(self.need_run)