def test_cols_select_fit_and_transform_repeat(self):
    """Check fit/transform when ``scale_column_idx`` contains repeated indexes.

    The reference result is built with ``SSL`` (presumably sklearn's
    StandardScaler, imported outside this view): every column is scaled by
    the reference scaler, then columns that StandardScale did not select
    are restored to their raw values.
    """
    param = self.get_scale_param()
    param.scale_column_idx = [1, 1, 2, 2, 4, 5, 5]

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    # Use the index list as held by the scaler after fit.
    selected = standard_scaler.scale_column_idx

    reference = SSL(with_mean=True, with_std=True)
    reference.fit(self.test_data)
    reference_rows = np.around(reference.transform(self.test_data), 4).tolist()

    # Scaled value for selected columns, raw value for everything else.
    expected = [
        [
            scaled if col in selected else raw
            for col, (scaled, raw) in enumerate(zip(scaled_row, raw_row))
        ]
        for scaled_row, raw_row in zip(reference_rows, self.test_data)
    ]

    self.assertListEqual(self.get_table_instance_feature(fitted), expected)

    transformed = standard_scaler.transform(self.table_instance)
    self.assertListEqual(
        self.get_table_instance_feature(transformed), expected)
def test_fit8(self):
    """Check fit with per-column bounds and both mean and std disabled.

    With ``with_mean`` and ``with_std`` off and an empty
    ``scale_column_idx``, the expected output is the raw data clamped to
    the range [1, 2]; mean is expected to be all zeros and std all ones.
    """
    param = self.get_scale_param()
    param.scale_column_idx = []
    param.feat_upper = [2, 2, 2, 2, 2, 2]
    param.feat_lower = [1, 1, 1, 1, 1, 1]
    param.with_mean = False
    param.with_std = False

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std
    column_max_value = standard_scaler.column_max_value
    column_min_value = standard_scaler.column_min_value

    # Clamp the fixture in place so it becomes the expected output.
    for row in self.test_data:
        for col, value in enumerate(row):
            if value > 2:
                row[col] = 2
            elif value < 1:
                row[col] = 1

    n_cols = len(self.test_data[0])
    self.assertListEqual(self.get_table_instance_feature(fitted),
                         np.around(self.test_data, 4).tolist())
    self.assertEqual(mean, [0] * n_cols)
    self.assertEqual(std, [1] * n_cols)
    self.assertEqual(column_max_value, [1, 2, 2, 2, 2, 2])
    self.assertEqual(column_min_value, [1, 1, 1, 2, 2, 1])
def test_transform1(self):
    """Check that transform reproduces the fit output with default params."""
    standard_scaler = StandardScale(self.get_scale_param())
    fitted = standard_scaler.fit(self.table_instance)
    transformed = standard_scaler.transform(self.table_instance)

    transformed_features = self.get_table_instance_feature(transformed)
    fitted_features = self.get_table_instance_feature(fitted)
    self.assertListEqual(transformed_features, fitted_features)
def test_fit10(self):
    """Check fit in "cap" mode restricted to columns 1, 2 and 4.

    ``feat_upper`` / ``feat_lower`` are set to 0.8 / 0.2 and the expected
    per-column caps are hard-coded below. The reference result clamps the
    selected columns to those caps, scales with ``SSL`` (presumably
    sklearn's StandardScaler, imported outside this view), and restores
    the unselected columns to their raw values.
    """
    scale_column_idx = [1, 2, 4]
    param = self.get_scale_param()
    # Applied in order. The first scale_column_idx value is overwritten
    # further down; it is kept so the setup sequence is unchanged.
    settings = (
        ("scale_column_idx", []),
        ("feat_upper", 0.8),
        ("feat_lower", 0.2),
        ("with_mean", True),
        ("with_std", True),
        ("mode", "cap"),
        ("scale_column_idx", scale_column_idx),
        ("area", "col"),
    )
    for attr, value in settings:
        setattr(param, attr, value)

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std
    column_max_value = standard_scaler.column_max_value
    column_min_value = standard_scaler.column_min_value

    gt_cap_lower_list = [0, 2, 2, 2, 3, 1]
    gt_cap_upper_list = [1, 8, 8, 8, 7, 8]

    # Keep an untouched copy: the fixture is clamped in place below.
    raw_data = copy.deepcopy(self.test_data)
    for row in self.test_data:
        for col, value in enumerate(row):
            if col not in scale_column_idx:
                continue
            if value > gt_cap_upper_list[col]:
                row[col] = gt_cap_upper_list[col]
            elif value < gt_cap_lower_list[col]:
                row[col] = gt_cap_lower_list[col]

    reference = SSL(with_mean=True, with_std=True)
    reference.fit(self.test_data)
    reference_rows = np.around(reference.transform(self.test_data), 4).tolist()

    # Scaled value for selected columns, raw value for everything else.
    expected = [
        [
            scaled if col in scale_column_idx else raw
            for col, (scaled, raw) in enumerate(zip(scaled_row, raw_row))
        ]
        for scaled_row, raw_row in zip(reference_rows, raw_data)
    ]

    self.assertListEqual(self.get_table_instance_feature(fitted), expected)
    self.assertEqual(column_max_value, gt_cap_upper_list)
    self.assertEqual(column_min_value, gt_cap_lower_list)
    self.assertListEqual(list(np.around(mean, 6)),
                         list(np.around(reference.mean_, 6)))
    self.assertListEqual(list(np.around(std, 6)),
                         list(np.around(reference.scale_, 6)))

    raw_data_transform = standard_scaler.transform(self.table_instance)
    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        self.get_table_instance_feature(raw_data_transform))
def test_transform6(self):
    """Check that transform reproduces the fit output with scaling disabled.

    Mean and std are both turned off and no columns are selected.
    """
    param = self.get_scale_param()
    param.with_mean = False
    param.with_std = False
    param.scale_column_idx = []

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    transformed = standard_scaler.transform(self.table_instance)

    transformed_features = self.get_table_instance_feature(transformed)
    fitted_features = self.get_table_instance_feature(fitted)
    self.assertListEqual(transformed_features, fitted_features)
def test_fit9(self):
    """Check fit with per-column bounds restricted to columns 1, 2 and 4.

    The reference result clamps the selected columns to [1, 2], scales
    with ``SSL`` (presumably sklearn's StandardScaler, imported outside
    this view), and restores the unselected columns to their raw values.
    """
    scale_column_idx = [1, 2, 4]
    param = self.get_scale_param()
    # Applied in order. The first scale_column_idx value is overwritten
    # further down; it is kept so the setup sequence is unchanged.
    settings = (
        ("scale_column_idx", []),
        ("feat_upper", [2, 2, 2, 2, 2, 2]),
        ("feat_lower", [1, 1, 1, 1, 1, 1]),
        ("with_mean", True),
        ("with_std", True),
        ("scale_column_idx", scale_column_idx),
        ("area", "col"),
    )
    for attr, value in settings:
        setattr(param, attr, value)

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std
    column_max_value = standard_scaler.column_max_value
    column_min_value = standard_scaler.column_min_value

    # Keep an untouched copy: the fixture is clamped in place below.
    raw_data = copy.deepcopy(self.test_data)
    for row in self.test_data:
        for col, value in enumerate(row):
            if col not in scale_column_idx:
                continue
            if value > 2:
                row[col] = 2
            elif value < 1:
                row[col] = 1

    reference = SSL(with_mean=True, with_std=True)
    reference.fit(self.test_data)
    reference_rows = np.around(reference.transform(self.test_data), 4).tolist()

    # Scaled value for selected columns, raw value for everything else.
    expected = [
        [
            scaled if col in scale_column_idx else raw
            for col, (scaled, raw) in enumerate(zip(scaled_row, raw_row))
        ]
        for scaled_row, raw_row in zip(reference_rows, raw_data)
    ]

    self.assertListEqual(self.get_table_instance_feature(fitted), expected)
    self.assertListEqual(list(np.around(mean, 6)),
                         list(np.around(reference.mean_, 6)))
    self.assertListEqual(list(np.around(std, 6)),
                         list(np.around(reference.scale_, 6)))
    self.assertEqual(column_max_value, [1, 2, 2, 10, 2, 10])
    self.assertEqual(column_min_value, [0, 1, 1, 2, 2, -100])

    raw_data_transform = standard_scaler.transform(self.table_instance)
    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        self.get_table_instance_feature(raw_data_transform))
def test_fit1(self):
    """Check default-parameter fit against the reference scaler ``SSL``.

    Compares the scaled features (rounded to 4 decimals by the reference
    side) and the fitted mean / std.
    """
    standard_scaler = StandardScale(self.get_scale_param())
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std

    reference = SSL()
    reference.fit(self.test_data)
    expected = np.around(reference.transform(self.test_data), 4).tolist()

    self.assertListEqual(self.get_table_instance_feature(fitted), expected)
    self.assertListEqual(list(np.around(mean, 4)),
                         list(np.around(reference.mean_, 4)))
    self.assertListEqual(list(np.around(std, 4)),
                         list(np.around(reference.scale_, 4)))
def test_fit4(self):
    """Check fit with both mean-centering and std-scaling disabled.

    The output is compared with the reference scaler ``SSL`` configured
    the same way; mean is expected to be all zeros and std all ones.
    """
    param = self.get_scale_param()
    param.with_std = False
    param.with_mean = False

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std

    reference = SSL(with_mean=False, with_std=False)
    reference.fit(self.test_data)
    expected = np.around(reference.transform(self.test_data), 4).tolist()

    n_cols = len(self.test_data[0])
    self.assertListEqual(self.get_table_instance_feature(fitted), expected)
    self.assertEqual(mean, [0] * n_cols)
    self.assertEqual(std, [1] * n_cols)
def test_fit6(self):
    """Check fit with ``area="col"`` and an empty column selection.

    The features are expected to come back unscaled, while the fitted
    mean / std are still expected to match the reference scaler ``SSL``.
    """
    param = self.get_scale_param()
    param.scale_column_idx = []
    param.area = "col"

    standard_scaler = StandardScale(param)
    fitted = standard_scaler.fit(self.table_instance)
    mean = standard_scaler.mean
    std = standard_scaler.std

    reference = SSL()
    reference.fit(self.test_data)

    unscaled = np.around(self.test_data, 4).tolist()
    self.assertListEqual(self.get_table_instance_feature(fitted), unscaled)
    self.assertListEqual(list(np.around(mean, 4)),
                         list(np.around(reference.mean_, 4)))
    self.assertListEqual(list(np.around(std, 4)),
                         list(np.around(reference.scale_, 4)))
class Scale(ModelBase):
    """
    Model component that scales the features of a data instance table.

    The concrete scaler is chosen from ``model_param.method``; MinMaxScale
    and StandardScale are the two supported implementations.
    """

    def __init__(self):
        super().__init__()
        # Model identity / parameter wiring.
        self.model_name = None
        self.model_param_name = 'ScaleParam'
        self.model_meta_name = 'ScaleMeta'
        self.model_param = ScaleParam()
        self.scale_param_obj = None
        # Concrete scaler; created in fit / transform / export_model.
        self.scale_obj = None
        # Fitted state, filled in by load_model and handed to the scaler
        # in transform.
        self.header = None
        self.column_max_value = None
        self.column_min_value = None
        self.mean = None
        self.std = None
        self.scale_column_idx = None

    def fit(self, data):
        """
        Fit the configured scaler on the input data and return the scaled data.

        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        fit_data: data_instance, data after scale; the input itself is
            returned unchanged when no scaler object is available
        """
        LOGGER.info("Start scale data fit ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
        else:
            LOGGER.warning(
                "Scale method is {}, do nothing and return!".format(method))

        if not self.scale_obj:
            # No scaler available: pass the data through untouched.
            LOGGER.info("End fit data ...")
            return data

        fit_data = self.scale_obj.fit(data)
        fit_data.schema = data.schema

        scale_meta = MetricMeta(
            name="scale",
            metric_type="SCALE",
            extra_metas={"method": method})
        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=scale_meta)

        LOGGER.info("start to get model summary ...")
        self.set_summary(self.scale_obj.get_model_summary())
        LOGGER.info("Finish getting model summary.")

        LOGGER.info("End fit data ...")
        return fit_data

    @assert_io_num_rows_equal
    @assert_schema_consistent
    def transform(self, data, fit_config=None):
        """
        Scale the input data using the fitted state stored on this object.

        A fresh scaler is built for the configured method and given this
        object's header, column indexes, column ranges and (for
        StandardScale) mean / std before transforming.

        Parameters
        ----------
        data: data_instance, input data
        fit_config: list, the fit results information of scale; not read
            by this method

        Returns
        ----------
        transform_data: data_instance, data after transform; the input
            itself is returned unchanged when no scaler object is available
        """
        LOGGER.info("Start scale data transform ...")

        method = self.model_param.method
        if method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
            self.scale_obj.set_param(self.mean, self.std)
        else:
            LOGGER.info(
                "DataTransform method is {}, do nothing and return!".format(
                    method))

        if not self.scale_obj:
            # No scaler available: pass the data through untouched.
            LOGGER.info("End transform data.")
            return data

        # Hand the stored fit state to the scaler before using it.
        scaler = self.scale_obj
        scaler.header = self.header
        scaler.scale_column_idx = self.scale_column_idx
        scaler.set_column_range(self.column_max_value, self.column_min_value)

        transform_data = scaler.transform(data)
        transform_data.schema = data.schema

        scale_meta = MetricMeta(
            name="scale",
            metric_type="SCALE",
            extra_metas={"method": method})
        self.callback_meta(
            metric_name="scale",
            metric_namespace="train",
            metric_meta=scale_meta)

        LOGGER.info("End transform data.")
        return transform_data

    def load_model(self, model_dict):
        """
        Restore header, run flag and per-column scale statistics.

        Parameters
        ----------
        model_dict: dict, its 'model' entry is a mapping whose first value
            holds the param and meta objects under ``model_param_name`` and
            ``model_meta_name``
        """
        payload = list(model_dict.get('model').values())[0]
        model_obj = payload.get(self.model_param_name)
        meta_obj = payload.get(self.model_meta_name)

        self.header = list(model_obj.header)
        self.need_run = meta_obj.need_run

        # Defaults for columns without a stored scale param:
        # range 0..0, mean 0, std 1.
        n_cols = len(self.header)
        self.column_max_value = [0] * n_cols
        self.column_min_value = [0] * n_cols
        self.mean = [0] * n_cols
        self.std = [1] * n_cols
        self.scale_column_idx = []

        for col_name, col_param in dict(model_obj.col_scale_param).items():
            position = self.header.index(col_name)
            self.scale_column_idx.append(position)
            self.column_max_value[position] = col_param.column_upper
            self.column_min_value[position] = col_param.column_lower
            self.mean[position] = col_param.mean
            self.std[position] = col_param.std

        self.scale_column_idx.sort()

    def export_model(self):
        """
        Export the model through the scaler object.

        When no scaler exists yet, one is created first: MinMaxScale for
        the min-max method, StandardScale for any other method.
        """
        if not self.scale_obj:
            use_min_max = self.model_param.method == consts.MINMAXSCALE
            scaler_cls = MinMaxScale if use_min_max else StandardScale
            self.scale_obj = scaler_cls(self.model_param)

        return self.scale_obj.export_model(self.need_run)