def test_fit_col_feat(self):
    """Check column-wise min-max fit with per-column feature bounds.

    The expected result is built by clipping every value of
    ``self.test_data`` into its column's [feat_lower, feat_upper] interval,
    scaling the clipped rows with the reference scaler (MMS) into
    (out_lower, out_upper) and rounding to 4 decimals.
    """
    feat_upper = [0.8, 5, 6, 7, 8, 9]
    feat_lower = [0.5, 2, 3, 3, 4, 4.0]
    out_upper = 2.0
    out_lower = -1.0

    min_max_scaler = MinMaxScaler(mode='normal', area='col',
                                  feat_upper=feat_upper, feat_lower=feat_lower,
                                  out_upper=out_upper, out_lower=out_lower)
    fit_data, _ = min_max_scaler.fit(self.table_instance)

    def clip(value, low, high):
        # Values outside the interval are replaced by the nearest bound.
        if value > high:
            return high
        if value < low:
            return low
        return value

    clipped_rows = [
        [clip(value, feat_lower[col], feat_upper[col]) for col, value in enumerate(row)]
        for row in self.test_data
    ]

    reference = MMS((out_lower, out_upper))
    reference.fit(clipped_rows)

    actual = self.get_table_instance_feature(fit_data)
    expected = np.around(reference.transform(clipped_rows), 4).tolist()
    self.assertListEqual(actual, expected)
def fit(self, data):
    """
    Apply the configured scale method to the input data

    Parameters
    ----------
    data: data_instance, input data; its schema 'header' entry is saved on
          self.header and written back onto the returned data

    Returns
    ----------
    data: data_instance, data after scale (unchanged if the method is not supported)
    scale_value_results: list, [cols_scale_value] for min-max scale,
                         [mean, std] for standard scale, [] otherwise
    """
    LOGGER.info("Start scale data fit ...")

    param = self.scale_param
    fit_results = []
    self.header = data.schema.get('header')

    if param.method == consts.MINMAXSCALE:
        scaler = MinMaxScaler(mode=param.mode, area=param.area,
                              feat_upper=param.feat_upper, feat_lower=param.feat_lower,
                              out_upper=param.out_upper, out_lower=param.out_lower)
        data, cols_scale_value = scaler.fit(data)
        fit_results.append(cols_scale_value)
        self.cols_scale_value = cols_scale_value
    elif param.method == consts.STANDARDSCALE:
        scaler = StandardScaler(with_mean=param.with_mean, with_std=param.with_std)
        data, mean, std = scaler.fit(data)
        fit_results.extend((mean, std))
        self.mean = mean
        self.std = std
    else:
        LOGGER.info("Scale method is {}, do nothing and return!".format(param.method))

    # Put the header captured before fitting back onto the returned data.
    data.schema['header'] = self.header
    LOGGER.info("End fit data ...")

    return data, fit_results
def test_cols_select_fit_and_transform_repeat(self):
    """Check fit and transform when the selected column indices contain repeats.

    Only the columns listed in ``scale_column_idx`` are expected to be
    scaled; every other column must keep its value from ``self.test_data``.
    The same expectation is checked for both ``fit`` and ``transform``.
    """
    scale_column_idx = [1, 1, 2, 2, 4, 5, 5]
    min_max_scaler = MinMaxScaler(mode='normal', area='col',
                                  scale_column_idx=scale_column_idx,
                                  feat_upper=None, feat_lower=None,
                                  out_upper=None, out_lower=None)
    fit_data, cols_transform_value = min_max_scaler.fit(self.table_instance)

    reference = MMS()
    reference.fit(self.test_data)
    scaled_rows = np.around(reference.transform(self.test_data), 4).tolist()

    # Keep the reference-scaled value for selected columns, the raw value otherwise.
    expected = [
        [scaled if col in scale_column_idx else self.test_data[row_no][col]
         for col, scaled in enumerate(scaled_row)]
        for row_no, scaled_row in enumerate(scaled_rows)
    ]

    self.assertListEqual(self.get_table_instance_feature(fit_data), expected)

    transform_data = min_max_scaler.transform(self.table_instance, cols_transform_value)
    self.assertListEqual(self.get_table_instance_feature(transform_data), expected)
def transform(self, data, fit_config):
    """
    Transform input data using scale with fit results

    Parameters
    ----------
    data: data_instance, input data
    fit_config: list, the fit results information of scale. For standard
                scale it is read as [mean, scale, scale_column_idx]; for
                min-max scale it is passed to the scaler as a whole.

    Returns
    ----------
    transform_data: data_instance, data after transform. The input data is
                    returned untouched when fit_config is empty.
    """
    LOGGER.info("Start scale data transform ...")

    self.header = data.schema.get('header')
    if len(fit_config) == 0:
        LOGGER.warning("length fit_config is 0, can not do transform, do nothing and return")
        # Actually return here: falling through would index into the empty
        # fit_config (IndexError for standard scale) despite the warning.
        return data

    if self.scale_param.method == consts.MINMAXSCALE:
        min_max_scaler = MinMaxScaler()
        data = min_max_scaler.transform(data, fit_config)
    elif self.scale_param.method == consts.STANDARDSCALE:
        standard_scaler = StandardScaler()
        data = standard_scaler.transform(data, mean=fit_config[0], scale=fit_config[1],
                                         scale_column_idx=fit_config[2])
    else:
        LOGGER.info("DataTransform method is {}, do nothing and return!".format(self.scale_param.method))

    # Put the header captured before transforming back onto the returned data.
    data.schema['header'] = self.header
    LOGGER.info("End transform data ...")

    return data
def test_fit_instance_default(self):
    """Check area='all' fit with no feature bounds and no output range given.

    Both the fitted features and the returned per-column fit values are
    compared with what the reference scaler (MMS) yields with its defaults.
    """
    min_max_scaler = MinMaxScaler(mode='normal', area='all',
                                  feat_upper=None, feat_lower=None,
                                  out_upper=None, out_lower=None)
    fit_instance, cols_transform_value = min_max_scaler.fit(self.table_instance)

    reference = MMS()
    reference.fit(self.test_data)

    actual_features = self.get_table_instance_feature(fit_instance)
    expected_features = np.around(reference.transform(self.test_data), 4).tolist()
    self.assertListEqual(actual_features, expected_features)

    expected_fit_values = self.sklearn_attribute_format(reference, [0, 1])
    self.assertListEqual(cols_transform_value, expected_fit_values)
def test_fit_col_out(self):
    """Check column-wise fit with a custom output range of (-1, 2).

    Fitted features and the returned fit values must match the reference
    scaler (MMS) configured with the same feature range.
    """
    feature_range = (-1, 2)
    min_max_scaler = MinMaxScaler(mode='normal', area='col',
                                  feat_upper=None, feat_lower=None,
                                  out_upper=2, out_lower=-1)
    fit_data, cols_transform_value = min_max_scaler.fit(self.table_instance)

    reference = MMS(feature_range)
    reference.fit(self.test_data)

    actual_features = self.get_table_instance_feature(fit_data)
    expected_features = np.around(reference.transform(self.test_data), 4).tolist()
    self.assertListEqual(actual_features, expected_features)

    expected_fit_values = self.sklearn_attribute_format(reference, feature_range)
    self.assertListEqual(cols_transform_value, expected_fit_values)
def test_transform_all(self):
    """Check that transform with the fit results reproduces the fit output.

    Uses area='all' with feature bounds [3, 8] and output range [-1, 2].
    """
    scaler_kwargs = dict(mode='normal', area='all',
                         feat_upper=8, feat_lower=3,
                         out_upper=2, out_lower=-1)
    min_max_scaler = MinMaxScaler(**scaler_kwargs)

    fit_data, cols_transform_value = min_max_scaler.fit(self.table_instance)
    transform_data = min_max_scaler.transform(self.table_instance, cols_transform_value)

    fitted_features = self.get_table_instance_feature(fit_data)
    transformed_features = self.get_table_instance_feature(transform_data)
    self.assertListEqual(fitted_features, transformed_features)
def test_fit_feat(self):
    """Check area='all' fit with scalar feature bounds and a custom output range.

    Expected features come from clipping ``self.test_data`` into
    [feat_lower, feat_upper] and scaling the clipped rows with the reference
    scaler (MMS). The expected per-column fit values are the reference
    scaler's attributes with positions 0 and 1 forced to the configured
    feature bounds.
    """
    feat_upper = 8
    feat_lower = 3
    out_upper = 2
    out_lower = -1

    min_max_scaler = MinMaxScaler(mode='normal', area='all',
                                  feat_upper=feat_upper, feat_lower=feat_lower,
                                  out_upper=out_upper, out_lower=out_lower)
    fit_data, cols_transform_value = min_max_scaler.fit(self.table_instance)

    def clip(value):
        # Values outside [feat_lower, feat_upper] are replaced by the nearest bound.
        if value > feat_upper:
            return feat_upper
        if value < feat_lower:
            return feat_lower
        return value

    clipped_rows = [[clip(value) for value in row] for row in self.test_data]

    reference = MMS((out_lower, out_upper))
    reference.fit(clipped_rows)

    actual_features = self.get_table_instance_feature(fit_data)
    expected_features = np.around(reference.transform(clipped_rows), 4).tolist()
    self.assertListEqual(actual_features, expected_features)

    expected_cols = self.sklearn_attribute_format(reference, [out_lower, out_upper])

    def replace_at(row, position, bound):
        patched = list(row)
        patched[position] = bound
        return tuple(patched)

    # Presumably the scaler under test reports the configured bounds rather
    # than the observed min/max of the clipped data - confirm against
    # MinMaxScaler.fit. Rows already carrying the bound are left untouched.
    for idx in range(len(expected_cols)):
        for position, bound in ((0, feat_lower), (1, feat_upper)):
            if expected_cols[idx][position] != bound:
                expected_cols[idx] = replace_at(expected_cols[idx], position, bound)

    self.assertListEqual(cols_transform_value[0], expected_cols)
def transform(self, data, fit_config):
    """
    Transform input data using scale with fit results

    Parameters
    ----------
    data: data_instance, input data
    fit_config: list, the fit results information of scale. For min-max
                scale only fit_config[0] is used; for standard scale it is
                read as [mean, scale].

    Returns
    ----------
    transform_data: data_instance, data after transform. The input data is
                    returned untouched when fit_config is empty.
    """
    LOGGER.info("Start scale data transform ...")

    if len(fit_config) == 0:
        LOGGER.warning("length fit_config is 0, can not do transform, do nothing and return")
        # Actually return here: both scaler branches index into fit_config
        # and would raise IndexError on an empty list despite the warning.
        return data

    if self.scale_param.method == consts.MINMAXSCALE:
        min_max_scaler = MinMaxScaler()
        data = min_max_scaler.transform(data, fit_config[0])
    elif self.scale_param.method == consts.STANDARDSCALE:
        standard_scaler = StandardScaler()
        data = standard_scaler.transform(data, mean=fit_config[0], scale=fit_config[1])
    else:
        LOGGER.info("DataTransform method is {}, do nothing and return!".format(self.scale_param.method))

    LOGGER.info("End transform data ...")

    return data
def fit(self, data):
    """
    Apply scale for input data

    Parameters
    ----------
    data: data_instance, input data

    Returns
    ----------
    data: data_instance, data after scale
    scale_value_results: list, the fit results information of scale
                         (the value stored on self.cols_scale_res)
    """
    LOGGER.info("Start scale data fit ...")

    param = self.scale_param
    self.header = data.schema.get('header')

    if param.method == consts.MINMAXSCALE:
        scaler = MinMaxScaler(mode=param.mode, area=param.area,
                              scale_column_idx=param.scale_column_idx,
                              feat_upper=param.feat_upper, feat_lower=param.feat_lower,
                              out_upper=param.out_upper, out_lower=param.out_lower)
        data, fit_res = scaler.fit(data)
        self.cols_scale_res = fit_res
    elif param.method == consts.STANDARDSCALE:
        scaler = StandardScaler(area=param.area,
                                scale_column_idx=param.scale_column_idx,
                                with_mean=param.with_mean, with_std=param.with_std)
        data, fit_res = scaler.fit(data)
        # The standard scaler's fit result is read positionally:
        # [0] mean, [1] std, [2] scaled column indices.
        self.mean = fit_res[0]
        self.std = fit_res[1]
        self.std_scale_column_idx = fit_res[2]
        self.cols_scale_res = fit_res
    else:
        # NOTE(review): self.cols_scale_res is not assigned on this path;
        # confirm it is initialised elsewhere (e.g. in __init__).
        LOGGER.info("Scale method is {}, do nothing and return!".format(param.method))

    # Put the header captured before fitting back onto the returned data.
    data.schema['header'] = self.header
    LOGGER.info("End fit data ...")

    return data, self.cols_scale_res