def setup_class(self):
    """Build the two identity mappers shared by the tests in this class."""
    # Mapper built from an explicit (columns, transformer) list.
    self._identity_mapper_list = DataMapper([
        ([0], IdentityTransformer()),
        ([1], IdentityTransformer()),
    ])
    # Mapper built from a fitted sklearn ColumnTransformer.
    transformer = ColumnTransformer([
        ("column1", IdentityTransformer(), [0]),
        ("column2", IdentityTransformer(), [1]),
    ])
    transformer.fit(np.ones((10, 5)))
    self._identity_mapper_column_transformer = DataMapper(transformer)
def test_many_to_many_support_transformations(self):
    """Check the feature map produced when many-to-many transformers are allowed."""
    transformer = ColumnTransformer([
        ("column_0_1_2_3", IdentityTransformer(), [0, 1, 2, 3]),
        ("column_4_5", OneHotEncoder(), [4, 5]),
    ])
    data = np.ones((10, 6))
    # Give columns 4 and 5 a second category so OneHotEncoder
    # doesn't complain of only one category.
    data[0, 4] = 0
    data[0, 5] = 0
    transformer.fit(data)
    mapper = DataMapper(transformer, allow_all_transformations=True)
    mapper.transform(data)
    # Expected map: the identity group spreads weight 1/4 across its four
    # inputs; each one-hot input maps to its two output columns with weight 1.
    transformed = transformer.transform(data)
    expected = np.zeros((data.shape[1], transformed.shape[1]))
    many_to_many = [[0, 1, 2, 3]] * 4
    one_to_many = [[4, 5], [6, 7]]
    for row, cols in enumerate(many_to_many):
        expected[row, cols] = 0.25
    for row, cols in enumerate(one_to_many, start=len(many_to_many)):
        expected[row, cols] = 1.0
    assert (mapper.feature_map == expected).all()
def test_many_to_many_exception_column_transformer(self):
    """An unrecognized multi-column transformer must be rejected by default.

    IdentityTransformer is not recognized by DataMapper, and it takes many
    input columns, so it is treated as a many-to-many/one map, which raises
    ValueError unless explicitly allowed.
    """
    transformer = ColumnTransformer([
        ("column_0_1", IdentityTransformer(), [0, 1]),
    ])
    transformer.fit(np.ones((2, 2)))
    # Keep only the call under test inside pytest.raises: if fit() were
    # included and raised a ValueError itself, the test would pass falsely.
    with pytest.raises(ValueError):
        DataMapper(transformer)
def test_many_to_many_mapper_nested_pipelines(self):
    """Feature map for nested pipelines splits weight 0.6 / 0.4 over two rows."""
    pipeline, data = self._get_nested_pipelines_and_data(IdentityTransformer())
    mapper = get_feature_mapper_for_pipeline(pipeline)
    mapper.transform(data)
    expected = np.zeros((2, 5))
    expected[0] = 0.6
    expected[1] = 0.4
    assert np.all(mapper.feature_map == pytest.approx(expected))
def get_transformations_many_to_many(feature_names):
    """Build a ColumnTransformer exercising many-to-many transformations.

    IdentityTransformer is our custom transformer, so it is not recognized
    as one-to-many; the pipeline below therefore exercises DataMapper's
    many-to-many feature-map path.

    :param feature_names: ordered feature names; the last two are scaled
        separately from the rest.
    :return: an unfitted ColumnTransformer over the given features.
    """
    # list(...) instead of an identity comprehension (ruff PERF402).
    transformations = [
        ("column_0_1_2_3",
         Pipeline([("scaler", StandardScaler()),
                   ("identity", IdentityTransformer())]),
         list(feature_names[:-2])),
        ("column_4_5", StandardScaler(), list(feature_names[-2:])),
    ]
    # Also cover transformations specified with pandas Index column types.
    transformations.append(
        ("pandas_index_columns", "passthrough",
         pd.Index([feature_names[0], feature_names[1]])))
    return ColumnTransformer(transformations)
def test_mixed_dtypes(self):
    """Mixing dense and sparse transformer outputs yields a sparse result."""
    mapper = DataMapper([
        ([0], IdentityTransformer()),
        ([1], SparseTransformer()),
    ])
    output = mapper.transform(np.ones((10, 2)))
    assert issparse(output)
def test_many_to_many_exception_list(self):
    """A list-based mapping over several input columns is rejected by default.

    IdentityTransformer is not recognized by DataMapper, and it takes many
    input columns, so it is treated as a many-to-many/one map and raises.
    """
    with pytest.raises(ValueError):
        DataMapper([([0, 1], IdentityTransformer())])