def test_column_transformer_fit_transform_should_support_multiple_tuples():
    """Fit-transforming with two (column, step) tuples should both process and fit
    each column group independently."""
    # Given: a 2D input where columns 0-1 and column 2 each get their own MultiplyBy2.
    test_case = ColumnChooserTestCase(
        data_inputs=np.array([[1, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23]]),
        expected_outputs=np.array([[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23]]),
        expected_processed_outputs=np.array([[2, 2, 4], [20, 22, 24], [40, 42, 44]]),
        column_transformer_tuple_list=[(slice(0, 2), MultiplyBy2()), (2, MultiplyBy2())],
        n_dimension=3)
    inputs = test_case.data_inputs
    transformer = ColumnTransformer(test_case.column_transformer_tuple_list)

    # When
    transformer, processed = transformer.fit_transform(inputs, test_case.expected_outputs)

    # Then: outputs are the selected columns doubled.
    assert np.array_equal(test_case.expected_processed_outputs, processed)

    # The single-index step saw only column 2, paired with the full expected outputs.
    fitted = transformer['2_MultiplyBy2']['MultiplyBy2'].fitted_data
    expected_fitted = [(
        [[2], [12], [22]],
        [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23]])]
    assert_data_fitted_properly(fitted, expected_fitted)

    # The slice step saw columns 0-1, paired with the full expected outputs.
    fitted = transformer['slice(0, 2, None)_MultiplyBy2']['MultiplyBy2'].fitted_data
    expected_fitted = [(
        [[1, 1], [10, 11], [20, 21]],
        [[0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23]])]
    assert_data_fitted_properly(fitted, expected_fitted)
def test_column_transformer_transform_should_support_multiple_tuples():
    """Transforming 3D data with two (column, step) tuples should double each
    selected column group."""
    # Given: 3D inputs (one extra nesting level), slice(0, 2) and index 2 each
    # routed to a MultiplyBy2 step.
    test_case = ColumnChooserTestCase(
        data_inputs=np.array([[[1, 1, 2, 3]], [[10, 11, 12, 13]], [[20, 21, 22, 23]]]),
        expected_outputs=np.array([[[0, 1, 2, 3]], [[10, 11, 12, 13]], [[20, 21, 22, 23]]]),
        expected_processed_outputs=np.array([[[2, 2, 4]], [[20, 22, 24]], [[40, 42, 44]]]),
        column_transformer_tuple_list=[(slice(0, 2), MultiplyBy2()), (2, MultiplyBy2())],
        n_dimension=3)
    pipeline = ColumnTransformer(test_case.column_transformer_tuple_list, test_case.n_dimension)

    # When
    processed = pipeline.transform(test_case.data_inputs)

    # Then
    assert np.array_equal(test_case.expected_processed_outputs, processed)
def test_column_transformer_transform_should_support_indexes(test_case: ColumnChooserTestCase):
    """For every parametrized test case, transform() should yield the expected
    processed outputs for the chosen column indexes."""
    ct = ColumnTransformer(test_case.column_transformer_tuple_list, test_case.n_dimension)

    result = ct.transform(test_case.data_inputs)

    assert np.array_equal(result, test_case.expected_processed_outputs)
def test_column_transformer_fit_should_support_indexes(test_case: ColumnChooserTestCase):
    """For every parametrized test case, fit() should record the selected columns
    (plus expected outputs) on the wrapped MultiplyBy2 step."""
    pipeline = ColumnTransformer(test_case.column_transformer_tuple_list, test_case.n_dimension)
    pipeline = pipeline.fit(test_case.data_inputs, test_case.expected_outputs)

    fitted = pipeline[test_case.expected_step_key]['MultiplyBy2'].fitted_data

    assert_data_fitted_properly(fitted, test_case.expected_fitted_data)
def __init__(self, input_columns: ColumnChooserTupleList, output_columns: ColumnChooserTupleList, n_dimension: int = 3):
    """Build a two-stage pipeline: first transform the expected outputs (taken
    from the data inputs via OutputTransformerWrapper), then transform the inputs.

    :param input_columns: (column selector, step) tuples applied to the data inputs.
    :param output_columns: (column selector, step) tuples applied to the expected outputs.
    :param n_dimension: dimensionality hint forwarded to both ColumnTransformers.
    """
    output_stage = OutputTransformerWrapper(
        ColumnTransformer(output_columns, n_dimension),
        from_data_inputs=True)
    input_stage = ColumnTransformer(input_columns, n_dimension)
    super().__init__([output_stage, input_stage])
def test_column_transformer_fit_transform_should_support_indexes(
        test_case: ColumnChooserTestCase):
    """For every parametrized test case, fit_transform() should both return the
    expected processed outputs and fit the wrapped step on the selected columns."""
    inputs = test_case.data_inputs
    targets = test_case.expected_outputs
    ct = ColumnTransformer(test_case.column_transformer_tuple_list)

    ct, processed = ct.fit_transform(inputs, targets)

    assert np.array_equal(processed, test_case.expected_processed_outputs)
    fitted = ct[test_case.expected_step_key]['MultiplyBy2'].fitted_data
    assert_data_fitted_properly(fitted, test_case.expected_fitted_data)
def _apply_different_encoders_to_columns():
    """
    One standalone LabelEncoder will be applied on the pets, and another one
    will be shared for the columns owner and location.
    """
    pipeline = ColumnTransformer(
        [
            # A different encoder will be used for column 0 with name "pets":
            (0, FlattenForEach(LabelEncoder(), then_unflatten=True)),
            # A shared encoder will be used for column 1 and 2, "owner" and "location":
            ([1, 2], FlattenForEach(LabelEncoder(), then_unflatten=True)),
        ],
        n_dimension=2)

    pipeline, predicted_output = pipeline.fit_transform(df.values)

    expected_output = np.array([
        [0, 1, 0, 2, 1, 1],
        [1, 3, 0, 1, 5, 3],
        [4, 2, 2, 4, 4, 2],
    ]).transpose()
    assert np.array_equal(predicted_output, expected_output)
] non_categorical_columns = [ i for i in X.columns if i not in categorical_columns ] categories = [ ["clear", "misty", "rain"], ["spring", "summer", "fall", "winter"], ["False", "True"], ["False", "True"], ] ordinal_encoder = OrdinalEncoder(categories=categories) gbrt_pipeline = Pipeline([ ColumnTransformer([ (categorical_columns, ordinal_encoder), (non_categorical_columns, Identity()), ], n_dimension=2), HistGradientBoostingRegressor(categorical_features=range(4), ), ]) # %% # # Lets evaluate our gradient boosting model with the mean absolute error of the # relative demand averaged accross our 5 time-based cross-validation splits: def evaluate(model, X, y, cv): class SilentMetaStep(MetaStep): """This class is needed here to disable the sklearn compatibility errors with the getters and setters.""" def __init__(self):