def test_all_cats_not_significant(self):
    """Check the processed column when fitting with a 0.0001 p-value cut.

    Three labels (A, B, C) with four rows each are fitted with
    ``category_size_threshold=0`` and ``p_value_threshold=0.0001``.
    The transformed frame is expected to contain the two input columns
    plus ``categorical_var_processed`` holding the very same labels.
    """
    source_col = 'categorical_var'
    target_column_name = 'target'
    processed_col = 'categorical_var_processed'

    def make_labels():
        # Fresh list on every call so the two frames never share objects.
        return ['A'] * 4 + ['B'] * 4 + ['C'] * 4

    def make_targets():
        # A is all ones, B is all zeros, C alternates 1/0.
        return [1] * 4 + [0] * 4 + [1, 0, 1, 0]

    # Input handed to the processor.
    data = pd.DataFrame(
        {source_col: make_labels(), target_column_name: make_targets()},
        columns=[source_col, target_column_name],
    )

    # Frame the processor is expected to return.
    expected = pd.DataFrame(
        {
            source_col: make_labels(),
            target_column_name: make_targets(),
            processed_col: make_labels(),
        },
        columns=[source_col, target_column_name, processed_col],
    )

    discrete_vars = [source_col]
    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        p_value_threshold=0.0001,
    )
    processor.fit(data, discrete_vars, target_column_name)
    actual = processor.transform(data, discrete_vars)

    pd.testing.assert_frame_equal(actual, expected)
def test_force_category(self):
    """Check the processed column when label 'C' is listed as forced.

    Three labels (A, B, C) with six rows each are fitted with
    ``category_size_threshold=0``, ``p_value_threshold=0.05`` and
    ``forced_categories={'categorical_var': ['C']}``. The transformed
    frame is expected to contain the two input columns plus
    ``categorical_var_processed`` holding the same labels, stored with
    the pandas ``category`` dtype.
    """
    source_col = 'categorical_var'
    target_column_name = 'target'
    processed_col = 'categorical_var_processed'

    def make_labels():
        # Fresh list on every call so the two frames never share objects.
        return ['A'] * 6 + ['B'] * 6 + ['C'] * 6

    def make_targets():
        # A is all ones, B is all zeros, C alternates 1/0.
        return [1] * 6 + [0] * 6 + [1, 0, 1, 0, 1, 0]

    # Input handed to the processor.
    data = pd.DataFrame(
        {source_col: make_labels(), target_column_name: make_targets()},
        columns=[source_col, target_column_name],
    )

    # Frame the processor is expected to return.
    expected = pd.DataFrame(
        {
            source_col: make_labels(),
            target_column_name: make_targets(),
            processed_col: make_labels(),
        },
        columns=[source_col, target_column_name, processed_col],
    )
    # The expected processed column is compared as a categorical.
    expected[processed_col] = expected[processed_col].astype("category")

    discrete_vars = [source_col]
    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        forced_categories={source_col: ['C']},
        p_value_threshold=0.05,
    )
    processor.fit(data, discrete_vars, target_column_name)
    actual = processor.transform(data, discrete_vars)

    pd.testing.assert_frame_equal(actual, expected)