def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Assemble the data preparation pipeline.

    The returned pipeline chains a fixed sequence of filters and transformers that clean and
    prepare the data. Advanced users may prefer to build their own custom pipeline instead.

    Args:
        model_type: Forwarded to ``DataFrameConvertTargetToBinary``.
        predicted_column: Name of the target column. It is forwarded to the target conversion
            steps and is excluded when dummy variables are created.
        grain_column: Forwarded to ``DataframeColumnRemover``.
        impute: Forwarded to ``DataFrameImputer``.
        verbose: Forwarded to ``DataFrameImputer``.

    Returns:
        A ``Pipeline`` holding the seven named steps in execution order.
    """
    # Note: FeatureUnions would be a more elegant way to express this, but only _if_ the later
    # pipelines do not take pandas dataframes as input, because FeatureUnion intrinsically
    # converts its outputs to numpy arrays.
    steps = []

    steps.append(('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()))
    steps.append(('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)))

    # Perform one of two basic imputation methods
    # TODO we need to think about making this optional to solve the problem of rare and very
    # predictive values
    imputer = hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)
    steps.append(('imputation', imputer))

    steps.append(('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)))

    to_binary = hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)
    steps.append(('convert_target_to_binary', to_binary))

    to_numeric = hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)
    steps.append(('prediction_to_numeric', to_numeric))

    dummies = hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])
    steps.append(('create_dummy_variables', dummies))

    return Pipeline(steps)
def test_imputation_false_returns_unmodified(self):
    """With ``impute=False`` the imputer must hand the frame back unchanged, nulls included."""
    rows = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2], ['a', None, None]]
    df = pd.DataFrame(rows)
    expected = pd.DataFrame(rows)

    imputer = transformers.DataFrameImputer(impute=False)
    result = imputer.fit_transform(df)

    self.assertEqual(len(result), 4)

    # Assert column types remain identical
    dtypes_match = list(result.dtypes) == list(df.dtypes)
    self.assertTrue(dtypes_match)

    self.assertTrue(expected.equals(result))
def test_imputation_removes_nones(self):
    """Default imputation must leave no null values in the resulting frame."""
    observed = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2]]
    df = pd.DataFrame(observed + [[None, None, None]])
    # The all-null row is expected to become the most frequent label ('b') and the column
    # means of the observed rows (4/3 and 5/3).
    expected = pd.DataFrame(observed + [['b', 4 / 3.0, 5 / 3.0]])

    imputer = transformers.DataFrameImputer()
    result = imputer.fit_transform(df)

    self.assertEqual(len(result), 4)

    has_nulls = result.isnull().values.any()
    self.assertFalse(has_nulls)

    # Assert column types remain identical
    dtypes_match = list(result.dtypes) == list(df.dtypes)
    self.assertTrue(dtypes_match)

    self.assertTrue(expected.equals(result))
def test_imputation_for_mean_of_numeric_and_mode_for_categorical(self):
    """Default imputation fills numeric nulls with the mean and categorical nulls with the mode."""
    observed = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2]]
    df = pd.DataFrame(observed + [[None, None, None]])

    imputer = transformers.DataFrameImputer()
    result = imputer.fit_transform(df)

    # 'b' is the most frequent label; 4/3 and 5/3 are the means of the observed numeric columns.
    expected = pd.DataFrame(observed + [['b', 4. / 3, 5. / 3]])

    self.assertEqual(len(result), 4)

    # Assert imputed values
    values_match = expected.equals(result)
    self.assertTrue(values_match)

    # Assert column types remain identical
    dtypes_match = list(result.dtypes) == list(df.dtypes)
    self.assertTrue(dtypes_match)
def test_imputeStrategy_None_impute_for_None(self):
    """``imputeStrategy=None`` with ``impute=True`` must fill the all-None row with mean/mode."""
    observed = [
        ['a', 1, 2],
        ['b', 1, 1],
        ['b', 4, 1],
        ['a', 2, 8],
        ['b', 2, 6],
        ['b', 1, 2],
        ['a', 6, 2],
        ['b', 3, 1],
        ['b', 2, 7],
    ]
    df = pd.DataFrame(observed + [[None, None, None]])
    # 'b' is the most frequent label; 22/9 and 30/9 are the means of the nine observed rows.
    expected = pd.DataFrame(observed + [['b', 22 / 9.0, 30 / 9.0]])

    imputer = transformers.DataFrameImputer(impute=True, imputeStrategy=None)
    result = imputer.fit_transform(df)

    self.assertEqual(len(result), 10)

    # Assert column types remain identical
    dtypes_match = list(result.dtypes) == list(df.dtypes)
    self.assertTrue(dtypes_match)

    self.assertTrue(expected.equals(result))
def test_imputeStrategy_RandomForest_impute_for_NaN(self):
    """The 'RandomForest' strategy must fill an all-NaN row with the pinned predicted values.

    The missing row is built with ``np.nan``. The ``np.NaN`` alias used previously was removed
    in NumPy 2.0 and raises ``AttributeError`` there, which would fail the test before the
    imputer is ever exercised.
    """
    observed = [
        ['a', 1, 2],
        ['b', 1, 1],
        ['b', 4, 1],
        ['a', 2, 8],
        ['b', 2, 6],
        ['b', 1, 2],
        ['a', 6, 2],
        ['b', 3, 1],
        ['b', 2, 7],
    ]
    df = pd.DataFrame(observed + [[np.nan, np.nan, np.nan]])
    expected = pd.DataFrame(observed + [['b', 1.567, 6.032]])

    imputer = transformers.DataFrameImputer(impute=True, imputeStrategy='RandomForest')
    result = imputer.fit_transform(df)
    # Round to three decimals so the comparison matches the precision of the expected literals.
    result = result.round(3)

    self.assertEqual(len(result), 10)
    # Assert column types remain identical
    self.assertTrue(list(result.dtypes) == list(df.dtypes))
    self.assertTrue(expected.equals(result))
def test_imputation_false_and_imputeStrategy_RandomForest_returns_unmodified(self):
    """``impute=False`` must leave the frame untouched even when a strategy is supplied."""
    rows = [
        ['a', 1, 2],
        ['b', 1, 1],
        ['b', 4, 1],
        ['a', 2, 8],
        ['b', 2, 6],
        ['b', 1, 2],
        ['a', 6, 2],
        ['b', 3, 1],
        ['b', 2, 7],
        [None, None, None],
    ]
    df = pd.DataFrame(rows)
    expected = pd.DataFrame(rows)

    imputer = transformers.DataFrameImputer(impute=False, imputeStrategy='RandomForest')
    result = imputer.fit_transform(df)

    self.assertEqual(len(result), 10)

    # Assert column types remain identical
    dtypes_match = list(result.dtypes) == list(df.dtypes)
    self.assertTrue(dtypes_match)

    self.assertTrue(expected.equals(result))