def test_cleanutil_replaceinterpol(self):
    int_missing_data = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5)
    clean.replace_missing_interpolate("col1", "col2", "col3", limit_direction="both")

    # Fail if any missing values remain in either split.
    validate = np.any(clean.x_train.isnull()) or np.any(clean.x_test.isnull())

    self.assertFalse(validate)
def test_cleanutil_removecolumns(self):
    int_missing_data = [[1, 0, 0], [0, None, None], [None, None, None]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.remove_columns(0.5)
    validate = clean.x_train.columns.tolist()

    self.assertListEqual(validate, ["col1"])
def test_cleanutil_removerows(self):
    int_missing_data = np.array([(1, 0, 0), (0, None, None), (None, None, None)])
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.remove_rows(0.5)
    validate = clean.x_train.values.tolist()

    self.assertListEqual(validate, np.array([(1, 0, 0)]).tolist())
def test_cleanutil_removeduplicatecolumns(self):
    data = [[1, 0, 1], [0, 2, 0], [1, 0, 1]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.remove_duplicate_columns()
    validate = clean.x_train.values.tolist()

    self.assertListEqual(validate, [[1, 0], [0, 2], [1, 0]])
def test_cleannumeric_median(self):
    int_missing_data = [[1, 0, 2], [0, np.nan, 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_median()
    validate = clean.x_train.values.tolist()

    self.assertListEqual(validate, [[1, 0, 2], [0, 0, 1], [0.5, 0, 1.5]])
def test_cleancategorical_removerow(self):
    int_missing_data = [[1, 0, 2], [1, np.nan, 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_remove_row("col1", "col2")
    validate = clean.x_train.values.tolist()

    self.assertListEqual(validate, np.array([(1, 0, 2)]).tolist())
def test_lineplot(self):
    np.random.seed(42)
    df = pd.DataFrame(
        {
            "Google": np.random.randn(1000) + 0.2,
            "Apple": np.random.randn(1000) + 0.17,
            "date": pd.date_range("1/1/2000", periods=1000),
        }
    )

    clean = Clean(x_train=df, split=False)
    clean.visualize_lineplot("date", "Google", "Apple", show_figure=False)

    self.assertTrue(True)
def test_write_data_tocsv(self):
    np.random.seed(42)
    df = pd.DataFrame(
        {
            "Google": np.random.randn(1000) + 0.2,
            "Apple": np.random.randn(1000) + 0.17,
            "date": pd.date_range("1/1/2000", periods=1000),
        }
    )

    clean = Clean(x_train=df, split=False)
    clean.to_csv("test_write_data")
    os.remove("test_write_data_train.csv")

    self.assertTrue(True)
def test_cleancategorical_replacemissingnewcategory_noparams(self):
    missing_data = [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_new_category()
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate, [[1, "Green", 2], [1, "Other", 1], [-1, "Unknown", -1]]
    )
def test_cleannumeric_constant(self):
    int_missing_data = np.array([(1, 0, 2), (1, None, 1), (None, None, None)])
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_constant("col1", "col3", constant=10.5)
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate, np.array([(1, 0, 2), (1, None, 1), (10.5, None, 10.5)]).tolist()
    )
def test_cleancategorical_replacemissingnewcategory_dict(self):
    missing_data = [[1, "Green", 2], [1, np.nan, 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)
    category_dict_mapping = {"col1": 2, "col2": "Blue", "col3": 4}

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_new_category(col_mapping=category_dict_mapping)
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate, [[1.0, "Green", 2.0], [1.0, "Blue", 1.0], [2.0, "Blue", 4.0]]
    )
def test_cleannumeric_mostfrequent(self):
    int_missing_data = np.array([(1, 0, 2), (1, np.nan, 1), (np.nan, np.nan, 1)])
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_mostcommon()
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate,
        np.array([(1.0, 0.0, 2.0), (1.0, 0.0, 1.0), (1.0, 0.0, 1.0)]).tolist(),
    )
def test_dropcolumns_keep(self):
    int_missing_data = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, report_name="test", test_split_percentage=0.5)
    clean_inst = clean.drop(keep=["col2"], reason="Columns were unimportant.")

    validate = (
        clean_inst.x_train.columns == ["col2"]
        and clean_inst.x_test.columns == ["col2"]
        and isinstance(clean_inst, Clean)
    )

    self.assertTrue(validate)
def test_cleanutil_replacebfill(self):
    int_missing_data = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5)
    clean.replace_missing_backfill("col1", "col2", "col3")

    self.assertTrue(True)
def test_cleanutil_splitdata(self):
    data = np.zeros((5, 5))
    columns = ["col1", "col2", "col3", "col4", "col5"]
    dataset = pd.DataFrame(data, columns=columns)

    clean = Clean(x_train=dataset)

    self.assertEqual(clean.x_train.shape[0], 4)
def test_search_notequal(self):
    data = pd.DataFrame(
        {
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        }
    )

    clean = Clean(
        x_train=data,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.search("A", not_equal=True, replace=True)

    self.assertTrue(True)
def test_settargetmapping(self):
    data = pd.DataFrame(
        {
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        }
    )

    clean = Clean(
        x_train=data,
        x_test=None,
        split=False,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.target_mapping = "a"

    self.assertEqual(clean.target_mapping, "a")
def test_encodelabels(self):
    data = pd.DataFrame(
        {
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["B", "A", "B", "B"],
        }
    )

    clean = Clean(
        x_train=data,
        x_test=None,
        split=False,
        target_field="D",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.encode_target()

    self.assertDictEqual(clean.target_mapping, {0: "A", 1: "B"})
def test_report_cleaning_technique(self):
    int_missing_data = np.array([(1, 0, 0), (0, None, None), (None, None, None)])
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(
        x_train=data, test_split_percentage=0.5, split=False, report_name="test"
    )
    clean.remove_columns(0.5)

    with open(clean._data_properties.report.filename) as f:
        content = f.read()

    validate = "col2" in content and "col3" in content

    os.remove(clean._data_properties.report.filename)

    self.assertTrue(validate)
def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(self):
    missing_data = np.array([(1, "Green", 2), (1, "Other", 1), (None, None, None)])
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)
    list_col = ["col1", "col3"]

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_new_category(list_of_cols=list_col, new_category=0)
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate,
        np.array([(1, "Green", 2), (1, "Other", 1), (0, None, 0)]).tolist(),
    )
def test_cleanutil_replaceindicator_removecol(self):
    int_missing_data = [
        [1, 8, 1],
        [0, 9394, 2],
        [np.nan, np.nan, np.nan],
        [2, 4, 3],
        [np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(x_train=data, test_split_percentage=0.5)
    clean.replace_missing_indicator("col1", "col2", "col3", keep_col=False)

    # With keep_col=False the original columns are dropped, so each split
    # should contain only the three indicator columns.
    validate = (clean.x_train.shape[1] == 3) and (clean.x_test.shape[1] == 3)

    self.assertTrue(validate)
def test_report_cleaning_new_category(self):
    missing_data = [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, np.nan, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)

    clean = Clean(
        x_train=data, test_split_percentage=0.5, split=False, report_name="test"
    )
    clean.replace_missing_new_category()

    with open(clean._data_properties.report.filename) as f:
        content = f.read()

    validate = "col1" in content and "col2" in content and "col3" in content

    os.remove(clean._data_properties.report.filename)

    self.assertTrue(validate)
def test_cleancategorical_replacemissingnewcategory_list_constantisnone(self):
    missing_data = [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, None, np.nan]]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)
    list_col = ["col1", "col2"]

    clean = Clean(x_train=data, test_split_percentage=0.5, split=False)
    clean.replace_missing_new_category(list_of_cols=list_col)

    # Replace the remaining NaNs with a sentinel string before validating:
    # assertListEqual compares with == and NaN != NaN, so raw NaNs would
    # never compare equal.
    clean._data_properties.x_train = clean.x_train.fillna("NaN was here")
    validate = clean.x_train.values.tolist()

    self.assertListEqual(
        validate,
        [[1, "Green", 2.0], [1, "Other", 1.0], [-1, "Unknown", "NaN was here"]],
    )
def test_dropcolumns_complex(self):
    int_missing_data = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4], [1, 2, 3, 6]]
    columns = ["col1", "col2", "col3", "py"]
    data = pd.DataFrame(int_missing_data, columns=columns)

    clean = Clean(
        x_train=data,
        x_test=None,
        split=True,
        target_field="",
        report_name="test",
        test_split_percentage=0.5,
    )
    clean.drop(
        "col1", keep=["col2"], regexp=r"col*", reason="Columns were unimportant."
    )

    validate = list(clean.x_train.columns) == ["col2", "py"] and list(
        clean.x_test.columns
    ) == ["col2", "py"]

    self.assertTrue(validate)