예제 #1
0
    def test_cleanutil_replaceinterpol(self):
        """Interpolating every column must leave no nulls in either split.

        Builds a 12-row frame (a 6-row pattern repeated twice) in which half
        of the rows are entirely NaN, runs ``replace_missing_interpolate`` on
        all three columns with ``limit_direction="both"``, and checks
        ``x_train`` and ``x_test`` independently.
        """
        nan_row = [np.nan, np.nan, np.nan]
        pattern = [
            [1, 8, 1],
            [0, 9394, 2],
            nan_row,
            [2, 4, 3],
            nan_row,
            nan_row,
        ]

        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(pattern * 2, columns=columns)

        clean = Clean(x_train=data, test_split_percentage=0.5)
        clean.replace_missing_interpolate("col1",
                                          "col2",
                                          "col3",
                                          limit_direction="both")

        # Each split is asserted on its own: combining the two null checks
        # with ``and`` would let the test pass while one split still had NaNs.
        self.assertFalse(clean.x_train.isnull().values.any())
        self.assertFalse(clean.x_test.isnull().values.any())
예제 #2
0
    def test_cleanutil_removecolumns(self):
        """After ``remove_columns(0.5)`` only ``col1`` should remain in x_train."""
        frame = pd.DataFrame(
            [[1, 0, 0], [0, None, None], [None, None, None]],
            columns=["col1", "col2", "col3"],
        )

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.remove_columns(0.5)

        remaining = cleaner.x_train.columns.tolist()
        self.assertListEqual(remaining, ["col1"])
예제 #3
0
    def test_cleanutil_removerows(self):
        """After ``remove_rows(0.5)`` only the fully populated row should remain."""
        raw = np.array([(1, 0, 0), (0, None, None), (None, None, None)])
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.remove_rows(0.5)

        expected = np.array([(1, 0, 0)]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #4
0
    def test_cleanutil_removeduplicatecolumns(self):
        """``col3`` duplicates ``col1``; two columns are expected afterwards."""
        frame = pd.DataFrame(
            [[1, 0, 1], [0, 2, 0], [1, 0, 1]],
            columns=["col1", "col2", "col3"],
        )

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.remove_duplicate_columns()

        remaining_values = cleaner.x_train.values.tolist()
        self.assertListEqual(remaining_values, [[1, 0], [0, 2], [1, 0]])
예제 #5
0
    def test_cleannumeric_median(self):
        """``replace_missing_median()`` should fill each NaN with the expected per-column value."""
        rows = [
            [1, 0, 2],
            [0, np.nan, 1],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_median()

        expected = [[1, 0, 2], [0, 0, 1], [0.5, 0, 1.5]]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #6
0
    def test_cleancategorical_removerow(self):
        """``replace_missing_remove_row("col1", "col2")`` should leave only the complete row."""
        rows = [
            [1, 0, 2],
            [1, np.nan, 1],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_remove_row("col1", "col2")

        expected = np.array([(1, 0, 2)]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #7
0
    def test_lineplot(self):
        """Smoke test: ``visualize_lineplot`` runs without raising on seeded data."""
        np.random.seed(42)
        # Draw order matters for reproducibility: "Google" first, then "Apple".
        google = np.random.randn(1000) + 0.2
        apple = np.random.randn(1000) + 0.17
        dates = pd.date_range("1/1/2000", periods=1000)
        frame = pd.DataFrame({"Google": google, "Apple": apple, "date": dates})

        cleaner = Clean(x_train=frame, split=False)
        cleaner.visualize_lineplot("date", "Google", "Apple", show_figure=False)

        # Reaching this point without an exception is the pass condition.
        self.assertTrue(True)
예제 #8
0
    def test_write_data_tocsv(self):
        """Smoke test: ``to_csv`` writes ``test_write_data_train.csv``, which is then deleted."""
        np.random.seed(42)
        # Draw order matters for reproducibility: "Google" first, then "Apple".
        google = np.random.randn(1000) + 0.2
        apple = np.random.randn(1000) + 0.17
        dates = pd.date_range("1/1/2000", periods=1000)
        frame = pd.DataFrame({"Google": google, "Apple": apple, "date": dates})

        cleaner = Clean(x_train=frame, split=False)
        cleaner.to_csv("test_write_data")

        # os.remove raises if the file was not produced, so this doubles as
        # the existence check.
        os.remove("test_write_data_train.csv")

        self.assertTrue(True)
예제 #9
0
    def test_cleancategorical_replacemissingnewcategory_noparams(self):
        """With no arguments, missing values should become -1 / "Unknown"."""
        rows = [
            [1.0, "Green", 2],
            [1.0, "Other", 1],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_new_category()

        expected = [[1, "Green", 2], [1, "Other", 1], [-1, "Unknown", -1]]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #10
0
    def test_cleannumeric_constant(self):
        """Only ``col1`` and ``col3`` should be filled with 10.5; ``col2`` keeps its None values."""
        raw = np.array([(1, 0, 2), (1, None, 1), (None, None, None)])
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_constant("col1", "col3", constant=10.5)

        expected = np.array([(1, 0, 2), (1, None, 1),
                             (10.5, None, 10.5)]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #11
0
    def test_cleancategorical_replacemissingnewcategory_dict(self):
        """A ``col_mapping`` dict should supply the fill value for each column."""
        rows = [
            [1, "Green", 2],
            [1, np.nan, 1],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
        fill_values = {"col1": 2, "col2": "Blue", "col3": 4}

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_new_category(col_mapping=fill_values)

        expected = [
            [1.0, "Green", 2.0],
            [1.0, "Blue", 1.0],
            [2.0, "Blue", 4.0],
        ]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #12
0
    def test_cleannumeric_mostfrequent(self):
        """``replace_missing_mostcommon()`` should fill NaNs with the expected per-column value."""
        raw = np.array([(1, 0, 2), (1, np.nan, 1), (np.nan, np.nan, 1)])
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_mostcommon()

        expected = np.array([
            (1.0, 0.0, 2.0),
            (1.0, 0.0, 1.0),
            (1.0, 0.0, 1.0),
        ]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #13
0
    def test_dropcolumns_keep(self):
        """``drop(keep=["col2"])`` must leave only ``col2`` in both splits.

        Also checks that ``drop`` returns a ``Clean`` instance.
        """
        int_missing_data = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(int_missing_data, columns=columns)

        clean = Clean(x_train=data,
                      report_name="test",
                      test_split_percentage=0.5)
        clean_inst = clean.drop(keep=["col2"],
                                reason="Columns were unimportant.")

        self.assertIsInstance(clean_inst, Clean)
        # Compare as plain lists: ``Index == list`` is an elementwise
        # comparison whose array result is not a reliable boolean (and raises
        # on a length mismatch instead of failing the assertion).
        self.assertListEqual(list(clean_inst.x_train.columns), ["col2"])
        self.assertListEqual(list(clean_inst.x_test.columns), ["col2"])
예제 #14
0
    def test_cleanutil_replacebfill(self):
        """Smoke test: ``replace_missing_backfill`` runs on all three columns without raising."""
        nan_row = [np.nan, np.nan, np.nan]
        rows = [
            [1, 8, 1],
            [0, 9394, 2],
            nan_row,
            [2, 4, 3],
            nan_row,
            nan_row,
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame, test_split_percentage=0.5)
        cleaner.replace_missing_backfill("col1", "col2", "col3")

        # No output is inspected; completing the call is the pass condition.
        self.assertTrue(True)
예제 #15
0
    def test_cleanutil_splitdata(self):
        """Constructing ``Clean`` with defaults on 5 rows should leave 4 rows in x_train."""
        column_names = ["col1", "col2", "col3", "col4", "col5"]
        frame = pd.DataFrame(np.zeros((5, 5)), columns=column_names)

        cleaner = Clean(x_train=frame)

        train_rows = cleaner.x_train.shape[0]
        self.assertEqual(train_rows, 4)
예제 #16
0
    def test_search_notequal(self):
        """Smoke test: ``search`` with ``not_equal=True, replace=True`` runs without raising."""
        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        })
        options = dict(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        cleaner = Clean(**options)
        cleaner.search("A", not_equal=True, replace=True)

        # No output is inspected; completing the call is the pass condition.
        self.assertTrue(True)
예제 #17
0
    def test_settargetmapping(self):
        """A value assigned to ``target_mapping`` should be read back unchanged."""
        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        })
        options = dict(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        cleaner = Clean(**options)
        cleaner.target_mapping = "a"

        self.assertEqual(cleaner.target_mapping, "a")
예제 #18
0
    def test_encodelabels(self):
        """``encode_target()`` on column ``D`` should yield the mapping {0: "A", 1: "B"}."""
        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["B", "A", "B", "B"],
        })
        options = dict(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="D",
            report_name="test",
            test_split_percentage=0.5,
        )

        cleaner = Clean(**options)
        cleaner.encode_target()

        expected_mapping = {0: "A", 1: "B"}
        self.assertDictEqual(cleaner.target_mapping, expected_mapping)
예제 #19
0
    def test_report_cleaning_technique(self):
        """The report written after ``remove_columns(0.5)`` should mention ``col2`` and ``col3``."""
        raw = np.array([(1, 0, 0), (0, None, None), (None, None, None)])
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame,
                        test_split_percentage=0.5,
                        split=False,
                        report_name="test")
        cleaner.remove_columns(0.5)

        report = cleaner._data_properties.report
        with open(report.filename) as handle:
            report_text = handle.read()

        # Delete the report before asserting so a failure leaves no file behind.
        os.remove(report.filename)

        mentions_both = "col2" in report_text and "col3" in report_text
        self.assertTrue(mentions_both)
예제 #20
0
    def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(
            self):
        """With ``list_of_cols`` and ``new_category=0`` only the listed columns are filled."""
        raw = np.array([(1, "Green", 2), (1, "Other", 1), (None, None, None)])
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])
        targets = ["col1", "col3"]

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_new_category(list_of_cols=targets,
                                             new_category=0)

        # ``col2`` is not listed, so its None is expected to remain.
        expected = np.array([(1, "Green", 2), (1, "Other", 1),
                             (0, None, 0)]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #21
0
    def test_cleanutil_replaceindicator_removecol(self):
        """With ``keep_col=False`` both splits must still have three columns.

        Runs ``replace_missing_indicator`` on all three columns of a frame in
        which half of the rows are entirely NaN, then checks the column count
        of ``x_train`` and ``x_test``.
        """
        nan_row = [np.nan, np.nan, np.nan]
        int_missing_data = [
            [1, 8, 1],
            [0, 9394, 2],
            nan_row,
            [2, 4, 3],
            nan_row,
            nan_row,
        ]

        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(int_missing_data, columns=columns)

        clean = Clean(x_train=data, test_split_percentage=0.5)
        clean.replace_missing_indicator("col1", "col2", "col3", keep_col=False)

        # Assert the measured shapes rather than a constant, so the test can
        # actually fail when the column count is wrong.
        self.assertEqual(clean.x_train.shape[1], 3)
        self.assertEqual(clean.x_test.shape[1], 3)
예제 #22
0
    def test_report_cleaning_new_category(self):
        """The report written after ``replace_missing_new_category()`` should mention all three columns."""
        rows = [
            [1.0, "Green", 2],
            [1.0, "Other", 1],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        cleaner = Clean(x_train=frame,
                        test_split_percentage=0.5,
                        split=False,
                        report_name="test")
        cleaner.replace_missing_new_category()

        report = cleaner._data_properties.report
        with open(report.filename) as handle:
            report_text = handle.read()

        # Delete the report before asserting so a failure leaves no file behind.
        os.remove(report.filename)

        mentions_all = ("col1" in report_text and "col2" in report_text
                        and "col3" in report_text)
        self.assertTrue(mentions_all)
예제 #23
0
    def test_cleancategorical_replacemissingnewcategory_list_constantisnone(
            self):
        """With ``list_of_cols`` and no ``new_category``, only the listed columns are filled."""
        rows = [
            [1.0, "Green", 2],
            [1.0, "Other", 1],
            [np.nan, None, np.nan],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])
        targets = ["col1", "col2"]

        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)
        cleaner.replace_missing_new_category(list_of_cols=targets)

        # NaN never compares equal to itself, so the untouched NaN in col3 is
        # swapped for a marker string before the list comparison.
        cleaner._data_properties.x_train = cleaner.x_train.fillna(
            "NaN was here")

        expected = [
            [1, "Green", 2.0],
            [1, "Other", 1.0],
            [-1, "Unknown", "NaN was here"],
        ]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected)
예제 #24
0
    def test_dropcolumns_complex(self):
        """Combining a positional column, ``keep`` and ``regexp`` should leave ``col2`` and ``py``."""
        rows = [
            [1, 0, 0, 3],
            [0, 2, 3, 4],
            [0, 3, 4, 4],
            [1, 2, 3, 6],
        ]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "py"])
        options = dict(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        cleaner = Clean(**options)
        cleaner.drop("col1",
                     keep=["col2"],
                     regexp=r"col*",
                     reason="Columns were unimportant.")

        expected = ["col2", "py"]
        both_match = (list(cleaner.x_train.columns) == expected
                      and list(cleaner.x_test.columns) == expected)
        self.assertTrue(both_match)