Exemplo n.º 1
0
    def test_report_feature_bow(self):
        """Bag-of-words extraction should log a 'representation' entry in the report."""

        sentences = ["Hi my name is pyml", "Hi name pyml"]
        frame = pd.DataFrame(sentences, columns=["text"])

        feature = Data(
            x_train=frame,
            test_split_percentage=0.5,
            split=False,
            report_name="test",
        )
        feature.bag_of_words()

        with open(feature.report.filename) as report_file:
            report_text = report_file.read()
        found = "representation" in report_text

        os.remove(feature.report.filename)

        self.assertTrue(found)
Exemplo n.º 2
0
    def test_report_preprocessing_standardize(self):
        """normalize_numeric should mention every processed column in the report."""

        raw = [[5.0, 3, 1], [2.0, 2, 1], [10.0, 1, 1]]
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        preprocess = Data(
            x_train=frame,
            test_split_percentage=0.5,
            split=False,
            report_name="test",
        )
        preprocess.normalize_numeric()

        with open(preprocess.report.filename) as report_file:
            report_text = report_file.read()
        found = all(name in report_text for name in ("col1", "col2", "col3"))

        os.remove(preprocess.report.filename)

        self.assertTrue(found)
Exemplo n.º 3
0
    def test_dropcolumns_regex(self):
        """Dropping by regex should remove the 'agent.*' columns and keep 'message'.

        Fix: the original compared a pandas ``Index`` to a list with ``==``,
        which produces an element-wise boolean array rather than a single
        bool; when the column count differs from the expected list this can
        raise or mis-assert. Compare plain Python lists instead.
        """

        int_missing_data = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4],
                            [1, 2, 3, 6]]
        columns = ["agent.hi", "agent.user_name", "agent.hello", "message"]
        data = pd.DataFrame(int_missing_data, columns=columns)

        clean = Data(
            x_train=data,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )
        clean.drop(regexp=r"agent*")

        # Plain-list equality gives one unambiguous boolean result.
        self.assertEqual(list(clean.x_train.columns), ["message"])
Exemplo n.º 4
0
    def test_report_cleaning_technique(self):
        """Dropping columns over the missing-value threshold should be reported."""

        sparse_rows = np.array([(1, 0, 0), (0, None, None),
                                (None, None, None)])
        frame = pd.DataFrame(sparse_rows, columns=["col1", "col2", "col3"])

        clean = Data(
            x_train=frame,
            test_split_percentage=0.5,
            split=False,
            report_name="test",
        )
        clean.drop_column_missing_threshold(0.5)

        with open(clean.report.filename) as report_file:
            report_text = report_file.read()
        found = "col2" in report_text and "col3" in report_text

        os.remove(clean.report.filename)

        self.assertTrue(found)
Exemplo n.º 5
0
    def test_settargetmapping(self):
        """The target_mapping property should round-trip an assigned value."""

        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        })

        step = Data(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # Write through the setter, read back through the getter.
        step.target_mapping = "a"

        self.assertEqual(step.target_mapping, "a")
Exemplo n.º 6
0
    def test_where(self):
        """where() should keep only rows matching all column conditions."""

        rows = [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3"])

        base = Data(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # col3 accepts a list of allowed values; only [0, 2, 3] matches all.
        matched = base.where(col1=0, col2=2, col3=[3, 4])

        self.assertListEqual(matched.values.tolist(), [[0, 2, 3]])
Exemplo n.º 7
0
    def test_encodelabels(self):
        """encode_target should map labels alphabetically to integer codes."""

        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["Bca", "Abc", "Bca", "Bca"],
        })

        step = Data(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="D",
            report_name="",
            test_split_percentage=0.5,
        )
        step.encode_target()

        # Alphabetical ordering: "Abc" -> 0, "Bca" -> 1.
        self.assertDictEqual(step.target_mapping, {0: "Abc", 1: "Bca"})
Exemplo n.º 8
0
    def test_preprocessnumeric_log(self):
        """Smoke test: log-normalize with natural, base-2 and base-10 logs."""

        raw = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
        frame = pd.DataFrame(raw, columns=["col1", "col2", "col3"])

        preprocess = Data(x_train=frame, test_split_percentage=0.5, report_name="test")

        # Exercise every supported log base; we only check nothing raises.
        preprocess.normalize_log()
        preprocess.normalize_log(base=2)
        preprocess.normalize_log(base=10)

        self.assertTrue(True)
Exemplo n.º 9
0
    def test_cleanutil_replacebfill(self):
        """Smoke test: backfill imputation runs on columns with trailing NaNs."""

        sparse_rows = [
            [1, 8, 1],
            [0, 9394, 2],
            [np.nan, np.nan, np.nan],
            [2, 4, 3],
            [np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan],
        ]
        frame = pd.DataFrame(sparse_rows,
                             columns=["col1", "col2", "col3"])

        step = Data(
            x_train=frame,
            test_split_percentage=0.5,
            report_name="test",
        )

        # Only checking that the call completes without raising.
        step.replace_missing_backfill("col1", "col2", "col3")

        self.assertTrue(True)
Exemplo n.º 10
0
    def test_search_notequal(self):
        """Smoke test: search() with not_equal and replace flags runs cleanly."""

        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        })

        step = Data(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # Only checking that the call completes without raising.
        step.search("A", not_equal=True, replace=True)

        self.assertTrue(True)
Exemplo n.º 11
0
    def test_groupbyanalysis(self):
        """Smoke test: groupby_analysis runs on a single grouping column."""

        frame = pd.DataFrame({
            "A": [1, 1, 2, 2],
            "B": [1, 2, 3, 4],
            "C": np.random.randn(4),
            "D": ["A", "A", "B", "B"],
        })

        base = Data(
            x_train=frame,
            x_test=None,
            split=False,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # Only checking that the call completes without raising.
        base.groupby_analysis(["A"])

        self.assertTrue(True)
Exemplo n.º 12
0
    def test_cleanutil_replacerandomdiscrete(self):
        """Random-discrete imputation should leave no missing values behind."""

        sparse_rows = [
            [1, 8, 1],
            [0, 9394, 2],
            [np.nan, np.nan, np.nan],
            [2, 4, 3],
        ]
        frame = pd.DataFrame(sparse_rows,
                             columns=["col1", "col2", "col3"])

        step = Data(
            x_train=frame,
            test_split_percentage=0.5,
            report_name="test",
        )
        step.replace_missing_random_discrete("col1", "col2", "col3")

        # True only if BOTH splits still contain a NaN after imputation.
        any_missing = np.any(step.x_train.isnull()) and np.any(
            step.x_test.isnull())

        self.assertFalse(any_missing)
Exemplo n.º 13
0
    def test_report_cleaning_new_category(self):
        """New-category imputation should mention every column in the report.

        Fix: the original bound the return value to an unused local
        (``clean_data``); the assignment is dropped.
        """

        missing_data = [[1.0, "Green", 2], [1.0, "Other", 1],
                        [np.nan, np.nan, np.nan]]

        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(missing_data, columns=columns)

        clean = Data(x_train=data,
                     test_split_percentage=0.5,
                     split=False,
                     report_name="test")
        # Return value is not needed here; we only inspect the report.
        clean.replace_missing_new_category()

        with open(clean.report.filename) as f:
            content = f.read()
        validate = "col1" in content and "col2" in content and "col3" in content

        os.remove(clean.report.filename)

        self.assertTrue(validate)
Exemplo n.º 14
0
    def test_preprocess_traindata(self):
        """Identical train/test frames should normalize to identical values."""

        raw = [[5.0, 3, 1], [2.0, 2, 1], [10.0, 1, 1]]
        names = ["col1", "col2", "col3"]

        # Same data on both sides, so the normalized outputs must match.
        train_frame = pd.DataFrame(raw, columns=names)
        test_frame = pd.DataFrame(raw, columns=names)

        preprocess = Data(
            x_train=train_frame,
            x_test=test_frame,
            test_split_percentage=0.5,
            report_name="test",
        )
        preprocess.normalize_numeric("col1", "col2", "col3")

        self.assertListEqual(preprocess.x_train.values.tolist(),
                             preprocess.x_test.values.tolist())
Exemplo n.º 15
0
    def test_ytrain_dne(self):
        """Assigning y_train should create a 'label' column and expose it back."""

        frame = pd.DataFrame(
            [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]],
            columns=["col1", "col2", "col3"],
        )

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        base.y_train = [1, 1]

        stored_ok = base.x_train["label"].tolist() == [1, 1]
        exposed_ok = base.y_train.tolist() == [1, 1]

        self.assertTrue(stored_ok and exposed_ok)
Exemplo n.º 16
0
    def test_normalize_column_names(self):
        """standardize_column_names should lowercase names and replace spaces."""

        frame = pd.DataFrame(
            np.zeros((4, 4)),
            columns=["PID", "CapsLock", "space column name", "Caps Space"],
        )

        base = Data(
            x_train=frame,
            x_test=frame,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        base.standardize_column_names()

        expected = ["pid", "capslock", "space_column_name", "caps_space"]
        renamed_ok = base.columns == expected
        # Train and test must be renamed in lockstep.
        in_sync = base.x_test.columns.tolist() == base.x_train.columns.tolist()

        self.assertTrue(renamed_ok and in_sync)
Exemplo n.º 17
0
    def test_preprocess_splitsentences(self):
        """split_sentences should break each text cell into a list of sentences."""

        text_data = [
            "Hi my name is aethos. Please split me.",
            "This function is going to split by sentence. Automation is great.",
        ]
        frame = pd.DataFrame(data=text_data, columns=["data"])

        prep = Data(x_train=frame, split=False, report_name="test")
        prep.split_sentences("data")

        expected = [
            ["Hi my name is aethos.", "Please split me."],
            [
                "This function is going to split by sentence.",
                "Automation is great.",
            ],
        ]

        self.assertListEqual(
            prep.x_train["data_sentences"].values.tolist(), expected)
Exemplo n.º 18
0
    def test_featureextractioncategorical_onehot(self):
        """One-hot encoding col1/col3 (dropping originals) keeps col2 as-is."""

        categorical_rows = [
            ["Green", "Green", "Car"],
            ["Green", "Other", "Truck"],
            ["Blue", "Other", "Truck"],
        ]
        frame = pd.DataFrame(categorical_rows,
                             columns=["col1", "col2", "col3"])

        feature = Data(
            x_train=frame,
            test_split_percentage=0.5,
            split=False,
            report_name="test",
        )
        feature.onehot_encode(list_of_cols=["col1", "col3"], keep_col=False)

        # Remaining layout: col2, then dummies for col1 and col3.
        expected = [["Green", 0, 1, 1, 0], ["Other", 0, 1, 0, 1],
                    ["Other", 1, 0, 0, 1]]

        self.assertListEqual(feature.x_train.values.tolist(), expected)
Exemplo n.º 19
0
    def test_cleanutil_replaceindicator_removecol(self):
        """replace_missing_indicator with keep_col=False should drop the
        source columns, leaving exactly three (indicator) columns per split.

        Fix: the original computed ``validate`` but then asserted the
        constant ``True``, so the test could never fail; assert the
        computed condition instead.
        """

        int_missing_data = [
            [1, 8, 1],
            [0, 9394, 2],
            [np.nan, np.nan, np.nan],
            [2, 4, 3],
            [np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan],
        ]

        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(int_missing_data, columns=columns)

        clean = Data(x_train=data,
                     test_split_percentage=0.5,
                     report_name="test")
        clean.replace_missing_indicator("col1", "col2", "col3", keep_col=False)

        validate = (clean.x_train.shape[1] == 3) and (clean.x_test.shape[1]
                                                      == 3)

        # Assert the computed result, not a hard-coded True.
        self.assertTrue(validate)
Exemplo n.º 20
0
    def test_json_normalize_split(self):
        """expand_json_column should flatten nested dicts into prefixed columns."""

        frame = pd.DataFrame({
            "col1": [1, 2],
            "col2": [
                ast.literal_eval("{'foo':1, 'bar':2, 'baz':{'foo':2, 'x':1}}"),
                ast.literal_eval("{'foo':3, 'bar':5, 'baz':{'foo':2, 'x':1}}"),
            ],
        })

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        base.expand_json_column("col2")

        # Nested keys are joined with '_' (e.g. baz.foo -> baz_foo).
        expected = ["col1", "foo", "bar", "baz_foo", "baz_x"]

        self.assertListEqual(base.x_test.columns.tolist(), expected)
Exemplo n.º 21
0
    def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(
            self):
        """Imputing a constant should touch only the listed columns."""

        missing_data = np.array([(1, "Green", 2), (1, "Other", 1),
                                 (None, None, None)])

        frame = pd.DataFrame(missing_data,
                             columns=["col1", "col2", "col3"])

        step = Data(
            x_train=frame,
            test_split_percentage=0.5,
            split=False,
            report_name="test",
        )
        # col2 is excluded, so its None must survive.
        step.replace_missing_new_category(list_of_cols=["col1", "col3"],
                                          new_category=0)

        expected = np.array([(1, "Green", 2), (1, "Other", 1),
                             (0, None, 0)]).tolist()

        self.assertListEqual(step.x_train.values.tolist(), expected)
Exemplo n.º 22
0
    def test_dropcolumns_complex(self):
        """drop() combining explicit names, regex, and a keep-list."""

        rows = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4],
                [1, 2, 3, 6]]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "py"])

        step = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )
        # Regex matches col1-col3, but col2 is protected by keep.
        step.drop("col1",
                  keep=["col2"],
                  regexp=r"col*",
                  reason="Columns were unimportant.")

        train_ok = list(step.x_train.columns) == ["col2", "py"]
        test_ok = list(step.x_test.columns) == ["col2", "py"]

        self.assertTrue(train_ok and test_ok)
Exemplo n.º 23
0
    def test_getattr(self):
        """Column access via attribute lookup (base.col1) should resolve."""

        frame = pd.DataFrame(
            [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]],
            columns=["col1", "col2", "col3"],
        )

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # __getattr__ delegation: a column name used as an attribute.
        self.assertIsNotNone(base.col1)
Exemplo n.º 24
0
    def test_columns_property(self):
        """The columns property should report all 10 generated columns."""

        frame = pd.DataFrame(np.random.rand(100, 10))

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="col3",
            report_name="test",
            test_split_percentage=0.5,
        )

        self.assertTrue(len(base.columns) == 10)
Exemplo n.º 25
0
    def test_dropcolumns_error(self):
        """drop() with only a keep argument and nothing to drop should raise."""

        rows = [[1, 0, 0, 3], [0, 2, 3, 4], [0, 3, 4, 4],
                [1, 2, 3, 6]]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "py"])

        step = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # No columns selected for dropping -> TypeError expected.
        self.assertRaises(TypeError, step.drop, keep="col2")
Exemplo n.º 26
0
    def test_ytest_split(self):
        """A 50% split of 4 rows should leave 2 target values in y_test."""

        frame = pd.DataFrame(
            [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]],
            columns=["col1", "col2", "col3"],
        )

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="col3",
            report_name="test",
            test_split_percentage=0.5,
        )

        self.assertTrue(len(base.y_test) == 2)
Exemplo n.º 27
0
    def test_setattr_trainset(self):
        """Item assignment should add a new column readable via __getitem__."""

        rows = [[1, 0, 0, 1], [0, 2, 3, 1], [0, 3, 4, 1],
                [1, 2, 3, 1]]
        frame = pd.DataFrame(rows, columns=["col1", "col2", "col3", "col4"])

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.75,
        )

        # 75% test split leaves a single training row, so one value suffices.
        base["col5"] = [4]

        self.assertListEqual(base["col5"].tolist(), [4])
Exemplo n.º 28
0
    def test_setitem_tupleeven(self):
        """A (train, test) tuple assignment should fill both splits fully."""

        frame = pd.DataFrame(
            [[1, 0, 0], [0, 2, 3], [0, 3, 4], [1, 2, 3]],
            columns=["col1", "col2", "col3"],
        )

        base = Data(
            x_train=frame,
            x_test=None,
            split=True,
            target_field="",
            report_name="test",
            test_split_percentage=0.5,
        )

        # First element goes to x_train, second to x_test.
        base["col4"] = ([5, 5], [2, 2])

        has_holes = any(base.x_train["col4"].isnull()) and any(
            base.x_test["col4"].isnull())

        self.assertFalse(has_holes)
Exemplo n.º 29
0
    def test_missing_data(self):
        """Smoke test: accessing the missing_values property runs cleanly.

        Fix: ``np.NaN`` was removed in NumPy 2.0 (the canonical spelling
        is ``np.nan``); switched to ``np.nan``, which is also what every
        other test in this file uses.
        """

        int_missing_data = [
            [np.nan, np.nan, np.nan],
            [0, 2, 3],
            [np.nan, np.nan, np.nan],
            [np.nan, np.nan, np.nan],
        ]
        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(int_missing_data, columns=columns)

        base = Data(
            x_train=data,
            x_test=None,
            split=True,
            target_field="",
            report_name="",
            test_split_percentage=0.5,
        )

        # Property access only; we just verify it doesn't raise.
        base.missing_values

        self.assertTrue(True)