Пример #1
0
    def test_facet_grid_scatter_plot(self):
        """Smoke test: facet_grid_scatter_plot runs without raising.

        The function is called twice on the same data, once with the ``c``
        argument and once with the ``hue`` argument. Nothing is asserted
        about the produced figure.
        """
        np.random.seed(1)

        df = load_iris(target="species")
        n_rows = df.shape[0]

        # Random facet labels. "grid_col" is drawn before "grid_row" so the
        # seeded random stream is consumed in a fixed order.
        df["grid_col"] = np.random.choice(["c1", "c2"], size=n_rows, replace=True)
        df["grid_row"] = np.random.choice(["r1", "r2", "r3"], size=n_rows, replace=True)

        # Keyword arguments common to both calls.
        shared_kwargs = dict(data=df,
                             row="grid_row",
                             col="grid_col",
                             x="petal_width",
                             y="petal_length")

        ## call with the ``c`` argument
        facet_grid_scatter_plot(**shared_kwargs, c="sepal_width")

        ## call with the ``hue`` argument
        facet_grid_scatter_plot(**shared_kwargs, hue="species")
Пример #2
0
    def test_load_iris(self):
        """unittest for load_iris

        Loads the data with "species" as the target name and checks the
        returned type, the (150, 5) shape, that the target is the last
        column, and that the target column holds exactly the three expected
        labels.
        """
        target = "species"
        expected_labels = {"setosa", "versicolor", "virginica"}
        df = load_iris(target=target)

        # assertIsInstance reports the actual type on failure, which
        # assertTrue(isinstance(...)) does not.
        self.assertIsInstance(df, pd.DataFrame)
        self.assertEqual((150, 5), df.shape)
        self.assertEqual(target, list(df.columns)[-1])
        self.assertEqual(expected_labels, set(df[target].unique()))
Пример #3
0
    @classmethod
    def setUpClass(cls) -> None:
        """Fit the grid-search fixtures shared by the tests of this class.

        unittest invokes this hook on the class itself, so it has to be a
        classmethod. The following attributes are stored on ``cls``:

        * ``iris_X`` / ``iris_y`` and ``iris_plr`` (logistic regression grid
          search on the iris data)
        * ``boston_X`` / ``boston_y``, ``boston_tree`` (decision tree
          regressor grid search) and ``boston_enet`` (elastic net grid search)
        * ``breast_cancer_X`` / ``breast_cancer_y`` and ``breast_cancer_plr``
          (logistic regression grid search on the breast cancer data)

        NOTE(review): ``iid`` (GridSearchCV) and ``normalize`` (ElasticNet)
        appear to have been removed in later scikit-learn releases; confirm
        the scikit-learn version this suite is pinned to before upgrading.
        """
        np.random.seed(1)

        ## iris (multi-class classification)
        df = load_iris(target="label")
        # frac=1 without replacement keeps every row but in shuffled order.
        df = df.sample(frac=1, replace=False)
        cls.iris_X = df.drop("label", axis=1)
        cls.iris_y = df["label"]

        cls.iris_plr = GridSearchCV(LogisticRegression(solver="liblinear",
                                                       multi_class="auto"),
                                    param_grid={"C": [0.1, 1]},
                                    cv=3,
                                    iid=False,
                                    return_train_score=True)
        cls.iris_plr.fit(cls.iris_X, cls.iris_y)

        ## boston dataset (regression)
        target = "price"
        df = load_boston(target=target)
        df = df.sample(frac=1, replace=False)
        # Use the ``target`` name throughout instead of repeating the literal.
        cls.boston_X = df.drop(target, axis=1)
        cls.boston_y = df[target]

        cls.boston_tree = GridSearchCV(DecisionTreeRegressor(),
                                       param_grid={"max_depth": [3, 5, 7]},
                                       cv=5,
                                       scoring="neg_mean_squared_error",
                                       iid=False,
                                       return_train_score=False)
        cls.boston_tree.fit(cls.boston_X, cls.boston_y)

        cls.boston_enet = GridSearchCV(ElasticNet(normalize=True),
                                       param_grid={
                                           "alpha": [0.1, 1.0],
                                           "l1_ratio": [0.1, 0.5, 0.9]
                                       },
                                       cv=3,
                                       scoring="neg_mean_squared_error",
                                       iid=False,
                                       return_train_score=True)
        cls.boston_enet.fit(cls.boston_X, cls.boston_y)

        ## breast (binary classification)
        # Unlike the two datasets above, the rows are not shuffled here.
        df = load_breast_cancer("label")
        cls.breast_cancer_X = df.drop("label", axis=1)
        cls.breast_cancer_y = df["label"]

        cls.breast_cancer_plr = GridSearchCV(
            LogisticRegression(solver="liblinear"),
            param_grid={"C": [0.1, 1]},
            cv=3)
        cls.breast_cancer_plr.fit(cls.breast_cancer_X, cls.breast_cancer_y)
Пример #4
0
    def test_to_excel(self):
        """Check to_excel on a new file, on an added sheet and on an overwritten sheet.

        After every write the workbook's sheet names are compared with the
        expected list, and the written sheet is read back with
        pandas.read_excel to compare column names and shape.
        """
        df_iris = load_iris()
        df_boston = load_boston()
        df_cancer = load_breast_cancer().loc[:10, :]

        from openpyxl import load_workbook
        sheets = ["iris", "boston"]

        ## Each entry: (table to write, sheet name, libreoffice flag,
        ##              sheet names expected in the workbook afterwards)
        cases = [
            ## Case 1) Create a new file
            (df_iris, sheets[0], True, [sheets[0]]),
            ## Case 2) Add a sheet to an existing excel file
            (df_boston, sheets[1], True, sheets),
            ## Case 3) Overwrite an existing sheet with a new table
            ## This table has no style.
            (df_cancer, sheets[0], False, sheets),
        ]

        with TempDir() as temp_dir:
            excel_file = temp_dir.joinpath("new_file.xlsx")

            # The cases run in order against the same file, so each one
            # builds on the workbook left by the previous one.
            for table, sheet, libreoffice, expected_sheets in cases:
                to_excel(df=table,
                         file=excel_file,
                         sheet=sheet,
                         libreoffice=libreoffice)

                workbook = load_workbook(str(excel_file))
                self.assertEqual(expected_sheets, workbook.sheetnames)

                ## whether we can read the excel through pandas.read_excel
                reloaded = pd.read_excel(excel_file, sheet_name=sheet)
                self.assertEqual(table.columns.tolist(),
                                 reloaded.columns.tolist())
                self.assertEqual(table.shape, reloaded.shape)
Пример #5
0
    def test_bins_heatmap(self):
        """Smoke test: bins_heatmap runs without raising on the iris data."""
        np.random.seed(2)
        df = load_iris(target="species")

        # Random two-level label passed to bins_heatmap as ``cat1``.
        n_rows = df.shape[0]
        df["cat1"] = np.random.choice(["a", "b"], size=n_rows, replace=True)

        heatmap_kwargs = dict(cat1="cat1",
                              cat2="species",
                              x="petal_width",
                              y="petal_length",
                              target="sepal_length",
                              n_bins=3)
        bins_heatmap(df, **heatmap_kwargs)
Пример #6
0
    def test_bins_by_tree(self):
        """unittest for bins_by_tree

        The same checks run for a continuous and for a non-continuous
        target: the result is a Series with exactly ``n_bins`` distinct
        values, each a right-closed pd.Interval, and the sorted intervals
        start at -inf and end at +inf.
        """
        df = load_iris(target="species")
        cols = list(df.columns)
        n_bins = 3

        ## Each entry: (target column, target_is_continuous)
        scenarios = [
            ## Case) The target variable is continuous
            (cols[3], True),
            ## Case) The target variable is not continuous
            (cols[-1], False),
        ]

        for target_col, is_continuous in scenarios:
            bins = bins_by_tree(df,
                                field=cols[2],
                                target=target_col,
                                target_is_continuous=is_continuous,
                                n_bins=n_bins,
                                n_points=200,
                                precision=1)
            cats = sorted(bins.unique())

            self.assertIsInstance(bins, pd.Series)
            self.assertEqual(len(bins.unique()), n_bins)

            for interval in cats:
                self.assertIsInstance(interval, pd.Interval)
                self.assertEqual(interval.closed, "right")

            # The outermost bins must be unbounded on their outer side.
            lowest, highest = cats[0], cats[-1]
            self.assertEqual(lowest.left, -np.inf)
            self.assertEqual(highest.right, np.inf)
Пример #7
0
    def test_visualize_two_fields(self):
        """Smoke test: Inspector.visualize_two_fields runs for every pairing of field kinds."""
        np.random.seed(1)
        df = load_iris(target="species")

        # Extra random three-level column used for the last pairing.
        n_rows = df.shape[0]
        df["cat"] = np.random.choice(["a", "b", "c"], size=n_rows, replace=True)
        inspector = Inspector(df)

        field_pairs = [
            ("sepal_width", "sepal_length"),  ## continuous x continuous
            ("sepal_length", "species"),      ## continuous x categorical
            ("species", "petal_width"),       ## categorical x continuous
            ("species", "cat"),               ## categorical x categorical
        ]
        for first_field, second_field in field_pairs:
            inspector.visualize_two_fields(first_field, second_field)
Пример #8
0
try:
    from adhoc.processing import Inspector
    from adhoc.modeling import show_tree
    from adhoc.utilities import load_iris, facet_grid_scatter_plot, bins_heatmap
except ImportError:
    import sys
    sys.path.append("..")
    from adhoc.processing import Inspector
    from adhoc.modeling import show_tree
    from adhoc.utilities import load_iris, facet_grid_scatter_plot, bins_heatmap

# +
np.random.seed(1)

df = load_iris(target="species")
df["cat1"] = np.random.choice(["a", "b", "c"], size=df.shape[0], replace=True)

# cat2 is 1 where (column 0 * column 1 - column 2 * column 3) exceeds 11,
# and 0 otherwise.
cat2_mask = df.iloc[:, 0] * df.iloc[:, 1] - df.iloc[:, 2] * df.iloc[:, 3] > 11
df["cat2"] = cat2_mask.map({True: 1, False: 0})

inspector = Inspector(df)
inspector  ## 4 continuous variables and 3 categorical variables
# -

inspector.visualize_two_fields("sepal_width", "sepal_length")  ## continuous x continuous

inspector.visualize_two_fields("petal_width",