示例#1
0
 def setUp(self) -> None:
     """Prepare a binarized iris train/test split wrapped in containers."""
     super(TestBalance, self).setUp()
     features, labels = load_iris(return_X_y=True)
     # collapse class 2 into class 1 -> binary classification target
     labels[labels == 2] = 1
     split = train_test_split(features,
                              labels,
                              test_size=0.2,
                              random_state=0)
     raw_X_train, raw_X_test, raw_y_train, raw_y_test = split
     self.X_train = DataFrameContainer(
         "TrainSet",
         dataset_instance=raw_X_train,
         resource_manager=self.mock_resource_manager)
     self.X_test = DataFrameContainer(
         "TestSet",
         dataset_instance=raw_X_test,
         resource_manager=self.mock_resource_manager)
     self.y_train = NdArrayContainer(
         "TrainLabel",
         dataset_instance=raw_y_train,
         resource_manager=self.mock_resource_manager)
     self.y_test = NdArrayContainer(
         "TestLabel",
         dataset_instance=raw_y_test,
         resource_manager=self.mock_resource_manager)
     # iris has 4 numeric feature columns
     self.X_train.set_feature_groups(["num"] * 4)
     self.X_test.set_feature_groups(["num"] * 4)
示例#2
0
 def test_upload_download(self):
     """Round-trip a DataFrameContainer through upload()/download and
     verify data, metadata and index survive unchanged."""
     titanic_df = load("titanic")
     # non-trivial index so the round trip must preserve row ordering
     titanic_df.index = reversed(titanic_df.index)
     original = DataFrameContainer(dataset_instance=titanic_df,
                                   resource_manager=self.mock_resource_manager)
     original.set_feature_groups(
         [f"feat_{i}" for i in range(original.shape[1])])
     # NOTE(review): read but never asserted — possibly leftover; kept as-is
     column_descriptions = original.column_descriptions
     original.upload()
     restored = DataFrameContainer(
         "Unittest",
         dataset_id=original.dataset_id,
         resource_manager=self.mock_resource_manager)
     # fillna(0) so NaN != NaN does not spoil the element-wise comparison
     self.assertTrue(
         np.all(restored.data.fillna(0) == original.data.fillna(0)))
     self.assertTrue(
         np.all(restored.feature_groups == original.feature_groups))
     self.assertTrue(np.all(restored.columns == original.columns))
     self.assertTrue(np.all(restored.index == original.index))
     self.assertEqual(restored.column_descriptions,
                      original.column_descriptions)
     self.assertEqual(restored.columns_mapper, original.columns_mapper)
     self.assertEqual(restored.dataset_type, original.dataset_type)
     self.assertEqual(restored.dataset_source, original.dataset_source)
    def test_classifier(self):
        """Smoke-test every text-decomposition transformer in a workflow.

        Each transformer is chained behind a simple tokenizer and in front
        of a random forest, fitted on the titanic "Name" column, and must
        reach more than 0.6 accuracy on the held-out split.
        """
        train_df = datasets.load("titanic")[["Name", "Survived"]]
        y = np.array(train_df.pop("Survived"))

        raw_X_train, raw_X_test, raw_y_train, raw_y_test = train_test_split(
            train_df, y, test_size=0.2, random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=raw_X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=raw_X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=raw_y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=raw_y_test,
                                  resource_manager=self.mock_resource_manager)
        # the single remaining column ("Name") is free text
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])
        for transformer_cls in (TsvdTransformer, NmfTransformer,
                                LsiTransformer, LdaTransformer,
                                RpTransformer):
            print("=========================")
            print(transformer_cls.__name__)
            print("=========================")
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = transformer_cls(
                **get_default_hp_of_cls(transformer_cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
            pipeline = ML_Workflow(
                [
                    ("tokenizer", tokenizer),
                    ("transformer", transformer),
                    ("classifier", classifier),
                ],
                resource_manager=self.mock_resource_manager)
            started_at = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test.data, y_pred)
            elapsed = time() - started_at
            print("score:", score)
            print("time:", elapsed)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
示例#4
0
 def process_X(self, X: DataFrameContainer, X_origin):
     """Align ``X`` with the stored training-set column layout.

     Parameters
     ----------
     X : DataFrameContainer or None
         Container to normalise; ``None`` is passed through unchanged.
     X_origin : np.ndarray, pd.DataFrame or DataFrameContainer
         The raw object ``X`` was built from; its type decides how the
         column labels are reconciled.

     Returns
     -------
     DataFrameContainer or None
         ``X`` with columns matching ``self.columns`` and its feature
         groups set to ``self.feature_groups``.
     """
     if X is None:
         return None
     # column count must match the training set before any alignment
     assert X.shape[1] == len(self.columns)
     if isinstance(X_origin, np.ndarray):
         # raw arrays carry no labels: stamp the training-set columns on
         X.columns = self.columns
     elif isinstance(X_origin, pd.DataFrame):
         # same column *set* is required; only the order may differ
         assert set(X.columns) == set(self.columns)
         if not np.all(X.columns == self.columns):
             self.logger.warning(
                 f"{X.dataset_source}'s columns do not match the TrainSet's columns by position!"
             )
             # reorder by label so positions line up with the training set
             X.data = X.data[self.columns]
     elif isinstance(X_origin, DataFrameContainer):
         # presumably already normalised upstream — TODO confirm
         pass
     else:
         raise NotImplementedError
     X.set_feature_groups(self.feature_groups)
     return X
示例#5
0
 def test_handle_unknown(self):
     """Encoders must transform deterministically even when the validation
     frame contains only categories never seen during fit."""
     X_train = pd.DataFrame(
         [
             ['A', 'alpha', 9],
             ['A', 'alpha', 1],
             ['B', 'beta', 2],
             ['B', 'beta', 3],
             ['C', 'gamma', 4],
             ['C', 'gamma', 5],
         ],
         columns=['col1', 'col2', 'col3'])
     # every value below is absent from the training frame
     X_valid = pd.DataFrame(
         [
             ['D', 'kappa', 6],
             ['D', 'kappa', 6],
             ['E', 'sigma', 7],
             ['E', 'sigma', 7],
             ['F', 'mu', 8],
             ['F', 'mu', 8],
         ],
         columns=['col1', 'col2', 'col3'])
     X_train = DataFrameContainer(dataset_instance=X_train)
     X_valid = DataFrameContainer(dataset_instance=X_valid)
     X_train.set_feature_groups(['cat'] * 3)
     X_valid.set_feature_groups(['cat'] * 3)
     y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
     encoder_classes = (EntityEncoder, OrdinalEncoder, OneHotEncoder,
                        TargetEncoder, CatBoostEncoder)
     for encoder_cls in encoder_classes:
         encoder = encoder_cls(**get_default_hp_of_cls(encoder_cls))
         encoder.in_feature_groups = "cat"
         encoder.out_feature_groups = "ordinal"
         result = encoder.fit_transform(X_train=X_train,
                                        X_valid=X_valid,
                                        y_train=y_train)
         # transform() keys its output by parameter slot, hence 'X_train'
         # even when the validation container is passed in
         assert np.all(encoder.transform(X_train)['X_train'].data ==
                       result['X_train'].data)
         assert np.all(encoder.transform(X_valid)['X_train'].data ==
                       result['X_valid'].data)
示例#6
0
    def test_ordinal_encode_category(self):
        """Ordinal-encode pandas Categorical columns whose declared category
        order differs from first-appearance order in the data."""
        df2 = pd.DataFrame([
            ['C', '3'],
            ['D', '4'],
            ['D', '4'],
        ],
                           columns=['alpha', 'digits'])
        # explicit (reversed) category orders: the resulting codes follow
        # the declared ordering, not the order values appear in the data
        df2["digits"] = df2["digits"].astype(
            CategoricalDtype(categories=["4", "3"], ordered=True))
        df2["alpha"] = df2["alpha"].astype(
            CategoricalDtype(categories=["D", "C"], ordered=True))
        # build a 4-row frame from two overlapping slices, then renumber
        df2_c = pd.concat([df2.loc[1:, :], df2.loc[:1, :]])
        df2_c.index = range(4)
        encoder = OrdinalEncoder()
        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"
        dc = DataFrameContainer(dataset_instance=df2_c)
        dc.set_feature_groups(["cat"] * 2)
        encoder.fit(X_train=dc)
        result = encoder.transform(X_train=dc)["X_train"]
        print(result)
        # rows are D/4, D/4, C/3, D/4 -> codes 0, 0, 1, 0 in both columns
        should_be = pd.DataFrame({
            'alpha': [0, 0, 1, 0],
            'digits': [0, 0, 1, 0],
        })
        assert np.all(result.data == should_be)
示例#7
0
 def setUp(self) -> None:
     """Load the qsar dataset, remap zeros to -1, reverse the index and
     expose containers for features, labels and a noisy regression target."""
     super(TestFeatureSelection, self).setUp()
     self.L = 1024
     qsar_df = load("qsar")
     target = qsar_df.pop("target")
     features = qsar_df
     features[features == 0] = -1
     # reversed index lets later tests check index preservation
     features.index = reversed(features.index)
     self.index = deepcopy(features.index)
     self.X = DataFrameContainer("TrainSet",
                                 dataset_instance=features,
                                 resource_manager=self.mock_resource_manager)
     self.X.set_feature_groups(["num"] * self.X.shape[1])
     self.y = NdArrayContainer("TrainSet",
                               dataset_instance=target,
                               resource_manager=self.mock_resource_manager)
     # regression target: add uniform noise so values are continuous
     self.y_reg = NdArrayContainer(
         "TrainSet",
         dataset_instance=target + np.random.rand(*target.shape),
         resource_manager=self.mock_resource_manager)
示例#8
0
 def setUp(self) -> None:
     """Build the full qsar dataset plus a 500-row truncated copy and
     expose them as the parallel lists self.Xs / self.ys."""
     super(RunReduce, self).setUp()
     self.L = 1024
     qsar_df = load("qsar")
     target = qsar_df.pop("target")
     features = qsar_df
     features[features == 0] = -1
     # reversed index lets later tests check index preservation
     features.index = reversed(features.index)
     self.index = deepcopy(features.index)
     full_X = DataFrameContainer("TrainSet", dataset_instance=features)
     full_X.set_feature_groups(["num"] * full_X.shape[1])
     small_X = deepcopy(full_X)
     small_y = deepcopy(target)
     N = 500
     # truncate the copies to their first N rows
     small_X.data = small_X.data.iloc[:N, :]
     small_X.set_feature_groups(["num"] * small_X.shape[1])
     small_y = small_y.iloc[:N]
     self.Xs = [full_X, small_X]
     self.ys = [
         NdArrayContainer("TrainLabel", dataset_instance=target),
         NdArrayContainer("TrainLabel", dataset_instance=small_y),
     ]