def test_classifier(self):
        """Smoke-test every text-decomposition transformer inside a full
        tokenizer -> transformer -> classifier workflow on titanic names,
        requiring accuracy above 0.6 for each."""
        frame = datasets.load("titanic")[["Name", "Survived"]]
        target = np.array(frame.pop("Survived"))

        X_train, X_test, y_train, y_test = train_test_split(
            frame, target, test_size=0.2, random_state=0)

        # Wrap the raw splits in the framework's container types.
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer(
            "TrainLabel",
            dataset_instance=y_train,
            resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer(
            "TestLabel",
            dataset_instance=y_test,
            resource_manager=self.mock_resource_manager)
        # The single "Name" column is free text.
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])

        for transformer_cls in [
                TsvdTransformer,
                NmfTransformer,
                LsiTransformer,
                LdaTransformer,
                RpTransformer,
        ]:
            print("=========================")
            print(transformer_cls.__name__)
            print("=========================")
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = transformer_cls(
                **get_default_hp_of_cls(transformer_cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
            pipeline = ML_Workflow(
                [
                    ("tokenizer", tokenizer),
                    ("transformer", transformer),
                    ("classifier", classifier),
                ],
                resource_manager=self.mock_resource_manager)
            started_at = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            predictions = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test.data, predictions)
            elapsed = time() - started_at
            print("score:", accuracy)
            print("time:", elapsed)
            self.assertGreater(accuracy, 0.6)
            print('\n' * 2)
# Example #2
 def test_procedure(self):
     """Run every scaler through a scaler -> LinearSVR workflow on the
     class fixtures and report the resulting r2 score."""
     for scaler_cls in [
             MinMaxScaler,
             StandardScaler,
             Normalizer,
             QuantileTransformer,
             RobustScaler,
             KeepGoing,
             # WOEEncoder,  # does not support regression
     ]:
         print("=========================")
         print(scaler_cls.__name__)
         print("=========================")
         # KeepGoing takes no hyperparameters.
         hp = {} if scaler_cls == KeepGoing else get_default_hp_of_cls(scaler_cls)
         started_at = time()
         scaler = scaler_cls(in_feature_groups="num",
                             out_feature_groups="scaled",
                             **hp)
         workflow = ML_Workflow(
             steps=[("scaler", scaler),
                    ("rf", LinearSVR(random_state=0))],
             resource_manager=self.mock_resource_manager)
         workflow.fit(X_train=self.X_train,
                      X_valid=self.X_test,
                      y_train=self.y_train,
                      y_valid=self.y_test)
         predictions = workflow.predict(self.X_test)
         score = r2_score(self.y_test.data, predictions)
         print("r2 = ", score)
         print("time = ", time() - started_at)
         print("\n" * 2)
# Example #3
 def test_procedure(self):
     """Run every categorical encoder through an encoder -> random-forest
     regression workflow on the class fixtures and report the r2 score."""
     for encoder_cls in [
             EntityEncoder,
             TargetEncoder,
             BinaryEncoder,
             CatBoostEncoder,
             OrdinalEncoder,
             LeaveOneOutEncoder,
             OneHotEncoder,
             # WOEEncoder,  # does not support regression
     ]:
         print("=========================")
         print(encoder_cls.__name__)
         print("=========================")
         started_at = time()
         encoder = encoder_cls(in_feature_groups="cat",
                               out_feature_groups="num",
                               **get_default_hp_of_cls(encoder_cls))
         workflow = ML_Workflow(
             steps=[("encoder", encoder),
                    ("rf", RandomForestRegressor(random_state=0))],
             resource_manager=self.mock_resource_manager)
         workflow.fit(X_train=self.X_train,
                      X_valid=self.X_test,
                      y_train=self.y_train,
                      y_valid=self.y_test)
         predictions = workflow.predict(self.X_test)
         score = r2_score(self.y_test.data, predictions)
         print("r2 = ", score)
         print("time = ", time() - started_at)
         print("\n" * 2)
# Example #4
 def test(self):
     """Verify each dimensionality reducer produces exactly the number of
     components implied by the requested `sp1_ratio`."""
     for reducer_cls in [PCA, FastICA, KernelPCA]:
         print("=========================")
         print(reducer_cls.__name__)
         print("=========================")
         for idx in [1]:
             X = self.Xs[idx]
             y = self.ys[idx]
             for ratio in [0, 0.25, 0.5, 1]:
                 hp = get_default_hp_of_cls(reducer_cls)
                 hp.update({
                     "in_feature_groups": "num",
                     "out_feature_groups": "reduced",
                     "_n_components__sp1_ratio": ratio,
                 })
                 started_at = time()
                 reducer = reducer_cls(**hp)
                 reduced = reducer.fit_transform(X, y)["X_train"]
                 expected_cols = self.calc_sp1(X.shape, ratio)
                 # delta=0 demands an exact component-count match.
                 assert very_close(reduced.shape[1], expected_cols, delta=0)
                 print("consuming time :", time() - started_at)
                 print("assign ratio :", ratio)
                 print("actual ratio :", reduced.shape[1] / X.shape[1])
                 print("origin shape :", X.shape)
                 print("actual shape :", reduced.shape)
                 print("\n" * 2)
# Example #5
    def test_under_sample(self):
        """Fit a balancer -> LinearSVC workflow for every under-sampler,
        print the class counts before/after balancing, and require the
        resulting accuracy to stay above 0.6."""
        sampler_classes = [
            AllKNN,
            ClusterCentroids,
            CondensedNearestNeighbour,
            EditedNearestNeighbours,
            InstanceHardnessThreshold,
            NearMiss,
            NeighbourhoodCleaningRule,
            OneSidedSelection,
            RandomUnderSampler,
            RepeatedEditedNearestNeighbours,
            TomekLinks,
        ]

        for sampler_cls in sampler_classes:
            print("=========================")
            print(sampler_cls.__name__)
            print("=========================")
            balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            # Store intermediate results so the balanced labels can be
            # recovered after fitting.
            pipeline = ML_Workflow(
                [
                    ("balancer", balancer),
                    ("classifier", classifier),
                ],
                resource_manager=self.mock_resource_manager,
                should_store_intermediate_result=True)
            started_at = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            predictions = pipeline.predict(self.X_test)
            accuracy = accuracy_score(self.y_test.data, predictions)
            elapsed = time() - started_at
            print("score:", accuracy)
            print("time:", elapsed)
            self.assertGreater(accuracy, 0.6)
            print('\n' * 2)
# Example #6
    def test_over_sample(self):
        """Fit a balancer -> LinearSVC workflow for every over-sampler,
        print the class counts before/after balancing, and require the
        resulting accuracy to stay above 0.6."""
        sampler_classes = [
            RandomOverSampler,
            # ADASYN,
            BorderlineSMOTE,
            KMeansSMOTE,
            SMOTE,
            SVMSMOTE,
        ]

        for sampler_cls in sampler_classes:
            print("=========================")
            print(sampler_cls.__name__)
            print("=========================")
            balancer = sampler_cls(**get_default_hp_of_cls(sampler_cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            # Store intermediate results so the balanced labels can be
            # recovered after fitting.
            pipeline = ML_Workflow(
                [
                    ("balancer", balancer),
                    ("classifier", classifier),
                ],
                resource_manager=self.mock_resource_manager,
                should_store_intermediate_result=True)
            started_at = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            predictions = pipeline.predict(self.X_test)
            accuracy = accuracy_score(self.y_test.data, predictions)
            elapsed = time() - started_at
            print("score:", accuracy)
            print("time:", elapsed)
            self.assertGreater(accuracy, 0.6)
            print('\n' * 2)
# Example #7
 def test_io(self):
     """Check that transformers retag the output feature group and keep the
     original index and columns intact."""
     for transformer_cls in [
             PowerTransformer,
             QuantileTransformer,
             KeepGoing,
     ]:
         transformer = transformer_cls(**get_default_hp_of_cls(transformer_cls))
         transformer.in_feature_groups = "num"
         transformer.out_feature_groups = "final"
         result = transformer.fit_transform(X_train=self.X_train,
                                            X_valid=self.X_test,
                                            y_train=self.y_train)
         transformed = result["X_train"]
         assert np.all(transformed.feature_groups == "final")
         assert np.all(transformed.index == self.index)
         assert np.all(transformed.columns == self.X_train.columns)
# Example #8
 def test_handle_unknown(self):
     """Encoders must handle categories unseen at fit time: transform() on
     the fit data and on all-unknown validation data must reproduce the
     corresponding fit_transform() outputs."""
     columns = ['col1', 'col2', 'col3']
     train_rows = [
         ['A', 'alpha', 9],
         ['A', 'alpha', 1],
         ['B', 'beta', 2],
         ['B', 'beta', 3],
         ['C', 'gamma', 4],
         ['C', 'gamma', 5],
     ]
     # Every category here is absent from the training rows.
     valid_rows = [
         ['D', 'kappa', 6],
         ['D', 'kappa', 6],
         ['E', 'sigma', 7],
         ['E', 'sigma', 7],
         ['F', 'mu', 8],
         ['F', 'mu', 8],
     ]
     X_train = DataFrameContainer(
         dataset_instance=pd.DataFrame(train_rows, columns=columns))
     X_valid = DataFrameContainer(
         dataset_instance=pd.DataFrame(valid_rows, columns=columns))
     X_train.set_feature_groups(['cat'] * 3)
     X_valid.set_feature_groups(['cat'] * 3)
     y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
     for encoder_cls in [
             EntityEncoder, OrdinalEncoder, OneHotEncoder, TargetEncoder,
             CatBoostEncoder
     ]:
         encoder = encoder_cls(**get_default_hp_of_cls(encoder_cls))
         encoder.in_feature_groups = "cat"
         encoder.out_feature_groups = "ordinal"
         result = encoder.fit_transform(X_train=X_train,
                                        X_valid=X_valid,
                                        y_train=y_train)
         # transform() returns its output under the 'X_train' key regardless
         # of which container was passed in.
         assert np.all(
             encoder.transform(X_train)['X_train'].data ==
             result['X_train'].data)
         assert np.all(
             encoder.transform(X_valid)['X_train'].data ==
             result['X_valid'].data)
    def test_classifier(self):
        """Fit each iterative classifier on the digits dataset, check that
        the held-out score equals the best recorded performance, print the
        early-stopping diagnostics, and save each learning curve as a PNG
        under ``self.plot_dir``."""
        X, y = datasets.load_digits(return_X_y=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        # Wrap the raw splits in the framework's container types.
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)

        est_cls_list = [
            LogisticRegression,
            GradientBoostingClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
            SGDClassifier,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            est = cls(**get_default_hp_of_cls(cls))
            start = time()
            est.fit(X_train, y_train, X_test, y_test)
            score = est.component.score(X_test.data, y_test.data)
            end = time()
            print("score:", score)
            print("time:", end - start)
            # assertEqual reports both values on failure, unlike assertTrue.
            self.assertEqual(score, np.max(est.performance_history))
            print("max_iterations:", est.max_iterations)
            print("best_iteration_:", est.best_iteration_)
            print("early_stopping_rounds:", est.early_stopping_rounds)
            print("early_stopping_tol:", est.early_stopping_tol)
            print("iter_inc:", est.iter_inc)
            print("iteration:", est.iteration)
            print("iter_ix:", est.iter_ix)
            print("min_performance:", np.min(est.performance_history))
            print("max_performance:", np.max(est.performance_history))
            print("learning_curve:", est.learning_curve)
            print("estimator:", est)
            print('\n' * 2)
            learning_curve = est.learning_curve
            plt.grid()
            plt.plot(learning_curve[0], learning_curve[1], label="Train Set")
            plt.plot(learning_curve[0], learning_curve[2], label="Valid Set")
            plt.xlabel(est.iterations_name)
            plt.ylabel("Accuracy")
            title = cls.__name__
            plt.title(title)
            plt.axvline(x=est.best_iteration_, ls="--", c="k")
            plt.legend(loc="best")
            # BUG FIX: dropped `quality=100` — that kwarg was deprecated in
            # Matplotlib 3.3 and removed in 3.5, and it only ever applied to
            # JPEG output, so it never affected this PNG. `dpi` alone
            # controls the output resolution.
            plt.savefig(self.plot_dir + f"/{title}.png", dpi=600)
            plt.close()