Example #1
    def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
        # Test with a 1-D array as input to the transformers
        train_X = train_X[:, 0]
        test_X = test_X[:, 0]
        trainable_pipeline = (
            (trainable & NoOp()) >> ConcatFeatures() >> float32_transform() >> LR()
        )
        trained_pipeline = trainable_pipeline.fit(train_X, train_y)
        trained_pipeline.predict(test_X)
        hyperopt = Hyperopt(estimator=trainable_pipeline, max_evals=1)
        trained_hyperopt = hyperopt.fit(train_X, train_y)
        trained_hyperopt.predict(test_X)
Example #2
File: test_optimizers.py Project: IBM/lale
def run_hyperopt_on_planned_pipeline(planned_pipeline, max_iters=1):
    # data
    from sklearn.datasets import load_iris

    features, labels = load_iris(return_X_y=True)
    # set up optimizer
    from lale.lib.lale.hyperopt import Hyperopt

    opt = Hyperopt(estimator=planned_pipeline, max_evals=max_iters)
    # run optimizer
    _ = opt.fit(features, labels)
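A hypothetical call site for this helper, for illustration only; the planned pipeline and the imports below are assumptions, not part of the original test:

from lale.lib.lale import NoOp
from lale.lib.sklearn import PCA, LogisticRegression

# a planned pipeline: optionally reduce dimensions with PCA, then classify
planned = (PCA | NoOp) >> LogisticRegression
run_hyperopt_on_planned_pipeline(planned, max_iters=2)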
Example #3
    def test_nested_pipeline1(self):
        from sklearn.datasets import load_iris
        from lale.lib.lale import Hyperopt
        from sklearn.metrics import accuracy_score

        data = load_iris()
        X, y = data.data, data.target
        # pipeline = KNeighborsClassifier() | (OneHotEncoder(handle_unknown='ignore') >> LogisticRegression())
        pipeline = KNeighborsClassifier() | (SimpleImputer() >> LogisticRegression())
        clf = Hyperopt(estimator=pipeline, max_evals=1)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
Example #4
    def test_runtime_limit_zero_time_hor(self):
        planned_pipeline = (MinMaxScaler | Normalizer) >> LinearRegression
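        # note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
        # so this test only runs against older scikit-learn versions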
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)

        hor = Hyperopt(estimator=planned_pipeline,
                       max_evals=1,
                       cv=3,
                       max_opt_time=0.0,
                       scoring='r2')
        hor_fitted = hor.fit(X, y)
        assert hor_fitted.get_pipeline() is None
Example #5
    def _fit_hyperopt(self, X, y):
        from lale.lib.lale import Hyperopt, NoOp
        from lale.lib.sklearn import (
            PCA,
            DecisionTreeClassifier,
            DecisionTreeRegressor,
            KNeighborsClassifier,
            KNeighborsRegressor,
            MinMaxScaler,
            RandomForestClassifier,
            RandomForestRegressor,
            RobustScaler,
            SelectKBest,
            SGDClassifier,
            SGDRegressor,
            StandardScaler,
        )

        prep = auto_prep(X)
        scale = MinMaxScaler | StandardScaler | RobustScaler | NoOp
        reduce_dims = PCA | SelectKBest | NoOp
        gbt = auto_gbt(self.prediction_type)
        if self.prediction_type == "regression":
            estim_trees = gbt | DecisionTreeRegressor | RandomForestRegressor
            estim_notree = SGDRegressor | KNeighborsRegressor
        else:
            estim_trees = gbt | DecisionTreeClassifier | RandomForestClassifier
            estim_notree = SGDClassifier | KNeighborsClassifier
        model_trees = reduce_dims >> estim_trees
        model_notree = scale >> reduce_dims >> estim_notree
        planned = prep >> (model_trees | model_notree)
        trainable = Hyperopt(
            estimator=planned,
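            # remaining trial budget: the overall max_evals minus the trials
            # already recorded in the running summary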
            max_evals=self.max_evals - self._summary.shape[0],
            scoring=self.scoring,
            best_score=self.best_score,
            max_opt_time=self.max_opt_time - (time.time() - self._start_fit),
            max_eval_time=self.max_eval_time,
            verbose=self.verbose,
            show_progressbar=False,
        )
        trained = trainable.fit(X, y)
        best_trial = trained._impl._trials.best_trial
        if "loss" in best_trial["result"]:
            if (best_trial["result"]["loss"] <
                    self._summary.at[self._name_of_best, "loss"]):
                self._name_of_best = f'p{best_trial["tid"]}'
        summary = trained.summary()
        self._summary = pd.concat([self._summary, summary])
        for name in summary.index:
            assert name not in self._pipelines
            if summary.at[name, "status"] == hyperopt.STATUS_OK:
                self._pipelines[name] = trained.get_pipeline(name)
Example #6
    def doTest(self, trainable, train_X, train_y, test_X, test_y):
        trained = trainable.fit(train_X, train_y)
        transformed = trained.transform(test_X)
        with self.assertWarns(DeprecationWarning):
            trainable.transform(train_X)
        trainable.to_json()
        trainable_pipeline = trainable >> float32_transform() >> LR()
        trained_pipeline = trainable_pipeline.fit(train_X, train_y)
        trained_pipeline.predict(test_X)
        hyperopt = Hyperopt(estimator=trainable_pipeline, max_evals=1)
        trained_hyperopt = hyperopt.fit(train_X, train_y)
        trained_hyperopt.predict(test_X)
Example #7
    def test_with_hyperopt(self):
        from lale.lib.lale import Hyperopt

        def my_scorer(estimator, X, y=None):
            return 1

        hyperopt = Hyperopt(estimator=KMeans(n_clusters=3),
                            max_evals=5,
                            verbose=True,
                            scoring=my_scorer)
        trained = hyperopt.fit(self.X_train)
        _ = trained.predict(self.X_test)
Example #8
    def test_runtime_limit_zero_time_hoc(self):
        planned_pipeline = (MinMaxScaler | Normalizer) >> (
            LogisticRegression | KNeighborsClassifier)
        from sklearn.datasets import load_iris
        X, y = load_iris(return_X_y=True)

        hoc = Hyperopt(estimator=planned_pipeline,
                       max_evals=1,
                       cv=3,
                       scoring='accuracy',
                       max_opt_time=0.0)
        hoc_fitted = hoc.fit(X, y)
        assert hoc_fitted.get_pipeline() is None
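Since a zero optimization budget guarantees that no trial completes, callers are expected to guard against the missing pipeline; a minimal sketch continuing from the test above (illustrative, not from the original test):

        best = hoc_fitted.get_pipeline()
        if best is not None:
            _ = best.predict(X)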
Example #9
    def test_feature_preprocessor(self):
        X_train, y_train = self.X_train, self.y_train
        import importlib

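        # fproc_name is injected by the parameterized test harness as a fully
        # qualified class name (presumably something like "lale.lib.sklearn.PCA")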
        module_name = ".".join(fproc_name.split(".")[0:-1])
        class_name = fproc_name.split(".")[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        fproc = class_()

        from lale.lib.sklearn.one_hot_encoder import OneHotEncoder

        if isinstance(fproc, OneHotEncoder):  # type: ignore
            # fproc = OneHotEncoder(handle_unknown = 'ignore')
            # remove the hack when this is fixed
            fproc = PCA()
        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(fproc.input_schema_fit())
        lale.type_checking.validate_is_schema(fproc.input_schema_transform())
        lale.type_checking.validate_is_schema(fproc.output_schema_transform())
        lale.type_checking.validate_is_schema(fproc.hyperparam_schema())

        # test_init_fit_transform
        trained = fproc.fit(self.X_train, self.y_train)
        _ = trained.transform(self.X_test)

        # test_predict_on_trainable
        trained = fproc.fit(X_train, y_train)
        fproc.transform(X_train)

        # test_to_json
        fproc.to_json()

        # test_in_a_pipeline
        # This test assumes that the output of feature processing is compatible with LogisticRegression
        from lale.lib.sklearn import LogisticRegression

        pipeline = fproc >> LogisticRegression()
        trained = pipeline.fit(self.X_train, self.y_train)
        _ = trained.predict(self.X_test)

        # Tune the pipeline with LR using Hyperopt
        from lale.lib.lale import Hyperopt

        hyperopt = Hyperopt(estimator=pipeline,
                            max_evals=1,
                            verbose=True,
                            cv=3)
        trained = hyperopt.fit(self.X_train, self.y_train)
        _ = trained.predict(self.X_test)
Example #10
    def test_with_hyperopt(self):
        def my_scorer(estimator, X, y=None):
            return 1

        from lale.lib.lale import Hyperopt

        hyperopt = Hyperopt(
            estimator=IsolationForest(max_features=1.0, max_samples=1.0),
            max_evals=5,
            verbose=True,
            scoring=my_scorer,
        )
        trained = hyperopt.fit(self.X_train)
        _ = trained.predict(self.X_test)
Example #11
    def test_with_hyperopt(self):
        from lale.lib.sklearn import OrdinalEncoder
        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test

        fproc = OrdinalEncoder()
        from lale.lib.sklearn import LogisticRegression
        pipeline = fproc >> LogisticRegression()

        # Tune the pipeline with LR using Hyperopt
        from lale.lib.lale import Hyperopt
        hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
        trained = hyperopt.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
Example #12
    def test_with_hyperopt(self):
        from lale.lib.sklearn import OrdinalEncoder

        fproc = OrdinalEncoder(handle_unknown="ignore")
        from lale.lib.sklearn import LogisticRegression

        pipeline = fproc >> LogisticRegression()

        # Tune the pipeline with LR using Hyperopt
        from lale.lib.lale import Hyperopt

        hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
        trained = hyperopt.fit(self.X_train, self.y_train)
        _ = trained.predict(self.X_test)
Example #13
    def test_preprocessing_union(self):
        from lale.datasets import openml
        (train_X, train_y), (test_X, test_y) = openml.fetch(
            'credit-g', 'classification', preprocess=False)
        from lale.lib.lale import Project
        from lale.lib.sklearn import Normalizer, OneHotEncoder
        from lale.lib.lale import ConcatFeatures as Concat
        from lale.lib.sklearn import RandomForestClassifier as Forest
        prep_num = Project(columns={'type': 'number'}) >> Normalizer
        prep_cat = Project(columns={'not': {'type': 'number'}}) >> OneHotEncoder(sparse=False)
        planned = (prep_num & prep_cat) >> Concat >> Forest
        from lale.lib.lale import Hyperopt
        hyperopt_classifier = Hyperopt(estimator=planned, max_evals=1)
        best_found = hyperopt_classifier.fit(train_X, train_y)
Example #14
    def test_custom_scoring(self):
        from sklearn.metrics import f1_score, make_scorer

        lr = LogisticRegression()
        clf = Hyperopt(
            estimator=lr,
            scoring=make_scorer(f1_score, average="macro"),
            cv=5,
            max_evals=1,
        )
        trained = clf.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
        predictions_1 = clf.predict(self.X_test)
        assert np.array_equal(predictions_1, predictions)
Example #15
    def test_regressor(self):
        X_train, y_train = self.X_train, self.y_train
        import importlib

        module_name = ".".join(clf_name.split(".")[0:-1])
        class_name = clf_name.split(".")[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        regr = None
        if class_name in ["StackingRegressor", "VotingRegressor"]:
            regr = class_(estimators=[("base", SGDRegressor())])
        else:
            regr = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(regr.input_schema_fit())
        lale.type_checking.validate_is_schema(regr.input_schema_predict())
        lale.type_checking.validate_is_schema(regr.output_schema_predict())
        lale.type_checking.validate_is_schema(regr.hyperparam_schema())

        # test_init_fit_predict
        trained = regr.fit(self.X_train, self.y_train)
        _ = trained.predict(self.X_test)

        # test score
        _ = trained.score(self.X_test, self.y_test)

        # test_predict_on_trainable
        trained = regr.fit(X_train, y_train)
        regr.predict(X_train)

        # test_to_json
        regr.to_json()

        # test_in_a_pipeline
        pipeline = NoOp() >> regr
        trained = pipeline.fit(self.X_train, self.y_train)
        _ = trained.predict(self.X_test)

        # test_with_hyperopt
        from lale.lib.sklearn.ridge import Ridge

        if isinstance(regr, Ridge):  # type: ignore
            from lale.lib.lale import Hyperopt

            hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
            trained = hyperopt.fit(self.X_train, self.y_train)
            _ = trained.predict(self.X_test)
Example #16
    def test_J48_for_car_dataset(self):
        from lalegpl.datasets.auto_weka import fetch_car
        (X_train, y_train), (X_test, y_test) = fetch_car()
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        from sklearn.metrics import accuracy_score
        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline
        clf = Hyperopt(estimator=make_pipeline(J48()), max_evals=1)
        trained_clf = clf.fit(X_train, y_train)
        print(accuracy_score(y_test, trained_clf.predict(X_test)))
Example #17
    def test_grammar_all_combinator(self):
        g = Grammar()

        g.start = g.estimator
        g.estimator = g.term_est | g.transformer >> g.term_est
        g.term_est = g.prim_est | g.ensemble
        g.ensemble = Boost(base_estimator=LR)
        g.transformer = g.union_tfm | g.union_tfm >> g.transformer
        g.union_tfm = g.prim_tfm | g.union_body >> Concat
        g.union_body = g.transformer | g.transformer & g.union_body

        g.prim_est = LR | KNN
        g.prim_tfm = PCA | Scaler
        g.ensembler = Boost

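        # unfold(7) appears to expand the grammar's recursive rules up to depth 7
        # into one planned operator, while sample(7) draws a single random derivation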
        generated = g.unfold(7)
        sample = g.sample(7)
        assert isinstance(generated, PlannedOperator)
        assert isinstance(sample, PlannedOperator)

        # Train
        try:
            gtrainer = Hyperopt(estimator=generated, max_evals=3, scoring="r2")
            gtrained = gtrainer.fit(self.train_X, self.train_y)
            assert isinstance(gtrained.get_pipeline(), TrainedOperator)
        except ValueError:
            # None of the trials succeeded
            pass

        try:
            strainer = Hyperopt(estimator=sample, max_evals=3, scoring="r2")
            strained = strainer.fit(self.train_X, self.train_y)
            assert isinstance(strained.get_pipeline(), TrainedOperator)
        except ValueError:
            # None of the trials succeeded
            pass
Example #18
    def test_custom_scorer(self):
        from sklearn.metrics import f1_score, make_scorer

        pipeline = PCA() >> LogisticRegression()

        def custom_scorer(estimator, X, y, factor=0.1):
            # A custom scorer demonstrating the use of kwargs:
            # it just scales the accuracy by some factor.
            from sklearn.metrics import accuracy_score
            predictions = estimator.predict(X)
            self.assertEqual(factor, 0.5)
            return factor * accuracy_score(y, predictions)

        clf = Hyperopt(
            estimator=pipeline,
            scoring=custom_scorer,
            cv=5,
            max_evals=1,
            args_to_scorer={'factor': 0.5},
        )
        trained = clf.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
        predictions_1 = clf.predict(self.X_test)
        assert np.array_equal(predictions_1, predictions)
Example #19
    def test_decision_function_1(self):
        def my_scorer(estimator, X, y=None):
            return 1

        from lale.lib.lale import Hyperopt

        hyperopt = Hyperopt(
            estimator=IsolationForest(max_features=1.0, max_samples=1.0),
            max_evals=5,
            verbose=True,
            scoring=my_scorer,
        )
        trained = hyperopt.fit(self.X_train)
        pipeline = trained.get_pipeline()
        assert pipeline is not None
        _ = pipeline.decision_function(self.X_test)
Example #20
    def doTestPipeline(self,
                       trainable_pipeline,
                       train_X,
                       train_y,
                       test_X,
                       test_y,
                       optimization=False):
        def adjusted_smape(y_true, y_pred):
            """
            SMAPE
            """
            y_true, y_pred = np.array(y_true).ravel(), np.array(y_pred).ravel()
            if len(y_true) != len(y_pred):
                print(
                    "Size of Ground Truth and Predicted Values do not match!, returning None."
                )
                # May be raising error will interfere with daub execution if one pipeline fails
                # raise ValueError('Size of Ground Truth and Predicted Values do not match!')
                return None

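            # typing.cast only informs the static type checker; at runtime it
            # returns its argument unchanged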
            pred_diff = 2.0 * np.abs(cast(float, y_true - y_pred))
            divide = np.abs(y_true) + np.abs(y_pred)
            divide[divide < 1e-12] = 1.0
            scores = pred_diff / divide
            scores = np.array(scores, dtype=float)
            return np.nanmean(scores) * 100.0

        trained_pipeline = trainable_pipeline.fit(train_X, train_y)
        predicted = trained_pipeline.predict(test_X[:-1])
        if optimization:
            print(adjusted_smape(test_X[:-1], predicted))
        else:
            print(adjusted_smape(test_X[-1], predicted))
        with self.assertWarns(DeprecationWarning):
            trainable_pipeline.predict(train_X)
        trainable_pipeline.to_json()
        if optimization:
            hyperopt = Hyperopt(
                estimator=trainable_pipeline,
                max_evals=2,
                verbose=True,
                cv=TimeSeriesSplit(),
                scoring=make_scorer(adjusted_smape),
            )
            trained_hyperopt = hyperopt.fit(train_X, train_y)
            trained_hyperopt.predict(test_X)
Example #21
File: test_relational.py Project: IBM/lale
    def test_with_hyperopt(self):
        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)
        gender_map = {"m": "Male", "f": "Female"}
        state_map = {"NY": "New York", "CA": "California"}
        map_replace = Map(
            columns=[
                replace(it.gender, gender_map),
                replace(it.state, state_map)
            ],
            remainder="drop",
        )
        pipeline = (Relational(operator=(Scan(table=it.main) & Scan(
            table=it.delay)) >> map_replace) >> LogisticRegression())
        opt = Hyperopt(estimator=pipeline, cv=3, max_evals=5)
        trained = opt.fit(X, y)
        _ = trained
Example #22
    def test_text_and_structured(self):
        from lale.datasets.uci.uci_datasets import fetch_drugscom
        from sklearn.model_selection import train_test_split
        train_X_all, train_y_all, test_X, test_y = fetch_drugscom()
        # subset the data to speed up debugging
        train_X, train_X_ignore, train_y, train_y_ignore = train_test_split(
            train_X_all, train_y_all, train_size=0.01, random_state=42)
        from lale.lib.lale import Project
        from lale.lib.lale import ConcatFeatures as Cat
        from lale.lib.sklearn import TfidfVectorizer as Tfidf
        from lale.lib.sklearn import LinearRegression as LinReg
        from lale.lib.sklearn import RandomForestRegressor as Forest
        prep_text = Project(columns=['review']) >> Tfidf(max_features=100)
        prep_nums = Project(columns={'type': 'number'})
        planned = (prep_text & prep_nums) >> Cat >> (LinReg | Forest)
        from lale.lib.lale import Hyperopt
        hyperopt_regressor = Hyperopt(estimator=planned, max_evals=1, scoring='r2')
        best_found = hyperopt_regressor.fit(train_X, train_y)
Example #23
    def test_with_concat_features1(self):
        import warnings
        warnings.filterwarnings("ignore")

        from sklearn.datasets import load_iris
        from lale.lib.lale import Hyperopt
        from sklearn.metrics import accuracy_score
        data = load_iris()
        X, y = data.data, data.target
        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        pipeline = ((pca & nys) >> concat >> lr) | KNeighborsClassifier()
        clf = Hyperopt(estimator=pipeline, max_evals=1)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
        warnings.resetwarnings()
Example #24
    def test_regressor(self):
        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test
        import importlib

        module_name = ".".join(clf_name.split(".")[0:-1])
        class_name = clf_name.split(".")[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        regr = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(regr.input_schema_fit())
        lale.type_checking.validate_is_schema(regr.input_schema_predict())
        lale.type_checking.validate_is_schema(regr.output_schema_predict())
        lale.type_checking.validate_is_schema(regr.hyperparam_schema())

        # test_init_fit_predict
        trained = regr.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)

        # test_predict_on_trainable
        trained = regr.fit(X_train, y_train)
        regr.predict(X_train)

        # test_to_json
        regr.to_json()

        # test_in_a_pipeline
        pipeline = NoOp() >> regr
        trained = pipeline.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)

        # test_with_hyperopt
        from lale.lib.sklearn.ridge import RidgeImpl

        if regr._impl_class() != RidgeImpl:
            from lale.lib.lale import Hyperopt

            hyperopt = Hyperopt(estimator=pipeline, max_evals=1)
            trained = hyperopt.fit(self.X_train, self.y_train)
            predictions = trained.predict(self.X_test)
Example #25
    def test_runtime_limit_hor(self):
        import time
        planned_pipeline = (MinMaxScaler | Normalizer) >> LinearRegression
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)

        max_opt_time = 3.0
        hor = Hyperopt(estimator=planned_pipeline,
                       max_evals=1,
                       cv=3,
                       max_opt_time=max_opt_time,
                       scoring='r2')
        start = time.time()
        best_trained = hor.fit(X[:500, :], y[:500])
        end = time.time()
        opt_time = end - start
        rel_diff = (opt_time - max_opt_time) / max_opt_time
        assert rel_diff < 0.2, (
            'Max time: {}, Actual time: {}, relative diff: {}'.format(
                max_opt_time, opt_time, rel_diff))
Example #26
    def test_runtime_limit_hoc(self):
        import time
        planned_pipeline = (MinMaxScaler | Normalizer) >> (
            LogisticRegression | KNeighborsClassifier)
        from sklearn.datasets import load_iris
        X, y = load_iris(return_X_y=True)

        max_opt_time = 2.0
        hoc = Hyperopt(estimator=planned_pipeline,
                       max_evals=1,
                       cv=3,
                       scoring='accuracy',
                       max_opt_time=max_opt_time)
        start = time.time()
        best_trained = hoc.fit(X, y)
        end = time.time()
        opt_time = end - start
        rel_diff = (opt_time - max_opt_time) / max_opt_time
        assert rel_diff < 0.5, (
            'Max time: {}, Actual time: {}, relative diff: {}'.format(
                max_opt_time, opt_time, rel_diff))
Example #27
    def _fit_hyperopt(self, X, y):
        from lale.lib.lale import Hyperopt, NoOp
        from lale.lib.sklearn import (
            PCA,
            DecisionTreeClassifier,
            DecisionTreeRegressor,
            KNeighborsClassifier,
            KNeighborsRegressor,
            MinMaxScaler,
            RandomForestClassifier,
            RandomForestRegressor,
            RobustScaler,
            SelectKBest,
            SGDClassifier,
            SGDRegressor,
            StandardScaler,
        )

        remaining_time = self.max_opt_time - (time.time() - self._start_fit)
        if remaining_time <= 0:
            return
        prep = auto_prep(X)
        scale = MinMaxScaler | StandardScaler | RobustScaler | NoOp
        reduce_dims = PCA | SelectKBest | NoOp
        gbt = auto_gbt(self.prediction_type)
        if self.prediction_type == "regression":
            estim_trees = gbt | DecisionTreeRegressor | RandomForestRegressor
            estim_notree = SGDRegressor | KNeighborsRegressor
        else:
            estim_trees = gbt | DecisionTreeClassifier | RandomForestClassifier
            estim_notree = SGDClassifier | KNeighborsClassifier
        model_trees = reduce_dims >> estim_trees
        model_notree = scale >> reduce_dims >> estim_notree
        planned = prep >> (model_trees | model_notree)
        prior_evals = self._summary.shape[0] if self._summary is not None else 0
        trainable = Hyperopt(
            estimator=planned,
            max_evals=self.max_evals - prior_evals,
            scoring=self.scoring,
            best_score=self.best_score,
            max_opt_time=remaining_time,
            max_eval_time=self.max_eval_time,
            verbose=self.verbose,
            show_progressbar=False,
        )
        trained = trainable.fit(X, y)
        # The static types are not currently smart enough to verify
        # that the conditionally defined summary method is actually present
        # But it must be, since the hyperopt impl type provides it
        summary: pd.DataFrame = trained.summary()  # type: ignore
        if list(summary.status) == ["new"]:
            return  # only one trial and that one timed out
        best_trial = trained._impl._trials.best_trial
        if "loss" in best_trial["result"]:
            if (self._summary is None or best_trial["result"]["loss"] <
                    self._summary.at[self._name_of_best, "loss"]):
                self._name_of_best = f'p{best_trial["tid"]}'
        if self._summary is None:
            self._summary = summary
        else:
            self._summary = pd.concat([self._summary, summary])
        for name in summary.index:
            assert name not in self._pipelines
            if summary.at[name, "status"] == hyperopt.STATUS_OK:
                self._pipelines[name] = trained.get_pipeline(name)
Example #28
    def test_classifier(self):
        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test
        import importlib
        module_name = ".".join(clf_name.split('.')[0:-1])
        class_name = clf_name.split('.')[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        clf = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(clf.input_schema_fit())
        lale.type_checking.validate_is_schema(clf.input_schema_predict())
        lale.type_checking.validate_is_schema(clf.output_schema_predict())
        lale.type_checking.validate_is_schema(clf.hyperparam_schema())

        # test_init_fit_predict
        trained = clf.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)

        # test_with_hyperopt
        from lale.lib.lale import Hyperopt
        hyperopt = Hyperopt(estimator=clf, max_evals=1)
        trained = hyperopt.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)

        # test_cross_validation
        from lale.helpers import cross_val_score
        cv_results = cross_val_score(clf, X_train, y_train, cv=2)
        self.assertEqual(len(cv_results), 2)

        # test_with_gridsearchcv_auto_wrapped
        from sklearn.metrics import accuracy_score, make_scorer
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            from lale.lib.sklearn.gradient_boosting_classifier import GradientBoostingClassifierImpl
            from lale.lib.sklearn.mlp_classifier import MLPClassifierImpl
            if clf._impl_class() == GradientBoostingClassifierImpl:
                # exponential loss does not work with the iris dataset, since it is not a binary classification problem
                import lale.schemas as schemas
                clf = clf.customize_schema(
                    loss=schemas.Enum(default='deviance', values=['deviance']))
            grid_search = lale.lib.lale.GridSearchCV(
                estimator=clf,
                lale_num_samples=1,
                lale_num_grids=1,
                cv=2,
                scoring=make_scorer(accuracy_score))
            grid_search.fit(X_train, y_train)

        # test_predict_on_trainable
        trained = clf.fit(X_train, y_train)
        clf.predict(X_train)

        # test_to_json
        clf.to_json()

        # test_in_a_pipeline
        pipeline = NoOp() >> clf
        trained = pipeline.fit(self.X_train, self.y_train)
        predictions = trained.predict(self.X_test)
Example #29
파일: test_relational.py 프로젝트: IBM/lale
    def test_with_hyperopt2(self):
        from lale.expressions import (
            count,
            it,
            max,
            mean,
            min,
            string_indexer,
            sum,
            variance,
        )

        wrap_imported_operators()
        scan = Scan(table=it["main"])
        scan_0 = Scan(table=it["customers"])
        join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]
                           ["group_customer_id"])])
        map = Map(
            columns={
                "[main](group_customer_id)[customers]|number_children|identity":
                it["number_children"],
                "[main](group_customer_id)[customers]|name|identity":
                it["name"],
                "[main](group_customer_id)[customers]|income|identity":
                it["income"],
                "[main](group_customer_id)[customers]|address|identity":
                it["address"],
                "[main](group_customer_id)[customers]|age|identity":
                it["age"],
            },
            remainder="drop",
        )
        pipeline_4 = join >> map
        scan_1 = Scan(table=it["purchase"])
        join_0 = Join(
            pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
            join_limit=50.0,
        )
        aggregate = Aggregate(
            columns={
                "[main](group_id)[purchase]|price|variance":
                variance(it["price"]),
                "[main](group_id)[purchase]|time|sum": sum(it["time"]),
                "[main](group_id)[purchase]|time|mean": mean(it["time"]),
                "[main](group_id)[purchase]|time|min": min(it["time"]),
                "[main](group_id)[purchase]|price|sum": sum(it["price"]),
                "[main](group_id)[purchase]|price|count": count(it["price"]),
                "[main](group_id)[purchase]|price|mean": mean(it["price"]),
                "[main](group_id)[purchase]|price|min": min(it["price"]),
                "[main](group_id)[purchase]|price|max": max(it["price"]),
                "[main](group_id)[purchase]|time|max": max(it["time"]),
                "[main](group_id)[purchase]|time|variance":
                variance(it["time"]),
            },
            group_by=it["row_id"],
        )
        pipeline_5 = join_0 >> aggregate
        map_0 = Map(
            columns={
                "[main]|group_customer_id|identity": it["group_customer_id"],
                "[main]|transaction_id|identity": it["transaction_id"],
                "[main]|group_id|identity": it["group_id"],
                "[main]|comments|identity": it["comments"],
                "[main]|id|identity": it["id"],
                "prefix_0_id": it["prefix_0_id"],
                "next_purchase": it["next_purchase"],
                "[main]|time|identity": it["time"],
            },
            remainder="drop",
        )
        scan_2 = Scan(table=it["transactions"])
        scan_3 = Scan(table=it["products"])
        join_1 = Join(pred=[
            (it["main"]["transaction_id"] == it["transactions"]
             ["transaction_id"]),
            (it["transactions"]["product_id"] == it["products"]["product_id"]),
        ])
        map_1 = Map(
            columns={
                "[main](transaction_id)[transactions](product_id)[products]|price|identity":
                it["price"],
                "[main](transaction_id)[transactions](product_id)[products]|type|identity":
                it["type"],
            },
            remainder="drop",
        )
        pipeline_6 = join_1 >> map_1
        join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]
                             ["transaction_id"])])
        map_2 = Map(
            columns={
                "[main](transaction_id)[transactions]|description|identity":
                it["description"],
                "[main](transaction_id)[transactions]|product_id|identity":
                it["product_id"],
            },
            remainder="drop",
        )
        pipeline_7 = join_2 >> map_2
        map_3 = Map(columns=[
            string_indexer(it["[main]|comments|identity"]),
            string_indexer(
                it["[main](transaction_id)[transactions]|description|identity"]
            ),
            string_indexer(it[
                "[main](transaction_id)[transactions](product_id)[products]|type|identity"]
                           ),
            string_indexer(
                it["[main](group_customer_id)[customers]|name|identity"]),
            string_indexer(
                it["[main](group_customer_id)[customers]|address|identity"]),
        ])
        pipeline_8 = ConcatFeatures() >> map_3
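        # make_pipeline_graph assembles the scans, joins, and maps into a dataflow
        # DAG, with the edges list giving the explicit producer-consumer links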
        relational = Relational(operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        ))
        pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)
        from lale.lib.lale import Hyperopt

        opt = Hyperopt(estimator=pipeline, max_evals=2)
        opt.fit(X, y)
Example #30
    def test_resampler(self):
        from lale.lib.sklearn import PCA, LogisticRegression

        X_train, y_train = self.X_train, self.y_train
        X_test = self.X_test
        import importlib

        module_name = ".".join(res_name.split(".")[0:-1])
        class_name = res_name.split(".")[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        with EnableSchemaValidation():
            with self.assertRaises(ValidationError):
                _ = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(class_.input_schema_fit())
        lale.type_checking.validate_is_schema(class_.input_schema_predict())
        lale.type_checking.validate_is_schema(class_.output_schema_predict())
        lale.type_checking.validate_is_schema(class_.hyperparam_schema())

        # test_init_fit_predict
        from lale.operators import make_pipeline

        pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
        trained = pipeline1.fit(X_train, y_train)
        _ = trained.predict(X_test)

        pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
        trained = pipeline2.fit(X_train, y_train)
        _ = trained.predict(X_test)

        # test_with_hyperopt
        from lale.lib.lale import Hyperopt

        optimizer = Hyperopt(
            estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())),
            max_evals=1,
            show_progressbar=False,
        )
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        pipeline3 = class_(
            operator=PCA()
            >> (Nystroem & NoOp)
            >> ConcatFeatures
            >> LogisticRegression()
        )
        optimizer = Hyperopt(estimator=pipeline3, max_evals=1, show_progressbar=False)
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        pipeline4 = (
            (
                PCA >> class_(operator=make_pipeline(Nystroem()))
                & class_(operator=make_pipeline(Nystroem()))
            )
            >> ConcatFeatures
            >> LogisticRegression()
        )
        optimizer = Hyperopt(
            estimator=pipeline4, max_evals=1, scoring="roc_auc", show_progressbar=False
        )
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        # test_cross_validation
        from lale.helpers import cross_val_score

        cv_results = cross_val_score(pipeline1, X_train, y_train, cv=2)
        self.assertEqual(len(cv_results), 2)

        # test_to_json
        pipeline1.to_json()