Example #1
    def test_classifier(self):
        import importlib
        import warnings

        import lale.lib.lale
        import lale.type_checking
        from lale.lib.lale import NoOp

        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test

        # clf_name is a fully qualified operator name supplied by the test
        # harness (see the harness sketch after this example); resolve it to
        # a class and instantiate it with default hyperparameters.
        module_name = ".".join(clf_name.split(".")[:-1])
        class_name = clf_name.split(".")[-1]
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)
        clf = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(clf.input_schema_fit())
        lale.type_checking.validate_is_schema(clf.input_schema_predict())
        lale.type_checking.validate_is_schema(clf.output_schema_predict())
        lale.type_checking.validate_is_schema(clf.hyperparam_schema())

        # test_init_fit_predict
        trained = clf.fit(X_train, y_train)
        _ = trained.predict(X_test)

        # test_with_hyperopt
        from lale.lib.lale import Hyperopt

        hyperopt = Hyperopt(estimator=clf, max_evals=1)
        trained = hyperopt.fit(X_train, y_train)
        _ = trained.predict(X_test)

        # test_cross_validation
        from lale.helpers import cross_val_score

        cv_results = cross_val_score(clf, X_train, y_train, cv=2)
        self.assertEqual(len(cv_results), 2)

        # test_with_gridsearchcv_auto_wrapped
        from sklearn.metrics import accuracy_score, make_scorer

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            from lale.lib.sklearn.gradient_boosting_classifier import (
                GradientBoostingClassifierImpl,
            )

            if clf._impl_class() == GradientBoostingClassifierImpl:
                # Exponential loss requires binary targets, so it cannot be
                # used with the 3-class Iris dataset; pin loss to "deviance".
                import lale.schemas as schemas

                clf = clf.customize_schema(
                    loss=schemas.Enum(default="deviance", values=["deviance"])
                )
            grid_search = lale.lib.lale.GridSearchCV(
                estimator=clf,
                lale_num_samples=1,
                lale_num_grids=1,
                cv=2,
                scoring=make_scorer(accuracy_score),
            )
            grid_search.fit(X_train, y_train)

        # test_predict_on_trainable
        trained = clf.fit(X_train, y_train)
        # Calling predict on the trainable operator itself (not `trained`)
        # should still work after fit.
        clf.predict(X_train)

        # test_to_json
        clf.to_json()

        # test_in_a_pipeline
        pipeline = NoOp() >> clf
        trained = pipeline.fit(X_train, y_train)
        _ = trained.predict(X_test)
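
The test body closes over `clf_name`, which the snippet never defines. One plausible harness (an assumption, not shown in the original) generates a test method per operator name onto the TestCase subclass; the class name `TestClassification` and the operator list below are illustrative:

    def create_function_test_classifier(clf_name):
        def test_classifier(self):
            ...  # body as in Example #1, closing over clf_name

        test_classifier.__name__ = "test_{}".format(clf_name.split(".")[-1])
        return test_classifier

    classifiers = [
        "lale.lib.sklearn.LogisticRegression",  # illustrative entries
        "lale.lib.sklearn.KNeighborsClassifier",
    ]
    for clf_name in classifiers:
        setattr(
            TestClassification,  # assumed TestCase subclass holding the data
            "test_{}".format(clf_name.split(".")[-1]),
            create_function_test_classifier(clf_name),
        )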
Example #2
    def test_resampler(self):
        import importlib

        from jsonschema import ValidationError

        import lale.type_checking
        from lale.lib.lale import ConcatFeatures, NoOp
        from lale.lib.sklearn import PCA, LogisticRegression, Nystroem

        X_train, y_train = self.X_train, self.y_train
        X_test = self.X_test

        # res_name is a fully qualified resampler name supplied by the test
        # harness, analogous to clf_name in Example #1.
        module_name = ".".join(res_name.split(".")[:-1])
        class_name = res_name.split(".")[-1]
        module = importlib.import_module(module_name)
        class_ = getattr(module, class_name)

        # Resamplers require an `operator` argument, so a no-argument
        # construction must fail hyperparameter validation.
        with self.assertRaises(ValidationError):
            _ = class_()

        # test_schemas_are_schemas
        lale.type_checking.validate_is_schema(class_.input_schema_fit())
        lale.type_checking.validate_is_schema(class_.input_schema_predict())
        lale.type_checking.validate_is_schema(class_.output_schema_predict())
        lale.type_checking.validate_is_schema(class_.hyperparam_schema())

        # test_init_fit_predict
        from lale.operators import make_pipeline

        pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
        trained = pipeline1.fit(X_train, y_train)
        _ = trained.predict(X_test)

        pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
        trained = pipeline2.fit(X_train, y_train)
        _ = trained.predict(X_test)
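
        # Note: a resampler wraps its downstream `operator`; resampling
        # typically applies only at fit time, while predict passes data
        # straight through to the wrapped pipeline.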

        # test_with_hyperopt
        from lale.lib.lale import Hyperopt

        optimizer = Hyperopt(
            estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())),
            max_evals=1,
            show_progressbar=False,
        )
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        pipeline3 = class_(
            operator=PCA() >> (Nystroem & NoOp) >> ConcatFeatures >> LogisticRegression()
        )
        optimizer = Hyperopt(estimator=pipeline3, max_evals=1, show_progressbar=False)
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        pipeline4 = (
            (PCA >> class_(operator=make_pipeline(Nystroem())))
            & class_(operator=make_pipeline(Nystroem()))
        ) >> ConcatFeatures >> LogisticRegression()
        optimizer = Hyperopt(
            estimator=pipeline4,
            max_evals=1,
            scoring="roc_auc",
            show_progressbar=False,
        )
        trained_optimizer = optimizer.fit(X_train, y_train)
        _ = trained_optimizer.predict(X_test)

        # test_cross_validation
        from lale.helpers import cross_val_score

        cv_results = cross_val_score(pipeline1, X_train, y_train, cv=2)
        self.assertEqual(len(cv_results), 2)

        # test_to_json
        pipeline1.to_json()
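
The pipelines above rely on Lale's combinators: `>>` pipes one step into the next, `&` runs steps in parallel, and `ConcatFeatures` concatenates the parallel outputs column-wise. A minimal standalone sketch of the same shapes (the concrete operators are just examples):

    from lale.lib.lale import ConcatFeatures, NoOp
    from lale.lib.sklearn import PCA, LogisticRegression, Nystroem

    # >> : sequential composition; PCA output feeds LogisticRegression
    sequential = PCA() >> LogisticRegression()

    # & : parallel composition; ConcatFeatures merges the branch outputs
    parallel = (Nystroem() & NoOp()) >> ConcatFeatures() >> LogisticRegression()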
Example #3
    def test_with_hyperopt2(self):
        from lale import wrap_imported_operators
        from lale.expressions import (  # note: these shadow some builtins
            count,
            it,
            max,
            mean,
            min,
            string_indexer,
            sum,
            variance,
        )
        from lale.lib.lale import (
            Aggregate,
            ConcatFeatures,
            Join,
            Map,
            Relational,
            Scan,
        )
        from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression
        from lale.operators import make_pipeline_graph

        wrap_imported_operators()
        scan = Scan(table=it["main"])
        scan_0 = Scan(table=it["customers"])
        join = Join(
            pred=[
                it["main"]["group_customer_id"]
                == it["customers"]["group_customer_id"]
            ]
        )
        map = Map(
            columns={
                "[main](group_customer_id)[customers]|number_children|identity":
                it["number_children"],
                "[main](group_customer_id)[customers]|name|identity":
                it["name"],
                "[main](group_customer_id)[customers]|income|identity":
                it["income"],
                "[main](group_customer_id)[customers]|address|identity":
                it["address"],
                "[main](group_customer_id)[customers]|age|identity":
                it["age"],
            },
            remainder="drop",
        )
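        # Note: the generated column names encode provenance as
        # [table](join key)[table]|column|function, so later steps can
        # reference each feature unambiguously.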
        pipeline_4 = join >> map
        scan_1 = Scan(table=it["purchase"])
        join_0 = Join(
            pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
            join_limit=50.0,
        )
        aggregate = Aggregate(
            columns={
                "[main](group_id)[purchase]|price|variance":
                variance(it["price"]),
                "[main](group_id)[purchase]|time|sum": sum(it["time"]),
                "[main](group_id)[purchase]|time|mean": mean(it["time"]),
                "[main](group_id)[purchase]|time|min": min(it["time"]),
                "[main](group_id)[purchase]|price|sum": sum(it["price"]),
                "[main](group_id)[purchase]|price|count": count(it["price"]),
                "[main](group_id)[purchase]|price|mean": mean(it["price"]),
                "[main](group_id)[purchase]|price|min": min(it["price"]),
                "[main](group_id)[purchase]|price|max": max(it["price"]),
                "[main](group_id)[purchase]|time|max": max(it["time"]),
                "[main](group_id)[purchase]|time|variance":
                variance(it["time"]),
            },
            group_by=it["row_id"],
        )
        pipeline_5 = join_0 >> aggregate
        map_0 = Map(
            columns={
                "[main]|group_customer_id|identity": it["group_customer_id"],
                "[main]|transaction_id|identity": it["transaction_id"],
                "[main]|group_id|identity": it["group_id"],
                "[main]|comments|identity": it["comments"],
                "[main]|id|identity": it["id"],
                "prefix_0_id": it["prefix_0_id"],
                "next_purchase": it["next_purchase"],
                "[main]|time|identity": it["time"],
            },
            remainder="drop",
        )
        scan_2 = Scan(table=it["transactions"])
        scan_3 = Scan(table=it["products"])
        join_1 = Join(
            pred=[
                it["main"]["transaction_id"] == it["transactions"]["transaction_id"],
                it["transactions"]["product_id"] == it["products"]["product_id"],
            ]
        )
        map_1 = Map(
            columns={
                "[main](transaction_id)[transactions](product_id)[products]|price|identity":
                it["price"],
                "[main](transaction_id)[transactions](product_id)[products]|type|identity":
                it["type"],
            },
            remainder="drop",
        )
        pipeline_6 = join_1 >> map_1
        join_2 = Join(
            pred=[it["main"]["transaction_id"] == it["transactions"]["transaction_id"]]
        )
        map_2 = Map(
            columns={
                "[main](transaction_id)[transactions]|description|identity":
                it["description"],
                "[main](transaction_id)[transactions]|product_id|identity":
                it["product_id"],
            },
            remainder="drop",
        )
        pipeline_7 = join_2 >> map_2
        map_3 = Map(
            columns=[
                string_indexer(it["[main]|comments|identity"]),
                string_indexer(
                    it["[main](transaction_id)[transactions]|description|identity"]
                ),
                string_indexer(
                    it["[main](transaction_id)[transactions](product_id)[products]|type|identity"]
                ),
                string_indexer(
                    it["[main](group_customer_id)[customers]|name|identity"]
                ),
                string_indexer(
                    it["[main](group_customer_id)[customers]|address|identity"]
                ),
            ]
        )
        pipeline_8 = ConcatFeatures() >> map_3
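        # Assemble the per-table sub-pipelines into a single dataflow DAG:
        # `steps` lists the nodes and `edges` the producer -> consumer pairs;
        # every feature branch ultimately meets in pipeline_8
        # (ConcatFeatures >> map_3).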
        relational = Relational(operator=make_pipeline_graph(
            steps=[
                scan,
                scan_0,
                pipeline_4,
                scan_1,
                pipeline_5,
                map_0,
                scan_2,
                scan_3,
                pipeline_6,
                pipeline_7,
                pipeline_8,
            ],
            edges=[
                (scan, pipeline_4),
                (scan, pipeline_5),
                (scan, map_0),
                (scan, pipeline_6),
                (scan, pipeline_7),
                (scan_0, pipeline_4),
                (pipeline_4, pipeline_8),
                (scan_1, pipeline_5),
                (pipeline_5, pipeline_8),
                (map_0, pipeline_8),
                (scan_2, pipeline_6),
                (scan_2, pipeline_7),
                (scan_3, pipeline_6),
                (pipeline_6, pipeline_8),
                (pipeline_7, pipeline_8),
            ],
        ))
        pipeline = relational >> (KNeighborsClassifier | LogisticRegression)

        from sklearn.datasets import load_iris

        from lale.lib.lale import Hyperopt

        X, y = load_iris(return_X_y=True)
        opt = Hyperopt(estimator=pipeline, max_evals=2)
        opt.fit(X, y)
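
Because `|` makes the final step a choice, Hyperopt's two evaluations can sample either classifier together with its hyperparameters. A hedged sketch of inspecting the outcome, assuming the trained optimizer is kept (the variable names are illustrative):

    trained = opt.fit(X, y)
    best = trained.get_pipeline()  # best pipeline found across the evaluations
    print(best.to_json())          # JSON rendering of the chosen pipeline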