示例#1
0
    def test_import_from_sklearn_pipeline_feature_union(self):
        """Import a FeatureUnion-based sklearn pipeline and inspect the lale graph.

        The sklearn pipeline is a FeatureUnion of PCA and Nystroem followed by
        a k-nearest-neighbors classifier. After the import, the lale pipeline
        must have exactly three edges whose endpoints are instances of the
        expected lale operators, and it must pass
        ``assert_equal_predictions`` against the sklearn pipeline.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
        from sklearn.pipeline import FeatureUnion

        transformers = [
            ("pca", SklearnPCA(n_components=1)),
            ("nys", SklearnNystroem(n_components=2, random_state=42)),
        ]
        sklearn_pipeline = sklearn.pipeline.make_pipeline(
            FeatureUnion(transformers), SklearnKNN()
        )
        imported = import_from_sklearn_pipeline(sklearn_pipeline)
        lale_pipeline = typing.cast(lale.operators.TrainablePipeline, imported)

        edges = lale_pipeline.edges()
        self.assertEqual(len(edges), 3)

        from lale.lib.lale.concat_features import ConcatFeatures
        from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifier
        from lale.lib.sklearn.nystroem import Nystroem
        from lale.lib.sklearn.pca import PCA

        # One (source, target) operator pair per edge, in edge order.
        expected_edges = [
            (PCA, ConcatFeatures),
            (Nystroem, ConcatFeatures),
            (ConcatFeatures, KNeighborsClassifier),
        ]
        for edge, (source_op, target_op) in zip(edges, expected_edges):
            self.assertIsInstance(edge[0], source_op)  # type: ignore
            self.assertIsInstance(edge[1], target_op)  # type: ignore

        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#2
0
    def test_import_from_sklearn_pipeline_feature_union(self):
        """Import a FeatureUnion-based sklearn pipeline and inspect the lale graph.

        Builds FeatureUnion(PCA, Nystroem) followed by a k-nearest-neighbors
        classifier, imports it into lale, and checks that the result has
        three edges whose endpoints report the expected implementation
        classes. Finally the pipelines are compared with
        ``assert_equal_predictions``.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
        from sklearn.pipeline import FeatureUnion

        transformers = [
            ("pca", SklearnPCA(n_components=1)),
            ("nys", SklearnNystroem(n_components=2, random_state=42)),
        ]
        sklearn_pipeline = sklearn.pipeline.make_pipeline(
            FeatureUnion(transformers), SklearnKNN()
        )
        lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

        edges = lale_pipeline.edges()
        self.assertEqual(len(edges), 3)

        from lale.lib.lale.concat_features import ConcatFeaturesImpl
        from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
        from lale.lib.sklearn.nystroem import NystroemImpl
        from lale.lib.sklearn.pca import PCAImpl

        # One (source, target) implementation-class pair per edge, in edge order.
        expected_impls = [
            (PCAImpl, ConcatFeaturesImpl),
            (NystroemImpl, ConcatFeaturesImpl),
            (ConcatFeaturesImpl, KNeighborsClassifierImpl),
        ]
        for edge, (source_impl, target_impl) in zip(edges, expected_impls):
            self.assertEqual(edge[0]._impl_class(), source_impl)
            self.assertEqual(edge[1]._impl_class(), target_impl)

        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#3
0
    def test_comparison_with_scikit(self):
        """Check that a lale pipeline cross-validates like its sklearn twin.

        A lale pipeline ``(PCA & Nystroem) >> ConcatFeatures >>
        LogisticRegression`` and the corresponding sklearn
        ``FeatureUnion`` + ``LogisticRegression`` pipeline are both
        cross-validated on the shuffled digits dataset. The per-fold scores,
        formatted as percentages with one decimal, must be equal.

        Warnings are silenced only for the duration of the test body via
        ``warnings.catch_warnings()``, so the previous filter state is
        restored even when an assertion or exception interrupts the test.
        """
        import warnings

        import sklearn.datasets
        import sklearn.utils
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.linear_model import LogisticRegression as SklearnLR
        from sklearn.model_selection import (
            cross_val_score as sklearn_cross_val_score,
        )
        from sklearn.pipeline import FeatureUnion, make_pipeline

        from lale.helpers import cross_val_score as lale_cross_val_score
        from lale.lib.sklearn import PCA

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            digits = sklearn.datasets.load_digits()
            X, y = sklearn.utils.shuffle(
                digits.data, digits.target, random_state=42)

            # lale side: PCA and Nystroem in parallel, concatenated, then LR.
            pca = PCA(n_components=3, random_state=42, svd_solver='arpack')
            nys = Nystroem(n_components=10, random_state=42)
            concat = ConcatFeatures()
            lale_lr = LogisticRegression(random_state=42, C=0.1)
            trainable = (pca & nys) >> concat >> lale_lr
            cv_results = [
                '{0:.1%}'.format(score)
                for score in lale_cross_val_score(trainable, X, y)
            ]

            # sklearn side: the same topology expressed with FeatureUnion.
            union = FeatureUnion([
                ("pca", SklearnPCA(n_components=3, random_state=42,
                                   svd_solver='arpack')),
                ("nys", SklearnNystroem(n_components=10, random_state=42)),
            ])
            pipeline = make_pipeline(union, SklearnLR(random_state=42, C=0.1))
            scikit_cv_results = [
                '{0:.1%}'.format(score)
                for score in sklearn_cross_val_score(pipeline, X, y, cv=5)
            ]

        self.assertEqual(cv_results, scikit_cv_results)
示例#4
0
    def test_import_from_sklearn_pipeline_nested_pipeline1(self):
        """Import a sklearn pipeline with nested pipelines and feature unions.

        The outer FeatureUnion combines a Nystroem transformer with a nested
        pipeline that itself contains a FeatureUnion holding a further
        pipeline. The imported lale pipeline must have eight edges with the
        expected operator types at both ends of each edge, and it must pass
        ``assert_equal_predictions`` against the sklearn pipeline.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.feature_selection import SelectKBest
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
        from sklearn.pipeline import FeatureUnion

        # Build the sklearn structure from the innermost piece outwards.
        innermost_pipeline = sklearn.pipeline.make_pipeline(
            SelectKBest(k=2), SklearnNystroem()
        )
        inner_union = FeatureUnion(
            [
                ("pca", SklearnPCA(n_components=1)),
                ("nested_pipeline", innermost_pipeline),
            ]
        )
        selectkbest_branch = sklearn.pipeline.make_pipeline(
            SelectKBest(k=3), inner_union
        )
        union = FeatureUnion(
            [
                ("selectkbest_pca", selectkbest_branch),
                ("nys", SklearnNystroem(n_components=2, random_state=42)),
            ]
        )
        sklearn_pipeline = sklearn.pipeline.make_pipeline(union, SklearnKNN())

        imported = import_from_sklearn_pipeline(sklearn_pipeline)
        lale_pipeline = typing.cast(lale.operators.TrainablePipeline, imported)

        edges = lale_pipeline.edges()
        self.assertEqual(len(edges), 8)

        # These assertions assume topological sort, which may not be unique.
        # So the assertions are brittle.
        from lale.lib.lale.concat_features import ConcatFeatures
        from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifier
        from lale.lib.sklearn.nystroem import Nystroem
        from lale.lib.sklearn.pca import PCA
        from lale.lib.sklearn.select_k_best import SelectKBest as LaleSelectKBest

        expected_edges = [
            (LaleSelectKBest, PCA),
            (LaleSelectKBest, LaleSelectKBest),
            (LaleSelectKBest, Nystroem),
            (PCA, ConcatFeatures),
            (Nystroem, ConcatFeatures),
            (ConcatFeatures, ConcatFeatures),
            (Nystroem, ConcatFeatures),
            (ConcatFeatures, KNeighborsClassifier),
        ]
        for edge, (source_op, target_op) in zip(edges, expected_edges):
            self.assertIsInstance(edge[0], source_op)  # type: ignore
            self.assertIsInstance(edge[1], target_op)  # type: ignore

        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#5
0
 def test_import_from_sklearn_pipeline_nested_pipeline1(self):
     """Import a sklearn pipeline with nested pipelines and feature unions.

     The outer FeatureUnion combines a Nystroem transformer with a nested
     pipeline that itself contains a FeatureUnion holding a further
     pipeline. The imported lale pipeline must have eight edges whose
     endpoints report the expected implementation classes, and it must
     pass ``assert_equal_predictions`` against the sklearn pipeline.
     """
     from sklearn.pipeline import FeatureUnion
     from sklearn.decomposition import PCA as SklearnPCA
     from sklearn.kernel_approximation import Nystroem as SklearnNystroem
     from sklearn.feature_selection import SelectKBest
     from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

     # Build the sklearn structure from the innermost piece outwards.
     innermost_pipeline = sklearn.pipeline.make_pipeline(
         SelectKBest(k=2), SklearnNystroem())
     inner_union = FeatureUnion([
         ('pca', SklearnPCA(n_components=1)),
         ('nested_pipeline', innermost_pipeline),
     ])
     selectkbest_branch = sklearn.pipeline.make_pipeline(
         SelectKBest(k=3), inner_union)
     union = FeatureUnion([
         ("selectkbest_pca", selectkbest_branch),
         ("nys", SklearnNystroem(n_components=2, random_state=42)),
     ])
     sklearn_pipeline = sklearn.pipeline.make_pipeline(union, SklearnKNN())
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

     edges = lale_pipeline.edges()
     self.assertEqual(len(edges), 8)

     # These assertions assume topological sort, which may not be unique.
     # So the assertions are brittle.
     from lale.lib.sklearn.pca import PCAImpl
     from lale.lib.sklearn.nystroem import NystroemImpl
     from lale.lib.lale.concat_features import ConcatFeaturesImpl
     from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
     from lale.lib.sklearn.select_k_best import SelectKBestImpl

     expected_impls = [
         (SelectKBestImpl, PCAImpl),
         (SelectKBestImpl, SelectKBestImpl),
         (SelectKBestImpl, NystroemImpl),
         (PCAImpl, ConcatFeaturesImpl),
         (NystroemImpl, ConcatFeaturesImpl),
         (ConcatFeaturesImpl, ConcatFeaturesImpl),
         (NystroemImpl, ConcatFeaturesImpl),
         (ConcatFeaturesImpl, KNeighborsClassifierImpl),
     ]
     for edge, (source_impl, target_impl) in zip(edges, expected_impls):
         self.assertEqual(edge[0]._impl_class(), source_impl)
         self.assertEqual(edge[1]._impl_class(), target_impl)

     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#6
0
    def transform(self):
        """Fit a PCA on ``self.data`` and return the projected data.

        ``self.n_components`` is first capped at the second dimension of
        ``self.data`` (the number of original features) and stored back on
        the instance. The fitted estimator is kept in ``self.pca`` and the
        transformed data in ``self.data_``, which is also returned.
        """
        # n_components must not be bigger than the original features
        feature_count = self.data.shape[1]
        self.n_components = min(feature_count, self.n_components)

        estimator = SklearnPCA(
            n_components=self.n_components,
            random_state=self.random_state,
            copy=self.copy,
            **self.kwargs
        )
        self.pca = estimator

        projected = estimator.fit_transform(self.data)
        self.data_ = projected
        return projected
示例#7
0
 def test_import_from_sklearn_pipeline1(self):
     """Import a two-step sklearn pipeline and compare step hyperparameters.

     Each step of the PCA >> KNN sklearn pipeline must report the same
     ``get_params()`` as the wrapped model of the lale step at the same
     position. The pipelines are then compared with
     ``assert_equal_predictions``.
     """
     from sklearn.decomposition import PCA as SklearnPCA
     from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

     sklearn_steps = (SklearnPCA(n_components=3), SklearnKNN())
     sklearn_pipeline = sklearn.pipeline.make_pipeline(*sklearn_steps)
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

     lale_steps = lale_pipeline.steps()
     for position, step_name in enumerate(sklearn_pipeline.named_steps):
         expected_params = sklearn_pipeline.named_steps[step_name].get_params()
         wrapped_model = lale_steps[position]._impl._wrapped_model
         self.assertEqual(expected_params, wrapped_model.get_params())

     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#8
0
    def dont_test_with_gridsearchcv2_auto(self):
        """Compare GridSearchCV accuracy for lale operators and sklearn estimators.

        Two randomly sampled parameter grids generated from the lale
        pipeline ``PCA >> LogisticRegression`` are used to grid-search an
        sklearn ``Pipeline`` twice on iris: once with the lale operators as
        steps and once with the plain sklearn estimators. The training-set
        accuracy of both searches must be equal.
        """
        from sklearn.datasets import load_iris
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.linear_model import LogisticRegression as SklearnLR
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.model_selection import GridSearchCV
        from sklearn.pipeline import Pipeline

        def grid_search_accuracy(pipeline, param_grids):
            # Grid-search on iris with warnings silenced; score on the same data.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                search = GridSearchCV(
                    pipeline,
                    param_grids,
                    cv=2,
                    scoring=make_scorer(accuracy_score),
                )
                iris = load_iris()
                search.fit(iris.data, iris.target)
                return accuracy_score(iris.target, search.predict(iris.data))

        lr = LogisticRegression(random_state=42)
        pca = PCA(random_state=42, svd_solver="arpack")
        trainable = pca >> lr

        lale_step_list = [
            (pca.name(), PCA(random_state=42, svd_solver="arpack")),
            (lr.name(), LogisticRegression(random_state=42)),
        ]
        pipeline_of_lale_operators = Pipeline(lale_step_list)

        all_parameters = get_grid_search_parameter_grids(
            trainable, num_samples=1
        )
        # otherwise the test takes too long
        parameters = random.sample(all_parameters, 2)

        accuracy_with_lale_operators = grid_search_accuracy(
            pipeline_of_lale_operators, parameters
        )

        sklearn_step_list = [
            (pca.name(), SklearnPCA(random_state=42, svd_solver="arpack")),
            (lr.name(), SklearnLR(random_state=42)),
        ]
        accuracy_with_scikit_operators = grid_search_accuracy(
            Pipeline(sklearn_step_list), parameters
        )

        self.assertEqual(
            accuracy_with_lale_operators, accuracy_with_scikit_operators
        )
示例#9
0
    def test_import_from_sklearn_pipeline_nested_pipeline2(self):
        """Import a sklearn pipeline whose FeatureUnion branch nests pipelines.

        One branch of the FeatureUnion is a pipeline containing another
        pipeline (SelectKBest >> (SelectKBest >> PCA)); the other branch is
        a Nystroem transformer. The imported lale pipeline must have five
        edges whose endpoints report the expected implementation classes,
        and it must pass ``assert_equal_predictions`` against the sklearn
        pipeline.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.feature_selection import SelectKBest
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
        from sklearn.pipeline import FeatureUnion

        # Build the sklearn structure from the innermost piece outwards.
        innermost_pipeline = sklearn.pipeline.make_pipeline(
            SelectKBest(k=2), SklearnPCA()
        )
        selectkbest_branch = sklearn.pipeline.make_pipeline(
            SelectKBest(k=3), innermost_pipeline
        )
        union = FeatureUnion([
            ("selectkbest_pca", selectkbest_branch),
            ("nys", SklearnNystroem(n_components=2, random_state=42)),
        ])
        sklearn_pipeline = sklearn.pipeline.make_pipeline(union, SklearnKNN())

        imported = import_from_sklearn_pipeline(sklearn_pipeline)
        lale_pipeline = typing.cast(lale.operators.TrainablePipeline, imported)

        edges = lale_pipeline.edges()
        self.assertEqual(len(edges), 5)

        from lale.lib.lale.concat_features import ConcatFeaturesImpl
        from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
        from lale.lib.sklearn.nystroem import NystroemImpl
        from lale.lib.sklearn.pca import PCAImpl
        from lale.lib.sklearn.select_k_best import SelectKBestImpl

        # One (source, target) implementation-class pair per edge, in edge order.
        expected_impls = [
            (SelectKBestImpl, SelectKBestImpl),
            (SelectKBestImpl, PCAImpl),
            (PCAImpl, ConcatFeaturesImpl),
            (NystroemImpl, ConcatFeaturesImpl),
            (ConcatFeaturesImpl, KNeighborsClassifierImpl),
        ]
        for edge, (source_impl, target_impl) in zip(edges, expected_impls):
            self.assertEqual(edge[0]._impl_class(), source_impl)
            self.assertEqual(edge[1]._impl_class(), target_impl)

        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#10
0
    def test_import_from_sklearn_pipeline1(self):
        """Import a two-step sklearn pipeline and compare step hyperparameters.

        For every step of the PCA >> KNN sklearn pipeline, ``get_params()``
        must equal what ``self.get_sklearn_params`` returns for the lale step
        at the same position. The pipelines are then compared with
        ``assert_equal_predictions``.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

        sklearn_steps = (SklearnPCA(n_components=3), SklearnKNN())
        sklearn_pipeline = sklearn.pipeline.make_pipeline(*sklearn_steps)
        imported = import_from_sklearn_pipeline(sklearn_pipeline)
        lale_pipeline = typing.cast(lale.operators.TrainablePipeline, imported)

        lale_steps = lale_pipeline.steps()
        for position, step_name in enumerate(sklearn_pipeline.named_steps):
            expected_params = sklearn_pipeline.named_steps[step_name].get_params()
            actual_params = self.get_sklearn_params(lale_steps[position])
            self.assertEqual(expected_params, actual_params)

        self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
示例#11
0
    def test_compare_with_sklearn(self):
        """Check that a lale PCA >> LogisticRegression pipeline matches sklearn.

        Both the lale pipeline and the equivalent sklearn pipeline are fitted
        on the digits dataset and used to predict the same data; their
        accuracy scores must be equal.
        """
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.linear_model import LogisticRegression as SklearnLR

        # lale pipeline, with the classifier configured through lale's enums.
        lale_classifier = LogisticRegression(
            LogisticRegression.solver.lbfgs,
            LogisticRegression.multi_class.auto,
        )
        lale_pipeline = lale.operators.make_pipeline(PCA(), lale_classifier)

        digits = sklearn.datasets.load_digits()
        features, labels = digits.data, digits.target
        lale_predictions = lale_pipeline.fit(features, labels).predict(features)

        # The same pipeline expressed directly in sklearn.
        sklearn_classifier = SklearnLR(solver="lbfgs", multi_class="auto")
        sklearn_pipeline = sklearn.pipeline.make_pipeline(
            SklearnPCA(), sklearn_classifier)
        sklearn_pipeline.fit(features, labels)
        sklearn_predictions = sklearn_pipeline.predict(features)

        self.assertEqual(
            accuracy_score(labels, lale_predictions),
            accuracy_score(labels, sklearn_predictions),
        )
示例#12
0
    # This is how the eigenvectors and eigenvalues are computed in my implementation
    _, eig_vectors = np.linalg.eigh(matrix)
    eig_vectors = eig_vectors[:, ::-1]

    # The eigenvector flip is done in these next lines
    max_abs_cols = np.argmax(np.abs(eig_vectors), axis=0)
    signs = np.sign(eig_vectors[max_abs_cols, range(eig_vectors.shape[1])])
    eig_vectors *= signs

    # Return same number of vectors/components
    return eig_vectors[:, :components].T


# Comparison script: fit scikit-learn's PCA and the `PCA` class under test on
# the iris features with the same number of components, then check that the
# fitted attributes agree.
X = load_iris().data
n_comp = 2

# Reference result from scikit-learn.
sklearn_pca = SklearnPCA(n_components=n_comp)
Xt_sklearn = sklearn_pca.fit_transform(X)

# Result from the `PCA` class being compared (defined outside this view;
# presumably the custom implementation -- TODO confirm).
pca = PCA(n_components=n_comp)
Xt_pca = pca.fit_transform(X)

# Explained-variance figures must match within np.allclose tolerance.
assert np.allclose(pca.explained_variance_ratio,
                   sklearn_pca.explained_variance_ratio_)
assert np.allclose(pca.explained_variance, sklearn_pca.explained_variance_)

# Components are not directly comparable due to sklearn's eigenvector flip,
# so the sign-flipped eigenvectors of pca.cov_matrix are compared instead.
assert np.allclose(sklearn_svd_flip(pca.cov_matrix, n_comp),
                   sklearn_pca.components_)
示例#13
0
 def __init__(self):
     """Create the instance with a default-configured ``SklearnPCA`` in ``self.pca``."""
     default_estimator = SklearnPCA()
     self.pca = default_estimator