Пример #1
0
    def test_StratifiedShuffleSplit(self):
        # The splitter obtained through the ModelFrame accessor must produce
        # the same index pairs as the plain splitter built with the same
        # random_state, and iterate() must hand back ModelFrame pairs.
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)
        sf1 = df.model_selection.StratifiedShuffleSplit(random_state=self.random_state)
        sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)

        # consume generator
        ind1 = list(sf1.split(df.data.values, df.target.values))
        ind2 = list(sf2.split(iris.data, iris.target))

        # zip() silently truncates, so make sure both produced as many splits
        self.assertEqual(len(ind1), len(ind2))

        for i1, i2 in zip(ind1, ind2):
            self.assertIsInstance(i1, tuple)
            self.assertEqual(len(i1), 2)
            self.assertIsInstance(i2, tuple)
            self.assertEqual(len(i2), 2)
            # compare against the reference split (previously i1[0] was
            # compared with itself, which can never fail)
            tm.assert_numpy_array_equal(i1[0], i2[0])
            tm.assert_numpy_array_equal(i1[1], i2[1])

        sf1 = df.model_selection.StratifiedShuffleSplit(random_state=self.random_state)
        with tm.assert_produces_warning(FutureWarning):
            gen = df.model_selection.iterate(sf1)

        # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # assertTrue(a, b) treats b as the failure message and passes for
            # any non-zero a; compare the row counts explicitly instead
            self.assertEqual(len(df), len(train_df) + len(test_df))
Пример #2
0
    def test_inverse_transform(self):
        # transform() / inverse_transform() through ModelFrame must agree with
        # the same estimator called directly on the raw arrays.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        for name in ['PCA']:
            wrapped = getattr(frame.decomposition, name)()
            plain = getattr(decomposition, name)()

            frame.fit(wrapped)
            plain.fit(iris.data, iris.target)

            # forward direction
            got = frame.transform(wrapped)
            want = plain.transform(iris.data)

            self.assertIsInstance(got, pdml.ModelFrame)
            tm.assert_series_equal(frame.target, got.target)
            self.assert_numpy_array_almost_equal(got.data.values, want)

            # backward direction
            got = frame.inverse_transform(wrapped)
            want = plain.inverse_transform(iris.data)

            self.assertIsInstance(got, pdml.ModelFrame)
            tm.assert_series_equal(frame.target, got.target)
            self.assert_numpy_array_almost_equal(got.data.values, want)
            tm.assert_index_equal(got.columns, frame.columns)
Пример #3
0
    def test_preprocessing_assignment(self):
        # Writing binarized columns back into a ModelFrame must keep the frame
        # type and column labels, and leave the other columns untouched.
        raw = datasets.load_iris()
        frame = pdml.ModelFrame(raw)
        columns_before = frame.data.columns

        # single column assignment
        single = 'sepal length (cm)'
        frame[single] = frame[single].preprocessing.binarize(threshold=6)
        self.assertIsInstance(frame, pdml.ModelFrame)
        first_col = pp.binarize(np.atleast_2d(raw.data[:, 0]), threshold=6)
        want = np.hstack([first_col.T, raw.data[:, 1:]])
        self.assert_numpy_array_almost_equal(frame.data.values, want)
        tm.assert_index_equal(frame.data.columns, columns_before)

        # recreate data
        raw = datasets.load_iris()
        frame = pdml.ModelFrame(raw)

        # multiple column assignment
        several = ['sepal length (cm)', 'sepal width (cm)']
        frame[several] = frame[several].preprocessing.binarize(threshold=6)
        self.assertIsInstance(frame, pdml.ModelFrame)
        first_two = pp.binarize(raw.data[:, 0:2], threshold=6)
        want = np.hstack([first_two, raw.data[:, 2:]])
        self.assert_numpy_array_almost_equal(frame.data.values, want)
        tm.assert_index_equal(frame.data.columns, columns_before)
Пример #4
0
    def test_binarize(self):
        # preprocessing.binarize on a ModelFrame / ModelSeries must match
        # binarize applied to the underlying arrays, with labels preserved.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        # whole frame: default threshold, then an explicit one
        for kwargs in ({}, {'threshold': 5}):
            got = frame.preprocessing.binarize(**kwargs)
            want = pp.binarize(iris.data, **kwargs)

            self.assertIsInstance(got, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(got.data.values, want)
            tm.assert_index_equal(got.columns, frame.data.columns)

        name = 'sepal length (cm)'
        series = frame[name]
        self.assertIsInstance(series, pdml.ModelSeries)

        # single column: default threshold, then an explicit one
        for kwargs in ({}, {'threshold': 6}):
            got = series.preprocessing.binarize(**kwargs)
            want = pp.binarize(iris.data[:, 0].reshape(-1, 1), **kwargs)

            self.assertIsInstance(got, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(got.values, want.flatten())
            self.assertEqual(got.name, name)
Пример #5
0
    def test_patsy_matrices(self):
        # transform() with the formula 'A ~ B + C' should give a ModelFrame
        # with columns A, Intercept, B, C and 'A' as the target.
        labels = ['a', 'b', 'c']
        raw = pd.DataFrame({'A': [1, 2, 3],
                            'B': [4, 5, 6],
                            'C': [7, 8, 9]},
                           index=labels, columns=['A', 'B', 'C'])
        target = pd.Series([10, 11, 12], index=labels)
        mdf = pdml.ModelFrame(raw, target=target)

        out_columns = ['A', 'Intercept', 'B', 'C']
        result = mdf.transform('A ~ B + C')
        self.assertIsInstance(result, pdml.ModelFrame)
        self.assertEqual(result.shape, (3, 4))
        tm.assert_index_equal(result.index, pd.Index(labels))
        tm.assert_index_equal(result.columns, pd.Index(out_columns))

        # full frame content (all values come back as floats)
        expected_frame = pd.DataFrame({'A': [1, 2, 3],
                                       'Intercept': [1, 1, 1],
                                       'B': [4, 5, 6],
                                       'C': [7, 8, 9]},
                                      index=labels, columns=out_columns,
                                      dtype=float)
        tm.assert_frame_equal(result, expected_frame)

        # target content and name
        expected_target = pd.Series([1, 2, 3], index=labels, name='A',
                                    dtype=float)
        tm.assert_series_equal(result.target, expected_target)
        self.assertEqual(result.target.name, 'A')
        self.assertEqual(result.target_name, 'A')
Пример #6
0
    def test_GaussianProcess_ge_018(self):
        # A GaussianProcessRegressor obtained through the ModelFrame accessor
        # must predict the same values as one built from the gp module.
        X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
        y = np.sin(X).ravel()
        df = pdml.ModelFrame(X, target=y)

        # accessor-based kernel and regressor
        accessor_kernel = (df.gp.kernels.ConstantKernel(1.0, (1e-3, 1e3)) *
                           df.gp.kernels.RBF(10, (1e-2, 1e2)))
        accessor_model = df.gp.GaussianProcessRegressor(
            kernel=accessor_kernel,
            n_restarts_optimizer=9,
            random_state=self.random_state)

        # reference kernel and regressor
        reference_kernel = (gp.kernels.ConstantKernel(1.0, (1e-3, 1e3)) *
                            gp.kernels.RBF(10, (1e-2, 1e2)))
        reference_model = gp.GaussianProcessRegressor(
            kernel=reference_kernel,
            n_restarts_optimizer=9,
            random_state=self.random_state)

        accessor_model.fit(X, y)
        reference_model.fit(X, y)

        # dense evaluation grid wrapped in its own ModelFrame
        grid = np.atleast_2d(np.linspace(0, 10, 1000)).T
        grid_df = pdml.ModelFrame(grid)

        predicted = grid_df.predict(accessor_model)
        reference = reference_model.predict(grid)

        self.assertIsInstance(predicted, pdml.ModelSeries)
        tm.assert_index_equal(predicted.index, grid_df.index)

        self.assert_numpy_array_almost_equal(predicted, reference)
Пример #7
0
    def test_binarize(self):
        # Accessor-based binarize must agree with binarize on the raw arrays
        # for both a full ModelFrame and a single ModelSeries column.
        iris = datasets.load_iris()
        mf = pdml.ModelFrame(iris)

        # frame-level checks (default and explicit threshold)
        for options in ({}, {'threshold': 5}):
            actual = mf.preprocessing.binarize(**options)
            reference = pp.binarize(iris.data, **options)

            self.assertIsInstance(actual, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(actual.data.values, reference)
            tm.assert_index_equal(actual.columns, mf.data.columns)

        column = 'sepal length (cm)'
        ms_col = mf[column]
        self.assertIsInstance(ms_col, pdml.ModelSeries)

        # series-level checks (default and explicit threshold)
        for options in ({}, {'threshold': 6}):
            actual = ms_col.preprocessing.binarize(**options)
            reference = pp.binarize(iris.data[:, 0].reshape(-1, 1), **options)

            self.assertIsInstance(actual, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(actual.values,
                                                 reference.flatten())
            self.assertEqual(actual.name, column)
Пример #8
0
    def test_add_dummy_feature(self):
        # add_dummy_feature through the accessor must match the array-based
        # call; the original labels are expected from column position 1 on.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        # default dummy value
        got = frame.preprocessing.add_dummy_feature()
        want = pp.add_dummy_feature(iris.data)

        self.assertIsInstance(got, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(got.data.values, want)

        # explicit dummy value
        got = frame.preprocessing.add_dummy_feature(value=2)
        want = pp.add_dummy_feature(iris.data, value=2)

        self.assertIsInstance(got, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(got.data.values, want)
        tm.assert_index_equal(got.columns[1:], frame.data.columns)

        # a single column comes back as a ModelFrame as well
        name = 'sepal length (cm)'
        series = frame[name]
        self.assertIsInstance(series, pdml.ModelSeries)
        got = series.preprocessing.add_dummy_feature()
        want = pp.add_dummy_feature(iris.data[:, [0]])

        self.assertIsInstance(got, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(got.values, want)
        self.assertEqual(got.columns[1], name)
Пример #9
0
    def test_sample_ensemble(self):
        # Ensemble samplers used through ModelFrame should return a list of
        # ModelFrames that keep the original column labels.
        from imblearn.ensemble import BalanceCascade, EasyEnsemble

        X = np.random.randn(100, 5)
        y = np.array([0, 1]).repeat([80, 20])

        df = pdml.ModelFrame(X, target=y, columns=list('ABCDE'))

        def check(frames):
            # every resampled subset must be a ModelFrame with df's columns
            self.assertIsInstance(frames, list)
            for frame in frames:
                self.assertIsInstance(frame, pdml.ModelFrame)
                tm.assert_index_equal(frame.columns, df.columns)

        for sampler_cls in [BalanceCascade, EasyEnsemble]:
            # fit first, then resample
            wrapped = sampler_cls(random_state=self.random_state)
            plain = sampler_cls(random_state=self.random_state)

            df.fit(wrapped)
            plain.fit(X, y)

            resampled = df.fit_resample(wrapped)
            expected_X, expected_y = plain.fit_resample(X, y)
            check(resampled)

            # fresh samplers, legacy fit_sample entry point
            wrapped = sampler_cls(random_state=self.random_state)
            plain = sampler_cls(random_state=self.random_state)

            resampled = df.fit_sample(wrapped)
            expected_X, expected_y = plain.fit_sample(X, y)
            check(resampled)
Пример #10
0
    def test_CCA_PLSCannonical(self, algo):
        # Cross-decomposition estimators fitted through a ModelFrame with a
        # multi-column target must match the estimator fitted on raw arrays.
        n = 500
        half = n // 2

        with tm.RNGContext(1):
            # 2 latents vars:
            latent_a = np.random.normal(size=n)
            latent_b = np.random.normal(size=n)

            latents = np.array([latent_a, latent_a, latent_b, latent_b]).T
            X = latents + np.random.normal(size=4 * n).reshape((n, 4))
            Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

        X_train, X_test = X[:half], X[half:]
        Y_train, Y_test = Y[:half], Y[half:]

        train = pdml.ModelFrame(X_train, target=Y_train)
        test = pdml.ModelFrame(X_test, target=Y_test)

        # check multi target columns
        self.assertTrue(train.has_target())
        for frame, data, target in ((train, X_train, Y_train),
                                    (test, X_test, Y_test)):
            tm.assert_numpy_array_equal(frame.data.values, data)
            tm.assert_numpy_array_equal(frame.target.values, target)
        target_labels = pd.MultiIndex.from_tuples(
            [('.target', i) for i in range(4)])
        tm.assert_index_equal(train.target_name, target_labels)
        self.assertEqual(train.data.shape, X_train.shape)
        self.assertEqual(train.target.shape, Y_train.shape)

        wrapped = getattr(train.cross_decomposition, algo)(n_components=2)
        plain = getattr(cd, algo)(n_components=2)

        train.fit(wrapped)
        plain.fit(X_train, Y_train)

        # 2nd cols are different on travis-CI
        for attr in ('x_weights_', 'y_weights_'):
            self.assert_numpy_array_almost_equal(getattr(wrapped, attr)[:, 0],
                                                 getattr(plain, attr)[:, 0])

        result_tr = train.transform(wrapped)
        result_test = test.transform(wrapped)

        expected_tr = plain.transform(X_train, Y_train)
        expected_test = plain.transform(X_test, Y_test)

        self.assertIsInstance(result_tr, pdml.ModelFrame)
        self.assertIsInstance(result_test, pdml.ModelFrame)

        # only the first component is compared, for data and target alike
        comparisons = [
            (result_tr.data, expected_tr[0]),
            (result_tr.target, expected_tr[1]),
            (result_test.data, expected_test[0]),
            (result_test.target, expected_test[1]),
        ]
        for got, want in comparisons:
            self.assert_numpy_array_almost_equal(got.values[:, 0], want[:, 0])
Пример #11
0
    def test_silhouette_samples(self):
        # silhouette_samples through the metrics accessor must return a
        # ModelSeries aligned with the frame's index and equal to the values
        # computed directly from self.data and self.pred.
        result = self.df.metrics.silhouette_samples()
        expected = metrics.silhouette_samples(self.data, self.pred)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...))
        self.assertIsInstance(result, pdml.ModelSeries)
        tm.assert_index_equal(result.index, self.df.index)
        self.assert_numpy_array_almost_equal(result.values, expected)
Пример #12
0
    def test_StratifiedShuffleSplit(self):
        # The accessor-built splitter must yield the same index pairs as the
        # plain splitter with the same random_state; iterate() must warn and
        # yield ModelFrame pairs that together cover every row.
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)
        sf1 = df.model_selection.StratifiedShuffleSplit(random_state=self.random_state)
        sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)

        # consume generator
        ind1 = list(sf1.split(df.data.values, df.target.values))
        ind2 = list(sf2.split(iris.data, iris.target))

        # zip() silently truncates, so make sure both produced as many splits
        self.assertEqual(len(ind1), len(ind2))

        for i1, i2 in zip(ind1, ind2):
            self.assertIsInstance(i1, tuple)
            self.assertEqual(len(i1), 2)
            self.assertIsInstance(i2, tuple)
            self.assertEqual(len(i2), 2)
            # train indices must match the reference split (the previous
            # check compared i1[0] with itself and could never fail)
            tm.assert_numpy_array_equal(i1[0], i2[0])
            tm.assert_numpy_array_equal(i1[1], i2[1])

        sf1 = df.model_selection.StratifiedShuffleSplit(random_state=self.random_state)
        with tm.assert_produces_warning(FutureWarning):
            gen = df.model_selection.iterate(sf1)

        # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            self.assertEqual(len(df), len(train_df) + len(test_df))
Пример #13
0
    def test_sample_ensemble(self):
        # Ensemble samplers used through ModelFrame should return a list of
        # ModelFrames that keep the original column labels.
        from imblearn.ensemble import BalanceCascade, EasyEnsemble

        X = np.random.randn(100, 5)
        y = np.array([0, 1]).repeat([80, 20])

        df = pdml.ModelFrame(X, target=y, columns=list('ABCDE'))

        def check(frames):
            # every sampled subset must be a ModelFrame with df's columns
            self.assertIsInstance(frames, list)
            for frame in frames:
                self.assertIsInstance(frame, pdml.ModelFrame)
                tm.assert_index_equal(frame.columns, df.columns)

        for sampler_cls in [BalanceCascade, EasyEnsemble]:
            # fit first, then sample
            wrapped = sampler_cls(random_state=self.random_state)
            plain = sampler_cls(random_state=self.random_state)

            df.fit(wrapped)
            plain.fit(X, y)

            sampled = df.sample(wrapped)
            expected_X, expected_y = plain.sample(X, y)
            check(sampled)

            # fresh samplers, combined fit_sample entry point
            wrapped = sampler_cls(random_state=self.random_state)
            plain = sampler_cls(random_state=self.random_state)

            sampled = df.fit_sample(wrapped)
            expected_X, expected_y = plain.fit_sample(X, y)
            check(sampled)
Пример #14
0
    def test_add_dummy_feature(self):
        # Accessor-based add_dummy_feature must equal the array-based call;
        # the original labels are expected from column position 1 onwards.
        iris = datasets.load_iris()
        mf = pdml.ModelFrame(iris)

        # default dummy value
        actual = mf.preprocessing.add_dummy_feature()
        reference = pp.add_dummy_feature(iris.data)

        self.assertIsInstance(actual, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(actual.data.values, reference)

        # explicit dummy value
        actual = mf.preprocessing.add_dummy_feature(value=2)
        reference = pp.add_dummy_feature(iris.data, value=2)

        self.assertIsInstance(actual, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(actual.data.values, reference)
        tm.assert_index_equal(actual.columns[1:], mf.data.columns)

        # a single column comes back as a ModelFrame as well
        column = 'sepal length (cm)'
        ms_col = mf[column]
        self.assertIsInstance(ms_col, pdml.ModelSeries)
        actual = ms_col.preprocessing.add_dummy_feature()
        reference = pp.add_dummy_feature(iris.data[:, [0]])

        self.assertIsInstance(actual, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(actual.values, reference)
        self.assertEqual(actual.columns[1], column)
Пример #15
0
    def test_silhouette_samples(self):
        # The metrics accessor must return per-sample silhouette values as a
        # ModelSeries indexed like the frame and equal to the direct call on
        # self.data and self.pred.
        result = self.df.metrics.silhouette_samples()
        expected = metrics.silhouette_samples(self.data, self.pred)

        # assertIsInstance gives a descriptive failure message, unlike
        # assertTrue(isinstance(...))
        self.assertIsInstance(result, pdml.ModelSeries)
        tm.assert_index_equal(result.index, self.df.index)
        self.assert_numpy_array_almost_equal(result.values, expected)
Пример #16
0
    def test_anes96(self):
        # Build a ModelFrame from the anes96 dataset (loader chosen by
        # self.load_method) and check its shape, target name and data columns.
        loader = getattr(sm.datasets.anes96, self.load_method)
        dataset = loader()
        frame = pdml.ModelFrame(dataset)

        self.assertEqual(frame.shape, (944, 6))
        self.assertEqual(frame.target_name, 'PID')
        exog_labels = pd.Index(dataset.exog_name)
        tm.assert_index_equal(frame.data.columns, exog_labels)
Пример #17
0
    def test_preprocessing_normalize(self):
        # normalize on a ModelSeries must return a ModelSeries with the same
        # index whose values equal normalize applied to the values as one row.
        s = pdml.ModelSeries([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
        self.assertIsInstance(s, pdml.ModelSeries)
        result = s.preprocessing.normalize()
        # builtin float replaces the deprecated np.float alias (same dtype)
        expected = pp.normalize(np.atleast_2d(s.values.astype(float)))[0, :]

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)
        tm.assert_index_equal(result.index, s.index)
Пример #18
0
    def test_empirical_covariance(self):
        # The covariance accessor must return a ModelFrame labelled by the
        # data columns on both axes, with the directly computed values.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        got = frame.covariance.empirical_covariance()
        want = covariance.empirical_covariance(iris.data)
        self.assertIsInstance(got, pdml.ModelFrame)
        for axis_labels in (got.index, got.columns):
            tm.assert_index_equal(axis_labels, frame.data.columns)
        self.assert_numpy_array_almost_equal(got.values, want)
Пример #19
0
    def test_isotonic_regression(self):
        # isotonic_regression through the accessor operates on the target and
        # must return a ModelSeries indexed like the frame.
        target = np.abs(np.random.randn(100)).cumsum()
        frame = pdml.ModelFrame(np.arange(len(target)), target=target)

        got = frame.isotonic.isotonic_regression()
        want = isotonic.isotonic_regression(target)
        self.assertIsInstance(got, pdml.ModelSeries)
        tm.assert_index_equal(got.index, frame.index)
        tm.assert_numpy_array_equal(got.values, want)
Пример #20
0
    def test_iterate(self):
        # Every (train, test) pair produced by cross_validation.iterate must
        # be a pair of ModelFrames carrying the original columns.
        df = pdml.ModelFrame(datasets.load_iris())
        # value passed to KFold as its first positional argument
        n_samples = 4
        kf = df.cross_validation.KFold(n_samples, n_folds=2,
                                       random_state=self.random_state)
        for train_df, test_df in df.cross_validation.iterate(kf):
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # The previous assertTrue(a, b) used b as the failure message, so
            # it could never fail, and it mixed row and column counts.
            # NOTE(review): presumably each fold partitions exactly the
            # n_samples positions given to KFold, so train + test rows should
            # total n_samples rather than len(df) -- confirm against the
            # KFold implementation in use.
            self.assertEqual(n_samples, train_df.shape[0] + test_df.shape[0])
Пример #21
0
    def test_precision_recall_fscore_support(self):
        # The accessor result must expose one column per metric, in order,
        # each equal to the matching element of the direct call's result.
        got = self.df.metrics.precision_recall_fscore_support()
        want = metrics.precision_recall_fscore_support(self.target, self.pred)

        metric_names = ['precision', 'recall', 'f1-score', 'support']
        for position, metric in enumerate(metric_names):
            self.assert_numpy_array_almost_equal(got[metric].values,
                                                 want[position])

        tm.assert_index_equal(got.columns, pd.Index(metric_names))
Пример #22
0
    def test_precision_recall_fscore_support(self):
        # Each named column of the accessor result is compared with the
        # element at the same position in the directly computed result.
        actual = self.df.metrics.precision_recall_fscore_support()
        reference = metrics.precision_recall_fscore_support(self.target,
                                                            self.pred)

        columns = ['precision', 'recall', 'f1-score', 'support']
        for idx, column in enumerate(columns):
            self.assert_numpy_array_almost_equal(actual[column].values,
                                                 reference[idx])

        # column labels and their order
        tm.assert_index_equal(actual.columns, pd.Index(columns))
Пример #23
0
    def test_isotonic_regression(self):
        # The accessor's isotonic_regression result must be a ModelSeries
        # on the frame's index, equal to the direct call on the target array.
        increasing = np.abs(np.random.randn(100)).cumsum()
        mf = pdml.ModelFrame(np.arange(len(increasing)), target=increasing)

        actual = mf.isotonic.isotonic_regression()
        reference = isotonic.isotonic_regression(increasing)
        self.assertIsInstance(actual, pdml.ModelSeries)
        tm.assert_index_equal(actual.index, mf.index)
        tm.assert_numpy_array_equal(actual.values, reference)
Пример #24
0
    def test_empirical_covariance(self):
        # empirical_covariance via the accessor: ModelFrame result, rows and
        # columns labelled by the data columns, values equal to the direct call.
        iris = datasets.load_iris()
        mf = pdml.ModelFrame(iris)

        actual = mf.covariance.empirical_covariance()
        reference = covariance.empirical_covariance(iris.data)
        self.assertIsInstance(actual, pdml.ModelFrame)
        for labels in (actual.index, actual.columns):
            tm.assert_index_equal(labels, mf.data.columns)
        self.assert_numpy_array_almost_equal(actual.values, reference)
Пример #25
0
    def test_iterate(self):
        # Every (train, test) pair produced by cross_validation.iterate must
        # be a pair of ModelFrames carrying the original columns.
        df = pdml.ModelFrame(datasets.load_iris())
        # value passed to KFold as its first positional argument
        n_samples = 4
        kf = df.cross_validation.KFold(n_samples,
                                       n_folds=2,
                                       random_state=self.random_state)
        for train_df, test_df in df.cross_validation.iterate(kf):
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # The previous assertTrue(a, b) used b as the failure message, so
            # it could never fail, and it mixed row and column counts.
            # NOTE(review): presumably each fold partitions exactly the
            # n_samples positions given to KFold, so train + test rows should
            # total n_samples rather than len(df) -- confirm against the
            # KFold implementation in use.
            self.assertEqual(n_samples, train_df.shape[0] + test_df.shape[0])
Пример #26
0
    def test_mean_shift(self):
        # mean_shift via the accessor returns a pair: the first element is
        # compared as an array, the second must be a ModelSeries on df's index.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        got = frame.cluster.mean_shift()
        want = cluster.mean_shift(iris.data)

        self.assertEqual(len(got), 2)
        self.assert_numpy_array_almost_equal(got[0], want[0])

        second = got[1]
        self.assertIsInstance(second, pdml.ModelSeries)
        tm.assert_index_equal(second.index, frame.index)
        tm.assert_numpy_array_equal(second.values, want[1])
Пример #27
0
    def test_sparse_encode(self):
        # sparse_encode via the accessor must return a ModelFrame on the data
        # index whose values equal the direct call with the same dictionary.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        # only the middle element of dict_learning's result is needed here
        _, atoms, _ = decomposition.dict_learning(
            iris.data, 2, 1, random_state=self.random_state)

        got = frame.decomposition.sparse_encode(atoms)
        want = decomposition.sparse_encode(iris.data, atoms)
        self.assertIsInstance(got, pdml.ModelFrame)
        tm.assert_index_equal(got.index, frame.data.index)
        self.assert_numpy_array_almost_equal(got.values, want)
Пример #28
0
    def test_mean_shift(self):
        # The accessor's mean_shift result is a 2-element sequence; element 0
        # is checked as an array and element 1 as an index-aligned ModelSeries.
        iris = datasets.load_iris()
        mf = pdml.ModelFrame(iris)

        actual = mf.cluster.mean_shift()
        reference = cluster.mean_shift(iris.data)

        self.assertEqual(len(actual), 2)
        self.assert_numpy_array_almost_equal(actual[0], reference[0])

        series_part = actual[1]
        self.assertIsInstance(series_part, pdml.ModelSeries)
        tm.assert_index_equal(series_part.index, mf.index)
        tm.assert_numpy_array_equal(series_part.values, reference[1])
Пример #29
0
    def test_spectral_clustering(self):
        # spectral_clustering via the accessor on a symmetric matrix must
        # return a ModelSeries on df's index equal to the direct call.
        N = 50
        # randint's upper bound is exclusive, so 201 keeps the inclusive
        # 1..200 range of the deprecated np.random.random_integers(1, 200)
        m = np.random.randint(1, 201, size=(N, N))
        # average with the transpose to make the matrix symmetric
        m = (m + m.T) / 2

        df = pdml.ModelFrame(m)
        result = df.cluster.spectral_clustering(random_state=self.random_state)
        expected = cluster.spectral_clustering(m, random_state=self.random_state)

        self.assertIsInstance(result, pdml.ModelSeries)
        tm.assert_index_equal(result.index, df.index)
        tm.assert_numpy_array_equal(result.values, expected)
Пример #30
0
    def test_spectral_clustering(self):
        # The accessor's spectral_clustering on a symmetric matrix must give
        # an index-aligned ModelSeries equal to the direct call's result.
        N = 50
        # np.random.random_integers is deprecated; randint(1, 201) draws from
        # the same inclusive 1..200 range (randint's upper bound is exclusive)
        m = np.random.randint(1, 201, size=(N, N))
        # average with the transpose to make the matrix symmetric
        m = (m + m.T) / 2

        df = pdml.ModelFrame(m)
        result = df.cluster.spectral_clustering(random_state=self.random_state)
        expected = cluster.spectral_clustering(m, random_state=self.random_state)

        self.assertIsInstance(result, pdml.ModelSeries)
        tm.assert_index_equal(result.index, df.index)
        tm.assert_numpy_array_equal(result.values, expected)
Пример #31
0
    def test_split(self):
        # model_selection.split must yield ModelFrame pairs that keep the
        # original columns and together contain as many rows as the frame.
        frame = pdml.ModelFrame(datasets.load_iris())
        folds = frame.model_selection.KFold(4, random_state=self.random_state)

        pairs = frame.model_selection.split(folds)

        for train_part, test_part in pairs:
            parts = (train_part, test_part)
            for part in parts:
                self.assertIsInstance(part, pdml.ModelFrame)
            for part in parts:
                tm.assert_index_equal(frame.columns, part.columns)

            total_rows = len(train_part) + len(test_part)
            self.assertEqual(len(frame), total_rows)
Пример #32
0
    def test_MDS(self, algo):
        # fit_transform through ModelFrame must match the same manifold
        # estimator (named by `algo`) applied to the raw data.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        wrapped_cls = getattr(frame.manifold, algo)
        plain_cls = getattr(manifold, algo)
        wrapped = wrapped_cls(random_state=self.random_state)
        plain = plain_cls(random_state=self.random_state)

        got = frame.fit_transform(wrapped)
        want = plain.fit_transform(iris.data)

        self.assertIsInstance(got, pdml.ModelFrame)
        tm.assert_index_equal(got.index, frame.index)
        self.assert_numpy_array_almost_equal(got.data.values, want)
Пример #33
0
    def test_split(self):
        # model_selection.split must yield ModelFrame pairs that keep the
        # original columns and together contain as many rows as the frame.
        df = pdml.ModelFrame(datasets.load_iris())
        kf = df.model_selection.KFold(4, random_state=self.random_state)

        gen = df.model_selection.split(kf)

        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # assertTrue(a, b) treats b as the failure message and passes for
            # any non-zero a; it also added test_df's column count. Compare
            # the row counts explicitly instead.
            self.assertEqual(df.shape[0], train_df.shape[0] + test_df.shape[0])
Пример #34
0
    def test_locally_linear_embedding(self):
        # locally_linear_embedding via the accessor returns a pair: an
        # index-aligned ModelFrame plus a second value compared for equality.
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        got = frame.manifold.locally_linear_embedding(3, 3)
        want = manifold.locally_linear_embedding(iris.data, 3, 3)

        self.assertEqual(len(got), 2)

        first = got[0]
        self.assertIsInstance(first, pdml.ModelFrame)
        tm.assert_index_equal(first.index, frame.index)
        tm.assert_numpy_array_equal(first.values, want[0])

        self.assertEqual(got[1], want[1])
Пример #35
0
    def test_locally_linear_embedding(self):
        # The accessor result has two elements: element 0 must be a ModelFrame
        # on df's index, element 1 must equal the direct call's second value.
        iris = datasets.load_iris()
        mf = pdml.ModelFrame(iris)

        actual = mf.manifold.locally_linear_embedding(3, 3)
        reference = manifold.locally_linear_embedding(iris.data, 3, 3)

        self.assertEqual(len(actual), 2)

        frame_part = actual[0]
        self.assertIsInstance(frame_part, pdml.ModelFrame)
        tm.assert_index_equal(frame_part.index, mf.index)
        tm.assert_numpy_array_equal(frame_part.values, reference[0])

        self.assertEqual(actual[1], reference[1])
Пример #36
0
    def test_affinity_propagation(self):
        # affinity_propagation via the accessor, run on np.cov of the iris
        # data, must match the direct call; element 1 is a ModelSeries.
        iris = datasets.load_iris()
        similarity = np.cov(iris.data)
        frame = pdml.ModelFrame(similarity)

        got = frame.cluster.affinity_propagation()
        want = cluster.affinity_propagation(similarity)

        self.assertEqual(len(got), 2)
        self.assert_numpy_array_almost_equal(got[0], want[0])

        second = got[1]
        self.assertIsInstance(second, pdml.ModelSeries)
        tm.assert_index_equal(second.index, frame.index)
        tm.assert_numpy_array_equal(second.values, want[1])
Пример #37
0
    def test_affinity_propagation(self):
        # The accessor's affinity_propagation on a covariance matrix returns
        # two elements: an array and an index-aligned ModelSeries.
        iris = datasets.load_iris()
        cov_matrix = np.cov(iris.data)
        mf = pdml.ModelFrame(cov_matrix)

        actual = mf.cluster.affinity_propagation()
        reference = cluster.affinity_propagation(cov_matrix)

        self.assertEqual(len(actual), 2)
        self.assert_numpy_array_almost_equal(actual[0], reference[0])

        series_part = actual[1]
        self.assertIsInstance(series_part, pdml.ModelSeries)
        tm.assert_index_equal(series_part.index, mf.index)
        tm.assert_numpy_array_equal(series_part.values, reference[1])
Пример #38
0
    def test_lasso_stability_path(self):
        # lasso_stability_path via the accessor returns a pair: an array and a
        # ModelFrame whose index is the data columns.
        diabetes = datasets.load_diabetes()
        frame = pdml.ModelFrame(diabetes)

        seed = self.random_state
        got = frame.linear_model.lasso_stability_path(random_state=seed)
        want = lm.lasso_stability_path(diabetes.data, diabetes.target,
                                       random_state=seed)

        self.assertEqual(len(got), 2)
        tm.assert_numpy_array_equal(got[0], want[0])

        second = got[1]
        self.assertIsInstance(second, pdml.ModelFrame)
        tm.assert_index_equal(second.index, frame.data.columns)
        tm.assert_numpy_array_equal(second.values, want[1])
Пример #39
0
    def test_lars_path(self):
        # lars_path via the accessor returns three elements: an array, a list
        # and a ModelFrame whose index is the data columns.
        diabetes = datasets.load_diabetes()
        frame = pdml.ModelFrame(diabetes)

        got = frame.linear_model.lars_path()
        want = lm.lars_path(diabetes.data, diabetes.target)

        self.assertEqual(len(got), 3)
        tm.assert_numpy_array_equal(got[0], want[0])

        second = got[1]
        self.assertEqual(second, want[1])
        self.assertIsInstance(second, list)

        third = got[2]
        self.assertIsInstance(third, pdml.ModelFrame)
        tm.assert_index_equal(third.index, frame.data.columns)
        tm.assert_numpy_array_equal(third.values, want[2])
Пример #40
0
    def test_iterate(self):
        # model_selection.iterate is expected to emit a FutureWarning and
        # still yield ModelFrame pairs covering every row of the frame.
        frame = pdml.ModelFrame(datasets.load_iris())
        folds = frame.model_selection.KFold(4, random_state=self.random_state)

        with tm.assert_produces_warning(FutureWarning):
            pairs = frame.model_selection.iterate(folds)

        for train_part, test_part in pairs:
            parts = (train_part, test_part)
            for part in parts:
                self.assertIsInstance(part, pdml.ModelFrame)
            for part in parts:
                tm.assert_index_equal(frame.columns, part.columns)

            total_rows = len(train_part) + len(test_part)
            self.assertEqual(len(frame), total_rows)
Пример #41
0
    def test_LabelEncoder_frame(self):
        """Exercise ``LabelEncoder`` on a single-column ``ModelFrame``.

        Both ``fit`` + ``transform`` and ``fit_transform`` must return a
        ``ModelFrame`` holding the codes ``[0, 1, 2, 0]`` as one column while
        keeping the original index and columns. ``inverse_transform`` must
        give back a frame equal to the original.
        """
        labels = np.array(['X', 'Y', 'Z', 'X'])
        frame = pdml.ModelFrame(labels, index=['a', 'b', 'c', 'd'], columns=['A'])
        codes = np.array([[0], [1], [2], [0]])

        def check_encoded(encoded):
            # Shared expectations for both encoding paths.
            self.assertIsInstance(encoded, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(encoded.values, codes)
            tm.assert_index_equal(encoded.columns, frame.columns)
            tm.assert_index_equal(encoded.index, frame.index)

        # Path 1: separate fit and transform calls.
        encoder = frame.pp.LabelEncoder()
        frame.fit(encoder)
        check_encoded(frame.transform(encoder))

        # Path 2: fit_transform with a fresh encoder.
        encoder = frame.pp.LabelEncoder()
        refit = frame.fit_transform(encoder)
        check_encoded(refit)

        restored = refit.inverse_transform(encoder)
        self.assertIsInstance(restored, pdml.ModelFrame)
        tm.assert_frame_equal(restored, frame)
Пример #42
0
    def test_iterate(self):
        """Check ``model_selection.iterate`` with a 4-way ``KFold``.

        Calling ``iterate`` must emit a ``FutureWarning``. Each yielded
        pair must consist of ``ModelFrame`` objects that keep the original
        columns and whose row counts add up to the row count of the source
        frame.
        """
        df = pdml.ModelFrame(datasets.load_iris())
        kf = df.model_selection.KFold(4, random_state=self.random_state)

        with tm.assert_produces_warning(FutureWarning):
            gen = df.model_selection.iterate(kf)

        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # assertTrue(a, b) would treat ``b`` as the failure message and
            # always pass; compare row counts (shape[0]) with assertEqual.
            self.assertEqual(df.shape[0], train_df.shape[0] + test_df.shape[0])
Пример #43
0
    def test_frame_target_object_set(self):
        """Assign a target to a ``ModelFrame`` whose columns are datetimes.

        The first assignment uses a Series named ``5``; that name becomes
        the leading column. A second assignment with a Series named ``'X'``
        replaces the values, but the target is still expected to be named
        ``5``.
        """
        dates = [datetime.datetime(2014, 1, 1),
                 datetime.datetime(2015, 1, 1),
                 datetime.datetime(2016, 1, 1)]
        rows = ['a', 'b', 'c']
        raw = pd.DataFrame({dates[0]: [1, 2, 3],
                            dates[1]: [4, 5, 6],
                            dates[2]: [7, 8, 9]},
                           index=rows)
        mdf = pdml.ModelFrame(raw)

        def check_state(target_values):
            # Structure is expected to be identical after either assignment.
            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 4))
            tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
            tm.assert_index_equal(mdf.columns, pd.Index([5] + dates))
            tm.assert_frame_equal(mdf.data, raw)
            wanted = pd.Series(target_values, index=['a', 'b', 'c'], name=5)
            tm.assert_series_equal(mdf.target, wanted)
            self.assertEqual(mdf.target.name, 5)

        mdf.target = pd.Series(['A', 'B', 'C'], index=rows, name=5)
        check_state(['A', 'B', 'C'])

        # The incoming name is ignored once the frame already has a target.
        mdf.target = pd.Series([10, 11, 12], index=rows, name='X')
        check_state([10, 11, 12])
Пример #44
0
    def test_lasso_stability_path(self):
        """Compare the ``lasso_stability_path`` accessor with ``lm``'s function.

        The accessor result must have two elements: the first equals the
        first element of the reference result, and the second is a
        ``ModelFrame`` indexed by the data columns whose values equal the
        second element of the reference result.
        """
        dataset = datasets.load_diabetes()
        frame = pdml.ModelFrame(dataset)

        # ``self.random_state`` is read once per call, as in the reference.
        actual = frame.linear_model.lasso_stability_path(
            random_state=self.random_state)
        reference = lm.lasso_stability_path(
            dataset.data, dataset.target, random_state=self.random_state)

        self.assertEqual(len(actual), 2)
        tm.assert_numpy_array_equal(actual[0], reference[0])

        scores = actual[1]
        self.assertIsInstance(scores, pdml.ModelFrame)
        tm.assert_index_equal(scores.index, frame.data.columns)
        tm.assert_numpy_array_equal(scores.values, reference[1])
Пример #45
0
    def test_frame_init_df_series(self):
        """Build a ``ModelFrame`` from a DataFrame plus a target Series.

        A named Series with a matching index becomes the first column under
        its own name (checked for ``'.target'`` and ``'XXX'``). A Series
        built without the matching index must raise ``ValueError``.
        """
        labels = ['a', 'b', 'c']
        frame = pd.DataFrame({'A': [1, 2, 3],
                              'B': [4, 5, 6],
                              'C': [7, 8, 9]},
                             index=labels,
                             columns=['A', 'B', 'C'])

        def check_named_target(name):
            # Same expectations for every target name under test.
            target = pd.Series([1, 2, 3], index=['a', 'b', 'c'], name=name)
            mdf = pdml.ModelFrame(frame, target=target)

            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 4))
            tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
            tm.assert_index_equal(mdf.columns, pd.Index([name, 'A', 'B', 'C']))
            tm.assert_frame_equal(mdf.data, frame)
            tm.assert_series_equal(mdf.target, target)
            self.assertEqual(mdf.target.name, name)
            self.assertEqual(mdf.target_name, name)

        check_named_target('.target')

        # Default integer index does not match the frame's 'a'/'b'/'c' labels.
        unaligned = pd.Series([1, 2, 3])
        with self.assertRaisesRegexp(ValueError, 'data and target must have equal index'):
            pdml.ModelFrame(frame, target=unaligned)

        check_named_target('XXX')
Пример #46
0
    def test_lars_path(self):
        """Compare the ``lars_path`` accessor with ``lm.lars_path``.

        The accessor result must have three elements: the first equals the
        reference array, the second is a ``list`` equal to the reference,
        and the third is a ``ModelFrame`` indexed by the data columns whose
        values equal the third reference element.
        """
        dataset = datasets.load_diabetes()
        frame = pdml.ModelFrame(dataset)

        actual = frame.linear_model.lars_path()
        reference = lm.lars_path(dataset.data, dataset.target)

        self.assertEqual(len(actual), 3)
        tm.assert_numpy_array_equal(actual[0], reference[0])

        second = actual[1]
        self.assertEqual(second, reference[1])
        self.assertIsInstance(second, list)

        third = actual[2]
        self.assertIsInstance(third, pdml.ModelFrame)
        tm.assert_index_equal(third.index, frame.data.columns)
        tm.assert_numpy_array_equal(third.values, reference[2])
Пример #47
0
    def test_LabelEncoder_frame(self):
        """Exercise ``LabelEncoder`` on a single-column ``ModelFrame``.

        Both ``fit`` + ``transform`` and ``fit_transform`` must return a
        ``ModelFrame`` holding the codes ``[0, 1, 2, 0]`` as one column while
        keeping the original index and columns. ``inverse_transform`` must
        give back a frame equal to the original.
        """
        labels = np.array(['X', 'Y', 'Z', 'X'])
        frame = pdml.ModelFrame(labels, index=['a', 'b', 'c', 'd'], columns=['A'])
        codes = np.array([[0], [1], [2], [0]])

        def check_encoded(encoded):
            # Shared expectations for both encoding paths.
            self.assertIsInstance(encoded, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(encoded.values, codes)
            tm.assert_index_equal(encoded.columns, frame.columns)
            tm.assert_index_equal(encoded.index, frame.index)

        # Path 1: separate fit and transform calls.
        encoder = frame.pp.LabelEncoder()
        frame.fit(encoder)
        check_encoded(frame.transform(encoder))

        # Path 2: fit_transform with a fresh encoder.
        encoder = frame.pp.LabelEncoder()
        refit = frame.fit_transform(encoder)
        check_encoded(refit)

        restored = refit.inverse_transform(encoder)
        self.assertIsInstance(restored, pdml.ModelFrame)
        tm.assert_frame_equal(restored, frame)
Пример #48
0
    def test_frame_init_dict_list_series_index(self):
        """Build a ``ModelFrame`` from a dict of lists and an indexed Series.

        The target Series supplies the row labels; the resulting frame must
        carry the target (named ``'X'``) as the first column, followed by
        the dict's columns.
        """
        columns = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        labels = ['a', 'b', 'c']
        target = pd.Series([9, 8, 7], name='X', index=labels)

        mdf = pdml.ModelFrame(columns, target=target)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

        # Data part should equal the dict materialised with the target's index.
        tm.assert_frame_equal(mdf.data,
                              pd.DataFrame(columns, index=['a', 'b', 'c']))
        tm.assert_series_equal(mdf.target, target)

        for observed_name in (mdf.target.name, mdf.target_name):
            self.assertEqual(observed_name, 'X')
Пример #49
0
    def test_frame_init_dict_list_series_index(self):
        """Build a ``ModelFrame`` from a dict of lists and an indexed Series.

        The target Series supplies the row labels; the resulting frame must
        carry the target (named ``'X'``) as the first column, followed by
        the dict's columns.
        """
        columns = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        labels = ['a', 'b', 'c']
        target = pd.Series([9, 8, 7], name='X', index=labels)

        mdf = pdml.ModelFrame(columns, target=target)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

        # Data part should equal the dict materialised with the target's index.
        tm.assert_frame_equal(mdf.data,
                              pd.DataFrame(columns, index=['a', 'b', 'c']))
        tm.assert_series_equal(mdf.target, target)

        for observed_name in (mdf.target.name, mdf.target_name):
            self.assertEqual(observed_name, 'X')
Пример #50
0
    def test_MDS(self):
        """Compare ``fit_transform`` of the ``MDS`` accessor with ``manifold.MDS``.

        The result must be a ``ModelFrame`` that keeps the source index and
        whose data values almost equal the reference embedding.
        """
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        for name in ('MDS',):
            # Look up the estimator by name on the accessor and on the module.
            wrapped = getattr(frame.manifold, name)(random_state=self.random_state)
            plain = getattr(manifold, name)(random_state=self.random_state)

            embedded = frame.fit_transform(wrapped)
            reference = plain.fit_transform(iris.data)

            self.assertIsInstance(embedded, pdml.ModelFrame)
            tm.assert_index_equal(embedded.index, frame.index)
            self.assert_numpy_array_almost_equal(embedded.data.values, reference)
Пример #51
0
    def test_k_means(self):
        """Compare the ``k_means`` accessor with ``cluster.k_means``.

        The accessor result must have three elements: the first almost
        equals the reference array, the second is a ``ModelSeries`` sharing
        the frame's index with values equal to the reference, and the third
        almost equals the third reference element.
        """
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        actual = frame.cluster.k_means(3, random_state=self.random_state)
        reference = cluster.k_means(iris.data, 3, random_state=self.random_state)

        self.assertEqual(len(actual), 3)
        self.assert_numpy_array_almost_equal(actual[0], reference[0])

        second = actual[1]
        self.assertIsInstance(second, pdml.ModelSeries)
        tm.assert_index_equal(second.index, frame.index)
        tm.assert_numpy_array_equal(second.values, reference[1])

        self.assertAlmostEqual(actual[2], reference[2])
Пример #52
0
    def test_oas(self):
        """Compare the ``oas`` accessor with ``covariance.oas``.

        The accessor result must have two elements: a ``ModelFrame`` whose
        index and columns both equal the data columns and whose values almost
        equal the first reference element, and a second element that almost
        equals the second reference element.
        """
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        actual = frame.covariance.oas()
        reference = covariance.oas(iris.data)

        self.assertEqual(len(actual), 2)

        first = actual[0]
        self.assertIsInstance(first, pdml.ModelFrame)
        # Both axes are expected to be labelled by the data columns.
        tm.assert_index_equal(first.index, frame.data.columns)
        tm.assert_index_equal(first.columns, frame.data.columns)
        self.assert_numpy_array_almost_equal(first.values, reference[0])

        self.assert_numpy_array_almost_equal(actual[1], reference[1])
Пример #53
0
    def test_oas(self):
        """Compare the ``oas`` accessor with ``covariance.oas``.

        The accessor result must have two elements: a ``ModelFrame`` whose
        index and columns both equal the data columns and whose values almost
        equal the first reference element, and a second element that almost
        equals the second reference element.
        """
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        actual = frame.covariance.oas()
        reference = covariance.oas(iris.data)

        self.assertEqual(len(actual), 2)

        first = actual[0]
        self.assertIsInstance(first, pdml.ModelFrame)
        # Both axes are expected to be labelled by the data columns.
        tm.assert_index_equal(first.index, frame.data.columns)
        tm.assert_index_equal(first.columns, frame.data.columns)
        self.assert_numpy_array_almost_equal(first.values, reference[0])

        self.assert_numpy_array_almost_equal(actual[1], reference[1])
Пример #54
0
    def test_k_means(self):
        """Compare the ``k_means`` accessor with ``cluster.k_means``.

        The accessor result must have three elements: the first almost
        equals the reference array, the second is a ``ModelSeries`` sharing
        the frame's index with values equal to the reference, and the third
        almost equals the third reference element.
        """
        iris = datasets.load_iris()
        frame = pdml.ModelFrame(iris)

        actual = frame.cluster.k_means(3, random_state=self.random_state)
        reference = cluster.k_means(iris.data, 3, random_state=self.random_state)

        self.assertEqual(len(actual), 3)
        self.assert_numpy_array_almost_equal(actual[0], reference[0])

        second = actual[1]
        self.assertIsInstance(second, pdml.ModelSeries)
        tm.assert_index_equal(second.index, frame.index)
        tm.assert_numpy_array_equal(second.values, reference[1])

        self.assertAlmostEqual(actual[2], reference[2])
Пример #55
0
    def _assert_fit_transform(self, df, exp_data, model1, model2):
        """Assert ``df.fit_transform(model1)`` matches ``model2.fit_transform(exp_data)``.

        The result must be a ``ModelFrame`` whose data values almost equal
        the reference output and whose index and columns equal those of
        ``df``. If ``df`` has a target, the result's target must equal it;
        otherwise the result's target must be ``None``.
        """
        transformed = df.fit_transform(model1)
        reference = model2.fit_transform(exp_data)

        self.assertIsInstance(transformed, pdml.ModelFrame)

        if not df.has_target():
            self.assertIsNone(transformed.target)
        else:
            # The target must pass through the transformation untouched.
            tm.assert_series_equal(df.target, transformed.target)

        self.assert_numpy_array_almost_equal(transformed.data.values, reference)

        # Row and column labels are expected to be preserved.
        tm.assert_index_equal(transformed.index, df.index)
        tm.assert_index_equal(transformed.columns, df.columns)
Пример #56
0
    def test_spectral_embedding(self):
        """Compare the ``spectral_embedding`` accessor with ``manifold``'s function.

        A random symmetric ``N x N`` matrix is wrapped in a ``ModelFrame``.
        The accessor result must be a ``ModelFrame`` that keeps the frame's
        index and whose absolute data values almost equal the absolute
        values of the reference embedding.
        """
        N = 10
        # ``np.random.random_integers`` is deprecated; ``randint`` has an
        # exclusive upper bound, so 201 keeps the inclusive 50..200 range.
        m = np.random.randint(50, 201, size=(N, N))
        # Average with the transpose to make the matrix symmetric.
        m = (m + m.T) / 2

        df = pdml.ModelFrame(m)
        self.assert_numpy_array_almost_equal(df.data.values, m)

        result = df.manifold.spectral_embedding(random_state=self.random_state)
        expected = manifold.spectral_embedding(m, random_state=self.random_state)

        self.assertIsInstance(result, pdml.ModelFrame)
        tm.assert_index_equal(result.index, df.index)
        # signs can be inversed
        self.assert_numpy_array_almost_equal(np.abs(result.data.values),
                                             np.abs(expected))