def test_StratifiedShuffleSplit(self):
    """Compare the ModelFrame StratifiedShuffleSplit accessor with ``ms``.

    Both splitters are seeded with ``self.random_state`` and every
    (train, test) index pair they yield must be identical.  The
    deprecated ``iterate`` helper must then emit a ``FutureWarning`` and
    yield ModelFrame pairs that keep the original columns.
    """
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)

    # consume generator
    ind1 = list(sf1.split(df.data.values, df.target.values))
    ind2 = list(sf2.split(iris.data, iris.target))

    for i1, i2 in zip(ind1, ind2):
        self.assertIsInstance(i1, tuple)
        self.assertEqual(len(i1), 2)
        self.assertIsInstance(i2, tuple)
        self.assertEqual(len(i2), 2)
        # Train indices are checked against the reference splitter; the
        # previous code compared i1[0] with itself, which can never fail.
        tm.assert_numpy_array_equal(i1[0], i2[0])
        tm.assert_numpy_array_equal(i1[1], i2[1])

    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    # NOTE(review): iteration is kept inside the context so the warning is
    # captured whether ``iterate`` warns eagerly or on first iteration.
    with tm.assert_produces_warning(FutureWarning):
        gen = df.model_selection.iterate(sf1)
        # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)
            # assertTrue(a, b) treats ``b`` as the failure message and so
            # never compared anything; compare row counts explicitly.
            self.assertEqual(len(df), len(train_df) + len(test_df))
def test_inverse_transform(self):
    """Check ``transform`` / ``inverse_transform`` through a ModelFrame.

    For each listed decomposition model the frame-based calls must return
    ModelFrames that keep the target and match the values produced by the
    same estimator taken from ``decomposition`` directly.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    for name in ['PCA']:
        wrapped = getattr(frame.decomposition, name)()
        reference = getattr(decomposition, name)()

        frame.fit(wrapped)
        reference.fit(iris.data, iris.target)

        # Forward transform.
        transformed = frame.transform(wrapped)
        wanted = reference.transform(iris.data)
        self.assertIsInstance(transformed, pdml.ModelFrame)
        tm.assert_series_equal(frame.target, transformed.target)
        self.assert_numpy_array_almost_equal(transformed.data.values,
                                             wanted)

        # Inverse transform, applied to the original data on both sides.
        restored = frame.inverse_transform(wrapped)
        wanted = reference.inverse_transform(iris.data)
        self.assertIsInstance(restored, pdml.ModelFrame)
        tm.assert_series_equal(frame.target, restored.target)
        self.assert_numpy_array_almost_equal(restored.data.values, wanted)
        tm.assert_index_equal(restored.columns, frame.columns)
def test_preprocessing_assignment(self):
    """Assign binarized columns back into a ModelFrame.

    Covers a single column and a list of columns; in both cases the data
    values must match ``pp.binarize`` applied to the raw iris array and
    the data column labels must be unchanged.
    """
    first_col = 'sepal length (cm)'

    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)
    original_columns = frame.data.columns

    # Single-column assignment.
    frame[first_col] = frame[first_col].preprocessing.binarize(threshold=6)
    self.assertIsInstance(frame, pdml.ModelFrame)
    binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6)
    wanted = np.hstack([binarized.T, iris.data[:, 1:]])
    self.assert_numpy_array_almost_equal(frame.data.values, wanted)
    tm.assert_index_equal(frame.data.columns, original_columns)

    # recreate data
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    # Multi-column assignment.
    target_columns = [first_col, 'sepal width (cm)']
    frame[target_columns] = frame[target_columns].preprocessing.binarize(
        threshold=6)
    self.assertIsInstance(frame, pdml.ModelFrame)
    binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
    wanted = np.hstack([binarized, iris.data[:, 2:]])
    self.assert_numpy_array_almost_equal(frame.data.values, wanted)
    tm.assert_index_equal(frame.data.columns, original_columns)
def test_binarize(self):
    """Check ``preprocessing.binarize`` on a ModelFrame and a ModelSeries.

    Each call (default threshold, then an explicit one) is compared with
    ``pp.binarize`` applied to the raw iris array.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    # Whole frame: default threshold, then threshold=5.
    for kwargs in ({}, {'threshold': 5}):
        actual = frame.preprocessing.binarize(**kwargs)
        wanted = pp.binarize(iris.data, **kwargs)
        self.assertIsInstance(actual, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(actual.data.values, wanted)
        tm.assert_index_equal(actual.columns, frame.data.columns)

    series = frame['sepal length (cm)']
    self.assertIsInstance(series, pdml.ModelSeries)

    # Single column: default threshold, then threshold=6.
    for kwargs in ({}, {'threshold': 6}):
        actual = series.preprocessing.binarize(**kwargs)
        wanted = pp.binarize(iris.data[:, 0].reshape(-1, 1), **kwargs)
        self.assertIsInstance(actual, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(actual.values,
                                             wanted.flatten())
        self.assertEqual(actual.name, 'sepal length (cm)')
def test_patsy_matrices(self):
    """Transform a ModelFrame with the formula string ``'A ~ B + C'``.

    The result must be a ModelFrame with columns A, Intercept, B, C (all
    float) whose target is column ``A``.
    """
    labels = ['a', 'b', 'c']
    raw = pd.DataFrame({'A': [1, 2, 3],
                        'B': [4, 5, 6],
                        'C': [7, 8, 9]},
                       index=labels,
                       columns=['A', 'B', 'C'])
    target = pd.Series([10, 11, 12], index=labels)
    mdf = pdml.ModelFrame(raw, target=target)

    result = mdf.transform('A ~ B + C')
    self.assertIsInstance(result, pdml.ModelFrame)
    self.assertEqual(result.shape, (3, 4))
    tm.assert_index_equal(result.index, pd.Index(['a', 'b', 'c']))
    tm.assert_index_equal(result.columns,
                          pd.Index(['A', 'Intercept', 'B', 'C']))

    expected_frame = pd.DataFrame({'A': [1, 2, 3],
                                   'Intercept': [1, 1, 1],
                                   'B': [4, 5, 6],
                                   'C': [7, 8, 9]},
                                  index=labels,
                                  columns=['A', 'Intercept', 'B', 'C'],
                                  dtype=float)
    tm.assert_frame_equal(result, expected_frame)

    expected_target = pd.Series([1, 2, 3], index=labels, name='A',
                                dtype=float)
    tm.assert_series_equal(result.target, expected_target)
    self.assertEqual(result.target.name, 'A')
    self.assertEqual(result.target_name, 'A')
def test_GaussianProcess_ge_018(self):
    """Predict with a GaussianProcessRegressor via ModelFrame.

    The same kernel and regressor are built through the ``df.gp``
    accessor and through ``gp`` directly, fitted on identical data, and
    their predictions on a dense grid are compared.
    """
    X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
    y = np.sin(X).ravel()
    df = pdml.ModelFrame(X, target=y)

    def build(namespace):
        # Same kernel/regressor configuration for both namespaces.
        kernel = (namespace.kernels.ConstantKernel(1.0, (1e-3, 1e3)) *
                  namespace.kernels.RBF(10, (1e-2, 1e2)))
        return namespace.GaussianProcessRegressor(
            kernel=kernel, n_restarts_optimizer=9,
            random_state=self.random_state)

    g1 = build(df.gp)
    g2 = build(gp)

    g1.fit(X, y)
    g2.fit(X, y)

    grid = np.atleast_2d(np.linspace(0, 10, 1000)).T
    tdf = pdml.ModelFrame(grid)

    y_result = tdf.predict(g1)
    y_expected = g2.predict(grid)

    self.assertIsInstance(y_result, pdml.ModelSeries)
    tm.assert_index_equal(y_result.index, tdf.index)
    self.assert_numpy_array_almost_equal(y_result, y_expected)
def test_binarize(self):
    """Compare frame- and series-level ``binarize`` with ``pp.binarize``."""
    column = 'sepal length (cm)'
    iris = datasets.load_iris()
    mdf = pdml.ModelFrame(iris)

    def check_frame(got, want):
        # Frame results keep the data column labels.
        self.assertIsInstance(got, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(got.data.values, want)
        tm.assert_index_equal(got.columns, mdf.data.columns)

    def check_series(got, want):
        # Series results keep the source column name.
        self.assertIsInstance(got, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(got.values, want.flatten())
        self.assertEqual(got.name, 'sepal length (cm)')

    got = mdf.preprocessing.binarize()
    check_frame(got, pp.binarize(iris.data))

    got = mdf.preprocessing.binarize(threshold=5)
    check_frame(got, pp.binarize(iris.data, threshold=5))

    s = mdf[column]
    self.assertIsInstance(s, pdml.ModelSeries)

    got = s.preprocessing.binarize()
    check_series(got, pp.binarize(iris.data[:, 0].reshape(-1, 1)))

    got = s.preprocessing.binarize(threshold=6)
    check_series(got, pp.binarize(iris.data[:, 0].reshape(-1, 1),
                                  threshold=6))
def test_add_dummy_feature(self):
    """Check ``preprocessing.add_dummy_feature`` on frame and series.

    Results are compared with ``pp.add_dummy_feature`` on the raw iris
    array; the original column labels must follow the added column.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    # Default dummy value.
    actual = frame.preprocessing.add_dummy_feature()
    wanted = pp.add_dummy_feature(iris.data)
    self.assertIsInstance(actual, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(actual.data.values, wanted)

    # Explicit dummy value.
    actual = frame.preprocessing.add_dummy_feature(value=2)
    wanted = pp.add_dummy_feature(iris.data, value=2)
    self.assertIsInstance(actual, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(actual.data.values, wanted)
    tm.assert_index_equal(actual.columns[1:], frame.data.columns)

    # A single column produces a two-column ModelFrame.
    series = frame['sepal length (cm)']
    self.assertIsInstance(series, pdml.ModelSeries)
    actual = series.preprocessing.add_dummy_feature()
    wanted = pp.add_dummy_feature(iris.data[:, [0]])
    self.assertIsInstance(actual, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(actual.values, wanted)
    self.assertEqual(actual.columns[1], 'sepal length (cm)')
def test_sample_ensemble(self):
    """Resample with imblearn ensemble samplers through a ModelFrame.

    For every sampler, ``fit_resample`` and ``fit_sample`` called on the
    frame must return a list of ModelFrames with the original columns.
    """
    from imblearn.ensemble import BalanceCascade, EasyEnsemble

    X = np.random.randn(100, 5)
    y = np.array([0, 1]).repeat([80, 20])
    df = pdml.ModelFrame(X, target=y, columns=list('ABCDE'))

    def check(frames):
        self.assertIsInstance(frames, list)
        for frame in frames:
            self.assertIsInstance(frame, pdml.ModelFrame)
            tm.assert_index_equal(frame.columns, df.columns)

    for sampler_cls in [BalanceCascade, EasyEnsemble]:
        mod1 = sampler_cls(random_state=self.random_state)
        mod2 = sampler_cls(random_state=self.random_state)

        df.fit(mod1)
        mod2.fit(X, y)

        results = df.fit_resample(mod1)
        # Reference call is exercised, but its output is not compared.
        expected_X, expected_y = mod2.fit_resample(X, y)
        check(results)

        mod1 = sampler_cls(random_state=self.random_state)
        mod2 = sampler_cls(random_state=self.random_state)

        results = df.fit_sample(mod1)
        expected_X, expected_y = mod2.fit_sample(X, y)
        check(results)
def test_CCA_PLSCannonical(self, algo):
    """Fit and transform a multi-target cross-decomposition model.

    ``algo`` names the estimator looked up on both
    ``train.cross_decomposition`` and ``cd``.  Only the first component
    of weights and scores is compared between the two.
    """
    n = 500
    half = n // 2

    with tm.RNGContext(1):
        # 2 latents vars:
        l1 = np.random.normal(size=n)
        l2 = np.random.normal(size=n)

        latents = np.array([l1, l1, l2, l2]).T
        X = latents + np.random.normal(size=4 * n).reshape((n, 4))
        Y = latents + np.random.normal(size=4 * n).reshape((n, 4))

    X_train, Y_train = X[:half], Y[:half]
    X_test, Y_test = X[half:], Y[half:]

    train = pdml.ModelFrame(X_train, target=Y_train)
    test = pdml.ModelFrame(X_test, target=Y_test)

    # check multi target columns
    self.assertTrue(train.has_target())
    tm.assert_numpy_array_equal(train.data.values, X_train)
    tm.assert_numpy_array_equal(train.target.values, Y_train)
    tm.assert_numpy_array_equal(test.data.values, X_test)
    tm.assert_numpy_array_equal(test.target.values, Y_test)

    expected = pd.MultiIndex.from_tuples([('.target', 0), ('.target', 1),
                                          ('.target', 2), ('.target', 3)])
    tm.assert_index_equal(train.target_name, expected)

    self.assertEqual(train.data.shape, X_train.shape)
    self.assertEqual(train.target.shape, Y_train.shape)

    mod1 = getattr(train.cross_decomposition, algo)(n_components=2)
    mod2 = getattr(cd, algo)(n_components=2)

    train.fit(mod1)
    mod2.fit(X_train, Y_train)

    # 2nd cols are different on travis-CI
    self.assert_numpy_array_almost_equal(mod1.x_weights_[:, 0],
                                         mod2.x_weights_[:, 0])
    self.assert_numpy_array_almost_equal(mod1.y_weights_[:, 0],
                                         mod2.y_weights_[:, 0])

    result_tr = train.transform(mod1)
    result_test = test.transform(mod1)

    expected_tr = mod2.transform(X_train, Y_train)
    expected_test = mod2.transform(X_test, Y_test)

    self.assertIsInstance(result_tr, pdml.ModelFrame)
    self.assertIsInstance(result_test, pdml.ModelFrame)

    # Compare the first score column of data and target for each split.
    comparisons = [
        (result_tr.data.values, expected_tr[0]),
        (result_tr.target.values, expected_tr[1]),
        (result_test.data.values, expected_test[0]),
        (result_test.target.values, expected_test[1]),
    ]
    for got, want in comparisons:
        self.assert_numpy_array_almost_equal(got[:, 0], want[:, 0])
def test_silhouette_samples(self):
    """Compare ``df.metrics.silhouette_samples`` with the reference call.

    The result must be a ModelSeries indexed like ``self.df`` whose
    values match ``metrics.silhouette_samples(self.data, self.pred)``.
    """
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)), and matches the rest of the suite.
    self.assertIsInstance(result, pdml.ModelSeries)
    tm.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
def test_StratifiedShuffleSplit(self):
    """Compare the ModelFrame StratifiedShuffleSplit accessor with ``ms``.

    Both splitters use ``self.random_state`` and must yield identical
    (train, test) index pairs.  The deprecated ``iterate`` helper must
    emit a ``FutureWarning`` and yield ModelFrame pairs whose row counts
    add up to the full frame.
    """
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    sf2 = ms.StratifiedShuffleSplit(random_state=self.random_state)

    # consume generator
    ind1 = list(sf1.split(df.data.values, df.target.values))
    ind2 = list(sf2.split(iris.data, iris.target))

    for i1, i2 in zip(ind1, ind2):
        self.assertIsInstance(i1, tuple)
        self.assertEqual(len(i1), 2)
        self.assertIsInstance(i2, tuple)
        self.assertEqual(len(i2), 2)
        # Compare against the reference splitter; the previous code
        # compared i1[0] with itself, so train indices were never checked.
        tm.assert_numpy_array_equal(i1[0], i2[0])
        tm.assert_numpy_array_equal(i1[1], i2[1])

    sf1 = df.model_selection.StratifiedShuffleSplit(
        random_state=self.random_state)
    # NOTE(review): iteration is kept inside the context so the warning is
    # captured whether ``iterate`` warns eagerly or on first iteration.
    with tm.assert_produces_warning(FutureWarning):
        gen = df.model_selection.iterate(sf1)
        # StratifiedShuffleSplit is not a subclass of BaseCrossValidator
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)
            self.assertEqual(len(df), len(train_df) + len(test_df))
def test_sample_ensemble(self):
    """Sample with imblearn ensemble samplers through a ModelFrame.

    For every sampler, ``sample`` (after an explicit fit) and
    ``fit_sample`` called on the frame must return a list of ModelFrames
    with the original columns.
    """
    from imblearn.ensemble import BalanceCascade, EasyEnsemble

    X = np.random.randn(100, 5)
    y = np.array([0, 1]).repeat([80, 20])
    df = pdml.ModelFrame(X, target=y, columns=list('ABCDE'))

    def check(frames):
        self.assertIsInstance(frames, list)
        for frame in frames:
            self.assertIsInstance(frame, pdml.ModelFrame)
            tm.assert_index_equal(frame.columns, df.columns)

    for sampler_cls in [BalanceCascade, EasyEnsemble]:
        mod1 = sampler_cls(random_state=self.random_state)
        mod2 = sampler_cls(random_state=self.random_state)

        df.fit(mod1)
        mod2.fit(X, y)

        results = df.sample(mod1)
        # Reference call is exercised, but its output is not compared.
        expected_X, expected_y = mod2.sample(X, y)
        check(results)

        mod1 = sampler_cls(random_state=self.random_state)
        mod2 = sampler_cls(random_state=self.random_state)

        results = df.fit_sample(mod1)
        expected_X, expected_y = mod2.fit_sample(X, y)
        check(results)
def test_add_dummy_feature(self):
    """Compare ``add_dummy_feature`` via ModelFrame/ModelSeries with ``pp``."""
    column = 'sepal length (cm)'
    iris = datasets.load_iris()
    mdf = pdml.ModelFrame(iris)

    got = mdf.preprocessing.add_dummy_feature()
    want = pp.add_dummy_feature(iris.data)
    self.assertIsInstance(got, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(got.data.values, want)

    got = mdf.preprocessing.add_dummy_feature(value=2)
    want = pp.add_dummy_feature(iris.data, value=2)
    self.assertIsInstance(got, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(got.data.values, want)
    # Everything after the first column keeps the original labels.
    tm.assert_index_equal(got.columns[1:], mdf.data.columns)

    s = mdf[column]
    self.assertIsInstance(s, pdml.ModelSeries)

    got = s.preprocessing.add_dummy_feature()
    want = pp.add_dummy_feature(iris.data[:, [0]])
    self.assertIsInstance(got, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(got.values, want)
    self.assertEqual(got.columns[1], 'sepal length (cm)')
def test_silhouette_samples(self):
    """Check the per-sample silhouette values returned via ``self.df``.

    Expects a ModelSeries sharing the frame's index, with values equal to
    ``metrics.silhouette_samples(self.data, self.pred)``.
    """
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    # Use assertIsInstance (as the other tests do) so a failure reports
    # the offending type instead of "False is not true".
    self.assertIsInstance(result, pdml.ModelSeries)
    tm.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
def test_anes96(self):
    """Build a ModelFrame from the anes96 dataset and check its layout.

    The dataset is loaded through the loader named by
    ``self.load_method``.
    """
    loader = getattr(sm.datasets.anes96, self.load_method)
    data = loader()
    frame = pdml.ModelFrame(data)

    self.assertEqual(frame.shape, (944, 6))
    self.assertEqual(frame.target_name, 'PID')
    expected_columns = pd.Index(data.exog_name)
    tm.assert_index_equal(frame.data.columns, expected_columns)
def test_preprocessing_normalize(self):
    """Check ``ModelSeries.preprocessing.normalize`` against ``pp.normalize``.

    The series values are normalized as a single row; the result must be
    a ModelSeries with the original index.
    """
    s = pdml.ModelSeries([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
    self.assertIsInstance(s, pdml.ModelSeries)

    result = s.preprocessing.normalize()
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from recent NumPy releases; ``np.float64`` is the same dtype.
    row = np.atleast_2d(s.values.astype(np.float64))
    expected = pp.normalize(row)[0, :]

    self.assertIsInstance(result, pdml.ModelSeries)
    self.assert_numpy_array_almost_equal(result.values, expected)
    tm.assert_index_equal(result.index, s.index)
def test_empirical_covariance(self):
    """Check ``covariance.empirical_covariance`` through a ModelFrame.

    The result must be a ModelFrame labelled on both axes by the data
    columns, with values matching the reference function.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)
    feature_names = frame.data.columns

    actual = frame.covariance.empirical_covariance()
    wanted = covariance.empirical_covariance(iris.data)

    self.assertIsInstance(actual, pdml.ModelFrame)
    tm.assert_index_equal(actual.index, feature_names)
    tm.assert_index_equal(actual.columns, feature_names)
    self.assert_numpy_array_almost_equal(actual.values, wanted)
def test_isotonic_regression(self):
    """Check ``isotonic.isotonic_regression`` through a ModelFrame.

    The target is a cumulative sum of absolute random values; the result
    must be a ModelSeries matching the reference function exactly.
    """
    values = np.abs(np.random.randn(100)).cumsum()
    frame = pdml.ModelFrame(np.arange(len(values)), target=values)

    actual = frame.isotonic.isotonic_regression()
    wanted = isotonic.isotonic_regression(values)

    self.assertIsInstance(actual, pdml.ModelSeries)
    tm.assert_index_equal(actual.index, frame.index)
    tm.assert_numpy_array_equal(actual.values, wanted)
def test_iterate(self):
    """Iterate over ``cross_validation`` KFold splits of a ModelFrame.

    Each (train, test) pair must consist of ModelFrames that keep the
    original columns and together hold every row covered by the fold.
    """
    df = pdml.ModelFrame(datasets.load_iris())
    # NOTE(review): in the legacy ``cross_validation`` API the first KFold
    # argument is the number of elements to split, so the folds are
    # assumed to cover only ``n_samples`` rows -- confirm against the
    # installed scikit-learn version.
    n_samples = 4
    kf = df.cross_validation.KFold(n_samples, n_folds=2,
                                   random_state=self.random_state)
    for train_df, test_df in df.cross_validation.iterate(kf):
        self.assertIsInstance(train_df, pdml.ModelFrame)
        self.assertIsInstance(test_df, pdml.ModelFrame)
        tm.assert_index_equal(df.columns, train_df.columns)
        tm.assert_index_equal(df.columns, test_df.columns)

        # The old assertTrue(a, b) treated ``b`` as the failure message
        # (and used the column count of test_df), so it checked nothing.
        self.assertEqual(train_df.shape[0] + test_df.shape[0], n_samples)
def test_precision_recall_fscore_support(self):
    """Check the precision/recall/F-score/support table from ``self.df``.

    Each named column must match the corresponding element of the tuple
    returned by ``metrics.precision_recall_fscore_support``.
    """
    actual = self.df.metrics.precision_recall_fscore_support()
    reference = metrics.precision_recall_fscore_support(self.target,
                                                        self.pred)

    names = ['precision', 'recall', 'f1-score', 'support']
    for position, name in enumerate(names):
        self.assert_numpy_array_almost_equal(actual[name].values,
                                             reference[position])

    tm.assert_index_equal(actual.columns, pd.Index(names))
def test_precision_recall_fscore_support(self):
    """Compare each metric column with the reference tuple element."""
    got = self.df.metrics.precision_recall_fscore_support()
    want = metrics.precision_recall_fscore_support(self.target, self.pred)

    # Reference tuple order: precision, recall, f1-score, support.
    precision, recall, fscore, support = want[0], want[1], want[2], want[3]
    self.assert_numpy_array_almost_equal(got['precision'].values, precision)
    self.assert_numpy_array_almost_equal(got['recall'].values, recall)
    self.assert_numpy_array_almost_equal(got['f1-score'].values, fscore)
    self.assert_numpy_array_almost_equal(got['support'].values, support)

    expected_columns = pd.Index(['precision', 'recall',
                                 'f1-score', 'support'])
    tm.assert_index_equal(got.columns, expected_columns)
def test_isotonic_regression(self):
    """Compare frame-level isotonic regression with the reference call."""
    noise = np.random.randn(100)
    data = np.abs(noise)
    data = data.cumsum()

    positions = np.arange(len(data))
    mdf = pdml.ModelFrame(positions, target=data)

    got = mdf.isotonic.isotonic_regression()
    want = isotonic.isotonic_regression(data)

    self.assertIsInstance(got, pdml.ModelSeries)
    tm.assert_index_equal(got.index, mdf.index)
    tm.assert_numpy_array_equal(got.values, want)
def test_empirical_covariance(self):
    """Compare the frame-level empirical covariance with the reference."""
    iris = datasets.load_iris()
    mdf = pdml.ModelFrame(iris)

    got = mdf.covariance.empirical_covariance()
    want = covariance.empirical_covariance(iris.data)

    self.assertIsInstance(got, pdml.ModelFrame)
    # Both axes are labelled by the data columns.
    for axis_labels in (got.index, got.columns):
        tm.assert_index_equal(axis_labels, mdf.data.columns)
    self.assert_numpy_array_almost_equal(got.values, want)
def test_iterate(self):
    """Check the frames yielded by ``cross_validation.iterate``.

    Every (train, test) pair must be ModelFrames with the original
    columns, and their rows must add up to the number of elements the
    KFold was built for.
    """
    df = pdml.ModelFrame(datasets.load_iris())
    # NOTE(review): the legacy ``cross_validation.KFold`` takes the number
    # of elements as its first argument, so only ``n_samples`` rows are
    # presumed to be split -- verify against the scikit-learn in use.
    n_samples = 4
    kf = df.cross_validation.KFold(n_samples, n_folds=2,
                                   random_state=self.random_state)
    for train_df, test_df in df.cross_validation.iterate(kf):
        self.assertIsInstance(train_df, pdml.ModelFrame)
        self.assertIsInstance(test_df, pdml.ModelFrame)
        tm.assert_index_equal(df.columns, train_df.columns)
        tm.assert_index_equal(df.columns, test_df.columns)

        # assertTrue(a, b) only checks that ``a`` is truthy; compare the
        # row counts (shape[0], not shape[1]) with assertEqual instead.
        self.assertEqual(train_df.shape[0] + test_df.shape[0], n_samples)
def test_mean_shift(self):
    """Check ``cluster.mean_shift`` through a ModelFrame.

    The result is a 2-item sequence: the first item is compared
    numerically, the second must be a ModelSeries of labels on the
    frame's index.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    actual = frame.cluster.mean_shift()
    wanted = cluster.mean_shift(iris.data)

    self.assertEqual(len(actual), 2)
    self.assert_numpy_array_almost_equal(actual[0], wanted[0])

    labels = actual[1]
    self.assertIsInstance(labels, pdml.ModelSeries)
    tm.assert_index_equal(labels.index, frame.index)
    tm.assert_numpy_array_equal(labels.values, wanted[1])
def test_sparse_encode(self):
    """Check ``decomposition.sparse_encode`` through a ModelFrame.

    A dictionary is learned from the iris data first, then the frame
    accessor and the reference function encode against it.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    # Only the dictionary (second element) of dict_learning is needed.
    learned = decomposition.dict_learning(iris.data, 2, 1,
                                          random_state=self.random_state)
    _, dictionary, _ = learned

    actual = frame.decomposition.sparse_encode(dictionary)
    wanted = decomposition.sparse_encode(iris.data, dictionary)

    self.assertIsInstance(actual, pdml.ModelFrame)
    tm.assert_index_equal(actual.index, frame.data.index)
    self.assert_numpy_array_almost_equal(actual.values, wanted)
def test_mean_shift(self):
    """Compare frame-level ``mean_shift`` output with the reference call."""
    iris = datasets.load_iris()
    mdf = pdml.ModelFrame(iris)

    got = mdf.cluster.mean_shift()
    want = cluster.mean_shift(iris.data)
    self.assertEqual(len(got), 2)

    got_first, got_labels = got[0], got[1]
    self.assert_numpy_array_almost_equal(got_first, want[0])
    # The second item is wrapped as a ModelSeries on the frame's index.
    self.assertIsInstance(got_labels, pdml.ModelSeries)
    tm.assert_index_equal(got_labels.index, mdf.index)
    tm.assert_numpy_array_equal(got_labels.values, want[1])
def test_spectral_clustering(self):
    """Check ``cluster.spectral_clustering`` through a ModelFrame.

    A random symmetric matrix is clustered through the frame accessor and
    through the reference function with the same ``random_state``; the
    labels must be identical.
    """
    N = 50
    # ``np.random.random_integers`` is deprecated; ``randint`` has an
    # exclusive upper bound, hence 200 + 1 to keep the range 1..200.
    m = np.random.randint(1, 200 + 1, size=(N, N))
    m = (m + m.T) / 2  # symmetrise

    df = pdml.ModelFrame(m)
    result = df.cluster.spectral_clustering(random_state=self.random_state)
    expected = cluster.spectral_clustering(m,
                                           random_state=self.random_state)

    self.assertIsInstance(result, pdml.ModelSeries)
    tm.assert_index_equal(result.index, df.index)
    tm.assert_numpy_array_equal(result.values, expected)
def test_spectral_clustering(self):
    """Compare frame-level spectral clustering with the reference call.

    Input is a random symmetric N x N matrix with entries drawn from
    1..200; both calls share ``self.random_state``.
    """
    N = 50
    # Replaces the deprecated ``np.random.random_integers(1, 200, ...)``;
    # ``randint`` excludes the upper bound, so it is passed as 201.
    m = np.random.randint(1, 201, size=(N, N))
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    result = df.cluster.spectral_clustering(random_state=self.random_state)
    expected = cluster.spectral_clustering(m,
                                           random_state=self.random_state)

    self.assertIsInstance(result, pdml.ModelSeries)
    tm.assert_index_equal(result.index, df.index)
    tm.assert_numpy_array_equal(result.values, expected)
def test_split(self):
    """Check the frames yielded by ``model_selection.split`` with KFold.

    Every (train, test) pair must be ModelFrames with the original
    columns whose lengths add up to the full frame.
    """
    frame = pdml.ModelFrame(datasets.load_iris())
    folds = frame.model_selection.KFold(4, random_state=self.random_state)
    total_rows = len(frame)

    for train_part, test_part in frame.model_selection.split(folds):
        for part in (train_part, test_part):
            self.assertIsInstance(part, pdml.ModelFrame)
        for part in (train_part, test_part):
            tm.assert_index_equal(frame.columns, part.columns)

        self.assertEqual(total_rows, len(train_part) + len(test_part))
def test_MDS(self, algo):
    """Check ``fit_transform`` for the manifold estimator named ``algo``.

    The estimator is built from ``df.manifold`` and from ``manifold``
    with the same ``random_state``; the embeddings must match.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    wrapped_cls = getattr(frame.manifold, algo)
    reference_cls = getattr(manifold, algo)
    wrapped = wrapped_cls(random_state=self.random_state)
    reference = reference_cls(random_state=self.random_state)

    actual = frame.fit_transform(wrapped)
    wanted = reference.fit_transform(iris.data)

    self.assertIsInstance(actual, pdml.ModelFrame)
    tm.assert_index_equal(actual.index, frame.index)
    self.assert_numpy_array_almost_equal(actual.data.values, wanted)
def test_split(self):
    """Check the frames yielded by ``model_selection.split`` with KFold.

    Every (train, test) pair must be ModelFrames that keep the original
    columns, and their row counts must add up to the full frame.
    """
    df = pdml.ModelFrame(datasets.load_iris())
    kf = df.model_selection.KFold(4, random_state=self.random_state)
    gen = df.model_selection.split(kf)

    for train_df, test_df in gen:
        self.assertIsInstance(train_df, pdml.ModelFrame)
        self.assertIsInstance(test_df, pdml.ModelFrame)
        tm.assert_index_equal(df.columns, train_df.columns)
        tm.assert_index_equal(df.columns, test_df.columns)

        # assertTrue(a, b) uses ``b`` as the failure message, so the old
        # check could never fail; it also read test_df.shape[1] (columns).
        self.assertEqual(df.shape[0],
                         train_df.shape[0] + test_df.shape[0])
def test_locally_linear_embedding(self):
    """Check ``manifold.locally_linear_embedding`` through a ModelFrame.

    The result is a 2-item sequence: an embedding wrapped as a ModelFrame
    on the frame's index, and a second value compared for equality.
    """
    iris = datasets.load_iris()
    frame = pdml.ModelFrame(iris)

    actual = frame.manifold.locally_linear_embedding(3, 3)
    wanted = manifold.locally_linear_embedding(iris.data, 3, 3)

    self.assertEqual(len(actual), 2)

    embedding = actual[0]
    self.assertIsInstance(embedding, pdml.ModelFrame)
    tm.assert_index_equal(embedding.index, frame.index)
    tm.assert_numpy_array_equal(embedding.values, wanted[0])

    self.assertEqual(actual[1], wanted[1])
def test_locally_linear_embedding(self):
    """Compare frame-level locally linear embedding with the reference."""
    iris = datasets.load_iris()
    mdf = pdml.ModelFrame(iris)

    got = mdf.manifold.locally_linear_embedding(3, 3)
    want = manifold.locally_linear_embedding(iris.data, 3, 3)

    self.assertEqual(len(got), 2)
    # First item: embedding as a ModelFrame aligned with the input rows.
    self.assertIsInstance(got[0], pdml.ModelFrame)
    tm.assert_index_equal(got[0].index, mdf.index)
    tm.assert_numpy_array_equal(got[0].values, want[0])
    # Second item is compared directly.
    self.assertEqual(got[1], want[1])
def test_affinity_propagation(self):
    """Check ``cluster.affinity_propagation`` through a ModelFrame.

    The covariance of the iris data is used as the similarity matrix.
    The second result item must be a ModelSeries on the frame's index.
    """
    iris = datasets.load_iris()
    similarity = np.cov(iris.data)
    frame = pdml.ModelFrame(similarity)

    actual = frame.cluster.affinity_propagation()
    wanted = cluster.affinity_propagation(similarity)

    self.assertEqual(len(actual), 2)
    self.assert_numpy_array_almost_equal(actual[0], wanted[0])

    labels = actual[1]
    self.assertIsInstance(labels, pdml.ModelSeries)
    tm.assert_index_equal(labels.index, frame.index)
    tm.assert_numpy_array_equal(labels.values, wanted[1])
def test_affinity_propagation(self):
    """Compare frame-level affinity propagation with the reference call."""
    similarity_matrix = np.cov(datasets.load_iris().data)
    mdf = pdml.ModelFrame(similarity_matrix)

    got = mdf.cluster.affinity_propagation()
    want = cluster.affinity_propagation(similarity_matrix)
    self.assertEqual(len(got), 2)

    got_first, got_labels = got[0], got[1]
    self.assert_numpy_array_almost_equal(got_first, want[0])
    # Labels come back as a ModelSeries aligned with the frame.
    self.assertIsInstance(got_labels, pdml.ModelSeries)
    tm.assert_index_equal(got_labels.index, mdf.index)
    tm.assert_numpy_array_equal(got_labels.values, want[1])
def test_lasso_stability_path(self):
    """Check ``linear_model.lasso_stability_path`` through a ModelFrame.

    The second result item must be a ModelFrame indexed by the data
    columns; both items must equal the reference output.
    """
    diabetes = datasets.load_diabetes()
    frame = pdml.ModelFrame(diabetes)
    seed = self.random_state

    actual = frame.linear_model.lasso_stability_path(random_state=seed)
    wanted = lm.lasso_stability_path(diabetes.data, diabetes.target,
                                     random_state=seed)

    self.assertEqual(len(actual), 2)
    tm.assert_numpy_array_equal(actual[0], wanted[0])

    scores = actual[1]
    self.assertIsInstance(scores, pdml.ModelFrame)
    tm.assert_index_equal(scores.index, frame.data.columns)
    tm.assert_numpy_array_equal(scores.values, wanted[1])
def test_lars_path(self):
    """Check ``linear_model.lars_path`` through a ModelFrame.

    The result has three items; the second must be a list equal to the
    reference, and the third a ModelFrame indexed by the data columns.
    """
    diabetes = datasets.load_diabetes()
    frame = pdml.ModelFrame(diabetes)

    actual = frame.linear_model.lars_path()
    wanted = lm.lars_path(diabetes.data, diabetes.target)

    self.assertEqual(len(actual), 3)
    tm.assert_numpy_array_equal(actual[0], wanted[0])

    self.assertEqual(actual[1], wanted[1])
    self.assertIsInstance(actual[1], list)

    coefs = actual[2]
    self.assertIsInstance(coefs, pdml.ModelFrame)
    tm.assert_index_equal(coefs.index, frame.data.columns)
    tm.assert_numpy_array_equal(coefs.values, wanted[2])
def test_iterate(self):
    """Check that deprecated ``model_selection.iterate`` warns and splits.

    A ``FutureWarning`` is expected, and every yielded (train, test) pair
    must be ModelFrames with the original columns whose lengths add up
    to the full frame.
    """
    frame = pdml.ModelFrame(datasets.load_iris())
    folds = frame.model_selection.KFold(4, random_state=self.random_state)

    # NOTE(review): iteration is kept inside the context so the warning is
    # captured whether ``iterate`` warns eagerly or on first iteration.
    with tm.assert_produces_warning(FutureWarning):
        pairs = frame.model_selection.iterate(folds)
        for train_part, test_part in pairs:
            self.assertIsInstance(train_part, pdml.ModelFrame)
            self.assertIsInstance(test_part, pdml.ModelFrame)
            tm.assert_index_equal(frame.columns, train_part.columns)
            tm.assert_index_equal(frame.columns, test_part.columns)

            combined = len(train_part) + len(test_part)
            self.assertEqual(len(frame), combined)
def test_LabelEncoder_frame(self):
    """Encode a single-column string ModelFrame with LabelEncoder.

    Covers fit + transform, fit_transform, and inverse_transform; the
    encoded frame keeps index and columns, and the inverse restores the
    original frame.
    """
    letters = np.array(['X', 'Y', 'Z', 'X'])
    frame = pdml.ModelFrame(letters, index=['a', 'b', 'c', 'd'],
                            columns=['A'])
    codes = np.array([0, 1, 2, 0]).reshape(-1, 1)

    def check_encoded(encoded):
        self.assertIsInstance(encoded, pdml.ModelFrame)
        self.assert_numpy_array_almost_equal(encoded.values, codes)
        tm.assert_index_equal(encoded.columns, frame.columns)
        tm.assert_index_equal(encoded.index, frame.index)

    # Separate fit and transform.
    encoder = frame.pp.LabelEncoder()
    frame.fit(encoder)
    check_encoded(frame.transform(encoder))

    # Combined fit_transform on a fresh encoder.
    encoder = frame.pp.LabelEncoder()
    encoded = frame.fit_transform(encoder)
    check_encoded(encoded)

    restored = encoded.inverse_transform(encoder)
    self.assertIsInstance(restored, pdml.ModelFrame)
    tm.assert_frame_equal(restored, frame)
def test_iterate(self):
    """Check that deprecated ``model_selection.iterate`` warns and splits.

    A ``FutureWarning`` is expected, and every yielded (train, test) pair
    must be ModelFrames that keep the original columns and together
    contain every row of the frame.
    """
    df = pdml.ModelFrame(datasets.load_iris())
    kf = df.model_selection.KFold(4, random_state=self.random_state)

    # NOTE(review): iteration is kept inside the context so the warning is
    # captured whether ``iterate`` warns eagerly or on first iteration.
    with tm.assert_produces_warning(FutureWarning):
        gen = df.model_selection.iterate(kf)
        for train_df, test_df in gen:
            self.assertIsInstance(train_df, pdml.ModelFrame)
            self.assertIsInstance(test_df, pdml.ModelFrame)
            tm.assert_index_equal(df.columns, train_df.columns)
            tm.assert_index_equal(df.columns, test_df.columns)

            # assertTrue(a, b) only tests ``a`` for truthiness (``b`` is
            # the message) and the old code read the column count of
            # test_df; compare row counts with assertEqual instead.
            self.assertEqual(df.shape[0],
                             train_df.shape[0] + test_df.shape[0])
def test_frame_target_object_set(self):
    """Assign a target Series to a ModelFrame with datetime column labels.

    The first assignment adds the target under the Series name (5).  A
    second assignment replaces the values but keeps the existing target
    name, ignoring the new Series name.
    """
    labels = ['a', 'b', 'c']
    years = [datetime.datetime(2014, 1, 1),
             datetime.datetime(2015, 1, 1),
             datetime.datetime(2016, 1, 1)]
    df = pd.DataFrame({years[0]: [1, 2, 3],
                       years[1]: [4, 5, 6],
                       years[2]: [7, 8, 9]},
                      index=labels)
    expected_columns = pd.Index([5] + years)

    mdf = pdml.ModelFrame(df)
    mdf.target = pd.Series(['A', 'B', 'C'], index=labels, name=5)

    self.assertIsInstance(mdf, pdml.ModelFrame)
    self.assertEqual(mdf.shape, (3, 4))
    tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
    tm.assert_index_equal(mdf.columns, expected_columns)
    tm.assert_frame_equal(mdf.data, df)

    expected = pd.Series(['A', 'B', 'C'], index=labels, name=5)
    tm.assert_series_equal(mdf.target, expected)
    self.assertEqual(mdf.target.name, 5)

    # name will be ignored if ModelFrame already has a target
    mdf.target = pd.Series([10, 11, 12], index=labels, name='X')

    self.assertIsInstance(mdf, pdml.ModelFrame)
    self.assertEqual(mdf.shape, (3, 4))
    tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
    tm.assert_index_equal(mdf.columns, expected_columns)
    tm.assert_frame_equal(mdf.data, df)

    expected = pd.Series([10, 11, 12], index=labels, name=5)
    tm.assert_series_equal(mdf.target, expected)
    self.assertEqual(mdf.target.name, 5)
def test_lasso_stability_path(self):
    """Compare frame-level ``lasso_stability_path`` with the reference."""
    diabetes = datasets.load_diabetes()
    mdf = pdml.ModelFrame(diabetes)

    got = mdf.linear_model.lasso_stability_path(
        random_state=self.random_state)
    want = lm.lasso_stability_path(
        diabetes.data, diabetes.target, random_state=self.random_state)

    self.assertEqual(len(got), 2)
    tm.assert_numpy_array_equal(got[0], want[0])
    # Second item is a ModelFrame with one row per data column.
    self.assertIsInstance(got[1], pdml.ModelFrame)
    tm.assert_index_equal(got[1].index, mdf.data.columns)
    tm.assert_numpy_array_equal(got[1].values, want[1])
def test_frame_init_df_series(self):
    """Initialise a ModelFrame from a DataFrame and a Series target.

    Covers a Series named ``'.target'``, a Series whose index does not
    match the data (must raise ``ValueError``), and a Series with a
    custom name.
    """
    labels = ['a', 'b', 'c']
    df = pd.DataFrame({'A': [1, 2, 3],
                       'B': [4, 5, 6],
                       'C': [7, 8, 9]},
                      index=labels,
                      columns=['A', 'B', 'C'])

    def check(mdf, target, name):
        # Target column comes first, followed by the data columns.
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index([name, 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        tm.assert_series_equal(mdf.target, target)
        self.assertEqual(mdf.target.name, name)
        self.assertEqual(mdf.target_name, name)

    s = pd.Series([1, 2, 3], index=labels, name='.target')
    check(pdml.ModelFrame(df, target=s), s, '.target')

    # Default integer index does not match the data index.
    s = pd.Series([1, 2, 3])
    with self.assertRaisesRegexp(ValueError,
                                 'data and target must have equal index'):
        mdf = pdml.ModelFrame(df, target=s)

    s = pd.Series([1, 2, 3], index=labels, name='XXX')
    check(pdml.ModelFrame(df, target=s), s, 'XXX')
def test_lars_path(self):
    """Compare frame-level ``lars_path`` output with the reference call."""
    diabetes = datasets.load_diabetes()
    mdf = pdml.ModelFrame(diabetes)

    got = mdf.linear_model.lars_path()
    want = lm.lars_path(diabetes.data, diabetes.target)
    self.assertEqual(len(got), 3)

    first, second, third = got[0], got[1], got[2]
    tm.assert_numpy_array_equal(first, want[0])
    self.assertEqual(second, want[1])
    self.assertIsInstance(second, list)
    # Third item is a ModelFrame with one row per data column.
    self.assertIsInstance(third, pdml.ModelFrame)
    tm.assert_index_equal(third.index, mdf.data.columns)
    tm.assert_numpy_array_equal(third.values, want[2])
def test_LabelEncoder_frame(self):
    """Round-trip a string column through LabelEncoder on a ModelFrame."""
    source = np.array(['X', 'Y', 'Z', 'X'])
    mdf = pdml.ModelFrame(source, index=['a', 'b', 'c', 'd'],
                          columns=['A'])
    want = np.array([0, 1, 2, 0]).reshape(-1, 1)

    # fit, then transform
    model = mdf.pp.LabelEncoder()
    mdf.fit(model)
    got = mdf.transform(model)
    self.assertIsInstance(got, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(got.values, want)
    tm.assert_index_equal(got.columns, mdf.columns)
    tm.assert_index_equal(got.index, mdf.index)

    # fit_transform in one step with a new encoder
    model = mdf.pp.LabelEncoder()
    got = mdf.fit_transform(model)
    self.assertIsInstance(got, pdml.ModelFrame)
    self.assert_numpy_array_almost_equal(got.values, want)
    tm.assert_index_equal(got.columns, mdf.columns)
    tm.assert_index_equal(got.index, mdf.index)

    # inverse_transform must give back the original frame
    back = got.inverse_transform(model)
    self.assertIsInstance(back, pdml.ModelFrame)
    tm.assert_frame_equal(back, mdf)
def test_frame_init_dict_list_series_index(self):
    """Initialise a ModelFrame from a dict of lists and an indexed Series.

    The resulting frame takes its index from the target Series and
    places the target column (named ``'X'``) first.
    """
    labels = ['a', 'b', 'c']
    columns = {'A': [1, 2, 3],
               'B': [4, 5, 6],
               'C': [7, 8, 9]}
    target = pd.Series([9, 8, 7], name='X', index=labels)

    mdf = pdml.ModelFrame(columns, target=target)

    self.assertIsInstance(mdf, pdml.ModelFrame)
    self.assertEqual(mdf.shape, (3, 4))
    tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
    tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

    expected_data = pd.DataFrame(columns, index=labels)
    tm.assert_frame_equal(mdf.data, expected_data)
    tm.assert_series_equal(mdf.target, target)
    self.assertEqual(mdf.target.name, 'X')
    self.assertEqual(mdf.target_name, 'X')
def test_frame_init_dict_list_series_index(self):
    # Initialization from a dict of lists (data) plus a named Series
    # (target) that carries its own index.
    # NOTE(review): this method name is defined twice in this part of the
    # file; if both live in the same class, only the later definition runs.
    columns = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
    row_labels = ['a', 'b', 'c']
    target = pd.Series([9, 8, 7], name='X', index=row_labels)

    mdf = pdml.ModelFrame(columns, target=target)

    self.assertIsInstance(mdf, pdml.ModelFrame)
    self.assertEqual(mdf.shape, (3, 4))

    # the frame takes the target's index, and the target column comes first
    tm.assert_index_equal(mdf.index, pd.Index(row_labels))
    tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

    # data and target are recoverable separately
    tm.assert_frame_equal(mdf.data, pd.DataFrame(columns, index=row_labels))
    tm.assert_series_equal(mdf.target, target)

    # the target name comes from the Series name
    self.assertEqual(mdf.target.name, 'X')
    self.assertEqual(mdf.target_name, 'X')
def test_MDS(self):
    # fit_transform through the ModelFrame manifold accessor must give the
    # same embedding as the plain estimator built with the same
    # random_state.
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    for name in ('MDS',):
        wrapped_factory = getattr(df.manifold, name)
        wrapped = wrapped_factory(random_state=self.random_state)
        plain_factory = getattr(manifold, name)
        plain = plain_factory(random_state=self.random_state)

        embedded = df.fit_transform(wrapped)
        reference = plain.fit_transform(iris.data)

        self.assertIsInstance(embedded, pdml.ModelFrame)
        # rows of the embedding stay aligned with the input frame
        tm.assert_index_equal(embedded.index, df.index)
        self.assert_numpy_array_almost_equal(embedded.data.values,
                                             reference)
def test_k_means(self):
    # ModelFrame.cluster.k_means() must agree with cluster.k_means() run
    # on the raw iris data with the same cluster count and random_state.
    # NOTE(review): this method name is defined twice in this part of the
    # file; if both live in the same class, only the later definition runs.
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    n_clusters = 3

    actual = df.cluster.k_means(n_clusters,
                                random_state=self.random_state)
    reference = cluster.k_means(iris.data, n_clusters,
                                random_state=self.random_state)

    self.assertEqual(len(actual), 3)

    # element 0 is compared numerically
    self.assert_numpy_array_almost_equal(actual[0], reference[0])

    # element 1 comes back as a ModelSeries aligned with the frame's index
    assigned = actual[1]
    self.assertIsInstance(assigned, pdml.ModelSeries)
    tm.assert_index_equal(assigned.index, df.index)
    tm.assert_numpy_array_equal(assigned.values, reference[1])

    # element 2 is a scalar compared approximately
    self.assertAlmostEqual(actual[2], reference[2])
def test_oas(self):
    # ModelFrame.covariance.oas() must agree with covariance.oas() run on
    # the raw iris data.
    # NOTE(review): this method name is defined twice in this part of the
    # file; if both live in the same class, only the later definition runs.
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    actual = df.covariance.oas()
    reference = covariance.oas(iris.data)

    self.assertEqual(len(actual), 2)

    # element 0 is a ModelFrame labelled by the data columns on both axes
    cov_frame = actual[0]
    feature_names = df.data.columns
    self.assertIsInstance(cov_frame, pdml.ModelFrame)
    tm.assert_index_equal(cov_frame.index, feature_names)
    tm.assert_index_equal(cov_frame.columns, feature_names)
    self.assert_numpy_array_almost_equal(cov_frame.values, reference[0])

    # element 1 is compared numerically as-is
    self.assert_numpy_array_almost_equal(actual[1], reference[1])
def test_oas(self):
    # ModelFrame.covariance.oas() must agree with covariance.oas() run on
    # the raw iris data.
    # NOTE(review): this method name is defined twice in this part of the
    # file; if both live in the same class, only the later definition runs.
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    actual = df.covariance.oas()
    reference = covariance.oas(iris.data)

    self.assertEqual(len(actual), 2)

    # element 0 is a ModelFrame labelled by the data columns on both axes
    cov_frame = actual[0]
    feature_names = df.data.columns
    self.assertIsInstance(cov_frame, pdml.ModelFrame)
    tm.assert_index_equal(cov_frame.index, feature_names)
    tm.assert_index_equal(cov_frame.columns, feature_names)
    self.assert_numpy_array_almost_equal(cov_frame.values, reference[0])

    # element 1 is compared numerically as-is
    self.assert_numpy_array_almost_equal(actual[1], reference[1])
def test_k_means(self):
    # ModelFrame.cluster.k_means() must agree with cluster.k_means() run
    # on the raw iris data with the same cluster count and random_state.
    # NOTE(review): this method name is defined twice in this part of the
    # file; if both live in the same class, only the later definition runs.
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    n_clusters = 3

    actual = df.cluster.k_means(n_clusters,
                                random_state=self.random_state)
    reference = cluster.k_means(iris.data, n_clusters,
                                random_state=self.random_state)

    self.assertEqual(len(actual), 3)

    # element 0 is compared numerically
    self.assert_numpy_array_almost_equal(actual[0], reference[0])

    # element 1 comes back as a ModelSeries aligned with the frame's index
    assigned = actual[1]
    self.assertIsInstance(assigned, pdml.ModelSeries)
    tm.assert_index_equal(assigned.index, df.index)
    tm.assert_numpy_array_equal(assigned.values, reference[1])

    # element 2 is a scalar compared approximately
    self.assertAlmostEqual(actual[2], reference[2])
def _assert_fit_transform(self, df, exp_data, model1, model2):
    # Shared check: df.fit_transform(model1) must match
    # model2.fit_transform(exp_data) while keeping the frame's target,
    # index and columns.
    actual = df.fit_transform(model1)
    reference = model2.fit_transform(exp_data)

    self.assertIsInstance(actual, pdml.ModelFrame)

    if not df.has_target():
        # no target going in means no target coming out
        self.assertIsNone(actual.target)
    else:
        # target is unchanged
        tm.assert_series_equal(df.target, actual.target)

    self.assert_numpy_array_almost_equal(actual.data.values, reference)

    # index and columns are kept
    tm.assert_index_equal(actual.index, df.index)
    tm.assert_index_equal(actual.columns, df.columns)
def test_spectral_embedding(self):
    # Build a random symmetric matrix and verify that
    # ModelFrame.manifold.spectral_embedding() agrees with
    # manifold.spectral_embedding() applied to the same matrix.
    size = 10
    # np.random.random_integers is deprecated in NumPy. randint's upper
    # bound is exclusive, so 201 keeps the original inclusive range
    # [50, 200].
    m = np.random.randint(50, 201, size=(size, size))
    # averaging with the transpose makes the matrix symmetric
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    self.assert_numpy_array_almost_equal(df.data.values, m)

    result = df.manifold.spectral_embedding(random_state=self.random_state)
    expected = manifold.spectral_embedding(m,
                                           random_state=self.random_state)

    self.assertIsInstance(result, pdml.ModelFrame)
    tm.assert_index_equal(result.index, df.index)
    # signs can be inverted, so only magnitudes are compared
    self.assert_numpy_array_almost_equal(np.abs(result.data.values),
                                         np.abs(expected))