def test_perceptron(test_path):
    """Check PerceptronMask predictions, probabilities and info on a SEA stream.

    The learner is trained one sample at a time on 5000 samples.  Every
    100 samples (except at sample 0) the prediction and class probabilities
    for the incoming sample are recorded *before* the learner is trained on
    it, and the recorded values are compared with stored expectations.

    Parameters
    ----------
    test_path : str
        Directory containing ``data_perceptron_proba.npy``, the file with
        the expected probability estimates.
    """
    stream = SEAGenerator(random_state=1)
    learner = PerceptronMask(random_state=1)

    cnt = 0
    max_samples = 5000
    y_pred = array('i')
    X_batch = []
    y_batch = []
    y_proba = []
    wait_samples = 100

    while cnt < max_samples:
        X, y = stream.next_sample()
        X_batch.append(X[0])
        y_batch.append(y[0])
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            y_pred.append(learner.predict(X)[0])
            y_proba.append(learner.predict_proba(X)[0])
        learner.partial_fit(X, y, classes=stream.target_values)
        cnt += 1

    expected_predictions = array('i', [
        1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1
    ])
    # Comparing two ``array('i')`` objects already yields a single bool, so
    # no NumPy reduction is needed (``np.alltrue`` was removed in NumPy 2.0).
    assert y_pred == expected_predictions

    test_file = os.path.join(test_path, 'data_perceptron_proba.npy')
    y_proba_expected = np.load(test_file)
    assert np.allclose(y_proba, y_proba_expected)

    expected_info = (
        "PerceptronMask(alpha=0.0001, class_weight=None, early_stopping=False, "
        "eta0=1.0, fit_intercept=True, max_iter=1000, n_iter_no_change=5, "
        "n_jobs=None, penalty=None, random_state=1, shuffle=True, tol=0.001, "
        "validation_fraction=0.1, verbose=0, warm_start=False)"
    )
    # Collapse all whitespace runs so line wrapping in get_info() is ignored.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    # Coverage tests
    learner.reset()
    if not sklearn_version.startswith("0.21"):
        learner.fit(X=np.asarray(X_batch[:4500]),
                    y=np.asarray(y_batch[:4500], dtype=int))
    else:
        # Root cause of failure (TypeError: an integer is required) is in the fit() method
        # in sklearn 0.21.0. This is a workaround until a fix is made available in sklearn
        learner.partial_fit(X=np.asarray(X_batch[:4500]),
                            y=np.asarray(y_batch[:4500]),
                            classes=stream.target_values)
    learner.predict(X=X_batch[4501:])  # Run for coverage

    assert isinstance(learner.predict(X), np.ndarray)
    assert isinstance(learner.predict_proba(X), np.ndarray)
def test_classifier_chains():
    """Check ClassifierChain predictions on a seeded multilabel stream.

    The chain is pre-trained on 150 samples and then trained one sample at a
    time on 5000 more.  Every 100 samples (except at sample 0) the prediction
    for the incoming sample is recorded before training on it.  Expected
    values differ between sklearn 0.21 and other versions.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)
    stream.prepare_for_use()

    estimator = SGDClassifier(random_state=seed, tol=1e-3, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = stream.next_sample(150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    # Each branch only selects the expected values; the assertions themselves
    # are shared below so both sklearn variants are checked identically.
    if not sklearn_version.startswith("0.21"):
        expected_predictions = [
            [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [1.0, 1.0, 0.0],
            [0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 21
        expected_info = (
            "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n"
            " n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n"
            " power_t=0.5, random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False),\n"
            " order=None, random_state=112)"
        )
    else:
        expected_predictions = [
            [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0],
            [0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
            [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 26
        expected_info = (
            "ClassifierChain(base_estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=10,\n"
            " n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,\n"
            " random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False),\n"
            " order=None, random_state=112)"
        )

    # np.array_equal already returns a single bool; the former np.alltrue
    # wrapper was redundant and np.alltrue no longer exists in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)
    assert correct_predictions == expected_correct_predictions
    assert learner.get_info() == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
class TestPCA(unittest.TestCase):
    """Tests for PCA, SparsePCA, IncrementalPCA and TruncatedSVD projections."""

    @classmethod
    def setUpClass(cls):
        """Load the datasets shared by every test once for the whole class."""
        cls.ionosphere = Table('ionosphere')
        cls.iris = Table('iris')
        cls.zoo = Table('zoo')

    def test_pca(self):
        """Run the PCA checks for several component counts."""
        table = self.ionosphere
        for count, threshold in ((3, 0.5), (10, 0.7), (32, 1)):
            self.__pca_test_helper(table, n_com=count, min_xpl_var=threshold)

    def __pca_test_helper(self, data, n_com, min_xpl_var):
        """Fit PCA with ``n_com`` components and verify variance, shape and projection."""
        model = PCA(n_components=n_com)(data)
        explained = np.sum(model.explained_variance_ratio_)
        # Small tolerance so a ratio that rounds just below the bound passes.
        self.assertGreaterEqual(explained + 1e-6, min_xpl_var)
        self.assertEqual(n_com, model.n_components)
        expected_shape = (n_com, data.X.shape[1])
        self.assertEqual(expected_shape, model.components_.shape)
        # Transforming the table must equal the manual centered projection.
        manual = np.dot(data.X - model.mean_, model.components_.T)
        np.testing.assert_almost_equal(model(data).X, manual)

    def test_sparse_pca(self):
        """Run the SparsePCA checks on the first 100 rows."""
        subset = self.ionosphere[:100]
        for count, limit in ((3, 1500), (10, 1000), (32, 500)):
            self.__sparse_pca_test_helper(subset, n_com=count, max_err=limit)

    def __sparse_pca_test_helper(self, data, n_com, max_err):
        """Fit SparsePCA and verify component count, shape and final error."""
        learner = SparsePCA(n_components=n_com, ridge_alpha=0.001,
                            random_state=0)
        model = learner(data)
        self.assertEqual(n_com, model.n_components)
        expected_shape = (n_com, data.X.shape[1])
        self.assertEqual(expected_shape, model.components_.shape)
        self.assertLessEqual(model.error_[-1], max_err)

    def test_randomized_pca(self):
        """Run the randomized-solver PCA checks for several component counts."""
        table = self.ionosphere
        for count, threshold in ((3, 0.5), (10, 0.7), (32, 0.98)):
            self.__rnd_pca_test_helper(table, n_com=count,
                                       min_xpl_var=threshold)

    def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
        """Fit PCA with the randomized solver and verify variance, shape and projection."""
        model = PCA(n_components=n_com, svd_solver='randomized')(data)
        explained = np.sum(model.explained_variance_ratio_)
        self.assertGreaterEqual(explained, min_xpl_var)
        self.assertEqual(n_com, model.n_components)
        expected_shape = (n_com, data.X.shape[1])
        self.assertEqual(expected_shape, model.components_.shape)
        manual = np.dot(data.X - model.mean_, model.components_.T)
        np.testing.assert_almost_equal(model(data).X, manual)

    @unittest.skipIf(
        sklearn_version.startswith('0.20'),
        "https://github.com/scikit-learn/scikit-learn/issues/12234")
    def test_incremental_pca(self):
        """Run the IncrementalPCA checks for two component counts."""
        table = self.ionosphere
        for count, threshold in ((3, 0.49), (32, 1)):
            self.__ipca_test_helper(table, n_com=count, min_xpl_var=threshold)

    def __ipca_test_helper(self, data, n_com, min_xpl_var):
        """Fit IncrementalPCA on every second row, then partially fit the rest.

        After the first fit the leading component must differ from regular
        PCA's; after ``partial_fit`` on the remaining rows it must agree
        (up to sign).
        """
        model = IncrementalPCA(n_components=n_com)(data[::2])
        explained = np.sum(model.explained_variance_ratio_)
        self.assertGreaterEqual(explained + 1e-6, min_xpl_var)
        self.assertEqual(n_com, model.n_components)
        expected_shape = (n_com, data.X.shape[1])
        self.assertEqual(expected_shape, model.components_.shape)
        manual = np.dot(data.X - model.mean_, model.components_.T)
        np.testing.assert_almost_equal(model(data).X, manual)

        first_incremental = model.components_[0]
        self.assertAlmostEqual(np.linalg.norm(first_incremental), 1)
        first_regular = PCA(n_components=n_com)(data).components_[0]
        self.assertAlmostEqual(np.linalg.norm(first_regular), 1)
        # abs() of the dot product ignores the arbitrary sign of a component.
        self.assertNotAlmostEqual(
            abs(first_incremental.dot(first_regular)), 1, 2)
        first_incremental = model.partial_fit(data[1::2]).components_[0]
        self.assertAlmostEqual(
            abs(first_incremental.dot(first_regular)), 1, 4)

    def test_truncated_svd(self):
        """Run the TruncatedSVD checks for several component counts."""
        table = self.ionosphere
        for count, threshold in ((3, 0.5), (10, 0.7), (31, 0.99)):
            self.__truncated_svd_test_helper(table, n_components=count,
                                             min_variance=threshold)

    def __truncated_svd_test_helper(self, data, n_components, min_variance):
        """Fit TruncatedSVD and verify variance, shape and (uncentered) projection."""
        svd = TruncatedSVD(n_components=n_components)(data)
        explained = np.sum(svd.explained_variance_ratio_)
        self.assertGreaterEqual(explained + 1e-6, min_variance)
        self.assertEqual(n_components, svd.n_components)
        expected_shape = (n_components, data.X.shape[1])
        self.assertEqual(expected_shape, svd.components_.shape)
        manual = np.dot(data.X, svd.components_.T)
        np.testing.assert_almost_equal(svd(data).X, manual)

    def test_compute_value(self):
        """Transformed data must match a domain conversion and survive pickling."""
        source = self.iris
        model = PCA(n_components=2)(source)
        projected = model(source)

        converted = Table(projected.domain, source)
        np.testing.assert_almost_equal(projected.X, converted.X)
        np.testing.assert_equal(projected.Y, converted.Y)

        restored = pickle.loads(pickle.dumps(projected))
        np.testing.assert_almost_equal(projected.X, restored.X)
        np.testing.assert_equal(projected.Y, restored.Y)

    def test_transformed_domain_does_not_pickle_data(self):
        """The pickled domain's first compute_value must have ``transformed`` unset."""
        source = self.iris
        model = PCA(n_components=2)(source)
        projected = model(source)
        # Result is discarded; presumably the conversion is run so that the
        # domain has been used for a transformation before pickling.
        Table(projected.domain, source)

        restored = pickle.loads(pickle.dumps(projected))
        self.assertIsNone(restored.domain[0].compute_value.transformed)

    def test_chain(self):
        """Continuizing before PCA, or via ``preprocessors``, must give the same result."""
        continuized = Continuize()(self.zoo)
        from_raw = PCA(n_components=3)(continuized)(self.zoo)
        from_continuized = PCA(n_components=3)(continuized)(continuized)
        steps = [Continuize()]
        with_preprocessor = PCA(n_components=3,
                                preprocessors=steps)(self.zoo)(self.zoo)
        np.testing.assert_almost_equal(from_raw.X, from_continuized.X)
        np.testing.assert_almost_equal(from_raw.X, with_preprocessor.X)

    def test_PCA_scorer(self):
        """Check the attribute scores of the first component on iris."""
        table = self.iris
        scorer = PCA(preprocessors=[Normalize()])
        scorer.component = 1
        scores = scorer.score_data(table)
        attributes = table.domain.attributes
        self.assertEqual(scores.shape[1], len(attributes))

        top_two = np.argsort(scores[0])[-2:]
        top_names = sorted([attributes[i].name for i in top_two])
        self.assertEqual(['petal length', 'petal width'], top_names)

        rounded = [round(s, 4) for s in scores[0]]
        self.assertEqual(rounded, [0.5224, 0.2634, 0.5813, 0.5656])

    def test_PCA_scorer_component(self):
        """The score matrix must have one row per requested component."""
        scorer = PCA()
        n_attrs = len(self.zoo.domain.attributes)
        for wanted in range(1, n_attrs + 1):
            scorer.component = wanted
            scores = scorer.score_data(self.zoo)
            self.assertEqual(scores.shape, (scorer.component, n_attrs))

    def test_PCA_scorer_all_components(self):
        """Without ``component`` set, scores cover every component."""
        n_attrs = len(self.iris.domain.attributes)
        scores = PCA().score_data(self.iris)
        self.assertEqual(scores.shape, (n_attrs, n_attrs))

    def test_max_components(self):
        """Default PCA on a 20x20 table keeps 20 components; an explicit count is honored."""
        values = np.random.RandomState(0).rand(20, 20)
        table = Table(values)
        model = PCA()(table)
        self.assertEqual(len(model.explained_variance_ratio_), 20)
        model = PCA(n_components=10)(table)
        self.assertEqual(len(model.explained_variance_ratio_), 10)
def test_classifier_chains():
    """Check ClassifierChain predictions on a seeded multilabel stream.

    The chain is pre-trained on 150 samples and then trained one sample at a
    time on 5000 more.  Every 100 samples (except at sample 0) the prediction
    for the incoming sample is recorded before training on it.

    The expected values used to be duplicated in an ``if``/``else`` on the
    sklearn version, but both arms held exactly the same predictions, count
    and info string, so they are stated once here.
    """
    seed = 112
    stream = MultilabelGenerator(random_state=seed, n_targets=3, n_samples=5150)

    estimator = SGDClassifier(random_state=seed, max_iter=10)
    learner = ClassifierChain(base_estimator=estimator, random_state=seed)
    X, y = get_next_n_samples(stream, 150)
    learner.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(learner.predict(X)[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        learner.partial_fit(X, y)
        cnt += 1

    expected_predictions = [
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
        [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0],
        [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0],
        [0.0, 1.0, 1.0], [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0],
        [0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 0.0, 1.0],
        [0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 0.0, 0.0], [1.0, 1.0, 1.0],
    ]
    # np.array_equal already returns a single bool; the former np.alltrue
    # wrapper was redundant and np.alltrue no longer exists in NumPy 2.0.
    assert np.array_equal(predictions, expected_predictions)

    expected_correct_predictions = 26
    assert correct_predictions == expected_correct_predictions

    expected_info = (
        "ClassifierChain(base_estimator=SGDClassifier(max_iter=10, "
        "random_state=112), order=None, random_state=112)"
    )
    # Collapse all whitespace runs so line wrapping in get_info() is ignored.
    info = " ".join([line.strip() for line in learner.get_info().split()])
    assert info == expected_info

    assert isinstance(learner.predict(X), np.ndarray)
def test_multi_output_learner():
    """Check MultiOutputLearner predictions and Hamming score on a seeded stream.

    The learner is pre-trained on 150 samples and then trained one sample at
    a time on 5000 more.  Every 100 samples (except at sample 0) the
    prediction for the incoming sample is recorded before training on it.
    Expected values differ between sklearn 0.21 and other versions.
    """
    stream = MultilabelGenerator(n_samples=5150, n_features=15, n_targets=3,
                                 n_labels=4, random_state=112)
    stream.prepare_for_use()

    estimator = SGDClassifier(random_state=112, tol=1e-3, max_iter=10, loss='log')
    classifier = MultiOutputLearner(base_estimator=estimator)

    X, y = stream.next_sample(150)
    classifier.partial_fit(X, y)

    cnt = 0
    max_samples = 5000
    predictions = []
    true_labels = []
    wait_samples = 100
    correct_predictions = 0

    while cnt < max_samples:
        X, y = stream.next_sample()
        # Test every n samples
        if (cnt % wait_samples == 0) and (cnt != 0):
            predictions.append(classifier.predict(X)[0])
            true_labels.append(y[0])
            if np.array_equal(y[0], predictions[-1]):
                correct_predictions += 1

        classifier.partial_fit(X, y)
        cnt += 1

    # Each branch only selects the expected values; the assertions are shared
    # below.  Previously the sklearn-0.21 branch computed the prediction
    # comparison without ``assert``, so that check was silently skipped.
    # NOTE(review): the 0.21 expected predictions were therefore never
    # enforced before - confirm them if this assertion fails on sklearn 0.21.
    if not sklearn_version.startswith("0.21"):
        expected_predictions = [
            [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0], [1.0, 1.0, 1.0], [0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 0.0, 0.0], [0.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 0.0],
            [0.0, 0.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 0.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 26
        expected_performance = 0.7755102040816326
        expected_info = (
            "MultiOutputLearner(base_estimator=SGDClassifier(alpha=0.0001, average=False, "
            "class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=10,\n"
            " n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',\n"
            " power_t=0.5, random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False))"
        )
    else:
        expected_predictions = [
            [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [1.0, 0.0, 1.0],
            [0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 1.0, 0.0], [1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 0.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 0.0],
            [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 1.0],
            [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 1.0],
            [0.0, 0.0, 1.0], [1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, 1.0],
            [1.0, 1.0, 1.0], [1.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 1.0],
        ]
        expected_correct_predictions = 23
        expected_performance = 0.7482993197278911
        expected_info = (
            "MultiOutputLearner(base_estimator=SGDClassifier(alpha=0.0001, average=False, "
            "class_weight=None,\n"
            " early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,\n"
            " l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=10,\n"
            " n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,\n"
            " random_state=112, shuffle=True, tol=0.001,\n"
            " validation_fraction=0.1, verbose=0, warm_start=False))"
        )

    # np.array_equal already returns a single bool (np.alltrue was removed
    # in NumPy 2.0 and was redundant here).
    assert np.array_equal(predictions, expected_predictions)
    assert correct_predictions == expected_correct_predictions

    performance = hamming_score(true_labels, predictions)
    assert np.isclose(performance, expected_performance)

    assert classifier.get_info() == expected_info

    assert isinstance(classifier.predict(X), np.ndarray)
    assert isinstance(classifier.predict_proba(X), np.ndarray)
class TestPCA(unittest.TestCase):
    """Tests for PCA, SparsePCA, IncrementalPCA and TruncatedSVD projections."""

    @classmethod
    def setUpClass(cls):
        """Load the datasets shared by every test once for the whole class."""
        cls.ionosphere = Table(test_filename('datasets/ionosphere.tab'))
        cls.iris = Table('iris')
        cls.zoo = Table('zoo')

    def test_pca(self):
        """Run the PCA checks for several component counts."""
        data = self.ionosphere
        self.__pca_test_helper(data, n_com=3, min_xpl_var=0.5)
        self.__pca_test_helper(data, n_com=10, min_xpl_var=0.7)
        self.__pca_test_helper(data, n_com=32, min_xpl_var=1)

    def __pca_test_helper(self, data, n_com, min_xpl_var):
        """Fit PCA with ``n_com`` components and verify variance, shape and projection."""
        pca = PCA(n_components=n_com)
        pca_model = pca(data)
        pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
        # Small tolerance so a ratio that rounds just below the bound passes.
        self.assertGreaterEqual(pca_xpl_var + 1e-6, min_xpl_var)
        self.assertEqual(n_com, pca_model.n_components)
        self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
        # Transforming the table must equal the manual centered projection.
        proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
        np.testing.assert_almost_equal(pca_model(data).X, proj)

    def test_sparse_pca(self):
        """Run the SparsePCA checks on the first 100 rows."""
        data = self.ionosphere[:100]
        self.__sparse_pca_test_helper(data, n_com=3, max_err=1500)
        self.__sparse_pca_test_helper(data, n_com=10, max_err=1000)
        self.__sparse_pca_test_helper(data, n_com=32, max_err=500)

    def __sparse_pca_test_helper(self, data, n_com, max_err):
        """Fit SparsePCA and verify component count, shape and final error."""
        sparse_pca = SparsePCA(n_components=n_com, ridge_alpha=0.001,
                               random_state=0)
        pca_model = sparse_pca(data)
        self.assertEqual(n_com, pca_model.n_components)
        self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
        self.assertLessEqual(pca_model.error_[-1], max_err)

    def test_randomized_pca(self):
        """Run the randomized-solver PCA checks for several component counts."""
        data = self.ionosphere
        self.__rnd_pca_test_helper(data, n_com=3, min_xpl_var=0.5)
        self.__rnd_pca_test_helper(data, n_com=10, min_xpl_var=0.7)
        self.__rnd_pca_test_helper(data, n_com=32, min_xpl_var=0.98)

    def __rnd_pca_test_helper(self, data, n_com, min_xpl_var):
        """Fit PCA with the randomized solver and verify variance, shape and projection."""
        rnd_pca = PCA(n_components=n_com, svd_solver='randomized')
        pca_model = rnd_pca(data)
        pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
        self.assertGreaterEqual(pca_xpl_var, min_xpl_var)
        self.assertEqual(n_com, pca_model.n_components)
        self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
        proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
        np.testing.assert_almost_equal(pca_model(data).X, proj)

    def __assert_same_decomposition(self, full_model, randomized_model):
        """Assert two fitted models agree on components, variance and singular values."""
        np.testing.assert_almost_equal(
            full_model.components_, randomized_model.components_, decimal=8
        )
        np.testing.assert_almost_equal(
            full_model.explained_variance_,
            randomized_model.explained_variance_, decimal=8
        )
        np.testing.assert_almost_equal(
            full_model.singular_values_,
            randomized_model.singular_values_, decimal=8
        )

    def test_improved_randomized_pca_properly_called(self):
        """``randomized_pca`` is called for the randomized solver only."""
        # It doesn't matter what we put into the matrix
        x_ = np.random.normal(0, 1, (100, 20))
        x = Table.from_numpy(Domain.from_numpy(x_), x_)

        # ``pca`` is the module under test here.  Keep the real function so
        # the wrapping mock can be removed again; otherwise the patched
        # attribute would leak into every test that runs afterwards.
        original = pca.randomized_pca
        pca.randomized_pca = MagicMock(wraps=original)
        try:
            PCA(10, svd_solver="randomized", random_state=42)(x)
            pca.randomized_pca.assert_called_once()

            pca.randomized_pca.reset_mock()
            PCA(10, svd_solver="arpack", random_state=42)(x)
            pca.randomized_pca.assert_not_called()
        finally:
            pca.randomized_pca = original

    def test_improved_randomized_pca_dense_data(self):
        """Randomized PCA should work well on dense data."""
        random_state = check_random_state(42)

        # First a tall, skinny matrix, then a short, fat one.  The draws and
        # fits stay in the original order because they share random_state.
        for shape in ((100, 20), (20, 100)):
            x_ = random_state.normal(0, 1, shape)
            x = Table.from_numpy(Domain.from_numpy(x_), x_)

            full = PCA(10, svd_solver="full", random_state=random_state)(x)
            randomized = PCA(10, svd_solver="randomized",
                             random_state=random_state)(x)
            self.__assert_same_decomposition(full, randomized)

    def test_improved_randomized_pca_sparse_data(self):
        """Randomized PCA should work well on sparse data."""
        random_state = check_random_state(42)

        # First a tall, skinny matrix, then a short, fat one.  The draws and
        # fits stay in the original order because they share random_state.
        for shape in ((100, 20), (20, 100)):
            x_ = random_state.negative_binomial(1, 0.5, shape)
            x = Table.from_numpy(Domain.from_numpy(x_), x_).to_sparse()

            # The full solver is fitted on the dense copy of the same data.
            full = PCA(10, svd_solver="full",
                       random_state=random_state)(x.to_dense())
            randomized = PCA(10, svd_solver="randomized",
                             random_state=random_state)(x)
            self.__assert_same_decomposition(full, randomized)

    @unittest.skipIf(sklearn_version.startswith('0.20'),
                     "https://github.com/scikit-learn/scikit-learn/issues/12234")
    def test_incremental_pca(self):
        """Run the IncrementalPCA checks for two component counts."""
        data = self.ionosphere
        self.__ipca_test_helper(data, n_com=3, min_xpl_var=0.49)
        self.__ipca_test_helper(data, n_com=32, min_xpl_var=1)

    def __ipca_test_helper(self, data, n_com, min_xpl_var):
        """Fit IncrementalPCA on every second row, then partially fit the rest.

        After the first fit the leading component must differ from regular
        PCA's; after ``partial_fit`` on the remaining rows it must agree
        (up to sign).
        """
        pca = IncrementalPCA(n_components=n_com)
        pca_model = pca(data[::2])
        pca_xpl_var = np.sum(pca_model.explained_variance_ratio_)
        self.assertGreaterEqual(pca_xpl_var + 1e-6, min_xpl_var)
        self.assertEqual(n_com, pca_model.n_components)
        self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape)
        proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T)
        np.testing.assert_almost_equal(pca_model(data).X, proj)

        pc1_ipca = pca_model.components_[0]
        self.assertAlmostEqual(np.linalg.norm(pc1_ipca), 1)
        pc1_pca = PCA(n_components=n_com)(data).components_[0]
        self.assertAlmostEqual(np.linalg.norm(pc1_pca), 1)
        # abs() of the dot product ignores the arbitrary sign of a component.
        self.assertNotAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 2)
        pc1_ipca = pca_model.partial_fit(data[1::2]).components_[0]
        self.assertAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 4)

    def test_truncated_svd(self):
        """Run the TruncatedSVD checks for several component counts."""
        data = self.ionosphere
        self.__truncated_svd_test_helper(data, n_components=3, min_variance=0.5)
        self.__truncated_svd_test_helper(data, n_components=10, min_variance=0.7)
        self.__truncated_svd_test_helper(data, n_components=31, min_variance=0.99)

    def __truncated_svd_test_helper(self, data, n_components, min_variance):
        """Fit TruncatedSVD and verify variance, shape and (uncentered) projection."""
        model = TruncatedSVD(n_components=n_components)(data)
        svd_variance = np.sum(model.explained_variance_ratio_)
        self.assertGreaterEqual(svd_variance + 1e-6, min_variance)
        self.assertEqual(n_components, model.n_components)
        self.assertEqual((n_components, data.X.shape[1]), model.components_.shape)
        proj = np.dot(data.X, model.components_.T)
        np.testing.assert_almost_equal(model(data).X, proj)

    def test_compute_value(self):
        """Transformed data must match a domain transform and survive pickling."""
        iris = self.iris
        pca = PCA(n_components=2)(iris)
        pca_iris = pca(iris)
        pca_iris2 = iris.transform(pca_iris.domain)
        np.testing.assert_almost_equal(pca_iris.X, pca_iris2.X)
        np.testing.assert_equal(pca_iris.Y, pca_iris2.Y)

        pca_iris3 = pickle.loads(pickle.dumps(pca_iris))
        np.testing.assert_almost_equal(pca_iris.X, pca_iris3.X)
        np.testing.assert_equal(pca_iris.Y, pca_iris3.Y)

    def test_transformed_domain_does_not_pickle_data(self):
        """The pickled domain's first compute_value must have ``transformed`` unset."""
        iris = self.iris
        pca = PCA(n_components=2)(iris)
        pca_iris = pca(iris)
        # Result is discarded; presumably the transform is run so that the
        # domain has been used for a transformation before pickling.
        iris.transform(pca_iris.domain)

        pca_iris2 = pickle.loads(pickle.dumps(pca_iris))
        self.assertIsNone(pca_iris2.domain[0].compute_value.transformed)

    def test_chain(self):
        """Continuizing before PCA, or via ``preprocessors``, must give the same result."""
        zoo_c = Continuize()(self.zoo)
        pca = PCA(n_components=3)(zoo_c)(self.zoo)
        pca2 = PCA(n_components=3)(zoo_c)(zoo_c)
        pp = [Continuize()]
        pca3 = PCA(n_components=3, preprocessors=pp)(self.zoo)(self.zoo)
        np.testing.assert_almost_equal(pca.X, pca2.X)
        np.testing.assert_almost_equal(pca.X, pca3.X)

    def test_PCA_scorer(self):
        """Check the attribute scores of the first component on iris."""
        data = self.iris
        pca = PCA(preprocessors=[Normalize()])
        pca.component = 1
        scores = pca.score_data(data)
        self.assertEqual(scores.shape[1], len(data.domain.attributes))
        self.assertEqual(['petal length', 'petal width'],
                         sorted([data.domain.attributes[i].name
                                 for i in np.argsort(scores[0])[-2:]]))
        self.assertEqual([round(s, 4) for s in scores[0]],
                         [0.5224, 0.2634, 0.5813, 0.5656])

    def test_PCA_scorer_component(self):
        """The score matrix must have one row per requested component."""
        pca = PCA()
        for i in range(1, len(self.zoo.domain.attributes) + 1):
            pca.component = i
            scores = pca.score_data(self.zoo)
            self.assertEqual(scores.shape,
                             (pca.component, len(self.zoo.domain.attributes)))

    def test_PCA_scorer_all_components(self):
        """Without ``component`` set, scores cover every component."""
        n_attr = len(self.iris.domain.attributes)
        pca = PCA()
        scores = pca.score_data(self.iris)
        self.assertEqual(scores.shape, (n_attr, n_attr))

    def test_max_components(self):
        """Default PCA on a 20x20 table keeps 20 components; an explicit count is honored."""
        d = np.random.RandomState(0).rand(20, 20)
        data = Table.from_numpy(None, d)
        pca = PCA()(data)
        self.assertEqual(len(pca.explained_variance_ratio_), 20)
        pca = PCA(n_components=10)(data)
        self.assertEqual(len(pca.explained_variance_ratio_), 10)