def check_decision_path(name):
    """Check ``decision_path`` of the tree estimator stored under ``name``.

    A depth-2 estimator from ``ALL_TREES`` is fitted on the iris data, then
    the dense node indicator matrix is checked for its shape, for being set
    at the leaf that ``apply`` returns for each sample, for summing to one
    over the leaf nodes of each sample, and for a longest row sum that is
    at least ``tree_.max_depth``.
    """
    data, target = iris.data, iris.target
    n_samples = data.shape[0]

    est = ALL_TREES[name](random_state=0, max_depth=2)
    est.fit(data, target)

    node_indicator = est.decision_path(data).toarray()
    expected_shape = (n_samples, est.tree_.node_count)
    assert_equal(node_indicator.shape, expected_shape)

    # The indicator must be set at the leaf each sample ends up in.
    leaf_ids = est.apply(data)
    leaf_flags = [node_indicator[row, leaf]
                  for row, leaf in enumerate(leaf_ids)]
    assert_array_almost_equal(leaf_flags, np.ones(shape=n_samples))

    # Summed over leaf nodes only, each sample's indicator must equal one.
    is_leaf = est.tree_.children_left == TREE_LEAF
    leaves_per_sample = np.dot(node_indicator, is_leaf)
    assert_array_almost_equal(leaves_per_sample, np.ones(shape=n_samples))

    # The largest per-sample indicator sum must not be below the tree depth.
    longest_path = node_indicator.sum(axis=1).max()
    assert_less_equal(est.tree_.max_depth, longest_path)
def test_min_max_scaler_1d():
    """Check MinMaxScaler on one-dimensional inputs."""
    rng = np.random.RandomState(0)
    data = rng.randn(5)
    data_before = data.copy()

    scaler = MinMaxScaler()
    scaled = scaler.fit(data).transform(data)
    assert_array_almost_equal(scaled.min(axis=0), 0.0)
    assert_array_almost_equal(scaled.max(axis=0), 1.0)

    # The inverse transform must give back the values we started from.
    restored = scaler.inverse_transform(scaled)
    assert_array_almost_equal(restored, data_before)

    # Same range check with a plain 1D list as input.
    values = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    scaled = scaler.fit(values).transform(values)
    assert_array_almost_equal(scaled.min(axis=0), 0.0)
    assert_array_almost_equal(scaled.max(axis=0), 1.0)

    # A constant feature only has to stay inside [0, 1].
    constant = np.zeros(5)
    scaler = MinMaxScaler()
    scaled = scaler.fit(constant).transform(constant)
    assert_greater_equal(scaled.min(), 0.)
    assert_less_equal(scaled.max(), 1.)
def test_pearsonr_mat(self):
    """Check the output shape of ``pearsonr_mat`` and its value range.

    The function is called once with ``self.mat`` alone and once with
    ``self.w_mat`` as second argument; both results must be 10 x 10, and
    the second result must lie within [-1, 1].
    """
    expected_shape = (10, 10)

    without_w = pearsonr_mat(self.mat)
    assert_equal(without_w.shape, expected_shape)

    with_w = pearsonr_mat(self.mat, self.w_mat)
    assert_equal(with_w.shape, expected_shape)

    # Bounds are only checked on the result computed with ``self.w_mat``.
    smallest = np.min(with_w)
    largest = np.max(with_w)
    assert_greater_equal(smallest, -1)
    assert_less_equal(largest, 1)
def test_optimization_minimizes_kl_divergence():
    """t-SNE should give a lower KL divergence with more iterations."""
    rng = check_random_state(0)
    X, _ = make_blobs(n_features=3, random_state=rng)

    # Fit with an increasing iteration budget, recording the final
    # divergence of every run.
    divergences = []
    for n_iter in (200, 250, 300):
        model = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
                     n_iter=n_iter, random_state=0)
        model.fit_transform(X)
        divergences.append(model.kl_divergence_)

    # Each run must not end above the run with the next smaller budget.
    for shorter_run, longer_run in zip(divergences[:-1], divergences[1:]):
        assert_less_equal(longer_run, shorter_run)
def test_prediction_proba(self):
    """Check range, ROC AUC, shape and row sums of ``predict_proba``."""
    proba = self.clf.predict_proba(self.X_test)

    # All values must lie within [0, 1].
    assert_greater_equal(proba.min(), 0)
    assert_less_equal(proba.max(), 1)

    # check performance, scoring with the second probability column
    auc = roc_auc_score(self.y_test, proba[:, 1])
    assert_greater(auc, self.roc_floor)

    # One row per test sample, one column per distinct training label.
    n_classes = len(np.unique(self.y_train))
    n_test = self.X_test.shape[0]
    assert_equal(proba.shape, (n_test, n_classes))

    # Every row must sum to one.
    row_sums = np.sum(proba, axis=1)
    assert_allclose(np.ones([n_test, ]), row_sums)
def test_precision():
    """Check the number of decimals written by ``export_graphviz``.

    With the random states used here, the impurity and the threshold are
    printed with exactly ``precision`` decimals, so they are checked with a
    strict equality. The reported value may carry fewer decimals, so it is
    only checked with a less-or-equal comparison.
    """
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)

    # Each generator is drawn from in the order X first, then y.
    regression_case = (
        rng_reg.random_sample((5, 2)),
        rng_reg.random_sample((5, )),
        DecisionTreeRegressor(criterion="friedman_mse", random_state=0,
                              max_depth=1),
    )
    classification_case = (
        rng_clf.random_sample((1000, 4)),
        rng_clf.randint(2, size=(1000, )),
        DecisionTreeClassifier(max_depth=1, random_state=0),
    )

    def fraction_length(match):
        # Length of the ".ddd" part of the matched number, dot included.
        return len(search(r"\.\d+", match.group()).group())

    for X, y, clf in (regression_case, classification_case):
        clf.fit(X, y)
        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None,
                                       precision=precision, proportion=True)
            expected_length = precision + 1

            # check value
            for match in finditer(r"value = \d+\.\d+", dot_data):
                assert_less_equal(fraction_length(match), expected_length)

            # check impurity
            if is_classifier(clf):
                impurity_pattern = r"gini = \d+\.\d+"
            else:
                impurity_pattern = r"friedman_mse = \d+\.\d+"
            for match in finditer(impurity_pattern, dot_data):
                assert_equal(fraction_length(match), expected_length)

            # check threshold
            for match in finditer(r"<= \d+\.\d+", dot_data):
                assert_equal(fraction_length(match), expected_length)
def test_data_generate(self):
    """Check sizes and outlier fraction of the data from ``generate_data``."""
    generated = generate_data(n_train=self.n_train,
                              n_test=self.n_test,
                              contamination=self.contamination)
    X_train, y_train, X_test, y_test = generated

    # Labels and samples must have matching lengths.
    assert_equal(y_train.shape[0], X_train.shape[0])
    assert_equal(y_test.shape[0], X_test.shape[0])

    # At most one sample may be missing from the requested count, and the
    # samples must have two columns.
    missing_train = self.n_train - X_train.shape[0]
    assert_less_equal(missing_train, 1)
    assert_equal(X_train.shape[1], 2)

    missing_test = self.n_test - X_test.shape[0]
    assert_less_equal(missing_test, 1)
    assert_equal(X_test.shape[1], 2)

    # The label sum relative to the requested size must match the
    # contamination within an absolute tolerance of 0.01.
    train_fraction = np.sum(y_train) / self.n_train
    assert_allclose(self.contamination, train_fraction, atol=0.01)

    test_fraction = np.sum(y_test) / self.n_test
    assert_allclose(self.contamination, test_fraction, atol=0.01)
def test_min_grad_norm():
    # Make sure that the parameter min_grad_norm is used correctly
    rng = check_random_state(0)
    X = rng.randn(100, 2)
    min_grad_norm = 0.002
    tsne = TSNE(min_grad_norm=min_grad_norm, verbose=2,
                random_state=0, method='exact')

    # Swap stdout for an in-memory buffer to collect the verbose output,
    # and always restore the real stdout afterwards.
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        captured = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = saved_stdout

    # extract the gradient norm from the verbose output
    grad_norms = []
    for line in captured.split('\n'):
        # When the computation is Finished just an old gradient norm value
        # is repeated that we do not need to store
        if 'Finished' in line:
            break

        position = line.find('gradient norm')
        if position < 0:
            continue
        tail = line[position:]
        number = tail.replace('gradient norm = ', '').split(' ')[0]
        grad_norms.append(float(number))

    # Compute how often the gradient norm is smaller than min_grad_norm
    grad_norms = np.array(grad_norms)
    n_small_norms = np.count_nonzero(grad_norms <= min_grad_norm)

    # The gradient norm can be smaller than min_grad_norm at most once,
    # because in the moment it becomes smaller the optimization stops
    assert_less_equal(n_small_norms, 1)
def test_precision():
    """Check how many decimals ``export_graphviz`` writes per ``precision``.

    With the current random states, impurity and threshold are printed
    with exactly ``precision`` decimals and are compared strictly. The
    value entry may be printed with fewer decimals, so it is only compared
    with less-or-equal.
    """
    rng_reg = RandomState(2)
    rng_clf = RandomState(8)

    features = (rng_reg.random_sample((5, 2)),
                rng_clf.random_sample((1000, 4)))
    targets = (rng_reg.random_sample((5, )),
               rng_clf.randint(2, size=(1000, )))
    estimators = (DecisionTreeRegressor(criterion="friedman_mse",
                                        random_state=0,
                                        max_depth=1),
                  DecisionTreeClassifier(max_depth=1, random_state=0))

    for X, y, clf in zip(features, targets, estimators):
        clf.fit(X, y)

        for precision in (4, 3):
            dot_data = export_graphviz(clf, out_file=None,
                                       precision=precision, proportion=True)
            # Expected length of the ".ddd" part, dot included.
            n_chars = precision + 1

            # check value
            for finding in finditer(r"value = \d+\.\d+", dot_data):
                fraction = search(r"\.\d+", finding.group()).group()
                assert_less_equal(len(fraction), n_chars)

            impurity = (r"gini = \d+\.\d+" if is_classifier(clf)
                        else r"friedman_mse = \d+\.\d+")

            # check impurity first, then threshold, both strictly
            for pattern in (impurity, r"<= \d+\.\d+"):
                for finding in finditer(pattern, dot_data):
                    fraction = search(r"\.\d+", finding.group()).group()
                    assert_equal(len(fraction), n_chars)
def test_data_generate(self):
    """Validate the shapes and label fraction returned by ``generate_data``."""
    X_train, y_train, X_test, y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        contamination=self.contamination)

    n_train_rows = X_train.shape[0]
    n_test_rows = X_test.shape[0]

    # One label per sample in both splits.
    assert_equal(y_train.shape[0], n_train_rows)
    assert_equal(y_test.shape[0], n_test_rows)

    # The row count may fall short of the request by at most one, and the
    # samples must have two columns.
    assert_less_equal(self.n_train - n_train_rows, 1)
    assert_equal(X_train.shape[1], 2)
    assert_less_equal(self.n_test - n_test_rows, 1)
    assert_equal(X_test.shape[1], 2)

    # Label sum over the requested size must be within 0.01 of the
    # requested contamination, for the training and then the test split.
    train_ratio = np.sum(y_train) / self.n_train
    assert_allclose(self.contamination, train_ratio, atol=0.01)
    test_ratio = np.sum(y_test) / self.n_test
    assert_allclose(self.contamination, test_ratio, atol=0.01)
def test_data_generate_cluster(self):
    """Check sizes and overall label fraction of ``generate_data_clusters``."""
    generated = generate_data_clusters(n_train=self.n_train,
                                       n_test=self.n_test,
                                       n_features=2,
                                       contamination=self.contamination,
                                       random_state=self.random_state)
    X_train, X_test, y_train, y_test = generated

    # Labels and samples must have matching lengths.
    assert_equal(y_train.shape[0], X_train.shape[0])
    assert_equal(y_test.shape[0], X_test.shape[0])

    # At most one sample may be missing from each requested count, and the
    # samples must have the two requested features.
    missing_train = self.n_train - X_train.shape[0]
    assert_less_equal(missing_train, 1)
    assert_equal(X_train.shape[1], 2)

    missing_test = self.n_test - X_test.shape[0]
    assert_less_equal(missing_test, 1)
    assert_equal(X_test.shape[1], 2)

    # The contamination is checked on both splits pooled together.
    n_flagged = np.sum(y_train) + np.sum(y_test)
    n_requested = self.n_train + self.n_test
    out_perc = n_flagged / n_requested
    assert_allclose(self.contamination, out_perc, atol=0.01)
def test_rbf_sampler():
    # test that RBFSampler approximates kernel on random data
    gamma = 10.

    # exact kernel used as the reference
    exact = rbf_kernel(X, Y, gamma=gamma)

    # approximate kernel from the inner product of the mapped data
    sampler = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
    X_mapped = sampler.fit_transform(X)
    Y_mapped = sampler.transform(Y)
    approx = np.dot(X_mapped, Y_mapped.T)

    signed_error = exact - approx
    # close to unbiased
    assert_less_equal(np.abs(np.mean(signed_error)), 0.01)

    abs_error = np.abs(signed_error)
    # nothing too far off
    assert_less_equal(np.max(abs_error), 0.1)
    # mean is fairly close
    assert_less_equal(np.mean(abs_error), 0.05)
def test_prediction_proba_unify(self):
    """Check ``predict_proba`` with ``method='unify'`` stays within [0, 1]."""
    proba = self.clf.predict_proba(self.X_test, method='unify')

    lowest = proba.min()
    assert_greater_equal(lowest, 0)

    highest = proba.max()
    assert_less_equal(highest, 1)
def test_prediction_proba(self):
    """Check that ``predict_proba`` on the test data stays within [0, 1]."""
    proba = self.clf.predict_proba(self.X_test)

    lowest = proba.min()
    assert_greater_equal(lowest, 0)

    highest = proba.max()
    assert_less_equal(highest, 1)
def check_limits(value, lower_bound, upper_bound):
    """Assert that ``lower_bound <= value < upper_bound``.

    The lower bound is inclusive and checked first; the upper bound is
    exclusive.
    """
    # inclusive lower bound
    assert_less_equal(lower_bound, value)
    # exclusive upper bound
    assert_greater(upper_bound, value)
def test_prediction_proba(self):
    """Check the range and the ROC AUC of ``predict_proba`` on test data."""
    proba = self.clf.predict_proba(self.X_test)

    # All values must lie within [0, 1].
    assert_greater_equal(proba.min(), 0)
    assert_less_equal(proba.max(), 1)

    # check performance: the AUC must exceed the configured floor
    auc = roc_auc_score(self.y_test, proba)
    assert_greater(auc, self.roc_floor)
def test_assert_less_equal():
    """Check that ``assert_less_equal`` accepts a <= b and rejects a > b."""
    # Strictly smaller and equal operands must both pass.
    for left, right in ((0, 1), (1, 1)):
        assert_less_equal(left, right)

    # A larger left operand must raise.
    assert_raises(AssertionError, assert_less_equal, 1, 0)
def test_prediction_proba_linear(self):
    """Check ``predict_proba`` with ``proba_method='linear'`` is in [0, 1]."""
    proba = self.clf.predict_proba(self.X_test, proba_method='linear')

    lowest = proba.min()
    assert_greater_equal(lowest, 0)

    highest = proba.max()
    assert_less_equal(highest, 1)
def check_limits(value, low, high):
    """Assert that ``low <= value <= high``; both bounds are inclusive.

    The lower bound is checked first.
    """
    # lower bound
    assert_less_equal(low, value)
    # upper bound
    assert_greater_equal(high, value)