def test_number_of_subsets_of_features():
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test checks their equivalence; see #4534 and #3824

    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
            n_features_list, n_features_to_select_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(estimator=SVC(kernel="linear"),
                  n_features_to_select=n_features_to_select, step=step)
        rfe.fit(X, y)
        # this number also equals the maximum of ranking_
        assert (np.max(rfe.ranking_) ==
                formula1(n_features, n_features_to_select, step))
        assert (np.max(rfe.ranking_) ==
                formula2(n_features, n_features_to_select, step))

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of 'grid_scores' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step
    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(43)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step)
        rfecv.fit(X, y)

        assert (rfecv.grid_scores_.shape[0] ==
                formula1(n_features, n_features_to_select, step))
        assert (rfecv.grid_scores_.shape[0] ==
                formula2(n_features, n_features_to_select, step))
def test_make_rng():
    # Check the check_random_state utility function behavior
    assert check_random_state(None) is np.random.mtrand._rand
    assert check_random_state(np.random) is np.random.mtrand._rand

    rng_42 = np.random.RandomState(42)
    assert check_random_state(42).randint(100) == rng_42.randint(100)

    rng_42 = np.random.RandomState(42)
    assert check_random_state(rng_42) is rng_42

    rng_42 = np.random.RandomState(42)
    assert check_random_state(43).randint(100) != rng_42.randint(100)

    assert_raises(ValueError, check_random_state, "some invalid seed")
def test_kd_tree_query(metric, k, dualtree, breadth_first):
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    kwargs = METRICS[metric]
    check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs)
def test_lda_dimension_warning(n_classes, n_features):
    # FIXME: Future warning to be removed in 0.23
    rng = check_random_state(0)
    n_samples = 10
    X = rng.randn(n_samples, n_features)
    # we create n_classes labels by repeating and truncating a
    # range(n_classes) until n_samples
    y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
    max_components = min(n_features, n_classes - 1)

    for n_components in [max_components - 1, None, max_components]:
        # if n_components <= min(n_classes - 1, n_features), no warning
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        assert_no_warnings(lda.fit, X, y)

    for n_components in [max_components + 1,
                         max(n_features, n_classes - 1) + 1]:
        # if n_components > min(n_classes - 1, n_features), raise warning
        # We test one unit higher than max_components, and then something
        # larger than both n_features and n_classes - 1 to ensure the test
        # works for any value of n_components
        lda = LinearDiscriminantAnalysis(n_components=n_components)
        msg = ("n_components cannot be larger than min(n_features, "
               "n_classes - 1). Using min(n_features, "
               "n_classes - 1) = min(%d, %d - 1) = %d components."
               % (n_features, n_classes, max_components))
        assert_warns_message(ChangedBehaviorWarning, msg, lda.fit, X, y)
        future_msg = ("In version 0.23, setting n_components > min("
                      "n_features, n_classes - 1) will raise a "
                      "ValueError. You should set n_components to None"
                      " (default), or a value smaller or equal to "
                      "min(n_features, n_classes - 1).")
        assert_warns_message(FutureWarning, future_msg, lda.fit, X, y)
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    # take the absolute value in place so the sample weights are non-negative
    np.abs(sample_weights, sample_weights)

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(dataset, weights,
                                                        intercept, n_samples,
                                                        n_features, n_classes)

    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def test_oneclass_decision_function():
    # Test OneClassSVM decision function
    clf = svm.OneClassSVM()
    rnd = check_random_state(2)

    # Generate train data
    X = 0.3 * rnd.randn(100, 2)
    X_train = np.r_[X + 2, X - 2]

    # Generate some regular novel observations
    X = 0.3 * rnd.randn(20, 2)
    X_test = np.r_[X + 2, X - 2]
    # Generate some abnormal novel observations
    X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2))

    # fit the model
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    # predict things
    y_pred_test = clf.predict(X_test)
    assert np.mean(y_pred_test == 1) > .9
    y_pred_outliers = clf.predict(X_outliers)
    assert np.mean(y_pred_outliers == -1) > .9
    dec_func_test = clf.decision_function(X_test)
    assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1)
    dec_func_outliers = clf.decision_function(X_outliers)
    assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1)
def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert rfe.score(X, y) == clf.score(iris.data, iris.target)
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
def test_compute_mi_cc():
    # For two continuous variables a good approach is to test on a bivariate
    # normal distribution, where the mutual information is known.

    # Mean of the distribution, irrelevant for mutual information.
    mean = np.zeros(2)

    # Set up the covariance matrix with a correlation coefficient of 0.5.
    sigma_1 = 1
    sigma_2 = 10
    corr = 0.5
    cov = np.array([
        [sigma_1**2, corr * sigma_1 * sigma_2],
        [corr * sigma_1 * sigma_2, sigma_2**2]
    ])

    # True theoretical mutual information; since det(cov) equals
    # sigma_1^2 * sigma_2^2 * (1 - corr^2), this reduces to
    # -0.5 * log(1 - corr^2).
    I_theory = (np.log(sigma_1) + np.log(sigma_2) -
                0.5 * np.log(np.linalg.det(cov)))

    rng = check_random_state(0)
    Z = rng.multivariate_normal(mean, cov, size=1000)

    x, y = Z[:, 0], Z[:, 1]

    # Theory and computed values won't be very close; assert only that the
    # first digit after the decimal point matches.
    for n_neighbors in [3, 5, 7]:
        I_computed = _compute_mi(x, y, False, False, n_neighbors)
        assert_almost_equal(I_computed, I_theory, 1)
def test_compute_mi_cd():
    # To test, define a joint distribution as follows:
    # p(x, y) = p(x) p(y | x)
    # X ~ Bernoulli(p)
    # (Y | x = 0) ~ Uniform(-1, 1)
    # (Y | x = 1) ~ Uniform(0, 2)

    # Use the following formula for mutual information:
    # I(X; Y) = H(Y) - H(Y | X)

    # Both entropies can be computed by hand:
    # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2 * ln(p/2) - 1/2 * ln(1/2)
    # H(Y | X) = ln(2)

    # Now we need to implement sampling from our distribution, which is
    # done easily using conditional distribution logic.

    n_samples = 1000
    rng = check_random_state(0)

    for p in [0.3, 0.5, 0.7]:
        x = rng.uniform(size=n_samples) > p

        y = np.empty(n_samples)
        mask = x == 0
        y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
        y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))

        I_theory = -0.5 * ((1 - p) * np.log(0.5 * (1 - p)) +
                           p * np.log(0.5 * p) + np.log(0.5)) - np.log(2)

        # Use the same tolerance as in test_compute_mi_cc.
        for n_neighbors in [3, 5, 7]:
            I_computed = _compute_mi(x, y, True, False, n_neighbors)
            assert_almost_equal(I_computed, I_theory, 1)
def test_sample_weight_deviance():
    # Test if deviance supports sample weights.
    rng = check_random_state(13)
    sample_weight = np.ones(100)
    reg_y = rng.rand(100)
    clf_y = rng.randint(0, 2, size=100)
    mclf_y = rng.randint(0, 3, size=100)

    for Loss in LOSS_FUNCTIONS.values():
        if Loss is None:
            continue
        if issubclass(Loss, RegressionLossFunction):
            k = 1
            y = reg_y
            p = reg_y
        else:
            k = 2
            y = clf_y
            p = clf_y
            if Loss.is_multi_class:
                k = 3
                y = mclf_y
                # one-hot encoding
                p = np.zeros((y.shape[0], k), dtype=np.float64)
                for i in range(k):
                    p[:, i] = y == i

        loss = Loss(k)
        deviance_w_w = loss(y, p, sample_weight)
        deviance_wo_w = loss(y, p)
        assert deviance_wo_w == deviance_w_w
def test_mutual_info_classif_mixed():
    # Here the target is discrete and there are two continuous and one
    # discrete feature. The idea of this test is clear from the code.
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], n_neighbors=3,
                             random_state=0)
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(X, y, discrete_features=[2],
                                    n_neighbors=n_neighbors, random_state=0)
        # Check that the continuous features have a higher MI with greater
        # n_neighbors.
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # n_neighbors should have no effect on the discrete feature,
        # so its MI should stay the same.
        assert mi_nn[2] == mi[2]
def test_ball_tree_pickle():
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    ind1, dist1 = bt1.query(X)
    ind1_pyfunc, dist1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        ind2, dist2 = bt2.query(X)
        ind2_pyfunc, dist2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(ind1, ind2)
        assert_array_almost_equal(dist1, dist2)

        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)
        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)

        assert isinstance(bt2, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def test_linearsvc_fit_sampleweight():
    # check correct result when sample_weight is 1
    n_samples = len(X)
    unit_weight = np.ones(n_samples)
    clf = svm.LinearSVC(random_state=0).fit(X, Y)
    clf_unitweight = svm.LinearSVC(random_state=0).\
        fit(X, Y, sample_weight=unit_weight)

    # check if same as sample_weight=None
    assert_array_equal(clf_unitweight.predict(T), clf.predict(T))
    assert_allclose(clf.coef_, clf_unitweight.coef_, 1, 0.0001)

    # check that fit(X) = fit([X1, X2, X3], sample_weight=[n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth
    random_state = check_random_state(0)
    random_weight = random_state.randint(0, 10, n_samples)
    lsvc_unflat = svm.LinearSVC(random_state=0).\
        fit(X, Y, sample_weight=random_weight)
    pred1 = lsvc_unflat.predict(T)

    X_flat = np.repeat(X, random_weight, axis=0)
    y_flat = np.repeat(Y, random_weight, axis=0)
    lsvc_flat = svm.LinearSVC(random_state=0).fit(X_flat, y_flat)
    pred2 = lsvc_flat.predict(T)

    assert_array_equal(pred1, pred2)
    assert_allclose(lsvc_unflat.coef_, lsvc_flat.coef_, 1, 0.0001)
def test_sample_weight_init_estimators():
    # Smoke test for init estimators with sample weights.
    rng = check_random_state(13)
    X = rng.rand(100, 2)
    sample_weight = np.ones(100)
    reg_y = rng.rand(100)
    clf_y = rng.randint(0, 2, size=100)

    for Loss in LOSS_FUNCTIONS.values():
        if Loss is None:
            continue
        if issubclass(Loss, RegressionLossFunction):
            k = 1
            y = reg_y
        else:
            k = 2
            y = clf_y
            if Loss.is_multi_class:
                # skip multiclass
                continue

        loss = Loss(k)
        init_est = loss.init_estimator()
        init_est.fit(X, y)
        out = loss.get_init_raw_predictions(X, init_est)
        assert out.shape == (y.shape[0], 1)

        sw_init_est = loss.init_estimator()
        sw_init_est.fit(X, y, sample_weight=sample_weight)
        sw_out = loss.get_init_raw_predictions(X, sw_init_est)
        assert sw_out.shape == (y.shape[0], 1)

        # check if predictions match
        assert_allclose(out, sw_out, rtol=1e-2)
def test_iforest_subsampled_features():
    # Non-regression test for #5732, which failed at predict time.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
def test_query_haversine():
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
def test_sample_weight_smoke():
    rng = check_random_state(13)
    y = rng.rand(100)
    pred = rng.rand(100)

    # least squares
    loss = LeastSquaresError(1)
    loss_wo_sw = loss(y, pred)
    loss_w_sw = loss(y, pred, np.ones(pred.shape[0], dtype=np.float32))
    assert_almost_equal(loss_wo_sw, loss_w_sw)
def generate_dataset(n_samples, centers, covariances, random_state=None):
    """Generate multivariate normal data given centers and covariances."""
    rng = check_random_state(random_state)
    X = np.vstack([
        rng.multivariate_normal(mean, cov, size=n_samples // len(centers))
        for mean, cov in zip(centers, covariances)
    ])
    y = np.hstack([[clazz] * (n_samples // len(centers))
                   for clazz in range(len(centers))])
    return X, y
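# A minimal usage sketch of generate_dataset defined above. The two centers
# and covariances below are arbitrary values chosen only for illustration;
# they do not come from any existing test.
def example_generate_dataset_usage():
    centers = [np.array([0., 0.]), np.array([5., 5.])]
    covariances = [np.eye(2), 2 * np.eye(2)]
    X, y = generate_dataset(200, centers, covariances, random_state=0)
    # 100 samples drawn around each center, two features per sample
    assert X.shape == (200, 2)
    assert_array_equal(np.unique(y), [0, 1])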
def test_safe_mask():
    random_state = check_random_state(0)
    X = random_state.rand(5, 4)
    X_csr = sp.csr_matrix(X)
    mask = [False, False, True, True, True]

    mask = safe_mask(X, mask)
    assert X[mask].shape[0] == 3

    mask = safe_mask(X_csr, mask)
    assert X_csr[mask].shape[0] == 3
def test_omp_reaches_least_squares():
    # Use small simple data; it's a sanity check but OMP can stop early
    rng = check_random_state(0)
    n_samples, n_features = (10, 8)
    n_targets = 3
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_targets)
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_features)
    lstsq = LinearRegression()
    omp.fit(X, Y)
    lstsq.fit(X, Y)
    assert_array_almost_equal(omp.coef_, lstsq.coef_)
def test_rfecv_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=MockClassifier(), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
def test_kd_tree_two_point(dualtree):
    n_samples, n_features = (100, 3)
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    kdt = KDTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree)
    assert_array_almost_equal(counts, counts_true)
def test_input_data_size():
    # Regression test for #6288
    # Previously, a metric requiring a particular input dimension would fail
    def custom_metric(x, y):
        assert x.shape[0] == 3
        return np.sum((x - y) ** 2)

    rng = check_random_state(0)
    X = rng.rand(10, 3)

    # custom_metric returns the squared euclidean distance, so compare
    # against the squared pairwise euclidean distances
    pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric)
    eucl = DistanceMetric.get_metric("euclidean")
    assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)
def make_data(random_state, n_samples_per_center, grid_size, scale):
    """Generate noisy samples around a square grid of cluster centers."""
    random_state = check_random_state(random_state)
    centers = np.array([[i, j]
                        for i in range(grid_size)
                        for j in range(grid_size)])
    n_clusters_true, n_features = centers.shape

    noise = random_state.normal(
        scale=scale, size=(n_samples_per_center, centers.shape[1]))

    X = np.concatenate([c + noise for c in centers])
    y = np.concatenate([[i] * n_samples_per_center
                        for i in range(n_clusters_true)])
    return shuffle(X, y, random_state=random_state)
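# A minimal usage sketch of make_data defined above. The argument values
# below are arbitrary and chosen only for illustration; they do not come
# from the surrounding tests.
def example_make_data_usage():
    X, y = make_data(random_state=0, n_samples_per_center=10,
                     grid_size=3, scale=0.1)
    # a 3 x 3 grid of centers with 10 noisy samples around each one
    assert X.shape == (90, 2)
    assert len(np.unique(y)) == 9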
def test_linear_regression_sparse(random_state=0):
    # Test that linear regression also works with sparse data
    random_state = check_random_state(random_state)
    for i in range(10):
        n = 100
        X = sparse.eye(n, n)
        beta = random_state.rand(n)
        y = X * beta[:, np.newaxis]

        ols = LinearRegression()
        ols.fit(X, y.ravel())
        assert_array_almost_equal(beta, ols.coef_ + ols.intercept_)

        assert_array_almost_equal(ols.predict(X) - y.ravel(), 0)
def test_rfe_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # dense model
    clf = MockClassifier()
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]
    assert X_r.shape == iris.data.shape
def test_kd_tree_kde(kernel, h):
    n_samples, n_features = (100, 3)
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    kdt = KDTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    for rtol in [0, 1E-5]:
        for atol in [1E-6, 1E-2]:
            for breadth_first in (True, False):
                check_results(kernel, h, atol, rtol,
                              breadth_first, Y, kdt, dens_true)
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    # Test multiple-outcome linear regressions with sparse data
    random_state = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=random_state)
    X = sparse.coo_matrix(X)
    Y = np.vstack((y, y)).T
    n_features = X.shape[1]

    ols = LinearRegression()
    ols.fit(X, Y)
    assert ols.coef_.shape == (2, n_features)
    Y_pred = ols.predict(X)
    ols.fit(X, y.ravel())
    y_pred = ols.predict(X)
    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, Y_pred, decimal=3)
def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring='accuracy',
        cv=GroupKFold(n_splits=2)
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0