def test_fit_resample_nn_obj():
    """Borderline-1 SMOTE accepts NearestNeighbors instances.

    Both ``m_neighbors`` and ``k_neighbors`` are passed as estimator objects
    and the output of ``fit_resample`` is compared with stored references.
    """
    m_estimator = NearestNeighbors(n_neighbors=11)
    k_estimator = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='borderline1',
        k_neighbors=k_estimator,
        m_neighbors=m_estimator,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.3765279, -0.2009615],
        [0.55276636, -0.10550373],
        [0.45413452, -0.08883319],
        [1.21118683, -0.22817957],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_partial_dependence_helpers(est, method, target_feature):
    """Compare a partial-dependence helper with a hand-computed average.

    The output of ``_partial_dependence_brute`` or
    ``_partial_dependence_recursion`` must match what is obtained by forcing
    the target feature to a fixed value and averaging the predictions over
    all samples. Since both methods are checked against the same reference,
    this also verifies that they agree with each other.
    """
    X, y = make_regression(random_state=0)
    # The recursion method does not take the GBDT 'init' estimator (here the
    # average prediction) into account, for technical reasons. Centering the
    # target makes that mean 0 so that this 'bug' has no effect.
    y = y - y.mean()
    est.fit(X, y)

    # The target feature is set to .5 and then to 123.
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5], [123]])

    if method == 'brute':
        pdp = _partial_dependence_brute(est, grid, features, X,
                                        response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    def _mean_prediction_at(value):
        # Overwrite the target column on a copy and average the predictions.
        X_forced = X.copy()
        X_forced[:, target_feature] = value
        return est.predict(X_forced).mean()

    mean_predictions = [_mean_prediction_at(value) for value in (.5, 123)]

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))
    assert_allclose(pdp, mean_predictions, atol=1e-3)
def test_ridge_regression_dtype_stability(solver):
    """ridge_regression keeps the input dtype and agrees across precisions.

    The same problem is solved in float32 and float64; each result must have
    the dtype of its inputs and the two solutions must be close.

    Parameters
    ----------
    solver : str
        Name of the solver forwarded to ``ridge_regression``.
    """
    random_state = np.random.RandomState(0)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    # Draw the noise from the local generator as well. The previous code used
    # ``rng``, a name that is not defined in this function, so the data
    # depended on state outside the test (or raised NameError).
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    # Looser tolerance on 32-bit Windows, as in the original condition.
    rtol = 1e-2 if os.name == 'nt' and _IS_32BIT else 1e-5

    results = dict()
    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(
            X.astype(current_dtype),
            y.astype(current_dtype),
            alpha=alpha,
            solver=solver,
            random_state=random_state,
            sample_weight=None,
            max_iter=500,
            tol=1e-10,
            return_n_iter=False,
            return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], rtol=rtol)
def test_score_to_label(self):
    """score_to_label output for two outlier fractions on fixed scores."""
    manual_scores = [0.1, 0.4, 0.2, 0.3, 0.5, 0.9, 0.7, 1, 0.8, 0.6]
    cases = (
        (0.1, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
        (0.3, [0, 0, 0, 0, 0, 1, 0, 1, 1, 0]),
    )
    for fraction, expected_labels in cases:
        labels = score_to_label(manual_scores, outliers_fraction=fraction)
        assert_allclose(labels, expected_labels)
def test_transform_target_regressor_multi_to_single():
    """A 2-column target reduced to one column predicts with shape (100, 1).

    The reduction is supplied once as a function returning a 2D column and
    once as a function returning a 1D vector; both must give the same
    predictions.
    """
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def norm_as_column(y):
        norm = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return norm[:, np.newaxis]

    def norm_as_vector(y):
        # same reduction, but forced to return only a 1D array
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    def inverse_func(y):
        return y

    regressor_2d = TransformedTargetRegressor(
        func=norm_as_column, inverse_func=inverse_func, check_inverse=False)
    regressor_2d.fit(X, y)
    y_pred_2d_func = regressor_2d.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    regressor_1d = TransformedTargetRegressor(
        func=norm_as_vector, inverse_func=inverse_func, check_inverse=False)
    regressor_1d.fit(X, y)
    y_pred_1d_func = regressor_1d.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
def test_wpearsonr(self):
    """wpearsonr on the fixture vectors, without and with weights."""
    # TODO: if the unweighted version changes, the wp[0] indexing below
    # should be changed accordingly
    unweighted = wpearsonr(self.a, self.b)
    assert_allclose(unweighted[0], 0.6956083, atol=0.01)

    weighted = wpearsonr(self.a, self.b, w=self.w)
    assert_allclose(weighted, 0.5477226, atol=0.01)
def test_multilabel_representation_invariance():
    """Multilabel metrics agree on dense and sparse indicator inputs."""
    # Generate some data
    n_classes = 4
    n_samples = 50

    _, y1 = make_multilabel_classification(
        n_features=1, n_classes=n_classes, random_state=0,
        n_samples=n_samples, allow_unlabeled=True)
    _, y2 = make_multilabel_classification(
        n_features=1, n_classes=n_classes, random_state=1,
        n_samples=n_samples, allow_unlabeled=True)

    # To make sure at least one empty label is present
    empty_row = [[0] * n_classes]
    y1 = np.vstack([y1, empty_row])
    y2 = np.vstack([y2, empty_row])

    y1_sparse_indicator = sp.coo_matrix(y1)
    y2_sparse_indicator = sp.coo_matrix(y2)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name

        dense_measure = metric(y1, y2)
        sparse_measure = metric(y1_sparse_indicator, y2_sparse_indicator)

        # Check representation invariance
        assert_allclose(sparse_measure,
                        dense_measure,
                        err_msg="%s failed representation invariance between "
                                "dense and sparse indicator formats." % name)
def test_iterative_imputer_all_missing():
    """With every entry missing, the result equals the initial imputation."""
    n_rows, n_cols = 100, 3
    # All entries are 0 and 0 is declared as the missing-value marker.
    X = np.zeros((n_rows, n_cols))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    expected = imputer.initial_imputer_.transform(X)
    assert_allclose(X_imputed, expected)
def test_normalize_option_multilabel_classification():
    """Check the ``normalize`` option of metrics in the multilabel case.

    For every metric in ``METRICS_WITH_NORMALIZE_OPTION`` the normalized
    measure must be strictly positive and equal to the unnormalized measure
    divided by the number of samples.
    """
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present, append an all-zero
    # row. The previous ``y += [0] * n_classes`` was a broadcast addition of
    # zeros on an ndarray, i.e. a no-op that appended nothing.
    y_true = np.vstack([y_true, [[0] * n_classes]])
    y_pred = np.vstack([y_pred, [[0] * n_classes]])
    # Normalize by the actual number of rows, which now includes the
    # appended one.
    n_samples = y_true.shape[0]

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        # Strict positivity guards against a trivially satisfied comparison.
        assert_array_less(-1.0 * measure, 0,
                          err_msg="We failed to test correctly the normalize "
                                  "option")
        assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                        measure, err_msg="Failed with %s" % name)
def test_sample_with_nn_svm():
    """SVM SMOTE with a NearestNeighbors object and an explicit SVC."""
    k_estimator = NearestNeighbors(n_neighbors=6)
    svc = SVC(gamma='scale', random_state=RND_SEED)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=k_estimator,
        svm_estimator=svc,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.47436887, -0.2645749],
        [1.07844562, -0.19435291],
        [1.44228238, -1.31256615],
        [1.25636713, -1.04463226],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_ada_fit_sample_nn_obj():
    """ADASYN.fit_sample with ``n_neighbors`` given as an estimator."""
    neighbors = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbors)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.94899098, -0.30508981],
        [0.28204936, -0.13953426],
        [1.58028868, -0.04089947],
        [0.66117333, -0.28009063],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_sensitivity_specificity_extra_labels(average, expected_specificty):
    """specificity_score when ``labels`` lists classes absent from the data."""
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]
    # Labels 0 and 4 appear in neither y_true nor y_pred.
    all_labels = [0, 1, 2, 3, 4]

    actual = specificity_score(
        y_true, y_pred, labels=all_labels, average=average)

    assert_allclose(expected_specificty, actual, rtol=R_TOL)
def test_geometric_mean_support_binary():
    """geometric_mean_score on the binary prediction fixture is about 0.77."""
    y_true, y_pred, _ = make_prediction(binary=True)

    # compute the geometric mean for the binary problem
    score = geometric_mean_score(y_true, y_pred)

    assert_allclose(score, 0.77, rtol=R_TOL)
def test_iterative_imputer_additive_matrix():
    """IterativeImputer recovers held-out rows of an additive matrix.

    The complete matrix is built by accumulating averaged column pairs of two
    random matrices; a quarter of the entries is masked, the imputer is fitted
    on the first half of the rows and evaluated on the second half.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 10
    A = rng.randn(n_rows, n_cols)
    B = rng.randn(n_rows, n_cols)

    X_filled = np.zeros(A.shape)
    for i in range(n_cols):
        for j in range(n_cols):
            target_col = (i + j) % n_cols
            X_filled[:, target_col] += (A[:, i] + B[:, j]) / 2

    # a quarter is randomly missing
    nan_mask = rng.rand(n_rows, n_cols) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    half = n_rows // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    imputer = IterativeImputer(max_iter=10, verbose=1, random_state=rng)
    imputer = imputer.fit(X_train)
    X_test_est = imputer.transform(X_test)

    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
def test_one_hot_encoder_pandas():
    """One-hot encoding of a two-column pandas DataFrame (skips w/o pandas)."""
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
    encoded = check_categorical_onehot(frame)

    expected = [[1, 0, 1, 0],
                [0, 1, 0, 1]]
    assert_allclose(encoded, expected)
def test_allknn_fit_resample():
    """AllKNN with default parameters reproduces the stored resampled set."""
    sampler = AllKNN()
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_allclose(y_resampled, expected_y, rtol=R_TOL)
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    """Distances with data-derived metric parameters match scipy.

    The result of ``dist_function`` (run with ``n_jobs`` workers) must be the
    same as scipy's distances, both when scipy derives the parameters itself
    and when ``V`` / ``VI`` are computed here and passed explicitly.
    """
    # working_memory=1 so that the computation is split in more than 1 chunk
    with config_context(working_memory=1):
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            pooled = X
            expected_dist_default_params = squareform(
                pdist(X, metric=metric))
        else:
            Y = rng.random_sample((1000, 10))
            pooled = np.vstack([X, Y])
            expected_dist_default_params = cdist(X, Y, metric=metric)

        # Explicit parameters are derived from all the data involved.
        if metric == "seuclidean":
            params = {'V': np.var(pooled, axis=0, ddof=1)}
        else:
            params = {'VI': np.linalg.inv(np.cov(pooled.T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)

        chunks = dist_function(X, Y, metric=metric, n_jobs=n_jobs)
        dist = np.vstack(tuple(chunks))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    """RidgeCV with ``gcv_mode`` agrees with explicit n_samples-fold CV.

    The selected alpha, the coefficients and the intercept are compared
    between a RidgeCV using ``cv=n_samples`` and one using ``gcv_mode``.
    """
    n_samples, n_features = X_shape
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1

    X, y = _make_sparse_offset_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_targets=n_targets,
        random_state=0,
        shuffle=False,
        noise=noise,
        n_informative=5,
    )
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]

    # Reference: one fold per sample, scored with negative MSE.
    loo_ridge = RidgeCV(cv=n_samples,
                        fit_intercept=fit_intercept,
                        alphas=alphas,
                        scoring='neg_mean_squared_error',
                        normalize=normalize)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode,
                        fit_intercept=fit_intercept,
                        alphas=alphas,
                        normalize=normalize)

    loo_ridge.fit(X, y)

    # The GCV estimator is fitted on the container built by X_constructor.
    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
def test_dtype_match(solver):
    """Ridge keeps float32/float64 dtypes and gives close coefficients."""
    rng = np.random.RandomState(0)
    alpha = 1.0
    n_samples, n_features = 6, 5

    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    ridge_params = dict(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)

    # Check type consistency 32bits
    ridge_32 = Ridge(**ridge_params)
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(**ridge_params)
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debug
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
def test_ridge_regression_dtype_stability(solver, seed):
    """ridge_regression keeps the input dtype and agrees across precisions.

    The same seeded problem is solved in float32 and float64; each result
    must have the dtype of its inputs and both must be close.
    """
    random_state = np.random.RandomState(seed)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    noise = 0.01 * random_state.randn(n_samples)
    y = np.dot(X, coef) + noise
    alpha = 1.0

    # XXX: Sparse CG seems to be far less numerically stable than the
    # others, maybe we should not enable float32 for this one.
    if solver == "sparse_cg":
        atol = 1e-3
    else:
        atol = 1e-5

    results = {}
    for current_dtype in (np.float32, np.float64):
        X_cast = X.astype(current_dtype)
        y_cast = y.astype(current_dtype)
        results[current_dtype] = ridge_regression(
            X_cast,
            y_cast,
            alpha=alpha,
            solver=solver,
            random_state=random_state,
            sample_weight=None,
            max_iter=500,
            tol=1e-10,
            return_n_iter=False,
            return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], atol=atol)
def test_iterative_imputer_early_stopping():
    """Early stopping of IterativeImputer on a rank-one matrix.

    Checks that (1) the imputation sequence length is ``d * n_iter_``,
    (2) re-running with ``max_iter`` set to the observed ``n_iter_`` gives the
    same result, and (3) ``tol=0`` makes the imputer use all iterations.
    """
    rng = np.random.RandomState(0)
    n, d = 50, 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)

    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    # Run with a tolerance and record where it stopped.
    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    # Capping max_iter at that iteration count must not change the output.
    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    # With a zero tolerance every iteration is expected to be used.
    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
def check_samplers_pandas(name, Sampler):
    """Check that a sampler gives the same output for pandas and NumPy input.

    Parameters
    ----------
    name : str
        Sampler name, looked up in ``DONT_HAVE_RANDOM_STATE`` to decide
        whether the random state is set.
    Sampler : callable
        Sampler class (or factory) called to build the instances under test.
    """
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    X_pd = pd.DataFrame(X)

    # Build one default instance and reuse it for the type dispatch. The
    # previous code created an unused ``sampler`` local (shadowed by the loop
    # below) and a fresh instance for every isinstance check.
    prototype = Sampler()
    if isinstance(prototype, SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]
    elif isinstance(prototype, NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [prototype]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
        assert_allclose(y_res_pd, y_res)
def test_symmetry():
    """Test the symmetry of score and loss functions.

    Metrics in SYMMETRIC_METRICS must give the same value when ``y_true`` and
    ``y_pred`` are swapped; metrics in NOT_SYMMETRIC_METRICS must not.
    """
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20, ))
    y_pred = random_state.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics
    classified = SYMMETRIC_METRICS.union(
        NOT_SYMMETRIC_METRICS,
        set(THRESHOLDED_METRICS),
        METRIC_UNDEFINED_BINARY_MULTICLASS)
    assert_equal(classified, set(ALL_METRICS))

    overlap = SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS)
    assert_equal(overlap, set())

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        forward = metric(y_true, y_pred)
        backward = metric(y_pred, y_true)
        assert_allclose(forward, backward,
                        err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]

        # use context manager to supply custom error message
        with assert_raises(AssertionError) as cm:
            assert_array_equal(metric(y_true, y_pred),
                               metric(y_pred, y_true))
            # Only reached when the comparison above did not raise, i.e. when
            # the metric unexpectedly behaves symmetrically.
            cm.msg = ("%s seems to be symmetric" % name)
def test_predict_rank(self):
    """_predict_rank keeps the ordering of decision_function scores."""
    test_scores = self.clf.decision_function(self.X_test)
    test_ranks = self.clf._predict_rank(self.X_test)

    # assert the order is preserved (rank positions may differ by up to 2)
    assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=2)

    # ranks must lie strictly between -0.1 and n_train + 1
    upper_bound = self.X_train.shape[0] + 1
    assert_array_less(test_ranks, upper_bound)
    assert_array_less(-0.1, test_ranks)
def test_nanmedian(axis, expected_median):
    """nanmedian on a 2x6 array with NaNs, over the given axis."""
    nan = np.nan
    X = np.array([[1, 1, 1, 2, nan, nan],
                  [nan, 6, 6, 6, 7, nan]])

    median = nanmedian(X, axis=axis)

    if axis is not None:
        assert_allclose(median, expected_median)
    else:
        # With axis=None the result is compared as a single value.
        assert median == pytest.approx(expected_median)
def test_predict_rank_normalized(self):
    """Normalized _predict_rank keeps the score ordering and its range."""
    test_scores = self.clf.decision_function(self.X_test)
    test_ranks = self.clf._predict_rank(self.X_test, normalized=True)

    # assert the order is preserved (rank positions may differ by up to 2)
    assert_allclose(rankdata(test_ranks), rankdata(test_scores), atol=2)

    # normalized ranks must lie strictly between -0.1 and 1.01
    assert_array_less(test_ranks, 1.01)
    assert_array_less(-0.1, test_ranks)
def test_iba_geo_mean_binary():
    """Index-balanced geometric mean on the binary prediction fixture."""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Wrap geometric_mean_score with the index balanced accuracy decorator.
    decorate = make_index_balanced_accuracy(alpha=0.5, squared=True)
    iba_gmean = decorate(geometric_mean_score)

    score = iba_gmean(y_true, y_pred)

    assert_allclose(score, 0.5948, rtol=R_TOL)
def test_data_generate2(self):
    """generate_data with n_features=3 returns matrices of matching shape."""
    n_features = 3
    generated = generate_data(n_train=self.n_train,
                              n_test=self.n_test,
                              n_features=n_features,
                              contamination=self.contamination)
    X_train, y_train, X_test, y_test = generated

    assert_allclose(X_train.shape, (self.n_train, n_features))
    assert_allclose(X_test.shape, (self.n_test, n_features))
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    """Compare KernelDensity estimates on ``Y`` with the true densities.

    A KernelDensity built with the given kernel, bandwidth and tolerances is
    fitted on ``X``. Both the per-sample densities and the exponentiated total
    score on ``Y`` are compared with ``dens_true`` (respectively its product).
    """
    # The comparison never uses a relative tolerance below 1e-7.
    effective_rtol = max(1E-7, rtol)

    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    estimated_dens = np.exp(log_dens)
    assert_allclose(estimated_dens, dens_true,
                    atol=atol, rtol=effective_rtol)

    estimated_total = np.exp(kde.score(Y))
    assert_allclose(estimated_total, np.prod(dens_true),
                    atol=atol, rtol=effective_rtol)
def test_scaling_fit_transform():
    """SparsePCA: fit_transform and transform agree on the first sample."""
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)

    estimator = SparsePCA(n_components=3,
                          method='lars',
                          alpha=alpha,
                          random_state=rng,
                          normalize_components=True)
    train_codes = estimator.fit_transform(Y)
    # Transform only the first ten training samples.
    test_codes = estimator.transform(Y[:10])

    assert_allclose(train_codes[0], test_codes[0])
def test_partial_dependence_pipeline():
    """Check that partial dependence supports pipelines.

    The partial dependence of a scaler + classifier pipeline on raw data must
    equal that of the classifier on scaled data, with grid values related by
    the scaler's ``scale_`` and ``mean_``.
    """
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(
        pipe, iris.data, features=[features])
    pdp_clf, values_clf = partial_dependence(
        clf, scaler.transform(iris.data), features=[features])

    assert_allclose(pdp_pipe, pdp_clf)

    # Map the grid computed in scaled space back to the original units.
    unscaled_grid = (values_clf[0] * scaler.scale_[features]
                     + scaler.mean_[features])
    assert_allclose(values_pipe[0], unscaled_grid)
def test_transform_target_regressor_functions_multioutput():
    """TransformedTargetRegressor with log/exp on a two-column target."""
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T

    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log,
                                      inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)

    # check the transformer output
    y_tran = regr.transformer_.transform(y)
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran))
    assert y.shape == y_pred.shape
    inner_pred = regr.regressor_.predict(X)
    assert_allclose(y_pred, regr.inverse_func(inner_pred))

    # check the regressor output
    reference = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), reference.coef_.ravel())
def test_recursion_decision_function(target_feature):
    """Recursion and brute partial dependence agree on decision_function.

    Make sure the recursion method (which implicitly uses decision_function)
    has the same result as the brute method with
    ``response_method='decision_function'``.
    """
    X, y = make_classification(n_classes=2, n_clusters_per_class=1,
                               random_state=1)
    assert np.mean(y) == .5  # make sure the init estimator predicts 0 anyway

    est = GradientBoostingClassifier(random_state=0, loss='deviance')
    est.fit(X, y)

    common = dict(response_method='decision_function')
    preds_recursion, _ = partial_dependence(
        est, X, [target_feature], method='recursion', **common)
    preds_brute, _ = partial_dependence(
        est, X, [target_feature], method='brute', **common)

    assert_allclose(preds_recursion, preds_brute, atol=1e-7)
def test_permute_labels(metric_name):
    """Clustering metrics must be invariant to a permutation of labels.

    No clustering metric should change its score when the labels 0 and 1 are
    exchanged.

    Parameters
    ----------
    metric_name : str
        Key into ``SUPERVISED_METRICS`` or, failing that,
        ``UNSUPERVISED_METRICS``.
    """
    y_label = np.array([0, 0, 0, 1, 1, 0, 1])
    y_pred = np.array([1, 0, 1, 0, 1, 1, 0])

    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        score_1 = metric(y_pred, y_label)
        assert_allclose(score_1, metric(1 - y_pred, y_label))
        assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
        assert_allclose(score_1, metric(y_pred, 1 - y_label))
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        # Use a seeded local generator: the previous unseeded call to the
        # global ``np.random`` made the data differ on every run, so a
        # failure could not be reproduced.
        rng = np.random.RandomState(0)
        X = rng.randint(10, size=(7, 10))
        score_1 = metric(X, y_pred)
        assert_allclose(score_1, metric(X, 1 - y_pred))
def test_integration_quic_graphical_lasso_cv(self, params_in, expected):
    """
    Just tests inputs/outputs (not validity of result).

    Norms of the fitted attributes (plus ``lam_`` when it is a float) are
    compared with ``expected`` using loose tolerances.
    """
    X = datasets.load_diabetes().data
    model = QuicGraphicalLassoCV(**params_in)
    model.fit(X)

    fitted_arrays = (
        model.covariance_,
        model.precision_,
        model.opt_,
        model.duality_gap_,
    )
    result_vec = [np.linalg.norm(array) for array in fitted_arrays]

    if isinstance(model.lam_, float):
        result_vec.append(model.lam_)
    elif isinstance(model.lam_, np.ndarray):
        assert model.lam_.shape == params_in["lam"].shape

    print(result_vec)
    assert_allclose(expected, result_vec, atol=1e-1, rtol=1e-1)

    assert len(model.grid_scores_) == len(model.cv_lams_)
def test_grlvq_iris():
    """GrlvqModel: estimator checks, iris score, relevances, validation."""
    check_estimator(GrlvqModel)

    model = GrlvqModel(regularization=0.5)
    model.fit(iris.data, iris.target)
    assert_greater(model.score(iris.data, iris.target), 0.89)

    # Two Gaussian clusters with a small variance on the first axis and a
    # large one on the second.
    # NOTE(review): the samples come from the unseeded global NumPy RNG.
    model = GrlvqModel(initial_prototypes=[[0, 0, 0], [4, 4, 1]])
    nb_ppc = 10
    cluster_0 = np.random.multivariate_normal(
        [0, 0], np.array([[0.3, 0], [0, 4]]), size=nb_ppc)
    cluster_1 = np.random.multivariate_normal(
        [4, 4], np.array([[0.3, 0], [0, 4]]), size=nb_ppc)
    x = np.append(cluster_0, cluster_1, axis=0)
    y = np.append(np.zeros(nb_ppc), np.ones(nb_ppc), axis=0)
    model.fit(x, y)
    assert_allclose(np.array([1.0, 0.0]), model.lambda_, atol=0.2)

    # Invalid constructor arguments must be rejected when fitting.
    invalid_cases = (
        ('length of initial relevances is wrong',
         GrlvqModel(initial_relevances=[1, 2])),
        ('regularization must be a positive float',
         GrlvqModel(regularization=-1.0)),
    )
    for message, bad_model in invalid_cases:
        assert_raise_message(ValueError, message,
                             bad_model.fit, iris.data, iris.target)
def test_one_hot_encoder_handle_unknown():
    """OneHotEncoder ``handle_unknown``: 'error', 'ignore', invalid value."""
    X = np.array([[0, 2, 1],
                  [1, 0, 3],
                  [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    encoder = OneHotEncoder(handle_unknown='error')
    assert_warns(FutureWarning, encoder.fit, X)
    assert_raises(ValueError, encoder.transform, X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(X)
    X2_passed = X2.copy()
    encoded = encoder.transform(X2_passed).toarray()
    assert_array_equal(encoded,
                       np.array([[0., 0., 0., 0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore or error.
    encoder = OneHotEncoder(handle_unknown='42')
    assert_raises(ValueError, encoder.fit, X)
def test_warmstart(gaussian_data):
    """DirtyModel: a warm-started refit matches a cold fit at the new alpha."""
    X, y = gaussian_data
    n_samples = y.shape[1]

    # Both regularization levels are fractions of the largest norm of the
    # per-task X^T y products, scaled by the number of samples.
    Xty = np.array([xx.T.dot(yy) for xx, yy in zip(X, y)])
    alpha_max = np.linalg.norm(Xty, axis=0).max()
    alpha1 = 0.6 * alpha_max / n_samples
    alpha2 = 0.05 * alpha_max / n_samples

    # Fit at alpha1, then refit at alpha2 reusing the previous solution.
    warm = DirtyModel(alpha=alpha1, beta=alpha1, warm_start=True, tol=1e-5)
    warm.fit(X, y)
    warm.alpha = alpha2
    warm.fit(X, y)
    assert hasattr(warm, 'is_fitted_')
    assert_allclose(warm.coef_specific_, 0.)
    coef_warm = warm.coef_.copy()

    # Cold fit directly at alpha2.
    cold = DirtyModel(alpha=alpha2, beta=alpha1, tol=1e-5)
    cold.fit(X, y)
    coef_cold = cold.coef_

    assert_allclose(coef_warm, coef_cold, rtol=1e-4)
def test_one_hot_encoder(X):
    """One-hot encoding of the first column, first two columns and all of X."""
    X_array = np.array(X)

    first_column = check_categorical_onehot(X_array[:, [0]])
    assert_allclose(first_column, [[0, 1],
                                   [1, 0]])

    first_two_columns = check_categorical_onehot(X_array[:, [0, 1]])
    assert_allclose(first_two_columns, [[0, 1, 1, 0],
                                        [1, 0, 0, 1]])

    all_columns = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(all_columns.toarray(), [[0, 1, 1, 0, 1],
                                            [1, 0, 0, 1, 1]])
def test_signed_circulant_matrix(gamma, n_components, use_offset):
    """SignedCirculantRandomMatrix approximates the RBF kernel.

    Also checks that dense and sparse inputs give the same features and that
    the transform matches a naive computation from the fitted weights.
    """
    # compute exact kernel
    kernel = rbf_kernel(X, Y, gamma)

    # approximate kernel mapping
    transformer = SignedCirculantRandomMatrix(n_components=n_components,
                                              gamma=gamma,
                                              use_offset=use_offset,
                                              random_state=0)
    X_trans = transformer.fit_transform(X)
    Y_trans = transformer.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) < 0.01
    assert np.max(error) < 0.1  # nothing too far off
    assert np.mean(error) < 0.05  # mean is fairly close

    # for sparse matrix
    X_trans_sp = transformer.transform(csr_matrix(X))
    assert_allclose_dense_sparse(X_trans, X_trans_sp)

    # comparing naive computation
    scale = np.sqrt(2*gamma)
    projections = []
    for random_weights, sign in zip(transformer.random_weights_,
                                    transformer.random_sign_):
        circ = circulant(ifft(random_weights).real)
        circ = np.dot(np.diag(sign), circ)
        projections.append(np.dot(X, circ.T)*scale)
    X_trans_naive = np.hstack(projections)

    if use_offset:
        X_trans_naive = np.cos(X_trans_naive+transformer.random_offset_)
    else:
        X_trans_naive = np.hstack([np.cos(X_trans_naive),
                                   np.sin(X_trans_naive)])
    X_trans_naive *= np.sqrt(2/n_components)

    assert_allclose(X_trans, X_trans_naive)
def test_missing_value_handling(est):
    """Check that the preprocessing estimator lets NaN pass through.

    NaNs in the input must stay NaN (and only them) after ``transform`` and
    ``inverse_transform``, and transforming a column must not depend on
    whether NaNs were present when fitting.
    """
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    nan_rows = rng.randint(X.shape[0], size=n_missing)
    nan_cols = rng.randint(X.shape[1], size=n_missing)
    X[nan_rows, nan_cols] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)

    # sanity check: no train column is entirely NaN and every train/test
    # column has at least one NaN
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    n_columns = X.shape[1]
    for i in range(n_columns):
        # train only on non-NaN
        train_observed = ~np.isnan(X_train[:, i])
        est.fit(X_train[:, [i]][train_observed])

        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [i]])
        assert_array_equal(Xt_col, Xt[:, [i]])

        # check non-NaN is handled as before - the 1st column is all nan
        if np.isnan(X_test[:, i]).all():
            continue
        test_observed = ~np.isnan(X_test[:, i])
        Xt_col_nonan = est.transform(X_test[:, [i]][test_observed])
        assert_array_equal(Xt_col_nonan,
                           Xt_col[~np.isnan(Xt_col.squeeze())])
def test_fit_sample_nn_obj():
    """Borderline-1 SMOTE.fit_sample with NearestNeighbors instances."""
    m_estimator = NearestNeighbors(n_neighbors=11)
    k_estimator = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='borderline1',
        k_neighbors=k_estimator,
        m_neighbors=m_estimator,
    )
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.3765279, -0.2009615],
        [0.55276636, -0.10550373],
        [0.45413452, -0.08883319],
        [1.21118683, -0.22817957],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_mice_transform_recovery(rank):
    """MICEImputer recovers held-out rows of a matrix of the given rank.

    Half of the entries of a product of two random factors is masked; the
    imputer is fitted on the first half of the rows and evaluated on the rest.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 100
    A = rng.rand(n_rows, rank)
    B = rng.rand(rank, n_cols)
    X_filled = np.dot(A, B)

    # half is randomly missing
    nan_mask = rng.rand(n_rows, n_cols) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    half = n_rows // 2
    X_train = X_missing[:half]
    X_test = X_missing[half:]
    X_test_filled = X_filled[half:]

    imputer = MICEImputer(n_imputations=10,
                          n_burn_in=10,
                          verbose=True,
                          random_state=rng)
    imputer = imputer.fit(X_train)
    X_test_est = imputer.transform(X_test)

    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
def test_sample_regular_with_nn():
    """SMOTE with its default kind and a NearestNeighbors ``k_neighbors``."""
    k_estimator = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(random_state=RND_SEED, k_neighbors=k_estimator)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_ridge_gcv_sample_weights(gcv_mode, X_constructor, fit_intercept,
                                  n_features, y_shape, noise):
    """Compare weighted GCV RidgeCV with grouped K-fold on row-repeated data.

    Integer sample weights are drawn, each row is repeated ``weight`` times,
    and a ``RidgeCV`` driven by ``GroupKFold`` (one group per original row) is
    fitted on the repeated data. A second ``RidgeCV`` is fitted on the
    original rows with ``sample_weight``. The selected alpha, per-sample
    errors, coefficients and intercept of the two models must agree.
    """
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)

    n_targets = 1
    if len(y_shape) == 2:
        n_targets = y_shape[-1]

    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)
    n_rows = X.shape[0]

    # Positive integer weights, used as repetition counts.
    raw_weights = 3 * rng.randn(len(X))
    int_weights = (raw_weights - raw_weights.min() + 1).astype(int)
    indices = np.repeat(np.arange(n_rows), int_weights)
    sample_weight = int_weights.astype(float)
    X_tiled = X[indices]
    y_tiled = y[indices]

    # Reference: K-fold where every fold holds all copies of one original row.
    cv = GroupKFold(n_splits=n_rows)
    kfold = RidgeCV(
        alphas=alphas,
        cv=cv.split(X_tiled, y_tiled, groups=indices),
        scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    # ``cv.split`` returns a one-shot iterator, so it is requested again.
    predictions = cross_val_predict(
        ridge_reg, X_tiled, y_tiled,
        cv=cv.split(X_tiled, y_tiled, groups=indices))
    squared_errors = (y_tiled - predictions)**2
    # Sum the errors of all copies belonging to the same original row.
    kfold_errors = np.asarray([
        np.sum(squared_errors[indices == row], axis=0)
        for row in np.arange(n_rows)
    ])

    # Model under test: GCV on the original rows with sample weights.
    gcv_ridge = RidgeCV(alphas=alphas, store_cv_values=True,
                        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_constructor(X), y, sample_weight=sample_weight)

    best_alpha_pos = alphas.index(kfold.alpha_)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, best_alpha_pos]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, best_alpha_pos]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
def test_validate_estimator_init():
    """Check SMOTETomek when its SMOTE and TomekLinks steps are passed in.

    Both sub-estimators are built explicitly and handed to ``SMOTETomek``;
    the resampled module-level ``X``/``Y`` data is compared with stored
    reference values.
    """
    smote = SMOTE(random_state=RND_SEED)
    tomek = TomekLinks(random_state=RND_SEED, ratio='all')
    combined = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)
    # Use ``fit_resample`` like the other sampler tests in this file;
    # ``fit_sample`` is only the legacy spelling of the same call.
    X_resampled, y_resampled = combined.fit_resample(X, Y)

    expected_X = np.array([
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.70319159, -0.02571667],
        [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_offset_transformer(data):
    """Check that OffsetTransformer output matches the input ``phi`` signal.

    A copy of ``data`` gets a linear drift added to its ``phi`` column
    (reaching one degree over the index span). The transformer is fitted on
    and applied to ``data``; the three signals are plotted and the
    transformed ``phi`` must match the original within ``atol=0.01``.

    NOTE(review): the drifted copy ``X_offset`` is only plotted; it is never
    passed to ``fit``/``transform``. Presumably the transformer was meant to
    be applied to ``X_offset`` -- confirm against OffsetTransformer's
    contract before changing it.
    """
    X = data

    # Build a copy of the data with a linear phi drift over the index.
    X_offset = X.copy()
    phi_offset = np.deg2rad(1.0)
    t = X_offset.index
    delta_t = t[-1] - t[0]
    d_phi = phi_offset / delta_t
    X_offset['phi'] = X_offset['phi'] + d_phi * t

    trans = OffsetTransformer()
    trans.fit(X=X)
    X_trans = trans.transform(X=X)

    fig, ax = plt.subplots()
    X.plot(y='phi', ax=ax, label='true')
    X_offset.plot(y='phi', ax=ax, label='model test')
    X_trans.plot(y='phi', ax=ax, style='--', label='offset removed')
    ax.legend()
    # Close the figure instead of calling plt.show(): show() blocks the test
    # run on interactive backends and leaves the figure open.
    plt.close(fig)

    assert_allclose(X_trans['phi'], X['phi'], atol=0.01)
def test_dtype_match(solver):
    """Check that Ridge keeps the input dtype for float32 and float64 data.

    The same random problem is fitted once in float32 and once in float64.
    Coefficients and predictions must carry the dtype of their input, and the
    two coefficient vectors must agree within ``rtol=1e-4``.
    """
    rng = np.random.RandomState(0)
    alpha = 1.0
    n_samples, n_features = 6, 5

    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32, y_32 = X_64.astype(np.float32), y_64.astype(np.float32)

    def _fit_ridge(X_in, y_in):
        # Identical settings for both precisions.
        model = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)
        model.fit(X_in, y_in)
        return model

    # Fit 32 bits first, then 64 bits.
    ridge_32 = _fit_ridge(X_32, y_32)
    coef_32 = ridge_32.coef_
    ridge_64 = _fit_ridge(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debug
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
def test_allknn_fit_resample():
    """Check AllKNN with default parameters against stored reference output.

    The module-level ``X``/``Y`` data is resampled and both the samples and
    the labels are compared with the expected arrays.
    """
    sampler = AllKNN()
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_allclose(y_resampled, expected_y, rtol=R_TOL)
def test_transform_target_regressor_functions():
    """Check TransformedTargetRegressor with ``func``/``inverse_func`` given.

    With ``np.log`` / ``np.exp`` as the pair, the internal transformer must
    apply and undo the log, predictions must equal the inverse function
    applied to the inner regressor's output, and the inner regressor must
    match a plain ``LinearRegression`` fitted on the transformed target.
    """
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=LinearRegression(),
        func=np.log,
        inverse_func=np.exp,
    )
    fitted = regr.fit(X, y)
    y_pred = fitted.predict(X)

    # check the transformer output
    y_column = y.reshape(-1, 1)
    y_tran = regr.transformer_.transform(y_column).squeeze()
    assert_allclose(np.log(y), y_tran)
    y_roundtrip = regr.transformer_.inverse_transform(
        y_tran.reshape(-1, 1)).squeeze()
    assert_allclose(y, y_roundtrip)
    assert y.shape == y_pred.shape
    inner_pred = regr.regressor_.predict(X)
    assert_allclose(y_pred, regr.inverse_func(inner_pred))

    # check the regressor output
    reference = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), reference.coef_.ravel())
def test_iterative_imputer_zero_iters():
    """Check IterativeImputer when no imputation round is performed.

    With ``max_iter=0``, and again after forcing ``n_iter_`` to 0 on a fitted
    imputer, the output must equal that of the initial imputer alone.
    """
    rng = np.random.RandomState(0)
    n_rows, n_cols = 100, 10

    X = sparse_random_matrix(n_rows, n_cols, density=0.10,
                             random_state=rng).toarray()
    # Treat the zero entries as missing.
    X[X == 0] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    iterated = imputer.transform(X)
    initial_only = imputer.initial_imputer_.transform(X)
    assert np.any(iterated != initial_only)

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
def test_one_hot_encoder():
    """Check one-hot encoding of a small mixed string/integer table.

    One and two columns go through ``check_categorical_onehot``; the full
    table goes through ``OneHotEncoder`` directly.
    """
    X = [['abc', 1, 55], ['def', 2, 55]]
    X_array = np.array(X)

    # First column only.
    encoded = check_categorical_onehot(X_array[:, [0]])
    assert_allclose(encoded, [[1, 0], [0, 1]])

    # First two columns.
    encoded = check_categorical_onehot(X_array[:, [0, 1]])
    assert_allclose(encoded, [[1, 0, 1, 0], [0, 1, 0, 1]])

    # All three columns, straight from the nested list.
    encoded = OneHotEncoder().fit_transform(X)
    assert_allclose(encoded.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
def test_gaussian_mixture_fit():
    """Check that GaussianMixture recovers the ground-truth parameters.

    For each covariance type in ``COVARIANCE_TYPE`` a mixture is fitted on
    the matching ``RandomData`` sample; weights, means and precisions are
    compared with the generating values.
    """
    # recover the ground truth
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    n_features = rand_data.n_features
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        gmm = GaussianMixture(n_components=n_components, n_init=20,
                              reg_covar=0, random_state=rng,
                              covariance_type=covar_type)
        gmm.fit(X)

        # needs more data to pass the test with rtol=1e-7
        assert_allclose(np.sort(gmm.weights_), np.sort(rand_data.weights),
                        rtol=0.1, atol=1e-2)

        # Align components by sorting on the first mean coordinate.
        fitted_order = gmm.means_[:, 0].argsort()
        true_order = rand_data.means[:, 0].argsort()
        assert_allclose(gmm.means_[fitted_order], rand_data.means[true_order],
                        rtol=0.1, atol=1e-2)

        # Expand every parametrisation to one full matrix per component.
        if covar_type == 'full':
            prec_pred = gmm.precisions_
            prec_test = rand_data.precisions['full']
        elif covar_type == 'tied':
            prec_pred = np.array([gmm.precisions_] * n_components)
            prec_test = np.array(
                [rand_data.precisions['tied']] * n_components)
        elif covar_type == 'spherical':
            prec_pred = np.array(
                [np.eye(n_features) * scale for scale in gmm.precisions_])
            prec_test = np.array(
                [np.eye(n_features) * scale
                 for scale in rand_data.precisions['spherical']])
        elif covar_type == 'diag':
            prec_pred = np.array(
                [np.diag(diagonal) for diagonal in gmm.precisions_])
            prec_test = np.array(
                [np.diag(diagonal)
                 for diagonal in rand_data.precisions['diag']])

        # Align components by the trace of their precision matrix.
        fitted_order = np.trace(prec_pred, axis1=1, axis2=2).argsort()
        true_order = np.trace(prec_test, axis1=1, axis2=2).argsort()
        for fitted_idx, true_idx in zip(fitted_order, true_order):
            ecov = EmpiricalCovariance()
            ecov.covariance_ = prec_test[true_idx]
            # the accuracy depends on the number of data and randomness, rng
            assert_allclose(ecov.error_norm(prec_pred[fitted_idx]), 0,
                            atol=0.1)
def test_missing_value_handling(est, support_sparse):
    """Check that a preprocessing estimator lets NaN pass through unchanged.

    NaN is injected into a copy of the iris data. The test checks that NaN
    positions survive ``transform`` and ``inverse_transform``, that
    per-column fitting on NaN-free samples gives the same result, and, when
    ``support_sparse`` is true, that sparse inputs match the dense output.
    """
    # check that the preprocessing method let pass nan
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    # Row indices are drawn before column indices.
    nan_rows = rng.randint(X.shape[0], size=n_missing)
    nan_cols = rng.randint(X.shape[1], size=n_missing)
    X[nan_rows, nan_cols] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)

    # sanity check
    train_nan = np.isnan(X_train)
    assert not np.all(train_nan, axis=0).any()
    assert np.any(train_nan, axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the inverse transform keep NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can introduce equal_nan=True in recent version of numpy.
    # For the moment which just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for col in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, col))
        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [col]])
        assert_array_equal(Xt_col, Xt[:, [col]])
        # check non-NaN is handled as before - the 1st column is all nan
        if np.isnan(X_test[:, col]).all():
            continue
        Xt_col_nonan = est.transform(
            _get_valid_samples_by_column(X_test, col))
        assert_array_equal(Xt_col_nonan,
                           Xt_col[~np.isnan(Xt_col.squeeze())])

    if not support_sparse:
        return

    est_dense = clone(est)
    est_sparse = clone(est)

    Xt_dense = est_dense.fit(X_train).transform(X_test)
    Xt_inv_dense = est_dense.inverse_transform(Xt_dense)

    sparse_formats = (sparse.csr_matrix, sparse.csc_matrix,
                      sparse.bsr_matrix, sparse.coo_matrix,
                      sparse.dia_matrix, sparse.dok_matrix,
                      sparse.lil_matrix)
    for sparse_constructor in sparse_formats:
        # check that the dense and sparse inputs lead to the same results
        fitted_sparse = est_sparse.fit(sparse_constructor(X_train))
        Xt_sparse = fitted_sparse.transform(sparse_constructor(X_test))
        assert_allclose(Xt_sparse.A, Xt_dense)
        Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
        assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
def test_invert_order(self):
    """Check ``invert_order`` with its default method and with 'subtraction'.

    Both score fixtures must give the same result under the default method;
    ``self.scores2`` is also checked with ``method='subtraction'``.
    """
    expected_default = np.array([-0.1, -0.3, -0.5, -0.7, -0.2, -0.1])

    inverted = invert_order(self.scores1)
    assert_allclose(inverted, expected_default)

    inverted = invert_order(self.scores2)
    assert_allclose(inverted, expected_default)

    expected_subtraction = np.array([0.6, 0.4, 0.2, 0, 0.5, 0.6])
    inverted = invert_order(self.scores2, method='subtraction')
    assert_allclose(inverted, expected_subtraction)
def test_nan_euclidean_distances_2x2(X, X_diag, missing_value):
    """Check ``nan_euclidean_distances`` on a two-sample input.

    The expected matrix has zeros on the diagonal and ``X_diag`` off the
    diagonal. It must be obtained with ``X`` alone, with ``squared=True``
    (squared values), with ``X`` passed twice, and with ``X`` and a copy.
    """
    expected = np.array([[0., X_diag], [X_diag, 0]])

    def _distances(*arrays, **options):
        # Every call uses the same missing-value marker.
        return nan_euclidean_distances(
            *arrays, missing_values=missing_value, **options)

    single_input = _distances(X)
    assert_allclose(expected, single_input)

    squared = _distances(X, squared=True)
    assert_allclose(expected**2, squared)

    same_object_twice = _distances(X, X)
    assert_allclose(expected, same_object_twice)

    with_copy = _distances(X, X.copy())
    assert_allclose(expected, with_copy)
def test_missing_indicator_new(missing_values, arr_type, dtype,
                               param_features, n_features, features_indices):
    """Check MissingIndicator masks for dense and sparse output.

    The indicator is fitted on one array and applied to another. Mask shape,
    selected features, values, dtype and container type are checked with
    ``sparse=False``; after switching to ``sparse=True`` the masks must be
    boolean CSC matrices equal to the dense ones.
    """
    # Build the inputs in the requested container and dtype.
    X_fit = arr_type(np.array([[missing_values, missing_values, 1],
                               [4, missing_values, 2]])).astype(dtype)
    X_trans = arr_type(np.array([[missing_values, missing_values, 1],
                                 [4, 12, 10]])).astype(dtype)
    fit_expected = np.array([[1, 1, 0], [0, 1, 0]]).astype(dtype)
    trans_expected = np.array([[1, 1, 0], [0, 0, 0]]).astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features, sparse=False)
    fit_mask = indicator.fit_transform(X_fit)
    trans_mask = indicator.transform(X_trans)

    assert fit_mask.shape[1] == n_features
    assert trans_mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(fit_mask, fit_expected[:, features_indices])
    assert_allclose(trans_mask, trans_expected[:, features_indices])

    assert fit_mask.dtype == bool
    assert trans_mask.dtype == bool
    assert isinstance(fit_mask, np.ndarray)
    assert isinstance(trans_mask, np.ndarray)

    # Same estimator, now asked for sparse output.
    indicator.set_params(sparse=True)
    fit_mask_sparse = indicator.fit_transform(X_fit)
    trans_mask_sparse = indicator.transform(X_trans)

    assert fit_mask_sparse.dtype == bool
    assert trans_mask_sparse.dtype == bool
    assert fit_mask_sparse.format == 'csc'
    assert trans_mask_sparse.format == 'csc'
    assert_allclose(fit_mask_sparse.toarray(), fit_mask)
    assert_allclose(trans_mask_sparse.toarray(), trans_mask)
def test_data_generate3(self):
    """Check that ``generate_data`` is reproducible for a fixed seed.

    Two calls with identical arguments and ``random_state=42`` must return
    matching train/test samples and labels.
    """
    X_train, y_train, X_test, y_test = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=2,
        contamination=self.contamination,
        random_state=42)

    X_train_again, y_train_again, X_test_again, y_test_again = generate_data(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=2,
        contamination=self.contamination,
        random_state=42)

    # Samples first, then labels.
    assert_allclose(X_train, X_train_again)
    assert_allclose(X_test, X_test_again)
    assert_allclose(y_train, y_train_again)
    assert_allclose(y_test, y_test_again)
def test_data_generate_cluster3(self):
    """Check that ``generate_data_clusters`` is reproducible for a fixed seed.

    Two calls with identical arguments, including ``self.random_state``, must
    return matching train/test samples and labels.
    """
    X_train, y_train, X_test, y_test = generate_data_clusters(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=3,
        contamination=self.contamination,
        random_state=self.random_state)

    X_train_again, y_train_again, X_test_again, y_test_again = \
        generate_data_clusters(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=3,
            contamination=self.contamination,
            random_state=self.random_state)

    # Samples first, then labels.
    assert_allclose(X_train, X_train_again)
    assert_allclose(X_test, X_test_again)
    assert_allclose(y_train, y_train_again)
    assert_allclose(y_test, y_test_again)