Example #1
def test_fit_resample_nn_obj():
    kind = 'borderline1'
    nn_m = NearestNeighbors(n_neighbors=11)
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615],
                     [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319],
                     [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #2
def test_partial_dependence_helpers(est, method, target_feature):
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.

    X, y = make_regression(random_state=0)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()
    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5],
                     [123]])

    if method == 'brute':
        pdp = _partial_dependence_brute(est, grid, features, X,
                                        response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    mean_predictions = []
    for val in (.5, 123):
        X_ = X.copy()
        X_[:, target_feature] = val
        mean_predictions.append(est.predict(X_).mean())

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))
    assert_allclose(pdp, mean_predictions, atol=1e-3)
Example #3
def test_ridge_regression_dtype_stability(solver):
    random_state = np.random.RandomState(0)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    rtol = 1e-2 if os.name == 'nt' and _IS_32BIT else 1e-5

    results = dict()
    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(X.astype(current_dtype),
                                                  y.astype(current_dtype),
                                                  alpha=alpha,
                                                  solver=solver,
                                                  random_state=random_state,
                                                  sample_weight=None,
                                                  max_iter=500,
                                                  tol=1e-10,
                                                  return_n_iter=False,
                                                  return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], rtol=rtol)
Example #4
    def test_score_to_label(self):
        manual_scores = [0.1, 0.4, 0.2, 0.3, 0.5, 0.9, 0.7, 1, 0.8, 0.6]
        labels = score_to_label(manual_scores, outliers_fraction=0.1)
        assert_allclose(labels, [0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

        labels = score_to_label(manual_scores, outliers_fraction=0.3)
        assert_allclose(labels, [0, 0, 0, 0, 0, 1, 0, 1, 1, 0])
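
The thresholding behind score_to_label can be sketched in a few lines of NumPy: scores above the (1 - outliers_fraction) percentile are flagged as outliers. This is a minimal sketch consistent with the expected labels above, not pyod's exact implementation:

import numpy as np

def score_to_label_sketch(scores, outliers_fraction=0.1):
    # flag the top `outliers_fraction` share of scores as outliers (label 1)
    scores = np.asarray(scores)
    threshold = np.percentile(scores, 100 * (1 - outliers_fraction))
    return (scores > threshold).astype(int)
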
Example #5
def test_transform_target_regressor_multi_to_single():
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def func(y):
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def inverse_func(y):
        return y

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_2d_func = tt.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # force the function to return only a 1D array
    def func(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_1d_func = tt.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
Example #6
    def test_wpearsonr(self):
        # TODO: if the unweighted version changes, the wp[0] format should be changed
        wp = wpearsonr(self.a, self.b)
        assert_allclose(wp[0], 0.6956083, atol=0.01)

        wp = wpearsonr(self.a, self.b, w=self.w)
        assert_allclose(wp, 0.5477226, atol=0.01)
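
For reference, the weighted Pearson correlation being tested follows the standard definition: weighted covariance divided by the product of the weighted standard deviations. In the test, the unweighted call returns a (r, p-value) tuple like scipy.stats.pearsonr, hence wp[0], while the weighted call returns a plain float. A minimal sketch under that assumption:

import numpy as np

def wpearsonr_sketch(a, b, w):
    a, b, w = map(np.asarray, (a, b, w))
    # weighted means
    ma, mb = np.average(a, weights=w), np.average(b, weights=w)
    # weighted covariance over the product of weighted standard deviations
    cov = np.sum(w * (a - ma) * (b - mb))
    return cov / np.sqrt(np.sum(w * (a - ma) ** 2) *
                         np.sum(w * (b - mb) ** 2))
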
Example #7
def test_multilabel_representation_invariance():
    # Generate some data
    n_classes = 4
    n_samples = 50

    _, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=0, n_samples=n_samples,
                                           allow_unlabeled=True)
    _, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=1, n_samples=n_samples,
                                           allow_unlabeled=True)

    # To make sure at least one empty label is present
    y1 = np.vstack([y1, [[0] * n_classes]])
    y2 = np.vstack([y2, [[0] * n_classes]])

    y1_sparse_indicator = sp.coo_matrix(y1)
    y2_sparse_indicator = sp.coo_matrix(y2)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name

        measure = metric(y1, y2)

        # Check representation invariance
        assert_allclose(metric(y1_sparse_indicator, y2_sparse_indicator),
                        measure,
                        err_msg="%s failed representation invariance between "
                                "dense and sparse indicator formats." % name)
Example #8
def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
Example #9
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred have at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_array_less(-1.0 * measure, 0,
                          err_msg="We failed to test correctly the normalize "
                                  "option")
        assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                        measure, err_msg="Failed with %s" % name)
Example #10
def test_sample_with_nn_svm():
    kind = 'svm'
    nn_k = NearestNeighbors(n_neighbors=6)
    svm = SVC(gamma='scale', random_state=RND_SEED)
    smote = SMOTE(
        random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234],
                     [0.47436887, -0.2645749],
                     [1.07844562, -0.19435291],
                     [1.44228238, -1.31256615],
                     [1.25636713, -1.04463226]])
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
                     1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #11
def test_ada_fit_sample_nn_obj():
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234],
                     [0.94899098, -0.30508981],
                     [0.28204936, -0.13953426],
                     [1.58028868, -0.04089947],
                     [0.66117333, -0.28009063]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #12
def test_sensitivity_specificity_extra_labels(average, expected_specificity):
    y_true = [1, 3, 3, 2]
    y_pred = [1, 1, 3, 2]

    actual = specificity_score(
        y_true, y_pred, labels=[0, 1, 2, 3, 4], average=average)
    assert_allclose(expected_specificity, actual, rtol=R_TOL)
Example #13
def test_geometric_mean_support_binary():
    y_true, y_pred, _ = make_prediction(binary=True)

    # compute the geometric mean for the binary problem
    geo_mean = geometric_mean_score(y_true, y_pred)

    assert_allclose(geo_mean, 0.77, rtol=R_TOL)
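
The binary geometric mean score asserted here is, by definition, the square root of sensitivity (recall on the positive class) times specificity (recall on the negative class). A minimal sketch using plain scikit-learn, assuming {0, 1} labels:

import numpy as np
from sklearn.metrics import recall_score

def geometric_mean_sketch(y_true, y_pred):
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    return np.sqrt(sensitivity * specificity)
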
Example #14
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
Example #15
def test_one_hot_encoder_pandas():
    pd = pytest.importorskip('pandas')

    X_df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})

    Xtr = check_categorical_onehot(X_df)
    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])
Example #16
def test_allknn_fit_resample():
    allknn = AllKNN()
    X_resampled, y_resampled = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182],
                     [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004],
                     [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682],
                     [0.73489726, 0.43915195],
                     [0.50307437, 0.498805],
                     [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796],
                     [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
Example #17
def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
                                                y_is_x):
    # check that pairwise_distances gives the same results in sequential and
    # parallel, when the metric has data-derived parameters.
    with config_context(working_memory=1):  # to have more than 1 chunk
        rng = np.random.RandomState(0)
        X = rng.random_sample((1000, 10))

        if y_is_x:
            Y = X
            expected_dist_default_params = squareform(pdist(X, metric=metric))
            if metric == "seuclidean":
                params = {'V': np.var(X, axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(X.T)).T}
        else:
            Y = rng.random_sample((1000, 10))
            expected_dist_default_params = cdist(X, Y, metric=metric)
            if metric == "seuclidean":
                params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
            else:
                params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}

        expected_dist_explicit_params = cdist(X, Y, metric=metric, **params)
        dist = np.vstack(tuple(dist_function(X, Y,
                                             metric=metric, n_jobs=n_jobs)))

        assert_allclose(dist, expected_dist_explicit_params)
        assert_allclose(dist, expected_dist_default_params)
Example #18
def test_ridge_gcv_vs_ridge_loo_cv(
        gcv_mode, X_constructor, X_shape, y_shape,
        fit_intercept, normalize, noise):
    n_samples, n_features = X_shape
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=n_samples, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise, n_informative=5
    )
    y = y.reshape(y_shape)

    alphas = [1e-3, .1, 1., 10., 1e3]
    loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
                        alphas=alphas, scoring='neg_mean_squared_error',
                        normalize=normalize)
    gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
                        alphas=alphas, normalize=normalize)

    loo_ridge.fit(X, y)

    X_gcv = X_constructor(X)
    gcv_ridge.fit(X_gcv, y)

    assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
    assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
Example #19
def test_dtype_match(solver):
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    # Check type consistency 32bits
    ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,)
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,)
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debug
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
Example #20
def test_ridge_regression_dtype_stability(solver, seed):
    random_state = np.random.RandomState(seed)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    results = dict()
    # XXX: Sparse CG seems to be far less numerically stable than the
    # others, maybe we should not enable float32 for this one.
    atol = 1e-3 if solver == "sparse_cg" else 1e-5
    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(X.astype(current_dtype),
                                                  y.astype(current_dtype),
                                                  alpha=alpha,
                                                  solver=solver,
                                                  random_state=random_state,
                                                  sample_weight=None,
                                                  max_iter=500,
                                                  tol=1e-10,
                                                  return_n_iter=False,
                                                  return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], atol=atol)
Example #21
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
Example #22
def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    X_pd = pd.DataFrame(X)
    sampler = Sampler()
    if isinstance(Sampler(), SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]

    elif isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]

    else:
        samplers = [Sampler()]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
        assert_allclose(y_res_pd, y_res)
Example #23
def test_symmetry():
    # Test the symmetry of score and loss functions
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20, ))
    y_pred = random_state.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics
    assert_equal(SYMMETRIC_METRICS.union(
        NOT_SYMMETRIC_METRICS, set(THRESHOLDED_METRICS),
        METRIC_UNDEFINED_BINARY_MULTICLASS),
        set(ALL_METRICS))

    assert_equal(
        SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS),
        set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, y_pred), metric(y_pred, y_true),
                        err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]

        # use context manager to supply custom error message
        with assert_raises(AssertionError) as cm:
            assert_array_equal(metric(y_true, y_pred), metric(y_pred, y_true))
            cm.msg = ("%s seems to be symmetric" % name)
Example #24
    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)
Example #25
def test_nanmedian(axis, expected_median):
    X = np.array([[1, 1, 1, 2, np.nan, np.nan],
                  [np.nan, 6, 6, 6, 7, np.nan]])
    median = nanmedian(X, axis=axis)
    if axis is None:
        assert median == pytest.approx(expected_median)
    else:
        assert_allclose(median, expected_median)
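
nanmedian here is presumably a backport of np.nanmedian for older NumPy versions: the median is computed while ignoring NaN entries, over the flattened array when axis is None and per row/column otherwise. For example:

import numpy as np

X = np.array([[1, 1, 1, 2, np.nan, np.nan],
              [np.nan, 6, 6, 6, 7, np.nan]])
print(np.nanmedian(X))          # 4.0, the median of the eight non-NaN entries
print(np.nanmedian(X, axis=1))  # [1.0, 6.0], per-row medians ignoring NaN
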
Example #26
    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)
Example #27
def test_iba_geo_mean_binary():
    y_true, y_pred, _ = make_prediction(binary=True)

    iba_gmean = make_index_balanced_accuracy(
        alpha=0.5, squared=True)(geometric_mean_score)
    iba = iba_gmean(y_true, y_pred)

    assert_allclose(iba, 0.5948, rtol=R_TOL)
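
For context, imbalanced-learn's index balanced accuracy (IBA) weights a score by the "dominance" (sensitivity minus specificity), and with squared=True the wrapped score is squared first. A rough sketch of the computation tested above, assuming that documented formula:

from sklearn.metrics import recall_score

def iba_gmean_sketch(y_true, y_pred, alpha=0.5):
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    gmean_squared = sensitivity * specificity  # squared geometric mean
    dominance = sensitivity - specificity
    return (1 + alpha * dominance) * gmean_squared
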
Example #28
    def test_data_generate2(self):
        X_train, y_train, X_test, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=3,
                          contamination=self.contamination)
        assert_allclose(X_train.shape, (self.n_train, 3))
        assert_allclose(X_test.shape, (self.n_test, 3))
Example #29
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))
Example #30
def test_scaling_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=rng, normalize_components=True)
    results_train = spca_lars.fit_transform(Y)
    results_test = spca_lars.transform(Y[:10])
    assert_allclose(results_train[0], results_test[0])
Example #31
def test_partial_dependence_pipeline():
    # check that partial dependence supports pipelines
    iris = load_iris()

    scaler = StandardScaler()
    clf = DummyClassifier(random_state=42)
    pipe = make_pipeline(scaler, clf)

    clf.fit(scaler.fit_transform(iris.data), iris.target)
    pipe.fit(iris.data, iris.target)

    features = 0
    pdp_pipe, values_pipe = partial_dependence(
        pipe, iris.data, features=[features]
    )
    pdp_clf, values_clf = partial_dependence(
        clf, scaler.transform(iris.data), features=[features]
    )
    assert_allclose(pdp_pipe, pdp_clf)
    assert_allclose(
        values_pipe[0],
        values_clf[0] * scaler.scale_[features] + scaler.mean_[features]
    )
Example #32
def test_transform_target_regressor_functions_multioutput():
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log,
                                      inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)
    # check the transformer output
    y_tran = regr.transformer_.transform(y)
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran))
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
    # check the regressor output
    lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
Example #33
def test_recursion_decision_function(target_feature):
    # Make sure the recursion method (which implicitly uses decision_function)
    # gives the same result as the brute method with
    # response_method='decision_function'

    X, y = make_classification(n_classes=2,
                               n_clusters_per_class=1,
                               random_state=1)
    assert np.mean(y) == .5  # make sure the init estimator predicts 0 anyway

    est = GradientBoostingClassifier(random_state=0, loss='deviance')
    est.fit(X, y)

    preds_1, _ = partial_dependence(est,
                                    X, [target_feature],
                                    response_method='decision_function',
                                    method='recursion')
    preds_2, _ = partial_dependence(est,
                                    X, [target_feature],
                                    response_method='decision_function',
                                    method='brute')

    assert_allclose(preds_1, preds_2, atol=1e-7)
Example #34
def test_permute_labels(metric_name):
    # Clustering metrics should not change their score under a permutation of
    # the labels, i.e. when 0 and 1 are exchanged.
    y_label = np.array([0, 0, 0, 1, 1, 0, 1])
    y_pred = np.array([1, 0, 1, 0, 1, 1, 0])
    if metric_name in SUPERVISED_METRICS:
        metric = SUPERVISED_METRICS[metric_name]
        score_1 = metric(y_pred, y_label)
        assert_allclose(score_1, metric(1 - y_pred, y_label))
        assert_allclose(score_1, metric(1 - y_pred, 1 - y_label))
        assert_allclose(score_1, metric(y_pred, 1 - y_label))
    else:
        metric = UNSUPERVISED_METRICS[metric_name]
        X = np.random.randint(10, size=(7, 10))
        score_1 = metric(X, y_pred)
        assert_allclose(score_1, metric(X, 1 - y_pred))
Example #35
    def test_integration_quic_graphical_lasso_cv(self, params_in, expected):
        """
        Just tests inputs/outputs (not validity of result).
        """
        X = datasets.load_diabetes().data
        ic = QuicGraphicalLassoCV(**params_in)
        ic.fit(X)

        result_vec = [
            np.linalg.norm(ic.covariance_),
            np.linalg.norm(ic.precision_),
            np.linalg.norm(ic.opt_),
            np.linalg.norm(ic.duality_gap_),
        ]
        if isinstance(ic.lam_, float):
            result_vec.append(ic.lam_)
        elif isinstance(ic.lam_, np.ndarray):
            assert ic.lam_.shape == params_in["lam"].shape

        print(result_vec)
        assert_allclose(expected, result_vec, atol=1e-1, rtol=1e-1)

        assert len(ic.grid_scores_) == len(ic.cv_lams_)
Example #36
def test_grlvq_iris():
    check_estimator(GrlvqModel)
    model = GrlvqModel(regularization=0.5)
    model.fit(iris.data, iris.target)
    assert_greater(model.score(iris.data, iris.target), 0.89)

    model = GrlvqModel(initial_prototypes=[[0, 0, 0], [4, 4, 1]])
    nb_ppc = 10
    x = np.append(
        np.random.multivariate_normal([0, 0], np.array([[0.3, 0], [0, 4]]),
                                      size=nb_ppc),
        np.random.multivariate_normal([4, 4], np.array([[0.3, 0], [0, 4]]),
                                      size=nb_ppc), axis=0)
    y = np.append(np.zeros(nb_ppc), np.ones(nb_ppc), axis=0)
    model.fit(x, y)
    assert_allclose(np.array([1.0, 0.0]), model.lambda_, atol=0.2)

    assert_raise_message(ValueError, 'length of initial relevances is wrong',
                         GrlvqModel(initial_relevances=[1, 2]).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'regularization must be a positive float',
                         GrlvqModel(regularization=-1.0).fit, iris.data,
                         iris.target)
Example #37
def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    oh = OneHotEncoder(handle_unknown='error')
    assert_warns(FutureWarning, oh.fit, X)
    assert_raises(ValueError, oh.transform, X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise an error if handle_unknown is neither 'ignore' nor 'error'.
    oh = OneHotEncoder(handle_unknown='42')
    assert_raises(ValueError, oh.fit, X)
Example #38
def test_warmstart(gaussian_data):

    X, y = gaussian_data
    n_samples = y.shape[1]

    Xty = np.array([xx.T.dot(yy) for xx, yy in zip(X, y)])
    alpha_max = np.linalg.norm(Xty, axis=0).max()
    alpha1 = 0.6 * alpha_max / n_samples
    alpha2 = 0.05 * alpha_max / n_samples

    est = DirtyModel(alpha=alpha1, beta=alpha1, warm_start=True, tol=1e-5)
    est.fit(X, y)
    est.alpha = alpha2
    est.fit(X, y)
    assert hasattr(est, 'is_fitted_')
    assert_allclose(est.coef_specific_, 0.)
    coef1 = est.coef_.copy()

    est = DirtyModel(alpha=alpha2, beta=alpha1, tol=1e-5)
    est.fit(X, y)
    coef2 = est.coef_

    assert_allclose(coef1, coef2, 1e-4)
Example #39
def test_one_hot_encoder(X):
    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[0, 1], [1, 0]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[0, 1, 1, 0], [1, 0, 0, 1]])

    Xtr = OneHotEncoder(categories='auto').fit_transform(X)
    assert_allclose(Xtr.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
Example #40
def test_signed_circulant_matrix(gamma, n_components, use_offset):
    # compute exact kernel
    kernel = rbf_kernel(X, Y, gamma)

    # approximate kernel mapping
    transformer = SignedCirculantRandomMatrix(n_components=n_components, 
                                              gamma=gamma, 
                                              use_offset=use_offset,
                                              random_state=0)
    X_trans = transformer.fit_transform(X)
    Y_trans = transformer.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) < 0.01
    assert np.max(error) < 0.1  # nothing too far off
    assert np.mean(error) < 0.05  # mean is fairly close
    # for sparse matrix
    X_trans_sp = transformer.transform(csr_matrix(X))
    assert_allclose_dense_sparse(X_trans, X_trans_sp)

    # comparing naive computation
    result = []
    for random_weights, sign in zip(transformer.random_weights_,
                                    transformer.random_sign_):
        circ = circulant(ifft(random_weights).real)
        circ = np.dot(np.diag(sign), circ)
        result += [np.dot(X, circ.T)*np.sqrt(2*gamma)]
    X_trans_naive = np.hstack(result)
    
    if use_offset:
        X_trans_naive = np.cos(X_trans_naive+transformer.random_offset_)
    else:
        X_trans_naive = np.hstack([np.cos(X_trans_naive),
                                   np.sin(X_trans_naive)])
    X_trans_naive *= np.sqrt(2/n_components)
    assert_allclose(X_trans, X_trans_naive)
Example #41
def test_missing_value_handling(est):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can use equal_nan=True in recent versions of numpy.
    # For the moment, we just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(X_train[:, [i]][~np.isnan(X_train[:, i])])
        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [i]])
        assert_array_equal(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(X_test[:,
                                                [i]][~np.isnan(X_test[:, i])])
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])
Example #42
def test_fit_sample_nn_obj():
    kind = 'borderline1'
    nn_m = NearestNeighbors(n_neighbors=11)
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k,
                  m_neighbors=nn_m)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.3765279, -0.2009615], [0.55276636, -0.10550373],
                     [0.45413452, -0.08883319], [1.21118683, -0.22817957]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #43
def test_mice_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    # half is randomly missing
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = MICEImputer(n_imputations=10,
                          n_burn_in=10,
                          verbose=True,
                          random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-5, atol=0.1)
Example #44
def test_sample_regular_with_nn():
    nn_k = NearestNeighbors(n_neighbors=6)
    smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439],
                     [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668],
                     [0.66052536, -0.28246517]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
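
The synthetic rows at the end of X_gt come from SMOTE's interpolation rule: each new sample lies on the segment between a minority sample and one of its k nearest minority neighbours. A minimal sketch of that rule (not imbalanced-learn's actual implementation):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_one_sample(X_minority, i, k=5, random_state=0):
    rng = np.random.RandomState(random_state)
    # k nearest minority neighbours of sample i (excluding the sample itself)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_minority)
    neighbours = nn.kneighbors(X_minority[[i]], return_distance=False)[0][1:]
    j = rng.choice(neighbours)
    step = rng.uniform()
    # interpolate between the sample and the chosen neighbour
    return X_minority[i] + step * (X_minority[j] - X_minority[i])
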
Example #45
def test_ridge_gcv_sample_weights(gcv_mode, X_constructor, fit_intercept,
                                  n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(n_samples=11,
                                          n_features=n_features,
                                          n_targets=n_targets,
                                          random_state=0,
                                          shuffle=False,
                                          noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(alphas=alphas,
                    cv=splits,
                    scoring='neg_mean_squared_error',
                    fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0)
        for i in np.arange(X.shape[0])
    ]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(alphas=alphas,
                        store_cv_values=True,
                        gcv_mode=gcv_mode,
                        fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
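
This test leans on the fact that, for least-squares models such as Ridge, an integer sample_weight w is equivalent to repeating the corresponding row w times, since both scale that row's squared residual by w while leaving the penalty untouched. A tiny illustration of the equivalence, on hypothetical toy data:

import numpy as np
from sklearn.linear_model import Ridge

X = np.array([[0.], [1.], [2.]])
y = np.array([0., 1., 3.])

weighted = Ridge(alpha=1.0).fit(X, y, sample_weight=[2, 1, 1])
repeated = Ridge(alpha=1.0).fit(X[[0, 0, 1, 2]], y[[0, 0, 1, 2]])
print(np.allclose(weighted.coef_, repeated.coef_))  # True
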
Example #46
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    tomek = TomekLinks(random_state=RND_SEED, ratio='all')
    smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    X_gt = np.array([[0.68481731, 0.51935141],
                     [1.34192108, -0.13367336],
                     [0.62366841, -0.21312976],
                     [1.61091956, -0.40283504],
                     [-0.37162401, -2.19400981],
                     [0.74680821, 1.63827342],
                     [0.61472253, -0.82309052],
                     [0.19893132, -0.47761769],
                     [1.40301027, -0.83648734],
                     [-1.20515198, -1.02689695],
                     [-0.23374509, 0.18370049],
                     [-0.00288378, 0.84259929],
                     [1.79580611, -0.02219234],
                     [0.38307743, -0.05670439],
                     [0.70319159, -0.02571667],
                     [0.75052536, -0.19246518]])
    y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
Example #47
def test_offset_transformer(data):
    X = data

    X_offset = X.copy()
    phi_offset = np.deg2rad(1.0)
    t = X_offset.index
    delta_t = t[-1] - t[0]
    d_phi = phi_offset / delta_t
    X_offset['phi'] = X_offset['phi'] + d_phi * t

    trans = OffsetTransformer()

    trans.fit(X=X)

    X_trans = trans.transform(X=X)

    fig, ax = plt.subplots()
    X.plot(y='phi', ax=ax, label='true')
    X_offset.plot(y='phi', ax=ax, label='model test')
    X_trans.plot(y='phi', ax=ax, style='--', label='offset removed')
    ax.legend()
    plt.show()

    assert_allclose(X_trans['phi'], X['phi'], atol=0.01)
Example #48
def test_dtype_match(solver):
    rng = np.random.RandomState(0)
    alpha = 1.0

    n_samples, n_features = 6, 5
    X_64 = rng.randn(n_samples, n_features)
    y_64 = rng.randn(n_samples)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)

    # Check type consistency 32bits
    ridge_32 = Ridge(
        alpha=alpha,
        solver=solver,
        max_iter=500,
        tol=1e-10,
    )
    ridge_32.fit(X_32, y_32)
    coef_32 = ridge_32.coef_

    # Check type consistency 64 bits
    ridge_64 = Ridge(
        alpha=alpha,
        solver=solver,
        max_iter=500,
        tol=1e-10,
    )
    ridge_64.fit(X_64, y_64)
    coef_64 = ridge_64.coef_

    # Do the actual checks at once for easier debug
    assert coef_32.dtype == X_32.dtype
    assert coef_64.dtype == X_64.dtype
    assert ridge_32.predict(X_32).dtype == X_32.dtype
    assert ridge_64.predict(X_64).dtype == X_64.dtype
    assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
Example #49
def test_allknn_fit_resample():
    allknn = AllKNN()
    X_resampled, y_resampled = allknn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [-1.10146139, 0.91782682], [0.73489726, 0.43915195],
                     [0.50307437, 0.498805], [0.84929742, 0.41042894],
                     [0.62649535, 0.46600596], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [0.04296502, -0.37981873],
                     [0.28294738, -1.00125525], [0.34218094, -0.58781961],
                     [0.2096964, -0.61814058], [1.59068979, -0.96622933],
                     [0.73418199, -0.02222847], [0.79270821, -0.41386668],
                     [1.16606871, -0.25641059], [1.0304995, -0.16955962],
                     [0.48921682, -1.38504507], [-0.03918551, -0.68540745],
                     [0.24991051, -1.00864997], [0.80541964, -0.34465185],
                     [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_allclose(y_resampled, y_gt, rtol=R_TOL)
Example #50
def test_transform_target_regressor_functions():
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log,
                                      inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)
    # check the transformer output
    y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    assert_allclose(np.log(y), y_tran)
    assert_allclose(
        y,
        regr.transformer_.inverse_transform(y_tran.reshape(-1, 1)).squeeze())
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
    # check the regressor output
    lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
Example #51
def test_partial_dependence_helpers(est, method, target_feature):
    # Check that what is returned by _partial_dependence_brute or
    # _partial_dependence_recursion is equivalent to manually setting a target
    # feature to a given value, and computing the average prediction over all
    # samples.
    # This also checks that the brute and recursion methods give the same
    # output.

    X, y = make_regression(random_state=0)
    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 so that this 'bug' doesn't have any effect.
    y = y - y.mean()
    est.fit(X, y)

    # target feature will be set to .5 and then to 123
    features = np.array([target_feature], dtype=np.int32)
    grid = np.array([[.5], [123]])

    if method == 'brute':
        pdp = _partial_dependence_brute(est,
                                        grid,
                                        features,
                                        X,
                                        response_method='auto')
    else:
        pdp = _partial_dependence_recursion(est, grid, features)

    mean_predictions = []
    for val in (.5, 123):
        X_ = X.copy()
        X_[:, target_feature] = val
        mean_predictions.append(est.predict(X_).mean())

    pdp = pdp[0]  # (shape is (1, 2) so make it (2,))
    assert_allclose(pdp, mean_predictions, atol=1e-3)
Example #52
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(
        imputer.transform(X) == imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
Example #53
def test_one_hot_encoder():
    X = [['abc', 1, 55], ['def', 2, 55]]

    Xtr = check_categorical_onehot(np.array(X)[:, [0]])
    assert_allclose(Xtr, [[1, 0], [0, 1]])

    Xtr = check_categorical_onehot(np.array(X)[:, [0, 1]])
    assert_allclose(Xtr, [[1, 0, 1, 0], [0, 1, 0, 1]])

    Xtr = OneHotEncoder().fit_transform(X)
    assert_allclose(Xtr.toarray(), [[1, 0, 1, 0, 1], [0, 1, 0, 1, 1]])
Example #54
def test_gaussian_mixture_fit():
    # recover the ground truth
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    n_features = rand_data.n_features
    n_components = rand_data.n_components

    for covar_type in COVARIANCE_TYPE:
        X = rand_data.X[covar_type]
        g = GaussianMixture(n_components=n_components,
                            n_init=20,
                            reg_covar=0,
                            random_state=rng,
                            covariance_type=covar_type)
        g.fit(X)

        # needs more data to pass the test with rtol=1e-7
        assert_allclose(np.sort(g.weights_),
                        np.sort(rand_data.weights),
                        rtol=0.1,
                        atol=1e-2)

        arg_idx1 = g.means_[:, 0].argsort()
        arg_idx2 = rand_data.means[:, 0].argsort()
        assert_allclose(g.means_[arg_idx1],
                        rand_data.means[arg_idx2],
                        rtol=0.1,
                        atol=1e-2)

        if covar_type == 'full':
            prec_pred = g.precisions_
            prec_test = rand_data.precisions['full']
        elif covar_type == 'tied':
            prec_pred = np.array([g.precisions_] * n_components)
            prec_test = np.array([rand_data.precisions['tied']] * n_components)
        elif covar_type == 'spherical':
            prec_pred = np.array(
                [np.eye(n_features) * c for c in g.precisions_])
            prec_test = np.array([
                np.eye(n_features) * c
                for c in rand_data.precisions['spherical']
            ])
        elif covar_type == 'diag':
            prec_pred = np.array([np.diag(d) for d in g.precisions_])
            prec_test = np.array(
                [np.diag(d) for d in rand_data.precisions['diag']])

        arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort()
        arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort()
        for k, h in zip(arg_idx1, arg_idx2):
            ecov = EmpiricalCovariance()
            ecov.covariance_ = prec_test[h]
            # the accuracy depends on the amount of data and the randomness of rng
            assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.1)
Example #55
def test_missing_value_handling(est, support_sparse):
    # check that the preprocessing method lets NaN pass through
    rng = np.random.RandomState(42)
    X = iris.data.copy()
    n_missing = 50
    X[rng.randint(X.shape[0], size=n_missing),
      rng.randint(X.shape[1], size=n_missing)] = np.nan
    X_train, X_test = train_test_split(X, random_state=1)
    # sanity check
    assert not np.all(np.isnan(X_train), axis=0).any()
    assert np.any(np.isnan(X_train), axis=0).all()
    assert np.any(np.isnan(X_test), axis=0).all()
    X_test[:, 0] = np.nan  # make sure this boundary case is tested

    Xt = est.fit(X_train).transform(X_test)
    # missing values should still be missing, and only them
    assert_array_equal(np.isnan(Xt), np.isnan(X_test))

    # check that the inverse transform keeps NaN
    Xt_inv = est.inverse_transform(Xt)
    assert_array_equal(np.isnan(Xt_inv), np.isnan(X_test))
    # FIXME: we can use equal_nan=True in recent versions of numpy.
    # For the moment, we just check that non-NaN values are almost equal.
    assert_allclose(Xt_inv[~np.isnan(Xt_inv)], X_test[~np.isnan(X_test)])

    for i in range(X.shape[1]):
        # train only on non-NaN
        est.fit(_get_valid_samples_by_column(X_train, i))
        # check transforming with NaN works even when training without NaN
        Xt_col = est.transform(X_test[:, [i]])
        assert_array_equal(Xt_col, Xt[:, [i]])
        # check non-NaN is handled as before - the 1st column is all nan
        if not np.isnan(X_test[:, i]).all():
            Xt_col_nonan = est.transform(
                _get_valid_samples_by_column(X_test, i))
            assert_array_equal(Xt_col_nonan,
                               Xt_col[~np.isnan(Xt_col.squeeze())])

    if support_sparse:
        est_dense = clone(est)
        est_sparse = clone(est)

        Xt_dense = est_dense.fit(X_train).transform(X_test)
        Xt_inv_dense = est_dense.inverse_transform(Xt_dense)
        for sparse_constructor in (sparse.csr_matrix, sparse.csc_matrix,
                                   sparse.bsr_matrix, sparse.coo_matrix,
                                   sparse.dia_matrix, sparse.dok_matrix,
                                   sparse.lil_matrix):
            # check that the dense and sparse inputs lead to the same results
            Xt_sparse = (est_sparse.fit(sparse_constructor(X_train)).transform(
                sparse_constructor(X_test)))
            assert_allclose(Xt_sparse.A, Xt_dense)
            Xt_inv_sparse = est_sparse.inverse_transform(Xt_sparse)
            assert_allclose(Xt_inv_sparse.A, Xt_inv_dense)
Example #56
    def test_invert_order(self):
        target = np.array([-0.1, -0.3, -0.5, -0.7, -0.2, -0.1]).ravel()
        scores1 = invert_order(self.scores1)
        assert_allclose(scores1, target)

        scores2 = invert_order(self.scores2)
        assert_allclose(scores2, target)

        target = np.array([0.6, 0.4, 0.2, 0, 0.5, 0.6]).ravel()
        scores2 = invert_order(self.scores2, method='subtraction')
        assert_allclose(scores2, target)
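
Consistent with the expected values above, invert_order flips the ordering of the scores: by default it negates them, and with method='subtraction' it subtracts them from the maximum score. A minimal sketch:

import numpy as np

def invert_order_sketch(scores, method='multiplication'):
    scores = np.asarray(scores).ravel()
    if method == 'multiplication':
        return -scores                # e.g. 0.7 -> -0.7
    return scores.max() - scores      # 'subtraction': e.g. 0.7 -> 0.0
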
Example #57
def test_nan_euclidean_distances_2x2(X, X_diag, missing_value):

    exp_dist = np.array([[0., X_diag], [X_diag, 0]])

    dist = nan_euclidean_distances(X, missing_values=missing_value)
    assert_allclose(exp_dist, dist)

    dist_sq = nan_euclidean_distances(X,
                                      squared=True,
                                      missing_values=missing_value)
    assert_allclose(exp_dist**2, dist_sq)

    dist_two = nan_euclidean_distances(X, X, missing_values=missing_value)
    assert_allclose(exp_dist, dist_two)

    dist_two_copy = nan_euclidean_distances(X,
                                            X.copy(),
                                            missing_values=missing_value)
    assert_allclose(exp_dist, dist_two_copy)
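
As a reminder of the convention being tested: scikit-learn's nan_euclidean_distances computes the squared differences over coordinates that are present in both vectors, then rescales by the fraction of usable coordinates. A minimal sketch for a single pair of vectors:

import numpy as np

def nan_euclidean_sketch(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    present = ~(np.isnan(x) | np.isnan(y))     # mutually non-missing entries
    sq_dist = np.sum((x[present] - y[present]) ** 2)
    # rescale as if the missing coordinates contributed proportionally
    return np.sqrt(len(x) / present.sum() * sq_dist)
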
Example #58
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]])
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]])

    # convert the input to the right array format and right dtype
    X_fit = arr_type(X_fit).astype(dtype)
    X_trans = arr_type(X_trans).astype(dtype)
    X_fit_expected = X_fit_expected.astype(dtype)
    X_trans_expected = X_trans_expected.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features,
                                 sparse=False)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)

    assert X_fit_mask.shape[1] == n_features
    assert X_trans_mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(X_fit_mask, X_fit_expected[:, features_indices])
    assert_allclose(X_trans_mask, X_trans_expected[:, features_indices])

    assert X_fit_mask.dtype == bool
    assert X_trans_mask.dtype == bool
    assert isinstance(X_fit_mask, np.ndarray)
    assert isinstance(X_trans_mask, np.ndarray)

    indicator.set_params(sparse=True)
    X_fit_mask_sparse = indicator.fit_transform(X_fit)
    X_trans_mask_sparse = indicator.transform(X_trans)

    assert X_fit_mask_sparse.dtype == bool
    assert X_trans_mask_sparse.dtype == bool
    assert X_fit_mask_sparse.format == 'csc'
    assert X_trans_mask_sparse.format == 'csc'
    assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask)
    assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask)
Example #59
    def test_data_generate3(self):
        X_train, y_train, X_test, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        X_train2, y_train2, X_test2, y_test2 = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)
Example #60
    def test_data_generate_cluster3(self):
        X_train, y_train, X_test, y_test = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        X_train2, y_train2, X_test2, y_test2 = \
            generate_data_clusters(n_train=self.n_train,
                                   n_test=self.n_test,
                                   n_features=3,
                                   contamination=self.contamination,
                                   random_state=self.random_state)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)