def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_ovr_decision_function():
    # test properties for ovr decision function
    predictions = np.array([[0, 1, 1],
                            [0, 1, 0],
                            [0, 1, 1],
                            [0, 1, 1]])

    confidences = np.array([[-1e16, 0, -1e16],
                            [1., 2., -3.],
                            [-5., 2., 5.],
                            [-0.5, 0.2, 0.5]])

    n_classes = 3

    dec_values = _ovr_decision_function(predictions, confidences, n_classes)

    # check that the decision values are within 0.5 range of the votes
    votes = np.array([[1, 0, 2],
                      [1, 1, 1],
                      [1, 0, 2],
                      [1, 0, 2]])

    assert_allclose(votes, dec_values, atol=0.5)

    # check that the predictions are what we expect: the class with the
    # highest vote, or the highest confidence if there is a tie.
    # for the second sample we have a tie (should be won by class 1)
    expected_prediction = np.array([2, 1, 2, 2])
    assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)

    # third and fourth sample have the same vote but the third sample
    # has higher confidence; this should be reflected in the decision values
    assert (dec_values[2, 2] > dec_values[3, 2])

    # assert subset invariance.
    dec_values_one = [_ovr_decision_function(np.array([predictions[i]]),
                                             np.array([confidences[i]]),
                                             n_classes)[0]
                      for i in range(4)]

    assert_allclose(dec_values, dec_values_one, atol=1e-6)
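# A minimal sketch (illustrative only, not the library implementation) of the
# vote/confidence combination exercised above, assuming the pairwise column
# order (0 vs 1, 0 vs 2, 1 vs 2): summed confidences are squashed into
# (-1/3, 1/3) so they can break ties but never overturn a one-vote difference.
def _sketch_ovo_votes_plus_confidences(predictions, confidences, n_classes):
    n_samples = predictions.shape[0]
    votes = np.zeros((n_samples, n_classes))
    conf_sum = np.zeros((n_samples, n_classes))
    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            # column k is the pairwise classifier "class i vs class j"
            conf_sum[:, i] -= confidences[:, k]
            conf_sum[:, j] += confidences[:, k]
            votes[predictions[:, k] == 0, i] += 1
            votes[predictions[:, k] == 1, j] += 1
            k += 1
    return votes + conf_sum / (3 * (np.abs(conf_sum) + 1))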
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    with pytest.raises(ValueError):
        AgglomerativeClustering(linkage='foo').fit(X)

    with pytest.raises(ValueError):
        linkage_tree(X, linkage='foo')

    with pytest.raises(ValueError):
        linkage_tree(X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)

    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    trans = make_union(
        SimpleImputer(missing_values=missing_values,
                      strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values)
    )
    X_trans = trans.fit_transform(X)
    assert_array_equal(X_trans, X_trans_exp)
def test_fit_best_piecewise():
    model = SpectralBiclustering(random_state=0)
    vectors = np.array([[0, 0, 0, 1, 1, 1],
                        [2, 2, 2, 3, 3, 3],
                        [0, 1, 2, 3, 4, 5]])
    best = model._fit_best_piecewise(vectors, n_best=2, n_clusters=2)
    assert_array_equal(best, vectors[:2])
def test_adam_optimizer():
    params = [np.zeros(shape) for shape in shapes]
    lr = 0.001
    epsilon = 1e-8

    for beta_1 in np.arange(0.9, 1.0, 0.05):
        for beta_2 in np.arange(0.995, 1.0, 0.001):
            optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon)
            ms = [np.random.random(shape) for shape in shapes]
            vs = [np.random.random(shape) for shape in shapes]
            t = 10
            optimizer.ms = ms
            optimizer.vs = vs
            optimizer.t = t - 1

            grads = [np.random.random(shape) for shape in shapes]

            ms = [beta_1 * m + (1 - beta_1) * grad
                  for m, grad in zip(ms, grads)]
            vs = [beta_2 * v + (1 - beta_2) * (grad ** 2)
                  for v, grad in zip(vs, grads)]
            learning_rate = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
            updates = [-learning_rate * m / (np.sqrt(v) + epsilon)
                       for m, v in zip(ms, vs)]
            expected = [param + update
                        for param, update in zip(params, updates)]

            optimizer.update_params(grads)

            for exp, param in zip(expected, optimizer.params):
                assert_array_equal(exp, param)
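# A minimal single-step sketch of the Adam update re-derived in the test
# above (Kingma & Ba, 2014); the parameter, gradient, and hyperparameter
# values below are illustrative, not part of the suite.
def _sketch_adam_single_step():
    param = np.zeros(3)
    grad = np.array([0.1, -0.2, 0.3])
    m = np.zeros(3)
    v = np.zeros(3)
    lr, beta_1, beta_2, eps, t = 0.001, 0.9, 0.999, 1e-8, 1

    m = beta_1 * m + (1 - beta_1) * grad        # first-moment estimate
    v = beta_2 * v + (1 - beta_2) * grad ** 2   # second-moment estimate
    # bias-corrected step size
    step = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    return param - step * m / (np.sqrt(v) + eps)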
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])

    assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                        ('gnb', clf3)],
                            voting='hard',
                            weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                        ('gnb', clf3)],
                            voting='soft',
                            weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
def test_check_array_pandas_dtype_casting():
    # test that data-frames with homogeneous dtype are not upcast
    pd = pytest.importorskip('pandas')
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    X_df = pd.DataFrame(X)
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df.iloc[:, 0] = X_df.iloc[:, 0].astype(np.float16)
    assert_array_equal(X_df.dtypes, (np.float16, np.float32, np.float32))
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df.iloc[:, 1] = X_df.iloc[:, 1].astype(np.int16)
    # float16, int16, float32 casts to float32
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df.iloc[:, 2] = X_df.iloc[:, 2].astype(np.float16)
    # float16, int16, float16 casts to float32
    assert check_array(X_df).dtype == np.float32
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float32

    X_df = X_df.astype(np.int16)
    assert check_array(X_df).dtype == np.int16
    # we're not using upcasting rules for determining
    # the target type yet, so we cast to the default of float64
    assert check_array(X_df, dtype=FLOAT_DTYPES).dtype == np.float64

    # check that we handle pandas dtypes in a semi-reasonable way
    # this is actually tricky because we can't really know that this
    # should be integer ahead of converting it.
    cat_df = pd.DataFrame([pd.Categorical([1, 2, 3])])
    assert (check_array(cat_df).dtype == np.int64)
    assert (check_array(cat_df, dtype=FLOAT_DTYPES).dtype == np.float64)
def test_check_symmetric():
    arr_sym = np.array([[0, 1], [1, 2]])
    arr_bad = np.ones(2)
    arr_asym = np.array([[0, 2], [0, 2]])

    test_arrays = {'dense': arr_asym,
                   'dok': sp.dok_matrix(arr_asym),
                   'csr': sp.csr_matrix(arr_asym),
                   'csc': sp.csc_matrix(arr_asym),
                   'coo': sp.coo_matrix(arr_asym),
                   'lil': sp.lil_matrix(arr_asym),
                   'bsr': sp.bsr_matrix(arr_asym)}

    # check error for bad inputs
    assert_raises(ValueError, check_symmetric, arr_bad)

    # check that asymmetric arrays are properly symmetrized
    for arr_format, arr in test_arrays.items():
        # Check for warnings and errors
        assert_warns(UserWarning, check_symmetric, arr)
        assert_raises(ValueError, check_symmetric, arr, raise_exception=True)

        output = check_symmetric(arr, raise_warning=False)
        if sp.issparse(output):
            assert output.format == arr_format
            assert_array_equal(output.toarray(), arr_sym)
        else:
            assert_array_equal(output, arr_sym)
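# The symmetrized values the test expects are consistent with simple
# averaging; a minimal dense sketch, assuming (A + A.T) / 2 is the rule
# being applied (illustrative only):
def _sketch_symmetrize():
    A = np.array([[0, 2], [0, 2]])
    A_sym = (A + A.T) / 2
    assert_array_equal(A_sym, np.array([[0, 1], [1, 2]]))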
def test_delegate_to_func():
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))
    assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X, 'transform should have returned X unchanged',
    )

    # The function should only have received X.
    assert args_store == [X], ('Incorrect positional arguments passed to '
                               'func: {args}'.format(args=args_store))

    assert not kwargs_store, ('Unexpected keyword arguments passed to '
                              'func: {args}'.format(args=kwargs_store))

    # reset the argument stores.
    args_store[:] = []
    kwargs_store.clear()
    transformed = FunctionTransformer(
        _make_func(args_store, kwargs_store),
    ).transform(X)

    assert_array_equal(transformed, X,
                       err_msg='transform should have returned X unchanged')

    # The function should have received X
    assert args_store == [X], ('Incorrect positional arguments passed '
                               'to func: {args}'.format(args=args_store))

    assert not kwargs_store, ('Unexpected keyword arguments passed to '
                              'func: {args}'.format(args=kwargs_store))
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        assert np.all(np.sum(Y, axis=0) > min_length)

        # Also test return_distributions and return_indicator with True
        X2, Y2, p_c, p_w_c = make_multilabel_classification(
            n_samples=25, n_features=20, n_classes=3, random_state=0,
            allow_unlabeled=allow_unlabeled, return_distributions=True)

        assert_array_almost_equal(X, X2)
        assert_array_equal(Y, Y2)
        assert p_c.shape == (3,)
        assert_almost_equal(p_c.sum(), 1)
        assert p_w_c.shape == (20, 3)
        assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
def single_fdr(alpha, n_informative, random_state):
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)

    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        univariate_filter = SelectFdr(f_regression, alpha=alpha)
        X_r = univariate_filter.fit(X, y).transform(X)
        X_r2 = GenericUnivariateSelect(
            f_regression, mode='fdr', param=alpha).fit(X, y).transform(X)

    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    num_false_positives = np.sum(support[n_informative:] == 1)
    num_true_positives = np.sum(support[:n_informative] == 1)

    if num_false_positives == 0:
        return 0.
    false_discovery_rate = (num_false_positives /
                            (num_true_positives + num_false_positives))
    return false_discovery_rate
def test_boundary_case_chi2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_almost_equal(support, gtruth)
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({"max_samples": [0.5, 1.0],
                          "bootstrap": [True, False]})

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(
                n_estimators=10, random_state=1, **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert len(brc1.subcluster_centers_) > 10
    assert len(np.unique(brc1.labels_)) == 10

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    with pytest.raises(ValueError):
        brc3.fit(X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X)
def test_gaussian_mixture_fit_predict_n_init():
    # Check that fit_predict is equivalent to fit.predict, when n_init > 1
    X = np.random.RandomState(0).randn(1000, 5)
    gm = GaussianMixture(n_components=5, n_init=5, random_state=0)
    y_pred1 = gm.fit_predict(X)
    y_pred2 = gm.predict(X)
    assert_array_equal(y_pred1, y_pred2)
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert proba.shape[1] == len(classes)
        assert clf.decision_function(iris.data).shape[1] == len(classes)

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % \
            (alg, score)

        # Check we used multiple estimators
        assert len(clf.estimators_) > 1
        # Check for distinct random states (see issue #7408)
        assert (len(set(est.random_state for est in clf.estimators_)) ==
                len(clf.estimators_))

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0,
                      np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
def test_fast_predict():
    # test that the faster prediction change doesn't
    # affect out-of-sample predictions:
    # https://github.com/scikit-learn/scikit-learn/pull/6206
    rng = np.random.RandomState(123)
    n_samples = 10 ** 3
    # X values over the -10, 10 range
    X_train = 20.0 * rng.rand(n_samples) - 10
    y_train = np.less(rng.rand(n_samples),
                      expit(X_train)).astype('int64').astype('float64')

    weights = rng.rand(n_samples)
    # we also want to test that everything still works when some weights are 0
    weights[rng.rand(n_samples) < 0.1] = 0

    slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
    fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")

    # Build interpolation function with ALL input data, not just the
    # non-redundant subset. The following 2 lines are taken from the
    # .fit() method, without removing unnecessary points
    X_train_fit, y_train_fit = slow_model._build_y(X_train, y_train,
                                                   sample_weight=weights,
                                                   trim_duplicates=False)
    slow_model._build_f(X_train_fit, y_train_fit)

    # fit with just the necessary data
    fast_model.fit(X_train, y_train, sample_weight=weights)

    X_test = 20.0 * rng.rand(n_samples) - 10
    y_pred_slow = slow_model.predict(X_test)
    y_pred_fast = fast_model.predict(X_test)

    assert_array_equal(y_pred_slow, y_pred_fast)
def test_dbscan_sparse():
    core_sparse, labels_sparse = dbscan(sparse.lil_matrix(X),
                                        eps=.8,
                                        min_samples=10)
    core_dense, labels_dense = dbscan(X, eps=.8, min_samples=10)
    assert_array_equal(core_dense, core_sparse)
    assert_array_equal(labels_dense, labels_sparse)
def test_predict_consistent_structured():
    # Check binary predict decision has also predicted probability above 0.5.
    X = ['A', 'AB', 'B']
    y = np.array([True, False, True])
    kernel = MiniSeqKernel(baseline_similarity_bounds='fixed')
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X),
                       gpc.predict_proba(X)[:, 1] >= 0.5)
def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes. Check issue #9524 for full
    # discussion.
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(SparseRandomProjection(n_components=2),
                                  LogisticRegression())
    clf = BaggingClassifier(base_estimator=base_pipeline,
                            max_samples=0.5,
                            random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)
def test_feature_union_weights():
    # test feature union with transformer weights
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert X_fit_transformed_wo_method.shape == (X.shape[0], 7)
def test_grid_from_X():
    # tests for _grid_from_X: sanity check for output, and for shapes.

    # Make sure that the grid is a cartesian product of the input (it will use
    # the unique values instead of the percentiles)
    percentiles = (.05, .95)
    grid_resolution = 100
    X = np.asarray([[1, 2],
                    [3, 4]])
    grid, axes = _grid_from_X(X, percentiles, grid_resolution)
    assert_array_equal(grid, [[1, 2],
                              [1, 4],
                              [3, 2],
                              [3, 4]])
    assert_array_equal(axes, X.T)

    # test shapes of returned objects depending on the number of unique values
    # for a feature.
    rng = np.random.RandomState(0)
    grid_resolution = 15

    # n_unique_values > grid_resolution
    X = rng.normal(size=(20, 2))
    grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
    assert grid.shape == (grid_resolution * grid_resolution, X.shape[1])
    assert np.asarray(axes).shape == (2, grid_resolution)

    # n_unique_values < grid_resolution, will use actual values
    n_unique_values = 12
    X[n_unique_values - 1:, 0] = 12345
    rng.shuffle(X)  # just to make sure the order is irrelevant
    grid, axes = _grid_from_X(X, percentiles, grid_resolution=grid_resolution)
    assert grid.shape == (n_unique_values * grid_resolution, X.shape[1])
    # axes is a list of arrays of different shapes
    assert axes[0].shape == (n_unique_values,)
    assert axes[1].shape == (grid_resolution,)
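# The cartesian-product behaviour checked above can be reproduced in a few
# lines; a minimal sketch (illustrative only, not the private helper itself):
def _sketch_cartesian_grid():
    import itertools
    X = np.asarray([[1, 2], [3, 4]])
    axes = [np.unique(col) for col in X.T]
    grid = np.array(list(itertools.product(*axes)))
    assert_array_equal(grid, [[1, 2], [1, 4], [3, 2], [3, 4]])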
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    param_grid = {'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [10],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert model.rows_.shape == (3, 30)
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
def test_bayesian_mixture_predict_predict_proba():
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            X = rand_data.X[covar_type]
            Y = rand_data.Y
            bgmm = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Check an informative error message is raised if we predict
            # before fitting
            assert_raise_message(NotFittedError,
                                 "This BayesianGaussianMixture instance"
                                 " is not fitted yet. Call 'fit' with "
                                 "appropriate arguments before using "
                                 "this estimator.", bgmm.predict, X)

            bgmm.fit(X)
            Y_pred = bgmm.predict(X)
            Y_pred_proba = bgmm.predict_proba(X).argmax(axis=1)
            assert_array_equal(Y_pred, Y_pred_proba)
            assert adjusted_rand_score(Y, Y_pred) >= .95
def test_scikit_vs_scipy():
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X,
                                                               connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(children, children_, 'linkage tree differs'
                                                    ' from scipy impl for'
                                                    ' linkage: ' + linkage)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)
def test_load_svmlight_file():
    X, y = load_svmlight_file(datafile)

    # test X's shape
    assert X.indptr.shape[0] == 7
    assert X.shape[0] == 6
    assert X.shape[1] == 21
    assert y.shape[0] == 6

    # test X's non-zero values
    for i, j, val in ((0, 2, 2.5), (0, 10, -5.2), (0, 15, 1.5),
                      (1, 5, 1.0), (1, 12, -3), (2, 20, 27)):
        assert X[i, j] == val

    # tests X's zero values
    assert X[0, 3] == 0
    assert X[0, 5] == 0
    assert X[1, 8] == 0
    assert X[1, 16] == 0
    assert X[2, 18] == 0

    # test can change X's values
    X[0, 2] *= 2
    assert X[0, 2] == 5

    # test y
    assert_array_equal(y, [1, 2, 3, 4, 1, 2])
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1], batch.shape[0],
                                       dtype=np.int32)
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances,
                    sample_count)
                (incremental_means, incremental_variances,
                 incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances,
                                calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)
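# The running update checked above matches the standard two-set combination
# of Chan et al.; a minimal sketch using sums of squared deviations
# (M2 = var * n) on illustrative data, not the private helper itself:
def _sketch_combine_mean_var():
    a, b = np.random.RandomState(0).randn(2, 30)
    n_a, n_b = a.size, b.size
    n = n_a + n_b
    delta = b.mean() - a.mean()
    mean = a.mean() + delta * n_b / n
    M2 = a.var() * n_a + b.var() * n_b + delta ** 2 * n_a * n_b / n
    ab = np.concatenate([a, b])
    assert_almost_equal(mean, ab.mean())
    assert_almost_equal(M2 / n, ab.var())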
def test_one_hot_encoder_handle_unknown():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X2 = np.array([[4, 1, 1]])

    # Test that one hot encoder raises error for unknown features
    # present during transform.
    oh = OneHotEncoder(handle_unknown='error')
    oh.fit(X)
    with pytest.raises(ValueError, match='Found unknown categories'):
        oh.transform(X2)

    # Test the ignore option, ignores unknown features (giving all 0's)
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    X2_passed = X2.copy()
    assert_array_equal(
        oh.transform(X2_passed).toarray(),
        np.array([[0., 0., 0., 0., 1., 0., 0.]]))
    # ensure transformed data was not modified in place
    assert_allclose(X2, X2_passed)

    # Raise error if handle_unknown is neither ignore or error.
    oh = OneHotEncoder(handle_unknown='42')
    with pytest.raises(ValueError, match='handle_unknown should be either'):
        oh.fit(X)