def test_linearsvc(): # Test basic routines using LinearSVC clf = svm.LinearSVC(random_state=0).fit(X, Y) # by default should have intercept assert_true(clf.fit_intercept) assert_array_equal(clf.predict(T), true_result) assert_array_almost_equal(clf.intercept_, [0], decimal=3) # the same with l1 penalty clf = svm.LinearSVC(penalty='l1', loss='squared_hinge', dual=False, random_state=0).fit(X, Y) assert_array_equal(clf.predict(T), true_result) # l2 penalty with dual formulation clf = svm.LinearSVC(penalty='l2', dual=True, random_state=0).fit(X, Y) assert_array_equal(clf.predict(T), true_result) # l2 penalty, l1 loss clf = svm.LinearSVC(penalty='l2', loss='hinge', dual=True, random_state=0) clf.fit(X, Y) assert_array_equal(clf.predict(T), true_result) # test also decision function dec = clf.decision_function(T) res = (dec > 0).astype(np.int) + 1 assert_array_equal(res, true_result)
def test_RadiusNeighborsRegressor_multioutput(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0): """Test k-neighbors in multi-output regression with various weight""" rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ['uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) rnn.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = rnn.predict(X[:n_test_pts] + epsilon) assert_equal(y_pred.shape, y_target.shape) assert_true(np.all(np.abs(y_pred - y_target) < 0.3))
def test_parameter_grid(): """Test basic properties of ParameterGrid.""" params1 = {"foo": [1, 2, 3]} grid1 = ParameterGrid(params1) assert_true(isinstance(grid1, Iterable)) assert_true(isinstance(grid1, Sized)) assert_equal(len(grid1), 3) params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]} grid2 = ParameterGrid(params2) assert_equal(len(grid2), 6) # loop to assert we can iterate over the grid multiple times for i in xrange(2): # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2) points = set(tuple(chain(*(sorted(p.items())))) for p in grid2) assert_equal(points, set(("bar", x, "foo", y) for x, y in product(params2["bar"], params2["foo"]))) # Special case: empty grid (useful to get default estimator settings) empty = ParameterGrid({}) assert_equal(len(empty), 1) assert_equal(list(empty), [{}]) has_empty = ParameterGrid([{'C': [1, 10]}, {}]) assert_equal(len(has_empty), 3) assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}])
def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC): # Test that dense liblinear honours intercept_scaling param X = [[2, 1], [3, 1], [1, 3], [2, 3]] y = [0, 0, 1, 1] clf = classifier(fit_intercept=True, penalty='l1', loss='squared_hinge', dual=False, C=4, tol=1e-7, random_state=0) assert_true(clf.intercept_scaling == 1, clf.intercept_scaling) assert_true(clf.fit_intercept) # when intercept_scaling is low the intercept value is highly "penalized" # by regularization clf.intercept_scaling = 1 clf.fit(X, y) assert_almost_equal(clf.intercept_, 0, decimal=5) # when intercept_scaling is sufficiently high, the intercept value # is not affected by regularization clf.intercept_scaling = 100 clf.fit(X, y) intercept1 = clf.intercept_ assert_less(intercept1, -1) # when intercept_scaling is sufficiently high, the intercept value # doesn't depend on intercept_scaling value clf.intercept_scaling = 1000 clf.fit(X, y) intercept2 = clf.intercept_ assert_array_almost_equal(intercept1, intercept2, decimal=2)
def test_grid_search_precomputed_kernel(): """Test that grid search works when the input features are given in the form of a precomputed kernel matrix """ X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) # compute the training kernel matrix corresponding to the linear kernel K_train = np.dot(X_[:180], X_[:180].T) y_train = y_[:180] clf = SVC(kernel='precomputed') cv = GridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(K_train, y_train) assert_true(cv.best_score_ >= 0) # compute the test kernel matrix K_test = np.dot(X_[180:], X_[:180].T) y_test = y_[180:] y_pred = cv.predict(K_test) assert_true(np.mean(y_pred == y_test) >= 0) # test error is raised when the precomputed kernel is not array-like # or sparse assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
def test_check_is_partition(): p = np.arange(100) assert_true(cval._check_is_partition(p, 100)) assert_false(cval._check_is_partition(np.delete(p, 23), 100)) p[0] = 23 assert_false(cval._check_is_partition(p, 100))
def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold # implementation rng = np.random.RandomState(42) noisy_centers = centers + rng.normal(size=centers.shape) labels_gold = - np.ones(n_samples, dtype=np.int) mindist = np.empty(n_samples) mindist.fill(np.infty) for center_id in range(n_clusters): dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1) labels_gold[dist < mindist] = center_id mindist = np.minimum(dist, mindist) inertia_gold = mindist.sum() assert_true((mindist >= 0.0).all()) assert_true((labels_gold != -1).all()) # perform label assignment using the dense array input x_squared_norms = (X ** 2).sum(axis=1) labels_array, inertia_array = _labels_inertia( X, x_squared_norms, noisy_centers) assert_array_almost_equal(inertia_array, inertia_gold) assert_array_equal(labels_array, labels_gold) # perform label assignment using the sparse CSR input x_squared_norms_from_csr = row_norms(X_csr, squared=True) labels_csr, inertia_csr = _labels_inertia( X_csr, x_squared_norms_from_csr, noisy_centers) assert_array_almost_equal(inertia_csr, inertia_gold) assert_array_equal(labels_csr, labels_gold)
def test_check_is_permutation(): p = np.arange(100) assert_true(_check_is_permutation(p, 100)) assert_false(_check_is_permutation(np.delete(p, 23), 100)) p[0] = 23 assert_false(_check_is_permutation(p, 100))
def train_test_split_mock_pandas(): # X mock dataframe X_df = MockDataFrame(X) X_train, X_test = train_test_split(X_df) assert_true(isinstance(X_train, MockDataFrame)) assert_true(isinstance(X_test, MockDataFrame)) X_train_arr, X_test_arr = train_test_split(X_df)
def test_ovo_gridsearch(): ovo = OneVsOneClassifier(LinearSVC(random_state=0)) Cs = [0.1, 0.5, 0.8] cv = GridSearchCV(ovo, {'estimator__C': Cs}) cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert_true(best_C in Cs)
def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"], ["ham", "eggs"], ["ham"]] #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]]) classes = set("ham eggs spam".split()) for base_clf in (MultinomialNB(), LinearSVC(random_state=0), LinearRegression(), Ridge(), ElasticNet(), Lasso(alpha=0.5)): # test input as lists of tuples clf = assert_warns(DeprecationWarning, OneVsRestClassifier(base_clf).fit, X, y) assert_equal(set(clf.classes_), classes) y_pred = clf.predict([[0, 4, 4]])[0] assert_equal(set(y_pred), set(["spam", "eggs"])) assert_true(clf.multilabel_) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) y_pred = clf.predict([[0, 4, 4]])[0] assert_array_equal(y_pred, [0, 1, 1]) assert_true(clf.multilabel_)
def check_warm_start_oob(name): # Test that the warm start computes oob score when asked. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=True) clf.fit(X, y) clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=False) clf_2.fit(X, y) clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15) clf_2.fit(X, y) assert_true(hasattr(clf_2, 'oob_score_')) assert_equal(clf.oob_score_, clf_2.oob_score_) # Test that oob_score is computed even if we don't need to train # additional trees. clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, random_state=1, bootstrap=True, oob_score=False) clf_3.fit(X, y) assert_true(not(hasattr(clf_3, 'oob_score_'))) clf_3.set_params(oob_score=True) ignore_warnings(clf_3.fit)(X, y) assert_equal(clf.oob_score_, clf_3.oob_score_)
def check_get_params_invariance(name, estimator): class T(BaseEstimator): """Mock classifier """ def __init__(self): pass def fit(self, X, y): return self if name in ('FeatureUnion', 'Pipeline'): e = estimator([('clf', T())]) elif name in ('GridSearchCV' 'RandomizedSearchCV'): return else: e = estimator() shallow_params = e.get_params(deep=False) deep_params = e.get_params(deep=True) assert_true(all(item in deep_params.items() for item in shallow_params.items()))
def test_kfold_valueerrors(): # Check that errors are raised if there is not enough samples assert_raises(ValueError, cval.KFold, 3, 4) # Check that a warning is raised if the least populated class has too few # members. with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') y = [3, 3, -1, -1, 2] cv = cval.StratifiedKFold(y, 3) # checking there was only one warning. assert_equal(len(w), 1) # checking it has the right type assert_equal(w[0].category, Warning) # checking it's the right warning. This might be a bad test since it's # a characteristic of the code and not a behavior assert_true("The least populated class" in str(w[0])) # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y)) # Error when number of folds is <= 1 assert_raises(ValueError, cval.KFold, 2, 0) assert_raises(ValueError, cval.KFold, 2, 1) assert_raises(ValueError, cval.StratifiedKFold, y, 0) assert_raises(ValueError, cval.StratifiedKFold, y, 1) # When n is not integer: assert_raises(ValueError, cval.KFold, 2.5, 2) # When n_folds is not integer: assert_raises(ValueError, cval.KFold, 5, 1.5) assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
def test_symmetry(): """Test the symmetry of score and loss functions""" random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20, )) y_pred = random_state.randint(0, 2, size=(20, )) # We shouldn't forget any metrics assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS, METRIC_UNDEFINED_MULTICLASS), set(ALL_METRICS)) assert_equal( set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), set([])) # Symmetric metric for name in SYMMETRIC_METRICS: metric = ALL_METRICS[name] assert_almost_equal(metric(y_true, y_pred), metric(y_pred, y_true), err_msg="%s is not symmetric" % name) # Not symmetric metrics for name in NOT_SYMMETRIC_METRICS: metric = ALL_METRICS[name] assert_true(np.any(metric(y_true, y_pred) != metric(y_pred, y_true)), msg="%s seems to be symmetric" % name)
def test_fetch_rcv1(): try: data1 = fetch_rcv1(shuffle=False, download_if_missing=False) except IOError as e: if e.errno == errno.ENOENT: raise SkipTest("Download RCV1 dataset to run this test.") X1, Y1 = data1.data, data1.target cat_list, s1 = data1.target_names.tolist(), data1.sample_id # test sparsity assert_true(sp.issparse(X1)) assert_true(sp.issparse(Y1)) assert_equal(60915113, X1.data.size) assert_equal(2606875, Y1.data.size) # test shapes assert_equal((804414, 47236), X1.shape) assert_equal((804414, 103), Y1.shape) assert_equal((804414,), s1.shape) assert_equal(103, len(cat_list)) # test ordering of categories first_categories = [u'C11', u'C12', u'C13', u'C14', u'C15', u'C151'] assert_array_equal(first_categories, cat_list[:6]) # test number of sample for some categories some_categories = ('GMIL', 'E143', 'CCAT') number_non_zero_in_cat = (5, 1206, 381327) for num, cat in zip(number_non_zero_in_cat, some_categories): j = cat_list.index(cat) assert_equal(num, Y1[:, j].data.size) # test shuffling and subset data2 = fetch_rcv1(shuffle=True, subset='train', random_state=77, download_if_missing=False) X2, Y2 = data2.data, data2.target s2 = data2.sample_id # test return_X_y option fetch_func = partial(fetch_rcv1, shuffle=False, subset='train', download_if_missing=False) check_return_X_y(data2, fetch_func) # The first 23149 samples are the training samples assert_array_equal(np.sort(s1[:23149]), np.sort(s2)) # test some precise values some_sample_ids = (2286, 3274, 14042) for sample_id in some_sample_ids: idx1 = s1.tolist().index(sample_id) idx2 = s2.tolist().index(sample_id) feature_values_1 = X1[idx1, :].toarray() feature_values_2 = X2[idx2, :].toarray() assert_almost_equal(feature_values_1, feature_values_2) target_values_1 = Y1[idx1, :].toarray() target_values_2 = Y2[idx2, :].toarray() assert_almost_equal(target_values_1, target_values_2)
def test_neighbors_accuracy_with_n_estimators(): # Checks whether accuracy increases as `n_estimators` increases. n_estimators = np.array([1, 10, 100]) n_samples = 100 n_features = 10 n_iter = 10 n_points = 5 rng = np.random.RandomState(42) accuracies = np.zeros(n_estimators.shape[0], dtype=float) X = rng.rand(n_samples, n_features) for i, t in enumerate(n_estimators): lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( n_candidates=500, n_estimators=t) ignore_warnings(lshf.fit)(X) for j in range(n_iter): query = X[rng.randint(0, n_samples)].reshape(1, -1) neighbors = lshf.kneighbors(query, n_neighbors=n_points, return_distance=False) distances = pairwise_distances(query, X, metric='cosine') ranks = np.argsort(distances)[0, :n_points] intersection = np.intersect1d(ranks, neighbors).shape[0] ratio = intersection / float(n_points) accuracies[i] = accuracies[i] + ratio accuracies[i] = accuracies[i] / float(n_iter) # Sorted accuracies should be equal to original accuracies assert_true(np.all(np.diff(accuracies) >= 0), msg="Accuracies are not non-decreasing.") # Highest accuracy should be strictly greater than the lowest assert_true(np.ptp(accuracies) > 0, msg="Highest accuracy is not strictly greater than lowest.")
def test_ward_clustering(): """ Check that we obtain the correct number of clusters with Ward clustering. """ rnd = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) X = rnd.randn(100, 50) connectivity = grid_to_graph(*mask.shape) clustering = Ward(n_clusters=10, connectivity=connectivity) clustering.fit(X) # test caching clustering = Ward(n_clusters=10, connectivity=connectivity, memory=mkdtemp()) clustering.fit(X) labels = clustering.labels_ assert_true(np.size(np.unique(labels)) == 10) # Turn caching off now clustering = Ward(n_clusters=10, connectivity=connectivity) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) np.testing.assert_array_equal(clustering.labels_, labels) clustering.connectivity = None clustering.fit(X) assert_true(np.size(np.unique(clustering.labels_)) == 10) # Check that we raise a TypeError on dense matrices clustering = Ward(n_clusters=10, connectivity=connectivity.todense()) assert_raises(TypeError, clustering.fit, X) clustering = Ward(n_clusters=10, connectivity=sparse.lil_matrix( connectivity.todense()[:10, :10])) assert_raises(ValueError, clustering.fit, X)
def test_enet_positive_constraint(): X = [[-1], [0], [1]] y = [1, 0, -1] # just a straight line with negative slope enet = ElasticNet(alpha=0.1, max_iter=1000, positive=True) enet.fit(X, y) assert_true(min(enet.coef_) >= 0)
def test_grid_search_score_method(): X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2, random_state=0) clf = LinearSVC(random_state=0) grid = {'C': [.1]} search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y) search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y) search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid, scoring='roc_auc').fit(X, y) search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y) # ChangedBehaviourWarning occurred previously (prior to #9005) score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y) score_accuracy = assert_no_warnings(search_accuracy.score, X, y) score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score, X, y) score_auc = assert_no_warnings(search_auc.score, X, y) # ensure the test is sane assert_true(score_auc < 1.0) assert_true(score_accuracy < 1.0) assert_not_equal(score_auc, score_accuracy) assert_almost_equal(score_accuracy, score_no_scoring) assert_almost_equal(score_auc, score_no_score_auc)
def test_parameters_sampler_replacement(): # raise error if n_iter too large params = {'first': [0, 1], 'second': ['a', 'b', 'c']} sampler = ParameterSampler(params, n_iter=7) assert_raises(ValueError, list, sampler) # degenerates to GridSearchCV if n_iter the same as grid_size sampler = ParameterSampler(params, n_iter=6) samples = list(sampler) assert_equal(len(samples), 6) for values in ParameterGrid(params): assert_true(values in samples) # test sampling without replacement in a large grid params = {'a': range(10), 'b': range(10), 'c': range(10)} sampler = ParameterSampler(params, n_iter=99, random_state=42) samples = list(sampler) assert_equal(len(samples), 99) hashable_samples = ["a%db%dc%d" % (p['a'], p['b'], p['c']) for p in samples] assert_equal(len(set(hashable_samples)), 99) # doesn't go into infinite loops params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']} sampler = ParameterSampler(params_distribution, n_iter=7) samples = list(sampler) assert_equal(len(samples), 7)
def test_simple(): # Principle of Lars is to keep covariances tied and decreasing # also test verbose output from sklearn.externals.six.moves import cStringIO as StringIO import sys old_stdout = sys.stdout try: sys.stdout = StringIO() alphas_, active, coef_path_ = linear_model.lars_path( diabetes.data, diabetes.target, method="lar", verbose=10) sys.stdout = old_stdout for (i, coef_) in enumerate(coef_path_.T): res = y - np.dot(X, coef_) cov = np.dot(X.T, res) C = np.max(abs(cov)) eps = 1e-3 ocur = len(cov[C - eps < abs(cov)]) if i < X.shape[1]: assert_true(ocur == i + 1) else: # no more than max_pred variables can go into the active set assert_true(ocur == X.shape[1]) finally: sys.stdout = old_stdout
def test_enet_path_positive(): # Test that the coefs returned by positive=True in enet_path are positive X, y, _, _ = build_dataset(n_samples=50, n_features=50) for path in [enet_path, lasso_path]: pos_path_coef = path(X, y, positive=True)[1] assert_true(np.all(pos_path_coef >= 0))
def test_check_increasing_up_extreme(): x = [0, 1, 2, 3, 4, 5] y = [0, 1, 2, 3, 4, 5] # Check that we got increasing=True and no warnings is_increasing = assert_no_warnings(check_increasing, x, y) assert_true(is_increasing)
def test_lasso_cv(): X, y, X_test, y_test = build_dataset() max_iter = 150 clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter).fit(X, y) assert_almost_equal(clf.alpha_, 0.056, 2) clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter, precompute=True) clf.fit(X, y) assert_almost_equal(clf.alpha_, 0.056, 2) # Check that the lars and the coordinate descent implementation # select a similar alpha lars = LassoLarsCV(normalize=False, max_iter=30).fit(X, y) # for this we check that they don't fall in the grid of # clf.alphas further than 1 assert_true(np.abs( np.searchsorted(clf.alphas_[::-1], lars.alpha_) - np.searchsorted(clf.alphas_[::-1], clf.alpha_)) <= 1) # check that they also give a similar MSE mse_lars = interpolate.interp1d(lars.cv_alphas_, lars.cv_mse_path_.T) np.testing.assert_approx_equal(mse_lars(clf.alphas_[5]).mean(), clf.mse_path_[5].mean(), significant=2) # test set assert_greater(clf.score(X_test, y_test), 0.99)
def test_fit_transform(): alpha = 1 rng = np.random.RandomState(0) Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha, random_state=0) spca_lars.fit(Y) U1 = spca_lars.transform(Y) # Test multiple CPUs if sys.platform == 'win32': # fake parallelism for win32 import sklearn.externals.joblib.parallel as joblib_par _mp = joblib_par.multiprocessing joblib_par.multiprocessing = None try: spca = SparsePCA(n_components=3, n_jobs=2, random_state=0, alpha=alpha).fit(Y) U2 = spca.transform(Y) finally: joblib_par.multiprocessing = _mp else: # we can efficiently use parallelism spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha, random_state=0).fit(Y) U2 = spca.transform(Y) assert_true(not np.all(spca_lars.components_ == 0)) assert_array_almost_equal(U1, U2) # Test that CD gives similar results spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0, alpha=alpha) spca_lasso.fit(Y) assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_check_increasing_up(): x = [0, 1, 2, 3, 4, 5] y = [0, 1.5, 2.77, 8.99, 8.99, 50] # Check that we got increasing=True and no warnings is_increasing = assert_no_warnings(check_increasing, x, y) assert_true(is_increasing)
def test_sgd_l1(self): """Test L1 regularization""" n = len(X4) rng = np.random.RandomState(13) idx = np.arange(n) rng.shuffle(idx) X = X4[idx, :] Y = Y4[idx] clf = self.factory(penalty="l1", alpha=0.2, fit_intercept=False, n_iter=2000, shuffle=False) clf.fit(X, Y) assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) pred = clf.predict(X) assert_array_equal(pred, Y) # test sparsify with dense inputs clf.sparsify() assert_true(sp.issparse(clf.coef_)) pred = clf.predict(X) assert_array_equal(pred, Y) # pickle and unpickle with sparse coef_ clf = pickle.loads(pickle.dumps(clf)) assert_true(sp.issparse(clf.coef_)) pred = clf.predict(X) assert_array_equal(pred, Y)
def test_scale_function_without_centering(): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = sparse.csr_matrix(X) X_scaled = scale(X, with_mean=False) assert_false(np.any(np.isnan(X_scaled))) X_csr_scaled = scale(X_csr, with_mean=False) assert_false(np.any(np.isnan(X_csr_scaled.data))) # test csc has same outcome X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) # raises value error on axis != 0 assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1) assert_array_almost_equal(X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is not X) X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
def test_lars_path_positive_constraint(): # this is the main test for the positive parameter on the lars_path method # the estimator classes just make use of this function # we do the test on the diabetes dataset # ensure that we get negative coefficients when positive=False # and all positive when positive=True # for method 'lar' (default) and lasso # Once deprecation of LAR + positive option is done use these: # assert_raises(ValueError, linear_model.lars_path, diabetes['data'], # diabetes['target'], method='lar', positive=True) with pytest.warns(DeprecationWarning, match="broken"): linear_model.lars_path(diabetes['data'], diabetes['target'], return_path=True, method='lar', positive=True) method = 'lasso' alpha, active, coefs = \ linear_model.lars_path(diabetes['data'], diabetes['target'], return_path=True, method=method, positive=False) assert_true(coefs.min() < 0) alpha, active, coefs = \ linear_model.lars_path(diabetes['data'], diabetes['target'], return_path=True, method=method, positive=True) assert_true(coefs.min() >= 0)
def test_parameters(self): assert_true( hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert_true( hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert_true( hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None) assert_true( hasattr(self.clf, '_sigma') and self.clf._sigma is not None) assert_true(hasattr(self.clf, 'hist_') and self.clf.hist_ is not None) assert_true( hasattr(self.clf, 'bin_edges_') and self.clf.bin_edges_ is not None)
def test_normalize(): a = Real(2.0, 30.0, transform="normalize") for i in range(50): check_limits(a.rvs(random_state=i), 2, 30) rng = np.random.RandomState(0) X = rng.randn(100) X = 28 * (X - X.min()) / (X.max() - X.min()) + 2 # Check transformed values are in [0, 1] assert_true(np.all(a.transform(X) <= np.ones_like(X))) assert_true(np.all(np.zeros_like(X) <= a.transform(X))) # Check inverse transform assert_array_almost_equal(a.inverse_transform(a.transform(X)), X) # log-uniform prior a = Real(10**2.0, 10**4.0, prior="log-uniform", transform="normalize") for i in range(50): check_limits(a.rvs(random_state=i), 10**2, 10**4) rng = np.random.RandomState(0) X = np.clip(10**3 * rng.randn(100), 10**2.0, 10**4.0) # Check transform assert_true(np.all(a.transform(X) <= np.ones_like(X))) assert_true(np.all(np.zeros_like(X) <= a.transform(X))) # Check inverse transform assert_array_almost_equal(a.inverse_transform(a.transform(X)), X) a = Integer(2, 30, transform="normalize") for i in range(50): check_limits(a.rvs(random_state=i), 2, 30) assert_array_equal(a.transformed_bounds, (0, 1)) X = rng.randint(2, 31) # Check transformed values are in [0, 1] assert_true(np.all(a.transform(X) <= np.ones_like(X))) assert_true(np.all(np.zeros_like(X) <= a.transform(X))) # Check inverse transform X_orig = a.inverse_transform(a.transform(X)) assert_equal(X_orig.dtype, "int64") assert_array_equal(X_orig, X)
def test_space_api(): space = Space([(0.0, 1.0), (-5, 5), ("a", "b", "c"), (1.0, 5.0, "log-uniform"), ("e", "f")]) cat_space = Space([(1, "r"), (1.0, "r")]) assert isinstance(cat_space.dimensions[0], Categorical) assert isinstance(cat_space.dimensions[1], Categorical) assert_equal(len(space.dimensions), 5) assert_true(isinstance(space.dimensions[0], Real)) assert_true(isinstance(space.dimensions[1], Integer)) assert_true(isinstance(space.dimensions[2], Categorical)) assert_true(isinstance(space.dimensions[3], Real)) assert_true(isinstance(space.dimensions[4], Categorical)) samples = space.rvs(n_samples=10, random_state=0) assert_equal(len(samples), 10) assert_equal(len(samples[0]), 5) assert_true(isinstance(samples, list)) for n in range(4): assert_true(isinstance(samples[n], list)) assert_true(isinstance(samples[0][0], numbers.Real)) assert_true(isinstance(samples[0][1], numbers.Integral)) assert_true(isinstance(samples[0][2], str)) assert_true(isinstance(samples[0][3], numbers.Real)) assert_true(isinstance(samples[0][4], str)) samples_transformed = space.transform(samples) assert_equal(samples_transformed.shape[0], len(samples)) assert_equal(samples_transformed.shape[1], 1 + 1 + 3 + 1 + 1) assert_array_equal(samples, space.inverse_transform(samples_transformed)) samples = space.inverse_transform(samples_transformed) assert_true(isinstance(samples[0][0], numbers.Real)) assert_true(isinstance(samples[0][1], numbers.Integral)) assert_true(isinstance(samples[0][2], str)) assert_true(isinstance(samples[0][3], numbers.Real)) assert_true(isinstance(samples[0][4], str)) for b1, b2 in zip(space.bounds, [(0.0, 1.0), (-5, 5), np.asarray(["a", "b", "c"]), (1.0, 5.0), np.asarray(["e", "f"])]): assert_array_equal(b1, b2) for b1, b2 in zip(space.transformed_bounds, [(0.0, 1.0), (-5, 5), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (np.log10(1.0), np.log10(5.0)), (0.0, 1.0)]): assert_array_equal(b1, b2)
def test_metaestimator_delegation(): """Ensures specified metaestimators have methods iff subestimator does""" def hides(method): @property def wrapper(obj): if obj.hidden_method == method.__name__: raise AttributeError('%r is hidden' % obj.hidden_method) return functools.partial(method, obj) return wrapper class SubEstimator(BaseEstimator): def __init__(self, param=1, hidden_method=None): self.param = param self.hidden_method = hidden_method def fit(self, X, y=None, *args, **kwargs): self.coef_ = np.arange(X.shape[1]) return True def _check_fit(self): if not hasattr(self, 'coef_'): raise RuntimeError('Estimator is not fit') @hides def inverse_transform(self, X, *args, **kwargs): self._check_fit() return X @hides def transform(self, X, *args, **kwargs): self._check_fit() return X @hides def predict(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def predict_proba(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def predict_log_proba(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def decision_function(self, X, *args, **kwargs): self._check_fit() return np.ones(X.shape[0]) @hides def score(self, X, *args, **kwargs): self._check_fit() return 1.0 methods = [ k for k in iterkeys(SubEstimator.__dict__) if not k.startswith('_') and not k.startswith('fit') ] methods.sort() for delegator_data in DELEGATING_METAESTIMATORS: delegate = SubEstimator() delegator = delegator_data.construct(delegate) for method in methods: if method in delegator_data.skip_methods: continue assert_true(hasattr(delegate, method)) assert_true( hasattr(delegator, method), msg="%s does not have method %r when its delegate does" % (delegator_data.name, method)) # delegation before fit raises an exception assert_raises(Exception, getattr(delegator, method), delegator_data.fit_args[0]) delegator.fit(*delegator_data.fit_args) for method in methods: if method in delegator_data.skip_methods: continue # smoke test delegation getattr(delegator, method)(delegator_data.fit_args[0]) for method in methods: if method in delegator_data.skip_methods: continue delegate = SubEstimator(hidden_method=method) delegator = delegator_data.construct(delegate) assert_false(hasattr(delegate, method)) assert_false(hasattr(delegator, method), msg="%s has method %r when its delegate does not" % (delegator_data.name, method))
def test_has_fit_parameter(): assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight")) assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight")) assert_true(has_fit_parameter(SVR, "sample_weight")) assert_true(has_fit_parameter(SVR(), "sample_weight"))
def test_check_array_dtype_warning(): X_int_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] X_float64 = np.asarray(X_int_list, dtype=np.float64) X_float32 = np.asarray(X_int_list, dtype=np.float32) X_int64 = np.asarray(X_int_list, dtype=np.int64) X_csr_float64 = sp.csr_matrix(X_float64) X_csr_float32 = sp.csr_matrix(X_float32) X_csc_float32 = sp.csc_matrix(X_float32) X_csc_int32 = sp.csc_matrix(X_int64, dtype=np.int32) y = [0, 0, 1] integer_data = [X_int64, X_csc_int32] float64_data = [X_float64, X_csr_float64] float32_data = [X_float32, X_csr_float32, X_csc_float32] for X in integer_data: X_checked = assert_no_warnings(check_array, X, dtype=np.float64, accept_sparse=True) assert_equal(X_checked.dtype, np.float64) X_checked = assert_warns(DataConversionWarning, check_array, X, dtype=np.float64, accept_sparse=True, warn_on_dtype=True) assert_equal(X_checked.dtype, np.float64) # Check that the warning message includes the name of the Estimator X_checked = assert_warns_message(DataConversionWarning, 'SomeEstimator', check_array, X, dtype=[np.float64, np.float32], accept_sparse=True, warn_on_dtype=True, estimator='SomeEstimator') assert_equal(X_checked.dtype, np.float64) X_checked, y_checked = assert_warns_message( DataConversionWarning, 'KNeighborsClassifier', check_X_y, X, y, dtype=np.float64, accept_sparse=True, warn_on_dtype=True, estimator=KNeighborsClassifier()) assert_equal(X_checked.dtype, np.float64) for X in float64_data: X_checked = assert_no_warnings(check_array, X, dtype=np.float64, accept_sparse=True, warn_on_dtype=True) assert_equal(X_checked.dtype, np.float64) X_checked = assert_no_warnings(check_array, X, dtype=np.float64, accept_sparse=True, warn_on_dtype=False) assert_equal(X_checked.dtype, np.float64) for X in float32_data: X_checked = assert_no_warnings(check_array, X, dtype=[np.float64, np.float32], accept_sparse=True) assert_equal(X_checked.dtype, np.float32) assert_true(X_checked is X) X_checked = assert_no_warnings(check_array, X, dtype=[np.float64, np.float32], accept_sparse=['csr', 'dok'], copy=True) assert_equal(X_checked.dtype, np.float32) assert_false(X_checked is X) X_checked = assert_no_warnings(check_array, X_csc_float32, dtype=[np.float64, np.float32], accept_sparse=['csr', 'dok'], copy=False) assert_equal(X_checked.dtype, np.float32) assert_false(X_checked is X_csc_float32) assert_equal(X_checked.format, 'csr')
def test_check_array(): # accept_sparse == None # raise error on sparse inputs X = [[1, 2], [3, 4]] X_csr = sp.csr_matrix(X) assert_raises(TypeError, check_array, X_csr) # ensure_2d X_array = check_array([0, 1, 2], ensure_2d=False) assert_equal(X_array.ndim, 1) # don't allow ndim > 3 X_ndim = np.arange(8).reshape(2, 2, 2) assert_raises(ValueError, check_array, X_ndim) check_array(X_ndim, allow_nd=True) # doesn't raise # force_all_finite X_inf = np.arange(4).reshape(2, 2).astype(np.float) X_inf[0, 0] = np.inf assert_raises(ValueError, check_array, X_inf) check_array(X_inf, force_all_finite=False) # no raise # nan check X_nan = np.arange(4).reshape(2, 2).astype(np.float) X_nan[0, 0] = np.nan assert_raises(ValueError, check_array, X_nan) check_array(X_inf, force_all_finite=False) # no raise # dtype and order enforcement. X_C = np.arange(4).reshape(2, 2).copy("C") X_F = X_C.copy("F") X_int = X_C.astype(np.int) X_float = X_C.astype(np.float) Xs = [X_C, X_F, X_int, X_float] dtypes = [np.int32, np.int, np.float, np.float32, None, np.bool, object] orders = ['C', 'F', None] copys = [True, False] for X, dtype, order, copy in product(Xs, dtypes, orders, copys): X_checked = check_array(X, dtype=dtype, order=order, copy=copy) if dtype is not None: assert_equal(X_checked.dtype, dtype) else: assert_equal(X_checked.dtype, X.dtype) if order == 'C': assert_true(X_checked.flags['C_CONTIGUOUS']) assert_false(X_checked.flags['F_CONTIGUOUS']) elif order == 'F': assert_true(X_checked.flags['F_CONTIGUOUS']) assert_false(X_checked.flags['C_CONTIGUOUS']) if copy: assert_false(X is X_checked) else: # doesn't copy if it was already good if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS'] and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']): assert_true(X is X_checked) # allowed sparse != None X_csc = sp.csc_matrix(X_C) X_coo = X_csc.tocoo() X_dok = X_csc.todok() X_int = X_csc.astype(np.int) X_float = X_csc.astype(np.float) Xs = [X_csc, X_coo, X_dok, X_int, X_float] accept_sparses = [['csr', 'coo'], ['coo', 'dok']] for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses, copys): with warnings.catch_warnings(record=True) as w: X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse, copy=copy) if (dtype is object or sp.isspmatrix_dok(X)) and len(w): message = str(w[0].message) messages = [ "object dtype is not supported by sparse matrices", "Can't check dok sparse matrix for nan or inf." ] assert_true(message in messages) else: assert_equal(len(w), 0) if dtype is not None: assert_equal(X_checked.dtype, dtype) else: assert_equal(X_checked.dtype, X.dtype) if X.format in accept_sparse: # no change if allowed assert_equal(X.format, X_checked.format) else: # got converted assert_equal(X_checked.format, accept_sparse[0]) if copy: assert_false(X is X_checked) else: # doesn't copy if it was already good if (X.dtype == X_checked.dtype and X.format == X_checked.format): assert_true(X is X_checked) # other input formats # convert lists to arrays X_dense = check_array([[1, 2], [3, 4]]) assert_true(isinstance(X_dense, np.ndarray)) # raise on too deep lists assert_raises(ValueError, check_array, X_ndim.tolist()) check_array(X_ndim.tolist(), allow_nd=True) # doesn't raise # convert weird stuff to arrays X_no_array = NotAnArray(X_dense) result = check_array(X_no_array) assert_true(isinstance(result, np.ndarray))
def test_n_nonzero_coefs(): assert_true(count_nonzero(orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)) <= 5) assert_true(count_nonzero(orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5, precompute=True)) <= 5)
def test_classifiers_train(): # test if classifiers do something sensible on training set # also test all shapes / shape errors classifiers = all_estimators(type_filter='classifier') X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # do it once with binary, once with multiclass classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape for name, Classifier in classifiers: if name in dont_test: continue if name in ['MultinomialNB', 'BernoulliNB']: # TODO also test these! continue # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples, )) # training set performance assert_greater(accuracy_score(y, y_pred), 0.85) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict: decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.ravel().shape, (n_samples, )) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if (n_classes is 3 and not isinstance(classifier, BaseLibSVM)): # 1on1 of LibSVM works differently assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): try: # predict_proba agrees with predict: y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T) except NotImplementedError: pass
def test_tol(): tol = 0.5 gamma = orthogonal_mp(X, y[:, 0], tol=tol) gamma_gram = orthogonal_mp(X, y[:, 0], tol=tol, precompute=True) assert_true(np.sum((y[:, 0] - np.dot(X, gamma)) ** 2) <= tol) assert_true(np.sum((y[:, 0] - np.dot(X, gamma_gram)) ** 2) <= tol)
def test_transformers(): # test if transformers do something sensible on training set # also test all shapes / shape errors transformers = all_estimators(type_filter='transformer') X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) n_samples, n_features = X.shape X = StandardScaler().fit_transform(X) X -= X.min() succeeded = True for name, Transformer in transformers: if name in dont_test: continue # these don't actually fit the data: if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']: continue # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if hasattr(transformer, 'compute_importances'): transformer.compute_importances = True if name == 'SelectKBest': # SelectKBest has a default of k=10 # which is more feature than we have. transformer.k = 1 elif name in ['GaussianRandomProjection', 'SparseRandomProjection']: # Due to the jl lemma and very few samples, the number # of components of the random matrix projection will be greater # than the number of features. # So we impose a smaller number (avoid "auto" mode) transformer.n_components = 1 elif name == "MiniBatchDictionaryLearning": transformer.set_params(n_iter=5) # default = 1000 elif name == "KernelPCA": transformer.remove_zero_eig = False # fit if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'): y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y try: transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) except Exception as e: print(transformer) print(e) print() succeeded = False continue if hasattr(transformer, 'transform'): if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'): X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform not correct in %s" % Transformer) assert_array_almost_equal( x_pred3, x_pred2, 2, "fit_transform not correct in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform not correct in %s" % Transformer) assert_array_almost_equal( X_pred3, X_pred2, 2, "fit_transform not correct in %s" % Transformer) # raises error on malformed input for transform assert_raises(ValueError, transformer.transform, X.T) assert_true(succeeded)
def fit(self, X, Y): assert_true(len(X) == len(Y)) self.coef_ = np.ones(X.shape[1], dtype=np.float64) return self
def test_fastica(add_noise=False): """ Test the FastICA algorithm on very simple data. """ # scipy.stats uses the global RNG: rng = np.random.RandomState(0) n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube'] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo) else: X = PCA(n_components=2, whiten=True).fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if add_noise == False: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class ica = FastICA(fun=nl, algorithm=algo, random_state=0) ica.fit(m.T) ica.get_mixing_matrix() assert_true(ica.components_.shape == (2, 2)) assert_true(ica.sources_.shape == (1000, 2))
def test_transformers_pickle(): # test if transformers do something sensible on training set # also test all shapes / shape errors transformers = all_estimators(type_filter='transformer') X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) n_samples, n_features = X.shape X = StandardScaler().fit_transform(X) X -= X.min() succeeded = True for name, Transformer in transformers: if name in dont_test: continue # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() if not hasattr(transformer, 'transform'): continue set_random_state(transformer) if hasattr(transformer, 'compute_importances'): transformer.compute_importances = True if name == "SelectKBest": # SelectKBest has a default of k=10 # which is more feature than we have. transformer.k = 1 elif name in ['GaussianRandomProjection', 'SparseRandomProjection']: # Due to the jl lemma and very few samples, the number # of components of the random matrix projection will be greater # than the number of features. # So we impose a smaller number (avoid "auto" mode) transformer.n_components = 1 # fit if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'): random_state = np.random.RandomState(seed=12345) y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))]) y_ = y_.T else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit(X, y_).transform(X) pickled_transformer = pickle.dumps(transformer) unpickled_transformer = pickle.loads(pickled_transformer) pickled_X_pred = unpickled_transformer.transform(X) try: assert_array_almost_equal(pickled_X_pred, X_pred) except Exception as exc: succeeded = False print("Transformer %s doesn't predict the same value " "after pickling" % name) raise exc assert_true(succeeded)
def test_check_array(): # accept_sparse == None # raise error on sparse inputs X = [[1, 2], [3, 4]] X_csr = sp.csr_matrix(X) assert_raises(TypeError, check_array, X_csr) # ensure_2d=False X_array = check_array([0, 1, 2], ensure_2d=False) assert_equal(X_array.ndim, 1) # ensure_2d=True with 1d array assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead', check_array, [0, 1, 2], ensure_2d=True) # ensure_2d=True with scalar array assert_raise_message(ValueError, 'Expected 2D array, got scalar array instead', check_array, 10, ensure_2d=True) # don't allow ndim > 3 X_ndim = np.arange(8).reshape(2, 2, 2) assert_raises(ValueError, check_array, X_ndim) check_array(X_ndim, allow_nd=True) # doesn't raise # dtype and order enforcement. X_C = np.arange(4).reshape(2, 2).copy("C") X_F = X_C.copy("F") X_int = X_C.astype(np.int) X_float = X_C.astype(np.float) Xs = [X_C, X_F, X_int, X_float] dtypes = [np.int32, np.int, np.float, np.float32, None, np.bool, object] orders = ['C', 'F', None] copys = [True, False] for X, dtype, order, copy in product(Xs, dtypes, orders, copys): X_checked = check_array(X, dtype=dtype, order=order, copy=copy) if dtype is not None: assert_equal(X_checked.dtype, dtype) else: assert_equal(X_checked.dtype, X.dtype) if order == 'C': assert_true(X_checked.flags['C_CONTIGUOUS']) assert_false(X_checked.flags['F_CONTIGUOUS']) elif order == 'F': assert_true(X_checked.flags['F_CONTIGUOUS']) assert_false(X_checked.flags['C_CONTIGUOUS']) if copy: assert_false(X is X_checked) else: # doesn't copy if it was already good if (X.dtype == X_checked.dtype and X_checked.flags['C_CONTIGUOUS'] == X.flags['C_CONTIGUOUS'] and X_checked.flags['F_CONTIGUOUS'] == X.flags['F_CONTIGUOUS']): assert_true(X is X_checked) # allowed sparse != None X_csc = sp.csc_matrix(X_C) X_coo = X_csc.tocoo() X_dok = X_csc.todok() X_int = X_csc.astype(np.int) X_float = X_csc.astype(np.float) Xs = [X_csc, X_coo, X_dok, X_int, X_float] accept_sparses = [['csr', 'coo'], ['coo', 'dok']] for X, dtype, accept_sparse, copy in product(Xs, dtypes, accept_sparses, copys): with warnings.catch_warnings(record=True) as w: X_checked = check_array(X, dtype=dtype, accept_sparse=accept_sparse, copy=copy) if (dtype is object or sp.isspmatrix_dok(X)) and len(w): message = str(w[0].message) messages = [ "object dtype is not supported by sparse matrices", "Can't check dok sparse matrix for nan or inf." ] assert_true(message in messages) else: assert_equal(len(w), 0) if dtype is not None: assert_equal(X_checked.dtype, dtype) else: assert_equal(X_checked.dtype, X.dtype) if X.format in accept_sparse: # no change if allowed assert_equal(X.format, X_checked.format) else: # got converted assert_equal(X_checked.format, accept_sparse[0]) if copy: assert_false(X is X_checked) else: # doesn't copy if it was already good if (X.dtype == X_checked.dtype and X.format == X_checked.format): assert_true(X is X_checked) # other input formats # convert lists to arrays X_dense = check_array([[1, 2], [3, 4]]) assert_true(isinstance(X_dense, np.ndarray)) # raise on too deep lists assert_raises(ValueError, check_array, X_ndim.tolist()) check_array(X_ndim.tolist(), allow_nd=True) # doesn't raise # convert weird stuff to arrays X_no_array = NotAnArray(X_dense) result = check_array(X_no_array) assert_true(isinstance(result, np.ndarray)) # deprecation warning if string-like array with dtype="numeric" expected_warn_regex = r"converted to decimal numbers if dtype='numeric'" X_str = [['11', '12'], ['13', 'xx']] for X in [X_str, np.array(X_str, dtype='U'), np.array(X_str, dtype='S')]: with pytest.warns(FutureWarning, match=expected_warn_regex): check_array(X, dtype="numeric") # deprecation warning if byte-like array with dtype="numeric" X_bytes = [[b'a', b'b'], [b'c', b'd']] for X in [X_bytes, np.array(X_bytes, dtype='V1')]: with pytest.warns(FutureWarning, match=expected_warn_regex): check_array(X, dtype="numeric")
def test_base_estimator(): # Check base_estimator and its default values. rng = check_random_state(0) # Classification X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng) ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier)) ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier)) ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, Perceptron)) # Regression X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng) ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor)) ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor)) ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train) assert_true(isinstance(ensemble.base_estimator_, SVR))
def test_hashing_vectorizer(): v = HashingVectorizer() X = v.transform(ALL_FOOD_DOCS) token_nnz = X.nnz assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features)) assert_equal(X.dtype, v.dtype) # By default the hashed values receive a random sign and l2 normalization # makes the feature values bounded assert_true(np.min(X.data) > -1) assert_true(np.min(X.data) < 0) assert_true(np.max(X.data) > 0) assert_true(np.max(X.data) < 1) # Check that the rows are normalized for i in range(X.shape[0]): assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0) # Check vectorization with some non-default parameters v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1') X = v.transform(ALL_FOOD_DOCS) assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features)) assert_equal(X.dtype, v.dtype) # ngrams generate more non zeros ngrams_nnz = X.nnz assert_true(ngrams_nnz > token_nnz) assert_true(ngrams_nnz < 2 * token_nnz) # makes the feature values bounded assert_true(np.min(X.data) > 0) assert_true(np.max(X.data) < 1) # Check that the rows are normalized for i in range(X.shape[0]): assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X, Y) p = clf.predict_proba([3, 2]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([-1, -1]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([3, 2]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([-1, -1]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([-1, -1]) d = clf.decision_function([-1, -1]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([3, 2]) p = clf.predict_proba([3, 2]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([-1, -1]) p = clf.predict_proba([-1, -1]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X2, Y2) d = clf.decision_function([3, 2]) p = clf.predict_proba([3, 2]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function(x) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba(x) assert_array_almost_equal(p[0], [1 / 3.] * 3)
def test_parameters(self): assert_true( hasattr(self.clf, 'decision_scores_') and self.clf.decision_scores_ is not None) assert_true( hasattr(self.clf, 'labels_') and self.clf.labels_ is not None) assert_true( hasattr(self.clf, 'threshold_') and self.clf.threshold_ is not None) assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None) assert_true( hasattr(self.clf, '_sigma') and self.clf._sigma is not None) assert_true( hasattr(self.clf, 'support_') and self.clf.support_ is not None) assert_true( hasattr(self.clf, 'support_vectors_') and self.clf.support_vectors_ is not None) assert_true( hasattr(self.clf, 'dual_coef_') and self.clf.dual_coef_ is not None) assert_true( hasattr(self.clf, 'intercept_') and self.clf.intercept_ is not None)
def test_input_estimator_unchanged(): # Test that SelectFromModel fits on a clone of the estimator. est = RandomForestClassifier() transformer = SelectFromModel(estimator=est) transformer.fit(data, y) assert_true(transformer.estimator is est)
def test_make_imbalance_5(): X_, y_ = make_imbalance(X, Y, ratio=0.01, min_c_=0) counter = Counter(y_) assert_equal(counter[1], 500) assert_equal(counter[0], 5) assert_true(np.all([X_i in X for X_i in X_]))
def test_bagging_with_pipeline(): estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2) estimator.fit(iris.data, iris.target) assert_true(isinstance(estimator[0].steps[-1][1].random_state, int))
def test_check_cv_return_types(): X = np.ones((9, 2)) cv = cval._check_cv(3, X, classifier=False) assert_true(isinstance(cv, cval.KFold)) y_binary = np.array([0, 1, 0, 1, 0, 0, 1, 1, 1]) cv = cval._check_cv(3, X, y_binary, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) y_multiclass = np.array([0, 1, 0, 1, 2, 1, 2, 0, 2]) cv = cval._check_cv(3, X, y_multiclass, classifier=True) assert_true(isinstance(cv, cval.StratifiedKFold)) X = np.ones((5, 2)) y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]] with warnings.catch_warnings(record=True): # deprecated sequence of sequence format cv = cval._check_cv(3, X, y_seq_of_seqs, classifier=True) assert_true(isinstance(cv, cval.KFold)) y_indicator_matrix = LabelBinarizer().fit_transform(y_seq_of_seqs) cv = cval._check_cv(3, X, y_indicator_matrix, classifier=True) assert_true(isinstance(cv, cval.KFold)) y_multioutput = np.array([[1, 2], [0, 3], [0, 0], [3, 1], [2, 0]]) cv = cval._check_cv(3, X, y_multioutput, classifier=True) assert_true(isinstance(cv, cval.KFold))
def check_classifiers_train(name, Classifier): X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # catch deprecation warnings classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape with warnings.catch_warnings(record=True): classifier = Classifier() if name in ['BernoulliNB', 'MultinomialNB']: X -= X.min() set_fast_parameters(classifier) set_random_state(classifier) # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) # with lists classifier.fit(X.tolist(), y.tolist()) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples, )) # training set performance if name not in ['BernoulliNB', 'MultinomialNB']: assert_greater(accuracy_score(y, y_pred), 0.85) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.shape, (n_samples, )) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if (n_classes is 3 and not isinstance(classifier, BaseLibSVM)): # 1on1 of LibSVM works differently assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T)
def test_space_api(): space = Space([(0.0, 1.0), (-5, 5), ("a", "b", "c"), (1.0, 5.0, "log-uniform"), ("e", "f")]) cat_space = Space([(1, "r"), (1.0, "r")]) assert isinstance(cat_space.dimensions[0], Categorical) assert isinstance(cat_space.dimensions[1], Categorical) assert_equal(len(space.dimensions), 5) assert_true(isinstance(space.dimensions[0], Real)) assert_true(isinstance(space.dimensions[1], Integer)) assert_true(isinstance(space.dimensions[2], Categorical)) assert_true(isinstance(space.dimensions[3], Real)) assert_true(isinstance(space.dimensions[4], Categorical)) samples = space.rvs(n_samples=10, random_state=0) assert_equal(len(samples), 10) assert_equal(len(samples[0]), 5) assert_true(isinstance(samples, list)) for n in range(4): assert_true(isinstance(samples[n], list)) assert_true(isinstance(samples[0][0], numbers.Real)) assert_true(isinstance(samples[0][1], numbers.Integral)) assert_true(isinstance(samples[0][2], str)) assert_true(isinstance(samples[0][3], numbers.Real)) assert_true(isinstance(samples[0][4], str)) samples_transformed = space.transform(samples) assert_equal(samples_transformed.shape[0], len(samples)) assert_equal(samples_transformed.shape[1], 1 + 1 + 3 + 1 + 1) # our space contains mixed types, this means we can't use # `array_allclose` or similar to check points are close after a round-trip # of transformations for orig, round_trip in zip(samples, space.inverse_transform(samples_transformed)): assert space.distance(orig, round_trip) < 1.e-8 samples = space.inverse_transform(samples_transformed) assert_true(isinstance(samples[0][0], numbers.Real)) assert_true(isinstance(samples[0][1], numbers.Integral)) assert_true(isinstance(samples[0][2], str)) assert_true(isinstance(samples[0][3], numbers.Real)) assert_true(isinstance(samples[0][4], str)) for b1, b2 in zip(space.bounds, [(0.0, 1.0), (-5, 5), np.asarray(["a", "b", "c"]), (1.0, 5.0), np.asarray(["e", "f"])]): assert_array_equal(b1, b2) for b1, b2 in zip(space.transformed_bounds, [(0.0, 1.0), (-5, 5), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (np.log10(1.0), np.log10(5.0)), (0.0, 1.0)]): assert_array_equal(b1, b2)
def test_permutation_score(): iris = load_iris() X = iris.data X_sparse = coo_matrix(X) y = iris.target svm = SVC(kernel='linear') cv = cval.StratifiedKFold(y, 2) score, scores, pvalue = cval.permutation_test_score(svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") assert_greater(score, 0.9) assert_almost_equal(pvalue, 0.0, 1) score_label, _, pvalue_label = cval.permutation_test_score( svm, X, y, n_permutations=30, cv=cv, scoring="accuracy", labels=np.ones(y.size), random_state=0) assert_true(score_label == score) assert_true(pvalue_label == pvalue) # check that we obtain the same results with a sparse representation svm_sparse = SVC(kernel='linear') cv_sparse = cval.StratifiedKFold(y, 2) score_label, _, pvalue_label = cval.permutation_test_score( svm_sparse, X_sparse, y, n_permutations=30, cv=cv_sparse, scoring="accuracy", labels=np.ones(y.size), random_state=0) assert_true(score_label == score) assert_true(pvalue_label == pvalue) # test with custom scoring object def custom_score(y_true, y_pred): return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) / y_true.shape[0]) scorer = make_scorer(custom_score) score, _, pvalue = cval.permutation_test_score(svm, X, y, n_permutations=100, scoring=scorer, cv=cv, random_state=0) assert_almost_equal(score, .93, 2) assert_almost_equal(pvalue, 0.01, 3) # set random y y = np.mod(np.arange(len(y)), 3) score, scores, pvalue = cval.permutation_test_score(svm, X, y, n_permutations=30, cv=cv, scoring="accuracy") assert_less(score, 0.5) assert_greater(pvalue, 0.2)
def test_scaler_2d_arrays(): """Test scaling of 2d array along first axis""" rng = np.random.RandomState(0) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has been copied assert_true(X_scaled is not X) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_true(X_scaled_back is not X) assert_true(X_scaled_back is not X_scaled) assert_array_almost_equal(X_scaled_back, X) X_scaled = scale(X, axis=1, with_std=False) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) X_scaled = scale(X, axis=1, with_std=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0]) # Check that the data hasn't been modified assert_true(X_scaled is not X) X_scaled = scaler.fit(X).transform(X, copy=False) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is X) X = rng.randn(4, 5) X[:, 0] = 1.0 # first feature is a constant, non zero feature scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) # Check that X has not been copied assert_true(X_scaled is not X)
def test_parameters(self): assert_true( hasattr(self.estimator, 'base_estimators') and self.estimator.base_estimators is not None)
def test_add_dummy_feature_csr(): X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]]) X = add_dummy_feature(X) assert_true(sparse.isspmatrix_csr(X), X) assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_factor_analysis(): """Test FactorAnalysis ability to recover the data covariance structure """ rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise assert_raises(ValueError, FactorAnalysis, svd_method='foo') fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' assert_raises(ValueError, fa_fail.fit, X) fas = [] for method in ['randomized', 'lapack']: fa = FactorAnalysis(n_components=n_components, svd_method=method) fa.fit(X) fas.append(fa) X_t = fa.transform(X) assert_equal(X_t.shape, (n_samples, n_components)) assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum()) assert_almost_equal(fa.score_samples(X).mean(), fa.score(X)) diff = np.all(np.diff(fa.loglike_)) assert_greater(diff, 0., 'Log likelihood dif not increase') # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert_less(diff, 0.1, "Mean absolute difference is %f" % diff) fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) assert_raises(ValueError, fa.fit, X[:, :2]) f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always', ConvergenceWarning) fa1.max_iter = 1 fa1.verbose = True fa1.fit(X) assert_true(w[-1].category == ConvergenceWarning) warnings.simplefilter('always', DeprecationWarning) FactorAnalysis(verbose=1) assert_true(w[-1].category == DeprecationWarning) # Test get_covariance and get_precision with n_components == n_features # with n_components < n_features and with n_components == 0 for n_components in [0, 2, X.shape[1]]: fa.n_components = n_components fa.fit(X) cov = fa.get_covariance() precision = fa.get_precision() assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12)