def test_scaler():
    """Test scaling of a dataset along all axes."""
    # 1D input: mean/std reduce to scalars.
    data = np.random.randn(5)
    sc = Scaler()
    scaled = sc.fit(data).transform(data, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(scaled.std(axis=0), 1.0)

    # 2D input: each of the 5 features is centered and unit-scaled.
    data = np.random.randn(4, 5)
    sc = Scaler()
    scaled = sc.fit(data).transform(data, copy=False)
    assert_array_almost_equal(scaled.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(scaled.std(axis=0), [1.0] * 5)
    # copy=False must operate in place and return the same array object.
    assert scaled is data

    scaled = sc.fit(data).transform(data, copy=True)
    assert_array_almost_equal(scaled.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(scaled.std(axis=0), [1.0] * 5)
    # copy=True must hand back a new array.
    assert scaled is not data

    # Row-wise scaling via the functional API.
    scaled = scale(data, axis=1, with_std=False)
    assert_array_almost_equal(scaled.mean(axis=1), [0.0] * 4)
    scaled = scale(data, axis=1, with_std=True)
    assert_array_almost_equal(scaled.std(axis=1), [1.0] * 4)
def test_scaler():
    """Test scaling of a dataset along all axes."""
    # Start with 1D data: statistics reduce to scalars.
    samples = np.random.randn(5)
    sc = Scaler()
    out = sc.fit(samples).transform(samples, copy=False)
    assert_array_almost_equal(out.mean(axis=0), 0.0)
    assert_array_almost_equal(out.std(axis=0), 1.0)

    # Then 2D data: 5 features, each centered to zero mean / unit std.
    samples = np.random.randn(4, 5)
    sc = Scaler()
    out = sc.fit(samples).transform(samples, copy=False)
    assert_array_almost_equal(out.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(out.std(axis=0), [1.0] * 5)
    # In-place transform (copy=False) returns the input array itself.
    assert out is samples

    out = sc.fit(samples).transform(samples, copy=True)
    assert_array_almost_equal(out.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(out.std(axis=0), [1.0] * 5)
    # copy=True yields a distinct array.
    assert out is not samples

    # Functional API, scaling each row instead of each column.
    out = scale(samples, axis=1, with_std=False)
    assert_array_almost_equal(out.mean(axis=1), [0.0] * 4)
    out = scale(samples, axis=1, with_std=True)
    assert_array_almost_equal(out.std(axis=1), [1.0] * 4)
    # scale() defaults to copying, so the input is left untouched.
    assert out is not samples
def test_scaler():
    """Test scaling of a dataset along all axes."""
    # --- 1D ndarray input ---
    data = np.random.randn(5)
    sc = Scaler()
    result = sc.fit(data).transform(data, copy=False)
    assert_array_almost_equal(result.mean(axis=0), 0.0)
    assert_array_almost_equal(result.std(axis=0), 1.0)

    # --- 1D plain-list input ---
    data = [0., 1., 2, 0.4, 1.]
    sc = Scaler()
    result = sc.fit(data).transform(data, copy=False)
    assert_array_almost_equal(result.mean(axis=0), 0.0)
    assert_array_almost_equal(result.std(axis=0), 1.0)

    result = scale(data)
    assert_array_almost_equal(result.mean(axis=0), 0.0)
    assert_array_almost_equal(result.std(axis=0), 1.0)

    # --- 2D input with a degenerate (all-zero) first feature ---
    data = np.random.randn(4, 5)
    data[:, 0] = 0.0  # first feature is always of zero
    sc = Scaler()
    result = sc.fit(data).transform(data, copy=True)
    # A zero-variance feature must not produce NaNs.
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True yields a distinct array.
    assert result is not data

    result = scale(data, axis=1, with_std=False)
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=1), [0.0] * 4)
    result = scale(data, axis=1, with_std=True)
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=1), [0.0] * 4)
    assert_array_almost_equal(result.std(axis=1), [1.0] * 4)
    # scale() defaults to copying, so the input stays untouched.
    assert result is not data

    result = sc.fit(data).transform(data, copy=False)
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=False operates in place: same array object back.
    assert result is data

    # --- 2D input with a constant, non-zero first feature ---
    data = np.random.randn(4, 5)
    data[:, 0] = 1.0  # first feature is a constant, non zero feature
    sc = Scaler()
    result = sc.fit(data).transform(data, copy=True)
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=0), [0.0] * 5)
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True yields a distinct array.
    assert result is not data
def test_auto_weight(self):
    """Test class weights for imbalanced data"""
    # Reference metrics on the iris dataset, which is balanced by default.
    features, labels = iris.data, iris.target
    features = preprocessing.scale(features)
    order = np.arange(features.shape[0])
    np.random.seed(13)
    np.random.shuffle(order)
    features = features[order]
    labels = labels[order]

    model = self.factory(alpha=0.0001, n_iter=1000).fit(features, labels)
    assert_approx_equal(metrics.f1_score(labels, model.predict(features)), 0.96, 2)

    # Repeat the fit with automated class weighting.
    model_auto = self.factory(alpha=0.0001, n_iter=1000).fit(
        features, labels, class_weight="auto")
    assert_approx_equal(
        metrics.f1_score(labels, model_auto.predict(features)), 0.96, 2)

    # On balanced data, "auto" weighting should change nothing.
    assert_array_almost_equal(model.coef_, model_auto.coef_, 6)

    # Build a very imbalanced dataset by replicating class 0 ten times.
    class0_X = features[labels == 0, :]
    class0_y = labels[labels == 0]
    skewed_X = np.vstack([features] + [class0_X] * 10)
    skewed_y = np.concatenate([labels] + [class0_y] * 10)

    # Without class-weight information the imbalance degrades f1.
    model = self.factory(n_iter=1000)
    model.fit(skewed_X, skewed_y)
    predicted = model.predict(features)
    assert metrics.f1_score(labels, predicted) < 0.96

    # With class_weight="auto" the score recovers.
    model = self.factory(n_iter=1000)
    model.fit(skewed_X, skewed_y, class_weight="auto")
    predicted = model.predict(features)
    assert metrics.f1_score(labels, predicted) > 0.96
def test_scaler_without_centering():
    """Scaling with with_mean=False rescales variance without centering."""
    rng = np.random.RandomState(42)
    data = rng.randn(4, 5)
    data[:, 0] = 0.0  # first feature is always of zero

    sc = Scaler(with_mean=False)
    result = sc.fit(data).transform(data, copy=True)
    # Zero-variance first feature must not produce NaNs.
    assert not np.any(np.isnan(result))
    # Means are only rescaled, not removed (values fixed by the seed).
    assert_array_almost_equal(result.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True yields a distinct array.
    assert result is not data

    # Same behavior through the functional API.
    result = scale(data, with_mean=False)
    assert not np.any(np.isnan(result))
    assert_array_almost_equal(result.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(result.std(axis=0), [0., 1., 1., 1., 1.])
    assert result is not data
def test_scaler_without_centering():
    """with_mean=False: unit variance is enforced but means are untouched."""
    rng = np.random.RandomState(42)
    samples = rng.randn(4, 5)
    samples[:, 0] = 0.0  # first feature is always of zero

    sc = Scaler(with_mean=False)
    out = sc.fit(samples).transform(samples, copy=True)
    # The constant column must not yield NaNs when scaled.
    assert not np.any(np.isnan(out))
    # Expected means are fixed by the seeded RNG; only std is normalized.
    assert_array_almost_equal(
        out.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(out.std(axis=0), [0., 1., 1., 1., 1.])
    # copy=True returns a fresh array.
    assert out is not samples

    # The module-level scale() helper must agree with the Scaler object.
    out = scale(samples, with_mean=False)
    assert not np.any(np.isnan(out))
    assert_array_almost_equal(
        out.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(out.std(axis=0), [0., 1., 1., 1., 1.])
    assert out is not samples
the Pearson correlation as an additional measure of the clustering quality. """
# NOTE(review): this chunk begins mid module docstring (opening quotes are
# outside this view) and appears to continue past it; code left byte-identical.

print __doc__  # Python 2 print statement -- this script predates Python 3.

from time import time

import numpy as np

from scikits.learn.cluster import KMeans
from scikits.learn.datasets import load_digits
from scikits.learn.pca import PCA
from scikits.learn.preprocessing import scale

# Fixed seed so k-means initializations are reproducible across runs.
np.random.seed(42)

# Load the digits dataset and standardize each feature.
digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
# Number of clusters = number of distinct digit labels.
n_digits = len(np.unique(digits.target))

print "n_digits: %d" % n_digits
print "n_features: %d" % n_features
print "n_samples: %d" % n_samples
print

# Benchmark k-means with k-means++ seeding on the raw (scaled) data.
print "Raw k-means with k-means++ init..."
t0 = time()
km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data)
print "done in %0.3fs" % (time() - t0)
print "inertia: %f" % km.inertia_
print