def test_errors_alpha_out_of_bounds():
    """Fitting with ``alpha=10`` must raise a ``ValueError``."""
    features, target = datasets.make_cubic(random_state=123)
    model = SlicedInverseRegression(alpha=10)

    # Only the fit call is expected to raise, so it alone sits in the context.
    with pytest.raises(ValueError):
        model.fit(features, target)
def test_n_directions_auto_heuristic():
    """``n_directions='auto'`` should select two directions on this data."""
    features, target = datasets.make_exponential(random_state=123)

    fitted = SlicedInverseRegression(n_directions='auto').fit(
        features, target)
    assert fitted.n_directions_ == 2

    # The projection keeps all 500 rows and one column per direction.
    projected = fitted.transform(features)
    assert projected.shape == (500, 2)
def sir_pre(data_train_center, data_test_center, y_label):
    """Project train and test data onto directions fitted by SIR.

    Columns whose standard deviation over the training rows is zero are
    dropped from both inputs before fitting and projecting.

    Parameters
    ----------
    data_train_center : array
        Training data indexed as ``[:, columns]``; presumably already
        centered, as the name suggests -- TODO confirm with callers.
    data_test_center : array
        Test data sharing the column layout of the training data.
    y_label : array
        Targets handed to ``SlicedInverseRegression.fit``.

    Returns
    -------
    tuple
        ``(data_sir_fit, data_sir_test)``: the reduced train and test data,
        each multiplied by the transpose of the fitted ``directions_``.
    """
    # Identify the columns that actually vary in the training data.
    train_std = np.std(data_train_center, 0)
    nonzero_id = np.where(train_std != 0)[0]
    train_kept = data_train_center[:, nonzero_id]

    sir = SlicedInverseRegression(n_directions=9, n_slices=10)
    sir.fit(train_kept, y_label)

    # Same projection matrix for both splits.
    projection = np.transpose(sir.directions_)
    data_sir_fit = train_kept @ projection
    data_sir_test = data_test_center[:, nonzero_id] @ projection
    return data_sir_fit, data_sir_test
def test_all_zero_coefficients_warns_and_does_not_zero_out():
    """A t-test flagging every coefficient as zero must not wipe directions.

    To avoid errors, fitting is expected to emit a ``RuntimeWarning`` and
    leave at least one non-zero entry in ``directions_``.
    """
    features, target = load_breast_cancer(return_X_y=True)
    model = SlicedInverseRegression(n_directions=2, alpha=0.05)

    with pytest.warns(RuntimeWarning):
        model.fit(features, target)

    has_nonzero = np.any(model.directions_ != 0)
    assert has_nonzero
def test_classification():
    """SIR is LDA for classification, so check some predictions."""
    # Data is just 6 separable points in the plane.
    points = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    X = np.array(points, dtype=np.float64)
    y = np.array([1, 1, 1, 0, 0, 0])

    sir = SlicedInverseRegression(n_directions=1, n_slices=2).fit(X, y)
    lda = LinearDiscriminantAnalysis(solver='eigen').fit(X, y)

    # Threshold the single SIR component at zero to obtain class labels.
    sir_labels = (sir.transform(X) > 0).ravel()

    np.testing.assert_equal(y, sir_labels)
    np.testing.assert_equal(lda.predict(X), sir_labels)
def test_matches_athletes():
    """Check that the results match the R ``dr`` package on the ais dataset."""
    X, y = datasets.load_athletes()
    sir = SlicedInverseRegression(n_directions=4, n_slices=11).fit(X, y)

    # Reference eigenvalues, one per requested direction.
    expected_eigenvalues = np.array(
        [0.957661631, 0.245041613, 0.107075941, 0.090413047])
    np.testing.assert_allclose(sir.eigenvalues_, expected_eigenvalues)

    # Reference directions: one row per direction, one column per feature.
    expected_directions = np.array([
        [1.50963358e-01, -9.16480522e-01, -1.31538894e-01, -9.33588596e-02,
         4.46783829e-03, -1.88973540e-01, 2.74758965e-01, -5.63123794e-03],
        [-5.01785457e-02, -1.94229862e-01, 6.85475076e-01, -4.33408964e-02,
         1.83380846e-04, 3.47565293e-01, -6.05830142e-01, 1.30588502e-02],
        [1.08983356e-01, -2.01236965e-01, 7.19975455e-01, 4.64453982e-01,
         4.49759016e-02, 2.94969081e-01, -3.41966152e-01, -8.70270913e-02],
        [-2.21020634e-03, -8.97220257e-02, -6.63097774e-01, 2.90838658e-01,
         7.19045566e-02, 3.70563626e-02, 6.78877114e-01, 1.55472144e-02],
    ])
    np.testing.assert_allclose(sir.directions_, expected_directions)
def test_single_y_value():
    """A target holding a single distinct value must raise ``ValueError``."""
    n_samples, n_features = 100, 4
    random_state = np.random.RandomState(123)
    features = random_state.randn(n_samples, n_features)
    constant_target = np.ones(n_samples)

    with pytest.raises(ValueError):
        SlicedInverseRegression().fit(features, constant_target)
def test_n_slices_too_big():
    """Requesting 10 slices on a two-valued target ends with ``n_slices_ == 2``."""
    points = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    features = np.array(points, dtype=np.float64)
    labels = np.array([1, 1, 1, 0, 0, 0])

    fitted = SlicedInverseRegression(n_directions=1, n_slices=10).fit(
        features, labels)

    assert fitted.n_slices_ == 2
def test_sparse_coefficient():
    """Check the component-wise t-test on a simple synthetic dataset.

    The non-zero pattern of the fitted direction must equal that of the
    generating coefficient vector.
    """
    n_samples, n_features = 300, 6
    rng = np.random.RandomState(123)

    # Draw order matters for reproducibility: features first, then noise.
    X = rng.randn(n_samples, n_features)
    noise = rng.randn(n_samples).reshape(-1, 1)

    # Only the first and fourth features drive the response.
    beta = np.array([1, 0, 0, -1, 0, 0]).reshape(-1, 1)
    index = np.dot(X, beta)
    y = np.exp(-0.75 * index) + 0.5 * noise

    sir = SlicedInverseRegression(alpha=0.05)
    sir.fit(X, y.ravel())

    found_support = sir.directions_.ravel() != 0
    true_support = beta.ravel() != 0
    np.testing.assert_array_equal(found_support, true_support)
def test_regression():
    """Exercise fit, transform and fit_transform for each direction count.

    NOTE: subsequent calls may flip the direction of eigenvectors (multiply
    by -1), so we can only compare absolute values. This was not a problem
    for svds; investigate whether deterministic behavior can be restored.
    """
    X, y = datasets.make_cubic(random_state=123)

    # The one true direction the estimator should find.
    true_beta = (1 / np.sqrt(2)) * np.hstack((np.ones(2), np.zeros(8)))

    for n_dir in range(1, X.shape[1]):
        sir = SlicedInverseRegression(n_directions=n_dir)

        # The projection has one column per requested direction.
        first_pass = sir.fit(X, y).transform(X)
        np.testing.assert_equal(first_pass.shape[1], n_dir)

        # fit().transform() should match fit_transform() up to sign.
        second_pass = sir.fit_transform(X, y)
        np.testing.assert_allclose(np.abs(first_pass), np.abs(second_pass))

        # Call transform again and check that things are still consistent.
        first_pass = sir.transform(X)
        second_pass = sir.fit_transform(X, y)
        np.testing.assert_allclose(np.abs(first_pass), np.abs(second_pass))

        # The leading direction should align with the true one (up to sign).
        angle = np.dot(true_beta, sir.directions_[0, :])
        np.testing.assert_allclose(np.abs(angle), 1, rtol=1e-1)
def test_sparse_coefficient_multiple_dimensions():
    """Perform the t-test on a dataset with two directions."""
    n_samples, n_features = 300, 15
    rng = np.random.RandomState(123)

    # Two generating directions: ones on features 0-8 and on 9-14.
    beta = np.zeros((2, n_features))
    beta[0, :9] = 1
    beta[1, 9:] = 1

    X = rng.randn(n_samples, n_features)
    sign_part = np.sign(np.dot(X, beta[0, :]))
    log_part = np.log(np.abs(np.dot(X, beta[1, :]) + 5))
    y = sign_part * log_part

    sir = SlicedInverseRegression(n_directions=2, alpha=0.05)
    sir.fit(X, y.ravel())

    # The first dimension is found without error: row 1 of the fitted
    # directions has exactly the non-zero pattern of beta[0].
    recovered_support = sir.directions_[1, :].ravel() != 0
    expected_support = beta[0, :].ravel() != 0
    np.testing.assert_array_equal(recovered_support, expected_support)

    # The second dimension picks up a few spurious coefficients, so only
    # the entries from feature 9 onward are required to be non-zero.
    tail = sir.directions_[0, :].ravel()[9:]
    assert np.all(tail != 0)
def __init__(self):
    """Initialise the parent class, then attach a default SIR regressor."""
    super().__init__()

    # A SlicedInverseRegression built with its default arguments.
    self.regressor = SlicedInverseRegression()
============================================ A comparison of the subspace found by sliced inverse regression and principal component analysis on the australian athletes dataset. """ import matplotlib.pyplot as plt from sklearn.decomposition import PCA from sliced.datasets import load_athletes from sliced import SlicedInverseRegression X, y = load_athletes() # fit SIR model sir = SlicedInverseRegression(n_slices=11).fit(X, y) X_sir = sir.transform(X) # fit PCA pca = PCA(random_state=123).fit(X, y) X_pca = pca.transform(X) f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) ax1.scatter(X_sir[:, 0], y, c=y, cmap='viridis', linewidth=0.5, edgecolor='k') ax1.set_title('SIR Subspace') ax1.set_xlabel("$X\hat{\\beta}_{SIR}$") ax1.set_ylabel("Lean Body Mass (kg)") ax2.scatter(X_pca[:, 0], y, c=y, cmap='viridis', linewidth=0.5, edgecolor='k') ax2.set_title('PCA Subspace')
def test_sparse_not_supported():
    """Fitting on a CSR sparse matrix must raise ``TypeError``."""
    dense_features, target = datasets.make_cubic(random_state=123)
    sparse_features = sparse.csr_matrix(dense_features)

    with pytest.raises(TypeError):
        SlicedInverseRegression().fit(sparse_features, target)
""" ========================= Sliced Inverse Regression ========================= An example plot of :class:`sliced.sir.SlicedInverseRegression` """ import numpy as np import matplotlib.pyplot as plt from sliced import SlicedInverseRegression from sliced import datasets X, y = datasets.make_cubic(random_state=123) sir = SlicedInverseRegression() X_sir = sir.fit_transform(X, y) # estimate of the first dimension reducing directions beta1_hat = sir.directions_[0, :] # plot data projected onto the first direction plt.scatter(X_sir[:, 0], y, c=y, cmap='viridis', linewidth=0.5, edgecolor='k') plt.xlabel("$X\hat{\\beta_1}$") plt.ylabel("y") # annotation showing the direction found beta_text = "$\\beta_1$ = " + "{0}".format([0.707, 0.707]) plt.annotate(beta_text, xy=(-2, 6.5)) beta1_hat_text = "$\hat{\\beta_1}$ = " + "{0}".format( np.round(beta1_hat, 3).tolist()[:2])
def test_n_directions_none():
    """``n_directions=None`` yields one direction per input feature."""
    features, target = datasets.make_cubic(random_state=123)

    fitted = SlicedInverseRegression(n_directions=None).fit(features, target)

    n_features = features.shape[1]
    np.testing.assert_equal(fitted.n_directions_, n_features)
""" ======================= Binary Targets with SIR ======================= Sliced Inverse Regression is able to find a one-dimensional subspace that seperates cases in the famous breast cancer dataset. """ import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer from sliced import SlicedInverseRegression X, y = load_breast_cancer(return_X_y=True) sir = SlicedInverseRegression(n_directions=2).fit(X, y) X_sir = sir.transform(X) plt.scatter(X_sir[:, 0], X_sir[:, 1], c=y, alpha=0.8, edgecolor='k') plt.xlabel("$X\hat{\\beta}_{1}$") plt.ylabel("$X\hat{\\beta}_{2}$") plt.title("Breast Cancer Data") plt.show()
def test_zero_variance_features():
    """Fitting data with zero-variance features raises ``LinAlgError``.

    The digits dataset serves as the zero-variance example.
    """
    X, y = load_digits(return_X_y=True)

    with pytest.raises(linalg.LinAlgError):
        # The fitted estimator is never inspected, so no name is bound to it.
        SlicedInverseRegression(n_directions='auto').fit(X, y)