def test_PowerMethod_std_method():
    """Compare PowerMethod with a dense truncated SVD for each ``std_method``.

    For both 'norm' and 'binom', the array returned by ``make_snp_array`` is
    decomposed with ``PowerMethod``. The reference is built by hand: the raw
    array is centered, its columns are divided by the matching standard
    deviation, and the result goes through ``da.linalg.svd`` before being
    truncated to ``k`` components.
    """
    n_rows, n_cols = 1000, 100
    k = 10
    array = da.random.randint(0, 3, size=(n_rows, n_cols))

    for method in ['norm', 'binom']:
        new_array = make_snp_array(array, std_method=method)
        power_method = PowerMethod(k=k, scoring_method='q-vals', tol=1e-13, factor=None)
        U_PM, S_PM, V_PM = power_method.svd(array=new_array)

        # Rebuild the expected standardization from the raw array.
        mean = array.mean(axis=0)
        if method == 'norm':
            std = array.std(axis=0)
        else:
            # 'binom' branch: std = sqrt(2 * p * (1 - p)) with p = mean / 2.
            p = mean / 2
            std = da.sqrt(2 * p * (1 - p))

        inverse_std = np.diag(1 / std)
        x = (array - mean).dot(inverse_std)

        U, S, V = da.linalg.svd(x)
        U_k, S_k, V_k = svd_to_trunc_svd(U, S, V, k=k)

        np.testing.assert_almost_equal(subspace_dist(U_PM, U_k, S_k), 0, decimal=3)
        np.testing.assert_almost_equal(subspace_dist(V_PM, V_k, S_k), 0, decimal=3)
        np.testing.assert_array_almost_equal(S_k, S_PM, decimal=2)
def test_PowerMethod_case1():
    """Check PowerMethod against ``np.linalg.svd`` for k = 1..9.

    The ground truth is the thin SVD of the centered, column-scaled random
    matrix. For every ``k`` the PowerMethod factors must have the expected
    shapes, match the leading singular values, and have a subspace distance
    of (almost) zero to the reference factors.
    """
    n, p = 100, 80
    array = np.random.rand(n, p)

    column_means = array.mean(axis=0)
    inverse_std = np.diag(1 / array.std(axis=0))
    scaled_centered_array = (array - column_means).dot(inverse_std)

    # Ground truth
    U, S, V = np.linalg.svd(scaled_centered_array, full_matrices=False)

    array = make_snp_array(
        da.array(array),
        mean=True,
        std=True,
        std_method='norm',
        mask_nan=False,
        dtype='float64',
    )

    for k in range(1, 10):
        U_k = U[:, :k]
        S_k = S[:k]
        V_k = V[:k, :]

        PM = PowerMethod(
            k=k,
            tol=1e-9,
            scoring_method='rmse',
            max_iter=100,
            sub_svd_start=False,
            init_row_sampling_factor=1,
            factor=None,
            lmbd=0,
        )
        U_k_PM, S_k_PM, V_k_PM = PM.svd(array)

        np.testing.assert_array_almost_equal(S_k, S_k_PM)
        assert V_k.shape == V_k_PM.shape == (k, p)
        assert U_k.shape == U_k_PM.shape == (n, k)
        np.testing.assert_almost_equal(subspace_dist(V_k, V_k_PM, S_k_PM), 0)
        np.testing.assert_almost_equal(subspace_dist(U_k, U_k_PM, S_k_PM), 0)
def test_mask_snp_array_casse1():
    """Check ``utils.make_snp_array`` against manual centering and scaling.

    With ``mean=True``, ``std=True`` and ``std_method='norm'`` the result is
    expected to equal ``(array - column_mean) / column_std`` computed directly
    with NumPy.
    """
    raw = np.random.rand(100, 80)

    column_means = raw.mean(axis=0)
    inverse_std = np.diag(1 / raw.std(axis=0))
    scaled_centered_array = (raw - column_means).dot(inverse_std)

    snp_array = utils.make_snp_array(
        da.array(raw),
        mean=True,
        std=True,
        std_method='norm',
        mask_nan=False,
        dtype='float64',
    )

    np.testing.assert_array_almost_equal(scaled_centered_array, snp_array)
def test_PowerMethod_nan_arrays():
    """Check that NaN input makes ``PowerMethod.svd`` raise until it is masked.

    A NaN is written into a random matrix. For both ``sub_svd_start`` settings
    the raw array must raise ``np.linalg.LinAlgError``. The same array passed
    through ``make_snp_array(..., mask_nan=True)`` must decompose into three
    factors without raising.
    """
    array = np.random.randn(100, 100)

    for bad_type in [float('nan')]:
        array[0, 0] = bad_type

        for start in [True, False]:
            PM = PowerMethod(sub_svd_start=start, max_iter=2)

            # The unmasked array still contains the bad value.
            with pytest.raises(np.linalg.LinAlgError):
                _, _, _ = PM.svd(da.array(array))

            # After masking, the same solver must run to completion.
            clean_array = make_snp_array(
                da.array(array),
                mask_nan=True,
                std_method='norm',
                dtype='float64',
            )
            _, _, _ = PM.svd(clean_array)
def test_make_snp_array_case_binom(shape, threshold):
    """Check that 'binom' standardization yields (almost) zero column means.

    Entries of a random array above ``threshold`` are replaced by NaN before
    the array goes to ``utils.make_snp_array``.

    NOTE(review): ``shape`` and ``threshold`` are presumably supplied by a
    hypothesis ``@given`` decorator that is not visible here -- confirm.
    """
    n_rows, n_cols = shape[0], shape[1]
    # Skip degenerate 2-D arrays.
    assume(n_rows > 1 and n_cols > 1)

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    # Skip examples unless every column keeps at least one non-NaN value.
    nan_fraction_per_column = da.mean(da.isnan(arr), axis=0)
    assume(da.mean(nan_fraction_per_column < 1) == 1)

    snp_array = utils.make_snp_array(arr, mean=True, std=True, std_method='binom', dtype='float')
    mean = snp_array.mean(axis=0)

    # Comparing 1 + mean with ones is the same as checking that mean is ~0.
    np.testing.assert_array_almost_equal(1 + mean, np.ones(shape[1]))
def test_make_snp_array_case_normal(shape, threshold):
    """Check that 'norm' standardization yields (almost) zero column means.

    Entries of a random float array above ``threshold`` are replaced by NaN
    before the array goes to ``utils.make_snp_array``.

    NOTE(review): a later function in this file reuses this exact name, so
    this definition appears to be shadowed at import time -- confirm and
    rename one of them if both are meant to run.
    NOTE(review): ``shape`` and ``threshold`` are presumably supplied by a
    hypothesis ``@given`` decorator that is not visible here -- confirm.
    """
    n_rows, n_cols = shape[0], shape[1]
    # Skip degenerate 2-D arrays.
    assume(n_rows > 1 and n_cols > 1)

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    # Skip examples unless every column has a non-zero NaN-aware std.
    column_std = da.nanstd(arr, axis=0)
    assume(da.mean(column_std > 0) == 1)

    snp_array = utils.make_snp_array(arr, mean=True, std=True, std_method='norm', dtype='float')

    # Comparing 1 + mean with ones is the same as checking that mean is ~0.
    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0), np.ones(shape[1]))
def test_make_snp_array_case_normal(shape, max_value, mask_nans):
    """Check 'norm' standardization of integer input, optionally with NaNs.

    A random integer array drawn from ``[0, max_value)`` is used. When
    ``mask_nans`` is true, every entry equal to ``max_value - 1`` is replaced
    by NaN and ``mask_nan=True`` is forwarded to ``utils.make_snp_array``. The
    resulting column means must be (almost) zero.

    NOTE(review): an earlier function in this file has this exact name, so
    this definition appears to replace it at import time -- confirm and
    rename one of them if both are meant to run.
    NOTE(review): the parameters are presumably supplied by a hypothesis
    ``@given`` decorator that is not visible here -- confirm.
    """
    n_rows, n_cols = shape[0], shape[1]
    # Skip degenerate 2-D arrays.
    assume(n_rows > 1 and n_cols > 1)

    arr = da.random.randint(0, max_value, size=shape)
    if mask_nans:
        # The largest drawable value is used as the missing-data marker.
        arr[arr == max_value - 1] = float('nan')

    # Skip examples unless every column has a non-zero NaN-aware std.
    column_std = da.nanstd(arr, axis=0)
    assume(da.mean(column_std > 0) == 1)

    snp_array = utils.make_snp_array(
        arr,
        mean=True,
        std=True,
        std_method='norm',
        mask_nan=mask_nans,
        dtype='int8',
    )

    # Comparing 1 + mean with ones is the same as checking that mean is ~0.
    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0), np.ones(shape[1]))
def test_PowerMethod_case2():
    """Check that PowerMethod errors never grow as ``tol`` is tightened.

    The ground truth comes from eigen-style SVDs of ``X X^T`` (for U and the
    squared singular values) and ``X^T X`` (for V), where ``X`` is the
    centered, column-scaled random matrix. Tolerances run from 1 down to
    1e-12; at each step the U, V and S errors must not exceed those of the
    previous step. The last run must meet tight absolute bounds.
    """
    raw = np.random.rand(100, 100)
    column_means = raw.mean(axis=0)
    inverse_std = np.diag(1 / raw.std(axis=0))
    scaled_centered_array = (raw - column_means).dot(inverse_std)

    array = make_snp_array(
        da.array(raw),
        mean=True,
        std=True,
        std_method='norm',
        mask_nan=False,
        dtype='float64',
    )

    # Ground truth
    gram_rows = scaled_centered_array.dot(scaled_centered_array.T)
    gram_cols = scaled_centered_array.T.dot(scaled_centered_array)
    U, S, V = np.linalg.svd(gram_rows, full_matrices=False)
    _, _, V = np.linalg.svd(gram_cols, full_matrices=False)
    # Singular values of the Gram matrix are squares of those of X.
    S = np.sqrt(S)

    k = 10
    U_k, S_k, V_k = U[:, :k], S[:k], V[:k, :]

    previous_S_error = previous_U_error = previous_V_error = float('inf')

    for t in np.logspace(0, -12, 20):
        PM = PowerMethod(k=k, tol=t, scoring_method='q-vals', max_iter=100, factor=None, lmbd=0)
        U_k_PM, S_k_PM, V_k_PM = PM.svd(array)

        U_error = subspace_dist(U_k, U_k_PM, S_k)
        assert U_error <= previous_U_error
        V_error = subspace_dist(V_k, V_k_PM, S_k)
        assert V_error <= previous_V_error
        S_error = np.linalg.norm(S_k - S_k_PM)
        assert S_error <= previous_S_error

        previous_S_error = S_error
        previous_U_error = U_error
        previous_V_error = V_error

    # The tightest tolerance (last iteration) must reach near machine accuracy.
    assert subspace_dist(U_k, U_k_PM, S_k) <= 1e-9
    assert subspace_dist(V_k, V_k_PM, S_k) <= 1e-9
    assert np.linalg.norm(S_k - S_k_PM) <= 1e-12
def test_PowerMethod_scale_center():
    """Check PowerMethod for every combination of centering and scaling.

    For each (scale, center) pair the expected matrix is built with NumPy,
    centering first and scaling second. ``make_snp_array`` must reproduce that
    matrix, and the PowerMethod factors must match the reference obtained
    from SVDs of the two Gram matrices.
    """
    array = np.random.rand(100, 70)
    column_means = array.mean(axis=0)
    inverse_std = np.diag(1 / array.std(axis=0))
    k = 10

    for scale in [True, False]:
        for center in [True, False]:
            expected = array
            if center:
                expected = expected - column_means
            if scale:
                expected = expected.dot(inverse_std)

            # Ground truth: U and S**2 from X X^T, V from X^T X.
            U, S, _ = np.linalg.svd(expected.dot(expected.T), full_matrices=False)
            _, _, V = np.linalg.svd(expected.T.dot(expected), full_matrices=False)
            S = np.sqrt(S)
            U_k, S_k, V_k = U[:, :k], S[:k], V[:k, :]

            snp_array = make_snp_array(
                da.array(array),
                std=scale,
                mean=center,
                std_method='norm',
                dtype='float64',
            )
            np.testing.assert_array_almost_equal(expected, snp_array)

            PM = PowerMethod(k=k, tol=1e-12, scoring_method='q-vals', max_iter=100, factor=None, lmbd=0)
            U_q, S_q, V_q = PM.svd(snp_array)

            assert subspace_dist(U_k, U_q, S_k) <= 1e-8
            assert subspace_dist(V_k, V_q, S_k) <= 1e-8
            assert np.linalg.norm(S_k - S_q) <= 1e-9
def test_PowerMethod_factor():
    """Check the ``factor`` option of PowerMethod ('n', 'p' and ``None``).

    For each option the reference singular values are the square roots of the
    singular values of ``A A^T / factor``, where ``factor`` is the number of
    rows, the number of columns, or 1 respectively. PowerMethod run with the
    same option on the unscaled, uncentered array must reproduce the leading
    ``k`` singular values and span the same left subspace.
    """
    n, p = 100, 80
    k = 10
    array = np.random.rand(n, p)
    sym_array = array.dot(array.T)

    # Built once under its own name: the input does not depend on the loop
    # variable, and rebinding ``array`` inside the loop would feed the
    # previous iteration's output back into make_snp_array.
    snp_array = make_snp_array(
        da.array(array),
        mean=False,
        std=False,
        std_method='norm',
        mask_nan=False,
        dtype='float64',
    )

    # Maps each ``factor`` option to the divisor used for the reference.
    divisors = {'n': n, 'p': p, None: 1}

    for f, factor in divisors.items():
        U, S, _ = np.linalg.svd(sym_array / factor, full_matrices=False)
        # Singular values of A A^T / factor are squares of the expected ones.
        S = np.sqrt(S)
        U_k, S_k = U[:, :k], S[:k]

        PM = PowerMethod(k=k, tol=1e-9, scoring_method='q-vals', max_iter=100, factor=f, lmbd=0)
        U_k_PM, S_k_PM, _ = PM.svd(snp_array)

        np.testing.assert_array_almost_equal(S_k, S_k_PM)
        assert U_k.shape == U_k_PM.shape == (n, k)
        np.testing.assert_almost_equal(subspace_dist(U_k, U_k_PM, S_k_PM), 0)