# Imports needed to run this test standalone.
import dask.array as da
import xarray as xr
from numpy import ndarray, zeros
from numpy.random import RandomState
from numpy.testing import assert_, assert_allclose

from limix.qc import normalise_covariance


def test_qc_kinship_dataarray():
    random = RandomState(0)
    X = random.randn(3, 5)
    K = X.dot(X.T)
    K = da.from_array(K, chunks=2)
    K = xr.DataArray(K)

    K1 = zeros((3, 3))
    K0 = normalise_covariance(K)
    K2 = normalise_covariance(K, out=K1)

    # Expected normalised kinship for the fixed seed above.
    Kf = [
        [2.5990890007787586, -0.1951278087849671, 0.5472860002747189],
        [-0.1951278087849671, 0.4202620710126438, 0.2642930556468809],
        [0.5472860002747189, 0.2642930556468809, 0.5971001753452302],
    ]
    assert_allclose(K0, Kf)
    assert_(isinstance(K0, xr.DataArray))
    assert_allclose(K0, K1)
    assert_(isinstance(K1, ndarray))
    assert_(isinstance(K2, ndarray))
    assert_allclose(K0, K2)
    assert_(K2 is K1)
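
# Sanity sketch (not part of the test suite): the fixture Kf above is
# consistent with Gower rescaling, i.e. normalise_covariance appears to scale
# K so that trace(P K P) / (n - 1) == 1 for the centering matrix
# P = I - 11'/n. This is an assumption checked against the fixture values.
import numpy as np

Kf = np.array([
    [2.5990890007787586, -0.1951278087849671, 0.5472860002747189],
    [-0.1951278087849671, 0.4202620710126438, 0.2642930556468809],
    [0.5472860002747189, 0.2642930556468809, 0.5971001753452302],
])
n = Kf.shape[0]
P = np.eye(n) - np.ones((n, n)) / n  # centering projection
assert np.isclose(np.trace(P @ Kf @ P) / (n - 1), 1.0)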
def test_qtl_scan_three_hypotheses_mt():
    # Multi-trait scan exercising all three hypotheses; mvn, vec, unvec and
    # the imports are helpers defined elsewhere in the test module.
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T
    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)

    A0 = random.randn(ntraits, 1)
    A1 = random.randn(ntraits, 2)
    A01 = concatenate((A0, A1), axis=1)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A01.shape[1], G.shape[1]))

    m = kron(A, M) @ beta + kron(A01, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)
    str(r)
def test_glmm_composer():
    random = RandomState(0)
    nsamples = 50

    glmm = GLMMComposer(nsamples)

    # Fixed effects: an offset plus one covariate with effect size 1.
    glmm.fixed_effects.append_offset()
    X0 = random.randn(nsamples)
    glmm.fixed_effects.append(X0)
    glmm.fixed_effects[0].offset = 1
    glmm.fixed_effects[1].effsizes = [1]
    assert_allclose(glmm.fixed_effects.mean.value() - X0, ones(nsamples))

    X12 = random.randn(nsamples, 2)
    glmm.fixed_effects.append(X12)

    # Random effects: two normalised genetic covariances plus iid noise.
    G0 = random.randn(nsamples, 100)
    K0 = normalise_covariance(dot(G0, G0.T))
    glmm.covariance_matrices.append(K0)

    G1 = random.randn(nsamples, 100)
    K1 = normalise_covariance(dot(G1, G1.T))
    glmm.covariance_matrices.append(K1)

    glmm.covariance_matrices.append_iid_noise()
    glmm.covariance_matrices[0].scale = 1
    glmm.covariance_matrices[1].scale = 0
    glmm.covariance_matrices[2].scale = 1
    K = glmm.covariance_matrices.cov.value()
    assert_allclose(K, K0 + eye(nsamples))

    y = random.randn(nsamples)
    glmm.y = y
    glmm.fit(verbose=False)
    assert_allclose(glmm.covariance_matrices[0].scale, 0, atol=1e-6)
    assert_allclose(glmm.covariance_matrices[1].scale, 0, atol=1e-6)
    assert_allclose(glmm.covariance_matrices[2].scale, 1.099905167170892, atol=1e-6)
    assert_allclose(glmm.lml(), -73.32753446649403, atol=1e-6)
def _train_gblup(y, Z, X, include_ses=False, p_threshold=0.01):
    log = logging.getLogger(pyfocus.LOG)
    try:
        from limix.qc import normalise_covariance
    except ImportError:
        log.error("Training submodule requires limix>=2.0.0 and sklearn to be installed.")
        raise

    from numpy.linalg import multi_dot as mdot
    from scipy.linalg import pinvh

    log.debug("Initializing GBLUP model")

    attrs = dict()

    # estimate heritability using limix
    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)
    yresid = y - np.dot(X, fixed_betas)

    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(pval, p_threshold))
        return None

    # total variance = fe_var + s2u + s2e
    attrs["h2g"] = s2u / (fe_var + s2u + s2e)
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    n, p = Z.shape

    # ridge solution (i.e. rrBLUP); this will be slower than normal GBLUP
    # when p > n but is a little bit more flexible
    ZtZpDinv = pinvh(np.dot(Z.T, Z) + np.eye(p) * (s2e / s2u))
    betas = mdot([ZtZpDinv, Z.T, yresid])

    if include_ses:
        # TODO: come back to this with matrix operations rather than list comprehensions
        # jack-knife standard-errors over the fast leave-one-out estimates using rrBLUP
        """
        h = np.array([mdot([Z[i], ZtZpDinv, Z[i]]) for i in range(n)])
        e = yresid - np.dot(Z, betas)
        beta_jk = [betas - np.dot(ZtZpDinv, Z[i] * e[i]) / (1 - h[i]) for i in range(n)]
        ses = np.sqrt(np.mean(beta_jk, axis=0) * (n - 1))
        """
        ses = None
    else:
        ses = None

    return betas, ses, attrs
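
# Sanity sketch (not pyfocus code): the "slower than normal GBLUP when p > n"
# remark above rests on a standard identity -- the ridge (rrBLUP) solution
# (Z'Z + lam*I_p)^-1 Z'y equals the kernel (GBLUP) form Z'(ZZ' + lam*I_n)^-1 y,
# so the p x p solve can be swapped for an n x n solve when p > n.
import numpy as np
from scipy.linalg import pinvh

rng = np.random.RandomState(0)
n, p = 20, 50                 # the p > n regime
Z = rng.randn(n, p)
y = rng.randn(n)
lam = 0.5                     # plays the role of s2e / s2u

beta_ridge = pinvh(Z.T @ Z + lam * np.eye(p)) @ Z.T @ y   # p x p solve
beta_kernel = Z.T @ pinvh(Z @ Z.T + lam * np.eye(n)) @ y  # n x n solve
assert np.allclose(beta_ridge, beta_kernel)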
def _train_enet(y, Z, X, include_ses=False, p_threshold=0.01):
    log = logging.getLogger(pyfocus.LOG)
    try:
        from limix.qc import normalise_covariance
        from sklearn.linear_model import ElasticNetCV
    except ImportError:
        log.error("Training submodule requires limix>=2.0.0 and sklearn to be installed.")
        raise

    from scipy.linalg import lstsq

    log.debug("Initializing ElasticNet model")

    n = len(y)
    attrs = dict()

    # estimate heritability using limix
    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)

    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(pval, p_threshold))
        return None

    h2g = s2u / (s2u + s2e + fe_var)
    attrs["h2g"] = h2g
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    # we only want to penalize SNP effects and not covariate effects,
    # so regress the covariates out first
    fixed_betas, sum_resid, ranks, svals = lstsq(X, y)
    yresid = y - np.dot(X, fixed_betas)

    enet = ElasticNetCV(l1_ratio=0.5, fit_intercept=True, cv=5)
    enet.fit(Z, yresid)

    betas = enet.coef_
    attrs["r2"] = enet.score(Z, yresid)
    attrs["resid.var"] = sum((yresid - enet.predict(Z)) ** 2) / (n - 1)

    if include_ses:
        # TODO: bootstrap?
        ses = None
    else:
        ses = None

    return betas, ses, attrs
def _test_qtl_scan_st(lik):
    # Single-trait scan smoke test for a given likelihood; mvn and the
    # imports are helpers defined elsewhere in the test module.
    random = RandomState(0)
    n = 30
    ncovariates = 3

    M = random.randn(n, ncovariates)
    v0 = random.rand()
    v1 = random.rand()

    G = random.randn(n, 4)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = random.randn(ncovariates)
    alpha = random.randn(G.shape[1])
    m = M @ beta + G @ alpha
    y = mvn(random, m, v0 * K + v1 * eye(n))

    idx = [[0, 1], 2, [3]]

    if lik == "poisson":
        y = random.poisson(exp(y))
    elif lik == "bernoulli":
        y = random.binomial(1, 1 / (1 + exp(-y)))
    elif lik == "probit":
        y = random.binomial(1, st.norm.cdf(y))
    elif lik == "binomial":
        ntrials = random.randint(0, 30, len(y))
        y = random.binomial(ntrials, 1 / (1 + exp(-y)))
        lik = (lik, ntrials)

    r = scan(G, y, lik=lik, idx=idx, K=K, M=M, verbose=False)
    str(r)
    str(r.stats.head())
    str(r.effsizes["h2"].head())
    str(r.h0.trait)
    str(r.h0.likelihood)
    str(r.h0.lml)
    str(r.h0.effsizes)
    str(r.h0.variances)
def test_qtl_scan_two_hypotheses_mt_A0A1_none():
    random = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    A = random.randn(ntraits, ntraits)
    A = A @ A.T
    M = random.randn(n, ncovariates)

    C0 = random.randn(ntraits, ntraits)
    C0 = C0 @ C0.T
    C1 = random.randn(ntraits, ntraits)
    C1 = C1 @ C1.T

    G = random.randn(n, 4)
    A1 = eye(ntraits)

    K = random.randn(n, n + 1)
    K = normalise_covariance(K @ K.T)

    beta = vec(random.randn(ntraits, ncovariates))
    alpha = vec(random.randn(A1.shape[1], G.shape[1]))

    m = kron(A, M) @ beta + kron(A1, G) @ alpha
    Y = unvec(mvn(random, m, kron(C0, K) + kron(C1, eye(n))), (n, -1))
    Y = DataArray(Y, dims=["sample", "trait"], coords={"trait": ["WA", "Cx"]})

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, verbose=False)

    df = r.effsizes["h2"]
    df = df[df["test"] == 0]
    assert_array_equal(df["trait"], ["WA"] * 3 + ["Cx"] * 3 + [None] * 4)
    assert_array_equal(
        df["env"], [None] * 6 + ["env1_WA", "env1_WA", "env1_Cx", "env1_Cx"]
    )
    str(r)
# Read the kinship matrix computed in R via rpy2
with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_K = r('K_matrix')
K_data = np.array(pd_K)

# or make it using limix
from limix.stats import linear_kinship
K = linear_kinship(SNP_data, verbose=True)
K_data = K

# Another way to make the kinship matrix
from numpy import dot
from limix.qc import normalise_covariance
X = SNP_data
K = dot(X, X.T)
Kn = normalise_covariance(K)

# Missing Value Threshold
Miss_Tol = .3
# Minor Allele Frequency Threshold
MAF_Tol = .05

# estimating the allele frequencies in the data
SNPsum = np.nansum(SNP_data, axis=0)
nInd = np.sum(~np.isnan(SNP_data), axis=0)
freq_hat = np.array(SNPsum, dtype="float") / (2 * nInd)

mask = np.ndarray.flatten(
    np.array(np.all([freq_hat > MAF_Tol, freq_hat < (1 - MAF_Tol)], axis=0)).astype("bool"))
SNP_data = SNP_data[:, mask]
# SNP_names is one-dimensional, so indexing it with [mask, :] raises
# "too many indices for array"; index with the mask alone instead.
SNP_names = SNP_names[mask]
MAF = freq_hat[mask]
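
# Sketch (an assumption, not in the original script): Miss_Tol is defined
# above but never applied. A per-SNP missingness filter could be folded into
# the filtering step before subsetting, replacing the MAF-only `mask`:
miss_rate = 1.0 - nInd / float(SNP_data.shape[0])  # fraction of missing calls per SNP
keep = np.all(
    [freq_hat > MAF_Tol, freq_hat < (1 - MAF_Tol), miss_rate <= Miss_Tol],
    axis=0,
)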
def _train_lasso(y, Z, X, include_ses=False, p_threshold=0.01):
    log = logging.getLogger(pyfocus.LOG)
    try:
        from limix.qc import normalise_covariance
        from sklearn.linear_model import Lasso
    except ImportError:
        log.error("Training submodule requires limix>=2.0.0 and sklearn to be installed.")
        raise

    from scipy.linalg import lstsq

    log.debug("Initializing LASSO model")

    n = len(y)
    attrs = dict()

    # estimate heritability using limix
    K_cis = np.dot(Z, Z.T)
    K_cis = normalise_covariance(K_cis)
    fe_var, s2u, s2e, logl, fixed_betas, pval = _fit_cis_herit(y, K_cis, X)

    if pval > p_threshold:
        log.info("h2g pvalue {} greater than threshold {}. Skipping".format(pval, p_threshold))
        return None

    h2g = s2u / (s2u + s2e + fe_var)
    attrs["h2g"] = h2g
    attrs["h2g.logl"] = logl
    attrs["h2g.pvalue"] = pval

    # we only want to penalize SNP effects and not covariate effects,
    # so regress the covariates out first
    fixed_betas, sum_resid, ranks, svals = lstsq(X, y)
    yresid = y - np.dot(X, fixed_betas)

    # PLINK-style LASSO penalty grid
    lambda_max = np.linalg.norm(Z.T.dot(yresid), np.inf) / float(n)

    def _gen_e():
        e = np.random.normal(size=n)
        return np.linalg.norm(Z.T.dot(e), np.inf)

    min_tmp = np.median([_gen_e() for _ in range(1000)])
    sige = np.sqrt(1.0 - h2g + (1.0 / float(n)))
    lambda_min = (sige / n) * min_tmp

    # 100 values spaced logarithmically from lambda-min to lambda-max
    alphas = np.exp(np.linspace(np.log(lambda_min), np.log(lambda_max), 100))

    # fit LASSO solution using coordinate descent, updating with consecutively smaller penalties
    lasso = Lasso(fit_intercept=True, warm_start=True)
    for penalty in reversed(alphas):
        lasso.set_params(alpha=penalty)
        lasso.fit(Z, yresid)

    betas = lasso.coef_
    attrs["r2"] = lasso.score(Z, yresid)
    attrs["resid.var"] = sum((yresid - lasso.predict(Z)) ** 2) / (n - 1)

    if include_ses:
        # TODO: bootstrap?
        ses = None
    else:
        ses = None

    return betas, ses, attrs
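
# Sanity sketch (not pyfocus code): with sklearn's Lasso objective
# (1/(2n))*||y - Zb||^2 + alpha*||b||_1, all coefficients are exactly zero
# once alpha >= ||Z'(y - mean(y))||_inf / n, which is the lambda_max used
# above (yresid already has the covariates, including any intercept,
# regressed out). This is why the warm-start loop begins at lambda_max.
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
n, p = 100, 10
Z = rng.randn(n, p)
y = rng.randn(n)

lambda_max = np.linalg.norm(Z.T @ (y - y.mean()), np.inf) / n
lasso = Lasso(alpha=lambda_max, fit_intercept=True).fit(Z, y)
assert np.allclose(lasso.coef_, 0.0)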