def test_qtl_scan_lmm_repeat_samples_by_index():
    """Scan with an LMM when the trait index lists every sample twice.

    Two traits are simulated and stacked under the same sample labels, so
    ``y`` has ``2 * nsamples`` rows while the genotype, kinship and covariate
    frames have one row per sample.  The best candidate of the first scan is
    then appended to the covariates and the scan is repeated.
    """
    rng = RandomState(0)
    nsamples = 30
    nvariants = 100
    labels = ["sample{}".format(k) for k in range(nsamples)]

    genotype = DataFrame(data=rng.randn(nsamples, nvariants), index=labels)
    kinship = DataFrame(
        data=linear_kinship(genotype.values[:, 0:80], verbose=False),
        index=labels,
        columns=labels,
    )

    def simulate_trait():
        # Draw order matters for reproducibility: effects first, then noise.
        effects = rng.randn(nvariants)
        noise = rng.randn(nsamples)
        return dot(genotype, effects) / sqrt(nvariants) + 0.2 * noise

    first_trait = simulate_trait()
    second_trait = simulate_trait()
    y = DataFrame(
        data=concatenate((first_trait, second_trait)), index=labels + labels
    )

    covariates = DataFrame(data=genotype.values[:, :5], index=labels)
    candidates = DataFrame(data=genotype.values[:, 68:70], index=labels)

    first_scan = scan(candidates, y, "normal", kinship, M=covariates, verbose=False)
    pvalues = first_scan.stats["pv20"]
    assert_allclose(pvalues.values[0], 0.9920306566395604, rtol=1e-6)

    # Condition on the most significant candidate: its p-value must become 1.
    ix_best_snp = argmin(array(first_scan.stats["pv20"]))
    covariates = DataFrame(
        data=concatenate((covariates, candidates.loc[:, [ix_best_snp]]), axis=1),
        index=labels,
    )

    second_scan = scan(candidates, y, "normal", kinship, M=covariates, verbose=False)
    pvalues = second_scan.stats["pv20"]
    assert_allclose(pvalues[ix_best_snp], 1.0, rtol=1e-6)
    assert_allclose(pvalues.values[0], 0.6684700834450028, rtol=1e-6)
def test_qtl_scan_lmm_different_samples_order():
    """Scan twice, the second time with the candidate rows in reverse order.

    Both scans are asserted against the same expected p-value for the
    second candidate.
    """
    rng = RandomState(0)
    nsamples = 50
    nvariants = 100
    labels = ["sample{}".format(k) for k in range(nsamples)]

    genotype = DataFrame(data=rng.randn(nsamples, nvariants), index=labels)
    kinship = DataFrame(
        data=linear_kinship(genotype.values[:, 0:80], verbose=False),
        index=labels,
        columns=labels,
    )

    # Draw order matters for reproducibility: effects first, then noise.
    effects = rng.randn(nvariants)
    noise = rng.randn(nsamples)
    y = DataFrame(
        data=dot(genotype, effects) / sqrt(nvariants) + 0.2 * noise, index=labels
    )

    covariates = DataFrame(data=genotype.values[:, :5], index=labels)
    candidates = DataFrame(data=genotype.values[:, 68:70], index=labels)
    expected = 0.10807353644788478

    result = scan(candidates, y, "normal", kinship, M=covariates, verbose=False)
    assert_allclose(result.stats["pv20"].values[1], expected, rtol=1e-6)

    # Reverse the row order of the candidates and rebuild the frame from the
    # sorted values and labels.
    reordered = candidates.sort_index(ascending=False)
    candidates = DataFrame(reordered.values, index=reordered.index.values)

    result = scan(candidates, y, "normal", kinship, M=covariates, verbose=False)
    assert_allclose(result.stats["pv20"].values[1], expected, rtol=1e-6)
def test_qtl_scan_three_hypotheses_mt():
    """Smoke-test a multi-trait scan given ``A``, ``A0`` and ``A1``.

    Only checks that the scan runs and that its result can be rendered with
    ``str``; no numerical values are asserted.
    """
    rng = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    def random_gram(size):
        # B @ B.T of a random square matrix.
        base = rng.randn(size, size)
        return base @ base.T

    # The order of the draws below is part of the fixture; do not reorder.
    A = random_gram(ntraits)
    M = rng.randn(n, ncovariates)
    C0 = random_gram(ntraits)
    C1 = random_gram(ntraits)
    G = rng.randn(n, 4)
    A0 = rng.randn(ntraits, 1)
    A1 = rng.randn(ntraits, 2)
    A01 = concatenate((A0, A1), axis=1)

    kinship_base = rng.randn(n, n + 1)
    K = normalise_covariance(kinship_base @ kinship_base.T)

    beta = vec(rng.randn(ntraits, ncovariates))
    alpha = vec(rng.randn(A01.shape[1], G.shape[1]))

    mean = kron(A, M) @ beta + kron(A01, G) @ alpha
    covariance = kron(C0, K) + kron(C1, eye(n))
    Y = unvec(mvn(rng, mean, covariance), (n, -1))

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)
    str(r)
def test_qtl_scan_glmm_wrong_dimensions():
    """Run a binomial scan whose covariates have more rows than samples.

    ``M`` has 49 rows whereas every other input has ``nsamples`` (25) rows.
    The test only requires that the call completes.
    """
    rng = RandomState(0)
    nsamples = 25

    X = rng.randn(nsamples, 2)
    G = rng.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = rng.randint(1, 100, nsamples)
    z = dot(G, rng.randn(100)) / sqrt(100)

    # One noisy threshold draw per trial; draws are consumed sample by sample.
    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        hits = 0
        for _ in range(trials):
            hits += int(z[row] + 0.5 * rng.randn() > 0)
        successes[row] += hits

    M = rng.randn(49, 2)
    scan(X, successes, ("binomial", ntrials), K, M=M, verbose=False)
def test_qtl_scan_lmm():
    """Conditioning on the best candidate must drive its p-value to one."""
    rng = RandomState(0)
    nsamples = 50
    nvariants = 100

    G = rng.randn(nsamples, nvariants)
    K = linear_kinship(G[:, 0:80], verbose=False)

    # Draw order matters for reproducibility: effects first, then noise.
    effects = rng.randn(nvariants)
    noise = rng.randn(nsamples)
    y = dot(G, effects) / sqrt(nvariants) + 0.2 * noise

    covariates = G[:, :5]
    candidates = G[:, 68:70]

    first = scan(candidates, y, lik="normal", K=K, M=covariates, verbose=False)
    ix_best_snp = argmin(array(first.stats["pv20"]))

    # Add the top candidate as a covariate and scan again.
    covariates = concatenate((covariates, candidates[:, [ix_best_snp]]), axis=1)
    second = scan(candidates, y, "normal", K, M=covariates, verbose=False)

    pvalues = second.stats["pv20"]
    assert_allclose(pvalues[ix_best_snp], 1.0, atol=1e-6)
def test_qtl_finite():
    """``scan`` must raise ``ValueError`` when an input contains NaN.

    A NaN is injected, in turn, into the outcome, the kinship matrix and the
    candidate matrix; each entry is restored to ``1.0`` before the next case.
    """
    rng = RandomState(0)
    nsamples = 20

    # NOTE(review): X, G and K have 50 rows while ntrials/successes have
    # ``nsamples`` (20) entries -- confirm this mismatch is intentional.
    X = rng.randn(50, 2)
    G = rng.randn(50, 100)
    K = dot(G, G.T)
    ntrials = rng.randint(1, 100, nsamples)
    z = dot(G, rng.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        for _ in range(trials):
            successes[row] += int(z[row] + 0.5 * rng.randn() > 0)

    # Float arrays are required to be able to store NaN.
    successes = successes.astype(float)
    ntrials = ntrials.astype(float)

    poison_targets = ((successes, 0), (K, (0, 0)), (X, (0, 0)))
    for target, position in poison_targets:
        target[position] = nan
        with pytest.raises(ValueError):
            scan(X, successes, ("binomial", ntrials), K, verbose=False)
        target[position] = 1.0
def test_qtl_scan_lm():
    """Scan without a kinship matrix and pin the first two p-values."""
    rng = RandomState(0)
    nsamples = 25
    nvariants = 100

    G = rng.randn(nsamples, nvariants)

    # Draw order matters for reproducibility: effects first, then noise.
    effects = rng.randn(nvariants)
    noise = rng.randn(nsamples)
    y = dot(G, effects) / sqrt(nvariants) + 0.2 * noise

    # First five columns act as covariates, the rest as candidates.
    covariates, candidates = G[:, :5], G[:, 5:]

    result = scan(candidates, y, "normal", M=covariates, verbose=False)
    expected = [0.02625506841465465, 0.9162689001409643]
    assert_allclose(result.stats["pv20"][:2], expected, rtol=1e-5)
def do_gwas(self, geno_mat=None):
    """Run a GWAS with ``limix.qtl.scan`` and tabulate per-SNP results.

    Parameters
    ----------
    geno_mat : optional
        Genotype matrix handed to ``scan``.  When omitted,
        ``self.geno_matrix.T`` is used.

    Returns
    -------
    The ``stats`` table of the scan result, re-indexed by
    ``self.SNPinfo.rsid`` and extended with ``rsid``, ``chrom``,
    ``position``, ``beta``, ``se`` and ``z_score`` columns.  The same table
    is stored on ``self.res_p``.
    """
    from limix.qtl import scan

    print("Start to perform GWAS......")
    if geno_mat is None:
        geno_mat = self.geno_matrix.T

    phenotype = self.pheno_list.values
    if self.kinship is not None:
        res = scan(geno_mat, phenotype, "normal", K=self.kinship.values, verbose=False)
    else:
        # This branch used to scan ``self.geno_matrix.T.values`` regardless of
        # the ``geno_mat`` argument.  Honour the argument, and keep passing a
        # plain array (``.values``) whenever the object exposes one so the
        # default call behaves exactly as before.
        genotypes = getattr(geno_mat, "values", geno_mat)
        res = scan(genotypes, phenotype, "normal", K=None, verbose=False)

    # ``res.stats`` is annotated in place; ``res_p`` is the same object.
    res_p = res.stats
    res_p.index = self.SNPinfo.rsid
    # NOTE(review): these assignments hand pandas Series to ``.loc``; confirm
    # that ``self.SNPinfo`` is indexed compatibly with the new rsid index.
    res_p.loc[:, 'rsid'] = self.SNPinfo.rsid
    res_p.loc[:, 'chrom'] = self.SNPinfo.chrom
    res_p.loc[:, 'position'] = self.SNPinfo.position

    # Keep only the rows flagged as candidate effects.
    h2_effects = res.effsizes['h2']
    is_candidate = h2_effects.effect_type == 'candidate'
    betas = np.array(h2_effects.effsize[is_candidate])
    se = np.array(h2_effects.effsize_se[is_candidate])

    res_p.loc[:, 'beta'] = betas
    res_p.loc[:, 'se'] = se
    res_p.loc[:, 'z_score'] = betas / se

    self.res_p = res_p
    return res_p
def test_qtl_scan_lmm_nokinship():
    """Pin the first two p-values of a normal-likelihood scan.

    NOTE(review): despite the test name, a kinship matrix ``K`` is passed to
    ``scan`` here -- confirm whether that is intended.
    """
    rng = RandomState(0)
    nsamples = 50
    nvariants = 100

    G = rng.randn(nsamples, nvariants)
    K = linear_kinship(G[:, 0:80], verbose=False)

    # Draw order matters for reproducibility: effects first, then noise.
    effects = rng.randn(nvariants)
    noise = rng.randn(nsamples)
    y = dot(G, effects) / sqrt(nvariants) + 0.2 * noise

    covariates, candidates = G[:, :5], G[:, 68:70]

    result = scan(candidates, y, "normal", K, M=covariates, verbose=False)
    pvalues = result.stats["pv20"].values
    expected = [8.159539103135342e-05, 0.10807353641893498]
    assert_allclose(pvalues[:2], expected, atol=1e-5)
def get_qtl_maps(self, covs=None):
    """Scan ``self.genos`` against the phenotype(s) in ``self.pheno``.

    Accessions are restricted to the positions returned by
    ``self.get_filter_accs_nans()`` and, when ``covs`` is given, further to
    the positions where ``covs`` is finite.

    Parameters
    ----------
    covs : numpy.ndarray, optional
        Covariates, indexed by accession position.  Defaults to a vector of
        ones with one entry per row of ``self.genos``.

    Returns
    -------
    For a ``pandas.Series`` phenotype: the scan result, or ``None`` when the
    first entry of ``getPv()`` holds no finite value.  Otherwise: a list with
    one scan result per key/column obtained by iterating ``self.pheno``.
    """
    filter_nanaccs_ix = self.get_filter_accs_nans()
    if covs is None:
        covs = np.ones((self.genos.shape[0]))
    else:
        # Also drop the accessions for which covs is not finite.
        assert isinstance(covs, np.ndarray)
        filter_nanaccs_ix = np.intersect1d(
            filter_nanaccs_ix, np.where(np.isfinite(np.array(covs)))[0]
        )

    def scan_trait(values):
        # Genotypes and covariates are re-sliced per call, as before.
        return scan(
            self.genos[filter_nanaccs_ix, :],
            np.array(values),
            covs=covs[filter_nanaccs_ix],
        )

    if isinstance(self.pheno, pd.Series):
        lm = scan_trait(self.pheno.iloc[filter_nanaccs_ix])
        if not np.isfinite(lm.getPv()[0]).any():
            return None
        return lm

    results = []
    for cl in self.pheno:
        trait = self.pheno[cl]
        # ``filter_nanaccs_ix`` holds positions (it also indexes the rows of
        # ``self.genos``), so Series must be indexed with ``.iloc`` exactly
        # like the single-trait branch; plain ``[]`` on a Series is
        # label-based.  Non-Series containers keep plain indexing.
        if isinstance(trait, pd.Series):
            selected = trait.iloc[filter_nanaccs_ix]
        else:
            selected = trait[filter_nanaccs_ix]
        results.append(scan_trait(selected))
    return results  # a list, one entry per phenotype
def test_qtl_scan_glmm_bernoulli_nokinship():
    """Pin the p-values of a Bernoulli scan run without a kinship matrix."""
    rng = RandomState(0)
    nsamples = 25

    X = rng.randn(nsamples, 2)
    G = rng.randn(nsamples, 100)

    # randint(1, 2, ...) can only return 1, i.e. a single trial per sample.
    ntrials = rng.randint(1, 2, nsamples)
    z = dot(G, rng.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        hits = 0
        for _ in range(trials):
            hits += int(z[row] + 0.5 * rng.randn() > 0)
        successes[row] += hits

    result = scan(X, successes, "bernoulli", verbose=False)
    expected = [0.3399067917883736, 0.8269568797830423]
    assert_allclose(result.stats["pv20"], expected, rtol=1e-5)
def test_qtl_scan_glmm_binomial():
    """Pin the p-values of a binomial scan that uses a kinship matrix."""
    rng = RandomState(0)
    nsamples = 25

    X = rng.randn(nsamples, 2)
    G = rng.randn(nsamples, 100)
    K = dot(G, G.T)
    ntrials = rng.randint(1, 100, nsamples)
    z = dot(G, rng.randn(100)) / sqrt(100)

    # One noisy threshold draw per trial; draws are consumed sample by sample.
    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        hits = 0
        for _ in range(trials):
            hits += int(z[row] + 0.5 * rng.randn() > 0)
        successes[row] += hits

    result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
    expected = [0.9315770010211236, 0.8457015828837173]
    assert_allclose(result.stats["pv20"], expected, atol=1e-6, rtol=1e-6)
def test_qtl_scan_gmm_binomial():
    """Pin the p-values of a binomial scan run without a kinship matrix.

    The latent signal is built from the candidates ``X`` themselves.
    """
    rng = RandomState(0)
    nsamples = 25

    X = rng.randn(nsamples, 2)
    ntrials = rng.randint(1, nsamples, nsamples)
    z = dot(X, rng.randn(2))

    # One noisy threshold draw per trial; draws are consumed sample by sample.
    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        for _ in range(trials):
            successes[row] += int(z[row] + 0.5 * rng.randn() > 0)

    result = scan(X, successes, ("binomial", ntrials), verbose=False)
    expected = [2.4604711379400065e-06, 0.01823278752006871]
    assert_allclose(result.stats["pv20"], expected, rtol=1e-5, atol=1e-5)
def test_qtl_scan_glmm_bernoulli():
    """Pin the p-values of a Bernoulli scan that uses a kinship matrix."""
    rng = RandomState(0)
    nsamples = 25

    X = rng.randn(nsamples, 2)
    G = rng.randn(nsamples, 100)
    K = dot(G, G.T)

    # randint(1, 2, ...) can only return 1, i.e. a single trial per sample.
    ntrials = rng.randint(1, 2, nsamples)
    z = dot(G, rng.randn(100)) / sqrt(100)

    successes = zeros(len(ntrials), int)
    for row, trials in enumerate(ntrials):
        hits = 0
        for _ in range(trials):
            hits += int(z[row] + 0.5 * rng.randn() > 0)
        successes[row] += hits

    result = scan(X, successes, "bernoulli", K, verbose=False)
    expected = [0.3399326545917558, 0.8269454251659921]
    assert_allclose(result.stats["pv20"], expected, rtol=1e-5)
def _test_qtl_scan_st(lik):
    """Smoke-test a single-trait scan for the likelihood named ``lik``.

    A latent Gaussian vector is sampled and, for the non-normal likelihoods
    handled below, transformed into an outcome of the matching kind.  The
    scan result and a selection of its attributes are rendered with ``str``;
    nothing numerical is asserted.
    """
    rng = RandomState(0)
    n = 30
    ncovariates = 3

    # The order of the draws below is part of the fixture; do not reorder.
    M = rng.randn(n, ncovariates)
    v0 = rng.rand()
    v1 = rng.rand()
    G = rng.randn(n, 4)

    kinship_base = rng.randn(n, n + 1)
    K = normalise_covariance(kinship_base @ kinship_base.T)

    beta = rng.randn(ncovariates)
    alpha = rng.randn(G.shape[1])

    mean = M @ beta + G @ alpha
    y = mvn(rng, mean, v0 * K + v1 * eye(n))

    idx = [[0, 1], 2, [3]]

    if lik == "poisson":
        y = rng.poisson(exp(y))
    elif lik == "bernoulli":
        y = rng.binomial(1, 1 / (1 + exp(-y)))
    elif lik == "probit":
        y = rng.binomial(1, st.norm.cdf(y))
    elif lik == "binomial":
        ntrials = rng.randint(0, 30, len(y))
        y = rng.binomial(ntrials, 1 / (1 + exp(-y)))
        # The binomial likelihood is passed together with its trial counts.
        lik = (lik, ntrials)

    r = scan(G, y, lik=lik, idx=idx, K=K, M=M, verbose=False)

    # Rendering must not raise for the result nor for the listed attributes.
    str(r)
    str(r.stats.head())
    str(r.effsizes["h2"].head())
    null_model = r.h0
    str(null_model.trait)
    str(null_model.likelihood)
    str(null_model.lml)
    str(null_model.effsizes)
    str(null_model.variances)
def test_qtl_scan_two_hypotheses_mt_A0A1_none():
    """Multi-trait scan called without ``A0``/``A1``.

    The trait matrix is a labelled ``DataArray`` (traits ``"WA"`` and
    ``"Cx"``); the test checks the ``trait`` and ``env`` columns reported in
    the ``h2`` effect sizes of the first test.
    """
    rng = RandomState(0)
    n = 30
    ntraits = 2
    ncovariates = 3

    def random_gram(size):
        # B @ B.T of a random square matrix.
        base = rng.randn(size, size)
        return base @ base.T

    # The order of the draws below is part of the fixture; do not reorder.
    A = random_gram(ntraits)
    M = rng.randn(n, ncovariates)
    C0 = random_gram(ntraits)
    C1 = random_gram(ntraits)
    G = rng.randn(n, 4)

    # Only used to simulate the data; it is not passed to ``scan``.
    A1 = eye(ntraits)

    kinship_base = rng.randn(n, n + 1)
    K = normalise_covariance(kinship_base @ kinship_base.T)

    beta = vec(rng.randn(ntraits, ncovariates))
    alpha = vec(rng.randn(A1.shape[1], G.shape[1]))

    mean = kron(A, M) @ beta + kron(A1, G) @ alpha
    covariance = kron(C0, K) + kron(C1, eye(n))
    Y = unvec(mvn(rng, mean, covariance), (n, -1))
    Y = DataArray(Y, dims=["sample", "trait"], coords={"trait": ["WA", "Cx"]})

    idx = [[0, 1], 2, [3]]
    r = scan(G, Y, idx=idx, K=K, M=M, A=A, verbose=False)

    effects = r.effsizes["h2"]
    first_test = effects[effects["test"] == 0]

    expected_traits = ["WA"] * 3 + ["Cx"] * 3 + [None] * 4
    expected_envs = [None] * 6 + ["env1_WA", "env1_WA", "env1_Cx", "env1_Cx"]
    assert_array_equal(first_test["trait"], expected_traits)
    assert_array_equal(first_test["env"], expected_envs)
    str(r)
] acn_indices.sort() K = kin_hdf['kinship'][acn_indices, :][:, acn_indices] kin_hdf.close() # get phenotype in correct order pheno = pheno.loc[acn_order] Y = pheno.to_numpy() ### MTMM TESTS ### A = np.matrix('0 1; 1 0') A0 = np.ones((len(traits), 1)) A1 = np.eye(len(traits)) # M = np.repeat(1, Y.shape[0]) r = scan(G, Y, K=K, lik='normal', A=A, A0=A0, A1=A1, verbose=True) # save results # link chromosome and positions to p-values and effect sizes geno_hdf = h5py.File(genoFile, 'r') chrIdx = geno_hdf['positions'].attrs['chr_regions'] chrom = [bisect(chrIdx[:, 1], snpIdx) + 1 for snpIdx in SNP_indices] positions = geno_hdf['positions'][:] pos = [positions[snp] for snp in SNP_indices] # G effect only pv10 = r.stats.pv10.tolist() # G + GxE pv20 = r.stats.pv20.tolist() # GxE effect only pv21 = r.stats.pv21.tolist()
fam.drop(fam.index[ind], inplace=True)
# and from Kp, which gives K accession info
Kp.drop(Kp.index[ind], inplace=True)
Kp.drop(Kp.columns[ind], axis=1, inplace=True)

# Element-wise comparison of the accession order across the three inputs.
GP_check = fam.index == pheno_ids.index
PK_check = pheno_ids.index == Kp.index
# The caution used to be printed only when BOTH comparisons had mismatches,
# so a single out-of-order file went unreported.  Now exactly one of the two
# messages is always printed.
if np.all(GP_check) and np.all(PK_check):
    print("All input files in same order")
else:
    print("CAUTION: Not all data files are in the same order!")

######################################
### 7. Run marginal GWAS in limix
r = scan(G=G, Y=Y, K=K, lik='normal', M=None, verbose=True)

####################################
### 8. Output results
chrom = bim[['chrom']]
pos = bim[['pos']]
# extract pvals
pvalues = r.stats.pv20.tolist()
# extract effect sizes (rows flagged as candidate effects only)
h2_effects = r.effsizes['h2']
effsizes = h2_effects['effsize'][h2_effects['effect_type'] == 'candidate'].tolist()

gwas_results = np.c_[chrom, pos, pvalues, effsizes]
gwas_results = pd.DataFrame(
    data=gwas_results, index=None, columns=["chrom", "pos", "pv", "GVE"]
)
gwas_results.to_csv(output_file, index=False)
# select kinship only for phenotyped and genotyped accessions
# The context manager closes the kinship file even if a lookup below fails.
with h5py.File(args.kinship, 'r') as kin_hdf:
    # Read the accession list once instead of once per phenotyped accession.
    kin_accessions = kin_hdf['accessions'][:]
    acn_indices = [np.where(kin_accessions == acn)[0][0] for acn in pheno.index]
    acn_indices.sort()
    K = kin_hdf['kinship'][acn_indices, :][:, acn_indices]

# get phenotype in correct order
pheno = pheno.loc[acn_order]
Y = pheno.to_numpy()

# scan
r = scan(G, Y, K=K, lik="normal", M=None, verbose=True)

# save results
# link chromosome and positions to p-values and effect sizes
geno_hdf = h5py.File(args.genotype, 'r')
chrIdx = geno_hdf['positions'].attrs['chr_regions']
chrom = [bisect(chrIdx[:, 1], snpIdx) + 1 for snpIdx in SNP_indices]
positions = geno_hdf['positions'][:]
pos = [positions[snp] for snp in SNP_indices]
pvalues = r.stats.pv20.tolist()
h2_effects = r.effsizes['h2']
effsizes = h2_effects['effsize'][h2_effects['effect_type'] == 'candidate'].to_list()
# Element 3 of the returned tuple is kept under the name ``Bonferroni``.
# NOTE(review): presumably the Bonferroni-corrected alpha level rather than
# FDR-adjusted p-values -- confirm against the statsmodels documentation.
Bonferroni = multitest.multipletests(pvalues, alpha=0.05, method='fdr_bh')[3]
gwas_tuples = list(zip(chrom, pos, pvalues))
gwas_results = pd.DataFrame(gwas_tuples, columns=['chrom', 'pos', 'pv'])
def run_lm_st(inputs):
    """Yield one ``scan`` result per chunk of SNPs from ``inputs.geno``.

    Each chunk is restricted to the columns ``inputs.accinds``, cast to
    ``int`` and transposed before being scanned against
    ``inputs.pheno.values`` with ``test=inputs.test``.
    """
    snp_chunks = inputs.geno.get_snps_iterator(is_chunked=True)
    for chunk in snp_chunks:
        genotypes = np.array(chunk[:, inputs.accinds], dtype=int).T
        phenotype = np.array(inputs.pheno.values)
        yield scan(genotypes, phenotype, test=inputs.test)
def run_glmm_st(inputs):
    """Yield one kinship-aware ``scan`` result per chunk of SNPs.

    Each chunk from ``inputs.geno`` is restricted to the columns
    ``inputs.accinds``, cast to ``int`` and transposed, then scanned against
    ``inputs.pheno.values`` using ``inputs.pheno_type`` as likelihood and
    ``inputs.kin`` as kinship.
    """
    snp_chunks = inputs.geno.get_snps_iterator(is_chunked=True)
    for chunk in snp_chunks:
        genotypes = np.array(chunk[:, inputs.accinds], dtype=int).T
        phenotype = np.array(inputs.pheno.values)
        yield scan(
            genotypes,
            phenotype,
            lik=inputs.pheno_type,
            K=inputs.kin,
            test=inputs.test,
            searchDelta=False,
        )
def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose, dry_run, **_):
    """
    Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Examples
    --------

    ... doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \
            --output-dir=results \
            --filter="phenotype: col == 'height'" \
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    import sys
    from os import makedirs
    from os.path import abspath, exists, join
    import traceback

    from limix._display import session_block, banner, session_line, indent, print_exc
    from limix.qtl import scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    output_dir = abspath(output_dir)
    # A dry run must not touch the file system.
    if not dry_run and not exists(output_dir):
        makedirs(output_dir, exist_ok=True)

    def _print_data_array(arr, verbose):
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(arr))))

    data = {"y": None, "G": None, "K": None}

    data["y"] = fetch("trait", trait, verbose)
    _print_data_array(data["y"], verbose)

    data["G"] = fetch("genotype", genotype, verbose)
    _print_data_array(data["G"], verbose)

    if covariate is not None:
        data["M"] = fetch("covariate", covariate, verbose)
        _print_data_array(data["M"], verbose)

    if kinship is not None:
        data["K"] = fetch("kinship", kinship, verbose)
        _print_data_array(data["K"], verbose)

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    # Entries that are still None are dropped from the dataset.
    data = {k: v for k, v in data.items() if v is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids."
        )

    oparams = _ordered_params(ctx)

    # Option name -> (function, label) handed to ``Pipeline.append``.
    preprocessors = {
        "where": (where_func, "where"),
        "normalize": (normalize_func, "normalize"),
        "impute": (impute_func, "impute"),
        "drop_maf": (drop_maf, "drop-maf"),
        "drop_missing": (drop_missing, "drop-missing"),
    }

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)
        # Steps are queued in the order given by ``oparams``; entries that are
        # not preprocessing steps are skipped.
        for param in oparams:
            if param[0] in preprocessors:
                func, label = preprocessors[param[0]]
                pipeline.append(func, label, param[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    try:
        # ``K`` and ``M`` may both be absent from the dataset at this point;
        # previously only ``K`` had a fallback and a missing ``M`` raised
        # ``KeyError`` outside the ``try`` block's reporting path.
        res = scan(
            data["G"],
            data["y"],
            lik=lik,
            K=data.get("K"),
            M=data.get("M"),
            verbose=verbose,
        )
    except Exception as e:
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        res.to_csv(join(output_dir, "null.csv"), join(output_dir, "alt.csv"))