def estimate(y, lik, K, M=None, verbose=True):
    """
    Estimate variance components from a fitted (generalised) linear mixed model.

    The inputs are passed through ``conform_dataset`` and checked with
    ``assert_finite``. A restricted ``LMM`` is fitted when the likelihood name is
    ``"normal"``; otherwise a ``GLMMExpFam`` model is fitted.

    Parameters
    ----------
    y : array_like
        Outcome.
    lik : tuple or str
        Likelihood specification. It is normalised by ``normalize_likelihood``
        and the first element of the result is used as the likelihood name.
    K : array_like or None
        Sample covariance matrix. When ``None``, no decomposition is computed
        and ``None`` is handed to the model in its place.
    M : array_like, optional
        Covariates. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress; ``False`` otherwise.

    Returns
    -------
    tuple
        ``(g, v, e)`` where ``g = scale * (1 - delta)``, ``v`` is the variance
        of the fitted mean, and ``e = scale * delta`` (plus ``pi**2 / 3`` when
        the likelihood name is ``"bernoulli"``).
    """
    from numpy import pi, var
    from numpy_sugar.linalg import economic_qs
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line

    lik = normalize_likelihood(lik)
    lik_name = lik[0]

    with session_block("Heritability analysis", disable=not verbose):

        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y, M=M, K=K)

        y = data["y"]
        M = data["M"]
        K = data["K"]

        assert_finite(y, M, K)

        # K is decomposed as given; it is not rescaled by its mean diagonal.
        QS = economic_qs(K) if K is not None else None

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        g = method.scale * (1 - method.delta)
        e = method.scale * method.delta
        if lik_name == "bernoulli":
            # pi**2 / 3 is the variance of the standard logistic distribution;
            # presumably this accounts for the logit link -- TODO confirm.
            e += pi * pi / 3

        v = var(method.mean())

    return g, v, e
def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    """
    Estimate variance components.

    Parameters
    ----------
    y_phe : array_like
        Outcome.
    lik : tuple or str
        Likelihood specification, normalised by ``normalize_likelihood``.
    kin : array_like or None
        Sample covariance matrix; when ``None`` no decomposition is computed.
    marker_mat : array_like, optional
        Passed to ``conform_dataset`` as ``M``. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress; ``False`` otherwise.

    Returns
    -------
    tuple
        ``(v_g, v_v, v_e)``: ``scale * (1 - delta)``, the variance of the
        fitted mean, and ``scale * delta`` (plus ``pi**2 / 3`` for the
        ``"bernoulli"`` likelihood name).
    """
    lik = normalize_likelihood(lik)
    name = lik[0]
    quiet = not verbose

    with session_block("Heritability analysis", disable=quiet):

        with session_line("Normalising input...", disable=quiet):
            conformed = conform_dataset(y_phe, M=marker_mat, K=kin)

        y_phe = conformed["y"]
        marker_mat = conformed["M"]
        kin = conformed["K"]
        assert_finite(y_phe, marker_mat, kin)

        decomposition = None if kin is None else economic_qs(kin)

        if name == "normal":
            model = LMM(
                y_phe.values, marker_mat.values, decomposition, restricted=True
            )
            fit_options = {}
        else:
            model = GLMMExpFam(
                y_phe, lik, marker_mat.values, decomposition, n_int=500
            )
            fit_options = {"factr": 1e6, "pgtol": 1e-3}
        model.fit(verbose=verbose, **fit_options)

        genetic = model.scale * (1 - model.delta)
        residual = model.scale * model.delta
        if name == "bernoulli":
            residual += pi * pi / 3

        fitted_mean_var = var(model.mean())

    return genetic, fitted_mean_var, residual
def st_scan(G, y, lik, K=None, M=None, verbose=True):
    r"""
    Single-variant association testing via generalised linear mixed models.

    It supports Normal (linear mixed model), Bernoulli, Probit, Binomial, and Poisson
    residual errors, defined by ``lik``.
    The columns of ``G`` define the candidates to be tested for association
    with the phenotype ``y``.
    The covariance matrix is set by ``K``.
    If not provided, or set to ``None``, the generalised linear model
    without random effects is assumed.
    The covariates can be set via the parameter ``M``.
    We recommend to always provide a column of ones when covariates are actually
    provided.

    Parameters
    ----------
    G : array_like
        :math:`N` individuals by :math:`S` candidate markers.
    y : array_like
        An outcome array of :math:`N` individuals.
    lik : tuple, list, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple (or list) or a string specifying the likelihood is required.
        The Normal, Bernoulli, Probit, and Poisson likelihoods can be selected by
        providing a string. Binomial likelihood on the other hand requires a tuple
        because of the number of trials: ``("binomial", array_like)``.
        The likelihood name is lower-cased before being checked.
    K : array_like, optional
        :math:`N`-by-:math:`N` covariance matrix (e.g., kinship coefficients).
        Set to ``None`` for a generalised linear model without random effects.
        Defaults to ``None``.
    M : array_like, optional
        :math:`N` individuals by :math:`D` covariates.
        It will create a :math:`N`-by-:math:`1` matrix ``M`` of ones representing
        the offset covariate if ``None`` is passed. If an array is passed, it will
        be used as is. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    :class:`limix.qtl.QTLModel`
        QTL representation.

    Raises
    ------
    ValueError
        If ``y``, ``M``, or ``K`` contain non-finite values.

    Examples
    --------
    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import st_scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> model = st_scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> model.variant_pvalues.to_dataframe()  # doctest: +FLOAT_CMP
                         pv
        candidate
        rs0        0.554444
        rs1        0.218996
        rs2        0.552200
        >>> model.variant_effsizes.to_dataframe()  # doctest: +FLOAT_CMP
                   effsizes
        candidate
        rs0       -0.130867
        rs1       -0.315078
        rs2       -0.143869
        >>> model.variant_effsizes_se.to_dataframe()  # doctest: +FLOAT_CMP
                   effsizes std
        candidate
        rs0            0.221390
        rs1            0.256327
        rs2            0.242013
        >>> model  # doctest: +FLOAT_CMP
        Variants
        --------
               effsizes  effsizes_se   pvalues
        count         3            3         3
        mean  -0.196604     0.239910  0.441880
        std    0.102807     0.017563  0.193027
        min   -0.315077     0.221389  0.218996
        25%   -0.229473     0.231701  0.385598
        50%   -0.143869     0.242013  0.552200
        75%   -0.137367     0.249170  0.553322
        max   -0.130866     0.256326  0.554443
        <BLANKLINE>
        Covariate effect sizes for H0
        -----------------------------
              age    offset
        -0.005568  0.395287

        >>> from numpy import zeros
        >>>
        >>> nsamples = 50
        >>>
        >>> X = random.randn(nsamples, 2)
        >>> G = random.randn(nsamples, 100)
        >>> K = dot(G, G.T)
        >>> ntrials = random.randint(1, 100, nsamples)
        >>> z = dot(G, random.randn(100)) / sqrt(100)
        >>>
        >>> successes = zeros(len(ntrials), int)
        >>> for i, nt in enumerate(ntrials):
        ...     for _ in range(nt):
        ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
        >>>
        >>> result = st_scan(X, successes, ("binomial", ntrials), K, verbose=False)
        >>> print(result)  # doctest: +FLOAT_CMP
        Variants
        --------
               effsizes  effsizes_se   pvalues
        count         2            2         2
        mean   0.227116     0.509575  0.478677
        std    0.567975     0.031268  0.341791
        min   -0.174503     0.487466  0.236994
        25%    0.026307     0.498520  0.357835
        50%    0.227116     0.509575  0.478677
        75%    0.427925     0.520630  0.599518
        max    0.628735     0.531685  0.720359
        <BLANKLINE>
        Covariate effect sizes for H0
        -----------------------------
           offset
         0.409570

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar import is_all_finite
    from numpy_sugar.linalg import economic_qs

    if not isinstance(lik, (tuple, list)):
        lik = (lik,)

    lik_name = lik[0].lower()
    # ``lik`` may be a list; coerce the tail to a tuple because ``tuple + list``
    # raises TypeError.
    lik = (lik_name,) + tuple(lik[1:])
    check_likelihood_name(lik_name)

    with session_block("qtl analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        if not is_all_finite(y):
            raise ValueError("Outcome must have finite values only.")

        if not is_all_finite(M):
            raise ValueError("Covariates must have finite values only.")

        if K is not None:
            if not is_all_finite(K):
                raise ValueError("Covariance matrix must have finite values only.")
            QS = economic_qs(K)
        else:
            QS = None

        y = normalise_extreme_values(data["y"], lik)

        if lik_name == "normal":
            model = _perform_lmm(y.values, M, QS, G, verbose)
        else:
            model = _perform_glmm(y.values, lik, M, K, QS, G, verbose)

    if verbose:
        print(model)

    return model
def mt_scan(G, Y, M=None, K=None, Ac=None, Asnps=None, Asnps0=None, verbose=True):
    """
    Multi-trait single-variant association testing.

    Wrapper around the multi-trait linear mixed model (``MTLMM``): a
    ``GP2KronSum`` model is optimised first and its covariance is then used to
    test every candidate in ``G``.

    Parameters
    ----------
    G : (`N`, `S`) array_like
        Candidate variants.
    Y : (`N`, `P`) array_like
        Phenotype data.
    M : (`N`, `D`) array_like, optional
        Covariate design matrix, passed to ``conform_dataset``.
    K : (`N`, `N`) array_like
        LMM-covariance/genetic relatedness matrix. It is required: the
        multi-trait linear model without it is not supported.
    Ac : (`P`, `L`) ndarray, optional
        Trait design matrix of the fixed-effect term.
        By default, ``Ac`` is eye(`P`).
    Asnps : (`P`, `K`) ndarray, optional
        Trait design of the SNP effect, forwarded to ``MTLMM`` as ``Asnp``.
        It must be provided whenever ``Asnps0`` is given, because its number of
        columns is used to compute the degrees of freedom of the test.
    Asnps0 : (`P`, `K0`) ndarray, optional
        Trait design of the SNP effect in the null model.
        By default, ``Asnps0`` is not considered (i.e., no SNP effect in the
        null model). If specified, a second model is fitted with ``Asnps0`` and
        the difference of the two likelihood-ratio statistics is tested.
    verbose : bool, optional
        If ``True``, progress details are displayed.

    Returns
    -------
    pandas.DataFrame
        Columns ``pv`` and ``lrt`` when ``Asnps0`` is ``None``; otherwise
        columns ``pv1``, ``pv0``, ``pv``, ``lrt1``, ``lrt0``, and ``lrt``.

    Raises
    ------
    ValueError
        If no covariance matrix ``K`` is available.
    """
    from collections import OrderedDict

    from pandas import DataFrame
    from scipy.stats import chi2
    from numpy import eye, cov, asarray
    from scipy.linalg import eigh
    from limix_core.gp import GP2KronSum
    from limix_core.covar import FreeFormCov
    from limix_lmm.mtlmm import MTLMM

    if Ac is None:
        Ac = eye(Y.shape[1])

    with session_block("multi-trait association test", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(Y, M, G=G, K=K)

        # Check for a missing covariance *before* converting it: asarray(None)
        # yields a 0-d object array, which is never ``None``.
        if data["K"] is None:
            raise ValueError("multi-trait linear model not supported")

        Y = asarray(data["y"])
        M = asarray(data["M"])
        G = asarray(data["G"])
        K = asarray(data["K"])

        # full-rank multi-trait linear mixed model
        S_R, U_R = eigh(K)
        # NOTE(review): assumes add_jitter returns the jittered eigenvalues
        # (its result was already being assigned here) -- confirm. The jittered
        # values are the ones handed to the GP; previously they were computed
        # and then ignored in favour of the raw eigenvalues.
        S_R = add_jitter(S_R)

        ntraits = Y.shape[1]
        gp = GP2KronSum(
            Y=Y,
            Cg=FreeFormCov(ntraits),
            Cn=FreeFormCov(ntraits),
            S_R=S_R,
            U_R=U_R,
            F=M,
            A=Ac,
        )
        gp.covar.Cr.setCovariance(0.5 * cov(Y.T))
        gp.covar.Cn.setCovariance(0.5 * cov(Y.T))
        gp.optimize(verbose=verbose)

        lmm = MTLMM(Y, F=M, A=Ac, Asnp=Asnps, covar=gp.covar)
        lmm.process(G)

        RV = OrderedDict()
        if Asnps0 is None:
            RV["pv"] = lmm.getPv()
            RV["lrt"] = lmm.getLRT()
        else:
            lmm0 = MTLMM(Y, F=M, A=Ac, Asnp=Asnps0, covar=gp.covar)
            lmm0.process(G)

            # likelihood-ratio test between the two SNP designs
            lrt1 = lmm.getLRT()
            lrt0 = lmm0.getLRT()
            lrt = lrt1 - lrt0
            pv = chi2(Asnps.shape[1] - Asnps0.shape[1]).sf(lrt)

            RV["pv1"] = lmm.getPv()
            RV["pv0"] = lmm0.getPv()
            RV["pv"] = pv
            RV["lrt1"] = lrt1
            RV["lrt0"] = lrt0
            RV["lrt"] = lrt

    return DataFrame(RV)
def scan(G, Y, lik="normal", K=None, M=None, idx=None, A=None, A0=None, A1=None,
         verbose=True):
    """
    Multi-trait association and interaction testing via linear mixed models.

    Let n, c, and p be the number of samples, covariates, and traits, respectively.
    The outcome variable Y is a n×p matrix distributed according to ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨), K₀ = C₀ ⊗ K + C₁ ⊗ I) under H₀.

    A and M are design matrices of dimensions p×p and n×c provided by the user,
    where M is the usual matrix of covariates commonly used in single-trait models.
    𝚨 is a c×p matrix of fixed-effect sizes per trait.
    C₀ and C₁ are both symmetric matrices of dimensions p×p, for which C₁ is
    guaranteed by our implementation to be of full rank.
    The parameters of the H₀ model are the matrices 𝚨, C₀, and C₁.

    The additional models H₁ and H₂ are defined as ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀), s⋅K₀)

    and ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀) + (A₁ ⊗ Gᵢ) vec(𝚩₁), s⋅K₀)

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative one:

    - H₀ vs H₁: testing for vec(𝚩₀) ≠ 𝟎 while vec(𝚩₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝚩₀) vec(𝚩₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝚩₁) ≠ 𝟎

    It supports generalized linear mixed models (GLMM) when a single trait is used.
    In this case, the following likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where 𝜇 is
    its mean. The H₀ model is defined as follows::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) for 𝐳 ∼ 𝓝(..., ...).

    g(⋅) is the corresponding canonical link function for the Bernoulli, Binomial,
    and Poisson likelihoods. The Probit likelihood, on the other hand, is a Bernoulli
    likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    Y : n×p array_like
        Rows are samples and columns are phenotypes.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be used in
        the tests.
    A : p×p array_like
        Symmetric trait-by-trait design matrix.
    A0 : p×p₀ array_like, optional
        Matrix A₀, possibly a non-symmetric one. If ``None``, it defines an empty
        matrix, p₀=0. Defaults to ``None``.
    A1 : p×p₁ array_like, optional
        Matrix A₁, possibly a non-symmetric one. If ``None``, it defines an identity
        matrix, p₁=p. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.STScanResult`, :class:`limix.qtl._result.MTScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated statistics.

    Raises
    ------
    ValueError
        If ``A0`` or ``A1`` is given while ``A`` is ``None``.

    Examples
    --------
    .. doctest::

        >>> from limix.qtl import scan
        >>> from numpy import reshape, kron, eye
        >>> from numpy import concatenate
        >>> from numpy.random import RandomState
        >>> import scipy.stats as st
        >>> from limix.qc import normalise_covariance
        >>>
        >>> def vec(x):
        ...     return reshape(x, (-1,) + x.shape[2:], order="F")
        >>>
        >>> def unvec(x, shape):
        ...     return reshape(x, shape, order="F")
        >>>
        >>> random = RandomState(0)
        >>> n = 30
        >>> ntraits = 2
        >>> ncovariates = 3
        >>>
        >>> A = random.randn(ntraits, ntraits)
        >>> A = A @ A.T
        >>> M = random.randn(n, ncovariates)
        >>>
        >>> C0 = random.randn(ntraits, ntraits)
        >>> C0 = C0 @ C0.T
        >>>
        >>> C1 = random.randn(ntraits, ntraits)
        >>> C1 = C1 @ C1.T
        >>>
        >>> G = random.randn(n, 4)
        >>>
        >>> A0 = random.randn(ntraits, 1)
        >>> A1 = random.randn(ntraits, 2)
        >>> A01 = concatenate((A0, A1), axis=1)
        >>>
        >>> K = random.randn(n, n + 1)
        >>> K = normalise_covariance(K @ K.T)
        >>>
        >>> beta = vec(random.randn(ntraits, ncovariates))
        >>> alpha = vec(random.randn(A01.shape[1], G.shape[1]))
        >>>
        >>> mvn = st.multivariate_normal
        >>> m = kron(A, M) @ beta + kron(A01, G) @ alpha
        >>> Y = unvec(mvn(m, kron(C0, K) + kron(C1, eye(n))).rvs(), (n, -1))
        >>>
        >>> idx = [[0, 1], 2, [3]]
        >>> r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)

    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> result = scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> result.stats  # doctest: +FLOAT_CMP +SKIP
               null lml    alt lml    pvalue  dof
        test
        0    -48.736563 -48.561855  0.554443    1
        1    -48.736563 -47.981093  0.218996    1
        2    -48.736563 -48.559868  0.552200    1
        >>> result.alt_effsizes  # doctest: +FLOAT_CMP +SKIP
           test candidate   effsize  effsize se
        0     0       rs0 -0.130867    0.221390
        1     1       rs1 -0.315079    0.256327
        2     2       rs2 -0.143869    0.242014
        >>> print(result)  # doctest: +FLOAT_CMP +SKIP
        Null model
        ----------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶, 0.79*K + 0.00*I)
          yᵢ ~ Poisson(λᵢ=g(zᵢ)), where g(x)=eˣ
          M = ['offset' 'age']
          𝜶 = [ 0.39528617 -0.00556789]
          Log marg. lik.: -48.736563230140376
          Number of models: 1
        <BLANKLINE>
        Alt model
        ---------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶 + Gᵢ, 0.79*K + 0.00*I)
          yᵢ ~ Poisson(λᵢ=g(zᵢ)), where g(x)=eˣ
          Min. p-value: 0.21899561824721903
          First perc. p-value: 0.22565970374303942
          Max. log marg. lik.: -47.981092939974765
          99th perc. log marg. lik.: -47.9926684371547
          Number of models: 3

        >>> from numpy import zeros
        >>>
        >>> nsamples = 50
        >>>
        >>> X = random.randn(nsamples, 2)
        >>> G = random.randn(nsamples, 100)
        >>> K = dot(G, G.T)
        >>> ntrials = random.randint(1, 100, nsamples)
        >>> z = dot(G, random.randn(100)) / sqrt(100)
        >>>
        >>> successes = zeros(len(ntrials), int)
        >>> for i, nt in enumerate(ntrials):
        ...     for _ in range(nt):
        ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
        >>>
        >>> result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
        >>> print(result)  # doctest: +FLOAT_CMP +SKIP
        Null model
        ----------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶, 1.74*K + 0.15*I)
          yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ), where g(x)=1/(1+e⁻ˣ)
          M = ['offset']
          𝜶 = [0.40956947]
          Log marg. lik.: -142.9436437096321
          Number of models: 1
        <BLANKLINE>
        Alt model
        ---------
        <BLANKLINE>
          𝐳 ~ 𝓝(M𝜶 + Gᵢ, 1.74*K + 0.15*I)
          yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ), where g(x)=1/(1+e⁻ˣ)
          Min. p-value: 0.23699422686919802
          First perc. p-value: 0.241827874774993
          Max. log marg. lik.: -142.24445140459548
          99th perc. log marg. lik.: -142.25080258276773
          Number of models: 2

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar.linalg import economic_qs

    lik = normalize_likelihood(lik)

    # A0/A1 only make sense on top of a trait design matrix A.
    if A is None and not (A0 is None and A1 is None):
        raise ValueError("You cannot define `A0` or `A1` without defining `A`.")

    silent = not verbose
    with session_block("QTL analysis", disable=silent):

        with session_line("Normalising input... ", disable=silent):
            conformed = conform_dataset(Y, M, G=G, K=K)

        Y = conformed["y"]
        M = conformed["M"]
        G = conformed["G"]
        K = conformed["K"]
        assert_finite(Y, M, K)

        QS = None if K is None else economic_qs(K)

        # Without A this is a single-trait scan; otherwise a multi-trait one.
        if A is None:
            factory = _single_trait_scan(idx, lik, Y, M, G, QS, verbose)
        else:
            factory = _multi_trait_scan(idx, lik, Y, M, G, QS, A, A0, A1, verbose)

        outcome = factory.create()
        if verbose:
            print(outcome)

        return outcome
def scan(ctx, trait, genotype, covariate, kinship, lik, output_dir, verbose,
         dry_run, **_):
    """
    Single-variant association testing via mixed models.

    This analysis requires minimally the specification of one phenotype
    (PHENOTYPES_FILE) and genotype data (GENOTYPE_FILE).

    The --filter option allows for selecting a subset of the original dataset for
    the analysis. For example,

        --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"

    states that only loci of chromosome 3 having a position inside the range (100, 200)
    will be considered. The --filter option can be used multiple times in the same
    call. In general, --filter accepts a string of the form

        <DATA-TYPE>: <BOOL-EXPR>

    where <DATA-TYPE> can be phenotype, genotype, or covariate. <BOOL-EXPR> is a boolean
    expression involving row or column names. Please, consult `pandas.DataFrame.query`
    function from Pandas package for further information.
    \f

    Parameters
    ----------
    ctx : object
        Command context. Its ``obj`` attribute is initialised to
        ``{"preprocess": []}`` when it is ``None``; the context is also handed
        to ``_ordered_params`` to obtain the ordered preprocessing options.
    trait, genotype : object
        Specifications handed to ``limix.io.fetch`` for the ``"trait"`` and
        ``"genotype"`` data.
    covariate, kinship : object or None
        Optional specifications handed to ``limix.io.fetch`` for the
        ``"covariate"`` and ``"kinship"`` data; skipped when ``None``.
    lik : object
        Likelihood, forwarded to ``limix.qtl.scan``.
    output_dir : str
        Directory where ``null.csv`` and ``alt.csv`` are written. It is created
        if it does not exist, unless ``dry_run`` is set.
    verbose : bool
        ``True`` to display the loaded arrays and progress.
    dry_run : bool
        If ``True``, return right after preprocessing without running the scan.

    Raises
    ------
    RuntimeError
        If no sample is left after matching samples across inputs.

    Notes
    -----
    If the scan itself raises, the exception is printed and the process exits
    with status 1.

    Examples
    --------

    ... doctest::

        # First we perform a quick file inspection. This step is optional but is very
        # useful to check whether `limix` is able to read them and print out their
        # metadata.
        limix show phenotypes.csv
        limix show genotype.bgen
        limix show kinship.raw

        # We now perform the analysis, specifying the genotype loci and the phenotype
        # of interest.
        limix phenotypes.csv genotype.bgen --kinship-file=kinship.raw \\
            --output-dir=results \\
            --filter="phenotype: col == 'height'" \\
            --filter="genotype: (chrom == '3') & (pos > 100) & (pos < 200)"
    """
    import sys
    from os import makedirs
    from os.path import abspath, exists, join
    import traceback

    from limix._display import session_block, banner, session_line, indent, print_exc
    from limix.qtl import scan
    from limix.io import fetch
    from .pipeline import Pipeline
    from limix._data import conform_dataset
    from .preprocess import impute as impute_func
    from .preprocess import normalize as normalize_func
    from .preprocess import where as where_func
    from .preprocess import drop_missing, drop_maf

    print(banner())

    if ctx.obj is None:
        ctx.obj = {"preprocess": []}

    output_dir = abspath(output_dir)
    if not dry_run:
        if not exists(output_dir):
            makedirs(output_dir, exist_ok=True)

    def _print_data_array(arr, verbose):
        if verbose:
            print("\n{}\n".format(indent(_clean_data_array_repr(arr))))

    data = {"y": None, "G": None, "K": None}

    data["y"] = fetch("trait", trait, verbose)
    _print_data_array(data["y"], verbose)

    data["G"] = fetch("genotype", genotype, verbose)
    _print_data_array(data["G"], verbose)

    if covariate is not None:
        data["M"] = fetch("covariate", covariate, verbose)
        _print_data_array(data["M"], verbose)

    if kinship is not None:
        data["K"] = fetch("kinship", kinship, verbose)
        _print_data_array(data["K"], verbose)

    with session_line("Matching samples... "):
        data = conform_dataset(**data)
    # Entries that are still None (e.g. no kinship given) are dropped here.
    data = {k: v for k, v in data.items() if v is not None}

    if data["y"].sample.size == 0:
        raise RuntimeError(
            "Exiting early because there is no sample left after matching samples."
            + " Please, check your sample ids."
        )

    oparams = _ordered_params(ctx)

    # Option name -> (preprocessing function, label given to the pipeline).
    steps = {
        "impute": (impute_func, "impute"),
        "normalize": (normalize_func, "normalize"),
        "where": (where_func, "where"),
        "drop_missing": (drop_missing, "drop-missing"),
        "drop_maf": (drop_maf, "drop-maf"),
    }

    with session_block("preprocessing", disable=not verbose):
        pipeline = Pipeline(data)
        # Steps are appended in the order the user supplied the options.
        for p in oparams:
            if p[0] in steps:
                func, label = steps[p[0]]
                pipeline.append(func, label, p[1])

        data = pipeline.run()

    if dry_run:
        print("Exiting early because of dry-run option.")
        return

    try:
        # Optional inputs may be absent from ``data`` (None entries were
        # filtered out above), so look them up with ``get``: a missing ``M``
        # previously raised KeyError while a missing ``K`` was tolerated.
        res = scan(
            data["G"],
            data["y"],
            lik=lik,
            K=data.get("K"),
            M=data.get("M"),
            verbose=verbose,
        )
    except Exception as e:
        print_exc(traceback.format_stack(), e)
        sys.exit(1)

    with session_line("Saving results to `{}`... ".format(output_dir)):
        res.to_csv(join(output_dir, "null.csv"), join(output_dir, "alt.csv"))
def scan(G, Y, lik="normal", K=None, M=None, idx=None, A=None, A0=None, A1=None,
         verbose=True):
    """
    Multi-trait association and interaction testing via linear mixed models.

    Let n, c, and p be the number of samples, covariates, and traits, respectively.
    The outcome variable Y is a n×p matrix distributed according to ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨), K₀ = C₀ ⊗ K + C₁ ⊗ I) under H₀.

    A and M are design matrices of dimensions p×p and n×c provided by the user,
    where M is the usual matrix of covariates commonly used in single-trait models.
    𝚨 is a c×p matrix of fixed-effect sizes per trait.
    C₀ and C₁ are both symmetric matrices of dimensions p×p, for which C₁ is
    guaranteed by our implementation to be of full rank.
    The parameters of the H₀ model are the matrices 𝚨, C₀, and C₁.

    The additional models H₁ and H₂ are defined as ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀), s⋅K₀)

    and ::

        vec(Y) ~ N((A ⊗ M) vec(𝚨) + (A₀ ⊗ Gᵢ) vec(𝚩₀) + (A₁ ⊗ Gᵢ) vec(𝚩₁), s⋅K₀)

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative one:

    - H₀ vs H₁: testing for vec(𝚩₀) ≠ 𝟎 while vec(𝚩₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝚩₀) vec(𝚩₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝚩₁) ≠ 𝟎

    It supports generalized linear mixed models (GLMM) when a single trait is used.
    In this case, the following likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where 𝜇 is
    its mean. The H₀ model is defined as follows::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) for 𝐳 ∼ 𝓝(..., ...).

    g(⋅) is the corresponding canonical link function for the Bernoulli, Binomial,
    and Poisson likelihoods. The Probit likelihood, on the other hand, is a Bernoulli
    likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    Y : n×p array_like
        Rows are samples and columns are phenotypes.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The Normal,
        Bernoulli, Probit, and Poisson likelihoods can be selected by providing a
        string. Binomial likelihood on the other hand requires a tuple because of the
        number of trials: ``("binomial", array_like)``. Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be used in
        the tests.
    A : p×p array_like
        Symmetric trait-by-trait design matrix.
    A0 : p×p₀ array_like, optional
        Matrix A₀, possibly a non-symmetric one. If ``None``, it defines an empty
        matrix, p₀=0. Defaults to ``None``.
    A1 : p×p₁ array_like, optional
        Matrix A₁, possibly a non-symmetric one. If ``None``, it defines an identity
        matrix, p₁=p. Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.STScanResult`, :class:`limix.qtl._result.MTScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated statistics.

    Raises
    ------
    ValueError
        If ``A0`` or ``A1`` is given while ``A`` is ``None``.

    Examples
    --------
    .. doctest::

        >>> from limix.qtl import scan
        >>> from numpy import reshape, kron, eye
        >>> from numpy import concatenate
        >>> from numpy.random import RandomState
        >>> import scipy.stats as st
        >>> from limix.qc import normalise_covariance
        >>>
        >>> def vec(x):
        ...     return reshape(x, (-1,) + x.shape[2:], order="F")
        >>>
        >>> def unvec(x, shape):
        ...     return reshape(x, shape, order="F")
        >>>
        >>> random = RandomState(0)
        >>> n = 30
        >>> ntraits = 2
        >>> ncovariates = 3
        >>>
        >>> A = random.randn(ntraits, ntraits)
        >>> A = A @ A.T
        >>> M = random.randn(n, ncovariates)
        >>>
        >>> C0 = random.randn(ntraits, ntraits)
        >>> C0 = C0 @ C0.T
        >>>
        >>> C1 = random.randn(ntraits, ntraits)
        >>> C1 = C1 @ C1.T
        >>>
        >>> G = random.randn(n, 4)
        >>>
        >>> A0 = random.randn(ntraits, 1)
        >>> A1 = random.randn(ntraits, 2)
        >>> A01 = concatenate((A0, A1), axis=1)
        >>>
        >>> K = random.randn(n, n + 1)
        >>> K = normalise_covariance(K @ K.T)
        >>>
        >>> beta = vec(random.randn(ntraits, ncovariates))
        >>> alpha = vec(random.randn(A01.shape[1], G.shape[1]))
        >>>
        >>> mvn = st.multivariate_normal
        >>> m = kron(A, M) @ beta + kron(A01, G) @ alpha
        >>> Y = unvec(mvn(m, kron(C0, K) + kron(C1, eye(n))).rvs(), (n, -1))
        >>>
        >>> idx = [[0, 1], 2, [3]]
        >>> r = scan(G, Y, idx=idx, K=K, M=M, A=A, A0=A0, A1=A1, verbose=False)

    .. doctest::

        >>> from numpy import dot, exp, sqrt, ones
        >>> from numpy.random import RandomState
        >>> from pandas import DataFrame
        >>> import pandas as pd
        >>> from limix.qtl import scan
        >>>
        >>> random = RandomState(1)
        >>> pd.options.display.float_format = "{:9.6f}".format
        >>>
        >>> n = 30
        >>> p = 3
        >>> samples_index = range(n)
        >>>
        >>> M = DataFrame(dict(offset=ones(n), age=random.randint(10, 60, n)))
        >>> M.index = samples_index
        >>>
        >>> X = random.randn(n, 100)
        >>> K = dot(X, X.T)
        >>>
        >>> candidates = random.randn(n, p)
        >>> candidates = DataFrame(candidates, index=samples_index,
        ...                                    columns=['rs0', 'rs1', 'rs2'])
        >>>
        >>> y = random.poisson(exp(random.randn(n)))
        >>>
        >>> result = scan(candidates, y, 'poisson', K, M=M, verbose=False)
        >>>
        >>> result.stats  # doctest: +FLOAT_CMP
                   lml0       lml2  dof20    scale2      pv20
        test
        0    -48.720890 -48.536860      1  0.943532  0.544063
        1    -48.720890 -47.908341      1  0.904814  0.202382
        2    -48.720890 -48.534754      1  0.943400  0.541768
        >>> print(result)  # doctest: +FLOAT_CMP
        Hypothesis 0
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶, 0.000⋅𝙺 + 0.788⋅𝙸) for yᵢ ~ Poisson(λᵢ=g(zᵢ)) and g(x)=eˣ
        <BLANKLINE>
        M     = ['offset' 'age']
        𝜶     = [ 0.39528889 -0.00556797]
        se(𝜶) = [0.50173695 0.01505240]
        lml   = -48.720890273519444
        <BLANKLINE>
        Hypothesis 2
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶 + G𝛃, s(0.000⋅𝙺 + 0.788⋅𝙸)) for yᵢ ~ Poisson(λᵢ=g(zᵢ)) and g(x)=eˣ
        <BLANKLINE>
                  lml       cov. effsizes   cand. effsizes
        --------------------------------------------------
        mean   -4.833e+01       2.393e-01       -1.966e-01
        std     3.623e-01       2.713e-01        1.028e-01
        min    -4.854e+01      -8.490e-03       -3.151e-01
        25%    -4.854e+01      -7.684e-03       -2.295e-01
        50%    -4.853e+01       2.243e-01       -1.439e-01
        75%    -4.822e+01       4.725e-01       -1.374e-01
        max    -4.791e+01       5.255e-01       -1.309e-01
        <BLANKLINE>
        Likelihood-ratio test p-values
        ------------------------------
        <BLANKLINE>
               𝓗₀ vs 𝓗₂
        ----------------
        mean   4.294e-01
        std    1.966e-01
        min    2.024e-01
        25%    3.721e-01
        50%    5.418e-01
        75%    5.429e-01
        max    5.441e-01

        >>> from numpy import zeros
        >>>
        >>> nsamples = 50
        >>>
        >>> X = random.randn(nsamples, 2)
        >>> G = random.randn(nsamples, 100)
        >>> K = dot(G, G.T)
        >>> ntrials = random.randint(1, 100, nsamples)
        >>> z = dot(G, random.randn(100)) / sqrt(100)
        >>>
        >>> successes = zeros(len(ntrials), int)
        >>> for i, nt in enumerate(ntrials):
        ...     for _ in range(nt):
        ...         successes[i] += int(z[i] + 0.5 * random.randn() > 0)
        >>>
        >>> result = scan(X, successes, ("binomial", ntrials), K, verbose=False)
        >>> print(result)  # doctest: +FLOAT_CMP
        Hypothesis 0
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶, 0.152⋅𝙺 + 1.738⋅𝙸) for yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ) and g(x)=1/(1+e⁻ˣ)
        <BLANKLINE>
        M     = ['offset']
        𝜶     = [0.40956942]
        se(𝜶) = [0.55141166]
        lml   = -142.80784719977515
        <BLANKLINE>
        Hypothesis 2
        ------------
        <BLANKLINE>
        𝐳 ~ 𝓝(𝙼𝜶 + G𝛃, s(0.152⋅𝙺 + 1.738⋅𝙸)) for yᵢ ~ Binom(μᵢ=g(zᵢ), nᵢ) and g(x)=1/(1+e⁻ˣ)
        <BLANKLINE>
                  lml       cov. effsizes   cand. effsizes
        --------------------------------------------------
        mean   -1.425e+02       3.701e-01        2.271e-01
        std     4.110e-01       2.296e-02        5.680e-01
        min    -1.427e+02       3.539e-01       -1.745e-01
        25%    -1.426e+02       3.620e-01        2.631e-02
        50%    -1.425e+02       3.701e-01        2.271e-01
        75%    -1.423e+02       3.782e-01        4.279e-01
        max    -1.422e+02       3.864e-01        6.287e-01
        <BLANKLINE>
        Likelihood-ratio test p-values
        ------------------------------
        <BLANKLINE>
               𝓗₀ vs 𝓗₂
        ----------------
        mean   4.959e-01
        std    3.362e-01
        min    2.582e-01
        25%    3.771e-01
        50%    4.959e-01
        75%    6.148e-01
        max    7.336e-01

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar.linalg import economic_qs

    lik = normalize_likelihood(lik)

    # A0/A1 refine a trait design; they are meaningless without A.
    trait_design_missing = A is None
    if trait_design_missing and (A0 is not None or A1 is not None):
        raise ValueError("You cannot define `A0` or `A1` without defining `A`.")

    hide = not verbose
    with session_block("QTL analysis", disable=hide):

        with session_line("Normalising input... ", disable=hide):
            dataset = conform_dataset(Y, M, G=G, K=K)

        Y = dataset["y"]
        M = dataset["M"]
        G = dataset["G"]
        K = dataset["K"]
        assert_finite(Y, M, K)

        QS = None if K is None else economic_qs(K)

        if verbose:
            print()
            _print_input_info(idx, lik, Y, M, G, K)
            print()

        if trait_design_missing:
            builder = _single_trait_scan(idx, lik, Y, M, G, QS, verbose)
        else:
            builder = _multi_trait_scan(idx, lik, Y, M, G, QS, A, A0, A1, verbose)

        outcome = builder.create()
        if verbose:
            print()
            print(outcome)

        return outcome
def st_iscan(G, y, K=None, M=None, E0=None, E1=None, W_R=None, verbose=True):
    r"""
    Single-variant association and interaction testing.

    Parameters
    ----------
    G : array_like
        Candidate variants. They are conformed together with the other inputs
        and converted with ``numpy.asarray`` before testing.
    y : array_like
        Phenotype data.
    K : array_like, optional
        LMM-covariance / genetic relatedness matrix. If ``None``, a standard
        linear regression is considered (``W_R`` is then ignored). Otherwise
        its eigenvalue decomposition is computed internally, unless ``W_R`` is
        provided.
    M : array_like, optional
        Covariate design matrix.
    E0 : array_like, optional
        Interaction variables to be included in both the alternative and the
        null model. Ignored when ``E1`` is ``None``. When ``E1`` is given and
        ``E0`` is ``None``, a single column of ones is used so that E0-by-g
        equals g, i.e. an additive genetic effect is considered.
    E1 : array_like, optional
        Interaction variables interacting with the variant. If specified, the
        following tests are considered: (i) (E1&E0)-by-g vs no-genotype-effect;
        (ii) E0-by-g vs no-genotype-effect; (iii) (E1&E0)-by-g vs E0-by-g.
    W_R : array_like, optional
        If the LMM-covariance is low-rank, one can provide ``W_R`` such that
        the covariance equals ``dot(W_R, transpose(W_R))``. Only used when
        ``K`` is not ``None``.
    verbose : bool, optional
        If ``True``, progress details are displayed.

    Returns
    -------
    The value returned by ``_process`` for the fitted model(s).
    """
    from limix_lmm.lmm import LMM
    from limix_lmm.lmm_core import LMMCore
    from limix_core.gp import GP2KronSum, GP2KronSumLR
    from limix_core.covar import FreeFormCov
    from scipy.linalg import eigh
    from numpy import ones, var, concatenate, asarray

    # Null model for the interaction tests; stays None for pure association.
    lmm0 = None

    with session_block("single-trait association test", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        pheno = data["y"]
        covs = data["M"]
        cand = data["G"]
        kin = data["K"]

        if kin is None:
            # Case 1: plain linear model, no covariance solver needed.
            if verbose:
                print("Model: lm")
            solve_fn = None
        else:
            if W_R is not None:
                # Case 2: low-rank linear mixed model.
                if verbose:
                    print("Model: low-rank lmm")
                gp = GP2KronSumLR(
                    Y=pheno, Cn=FreeFormCov(1), G=W_R, F=covs, A=ones((1, 1))
                )
                gp.covar.Cr.setCovariance(var(pheno) * ones((1, 1)))
                gp.covar.Cn.setCovariance(var(pheno) * ones((1, 1)))
            else:
                # Case 3: full-rank linear mixed model.
                if verbose:
                    print("Model: lmm")
                S_R, U_R = eigh(kin)
                add_jitter(S_R)
                gp = GP2KronSum(
                    Y=pheno,
                    Cg=FreeFormCov(1),
                    Cn=FreeFormCov(1),
                    S_R=S_R,
                    U_R=U_R,
                    F=covs,
                    A=ones((1, 1)),
                )
                # Start from an even split of the phenotypic variance.
                gp.covar.Cr.setCovariance(0.5 * var(pheno) * ones((1, 1)))
                gp.covar.Cn.setCovariance(0.5 * var(pheno) * ones((1, 1)))

            gp.optimize(verbose=verbose)
            solve_fn = gp.covar.solve

        if E1 is None:
            # Association only: E0 is deliberately ignored.
            lmm = LMM(pheno, covs, solve_fn)
            E0 = None
        else:
            lmm = LMMCore(pheno, covs, solve_fn)
            if E0 is None:
                E0 = ones([pheno.shape[0], 1])
            # NOTE(review): this is truthy as soon as *any* entry of E0 equals
            # one -- confirm whether "all entries equal one" was intended.
            null_cls = LMM if (E0 == 1).sum() else LMMCore
            lmm0 = null_cls(pheno, covs, solve_fn)
            E1 = concatenate([E0, E1], 1)

        return _process(lmm, lmm0, asarray(cand), E0, E1)
def iscan(G, y, lik="normal", K=None, M=None, idx=None, E0=None, E1=None, verbose=True):
    r"""
    Single-trait association with interaction test via generalized linear mixed models.

    The general formulae for normally distributed traits is

    .. math::

        𝐲 = 𝙼𝛂 + (𝙶⊙𝙴₀)𝛃₀ + (𝙶⊙𝙴₁)𝛃₁ + 𝐮 + 𝛆,\\
        \text{where}~~ 𝐮∼𝓝(𝟎, 𝓋₀𝙺) ~~\text{and}~~ 𝛆∼𝓝(𝟎, 𝓋₁𝙸).

    The operator ⊙ works as follows:

    .. math::

        𝙰⊙𝙱 = [𝙰₀𝙱₀ ~~...~~ 𝙰₀𝙱ₙ ~~ 𝙰₁𝙱₀ ~~...~~ 𝙰₁𝙱ₙ ~~...~~ 𝙰ₘ𝙱ₙ]

    The covariates are encoded in matrix 𝙼 while the candidate set is encoded
    in matrix 𝙶. The parameters are the effect sizes 𝛂, 𝛃₀, and 𝛃₁, and the
    variances 𝓋₀ and 𝓋₁.

    It performs likelihood-ratio tests for the following cases, where the first
    hypothesis is the null one while the second hypothesis is the alternative
    one:

    - H₀ vs H₁: testing for vec(𝛃₀) ≠ 𝟎 while vec(𝛃₁) = 𝟎
    - H₀ vs H₂: testing for [vec(𝛃₀) vec(𝛃₁)] ≠ 𝟎
    - H₁ vs H₂: testing for vec(𝛃₁) ≠ 𝟎

    It also supports generalized linear mixed models (GLMM). In this case, the
    following likelihoods are implemented:

    - Bernoulli
    - Probit
    - Binomial
    - Poisson

    Formally, let p(𝜇) be one of the supported probability distributions where
    𝜇 is its mean. The most general model (H₂) is defined as follows:

    .. math::

        yᵢ ∼ p(𝜇ᵢ=g(zᵢ)) ~~\text{for}~~
        𝐳 ∼ 𝓝(𝙼𝛂 + (𝙶⊙𝙴₀)𝛃₀ + (𝙶⊙𝙴₁)𝛃₁, 𝓋₀𝙺 + 𝓋₁𝙸).

    g(⋅) is the corresponding canonical link function for the Bernoulli,
    Binomial, and Poisson likelihoods. The Probit likelihood, on the other
    hand, is a Bernoulli likelihood with probit link function.

    Parameters
    ----------
    G : n×m array_like
        Genetic candidates.
    y : array_like
        Phenotype. Rows are samples.
    lik : tuple, "normal", "bernoulli", "probit", "binomial", "poisson"
        Sample likelihood describing the residual distribution.
        Either a tuple or a string specifying the likelihood is required. The
        Normal, Bernoulli, Probit, and Poisson likelihoods can be selected by
        providing a string. Binomial likelihood on the other hand requires a
        tuple because of the number of trials: ``("binomial", array_like)``.
        Defaults to ``"normal"``.
    K : n×n array_like
        Sample covariance, often the so-called kinship matrix.
    M : n×c array_like
        Covariates matrix.
    idx : list
        List of candidate indices that defines the set of candidates to be
        used in the tests. If ``None``, every column of ``G`` is tested.
    E0 : array_like
        Matrix representing the first environment. Defaults to a matrix with
        one row per sample and no columns.
    E1 : array_like
        Matrix representing the second environment. Defaults to a single
        column of ones.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    result : :class:`limix.qtl._result.IScanResult`
        P-values, log of marginal likelihoods, effect sizes, and associated
        statistics.

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed.
    Please, refer to the :func:`limix.qc.mean_impute` function for missing
    value imputation.
    """
    from numpy_sugar.linalg import economic_qs
    from xarray import concat
    from numpy import asarray, empty, ones

    lik = normalize_likelihood(lik)
    lik_name = lik[0]

    with session_block("QTL analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=K)

        Y = data["y"]
        M = data["M"]
        G = data["G"]
        K = data["K"]

        # Validate and size everything from the *conformed* phenotype, not the
        # raw ``y`` argument, so that the check and the default environments
        # agree with the data that is actually modelled (as done in ``scan``).
        assert_finite(Y, M, K)
        nsamples = Y.shape[0]

        if E1 is None:
            E1 = ones((nsamples, 1))

        if E0 is None:
            E0 = empty((nsamples, 0))

        E0 = _asarray(E0, "env0", ["sample", "env"])
        E1 = _asarray(E1, "env1", ["sample", "env"])
        E01 = concat([E0, E1], dim="env")

        QS = economic_qs(K) if K is not None else None

        if lik_name == "normal":
            scanner, v0, v1 = _lmm(Y.values.ravel(), M.values, QS, verbose)
        else:
            scanner, v0, v1 = _glmm(Y.values.ravel(), lik, M.values, QS, verbose)

        r = IScanResultFactory(
            lik_name,
            Y.trait,
            M.covariate,
            G.candidate,
            E0.env,
            E1.env,
            scanner.null_lml,
            scanner.null_beta,
            scanner.null_beta_se,
            v0,
            v1,
        )

        if idx is None:

            assert E1.shape[1] > 0
            idx = range(G.shape[1])

            # Without a first environment, H₁ reduces to the null model for
            # all candidates, so it is computed once up front.
            if E0.shape[1] == 0:
                r1 = scanner.fast_scan(G, verbose)

            for i in idx:
                i = _2d_sel(i)
                g = asarray(G[:, i], float)

                if E0.shape[1] > 0:
                    r1 = scanner.scan(g, E0)
                    h1 = _normalise_scan_names(r1)
                else:
                    # Pick this candidate's slice out of the batched result.
                    h1 = _normalise_scan_names({k: v[i] for k, v in r1.items()})
                    h1["covariate_effsizes"] = h1["covariate_effsizes"].ravel()
                    h1["covariate_effsizes_se"] = h1["covariate_effsizes_se"].ravel()

                r2 = scanner.scan(g, E01)
                h2 = _normalise_scan_names(r2)
                r.add_test(i, h1, h2)
        else:
            for i in idx:
                i = _2d_sel(i)
                g = asarray(G[:, i], float)

                r1 = scanner.scan(g, E0)
                r2 = scanner.scan(g, E01)

                h1 = _normalise_scan_names(r1)
                h2 = _normalise_scan_names(r2)
                r.add_test(i, h1, h2)

        r = r.create()
        if verbose:
            print(r)

    return r
def st_sscan(G, y, E, M=None, tests=None, verbose=True):
    """Mixed-model with genetic effect heterogeneity.

    Parameters
    ----------
    G : array_like
        Candidate variants; one test is run per column.
    y : array_like
        Phenotype data.
    E : array_like
        Environments data. It is handed to ``StructLMM`` both as the
        environment matrix and as ``W``.
    M : array_like, optional
        Covariate design matrix.
    tests : list, optional
        Which tests are performed. Element list values are ``'inter'`` and
        ``'assoc'``. By default, only the interaction test is considered.
    verbose : bool, optional
        If ``True``, progress details are displayed.

    Returns
    -------
    pandas.DataFrame
        Column ``"pvi"`` holds the interaction-test p-values and column
        ``"pva"`` the association-test p-values, one row per candidate.
        The column of a test that was not requested is left filled with
        zeros.

    Notes
    -----
    The association test uses the fixed grid
    ``rho=[0, 0.1**2, 0.2**2, 0.3**2, 0.4**2, 0.5**2, 0.5, 1.]``. The choice of
    ``rho`` affects the statistical power of the test (for more information
    see the StructLMM paper).
    """
    from struct_lmm import StructLMM
    from numpy import zeros, hstack, asarray
    from pandas import DataFrame

    rhos = [0.0, 0.1**2, 0.2**2, 0.3**2, 0.4**2, 0.5**2, 0.5, 1.0]

    with session_block("struct-lmm analysis", disable=not verbose):

        with session_line("Normalising input... ", disable=not verbose):
            data = conform_dataset(y, M, G=G, K=None)

        y = data["y"]
        M = data["M"]
        G = data["G"]

        if tests is None:
            tests = ["inter"]

        run_inter = "inter" in tests
        run_assoc = "assoc" in tests

        if run_inter:
            slmi = StructLMM(asarray(y, float), E, W=E, rho_list=[0])

        if run_assoc:
            # The association null model does not depend on the candidate,
            # so it is fitted only once.
            slmm = StructLMM(asarray(y, float), E, W=E, rho_list=rhos)
            slmm.fit_null(F=asarray(M, float), verbose=False)

        pv_inter = zeros(G.shape[1])
        pv_assoc = zeros(G.shape[1])

        for j in range(G.shape[1]):
            x = asarray(G[:, [j]], float)

            if run_inter:
                # Interaction test: the candidate itself enters the null
                # model as an extra covariate.
                covs_with_x = hstack((M, x))
                slmi.fit_null(F=covs_with_x, verbose=False)
                pv_inter[j] = slmi.score_2_dof(x)

            if run_assoc:
                # Association test.
                pv_assoc[j] = slmm.score_2_dof(x)

    return DataFrame(OrderedDict([("pvi", pv_inter), ("pva", pv_assoc)]))