Exemplo n.º 1
0
def test_lmm_scan_fast_scan():
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0
    y = _outcome_sample(random, offset, X)
    QS = economic_qs_linear(X)
    M0 = random.randn(n, 2)
    M1 = random.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    v0 = lmm.v0
    v1 = lmm.v1
    K = v0 * X @ X.T + v1 * eye(n)
    M = concatenate((M0, M1[:, [0]]), axis=1)

    def fun(x):
        beta = x[:3]
        scale = exp(x[3])
        return -st.multivariate_normal(M @ beta, scale * K).logpdf(y)

    res = minimize(fun, [0, 0, 0, 0])
    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(M1, verbose=False)

    assert_allclose(r["lml"][0], -res.fun)
    assert_allclose(r["effsizes0"][0], res.x[:2], rtol=1e-5)
    assert_allclose(r["effsizes1"][0], res.x[2:3], rtol=1e-5)
    assert_allclose(r["scale"][0], exp(res.x[3]), rtol=1e-5)
Exemplo n.º 2
0
def test_fast_scanner_set_scale_1covariate_redundant():
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0

    y = _outcome_sample(random, offset, X)

    QS = economic_qs_linear(X)

    M = random.randn(n, 1)
    lmm = LMM(y, M, QS)

    lmm.fit(verbose=False)

    markers = M.copy()

    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(markers, verbose=False)
    assert_allclose(r["lml"][0], -22.357525517597185, rtol=1e-6)
    assert_allclose(r["effsizes0"], [[0.029985622694805182]])
    assert_allclose(r["effsizes1"][0],
                    0.02998562491058301,
                    rtol=1e-6,
                    atol=1e-6)
    assert_allclose(r["scale"], [1.0], rtol=1e-6)
Exemplo n.º 3
0
def test_fast_scanner_set_scale_1covariate():
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0

    y = _outcome_sample(random, offset, X)

    QS = economic_qs_linear(X)

    M = random.randn(n, 1)
    lmm = LMM(y, M, QS)

    lmm.fit(verbose=False)
    assert_allclose(lmm.scale, 5.282731934070453)
    assert_allclose(lmm.delta, 0.7029974630034005)
    assert_allclose(lmm.beta, [0.0599712498212])

    markers = M.copy() + random.randn(n, 1)

    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(markers, verbose=False)

    assert_allclose(r["lml"], [-21.509721], rtol=1e-6)
    assert_allclose(r["effsizes0"], [[-1.43206379971882]])
    assert_allclose(r["effsizes1"], [1.412239], rtol=1e-6)
    assert_allclose(r["scale"], [0.8440354018505616], rtol=1e-6)

    beta = lmm.beta
    assert_allclose(
        scanner.fast_scan(zeros((10, 1)), verbose=False)["effsizes0"][0], beta
    )
Exemplo n.º 4
0
def test_fast_scanner_set_scale_multicovariates():
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0

    y = _outcome_sample(random, offset, X)

    QS = economic_qs_linear(X)

    M = random.randn(n, 3)
    lmm = LMM(y, M, QS)

    lmm.fit(verbose=False)

    markers = M.copy()

    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(markers, verbose=False)

    want = [-19.318845, -19.318845, -19.318845]
    assert_allclose(r["lml"], want, rtol=1e-6, atol=1e-6)

    assert_allclose(
        r["effsizes0"][2],
        [-0.6923007382350215, 2.3550810825973034, -0.38157769653894497],
        rtol=1e-5,
    )

    want = [-0.34615, 1.177541, -0.381578]
    assert_allclose(r["effsizes1"], want, rtol=1e-6, atol=1e-6)
    assert_allclose(r["scale"], [1.0, 1.0, 1.0])
Exemplo n.º 5
0
def test_lm():
    random = RandomState(0)

    (y, X, _) = _full_rank(random)
    lmm = LMM(y, X)
    lmm.fit(verbose=False)
    assert_allclose(lmm.v0, 2.0129061033356781e-16, atol=1e-7)
    assert_allclose(lmm.v1, 0.9065323176914355)
    assert_allclose(lmm.beta, [0.24026567104188318, -0.17873180599015123])
Exemplo n.º 6
0
 def _fit_lmm_simple_model(self, verbose):
     from numpy_sugar.linalg import economic_qs
     from glimix_core.lmm import LMM
     from numpy import asarray
     K = self._get_matrix_simple_model()
     y = asarray(self._y, float).ravel()
     QS = None
     if K is not None:
         QS = economic_qs(K)
     lmm = LMM(y, self._M, QS)
     lmm.fit(verbose=verbose)
     self._set_simple_model_variances(lmm.v0, lmm.v1)
     self._glmm = lmm
Exemplo n.º 7
0
def estimate(y, lik, K, M=None, verbose=True):
    from numpy_sugar.linalg import economic_qs
    from numpy import pi, var, diag
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM
    from limix._data._assert import assert_likelihood
    from limix._data import normalize_likelihood, conform_dataset
    from limix.qtl._assert import assert_finite
    from limix._display import session_block, session_line
    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    with session_block("Heritability analysis", disable=not verbose):
        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y, M=M, K=K)
        y = data["y"]
        M = data["M"]
        K = data["K"]
        assert_finite(y, M, K)
        if K is not None:
            # K = K / diag(K).mean()
            QS = economic_qs(K)
        else:
            QS = None
        if lik_name == "normal":
            method = LMM(y.values, M.values, QS, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)
        g = method.scale * (1 - method.delta)
        e = method.scale * method.delta
        if lik_name == "bernoulli":
            e += pi * pi / 3
        v = var(method.mean())
        return g, v, e
Exemplo n.º 8
0
def estimate(y_phe, lik, kin, marker_mat=None, verbose=True):
    ''' estimate variance components '''
    lik = normalize_likelihood(lik)
    lik_name = lik[0]
    with session_block("Heritability analysis", disable=not verbose):
        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y_phe, M=marker_mat, K=kin)
        y_phe = data["y"]
        marker_mat = data["M"]
        kin = data["K"]
        assert_finite(y_phe, marker_mat, kin)
        if kin is not None:
            # K = K / diag(K).mean()
            q_s = economic_qs(kin)
        else:
            q_s = None
        if lik_name == "normal":
            method = LMM(y_phe.values, marker_mat.values, q_s, restricted=True)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y_phe, lik, marker_mat.values, q_s, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)
        v_g = method.scale * (1 - method.delta)
        v_e = method.scale * method.delta
        if lik_name == "bernoulli":
            v_e += pi * pi / 3
        v_v = var(method.mean())
        return v_g, v_v, v_e
Exemplo n.º 9
0
Arquivo: _iscan.py Projeto: phue/limix
def _lmm(y, M, QS, verbose):
    from glimix_core.lmm import LMM

    lmm = LMM(y, M, QS, restricted=False)
    lmm.fit(verbose=verbose)
    sys.stdout.flush()

    if QS is None:
        v0 = None
    else:
        v0 = lmm.v0
    v1 = lmm.v1
    scanner = ScannerWrapper(lmm.get_fast_scanner())

    return scanner, v0, v1
Exemplo n.º 10
0
Arquivo: bf.py Projeto: phue/limix
    def calc_lml(self, Env):
        from numpy import ones, concatenate
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear

        _covs = concatenate([self.F, self.W, self.x], 1)
        if Env.shape[1] == 0:
            xoE = ones(self.x.shape)
        else:
            xoE = self.x * Env

        QS = economic_qs_linear(xoE)
        gp = LMM(self.y, _covs, QS, restricted=True)
        gp.fit(verbose=False)
        return gp.lml()
Exemplo n.º 11
0
def _fit_cis_herit(y, K_cis, X=None, compute_lrt=True):
    log = logging.getLogger(pyfocus.LOG)

    try:
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear
    except ImportError as ie:
        log.error(
            "Training submodule requires glimix-core>=2.0.0 and numpy-sugar to be installed."
        )
        raise

    from scipy.stats import norm
    from scipy.linalg import lstsq

    if X is None:
        X = np.ones((len(y), 1))

    K_cis = economic_qs_linear(K_cis)
    lmm = LMM(y, X, K_cis)
    lmm.fit(verbose=False)

    fixed_betas = lmm.beta
    logl_1 = lmm.lml()

    cis_scale = lmm.v0
    noise_scale = lmm.v1
    fe_scale = lmm.fixed_effects_variance

    if compute_lrt:
        n, p = X.shape
        # reduced model is just OLS regression for fixed-effects
        fixed_betas_0, sosqs, ranks, svals = lstsq(X, y)
        s2e = sosqs / len(
            y
        )  # LMM also uses MLE estimation, so don't adjust for bias right now

        logl_0 = np.sum(
            norm.logpdf(y, loc=np.dot(X, fixed_betas_0), scale=np.sqrt(s2e)))
        pval = _lrt_pvalue(logl_0, logl_1)
        log.debug("Estimated cis-h2g = {} (P = {})".format(
            cis_scale / (cis_scale + noise_scale + fe_scale), pval))
    else:
        pval = None
        log.debug("Estimated cis-h2g = {}".format(
            cis_scale / (cis_scale + noise_scale + fe_scale)))

    return fe_scale, cis_scale, noise_scale, logl_1, fixed_betas, pval
Exemplo n.º 12
0
def test_lmm_scan_lmm_iid_prior():
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    markers = random.randn(n, 2)

    offset = 1.0

    y = _outcome_sample(random, offset, X)

    lmm = LMM(y, ones((n, 1)), None)

    lmm.fit(verbose=False)
    scanner = lmm.get_fast_scanner()
    lmls = scanner.fast_scan(markers, verbose=False)["lml"]
    assert_allclose(lmls[:2], [-63.16019973550036, -62.489358539276715])
Exemplo n.º 13
0
def _st_lmm(Y, M, QS, verbose):
    from numpy import nan
    from glimix_core.lmm import LMM

    lmm = LMM(Y, M, QS, restricted=False)
    lmm.fit(verbose=verbose)
    sys.stdout.flush()

    if QS is None:
        v0 = nan
    else:
        v0 = lmm.v0

    v1 = lmm.v1

    return lmm.get_fast_scanner(), v0, v1
Exemplo n.º 14
0
def test_fast_scanner_redundant_candidates():
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0

    y = _outcome_sample(random, offset, X)

    QS = economic_qs_linear(X)

    M = ones((n, 5))
    lmm = LMM(y, M, QS, restricted=False)

    lmm.fit(verbose=False)

    markers = M.copy()

    scanner = lmm.get_fast_scanner()

    scanner.fast_scan(markers, verbose=False)
Exemplo n.º 15
0
def _perform_lmm(y, M, QS, G, verbose):
    from glimix_core.lmm import LMM
    from pandas import Series
    from xarray import DataArray

    lmm = LMM(y, M.values, QS)

    lmm.fit(verbose=verbose)
    sys.stdout.flush()

    null_lml = lmm.lml()

    beta = lmm.beta

    covariates = list(M.coords["covariate"].values)
    ncov_effsizes = Series(beta, covariates)

    flmm = lmm.get_fast_scanner()
    if hasattr(G, "data"):
        values = G.data
    else:
        values = G.values
    alt_lmls, effsizes = flmm.fast_scan(values, verbose=verbose)

    coords = {
        k: ("candidate", G.coords[k].values)
        for k in G.coords.keys()
        if G.coords[k].dims[0] == "candidate"
    }

    alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=coords)
    effsizes = DataArray(effsizes, dims=["candidate"], coords=coords)

    return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
Exemplo n.º 16
0
    def calc_opt_rho(self):
        import scipy as sp
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear

        _covs = sp.concatenate([self.F, self.W, self.x], 1)
        xoE = self.x * self.Env
        QS = economic_qs_linear(xoE)
        gp = LMM(self.y, _covs, QS, restricted=True)
        gp.fit(verbose=False)

        # variance heterogenenty
        var_xEEx = ((xoE - xoE.mean(0)) ** 2).sum()
        var_xEEx /= float(self.y.shape[0] - 1)
        v_het = gp.v0 * var_xEEx

        #  variance persistent
        v_comm = sp.var(gp.beta[-1] * self.x)

        rho = v_het / (v_comm + v_het)

        return rho
Exemplo n.º 17
0
def test_lmm_scan():
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    offset = 1.0
    y = _outcome_sample(random, offset, X)
    QS = economic_qs_linear(X)
    M0 = random.randn(n, 2)
    M1 = random.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    v0 = lmm.v0
    v1 = lmm.v1
    K = v0 * X @ X.T + v1 * eye(n)
    M = concatenate((M0, M1), axis=1)

    def fun(x):
        beta = x[:4]
        scale = exp(x[4])
        return -st.multivariate_normal(M @ beta, scale * K).logpdf(y)

    res = minimize(fun, [0, 0, 0, 0, 0])
    scanner = lmm.get_fast_scanner()
    r = scanner.scan(M1)

    assert_allclose(r["lml"], -res.fun)
    assert_allclose(r["effsizes0"], res.x[:2], rtol=1e-5)
    assert_allclose(r["effsizes1"], res.x[2:4], rtol=1e-5)
    assert_allclose(r["scale"], exp(res.x[4]), rtol=1e-5)
    K = r["scale"] * lmm.covariance()
    M = concatenate((M0, M1), axis=1)
    effsizes_se = sqrt(inv(M.T @ solve(K, M)).diagonal())
    assert_allclose(effsizes_se,
                    concatenate((r["effsizes0_se"], r["effsizes1_se"])))

    assert_allclose(scanner.null_lml(), -53.805721275578456, rtol=1e-5)
    assert_allclose(scanner.null_beta,
                    [0.26521964226797085, 0.4334778669761928],
                    rtol=1e-5)
    assert_allclose(
        scanner.null_beta_covariance,
        [
            [0.06302553593799207, 0.00429640179038484],
            [0.004296401790384839, 0.05591392416235412],
        ],
        rtol=1e-5,
    )
    assert_allclose(scanner.null_scale, 1.0)

    assert_allclose(scanner.null_beta, lmm.beta, rtol=1e-5)
    assert_allclose(scanner.null_beta_covariance,
                    lmm.beta_covariance,
                    rtol=1e-5)
Exemplo n.º 18
0
def test_lmm_predict():
    random = RandomState(9458)
    n = 30

    X = random.randn(n, n + 1)
    X -= X.mean(0)
    X /= X.std(0)
    X /= sqrt(X.shape[1])

    offset = 1.0

    mean = OffsetMean(n)
    mean.offset = offset

    cov_left = LinearCov(X)
    cov_left.scale = 1.5

    cov_right = EyeCov(n)
    cov_right.scale = 1.5

    cov = SumCov([cov_left, cov_right])

    lik = DeltaProdLik()

    y = GGPSampler(lik, mean, cov).sample(random)

    QS = economic_qs_linear(X)

    lmm = LMM(y, ones((n, 1)), QS)

    lmm.fit(verbose=False)

    plmm = LMMPredict(y, lmm.beta, lmm.v0, lmm.v1, lmm.mean(),
                      lmm.covariance())

    K = dot(X, X.T)
    pm = plmm.predictive_mean(ones((n, 1)), K, K.diagonal())
    assert_allclose(corrcoef(y, pm)[0, 1], 0.8358820971891354)
Exemplo n.º 19
0
def estimate(y, lik, K, M=None, verbose=True):
    r"""Estimate the so-called narrow-sense heritability.

    It supports Normal, Bernoulli, Probit, Binomial, and Poisson phenotypes.
    Let :math:`N` be the sample size and :math:`S` the number of covariates.

    Parameters
    ----------
    y : array_like
        Either a tuple of two arrays of `N` individuals each (Binomial
        phenotypes) or an array of `N` individuals (Normal, Poisson, or
        Bernoulli phenotypes). If a continuous phenotype is provided (i.e., a Normal
        one), make sure they have been normalised in such a way that its values are
        not extremely large; it might cause numerical errors otherwise. For example,
        by using :func:`limix.qc.mean_standardize` or
        :func:`limix.qc.quantile_gaussianize`.
    lik : "normal", "bernoulli", "probit", binomial", "poisson"
        Sample likelihood describing the residual distribution.
    K : array_like
        :math:`N`-by-:math:`N` covariance matrix. It might be, for example, the
        estimated kinship relationship between the individuals. The provided matrix will
        be normalised via the function :func:`limix.qc.normalise_covariance`.
    M : array_like, optional
        :math:`N` individuals by :math:`S` covariates.
        It will create a :math:`N`-by-:math:`1` matrix ``M`` of ones representing the offset
        covariate if ``None`` is passed. If an array is passed, it will used as is.
        Defaults to ``None``.
    verbose : bool, optional
        ``True`` to display progress and summary; ``False`` otherwise.

    Returns
    -------
    float
        Estimated heritability.

    Examples
    --------
    .. doctest::

        >>> from numpy import dot, exp, sqrt
        >>> from numpy.random import RandomState
        >>> from limix.her import estimate
        >>>
        >>> random = RandomState(0)
        >>>
        >>> G = random.randn(150, 200) / sqrt(200)
        >>> K = dot(G, G.T)
        >>> z = dot(G, random.randn(200)) + random.randn(150)
        >>> y = random.poisson(exp(z))
        >>>
        >>> print('%.3f' % estimate(y, 'poisson', K, verbose=False))  # doctest: +FLOAT_CMP
        0.183

    Notes
    -----
    It will raise a ``ValueError`` exception if non-finite values are passed. Please,
    refer to the :func:`limix.qc.mean_impute` function for missing value imputation.
    """
    from numpy_sugar import is_all_finite
    from numpy_sugar.linalg import economic_qs
    from numpy import ones, pi, var
    from glimix_core.glmm import GLMMExpFam
    from glimix_core.lmm import LMM

    if not isinstance(lik, (tuple, list)):
        lik = (lik,)

    lik_name = lik[0].lower()
    check_likelihood_name(lik_name)

    with session_block("heritability analysis", disable=not verbose):

        if M is None:
            M = ones((len(y), 1))

        with session_line("Normalising input...", disable=not verbose):
            data = conform_dataset(y, M=M, K=K)

        y = data["y"]
        M = data["M"]
        K = data["K"]

        if not is_all_finite(y):
            raise ValueError("Outcome must have finite values only.")

        if not is_all_finite(M):
            raise ValueError("Covariates must have finite values only.")

        if K is not None:
            if not is_all_finite(K):
                raise ValueError("Covariate matrix must have finite values only.")

            K = normalise_covariance(K)

        y = normalise_extreme_values(y, lik)

        if K is not None:
            QS = economic_qs(K)
        else:
            QS = None

        if lik_name == "normal":
            method = LMM(y.values, M.values, QS)
            method.fit(verbose=verbose)
        else:
            method = GLMMExpFam(y, lik, M.values, QS, n_int=500)
            method.fit(verbose=verbose, factr=1e6, pgtol=1e-3)

        g = method.scale * (1 - method.delta)
        e = method.scale * method.delta
        if lik_name == "bernoulli":
            e += pi * pi / 3

        if lik_name == "normal":
            v = method.fixed_effects_variance
        else:
            v = var(method.mean())

        return g / (v + g + e)
def run_QTL_analysis(pheno_filename, anno_filename, geno_prefix, plinkGenotype, output_dir, window_size=250000,
                     min_maf=0.05, min_hwe_P=0.001, min_call_rate=None, blocksize=1000, cis_mode=True,
                     skipAutosomeFiltering = False, gaussianize_method=None, minimum_test_samples= 10,
                     seed=np.random.randint(40000), n_perm=0, write_permutations = False, write_feature_top_permutations = False,
                     relatedness_score=0.95, feature_variant_covariate_filename = None, snps_filename=None, feature_filename=None,
                     snp_feature_filename=None, genetic_range='all', covariates_filename=None, randomeff_filename=None,
                     sample_mapping_filename=None, extended_anno_filename=None, regressCovariatesUpfront = False, debugger=False):
    #Manual flag to set pearson (True), spearman (False). TODO add rank as an option to gaussnorm.
    pearson=True
    
    if regressCovariatesUpfront is not None:
        #This implementation can only handle regression before the association test (correlation).
        regressCovariatesUpfront= True
    tot_time = 0
    idx = 0
    
    print(relatedness_score)

    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0, copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep=0.1

    '''Core function to take input and run QTL tests on a given chromosome.'''
    
    # Check if relatedness_score is present as a measure of genotype similarity and hence, of sample similarity.
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)

    # Intersect files together to list the amount of samples with enough files
    if debugger:
        fun_start = time.time()
    [phenotype_df, kinship_df, randomeff_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen,
     chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename,
                    anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode,
                    skipAutosomeFiltering = skipAutosomeFiltering, minimum_test_samples= minimum_test_samples,
                    relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename,
                    snp_feature_filename=snp_feature_filename, selection=genetic_range,covariates_filename=covariates_filename,
                    randomeff_filename=randomeff_filename, sample_mapping_filename=sample_mapping_filename,
                    extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)
    if debugger:
        fun_end = time.time()
        print(" Intersecting files took {}".format(fun_end-fun_start))

    # Check if kinship matrix is present. The matrix of pairwise genotype similarity. If they are not present took genetically unique individuals based on IDs
    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None) :
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    
    # Check if feature list is empty (genes)
    if(feature_list==None or len(feature_list)==0):
        print ('No features to be tested.')
        sys.exit()

    #Open output files
    if debugger:
        fun_start = time.time()
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None :
        output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
    else :
        output_writer = qtl_output.hdf5_writer(output_dir+'/qtl_results_{}.h5'.format(chromosome))
    if(write_permutations):
        if not selectionStart is None :
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd),n_perm)
        else :
            permutation_writer = qtl_output.hdf5_permutations_writer(output_dir+'/perm_results_{}.h5'.format(chromosome),n_perm)
    if debugger:
        fun_end = time.time()
        print(" Opening writing files took {}".format(fun_end-fun_start))

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    random_eff_param = []
    na_containing_features=0
    currentFeatureNumber=0
    snpQcInfoMain = None
    random_eff_param = []
    log = {}
    rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    Sigma = {}
    Sigma_qs = {}
    randomeff_mix = False
    
    #############################################################################################################################################################
    # Per feature LMM fitting
    #############################################################################################################################################################

    # start per feature computations
    for feature_id in feature_list:
        gc.collect()
        start_time = time.time()
        # log file production for rho values storing and computation time
        log[(feature_id)] = []
        # feature specific parameters for QS mixing
        mixingParameters = {}
        feature_best_rho = -1
        snpQcInfo = None
        # counter
        currentFeatureNumber+= 1

        ########################################################################################################################################################
        # check if enough phenotype samples to test this gene
        if (len(phenotype_df.loc[feature_id,:]))<minimum_test_samples:
            print("Feature: "+feature_id+" not tested not enough samples do QTL test (n="+str(len(phenotype_df.loc[feature_id,:]))+").")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        ########################################################################################################################################################

        ########################################################################################################################################################
        # SNP selection based on gene location and window size
        if debugger:
            fun_start = time.time()
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df, bim, cis_mode, window_size, skipAutosomeFiltering)
        if debugger:
            fun_end = time.time()
            print("SNP querying took {}".format(fun_end-fun_start))

        snp_cov_df = None
        if debugger:
            fun_start = time.time()
        ########################################################################################################################################################

        #########################################################################################################################################################
        # Check if a matrix of variant covariance is present. Like for example PCs covariates -- Understand better covariates to SNP
        if(feature_variant_covariate_df is not None):
            if(feature_id in feature_variant_covariate_df['feature_id'].values):
                # array of covariates per SNP and feature
                covariateSnp = feature_variant_covariate_df['snp_id'].values[feature_variant_covariate_df['feature_id']==feature_id]
                if(any(i in  bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[bim['snp'].map(lambda x: x in list(covariateSnp)),:]
                    if(plinkGenotype):
                        snp_cov_df = pd.DataFrame(data=bed[snpQuery_cov['i'].values,:].compute().transpose(),index=fam.index,columns=snpQuery_cov['snp'],)
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print('Warning, during the regression of SNPs we assume ploidy 2.')
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i'] :
                            geno = bgen["genotype"][snpId].compute()
                            if(geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float)
                                snp_df_dosage_t[(np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep)] = float('NaN')
                            else :
                                snp_df_dosage_t = (geno["probs"][:,0]* 2)+geno["probs"][:,1]
                                snp_df_dosage_t[np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t, index= fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber +1
                        snp_cov_df = snp_cov_df_t.transpose()
                        snp_cov_df_t = None
        if debugger:
            fun_end = time.time()
            print(" Selecting feature variant covariate took {}".format(fun_end-fun_start))
        ########################################################################################################################################################

        ########################################################################################################################################################
        # Check the number of SNP to be tested and look if there is some SNP or feature filtering requirement
        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(np.unique(snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature_id']==feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: "+feature_id+" not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue   
        ########################################################################################################################################################

        else:
            # selecting phenotype array
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            #####################################################################################################################################################
            # check for missing samples according to specific feature otherwise use previous SNP QC information
            if(contains_missing_samples):
                print ('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features+1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,'iid'].values
            sample2individual_feature= sample2individual_df.loc[phenotype_ds.index]
            
            if(contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids,individual_ids], relatedness_score);
                else:
                    geneticaly_unique_individuals = individual_ids
            #####################################################################################################################################################

            else:
                #If no missing samples we can use the previous SNP QC information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(lambda x: x not in list(map(str, fail_qc_snps_all)))]

            #####################################################################################################################################################
            # check for enough samples for QTL test
            if phenotype_ds.empty or len(geneticaly_unique_individuals)<minimum_test_samples :
                print("Feature: "+feature_id+" not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: "+feature_id+" has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            #####################################################################################################################################################

            print ('For feature: ' +str(currentFeatureNumber)+ '/'+str(len(feature_list))+ ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.')

            ##########################################################################################################################################################
            # SNP TESTING
            ##########################################################################################################################################################
            
            if(n_perm!=0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)
            
            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[sample2individual_df['iid'],sample2individual_df['iid']].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].index) if randomeff_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index) if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                '''
                #######################################################################################################################################################
                # look if kinship or other random effect dataframe are present and QS computation !
                
                if debugger:
                    fun_start = time.time()
                if kinship_df is not None and randomeff_df is None:
                    kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                    kinship_mat = kinship_mat.astype(float)

                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if(QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                # combining the two matrices
                if kinship_df is not None and randomeff_df is not None:
                    #Here we need to match names and make sure that the order is the same and the right samples get mixed.
                    randomeff_mix = True
                    if(not Sigma_qs and not contains_missing_samples):
                        kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                        kinship_mat = kinship_mat.astype(float)
                        
                        randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values
                        randomeff_mat = randomeff_mat.astype(float)
                        for rho in rho1:
                            Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat
                            Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum())
                            Sigma_qs[rho] = economic_qs(Sigma[rho])

                    elif (contains_missing_samples):
                        #To fix: this needs to be reset after running with missing samples. Now Missing!!
                        kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
                        kinship_mat = kinship_mat.astype(float)

                        randomeff_mat = randomeff_df.loc[sample2individual_feature['sample'],sample2individual_feature['sample']].values
                        randomeff_mat = randomeff_mat.astype(float)

                        for rho in rho1:
                            Sigma[rho] = rho * kinship_mat + (1 - rho) * randomeff_mat
                            ##GOWER normalization of Kinship matrix.
                            Sigma[rho] *= (Sigma[rho].shape[0] - 1) / (Sigma[rho].trace() - Sigma[rho].mean(0).sum())
                            Sigma_qs[rho] = economic_qs(Sigma[rho])

                # if kinship_df is None and randomeff_df is not None: 
                #     randomeff_mat = randomeff_df.loc[individual_ids,individual_ids].values
                #     randomeff_mat = randomeff_mat.astype(float)

                #     ##GOWER normalization of Kinship matrix.
                #     randomeff_mat *= (randomeff_mat.shape[0] - 1) / (randomeff_mat.trace() - randomeff_mat.mean(0).sum())
                #     ## This needs to go with the subselection stuff.
                #     if(QS is None and not contains_missing_samples):
                #         QS = economic_qs(randomeff_mat)
                #     elif (contains_missing_samples):
                #         QS = economic_qs(randomeff_mat)

                # creating a fake QS if none random effect is present or use the read depth one
                if kinship_df is None:
                    if randomeff_df is None:
                        K = np.eye(len(phenotype_ds.index))
                        if(QS is None and not contains_missing_samples):
                            QS = economic_qs(K)
                        elif (contains_missing_samples):
                            QS = economic_qs(K)
                    else:
                        if(QS is None and not contains_missing_samples):
                            QS = economic_qs(randomeff_df)
                        elif (contains_missing_samples):
                            QS = economic_qs(randomeff_df)

                if debugger:
                    fun_end = time.time()
                    print(" Computing QS took {}".format(fun_end-fun_start))
                #######################################################################################################################################################

                #######################################################################################################################################################
                # covariance matrix setting
                cov_matrix =  covariate_df.loc[sample2individual_feature['sample'],:].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids,:]
                    snp_cov_df_tmp.index=sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index=snp_cov_df_tmp.index
                    snp_cov_df.columns=snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate((cov_matrix,snp_cov_df.values),1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
                #######################################################################################################################################################
            else:
                print ('There is an issue in mapping phenotypes vs covariates and/or kinship')
                sys.exit()

            ###########################################################################################################################################################
            # force normal distribution of expression values
            phenotype_ds = pd.Series(data= utils.force_normal_distribution(phenotype_ds.values,method=gaussianize_method) if gaussianize_method is not None else phenotype_ds.values,index=phenotype_ds.index,name=phenotype_ds.name)
            ###########################################################################################################################################################
            
            ##########################################################################################################################################################
            # Regressing up covariates
            if debugger:
                fun_start = time.time()
            if regressCovariatesUpfront:
                phenotype = phenotype_ds.values
                ##Using LMM/LM to regress covariates up front. 
                # Computing Null Model 
                if debugger:
                    fun_start = time.time()
                if randomeff_mix:
                    mixingParameters = utils.rhoTest(None, phenotype,cov_matrix,Sigma_qs,mixed, None)
                    lmm = mixingParameters["lmm"]
                    log[(feature_id)].append(mixingParameters["rho"])
                    feature_best_rho = mixingParameters["rho"]
                
                    if mixingParameters["rho"]!=0:
                        print("Random effect has influence, mixing parameter: "+str(mixingParameters["rho"]))
                    else :
                        print("Only kinship has effect.")
                    
                else:
                    lmm = LMM(phenotype, cov_matrix, QS)
                    if not mixed:
                         lmm.delta = 1
                         lmm.fix('delta')
                    lmm.fit(verbose=False)
                if debugger:
                    fun_end = time.time()
                    print(" Computing Null model took {}".format(fun_end-fun_start))
                #Replace phenotype with corrected phenotype:
                phenotype_ds = pd.Series(data= (phenotype-cov_matrix[:,1:].dot(lmm.beta[1:])),index=phenotype_ds.index,name=phenotype_ds.name)
            
            if debugger:
                fun_end = time.time()
                print(" Regressing Covariates took {}".format(fun_end-fun_start))
            ##########################################################################################################################################################
            if debugger:
                fun_start = time.time()
            ##########################################################################################################################################################
            # Fast scanning - iterate according to a blocksize 
            countChunker = 0
            
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker=countChunker+1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if debugger:
                    fun_start = time.time()
                ##########################################################################################################################################################
                # SNP dataframe creation
                if(plinkGenotype):
                    snp_df = pd.DataFrame(data=bed[snp_idxs,:].compute().transpose(),index=fam.index,columns=snp_names)
                else :
                    snp_df_dosage = pd.DataFrame(np.nan,index=fam.index, columns = snp_names)
                    snp_df = pd.DataFrame(np.nan,index=fam.index, columns = snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs :
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min()>1 & geno["ploidy"].max()<3) :
                            if(geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:,[0,2]].sum(1).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:,:2], axis=1)-1)+np.abs(np.argmax(geno["probs"][:,2:4], axis=1)-1)).astype(float)
                                naId = (np.amax(geno["probs"][:,:2],1)+np.amax(geno["probs"][:,2:4],1))<(1+minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else :
                                snp_df_dosage_t = ((geno["probs"][:,0]* 2)+geno["probs"][:,1]).astype(float)
                                snp_df_t = (np.abs(np.argmax(geno["probs"][:,:3], axis=1)-2)).astype(float)
                                naId = np.amax(geno["probs"][:,:3],1)<((1/3)+minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:,snp_names[rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:,snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber +1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids,:]
                
                snp_df = snp_df.loc[individual_ids,:]
            
                snp_df = snp_df.loc[:,np.unique(snp_df.columns)[np.unique(snp_df.columns,return_counts=1)[1]==1]]
                if debugger:
                    fun_end = time.time()
                    print(" Subsetting genotype matrix took {}".format(fun_end-fun_start))

                ##########################################################################################################################################################
                #SNP QC.
                if debugger:
                    fun_start = time.time()
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:,snp_df.columns[~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df.iloc[np.unique(snps_to_test_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snps_to_test_df, min_call_rate, min_maf, min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df.iloc[np.unique(snp_df.index,return_index=1)[1]].loc[geneticaly_unique_individuals,:], min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names,failed_snp_names,call_rate,maf,hweP = do_snp_qc(snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,snp_df.columns[snp_df.columns.isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat([snpQcInfo_t,maf.reindex(snpQcInfo_t.index)],axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat([snpQcInfo_t,hweP.reindex(snpQcInfo_t.index)],axis=1)
                if debugger:
                    fun_end = time.time()
                    print(" SNP quality control took {}".format(fun_end-fun_start))
                ##########################################################################################################################################################

                call_rate = None
                maf = None
                hweP = None
                
                
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t], axis=0, sort = False)
                ##First process SNPQc than check if si can continue.
                if len(snp_df.columns) == 0:
                    continue
                ##If we use bgen we replace the genotypes here to only have the dosage matrix in mem. Trying to save some memory.
                if (not plinkGenotype):
                    snp_df= snp_df_dosage.loc[:,np.unique(snp_df.columns)]
                    snp_df_dosage = None
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),index=snp_df.index,columns=snp_df.columns)
                ##No more snp_matrix_DF > snp_df
#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[phenotype_ds.index]['iid']) or not all(snp_df.index==sample2individual_feature.loc[phenotype_ds.index]['iid'])):
                    print ('There is an issue in mapping phenotypes and genotypes')
                    sys.exit()

                ##########################################################################################################################################################
                # SCANNING
                if debugger:
                    fun_start = time.time()
                
                rho = [None] * snp_df.shape[1]
                pVal = [None] * snp_df.shape[1]
                if pearson: 
                    for snpPos in range(0, snp_df.shape[1]) :
                        rho[snpPos], pVal[snpPos] = sp.stats.pearsonr(snp_df.values[:,snpPos], phenotype_ds.values)
                else: 
                    for snpPos in range(0, snp_df.shape[1]) :
                        rho[snpPos], pVal[snpPos] = sp.stats.spearmanr(snp_df.values[:,snpPos], phenotype_ds.values)
                
                if debugger:
                    fun_end = time.time()
                    print(" Actual scanning took {}".format(fun_end-fun_start))
                #########################################################################################################################################################
                
                #add these results to qtl_results
                temp_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['feature_id','snp_id','p_value','beta','beta_se','empirical_feature_p_value'])
                temp_df['snp_id'] = snp_df.columns
                temp_df['feature_id'] = feature_id.replace("/","-")
                temp_df['beta'] = np.asarray(rho)
                temp_df['p_value'] = np.asarray(pVal)
                
                #insert default dummy value
                temp_df['beta_se'] = None
                temp_df['empirical_feature_p_value'] = -1.0
                
                ##########################################################################################################################################################
                # SCANNING
                if debugger:
                    fun_start = time.time()
                if(n_perm!=0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (snp_df.shape[1]*n_perm)
                    permutationStepSize = np.floor(n_perm/(totalSnpsToBeTested/blocksize))
                    if(permutationStepSize>n_perm):
                        permutationStepSize=n_perm
                    elif(permutationStepSize<1):
                        permutationStepSize=1
                    
                    if(write_permutations):
                        print("Not supported.")
                        #perm_df = pd.DataFrame(index = range(len(snp_df.columns)),columns=['snp_id'] + ['permutation_'+str(x) for x in range(n_perm)])
                        #perm_df['snp_id'] = snp_df.columns
                    for currentNperm in utils.chunker(list(range(1, n_perm+1)), permutationStepSize):
                        
                        if (kinship_df is not None) and (relatedness_score is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(geneticaly_unique_individuals, relatedness_score, snp_df,kinship_df.loc[individual_ids,individual_ids], len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(snp_df, len(currentNperm))
                        temp = temp.astype(float)
                        
                        var_pvalues_p = [None] * temp.shape[1]
                        if pearson: 
                            for snpPos in range(0, temp.shape[1]) :
                                rhoP, var_pvalues_p[snpPos] = sp.stats.pearsonr(temp[:,snpPos], phenotype_ds.values)
                        else : 
                            for snpPos in range(0, temp.shape[1]) :
                                rhoP, var_pvalues_p[snpPos] = sp.stats.spearmanr(temp[:,snpPos], phenotype_ds.values)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if(not(len(pValueBuffer)==totalSnpsToBeTested)):
                        print(len(pValueBuffer))
                        print(pValueBuffer)
                        print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(pValueBuffer,snp_df.shape[1]) :
                        #if(write_permutations):
                        #    perm_df['permutation_'+str(perm)] = relevantOutput
                        if(bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm+1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty :
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    #if(write_permutations):
                    #    permutation_writer.add_permutation_results_df(perm_df,feature_id)
                if debugger:
                    fun_end = time.time()
                    print(" Permutations took {}".format(fun_end-fun_start))
        #This we need to change in the written file.
        if debugger:
            fun_start = time.time()
        
        if not data_written :
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
            if n_perm>1 :
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(feature_id.replace("/","-"),bestPermutationPval, cis_mode)
                if write_feature_top_permutations:
                    np.savetxt(output_dir+"/Permutation.pValues."+feature_id.replace("/","-")+".txt",bestPermutationPval)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
            if randomeff_mix :
                random_eff_param.append(feature_best_rho)
        
        if contains_missing_samples:
            QS = None
            Sigma_qs = None
            geneticaly_unique_individuals = tmp_unique_individuals
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(output_dir+'/snp_qc_metrics_naContaining_feature_{}.txt'.format(feature_id.replace("/","-")),sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo], axis=0, sort=False)
            elif snpQcInfo is not None :
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        if debugger:
            fun_end = time.time()
            print(" Writing took {}".format(fun_end-fun_start))
        #if snpQcInfo is not None:
            #snpQcInfo2 = snpQcInfo.copy().transpose()
            #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
            
        print("Time: --- %s seconds ---" % (time.time() - start_time))
        tot_time += time.time() - start_time
        idx += 1
        print("Mean: --- %s seconds ---" % (tot_time/idx))
        log[(feature_id)].append((time.time() - start_time))
        log[(feature_id)].append(tot_time/idx)

    output_writer.close()

    if(write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if((len(feature_list)-len(fail_qc_features))==0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
        if not selectionStart is None :
            os.remove(output_dir+'qtl_results_{}_{}_{}.h5'.format(chromosome,selectionStart,selectionEnd))
        else :
            os.remove(output_dir+'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None :
        snpQcInfoMain['index']=snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat([snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if(snp_df.shape[1]==5):
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate']
        elif(snp_df.shape[1]==6):
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf']
        else :
            snp_df.columns = ['snp_id','chromosome','position','assessed_allele','call_rate','maf','hwe_p']

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if(n_perm>1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    
    if randomeff_mix:
        annotation_df['rho'] = random_eff_param

    if not selectionStart is None :
        snp_df.to_csv(output_dir+'/snp_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t',index=False)
        annotation_df.to_csv(output_dir+'/feature_metadata_{}_{}_{}.txt'.format(chromosome,selectionStart,selectionEnd),sep='\t')
    else :
        snp_df.to_csv(output_dir+'/snp_metadata_{}.txt'.format(chromosome),sep='\t',index=False)
        annotation_df.to_csv(output_dir+'/feature_metadata_{}.txt'.format(chromosome),sep='\t')

    if not selectionStart is None :
        print("saving log!")
        print(log)
        #pd.DataFrame.from_dict(log, orient="index", columns=["rho","start","mean"]).to_csv(output_dir + "/" + str(chromosome) + "_" + str(selectionStart) + "_" +  str(selectionEnd) + "_log_rho.txt", sep="\t")
    else:
        print("saving log!")
Exemplo n.º 21
0
def run_QTL_analysis(pheno_filename,
                     anno_filename,
                     geno_prefix,
                     plinkGenotype,
                     output_dir,
                     window_size=250000,
                     min_maf=0.05,
                     min_hwe_P=0.001,
                     min_call_rate=0.95,
                     blocksize=1000,
                     cis_mode=True,
                     skipAutosomeFiltering=False,
                     gaussianize_method=None,
                     minimum_test_samples=10,
                     seed=np.random.randint(40000),
                     n_perm=0,
                     write_permutations=False,
                     relatedness_score=0.95,
                     feature_variant_covariate_filename=None,
                     snps_filename=None,
                     feature_filename=None,
                     snp_feature_filename=None,
                     genetic_range='all',
                     covariates_filename=None,
                     kinship_filename=None,
                     sample_mapping_filename=None,
                     extended_anno_filename=None,
                     regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan,
                       strategy='mean',
                       axis=0,
                       copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep = 0.1
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None

    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df,
                                          bim, cis_mode, window_size,
                                          skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[
                        bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(
                            data=bed[snpQuery_cov['i'].values, :].compute().
                            transpose(),
                            index=fam.index,
                            columns=snpQuery_cov['snp'],
                        )
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print(
                            'Warning, during the regression of SNPs we assume ploidy 2.'
                        )
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_dosage_t[(
                                    np.amax(geno["probs"][:, :2], 1) +
                                    np.amax(geno["probs"][:, 2:4], 1)) < (
                                        1 +
                                        minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] *
                                                   2) + geno["probs"][:, 1]
                                snp_df_dosage_t[
                                    np.amax(geno["probs"][:, :3], 1) < (
                                        (1 / 3) +
                                        minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t,
                                                        index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber + 1
                        snp_cov_df_t = snp_cov_df_t.transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(
                set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(
                np.unique(snp_feature_filter_df['snp_id'].loc[
                    snp_feature_filter_df['feature'] ==
                    feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''

            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if (contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            else:
                #If no missing samples we can use the previous SNP Qc information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(
                    lambda x: x not in list(map(str, fail_qc_snps_all)))]

            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(snpQuery.shape[0]) +
                  ' SNPs need to be tested.\n Please stand by.')

            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df_tmp.index = sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = snp_cov_df_tmp.index
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            countChunker = 0
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker = countChunker + 1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if (plinkGenotype):
                    snp_df = pd.DataFrame(
                        data=bed[snp_idxs, :].compute().transpose(),
                        index=fam.index,
                        columns=snp_names)
                else:
                    snp_df_dosage = pd.DataFrame(np.nan,
                                                 index=fam.index,
                                                 columns=snp_names)
                    snp_df = pd.DataFrame(np.nan,
                                          index=fam.index,
                                          columns=snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs:
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min() > 1 & geno["ploidy"].max() <
                                3):
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :2], axis=1) - 1
                                ) + np.abs(
                                    np.argmax(geno["probs"][:, 2:4], axis=1) -
                                    1)).astype(float)
                                naId = (np.amax(geno["probs"][:, :2], 1) +
                                        np.amax(geno["probs"][:, 2:4], 1)) < (
                                            1 + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else:
                                snp_df_dosage_t = (
                                    (geno["probs"][:, 0] * 2) +
                                    geno["probs"][:, 1]).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :3], axis=1) -
                                    2)).astype(float)
                                naId = np.amax(geno["probs"][:, :3], 1) < (
                                    (1 / 3) + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:, snp_names[
                                rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber + 1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids, :]

                snp_df = snp_df.loc[individual_ids, :]

                snp_df = snp_df.loc[:,
                                    np.unique(snp_df.columns)[
                                        np.unique(snp_df.columns,
                                                  return_counts=1)[1] == 1]]
                #SNP QC.
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[~snp_df.columns.
                                                       isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:, snp_df.columns[
                        ~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df.iloc[np.unique(
                                    snps_to_test_df.index,
                                    return_index=1)[1]].loc[
                                        geneticaly_unique_individuals, :],
                                min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df, min_call_rate, min_maf,
                                min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df.iloc[np.unique(
                                snp_df.index, return_index=1)[1]].loc[
                                    geneticaly_unique_individuals, :],
                            min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat(
                            [snpQcInfo_t,
                             maf.reindex(snpQcInfo_t.index)],
                            axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat(
                                [snpQcInfo_t,
                                 hweP.reindex(snpQcInfo_t.index)],
                                axis=1)
                call_rate = None
                maf = None
                hweP = None
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t],
                                          axis=0,
                                          sort=False)
                ##First process SNPQc than check if we can continue.
                if len(snp_df.columns) == 0:
                    continue
                elif (not plinkGenotype):
                    snp_df_dosage = snp_df_dosage.loc[:,
                                                      np.unique(snp_df.columns
                                                                )]
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),
                                      index=snp_df.index,
                                      columns=snp_df.columns)
                if (not plinkGenotype):
                    snp_df_dosage = pd.DataFrame(
                        fill_NaN.fit_transform(snp_df_dosage),
                        index=snp_df_dosage.index,
                        columns=snp_df_dosage.columns)
                ##No more snp_matrix_DF > snp_df


#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[
                        phenotype_ds.index]['iid'])
                        or not all(snp_df.index == sample2individual_feature.
                                   loc[phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()

                G = snp_df.values
                if (not plinkGenotype):
                    G = snp_df_dosage.values
                G = G.astype(float)
                G_index = snp_df.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df_dosage,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                        else:
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df, len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df_dosage, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
        if (n_perm > 1 and data_written):
            #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
            alpha_para, beta_para = output_writer.apply_pval_correction(
                feature_id, bestPermutationPval, cis_mode)
            #np.savetxt(output_dir+"/Permutation.pValues."+feature_id+".txt",bestPermutationPval)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            del QS_tmp
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                          axis=0,
                                          sort=False)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        #if snpQcInfo is not None:
        #snpQcInfo2 = snpQcInfo.copy().transpose()
        #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None:
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if (snp_df.shape[1] == 5):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate'
            ]
        elif (snp_df.shape[1] == 6):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf'
            ]
        else:
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf', 'hwe_p'
            ]

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params

    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
Exemplo n.º 22
0
def test_lmm_interface():
    random = RandomState(1)
    n = 3
    G = random.randn(n, n + 1)
    X = random.randn(n, 2)
    y = X @ random.randn(2) + G @ random.randn(G.shape[1]) + random.randn(n)
    y -= y.mean(0)
    y /= y.std(0)

    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS, restricted=False)
    lmm.name = "lmm"
    lmm.fit(verbose=False)

    assert_allclose(
        lmm.covariance(),
        [
            [0.436311031439718, 2.6243891396439837e-16, 2.0432156171727483e-16],
            [2.6243891396439837e-16, 0.4363110314397185, 4.814313140426306e-16],
            [2.0432156171727483e-16, 4.814313140426305e-16, 0.43631103143971817],
        ],
        atol=1e-7,
    )
    assert_allclose(
        lmm.mean(),
        [0.6398184791042468, -0.8738254794097052, 0.7198112606871158],
        atol=1e-7,
    )
    assert_allclose(lmm.lml(), -3.012715726960625, atol=1e-7)
    assert_allclose(lmm.value(), lmm.lml(), atol=1e-7)
    assert_allclose(lmm.lml(), -3.012715726960625, atol=1e-7)
    assert_allclose(
        lmm.X,
        [
            [-0.3224172040135075, -0.38405435466841564],
            [1.1337694423354374, -1.0998912673140309],
            [-0.17242820755043575, -0.8778584179213718],
        ],
        atol=1e-7,
    )
    assert_allclose(lmm.beta, [-1.3155159120000266, -0.5615702941530938], atol=1e-7)
    assert_allclose(
        lmm.beta_covariance,
        [
            [0.44737305797088345, 0.20431961864892412],
            [0.20431961864892412, 0.29835835133251526],
        ],
        atol=1e-7,
    )
    assert_allclose(lmm.delta, 0.9999999999999998, atol=1e-7)
    assert_equal(lmm.ncovariates, 2)
    assert_equal(lmm.nsamples, 3)
    assert_allclose(lmm.scale, 0.43631103143971767, atol=1e-7)
    assert_allclose(lmm.v0, 9.688051060046502e-17, atol=1e-7)
    assert_allclose(lmm.v1, 0.43631103143971756, atol=1e-7)
    assert_equal(lmm.name, "lmm")

    with pytest.raises(NotImplementedError):
        lmm.gradient()
Exemplo n.º 23
0
def _test_lmm(random, y, X, G, mvn, restricted):
    c = X.shape[1]
    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS, restricted=restricted)
    beta = lmm.beta
    v0 = lmm.v0
    v1 = lmm.v1

    K0 = G @ G.T
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    beta = random.randn(c)
    lmm.beta = beta
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    delta = random.rand(1).item()
    lmm.delta = delta
    v0 = lmm.v0
    v1 = lmm.v1
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    scale = random.rand(1).item()
    lmm.scale = scale
    v0 = lmm.v0
    v1 = lmm.v1
    assert_allclose(lmm.lml(), mvn(beta, v0, v1, y, X, K0))

    def fun(x):
        beta = x[:c]
        v0 = exp(x[c])
        v1 = exp(x[c + 1])
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0, 0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v0, exp(res.x[c]), rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v1, exp(res.x[c + 1]), rtol=1e-3, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    beta = random.randn(c)
    lmm.beta = beta
    lmm.delta = random.rand(1).item()
    lmm.scale = random.rand(1).item()
    lmm.fix("beta")

    def fun(x):
        v0 = exp(x[0])
        v1 = exp(x[1])
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0, 0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v0, exp(res.x[0]), rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.v1, exp(res.x[1]), rtol=1e-3, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(c)
    delta = random.rand(1).item()
    lmm.delta = delta
    lmm.scale = random.rand(1).item()
    lmm.fix("delta")

    def fun(x):
        beta = x[:c]
        scale = exp(x[c])
        v0 = scale * (1 - delta)
        v1 = scale * delta
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.scale, exp(res.x[c]), rtol=1e-5, atol=1e-6)

    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(c)
    lmm.delta = random.rand(1).item()
    scale = random.rand(1).item()
    lmm.scale = scale
    lmm.fix("scale")

    def fun(x):
        beta = x[:c]
        delta = 1 / (1 + exp(-x[c]))
        v0 = scale * (1 - delta)
        v1 = scale * delta
        return -mvn(beta, v0, v1, y, X, K0)

    res = minimize(fun, [0] * c + [0])
    lmm.fit(verbose=False)
    assert_allclose(lmm.lml(), -res.fun, rtol=1e-5, atol=1e-6)
    assert_allclose(lmm.beta, res.x[:c], rtol=1e-3, atol=1e-6)
    assert_allclose(lmm.delta, 1 / (1 + exp(-res.x[c])), rtol=1e-3, atol=1e-6)
def run_plots(plinkGenotype, geno_prefix, annotation_filename, phenotype_filename,
              covariate_filename, top_qtl_results_filename, output_directory,
              sample_mapping_filename,randomeff_filename):

    # # Loading Files
    # phenotype_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_YRI_Expr.txt"
    # annotation_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_Annot.txt"
    # covariate_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Expression/Geuvadis_CEU_YRI_covariates.txt"
    # sample_mapping_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Geuvadis_CEU_gte.txt"
    # geno_prefix = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Genotypes/Geuvadis"
    # output_directory = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Output"
    # top_qtl_results_filename = "/Users/chaaya/dhonveli_dkfz/limix_qtl/limix_qtl-master/Limix_QTL/test_data/Output/top_qtl_results_all_FDR0.05.txt"
    # plinkGenotype = True


    [phenotype_df, kinship_df, readdepth_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df,
     snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen,
     chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=phenotype_filename,
                    anno_filename=annotation_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=True,
                    skipAutosomeFiltering = False, minimum_test_samples= 10,
                    relatedness_score=None, snps_filename=None, feature_filename=None,
                    snp_feature_filename=None, selection='all',covariates_filename=covariate_filename,
                    randomeff_filename=randomeff_filename, sample_mapping_filename=sample_mapping_filename,
                    extended_anno_filename=None, feature_variant_covariate_filename=None)

    # results
    top_qtl_results_df = qtl_loader_utils.get_top_qtl_results(top_qtl_results_filename)

    for row in top_qtl_results_df.iterrows():

        # feature specific parameters for QS mixing
        rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        Sigma = {}
        Sigma_qs = {}
        best = {}
        snpQcInfo = None
        currentFeatureNumber+= 1

        phenotype_ds = phenotype_df.loc[row[1]["feature_id"]]
        individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
        sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]

        if kinship_df is not None and readdepth_df is None:
            kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
            kinship_mat = kinship_mat.astype(float)
            ##GOWER normalization of Kinship matrix.
            kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
            ## This needs to go with the subselection stuff.
            if(QS is None and not contains_missing_samples):
                QS = economic_qs(kinship_mat)
            elif (contains_missing_samples):
                QS_tmp = QS
                QS = economic_qs(kinship_mat)

        randomeff_mix = False
        # combining the two matrices
        if kinship_df is not None and readdepth_df is not None:
            randomeff_mix = True
            kinship_mat = kinship_df.loc[individual_ids,individual_ids].values
            kinship_mat = kinship_mat.astype(float)
            ##GOWER normalization of Kinship matrix.
            kinship_mat *= (kinship_mat.shape[0] - 1) / (kinship_mat.trace() - kinship_mat.mean(0).sum())
            for rho in rho1:
                Sigma[rho] = rho * kinship_df + (1 - rho) * readdepth_df
                Sigma_qs[rho] = economic_qs(Sigma[rho])

        # creating a fake QS if none random effect is present or use the read depth one
        if kinship_df is None:
            if readdepth_df is None:
                K = np.eye(len(phenotype_ds.index))
                if(QS is None and not contains_missing_samples):
                    QS = economic_qs(K)
                elif (contains_missing_samples):
                    QS_tmp = QS
                    QS = economic_qs(K)
            else:
                if(QS is None and not contains_missing_samples):
                    QS = economic_qs(readdepth_df)
                elif (contains_missing_samples):
                    QS_tmp = QS
                    QS = economic_qs(readdepth_df)

        # covariance matrix
        cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values if covariate_df is not None else None
        if covariate_df is None:
            cov_matrix = np.ones((len(individual_ids), 1))
        cov_matrix = cov_matrix.astype(float)

        phenotype = utils.force_normal_distribution(phenotype_ds.values,method="gaussnorm")

        # Prepare LMM
        phenotype = phenotype.astype(float)

        if randomeff_mix:
            # initialize best to minus infinite
            best["lml"] = - math.inf
            best["lmm"] = - math.inf
            best["rho1"] = - math.inf
            for rho, QS in Sigma_qs.items():
                lmm = LMM(phenotype, cov_matrix, QS)
                if not mixed:
                    lmm.delta = 1
                    lmm.fix('delta')
                lmm.fit(verbose=False)
                lml = lmm.lml()
                if lml > best["lml"]:
                    best["lml"] = lml
                    best["lmm"] = lmm
                    best["rho1"] = rho
            lmm = best["lmm"]
            print(best["rho1"])
            if best["rho1"] != 0:
                rho_log[(feature_id)] = best["rho1"]
                print("Read depth has actually an effect!")

        else:
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            lmm.fit(verbose=False)

        # create phenotype_corrected_df for plotting
        phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
        phenotype_corrected_ds = pd.Series(data = phenotype_corrected, index = phenotype_ds.index, name="exp")

        qtl_plots(row, phenotype_ds, phenotype_corrected_ds, plinkGenotype, bim, fam, bed, annotation_df, sample2individual_df)
Exemplo n.º 25
0
def test_lmm_beta_covariance():
    random = RandomState(0)

    (y, X, G) = _full_rank(random)
    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS)
    lmm.fit(verbose=False)

    A = [
        [0.015685784760937037, 0.006509918649859495],
        [0.006509918649859495, 0.007975242272006645],
    ]
    assert_allclose(lmm.beta_covariance, A)

    (y, X, G) = _low_rank(random)
    QS = economic_qs_linear(G)
    lmm = LMM(y, X[:, :2], QS)
    lmm.fit(verbose=False)

    A = [
        [0.002763268929325623, 0.0006651810010328699],
        [0.0006651810010328708, 0.0016910004907565248],
    ]
    assert_allclose(lmm.beta_covariance, A)

    (y, X, G) = _low_rank(random)
    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS)
    lmm.fit(verbose=False)

    A = [
        [
            0.003892850639339253,
            0.0012112513279299796,
            0.003892850639339256,
            0.0012112513279299794,
        ],
        [
            0.0012112513279299794,
            0.009340423857663259,
            0.0012112513279299833,
            0.009340423857663257,
        ],
        [
            0.0038928506393392562,
            0.0012112513279299835,
            0.003892850639339259,
            0.0012112513279299833,
        ],
        [
            0.0012112513279299794,
            0.009340423857663257,
            0.0012112513279299833,
            0.009340423857663257,
        ],
    ]
    assert_allclose(lmm.beta_covariance, A)
Exemplo n.º 26
0
def run_PrsQtl_analysis(pheno_filename,
                        anno_filename,
                        prsFile,
                        output_dir,
                        min_call_rate=0.95,
                        blocksize=1000,
                        skipAutosomeFiltering=False,
                        gaussianize_method=None,
                        minimum_test_samples=10,
                        seed=np.random.randint(40000),
                        n_perm=0,
                        write_permutations=False,
                        relatedness_score=None,
                        feature_variant_covariate_filename=None,
                        snps_filename=None,
                        feature_filename=None,
                        snp_feature_filename=None,
                        genetic_range='all',
                        covariates_filename=None,
                        kinship_filename=None,
                        sample_mapping_filename=None,
                        regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    lik = 'normal'
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None

        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in risk_df.index.values for i in covariateSnp)):
                    snp_cov_df = risk_df.loc[risk_df.index.map(
                        lambda x: x in list(covariateSnp)), :].transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(
                set(snp_filter_df.index).intersection(set(snpQuery)))

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(
                set(
                    np.unique(snp_feature_filter_df['snp_id'].loc[
                        snp_feature_filter_df['feature'] ==
                        feature_id])).intersection(set(snpQuery)))

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                #import pdb; pdb.set_trace()
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if contains_missing_samples:
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(len(snpQuery)) +
                  ' risk scores will be tested.\n Please stand by.')
            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.


#                test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                #pdb.set_trace()
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = sample2individual_feature['sample']
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            #pdb.set_trace();
            for snpGroup in utils.chunker(snpQuery, blocksize):
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)

                snp_names = snpGroup

                tested_snp_names.extend(snp_names)
                snp_matrix_DF = risk_df.loc[snp_names,
                                            individual_ids].transpose()
                ##GRS var QC
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  snp_matrix_DF.isna().sum(
                                                      axis=0) != snp_matrix_DF.
                                                  shape[0], ]
                snp_matrix_DF = snp_matrix_DF.loc[:, (
                    np.nanstd(snp_matrix_DF, axis=0) > 0)]
                #               test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_matrix_DF.index) != len(
                        sample2individual_feature.loc[phenotype_ds.index]
                    ['iid']) or not all(
                        snp_matrix_DF.index == sample2individual_feature.loc[
                            phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()
                #Impute missingness
                #pdb.set_trace()
                call_rate = 1 - snp_matrix_DF.isnull().sum() / len(
                    snp_matrix_DF.index)
                if snpQcInfo is None and call_rate is not None:
                    snpQcInfo = call_rate
                elif call_rate is not None:
                    snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
                selection = call_rate > min_call_rate
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  list(snp_matrix_DF.
                                                       columns[selection])]
                if snp_matrix_DF.shape[1] == 0:
                    continue
                snp_matrix_DF = pd.DataFrame(
                    fill_NaN.fit_transform(snp_matrix_DF),
                    index=snp_matrix_DF.index,
                    columns=snp_matrix_DF.columns)
                #
                G = snp_matrix_DF.values
                G = G.astype(float)
                G_index = snp_matrix_DF.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                geneticaly_unique_individuals,
                                relatedness_score, snp_matrix_DF,
                                kinship_df.loc[individual_ids, individual_ids],
                                len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(
                                snp_matrix_DF, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
            if (n_perm > 1 and data_written):
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(
                    feature_id, bestPermutationPval, False)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
                #pdb.set_trace();
            if not data_written:
                fail_qc_features.append(feature_id)
            else:
                n_samples.append(phenotype_ds.size)
                n_e_samples.append(len(geneticaly_unique_individuals))
            if contains_missing_samples:
                QS = QS_tmp
                geneticaly_unique_individuals = tmp_unique_individuals
                snpQcInfo = snpQcInfo.to_frame(name="call_rate")
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
                del QS_tmp
                del tmp_unique_individuals
            else:
                if (snpQcInfo is not None and snpQcInfoMain is not None):
                    snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                              axis=0)
                elif snpQcInfo is not None:
                    snpQcInfoMain = snpQcInfo.copy(deep=True)
                #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested snps
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if (snpQcInfoMain is not None):
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
Exemplo n.º 27
0
def test_fast_scanner_statsmodel_gls():
    from numpy.linalg import lstsq

    def _lstsq(A, B):
        return lstsq(A, B, rcond=None)[0]

    # data = sm.datasets.longley.load()
    # data.exog = sm.add_constant(data.exog)
    # ols_resid = sm.OLS(data.endog, data.exog).fit().resid
    # resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
    # rho = resid_fit.params[1]
    rho = -0.3634294908774683
    # order = toeplitz(range(len(ols_resid)))
    order = toeplitz(range(16))
    sigma = rho**order

    QS = economic_qs(sigma)
    endog = reshape(
        [
            60323.0,
            61122.0,
            60171.0,
            61187.0,
            63221.0,
            63639.0,
            64989.0,
            63761.0,
            66019.0,
            67857.0,
            68169.0,
            66513.0,
            68655.0,
            69564.0,
            69331.0,
            70551.0,
        ],
        (16, ),
    )
    exog = reshape(
        [
            1.0,
            83.0,
            234289.0,
            2356.0,
            1590.0,
            107608.0,
            1947.0,
            1.0,
            88.5,
            259426.0,
            2325.0,
            1456.0,
            108632.0,
            1948.0,
            1.0,
            88.2,
            258054.0,
            3682.0,
            1616.0,
            109773.0,
            1949.0,
            1.0,
            89.5,
            284599.0,
            3351.0,
            1650.0,
            110929.0,
            1950.0,
            1.0,
            96.2,
            328975.0,
            2099.0,
            3099.0,
            112075.0,
            1951.0,
            1.0,
            98.1,
            346999.0,
            1932.0,
            3594.0,
            113270.0,
            1952.0,
            1.0,
            99.0,
            365385.0,
            1870.0,
            3547.0,
            115094.0,
            1953.0,
            1.0,
            100.0,
            363112.0,
            3578.0,
            3350.0,
            116219.0,
            1954.0,
            1.0,
            101.2,
            397469.0,
            2904.0,
            3048.0,
            117388.0,
            1955.0,
            1.0,
            104.6,
            419180.0,
            2822.0,
            2857.0,
            118734.0,
            1956.0,
            1.0,
            108.4,
            442769.0,
            2936.0,
            2798.0,
            120445.0,
            1957.0,
            1.0,
            110.8,
            444546.0,
            4681.0,
            2637.0,
            121950.0,
            1958.0,
            1.0,
            112.6,
            482704.0,
            3813.0,
            2552.0,
            123366.0,
            1959.0,
            1.0,
            114.2,
            502601.0,
            3931.0,
            2514.0,
            125368.0,
            1960.0,
            1.0,
            115.7,
            518173.0,
            4806.0,
            2572.0,
            127852.0,
            1961.0,
            1.0,
            116.9,
            554894.0,
            4007.0,
            2827.0,
            130081.0,
            1962.0,
        ],
        (16, 7),
    )
    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)

    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()
    best_beta_se = _lstsq(exog.T @ _lstsq(lmm.covariance(), exog), eye(7))
    best_beta_se = sqrt(best_beta_se.diagonal())
    assert_allclose(scanner.null_beta_se, best_beta_se, atol=1e-4)

    endog = endog.copy()
    endog -= endog.mean(0)
    endog /= endog.std(0)

    exog = exog.copy()
    exog -= exog.mean(0)
    with errstate(invalid="ignore", divide="ignore"):
        exog /= exog.std(0)
    exog[:, 0] = 1

    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)

    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    # gls_model = sm.GLS(endog, exog, sigma=sigma)
    # gls_results = gls_model.fit()
    # scale = gls_results.scale
    scale = 1.7777777777782937
    # beta_se = gls_results.bse
    beta_se = array([
        0.014636888951505144,
        0.21334653097414055,
        0.7428559936739378,
        0.10174713767252333,
        0.032745906589939845,
        0.3494488802468581,
        0.4644879873404213,
    ])
    our_beta_se = sqrt(scanner.null_beta_covariance.diagonal())
    # statsmodels scales the covariance matrix we pass, that is why
    # we need to account for it here.
    assert_allclose(our_beta_se, beta_se / sqrt(scale), rtol=1e-6)
    assert_allclose(scanner.null_beta_se, beta_se / sqrt(scale), rtol=1e-6)
Exemplo n.º 28
0
def test_fast_scanner_statsmodel_gls():
    import statsmodels.api as sm
    from numpy.linalg import lstsq

    def _lstsq(A, B):
        return lstsq(A, B, rcond=None)[0]

    data = sm.datasets.longley.load()
    data.exog = sm.add_constant(data.exog)
    ols_resid = sm.OLS(data.endog, data.exog).fit().resid
    resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
    rho = resid_fit.params[1]
    order = toeplitz(range(len(ols_resid)))
    sigma = rho ** order

    QS = economic_qs(sigma)
    lmm = LMM(data.endog, data.exog, QS)
    lmm.fit(verbose=False)

    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()
    best_beta_se = _lstsq(data.exog.T @ _lstsq(lmm.covariance(), data.exog), eye(7))
    best_beta_se = sqrt(best_beta_se.diagonal())
    assert_allclose(scanner.null_beta_se, best_beta_se, atol=1e-5)

    endog = data.endog.copy()
    endog -= endog.mean(0)
    endog /= endog.std(0)

    exog = data.exog.copy()
    exog -= exog.mean(0)
    with errstate(invalid="ignore", divide="ignore"):
        exog /= exog.std(0)
    exog[:, 0] = 1

    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)

    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    gls_model = sm.GLS(endog, exog, sigma=sigma)
    gls_results = gls_model.fit()
    beta_se = gls_results.bse
    our_beta_se = sqrt(scanner.null_beta_covariance.diagonal())
    # statsmodels scales the covariance matrix we pass, that is why
    # we need to account for it here.
    assert_allclose(our_beta_se, beta_se / sqrt(gls_results.scale))
    assert_allclose(scanner.null_beta_se, beta_se / sqrt(gls_results.scale))