예제 #1
0
def test_fast_scanner_statsmodel_gls():
    """Cross-check the fast scanner's null-model standard errors against GLS.

    The test runs in two stages on the Longley dataset:

    1. Fit an LMM whose covariance structure is ``rho ** order``, where
       ``rho`` is the slope of the lag-1 regression of the OLS residuals and
       ``order`` is the Toeplitz matrix built from ``range(n)``.  The
       scanner's ``null_beta_se`` must match the closed form
       ``sqrt(diag((X' S^-1 X)^-1))`` with ``S = lmm.covariance()``.
    2. Standardise the outcome and covariates, refit, and compare the
       scanner's standard errors with those reported by ``statsmodels`` GLS
       (after undoing the GLS ``scale`` factor).
    """
    import statsmodels.api as sm
    from numpy.linalg import lstsq

    def _lstsq(A, B):
        # Keep only the least-squares solution (first item of lstsq's result).
        return lstsq(A, B, rcond=None)[0]

    data = sm.datasets.longley.load()
    data.exog = sm.add_constant(data.exog)
    ols_resid = sm.OLS(data.endog, data.exog).fit().resid
    resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
    rho = resid_fit.params[1]
    order = toeplitz(range(len(ols_resid)))
    sigma = rho ** order

    QS = economic_qs(sigma)
    lmm = LMM(data.endog, data.exog, QS)
    lmm.fit(verbose=False)

    scanner = lmm.get_fast_scanner()
    # Closed-form covariance of the fixed effects: (X' S^-1 X)^-1, obtained by
    # solving against an identity sized from the design matrix rather than a
    # hard-coded column count.
    ncovariates = data.exog.shape[1]
    precision = data.exog.T @ _lstsq(lmm.covariance(), data.exog)
    best_beta_se = sqrt(_lstsq(precision, eye(ncovariates)).diagonal())
    assert_allclose(scanner.null_beta_se, best_beta_se, atol=1e-5)

    endog = data.endog.copy()
    endog -= endog.mean(0)
    endog /= endog.std(0)

    exog = data.exog.copy()
    exog -= exog.mean(0)
    # Any zero-variance column (the intercept) produces 0/0 here; silence the
    # warning and restore the first column to ones right after.
    # NOTE(review): the in-place column assignment assumes ``exog`` is a NumPy
    # array, not a DataFrame — confirm for the installed statsmodels version.
    with errstate(invalid="ignore", divide="ignore"):
        exog /= exog.std(0)
    exog[:, 0] = 1

    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)

    sigma = lmm.covariance()
    scanner = lmm.get_fast_scanner()

    gls_model = sm.GLS(endog, exog, sigma=sigma)
    gls_results = gls_model.fit()
    beta_se = gls_results.bse
    our_beta_se = sqrt(scanner.null_beta_covariance.diagonal())
    # statsmodels scales the covariance matrix we pass, that is why
    # we need to account for it here.
    assert_allclose(our_beta_se, beta_se / sqrt(gls_results.scale))
    assert_allclose(scanner.null_beta_se, beta_se / sqrt(gls_results.scale))
예제 #2
0
파일: _st_scan.py 프로젝트: zhzheng92/limix
def _perform_lmm(y, M, QS, G, verbose):
    """Fit the null LMM and fast-scan every candidate in ``G``.

    Parameters
    ----------
    y
        Outcome passed straight to ``LMM``.
    M
        Covariates; must expose ``.values`` and a ``"covariate"`` coordinate.
    QS
        Covariance decomposition passed straight to ``LMM``.
    G
        Candidates; must expose ``.coords`` and either ``.data`` or
        ``.values``.
    verbose
        Forwarded to ``LMM.fit`` and to the scanner's ``fast_scan``.

    Returns
    -------
    QTLModel
        Built from the null log marginal likelihood, the per-candidate
        alternative log marginal likelihoods and effect sizes, and the
        covariate effect sizes of the null model.
    """
    from glimix_core.lmm import LMM
    from pandas import Series
    from xarray import DataArray

    null_model = LMM(y, M.values, QS)
    null_model.fit(verbose=verbose)
    sys.stdout.flush()

    null_lml = null_model.lml()
    null_beta = null_model.beta
    covariate_names = list(M.coords["covariate"].values)
    ncov_effsizes = Series(null_beta, covariate_names)

    scanner = null_model.get_fast_scanner()
    # Prefer the ``data`` attribute when present; otherwise use ``values``.
    candidates = G.data if hasattr(G, "data") else G.values
    alt_lmls, effsizes = scanner.fast_scan(candidates, verbose=verbose)

    # Carry over only the coordinates whose first dimension is "candidate".
    candidate_coords = {}
    for name in G.coords.keys():
        coord = G.coords[name]
        if coord.dims[0] == "candidate":
            candidate_coords[name] = ("candidate", coord.values)

    alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=candidate_coords)
    effsizes = DataArray(effsizes, dims=["candidate"], coords=candidate_coords)

    return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
예제 #3
0
def test_fast_scanner_set_scale_1covariate():
    """Pin fast-scan results for a single covariate and a single candidate.

    Checks the fitted null model (scale, delta, beta) against recorded
    values, then the scan output for one noisy copy of the covariate, and
    finally that scanning an all-zero candidate gives back the null beta as
    ``effsizes0``.
    """
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)

    M = random.randn(n, 1)
    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    assert_allclose(lmm.scale, 5.282731934070453)
    assert_allclose(lmm.delta, 0.7029974630034005)
    assert_allclose(lmm.beta, [0.0599712498212])

    # Candidate: the covariate plus independent Gaussian noise.
    markers = M + random.randn(n, 1)

    scanner = lmm.get_fast_scanner()
    result = scanner.fast_scan(markers, verbose=False)

    assert_allclose(result["lml"], [-21.509721], rtol=1e-6)
    assert_allclose(result["effsizes0"], [[-1.43206379971882]])
    assert_allclose(result["effsizes1"], [1.412239], rtol=1e-6)
    assert_allclose(result["scale"], [0.8440354018505616], rtol=1e-6)

    null_beta = lmm.beta
    zero_scan = scanner.fast_scan(zeros((n, 1)), verbose=False)
    assert_allclose(zero_scan["effsizes0"][0], null_beta)
예제 #4
0
def test_lmm_scan_fast_scan():
    """Compare ``fast_scan`` on the first candidate with a direct optimisation.

    The reference is obtained by numerically minimising the negative
    multivariate-normal log-density over three effect sizes (two covariates
    plus the first candidate column) and a log-scale, with the covariance
    fixed to the one implied by the fitted ``v0`` and ``v1``.
    """
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = random.randn(n, 2)
    M1 = random.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    K = lmm.v0 * X @ X.T + lmm.v1 * eye(n)
    # Covariates followed by the first candidate column only.
    M = concatenate([M0, M1[:, :1]], axis=1)

    def neg_logpdf(params):
        effsizes = params[:3]
        scale = exp(params[3])
        return -st.multivariate_normal(M @ effsizes, scale * K).logpdf(y)

    res = minimize(neg_logpdf, [0, 0, 0, 0])
    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(M1, verbose=False)

    assert_allclose(r["lml"][0], -res.fun)
    assert_allclose(r["effsizes0"][0], res.x[:2], rtol=1e-5)
    assert_allclose(r["effsizes1"][0], res.x[2:3], rtol=1e-5)
    assert_allclose(r["scale"][0], exp(res.x[3]), rtol=1e-5)
예제 #5
0
def test_fast_scanner_set_scale_multicovariates():
    """Pin fast-scan results when the candidates are copies of the covariates.

    Three covariates are fitted and the very same three columns are scanned
    as candidates; the recorded log marginal likelihoods, effect sizes and
    scales are then compared with the scan output.
    """
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)

    M = random.randn(n, 3)
    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    # Every candidate duplicates one of the fitted covariates.
    candidates = M.copy()
    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(candidates, verbose=False)

    expected_lml = [-19.318845] * 3
    assert_allclose(r["lml"], expected_lml, rtol=1e-6, atol=1e-6)

    expected_effsizes0_last = [
        -0.6923007382350215,
        2.3550810825973034,
        -0.38157769653894497,
    ]
    assert_allclose(r["effsizes0"][2], expected_effsizes0_last, rtol=1e-5)

    expected_effsizes1 = [-0.34615, 1.177541, -0.381578]
    assert_allclose(r["effsizes1"], expected_effsizes1, rtol=1e-6, atol=1e-6)
    assert_allclose(r["scale"], [1.0, 1.0, 1.0])
예제 #6
0
def test_fast_scanner_set_scale_1covariate_redundant():
    """Pin fast-scan results when the single candidate equals the covariate.

    One covariate is fitted and an exact copy of it is scanned; the recorded
    log marginal likelihood, effect sizes and scale are compared with the
    scan output.
    """
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)

    M = random.randn(n, 1)
    lmm = LMM(y, M, QS)
    lmm.fit(verbose=False)

    # The candidate is an exact duplicate of the fitted covariate.
    candidate = M.copy()
    scanner = lmm.get_fast_scanner()
    r = scanner.fast_scan(candidate, verbose=False)

    assert_allclose(r["lml"][0], -22.357525517597185, rtol=1e-6)
    assert_allclose(r["effsizes0"], [[0.029985622694805182]])
    assert_allclose(
        r["effsizes1"][0], 0.02998562491058301, rtol=1e-6, atol=1e-6
    )
    assert_allclose(r["scale"], [1.0], rtol=1e-6)
예제 #7
0
def test_lmm_scan():
    """Compare ``scanner.scan`` with a direct optimisation and closed forms.

    Three groups of checks:

    * the joint scan of both candidate columns against a numerical minimiser
      of the negative multivariate-normal log-density (four effect sizes and
      a log-scale);
    * the reported standard errors against
      ``sqrt(diag((M' K^-1 M)^-1))`` with ``K = scale * lmm.covariance()``;
    * the scanner's null-model quantities against recorded values and against
      the fitted ``lmm`` itself.
    """
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)
    M0 = random.randn(n, 2)
    M1 = random.randn(n, 2)

    lmm = LMM(y, M0, QS)
    lmm.fit(verbose=False)

    K = lmm.v0 * X @ X.T + lmm.v1 * eye(n)
    # Full design matrix: covariates followed by both candidates.
    M = concatenate((M0, M1), axis=1)

    def neg_logpdf(params):
        effsizes = params[:4]
        scale = exp(params[4])
        return -st.multivariate_normal(M @ effsizes, scale * K).logpdf(y)

    res = minimize(neg_logpdf, [0, 0, 0, 0, 0])
    scanner = lmm.get_fast_scanner()
    r = scanner.scan(M1)

    assert_allclose(r["lml"], -res.fun)
    assert_allclose(r["effsizes0"], res.x[:2], rtol=1e-5)
    assert_allclose(r["effsizes1"], res.x[2:4], rtol=1e-5)
    assert_allclose(r["scale"], exp(res.x[4]), rtol=1e-5)

    # Closed-form standard errors under the alternative model.
    scaled_cov = r["scale"] * lmm.covariance()
    precision = M.T @ solve(scaled_cov, M)
    effsizes_se = sqrt(inv(precision).diagonal())
    reported_se = concatenate((r["effsizes0_se"], r["effsizes1_se"]))
    assert_allclose(effsizes_se, reported_se)

    assert_allclose(scanner.null_lml(), -53.805721275578456, rtol=1e-5)
    expected_null_beta = [0.26521964226797085, 0.4334778669761928]
    assert_allclose(scanner.null_beta, expected_null_beta, rtol=1e-5)
    expected_null_cov = [
        [0.06302553593799207, 0.00429640179038484],
        [0.004296401790384839, 0.05591392416235412],
    ]
    assert_allclose(scanner.null_beta_covariance, expected_null_cov, rtol=1e-5)
    assert_allclose(scanner.null_scale, 1.0)

    # The scanner's null model must agree with the LMM it was built from.
    assert_allclose(scanner.null_beta, lmm.beta, rtol=1e-5)
    assert_allclose(
        scanner.null_beta_covariance, lmm.beta_covariance, rtol=1e-5
    )
예제 #8
0
파일: _iscan.py 프로젝트: phue/limix
def _lmm(y, M, QS, verbose):
    """Fit a non-restricted LMM and return its wrapped scanner and variances.

    Returns a tuple ``(scanner, v0, v1)`` where ``scanner`` is the fast
    scanner wrapped in ``ScannerWrapper``, ``v1`` is ``lmm.v1`` and ``v0`` is
    ``lmm.v0``, or ``None`` when no ``QS`` was supplied.
    """
    from glimix_core.lmm import LMM

    model = LMM(y, M, QS, restricted=False)
    model.fit(verbose=verbose)
    sys.stdout.flush()

    # ``v0`` is only read from the model when a decomposition was supplied.
    v0 = None if QS is None else model.v0
    v1 = model.v1

    return ScannerWrapper(model.get_fast_scanner()), v0, v1
예제 #9
0
파일: _scan.py 프로젝트: Joyvalley/limix
def _st_lmm(Y, M, QS, verbose):
    """Fit a non-restricted LMM and return its fast scanner and variances.

    Returns a tuple ``(scanner, v0, v1)`` where ``scanner`` comes from
    ``lmm.get_fast_scanner()``, ``v1`` is ``lmm.v1`` and ``v0`` is
    ``lmm.v0``, or ``nan`` when no ``QS`` was supplied.
    """
    from numpy import nan
    from glimix_core.lmm import LMM

    model = LMM(Y, M, QS, restricted=False)
    model.fit(verbose=verbose)
    sys.stdout.flush()

    # ``v0`` is only read from the model when a decomposition was supplied.
    v0 = nan if QS is None else model.v0
    v1 = model.v1

    scanner = model.get_fast_scanner()
    return scanner, v0, v1
예제 #10
0
def test_lmm_scan_lmm_iid_prior():
    """Pin fast-scan log marginal likelihoods when ``QS`` is ``None``.

    The model has an intercept-only design and no covariance decomposition;
    two random candidates are scanned and their log marginal likelihoods are
    compared with recorded values.
    """
    random = RandomState(9458)
    n = 30
    X = _covariates_sample(random, n, n + 1)
    # Drawn before the outcome so the random stream matches the pinned values.
    markers = random.randn(n, 2)
    y = _outcome_sample(random, 1.0, X)

    intercept = ones((n, 1))
    lmm = LMM(y, intercept, None)
    lmm.fit(verbose=False)

    result = lmm.get_fast_scanner().fast_scan(markers, verbose=False)
    expected = [-63.16019973550036, -62.489358539276715]
    assert_allclose(result["lml"][:2], expected)
예제 #11
0
def test_fast_scanner_redundant_candidates():
    """Run ``fast_scan`` with candidates identical to the covariates.

    Covariates are five all-ones columns and the candidates are a copy of
    them.  The test makes no assertions; it passes as long as fitting and
    scanning raise no exception.
    """
    random = RandomState(9458)
    n = 10
    X = _covariates_sample(random, n, n + 1)
    y = _outcome_sample(random, 1.0, X)
    QS = economic_qs_linear(X)

    M = ones((n, 5))
    lmm = LMM(y, M, QS, restricted=False)
    lmm.fit(verbose=False)

    candidates = M.copy()
    lmm.get_fast_scanner().fast_scan(candidates, verbose=False)
예제 #12
0
def run_QTL_analysis(pheno_filename,
                     anno_filename,
                     geno_prefix,
                     plinkGenotype,
                     output_dir,
                     window_size=250000,
                     min_maf=0.05,
                     min_hwe_P=0.001,
                     min_call_rate=0.95,
                     blocksize=1000,
                     cis_mode=True,
                     skipAutosomeFiltering=False,
                     gaussianize_method=None,
                     minimum_test_samples=10,
                     seed=np.random.randint(40000),
                     n_perm=0,
                     write_permutations=False,
                     relatedness_score=0.95,
                     feature_variant_covariate_filename=None,
                     snps_filename=None,
                     feature_filename=None,
                     snp_feature_filename=None,
                     genetic_range='all',
                     covariates_filename=None,
                     kinship_filename=None,
                     sample_mapping_filename=None,
                     extended_anno_filename=None,
                     regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan,
                       strategy='mean',
                       axis=0,
                       copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep = 0.1
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None

    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df,
                                          bim, cis_mode, window_size,
                                          skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[
                        bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(
                            data=bed[snpQuery_cov['i'].values, :].compute().
                            transpose(),
                            index=fam.index,
                            columns=snpQuery_cov['snp'],
                        )
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print(
                            'Warning, during the regression of SNPs we assume ploidy 2.'
                        )
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_dosage_t[(
                                    np.amax(geno["probs"][:, :2], 1) +
                                    np.amax(geno["probs"][:, 2:4], 1)) < (
                                        1 +
                                        minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] *
                                                   2) + geno["probs"][:, 1]
                                snp_df_dosage_t[
                                    np.amax(geno["probs"][:, :3], 1) < (
                                        (1 / 3) +
                                        minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t,
                                                        index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber + 1
                        snp_cov_df_t = snp_cov_df_t.transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(
                set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(
                np.unique(snp_feature_filter_df['snp_id'].loc[
                    snp_feature_filter_df['feature'] ==
                    feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''

            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if (contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            else:
                #If no missing samples we can use the previous SNP Qc information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(
                    lambda x: x not in list(map(str, fail_qc_snps_all)))]

            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(snpQuery.shape[0]) +
                  ' SNPs need to be tested.\n Please stand by.')

            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df_tmp.index = sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = snp_cov_df_tmp.index
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            countChunker = 0
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker = countChunker + 1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if (plinkGenotype):
                    snp_df = pd.DataFrame(
                        data=bed[snp_idxs, :].compute().transpose(),
                        index=fam.index,
                        columns=snp_names)
                else:
                    snp_df_dosage = pd.DataFrame(np.nan,
                                                 index=fam.index,
                                                 columns=snp_names)
                    snp_df = pd.DataFrame(np.nan,
                                          index=fam.index,
                                          columns=snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs:
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min() > 1 & geno["ploidy"].max() <
                                3):
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :2], axis=1) - 1
                                ) + np.abs(
                                    np.argmax(geno["probs"][:, 2:4], axis=1) -
                                    1)).astype(float)
                                naId = (np.amax(geno["probs"][:, :2], 1) +
                                        np.amax(geno["probs"][:, 2:4], 1)) < (
                                            1 + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else:
                                snp_df_dosage_t = (
                                    (geno["probs"][:, 0] * 2) +
                                    geno["probs"][:, 1]).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :3], axis=1) -
                                    2)).astype(float)
                                naId = np.amax(geno["probs"][:, :3], 1) < (
                                    (1 / 3) + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:, snp_names[
                                rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber + 1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids, :]

                snp_df = snp_df.loc[individual_ids, :]

                snp_df = snp_df.loc[:,
                                    np.unique(snp_df.columns)[
                                        np.unique(snp_df.columns,
                                                  return_counts=1)[1] == 1]]
                #SNP QC.
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[~snp_df.columns.
                                                       isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:, snp_df.columns[
                        ~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df.iloc[np.unique(
                                    snps_to_test_df.index,
                                    return_index=1)[1]].loc[
                                        geneticaly_unique_individuals, :],
                                min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df, min_call_rate, min_maf,
                                min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df.iloc[np.unique(
                                snp_df.index, return_index=1)[1]].loc[
                                    geneticaly_unique_individuals, :],
                            min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat(
                            [snpQcInfo_t,
                             maf.reindex(snpQcInfo_t.index)],
                            axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat(
                                [snpQcInfo_t,
                                 hweP.reindex(snpQcInfo_t.index)],
                                axis=1)
                call_rate = None
                maf = None
                hweP = None
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t],
                                          axis=0,
                                          sort=False)
                ##First process SNPQc than check if we can continue.
                if len(snp_df.columns) == 0:
                    continue
                elif (not plinkGenotype):
                    snp_df_dosage = snp_df_dosage.loc[:,
                                                      np.unique(snp_df.columns
                                                                )]
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),
                                      index=snp_df.index,
                                      columns=snp_df.columns)
                if (not plinkGenotype):
                    snp_df_dosage = pd.DataFrame(
                        fill_NaN.fit_transform(snp_df_dosage),
                        index=snp_df_dosage.index,
                        columns=snp_df_dosage.columns)
                ##No more snp_matrix_DF > snp_df


#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[
                        phenotype_ds.index]['iid'])
                        or not all(snp_df.index == sample2individual_feature.
                                   loc[phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()

                G = snp_df.values
                if (not plinkGenotype):
                    G = snp_df_dosage.values
                G = G.astype(float)
                G_index = snp_df.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df_dosage,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                        else:
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df, len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df_dosage, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
        if (n_perm > 1 and data_written):
            #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
            alpha_para, beta_para = output_writer.apply_pval_correction(
                feature_id, bestPermutationPval, cis_mode)
            #np.savetxt(output_dir+"/Permutation.pValues."+feature_id+".txt",bestPermutationPval)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            del QS_tmp
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                          axis=0,
                                          sort=False)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        #if snpQcInfo is not None:
        #snpQcInfo2 = snpQcInfo.copy().transpose()
        #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None:
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if (snp_df.shape[1] == 5):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate'
            ]
        elif (snp_df.shape[1] == 6):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf'
            ]
        else:
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf', 'hwe_p'
            ]

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params

    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
예제 #13
0
def test_fast_scanner_statsmodel_gls():
    """Compare the fast scanner's null-model standard errors with GLS references.

    The phenotype (``endog``), design matrix (``exog``), AR(1) coefficient
    (``rho``) and the expected ``scale``/``beta_se`` values are hard-coded so
    that the test does not need statsmodels at run time.  They were produced
    once with statsmodels on its Longley dataset:

    * ``rho``: slope of an OLS fit of the OLS residuals on their lag-1 values
      (``resid_fit.params[1]``);
    * ``scale`` and ``beta_se``: ``gls_results.scale`` and ``gls_results.bse``
      of ``sm.GLS(endog, exog, sigma=sigma).fit()``, evaluated on the
      standardised data with ``sigma`` taken from the fitted LMM covariance.
    """
    from numpy.linalg import lstsq

    def _solve(A, B):
        # Least-squares solve of ``A @ X = B``; only the solution is needed.
        return lstsq(A, B, rcond=None)[0]

    n_samples, n_covariates = 16, 7

    # AR(1)-style covariance: entry (i, j) is rho ** |i - j|.
    rho = -0.3634294908774683
    order = toeplitz(range(n_samples))
    QS = economic_qs(rho**order)

    endog = reshape(
        [
            60323.0, 61122.0, 60171.0, 61187.0,
            63221.0, 63639.0, 64989.0, 63761.0,
            66019.0, 67857.0, 68169.0, 66513.0,
            68655.0, 69564.0, 69331.0, 70551.0,
        ],
        (n_samples, ),
    )
    # One row per sample; the first column is the intercept.
    exog = reshape(
        [
            [1.0, 83.0, 234289.0, 2356.0, 1590.0, 107608.0, 1947.0],
            [1.0, 88.5, 259426.0, 2325.0, 1456.0, 108632.0, 1948.0],
            [1.0, 88.2, 258054.0, 3682.0, 1616.0, 109773.0, 1949.0],
            [1.0, 89.5, 284599.0, 3351.0, 1650.0, 110929.0, 1950.0],
            [1.0, 96.2, 328975.0, 2099.0, 3099.0, 112075.0, 1951.0],
            [1.0, 98.1, 346999.0, 1932.0, 3594.0, 113270.0, 1952.0],
            [1.0, 99.0, 365385.0, 1870.0, 3547.0, 115094.0, 1953.0],
            [1.0, 100.0, 363112.0, 3578.0, 3350.0, 116219.0, 1954.0],
            [1.0, 101.2, 397469.0, 2904.0, 3048.0, 117388.0, 1955.0],
            [1.0, 104.6, 419180.0, 2822.0, 2857.0, 118734.0, 1956.0],
            [1.0, 108.4, 442769.0, 2936.0, 2798.0, 120445.0, 1957.0],
            [1.0, 110.8, 444546.0, 4681.0, 2637.0, 121950.0, 1958.0],
            [1.0, 112.6, 482704.0, 3813.0, 2552.0, 123366.0, 1959.0],
            [1.0, 114.2, 502601.0, 3931.0, 2514.0, 125368.0, 1960.0],
            [1.0, 115.7, 518173.0, 4806.0, 2572.0, 127852.0, 1961.0],
            [1.0, 116.9, 554894.0, 4007.0, 2827.0, 130081.0, 1962.0],
        ],
        (n_samples, n_covariates),
    )

    # --- Part 1: raw data, closed-form reference -------------------------
    lmm = LMM(endog, exog, QS)
    lmm.fit(verbose=False)

    null_cov = lmm.covariance()
    scanner = lmm.get_fast_scanner()
    # Reference: sqrt of the diagonal of (X' C^-1 X)^-1, with both inverses
    # obtained through least-squares solves.
    precision = exog.T @ _solve(null_cov, exog)
    best_beta_se = sqrt(_solve(precision, eye(n_covariates)).diagonal())
    assert_allclose(scanner.null_beta_se, best_beta_se, atol=1e-4)

    # --- Part 2: standardised data, statsmodels GLS reference ------------
    # Built out of place so the arrays handed to the first LMM stay untouched.
    endog_std = (endog - endog.mean(0)) / endog.std(0)

    exog_std = exog - exog.mean(0)
    # The intercept column has zero spread, so its division yields NaN; the
    # resulting warnings are silenced and the column is restored right after.
    with errstate(invalid="ignore", divide="ignore"):
        exog_std = exog_std / exog.std(0)
    exog_std[:, 0] = 1

    lmm = LMM(endog_std, exog_std, QS)
    lmm.fit(verbose=False)
    scanner = lmm.get_fast_scanner()

    # Pre-computed ``gls_results.scale`` and ``gls_results.bse`` (see docstring).
    scale = 1.7777777777782937
    beta_se = array([
        0.014636888951505144,
        0.21334653097414055,
        0.7428559936739378,
        0.10174713767252333,
        0.032745906589939845,
        0.3494488802468581,
        0.4644879873404213,
    ])
    # statsmodels scales the covariance matrix we pass, that is why
    # we need to account for it here.
    expected_se = beta_se / sqrt(scale)
    our_beta_se = sqrt(scanner.null_beta_covariance.diagonal())
    assert_allclose(our_beta_se, expected_se, rtol=1e-6)
    assert_allclose(scanner.null_beta_se, expected_se, rtol=1e-6)
예제 #14
0
def run_PrsQtl_analysis(pheno_filename,
                        anno_filename,
                        prsFile,
                        output_dir,
                        min_call_rate=0.95,
                        blocksize=1000,
                        skipAutosomeFiltering=False,
                        gaussianize_method=None,
                        minimum_test_samples=10,
                        seed=np.random.randint(40000),
                        n_perm=0,
                        write_permutations=False,
                        relatedness_score=None,
                        feature_variant_covariate_filename=None,
                        snps_filename=None,
                        feature_filename=None,
                        snp_feature_filename=None,
                        genetic_range='all',
                        covariates_filename=None,
                        kinship_filename=None,
                        sample_mapping_filename=None,
                        regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    lik = 'normal'
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None

        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in risk_df.index.values for i in covariateSnp)):
                    snp_cov_df = risk_df.loc[risk_df.index.map(
                        lambda x: x in list(covariateSnp)), :].transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(
                set(snp_filter_df.index).intersection(set(snpQuery)))

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(
                set(
                    np.unique(snp_feature_filter_df['snp_id'].loc[
                        snp_feature_filter_df['feature'] ==
                        feature_id])).intersection(set(snpQuery)))

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                #import pdb; pdb.set_trace()
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if contains_missing_samples:
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(len(snpQuery)) +
                  ' risk scores will be tested.\n Please stand by.')
            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.


#                test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                #pdb.set_trace()
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = sample2individual_feature['sample']
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            #pdb.set_trace();
            for snpGroup in utils.chunker(snpQuery, blocksize):
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)

                snp_names = snpGroup

                tested_snp_names.extend(snp_names)
                snp_matrix_DF = risk_df.loc[snp_names,
                                            individual_ids].transpose()
                ##GRS var QC
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  snp_matrix_DF.isna().sum(
                                                      axis=0) != snp_matrix_DF.
                                                  shape[0], ]
                snp_matrix_DF = snp_matrix_DF.loc[:, (
                    np.nanstd(snp_matrix_DF, axis=0) > 0)]
                #               test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_matrix_DF.index) != len(
                        sample2individual_feature.loc[phenotype_ds.index]
                    ['iid']) or not all(
                        snp_matrix_DF.index == sample2individual_feature.loc[
                            phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()
                #Impute missingness
                #pdb.set_trace()
                call_rate = 1 - snp_matrix_DF.isnull().sum() / len(
                    snp_matrix_DF.index)
                if snpQcInfo is None and call_rate is not None:
                    snpQcInfo = call_rate
                elif call_rate is not None:
                    snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
                selection = call_rate > min_call_rate
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  list(snp_matrix_DF.
                                                       columns[selection])]
                if snp_matrix_DF.shape[1] == 0:
                    continue
                snp_matrix_DF = pd.DataFrame(
                    fill_NaN.fit_transform(snp_matrix_DF),
                    index=snp_matrix_DF.index,
                    columns=snp_matrix_DF.columns)
                #
                G = snp_matrix_DF.values
                G = G.astype(float)
                G_index = snp_matrix_DF.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                geneticaly_unique_individuals,
                                relatedness_score, snp_matrix_DF,
                                kinship_df.loc[individual_ids, individual_ids],
                                len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(
                                snp_matrix_DF, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
            if (n_perm > 1 and data_written):
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(
                    feature_id, bestPermutationPval, False)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
                #pdb.set_trace();
            if not data_written:
                fail_qc_features.append(feature_id)
            else:
                n_samples.append(phenotype_ds.size)
                n_e_samples.append(len(geneticaly_unique_individuals))
            if contains_missing_samples:
                QS = QS_tmp
                geneticaly_unique_individuals = tmp_unique_individuals
                snpQcInfo = snpQcInfo.to_frame(name="call_rate")
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
                del QS_tmp
                del tmp_unique_individuals
            else:
                if (snpQcInfo is not None and snpQcInfoMain is not None):
                    snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                              axis=0)
                elif snpQcInfo is not None:
                    snpQcInfoMain = snpQcInfo.copy(deep=True)
                #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested snps
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if (snpQcInfoMain is not None):
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')