예제 #1
0
def test_lmm_interface():
    """Fit a three-sample LMM (``restricted=False``) and pin its public API.

    The data are generated from a seeded ``RandomState`` so every expected
    value below is reproducible. After fitting, the test checks the
    covariance, mean, log marginal likelihood, design matrix, effect sizes
    and variance components, and finally that ``gradient()`` is not
    implemented.
    """
    rng = RandomState(1)
    nsamples = 3

    # The order of the draws determines the data; do not reorder them.
    G = rng.randn(nsamples, nsamples + 1)
    X = rng.randn(nsamples, 2)
    fixed_effects = rng.randn(2)
    random_effects = rng.randn(G.shape[1])
    noise = rng.randn(nsamples)

    y = X @ fixed_effects + G @ random_effects + noise
    # Standardise the outcome in place.
    y -= y.mean(0)
    y /= y.std(0)

    lmm = LMM(y, X, economic_qs_linear(G), restricted=False)
    lmm.name = "lmm"
    lmm.fit(verbose=False)

    def check(actual, desired):
        # Every numeric comparison in this test uses the same tolerance.
        assert_allclose(actual, desired, atol=1e-7)

    expected_covariance = [
        [0.436311031439718, 2.6243891396439837e-16, 2.0432156171727483e-16],
        [2.6243891396439837e-16, 0.4363110314397185, 4.814313140426306e-16],
        [2.0432156171727483e-16, 4.814313140426305e-16, 0.43631103143971817],
    ]
    expected_mean = [0.6398184791042468, -0.8738254794097052, 0.7198112606871158]
    expected_lml = -3.012715726960625
    expected_X = [
        [-0.3224172040135075, -0.38405435466841564],
        [1.1337694423354374, -1.0998912673140309],
        [-0.17242820755043575, -0.8778584179213718],
    ]
    expected_beta = [-1.3155159120000266, -0.5615702941530938]
    expected_beta_covariance = [
        [0.44737305797088345, 0.20431961864892412],
        [0.20431961864892412, 0.29835835133251526],
    ]

    check(lmm.covariance(), expected_covariance)
    check(lmm.mean(), expected_mean)

    # lml() is checked before and after value() is called.
    check(lmm.lml(), expected_lml)
    check(lmm.value(), lmm.lml())
    check(lmm.lml(), expected_lml)

    check(lmm.X, expected_X)
    check(lmm.beta, expected_beta)
    check(lmm.beta_covariance, expected_beta_covariance)
    check(lmm.delta, 0.9999999999999998)

    assert_equal(lmm.ncovariates, 2)
    assert_equal(lmm.nsamples, 3)

    check(lmm.scale, 0.43631103143971767)
    check(lmm.v0, 9.688051060046502e-17)
    check(lmm.v1, 0.43631103143971756)
    assert_equal(lmm.name, "lmm")

    with pytest.raises(NotImplementedError):
        lmm.gradient()
예제 #2
0
파일: _st_scan.py 프로젝트: zhzheng92/limix
def _perform_lmm(y, M, QS, G, verbose):
    """Fit the null LMM, then scan every candidate in ``G`` against it.

    Parameters
    ----------
    y
        Outcome, handed to ``LMM`` unchanged.
    M
        Labelled covariates. Must expose ``.values`` and a ``"covariate"``
        coordinate whose values label the covariate effect sizes.
    QS
        Handed to ``LMM`` unchanged.
    G
        Labelled candidates. Must expose ``.coords`` and either ``.data``
        (preferred) or ``.values``.
    verbose
        Forwarded to ``LMM.fit`` and to the fast scanner.

    Returns
    -------
    QTLModel
        Built from the null log marginal likelihood, the per-candidate
        alternative log marginal likelihoods, the per-candidate effect sizes
        and the covariate effect sizes of the null model.
    """
    from glimix_core.lmm import LMM
    from pandas import Series
    from xarray import DataArray

    lmm = LMM(y, M.values, QS)
    lmm.fit(verbose=verbose)
    sys.stdout.flush()

    null_lml = lmm.lml()

    # Null-model effect sizes, labelled by covariate name.
    covariates = list(M.coords["covariate"].values)
    ncov_effsizes = Series(lmm.beta, covariates)

    flmm = lmm.get_fast_scanner()
    values = G.data if hasattr(G, "data") else G.values
    alt_lmls, effsizes = flmm.fast_scan(values, verbose=verbose)

    # Carry over the coordinates of G whose leading dimension is "candidate".
    # Scalar coordinates have an empty ``dims`` tuple; indexing it with [0]
    # would raise IndexError, so they are skipped explicitly.
    coords = {
        name: ("candidate", coord.values)
        for name, coord in G.coords.items()
        if coord.dims and coord.dims[0] == "candidate"
    }

    alt_lmls = DataArray(alt_lmls, dims=["candidate"], coords=coords)
    effsizes = DataArray(effsizes, dims=["candidate"], coords=coords)

    return QTLModel(null_lml, alt_lmls, effsizes, ncov_effsizes)
예제 #3
0
파일: bf.py 프로젝트: phue/limix
    def calc_lml(self, Env):
        """Return the log marginal likelihood of an LMM fitted for ``Env``.

        The fixed-effect design is ``self.F``, ``self.W`` and ``self.x``
        concatenated along axis 1. The random-effect design is ``self.x * Env``,
        or an all-ones array shaped like ``self.x`` when ``Env`` has no columns.
        The model is built with ``restricted=True``.
        """
        from numpy import ones, concatenate
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear

        fixed_design = concatenate([self.F, self.W, self.x], 1)

        env_is_empty = Env.shape[1] == 0
        random_design = ones(self.x.shape) if env_is_empty else self.x * Env

        model = LMM(
            self.y,
            fixed_design,
            economic_qs_linear(random_design),
            restricted=True,
        )
        model.fit(verbose=False)
        return model.lml()
예제 #4
0
def _fit_cis_herit(y, K_cis, X=None, compute_lrt=True):
    """Fit an LMM for ``y`` and optionally test it against plain OLS.

    Parameters
    ----------
    y
        Outcome vector.
    K_cis
        Matrix handed to ``economic_qs_linear`` to build the random-effect
        decomposition.
    X
        Fixed-effect design matrix. Defaults to a single all-ones column.
    compute_lrt
        When true, also fit the reduced model (least squares on ``X`` only)
        and compute a p-value with ``_lrt_pvalue``.

    Returns
    -------
    tuple
        ``(fe_scale, cis_scale, noise_scale, logl_1, fixed_betas, pval)``,
        where ``pval`` is ``None`` when ``compute_lrt`` is false.

    Raises
    ------
    ImportError
        If glimix-core or numpy-sugar cannot be imported (logged first).
    """
    log = logging.getLogger(pyfocus.LOG)

    try:
        from glimix_core.lmm import LMM
        from numpy_sugar.linalg import economic_qs_linear
    except ImportError:
        log.error(
            "Training submodule requires glimix-core>=2.0.0 and numpy-sugar to be installed."
        )
        raise

    from scipy.stats import norm
    from scipy.linalg import lstsq

    if X is None:
        X = np.ones((len(y), 1))

    QS = economic_qs_linear(K_cis)
    lmm = LMM(y, X, QS)
    lmm.fit(verbose=False)

    fixed_betas = lmm.beta
    logl_1 = lmm.lml()

    cis_scale = lmm.v0
    noise_scale = lmm.v1
    fe_scale = lmm.fixed_effects_variance

    # Share of total variance attributed to the cis component (logged only).
    h2g = cis_scale / (cis_scale + noise_scale + fe_scale)

    if compute_lrt:
        # reduced model is just OLS regression for fixed-effects
        fixed_betas_0, sosqs, _, _ = lstsq(X, y)
        if np.size(sosqs) == 0:
            # lstsq returns an empty residues array when X is rank-deficient
            # or not over-determined; recompute the residual sum of squares.
            residuals = y - np.dot(X, fixed_betas_0)
            sosqs = np.sum(residuals ** 2, axis=0)

        # LMM also uses MLE estimation, so don't adjust for bias right now
        s2e = sosqs / len(y)

        logl_0 = np.sum(
            norm.logpdf(y, loc=np.dot(X, fixed_betas_0), scale=np.sqrt(s2e)))
        pval = _lrt_pvalue(logl_0, logl_1)
        log.debug("Estimated cis-h2g = {} (P = {})".format(h2g, pval))
    else:
        pval = None
        log.debug("Estimated cis-h2g = {}".format(h2g))

    return fe_scale, cis_scale, noise_scale, logl_1, fixed_betas, pval
예제 #5
0
def run_QTL_analysis(pheno_filename,
                     anno_filename,
                     geno_prefix,
                     plinkGenotype,
                     output_dir,
                     window_size=250000,
                     min_maf=0.05,
                     min_hwe_P=0.001,
                     min_call_rate=0.95,
                     blocksize=1000,
                     cis_mode=True,
                     skipAutosomeFiltering=False,
                     gaussianize_method=None,
                     minimum_test_samples=10,
                     seed=np.random.randint(40000),
                     n_perm=0,
                     write_permutations=False,
                     relatedness_score=0.95,
                     feature_variant_covariate_filename=None,
                     snps_filename=None,
                     feature_filename=None,
                     snp_feature_filename=None,
                     genetic_range='all',
                     covariates_filename=None,
                     kinship_filename=None,
                     sample_mapping_filename=None,
                     extended_anno_filename=None,
                     regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan,
                       strategy='mean',
                       axis=0,
                       copy=False)
    print('Running QTL analysis.')
    lik = 'normal'
    minimumProbabilityStep = 0.1
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df,complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, geno_prefix=geno_prefix, plinkGenotype=plinkGenotype, cis_mode=cis_mode, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, extended_anno_filename=extended_anno_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_ids = []
    pass_qc_snps_all = []
    fail_qc_snps_all = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None

    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = utils.do_snp_selection(feature_id, complete_annotation_df,
                                          bim, cis_mode, window_size,
                                          skipAutosomeFiltering)
        snp_cov_df = None
        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in bim['snp'].values for i in covariateSnp)):
                    snpQuery_cov = bim.loc[
                        bim['snp'].map(lambda x: x in list(covariateSnp)), :]
                    if (plinkGenotype):
                        snp_cov_df = pd.DataFrame(
                            data=bed[snpQuery_cov['i'].values, :].compute().
                            transpose(),
                            index=fam.index,
                            columns=snpQuery_cov['snp'],
                        )
                    else:
                        ##Here we make some assumptions on the SNPs. They are expected to be ploidy 2!
                        ##Also we don't use a minimal quality to assure a value is present for all samples.
                        print(
                            'Warning, during the regression of SNPs we assume ploidy 2.'
                        )
                        snp_cov_df_t = pd.DataFrame(columns=fam.index)
                        rowNumber = 0
                        for snpId in snpQuery_cov['i']:
                            geno = bgen["genotype"][snpId].compute()
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_dosage_t[(
                                    np.amax(geno["probs"][:, :2], 1) +
                                    np.amax(geno["probs"][:, 2:4], 1)) < (
                                        1 +
                                        minimumProbabilityStep)] = float('NaN')
                            else:
                                snp_df_dosage_t = (geno["probs"][:, 0] *
                                                   2) + geno["probs"][:, 1]
                                snp_df_dosage_t[
                                    np.amax(geno["probs"][:, :3], 1) < (
                                        (1 / 3) +
                                        minimumProbabilityStep)] = float('NaN')
                            snp_df_dosage_t = pd.Series(snp_df_dosage_t,
                                                        index=fam.index)
                            snp_df_dosage_t.name = snpId
                            snp_cov_df_t = snp_cov_df_t.append(snp_df_dosage_t)
                            rowNumber = rowNumber + 1
                        snp_cov_df_t = snp_cov_df_t.transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            toSelect = set(snp_filter_df.index).intersection(
                set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            toSelect = set(
                np.unique(snp_feature_filter_df['snp_id'].loc[
                    snp_feature_filter_df['feature'] ==
                    feature_id])).intersection(set(snpQuery['snp']))
            snpQuery = snpQuery.loc[snpQuery['snp'].isin(toSelect)]

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''

            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if (contains_missing_samples):
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            else:
                #If no missing samples we can use the previous SNP Qc information before actually loading data.
                #This allows for more efficient blocking and retrieving of data
                snpQuery = snpQuery.loc[snpQuery['snp'].map(
                    lambda x: x not in list(map(str, fail_qc_snps_all)))]

            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(snpQuery.shape[0]) +
                  ' SNPs need to be tested.\n Please stand by.')

            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.
            #test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df_tmp.index = sample2individual_feature['sample']
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = snp_cov_df_tmp.index
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            countChunker = 0
            for snpGroup in utils.chunker(snpQuery, blocksize):
                countChunker = countChunker + 1
                #print(countChunker)
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)
                #print(snpGroup)
                snp_idxs = snpGroup['i'].values
                snp_names = snpGroup['snp'].values

                tested_snp_ids.extend(snp_names)
                #subset genotype matrix, we cannot subselect at the same time, do in two steps.
                if (plinkGenotype):
                    snp_df = pd.DataFrame(
                        data=bed[snp_idxs, :].compute().transpose(),
                        index=fam.index,
                        columns=snp_names)
                else:
                    snp_df_dosage = pd.DataFrame(np.nan,
                                                 index=fam.index,
                                                 columns=snp_names)
                    snp_df = pd.DataFrame(np.nan,
                                          index=fam.index,
                                          columns=snp_names)
                    rowNumber = 0
                    for snpId in snp_idxs:
                        geno = bgen["genotype"][snpId].compute()
                        if (geno["ploidy"].min() > 1 & geno["ploidy"].max() <
                                3):
                            if (geno["phased"]):
                                snp_df_dosage_t = geno["probs"][:, [0, 2]].sum(
                                    1).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :2], axis=1) - 1
                                ) + np.abs(
                                    np.argmax(geno["probs"][:, 2:4], axis=1) -
                                    1)).astype(float)
                                naId = (np.amax(geno["probs"][:, :2], 1) +
                                        np.amax(geno["probs"][:, 2:4], 1)) < (
                                            1 + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            else:
                                snp_df_dosage_t = (
                                    (geno["probs"][:, 0] * 2) +
                                    geno["probs"][:, 1]).astype(float)
                                snp_df_t = (np.abs(
                                    np.argmax(geno["probs"][:, :3], axis=1) -
                                    2)).astype(float)
                                naId = np.amax(geno["probs"][:, :3], 1) < (
                                    (1 / 3) + minimumProbabilityStep)
                                snp_df_dosage_t[naId] = float('NaN')
                                snp_df_t[naId] = float('NaN')
                            snp_df_dosage.loc[:, snp_names[
                                rowNumber]] = snp_df_dosage_t
                            snp_df.loc[:, snp_names[rowNumber]] = snp_df_t
                        rowNumber = rowNumber + 1
                    snp_df_dosage = snp_df_dosage.loc[individual_ids, :]

                snp_df = snp_df.loc[individual_ids, :]

                snp_df = snp_df.loc[:,
                                    np.unique(snp_df.columns)[
                                        np.unique(snp_df.columns,
                                                  return_counts=1)[1] == 1]]
                #SNP QC.
                if not contains_missing_samples:
                    #remove SNPs from snp_df if they have previously failed QC
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[~snp_df.columns.
                                                       isin(fail_qc_snps_all)]]
                    if snp_df.shape[1] == 0:
                        continue
                    snps_to_test_df = snp_df.loc[:, snp_df.columns[
                        ~snp_df.columns.isin(pass_qc_snps_all)]]
                    if snps_to_test_df.shape[1] > 0:
                        #Only do QC on relevant SNPs. join pre-QCed list and new QCed list.
                        if kinship_df is not None:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df.iloc[np.unique(
                                    snps_to_test_df.index,
                                    return_index=1)[1]].loc[
                                        geneticaly_unique_individuals, :],
                                min_call_rate, min_maf, min_hwe_P)
                        else:
                            passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                                snps_to_test_df, min_call_rate, min_maf,
                                min_hwe_P)
                        snps_to_test_df = None
                        #append snp_names and failed_snp_names
                        pass_qc_snps_all.extend(passed_snp_names)
                        fail_qc_snps_all.extend(failed_snp_names)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(pass_qc_snps_all)]]
                else:
                    #Do snp QC for relevant section.
                    #Get relevant slice from: phenotype_ds
                    if kinship_df is not None:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df.iloc[np.unique(
                                snp_df.index, return_index=1)[1]].loc[
                                    geneticaly_unique_individuals, :],
                            min_call_rate, min_maf, min_hwe_P)
                    else:
                        passed_snp_names, failed_snp_names, call_rate, maf, hweP = do_snp_qc(
                            snp_df, min_call_rate, min_maf, min_hwe_P)
                    snp_df = snp_df.loc[:,
                                        snp_df.columns[snp_df.columns.
                                                       isin(passed_snp_names)]]
                snpQcInfo_t = None
                if call_rate is not None:
                    snpQcInfo_t = call_rate
                    if maf is not None:
                        snpQcInfo_t = pd.concat(
                            [snpQcInfo_t,
                             maf.reindex(snpQcInfo_t.index)],
                            axis=1)
                        if hweP is not None:
                            snpQcInfo_t = pd.concat(
                                [snpQcInfo_t,
                                 hweP.reindex(snpQcInfo_t.index)],
                                axis=1)
                call_rate = None
                maf = None
                hweP = None
                if snpQcInfo is None and snpQcInfo_t is not None:
                    snpQcInfo = snpQcInfo_t
                elif snpQcInfo_t is not None:
                    snpQcInfo = pd.concat([snpQcInfo, snpQcInfo_t],
                                          axis=0,
                                          sort=False)
                ##First process SNPQc than check if we can continue.
                if len(snp_df.columns) == 0:
                    continue
                elif (not plinkGenotype):
                    snp_df_dosage = snp_df_dosage.loc[:,
                                                      np.unique(snp_df.columns
                                                                )]
                #We could make use of relatedness when imputing.  And impute only based on genetically unique individuals.
                snp_df = pd.DataFrame(fill_NaN.fit_transform(snp_df),
                                      index=snp_df.index,
                                      columns=snp_df.columns)
                if (not plinkGenotype):
                    snp_df_dosage = pd.DataFrame(
                        fill_NaN.fit_transform(snp_df_dosage),
                        index=snp_df_dosage.index,
                        columns=snp_df_dosage.columns)
                ##No more snp_matrix_DF > snp_df


#                test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_df.index) != len(sample2individual_feature.loc[
                        phenotype_ds.index]['iid'])
                        or not all(snp_df.index == sample2individual_feature.
                                   loc[phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()

                G = snp_df.values
                if (not plinkGenotype):
                    G = snp_df_dosage.values
                G = G.astype(float)
                G_index = snp_df.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                    geneticaly_unique_individuals,
                                    relatedness_score, snp_df_dosage,
                                    kinship_df.loc[individual_ids,
                                                   individual_ids],
                                    len(currentNperm))
                        else:
                            if (plinkGenotype):
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df, len(currentNperm))
                            else:
                                temp = utils.get_shuffeld_genotypes(
                                    snp_df_dosage, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
        if (n_perm > 1 and data_written):
            #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
            alpha_para, beta_para = output_writer.apply_pval_correction(
                feature_id, bestPermutationPval, cis_mode)
            #np.savetxt(output_dir+"/Permutation.pValues."+feature_id+".txt",bestPermutationPval)
            alpha_params.append(alpha_para)
            beta_params.append(beta_para)
        if not data_written:
            fail_qc_features.append(feature_id)
        else:
            n_samples.append(phenotype_ds.size)
            n_e_samples.append(len(geneticaly_unique_individuals))
        if contains_missing_samples:
            QS = QS_tmp
            geneticaly_unique_individuals = tmp_unique_individuals
            del QS_tmp
            del tmp_unique_individuals
            if snpQcInfo is not None:
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
        else:
            if (snpQcInfo is not None and snpQcInfoMain is not None):
                snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                          axis=0,
                                          sort=False)
            elif snpQcInfo is not None:
                snpQcInfoMain = snpQcInfo.copy(deep=True)
        #if snpQcInfo is not None:
        #snpQcInfo2 = snpQcInfo.copy().transpose()
        #snpQcInfo2.to_csv(output_dir+'/snp_qc_metrics_feature_{}.txt'.format(feature_id),sep='\t')
        #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested SNPs

    tested_snp_ids = list(set(tested_snp_ids))
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = bim['snp']
    snp_df['chromosome'] = bim['chrom']
    snp_df['position'] = bim['pos']
    snp_df['assessed_allele'] = bim['a1']
    snp_df.index = snp_df['snp_id']
    snp_df = snp_df.drop_duplicates()
    snp_df = snp_df.reindex(tested_snp_ids)
    snp_df = snp_df.drop_duplicates()

    if snpQcInfoMain is not None:
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

        if (snp_df.shape[1] == 5):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate'
            ]
        elif (snp_df.shape[1] == 6):
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf'
            ]
        else:
            snp_df.columns = [
                'snp_id', 'chromosome', 'position', 'assessed_allele',
                'call_rate', 'maf', 'hwe_p'
            ]

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples

    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params

    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
def run_plots(plinkGenotype, geno_prefix, annotation_filename, phenotype_filename,
              covariate_filename, top_qtl_results_filename, output_directory,
              sample_mapping_filename,randomeff_filename):
    '''Fit a null LMM for every feature in the top-QTL table and plot it.

    The input files are loaded and intersected with the same loader the QTL
    analysis uses, with hard-coded settings (cis_mode=True, selection='all',
    minimum_test_samples=10, no relatedness/SNP/feature filters). For each
    row of the top-QTL results the phenotype is gaussianised ("gaussnorm"),
    a null LMM is fitted, every covariate column except the first is
    regressed out, and both the raw and the corrected phenotype are handed
    to ``qtl_plots``.

    Parameters
    ----------
    plinkGenotype : forwarded to the loader and to ``qtl_plots``.
    geno_prefix : forwarded to the loader as ``geno_prefix``.
    annotation_filename : forwarded to the loader as ``anno_filename``.
    phenotype_filename : forwarded to the loader as ``pheno_filename``.
    covariate_filename : forwarded to the loader as ``covariates_filename``.
    top_qtl_results_filename : read with ``qtl_loader_utils.get_top_qtl_results``;
        the resulting table must provide a ``feature_id`` column.
    output_directory : accepted for interface compatibility; not used here.
    sample_mapping_filename : forwarded to the loader.
    randomeff_filename : forwarded to the loader.

    Returns
    -------
    None. The only outputs are the ``qtl_plots`` calls and console prints.
    '''
    # The loader returns 20 values; only some are needed for plotting, but the
    # full unpacking is kept so a change in the loader's arity fails loudly here.
    [phenotype_df, kinship_df, readdepth_df, covariate_df, sample2individual_df,
     complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df,
     geneticaly_unique_individuals, minimum_test_samples, feature_list,
     bim, fam, bed, bgen, chromosome, selectionStart, selectionEnd,
     feature_variant_covariate_df] = \
        utils.run_QTL_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(
            pheno_filename=phenotype_filename,
            anno_filename=annotation_filename,
            geno_prefix=geno_prefix,
            plinkGenotype=plinkGenotype,
            cis_mode=True,
            skipAutosomeFiltering=False,
            minimum_test_samples=10,
            relatedness_score=None,
            snps_filename=None,
            feature_filename=None,
            snp_feature_filename=None,
            selection='all',
            covariates_filename=covariate_filename,
            randomeff_filename=randomeff_filename,
            sample_mapping_filename=sample_mapping_filename,
            extended_anno_filename=None,
            feature_variant_covariate_filename=None)

    top_qtl_results_df = qtl_loader_utils.get_top_qtl_results(top_qtl_results_filename)

    # Same definition as run_PrsQtl_analysis uses.
    # NOTE(review): with a read-depth matrix but no kinship this leaves delta
    # fixed at 1 -- confirm whether read depth alone should count as mixed.
    mixed = kinship_df is not None
    randomeff_mix = kinship_df is not None and readdepth_df is not None

    # Candidate weights of the kinship term when both random effects are mixed.
    rho_grid = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    # Decompositions are expensive, so they are cached across features.
    # shared_QS serves every feature without missing samples (they all share
    # the phenotype_df columns); mixing_QS maps rho -> decomposition.
    shared_QS = None
    mixing_QS = None

    # Best rho per feature whenever it is non-zero. Recorded only; nothing in
    # this function reads it back.
    rho_log = {}

    for row in top_qtl_results_df.iterrows():
        # iterrows yields (index, Series); the Series carries the result columns.
        feature_id = row[1]["feature_id"]

        phenotype_ds = phenotype_df.loc[feature_id]
        contains_missing_samples = any(~np.isfinite(phenotype_ds))
        if contains_missing_samples:
            print('Feature: {} contains missing data.'.format(feature_id))
            # Drop missing samples on a copy so phenotype_df stays untouched.
            phenotype_ds = phenotype_ds.dropna()

        # Individual ids are not unique; do not use them to index
        # phenotype/covariate data.
        individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values
        sample2individual_feature = sample2individual_df.loc[phenotype_ds.index]

        if randomeff_mix:
            if mixing_QS is None:
                # NOTE(review): the mix uses the full kinship/read-depth frames
                # (not subset to this feature's individuals, and without Gower
                # normalisation), so features with missing samples may not match
                # these matrices in size -- confirm intended handling.
                mixing_QS = {
                    rho: economic_qs(rho * kinship_df + (1 - rho) * readdepth_df)
                    for rho in rho_grid
                }
        elif contains_missing_samples or shared_QS is None:
            if kinship_df is not None:
                kinship_mat = kinship_df.loc[individual_ids, individual_ids].values
                kinship_mat = kinship_mat.astype(float)
                # Gower normalisation of the kinship matrix.
                kinship_mat *= (kinship_mat.shape[0] - 1) / (
                    kinship_mat.trace() - kinship_mat.mean(0).sum())
                random_effect = kinship_mat
            elif readdepth_df is not None:
                # NOTE(review): used as loaded, not subset to this feature's samples.
                random_effect = readdepth_df
            else:
                # No random effect supplied: fall back to an identity matrix.
                random_effect = np.eye(len(phenotype_ds.index))
            QS = economic_qs(random_effect)
            if not contains_missing_samples:
                shared_QS = QS
        else:
            QS = shared_QS

        # Fixed-effect design matrix; a single column of ones when no
        # covariates were supplied.
        if covariate_df is not None:
            cov_matrix = covariate_df.loc[sample2individual_feature['sample'], :].values
        else:
            cov_matrix = np.ones((len(individual_ids), 1))
        cov_matrix = cov_matrix.astype(float)

        phenotype = utils.force_normal_distribution(phenotype_ds.values, method="gaussnorm")
        phenotype = phenotype.astype(float)

        if randomeff_mix:
            # Keep the fit with the highest log marginal likelihood over rho.
            best_lml = -math.inf
            best_rho = -math.inf
            lmm = None
            for rho, rho_QS in mixing_QS.items():
                candidate = _fit_null_lmm(phenotype, cov_matrix, rho_QS, mixed)
                candidate_lml = candidate.lml()
                if candidate_lml > best_lml:
                    best_lml = candidate_lml
                    best_rho = rho
                    lmm = candidate
            print(best_rho)
            # NOTE(review): rho weights the kinship term, so rho != 0 means
            # kinship contributes; the message wording may be inverted -- confirm.
            if best_rho != 0:
                rho_log[feature_id] = best_rho
                print("Read depth has actually an effect!")
        else:
            lmm = _fit_null_lmm(phenotype, cov_matrix, QS, mixed)

        # Regress out every covariate except the first column.
        phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(lmm.beta[1:])
        phenotype_corrected_ds = pd.Series(data=phenotype_corrected,
                                           index=phenotype_ds.index,
                                           name="exp")

        qtl_plots(row, phenotype_ds, phenotype_corrected_ds, plinkGenotype,
                  bim, fam, bed, annotation_df, sample2individual_df)


def _fit_null_lmm(phenotype, cov_matrix, QS, mixed):
    '''Build and fit an LMM; delta is pinned to 1 when the model is not mixed.'''
    lmm = LMM(phenotype, cov_matrix, QS)
    if not mixed:
        lmm.delta = 1
        lmm.fix('delta')
    lmm.fit(verbose=False)
    return lmm
예제 #7
0
def run_PrsQtl_analysis(pheno_filename,
                        anno_filename,
                        prsFile,
                        output_dir,
                        min_call_rate=0.95,
                        blocksize=1000,
                        skipAutosomeFiltering=False,
                        gaussianize_method=None,
                        minimum_test_samples=10,
                        seed=np.random.randint(40000),
                        n_perm=0,
                        write_permutations=False,
                        relatedness_score=None,
                        feature_variant_covariate_filename=None,
                        snps_filename=None,
                        feature_filename=None,
                        snp_feature_filename=None,
                        genetic_range='all',
                        covariates_filename=None,
                        kinship_filename=None,
                        sample_mapping_filename=None,
                        regressCovariatesUpfront=False):
    fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    print('Running GRS QT analysis.')
    lik = 'normal'
    '''Core function to take input and run QTL tests on a given chromosome.'''
    if relatedness_score is not None:
        relatedness_score = float(relatedness_score)
    [phenotype_df, kinship_df, covariate_df, sample2individual_df, annotation_df, snp_filter_df, snp_feature_filter_df, geneticaly_unique_individuals, minimum_test_samples, feature_list, risk_df, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\
    utils.run_PrsQtl_analysis_load_intersect_phenotype_covariates_kinship_sample_mapping(pheno_filename=pheno_filename, anno_filename=anno_filename, prsFile=prsFile, skipAutosomeFiltering = skipAutosomeFiltering,
                      minimum_test_samples= minimum_test_samples,  relatedness_score=relatedness_score, snps_filename=snps_filename, feature_filename=feature_filename, snp_feature_filename=snp_feature_filename, selection=genetic_range,
                     covariates_filename=covariates_filename, kinship_filename=kinship_filename, sample_mapping_filename=sample_mapping_filename, feature_variant_covariate_filename=feature_variant_covariate_filename)

    mixed = kinship_df is not None
    if (kinship_df is None) or (relatedness_score is None):
        geneticaly_unique_individuals = sample2individual_df['iid'].values
    QS = None
    if (feature_list == None or len(feature_list) == 0):
        print('No features to be tested.')
        sys.exit()

    #Open output files
    qtl_loader_utils.ensure_dir(output_dir)
    if not selectionStart is None:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
    else:
        output_writer = qtl_output.hdf5_writer(
            output_dir + '/qtl_results_{}.h5'.format(chromosome))
    if (write_permutations):
        if not selectionStart is None:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}_{}_{}.h5'.format(
                    chromosome, selectionStart, selectionEnd), n_perm)
        else:
            permutation_writer = qtl_output.hdf5_permutations_writer(
                output_dir + '/perm_results_{}.h5'.format(chromosome), n_perm)

    #Arrays to store indices of snps tested and pass and fail QC SNPs for features without missingness.
    tested_snp_names = []
    fail_qc_features = []
    alpha_params = []
    beta_params = []
    n_samples = []
    n_e_samples = []
    na_containing_features = 0
    currentFeatureNumber = 0
    snpQcInfoMain = None
    for feature_id in feature_list:
        snpQcInfo = None
        currentFeatureNumber += 1
        if (len(phenotype_df.loc[feature_id, :])) < minimum_test_samples:
            print("Feature: " + feature_id +
                  " not tested not enough samples do QTL test.")
            fail_qc_features.append(feature_id)
            geneticaly_unique_individuals = tmp_unique_individuals
            continue
        data_written = False
        contains_missing_samples = False
        snpQuery = risk_df.index.values
        snp_cov_df = None

        if (feature_variant_covariate_df is not None):
            if (feature_id in feature_variant_covariate_df['feature'].values):
                covariateSnp = feature_variant_covariate_df['snp_id'].values[
                    feature_variant_covariate_df['feature'] == feature_id]
                if (any(i in risk_df.index.values for i in covariateSnp)):
                    snp_cov_df = risk_df.loc[risk_df.index.map(
                        lambda x: x in list(covariateSnp)), :].transpose()

        if (len(snpQuery) != 0) and (snp_filter_df is not None):
            snpQuery = list(
                set(snp_filter_df.index).intersection(set(snpQuery)))

        if (len(snpQuery) != 0) and (snp_feature_filter_df is not None):
            snpQuery = list(
                set(
                    np.unique(snp_feature_filter_df['snp_id'].loc[
                        snp_feature_filter_df['feature'] ==
                        feature_id])).intersection(set(snpQuery)))

        if len(snpQuery) == 0:
            print("Feature: " + feature_id +
                  " not tested. No SNPS passed QC for phenotype.")
            fail_qc_features.append(feature_id)
            continue
        else:
            phenotype_ds = phenotype_df.loc[feature_id]
            contains_missing_samples = any(~np.isfinite(phenotype_ds))
            if (contains_missing_samples):
                #import pdb; pdb.set_trace()
                print('Feature: ' + feature_id + ' contains missing data.')
                phenotype_ds.dropna(inplace=True)
                na_containing_features = na_containing_features + 1
            '''select indices for relevant individuals in genotype matrix
            These are not unique. NOT to be used to access phenotype/covariates data
            '''
            individual_ids = sample2individual_df.loc[phenotype_ds.index,
                                                      'iid'].values
            sample2individual_feature = sample2individual_df.loc[
                phenotype_ds.index]

            if contains_missing_samples:
                tmp_unique_individuals = geneticaly_unique_individuals
                if (kinship_df is not None) and (relatedness_score
                                                 is not None):
                    geneticaly_unique_individuals = utils.get_unique_genetic_samples(
                        kinship_df.loc[individual_ids, individual_ids],
                        relatedness_score)
                else:
                    geneticaly_unique_individuals = individual_ids
            if phenotype_ds.empty or len(
                    geneticaly_unique_individuals) < minimum_test_samples:
                print("Feature: " + feature_id +
                      " not tested not enough samples do QTL test.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue
            elif np.var(phenotype_ds.values) == 0:
                print("Feature: " + feature_id +
                      " has no variance in selected individuals.")
                fail_qc_features.append(feature_id)
                if contains_missing_samples:
                    geneticaly_unique_individuals = tmp_unique_individuals
                continue

            print('For feature: ' + str(currentFeatureNumber) + '/' +
                  str(len(feature_list)) + ' (' + feature_id + '): ' +
                  str(len(snpQuery)) +
                  ' risk scores will be tested.\n Please stand by.')
            if (n_perm != 0):
                bestPermutationPval = np.ones((n_perm), dtype=np.float)

            #Here we need to start preparing the LMM, can use the fam for sample IDS in SNP matrix.


#                test if the covariates, kinship, snp and phenotype are in the same order
            if ((all(kinship_df.loc[individual_ids,individual_ids].index==sample2individual_feature.loc[phenotype_ds.index]['iid']) if kinship_df is not None else True) &\
                 (all(phenotype_ds.index==covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)):
                '''
                if all lines are in order put in arrays the correct genotype and phenotype
                x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b;                 better readability of the code
                 '''
                if kinship_df is not None:
                    kinship_mat = kinship_df.loc[individual_ids,
                                                 individual_ids].values
                    kinship_mat = kinship_mat.astype(float)
                    ##GOWER normalization of Kinship matrix.
                    kinship_mat *= (kinship_mat.shape[0] - 1) / (
                        kinship_mat.trace() - kinship_mat.mean(0).sum())
                    ## This needs to go with the subselection stuff.
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(kinship_mat)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(kinship_mat)
                if kinship_df is None:
                    K = np.eye(len(phenotype_ds.index))
                    if (QS is None and not contains_missing_samples):
                        QS = economic_qs(K)
                    elif (contains_missing_samples):
                        QS_tmp = QS
                        QS = economic_qs(K)
                cov_matrix = covariate_df.loc[sample2individual_feature[
                    'sample'], :].values if covariate_df is not None else None
                if covariate_df is None:
                    cov_matrix = np.ones((len(individual_ids), 1))
                #pdb.set_trace()
                if snp_cov_df is not None:
                    snp_cov_df_tmp = snp_cov_df.loc[individual_ids, :]
                    snp_cov_df = pd.DataFrame(
                        fill_NaN.fit_transform(snp_cov_df_tmp))
                    snp_cov_df.index = sample2individual_feature['sample']
                    snp_cov_df.columns = snp_cov_df_tmp.columns
                    cov_matrix = np.concatenate(
                        (cov_matrix, snp_cov_df.values), 1)
                    snp_cov_df_tmp = None
                    snp_cov_df = None
                cov_matrix = cov_matrix.astype(float)
            else:
                print(
                    'There is an issue in mapping phenotypes vs covariates and/or kinship'
                )
                sys.exit()

            phenotype = utils.force_normal_distribution(
                phenotype_ds.values, method=gaussianize_method
            ) if gaussianize_method is not None else phenotype_ds.values

            #Prepare LMM
            phenotype = phenotype.astype(float)

            ##Mixed and test.
            ##This is a future change so we don't need to decompose the COVs every time.
            ##Like QS this needs to happen when genetic unique individuals is the same.
            #svd_cov = economic_svd(cov_matrix)
            #lmm = LMM(phenotype, cov_matrix, QS, SVD=svd_cov)
            #These steps need to happen only once per phenotype.
            #print(QS)
            lmm = LMM(phenotype, cov_matrix, QS)
            if not mixed:
                lmm.delta = 1
                lmm.fix('delta')
            #Prepare null model.
            lmm.fit(verbose=False)
            if regressCovariatesUpfront:
                phenotype_corrected = phenotype - cov_matrix[:, 1:].dot(
                    lmm.beta[1:])
                cov_matrix_corrected = cov_matrix[:, 0]
                lmm = LMM(phenotype_corrected, cov_matrix_corrected, QS)
                lmm.fit(verbose=False)

            null_lml = lmm.lml()
            flmm = lmm.get_fast_scanner()
            #pdb.set_trace();
            for snpGroup in utils.chunker(snpQuery, blocksize):
                #Fix seed at the start of the first chunker so all permutations are based on the same random first split.
                np.random.seed(seed)

                snp_names = snpGroup

                tested_snp_names.extend(snp_names)
                snp_matrix_DF = risk_df.loc[snp_names,
                                            individual_ids].transpose()
                ##GRS var QC
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  snp_matrix_DF.isna().sum(
                                                      axis=0) != snp_matrix_DF.
                                                  shape[0], ]
                snp_matrix_DF = snp_matrix_DF.loc[:, (
                    np.nanstd(snp_matrix_DF, axis=0) > 0)]
                #               test if the covariates, kinship, snp and phenotype are in the same order
                if (len(snp_matrix_DF.index) != len(
                        sample2individual_feature.loc[phenotype_ds.index]
                    ['iid']) or not all(
                        snp_matrix_DF.index == sample2individual_feature.loc[
                            phenotype_ds.index]['iid'])):
                    print(
                        'There is an issue in mapping phenotypes and genotypes'
                    )
                    sys.exit()
                #Impute missingness
                #pdb.set_trace()
                call_rate = 1 - snp_matrix_DF.isnull().sum() / len(
                    snp_matrix_DF.index)
                if snpQcInfo is None and call_rate is not None:
                    snpQcInfo = call_rate
                elif call_rate is not None:
                    snpQcInfo = pd.concat([snpQcInfo, call_rate], axis=0)
                selection = call_rate > min_call_rate
                snp_matrix_DF = snp_matrix_DF.loc[:,
                                                  list(snp_matrix_DF.
                                                       columns[selection])]
                if snp_matrix_DF.shape[1] == 0:
                    continue
                snp_matrix_DF = pd.DataFrame(
                    fill_NaN.fit_transform(snp_matrix_DF),
                    index=snp_matrix_DF.index,
                    columns=snp_matrix_DF.columns)
                #
                G = snp_matrix_DF.values
                G = G.astype(float)
                G_index = snp_matrix_DF.columns

                alt_lmls, effsizes = flmm.fast_scan(G, verbose=False)
                var_pvalues = lrt_pvalues(null_lml, alt_lmls)
                var_effsizes_se = effsizes_se(effsizes, var_pvalues)

                #add these results to qtl_results
                temp_df = pd.DataFrame(index=range(len(G_index)),
                                       columns=[
                                           'feature_id', 'snp_id', 'p_value',
                                           'beta', 'beta_se',
                                           'empirical_feature_p_value'
                                       ])
                temp_df['snp_id'] = G_index
                temp_df['feature_id'] = feature_id
                temp_df['beta'] = np.asarray(effsizes)
                temp_df['p_value'] = np.asarray(var_pvalues)
                temp_df['beta_se'] = np.asarray(var_effsizes_se)
                #insert default dummy value
                temp_df['empirical_feature_p_value'] = -1.0

                if (n_perm != 0):
                    pValueBuffer = []
                    totalSnpsToBeTested = (G.shape[1] * n_perm)
                    permutationStepSize = np.floor(
                        n_perm / (totalSnpsToBeTested / blocksize))
                    if (permutationStepSize > n_perm):
                        permutationStepSize = n_perm
                    elif (permutationStepSize < 1):
                        permutationStepSize = 1

                    if (write_permutations):
                        perm_df = pd.DataFrame(
                            index=range(len(G_index)),
                            columns=['snp_id'] +
                            ['permutation_' + str(x) for x in range(n_perm)])
                        perm_df['snp_id'] = G_index
                    for currentNperm in utils.chunker(
                            list(range(1, n_perm + 1)), permutationStepSize):
                        if (kinship_df is not None) and (relatedness_score
                                                         is not None):
                            temp = utils.get_shuffeld_genotypes_preserving_kinship(
                                geneticaly_unique_individuals,
                                relatedness_score, snp_matrix_DF,
                                kinship_df.loc[individual_ids, individual_ids],
                                len(currentNperm))
                        else:
                            temp = utils.get_shuffeld_genotypes(
                                snp_matrix_DF, len(currentNperm))
                        temp = temp.astype(float)
                        alt_lmls_p, effsizes_p = flmm.fast_scan(temp,
                                                                verbose=False)
                        var_pvalues_p = lrt_pvalues(null_lml, alt_lmls_p)
                        pValueBuffer.extend(np.asarray(var_pvalues_p))
                    if (not (len(pValueBuffer) == totalSnpsToBeTested)):
                        #print(len(pValueBuffer))
                        #print(pValueBuffer)
                        #print(totalSnpsToBeTested)
                        print('Error in blocking logic for permutations.')
                        sys.exit()
                    perm = 0
                    for relevantOutput in utils.chunker(
                            pValueBuffer, G.shape[1]):
                        if (write_permutations):
                            perm_df['permutation_' +
                                    str(perm)] = relevantOutput
                        if (bestPermutationPval[perm] > min(relevantOutput)):
                            bestPermutationPval[perm] = min(relevantOutput)
                        perm = perm + 1
                        #print(relevantOutput)
                        #print('permutation_'+str(perm))

                if not temp_df.empty:
                    data_written = True
                    output_writer.add_result_df(temp_df)
                    if (write_permutations):
                        permutation_writer.add_permutation_results_df(
                            perm_df, feature_id)
            #This we need to change in the written file.
            if (n_perm > 1 and data_written):
                #updated_permuted_p_in_hdf5(bestPermutationPval, feature_id);
                alpha_para, beta_para = output_writer.apply_pval_correction(
                    feature_id, bestPermutationPval, False)
                alpha_params.append(alpha_para)
                beta_params.append(beta_para)
                #pdb.set_trace();
            if not data_written:
                fail_qc_features.append(feature_id)
            else:
                n_samples.append(phenotype_ds.size)
                n_e_samples.append(len(geneticaly_unique_individuals))
            if contains_missing_samples:
                QS = QS_tmp
                geneticaly_unique_individuals = tmp_unique_individuals
                snpQcInfo = snpQcInfo.to_frame(name="call_rate")
                snpQcInfo.index.name = "snp_id"
                snpQcInfo.to_csv(
                    output_dir +
                    '/snp_qc_metrics_naContaining_feature_{}.txt'.format(
                        feature_id),
                    sep='\t')
                del QS_tmp
                del tmp_unique_individuals
            else:
                if (snpQcInfo is not None and snpQcInfoMain is not None):
                    snpQcInfoMain = pd.concat([snpQcInfoMain, snpQcInfo],
                                              axis=0)
                elif snpQcInfo is not None:
                    snpQcInfoMain = snpQcInfo.copy(deep=True)
                #print('step 5')
    output_writer.close()

    if (write_permutations):
        permutation_writer.close()
    fail_qc_features = np.unique(fail_qc_features)
    if ((len(feature_list) - len(fail_qc_features)) == 0):
        time.sleep(15)
        #Safety timer to make sure the file is unlocked.
        print("Trying to remove the h5 file. Nothing has been tested.")
        print(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
            chromosome, selectionStart, selectionEnd))
        if not selectionStart is None:
            os.remove(output_dir + 'qtl_results_{}_{}_{}.h5'.format(
                chromosome, selectionStart, selectionEnd))
        else:
            os.remove(output_dir + 'qtl_results_{}.h5'.format(chromosome))
        sys.exit()
    #gather unique indexes of tested snps
    #write annotation and snp data to file
    snp_df = pd.DataFrame()
    snp_df['snp_id'] = np.unique(tested_snp_names)
    snp_df.index = np.unique(tested_snp_names)
    snp_df['chromosome'] = "NA"
    snp_df['position'] = "NA"
    if (snpQcInfoMain is not None):
        snpQcInfoMain = snpQcInfoMain.to_frame(name="call_rate")
        snpQcInfoMain['index'] = snpQcInfoMain.index
        snpQcInfoMain = snpQcInfoMain.drop_duplicates()
        del snpQcInfoMain['index']
        snp_df = pd.concat(
            [snp_df, snpQcInfoMain.reindex(snp_df.index)], axis=1)

    feature_list = list(set(feature_list) - set(fail_qc_features))
    annotation_df = annotation_df.reindex(feature_list)
    annotation_df['n_samples'] = n_samples
    annotation_df['n_e_samples'] = n_e_samples
    if (n_perm > 1):
        annotation_df['alpha_param'] = alpha_params
        annotation_df['beta_param'] = beta_params
    if not selectionStart is None:
        snp_df.to_csv(output_dir + '/snp_metadata_{}_{}_{}.txt'.format(
            chromosome, selectionStart, selectionEnd),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}_{}_{}.txt'.format(
                                 chromosome, selectionStart, selectionEnd),
                             sep='\t')
    else:
        snp_df.to_csv(output_dir + '/snp_metadata_{}.txt'.format(chromosome),
                      sep='\t',
                      index=False)
        annotation_df.to_csv(output_dir +
                             '/feature_metadata_{}.txt'.format(chromosome),
                             sep='\t')
예제 #8
0
def _test_lmm(random, y, X, G, mvn, restricted):
    """Cross-check ``LMM`` against the reference likelihood ``mvn``.

    The log marginal likelihood reported by ``LMM`` is compared with
    ``mvn(beta, v0, v1, y, X, K0)``, where ``K0 = G @ G.T``: first at the
    model's initial parameters, then after overwriting ``beta``, ``delta``
    and ``scale`` one after the other.  Afterwards ``LMM.fit`` is compared
    with ``minimize`` applied to the negated reference likelihood in four
    set-ups: nothing fixed, ``beta`` fixed, ``delta`` fixed and ``scale``
    fixed.

    Parameters
    ----------
    random
        Object exposing ``randn`` and ``rand``; it supplies every random
        parameter value used below.  The draws happen in a fixed order.
    y, X
        Forwarded unchanged to ``LMM`` and to ``mvn``.  ``X.shape[1]`` is
        used as the length of the ``beta`` vector.
    G
        Passed to ``economic_qs_linear`` and used to build ``G @ G.T``.
    mvn
        Callable ``mvn(beta, v0, v1, y, X, K0)`` whose return value is
        compared with ``lmm.lml()``.
    restricted
        Forwarded as the ``restricted`` keyword of ``LMM``.
    """
    ncov = X.shape[1]
    QS = economic_qs_linear(G)
    lmm = LMM(y, X, QS, restricted=restricted)
    beta, v0, v1 = lmm.beta, lmm.v0, lmm.v1

    K = G @ G.T

    def reference(b, s0, s1):
        # Reference log-likelihood for the given parameters.
        return mvn(b, s0, s1, y, X, K)

    def assert_near(actual, desired, rtol):
        # All optimiser comparisons share the same absolute tolerance.
        assert_allclose(actual, desired, rtol=rtol, atol=1e-6)

    # --- lml() must match the reference at explicitly set parameters ------
    assert_allclose(lmm.lml(), reference(beta, v0, v1))

    beta = random.randn(ncov)
    lmm.beta = beta
    assert_allclose(lmm.lml(), reference(beta, v0, v1))

    lmm.delta = random.rand(1).item()
    v0, v1 = lmm.v0, lmm.v1
    assert_allclose(lmm.lml(), reference(beta, v0, v1))

    lmm.scale = random.rand(1).item()
    v0, v1 = lmm.v0, lmm.v1
    assert_allclose(lmm.lml(), reference(beta, v0, v1))

    # --- every parameter free ---------------------------------------------
    def neg_lml_free(x):
        # x = [beta..., log(v0), log(v1)]
        return -reference(x[:ncov], exp(x[ncov]), exp(x[ncov + 1]))

    opt = minimize(neg_lml_free, [0] * (ncov + 2))
    lmm.fit(verbose=False)
    assert_near(lmm.lml(), -opt.fun, 1e-3)
    assert_near(lmm.beta, opt.x[:ncov], 1e-3)
    assert_near(lmm.v0, exp(opt.x[ncov]), 1e-3)
    assert_near(lmm.v1, exp(opt.x[ncov + 1]), 1e-3)

    # --- beta held fixed --------------------------------------------------
    lmm = LMM(y, X, QS, restricted=restricted)
    fixed_beta = random.randn(ncov)
    lmm.beta = fixed_beta
    lmm.delta = random.rand(1).item()
    lmm.scale = random.rand(1).item()
    lmm.fix("beta")

    def neg_lml_fixed_beta(x):
        # x = [log(v0), log(v1)]
        return -reference(fixed_beta, exp(x[0]), exp(x[1]))

    opt = minimize(neg_lml_fixed_beta, [0] * 2)
    lmm.fit(verbose=False)
    assert_near(lmm.lml(), -opt.fun, 1e-3)
    assert_near(lmm.v0, exp(opt.x[0]), 1e-3)
    assert_near(lmm.v1, exp(opt.x[1]), 1e-3)

    # --- delta held fixed -------------------------------------------------
    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(ncov)
    fixed_delta = random.rand(1).item()
    lmm.delta = fixed_delta
    lmm.scale = random.rand(1).item()
    lmm.fix("delta")

    def neg_lml_fixed_delta(x):
        # x = [beta..., log(scale)]
        total = exp(x[ncov])
        return -reference(
            x[:ncov], total * (1 - fixed_delta), total * fixed_delta
        )

    opt = minimize(neg_lml_fixed_delta, [0] * (ncov + 1))
    lmm.fit(verbose=False)
    assert_near(lmm.lml(), -opt.fun, 1e-5)
    assert_near(lmm.beta, opt.x[:ncov], 1e-5)
    assert_near(lmm.scale, exp(opt.x[ncov]), 1e-5)

    # --- scale held fixed -------------------------------------------------
    lmm = LMM(y, X, QS, restricted=restricted)
    lmm.beta = random.randn(ncov)
    lmm.delta = random.rand(1).item()
    fixed_scale = random.rand(1).item()
    lmm.scale = fixed_scale
    lmm.fix("scale")

    def neg_lml_fixed_scale(x):
        # x = [beta..., logit(delta)]
        ratio = 1 / (1 + exp(-x[ncov]))
        return -reference(
            x[:ncov], fixed_scale * (1 - ratio), fixed_scale * ratio
        )

    opt = minimize(neg_lml_fixed_scale, [0] * (ncov + 1))
    lmm.fit(verbose=False)
    assert_near(lmm.lml(), -opt.fun, 1e-5)
    assert_near(lmm.beta, opt.x[:ncov], 1e-3)
    assert_near(lmm.delta, 1 / (1 + exp(-opt.x[ncov])), 1e-3)