예제 #1
0
def run_gwas(mt,
             phen: str,
             sim_name: str,
             subset_idx: int,
             param_suffix: str,
             wd: str,
             is_logreg=True):
    """Run a single-variant GWAS on one sample subset, or reload a cached result.

    Columns are restricted to those whose ``subset_idx`` column field equals
    *subset_idx* and whose *phen* field is defined. A row field ``EAF`` is
    computed as half the mean of the genotype field (``dosage`` when present,
    otherwise ``GT.n_alt_alleles()``).

    If the output file does not exist yet, the regression is run and the
    ``EAF``, ``beta``, ``standard_error`` and ``p_value`` fields are exported
    to it. Otherwise the existing file is imported and re-keyed by
    ``locus`` and ``alleles``.

    :param mt: Hail MatrixTable with a ``dosage`` or ``GT`` entry field and a
        ``subset_idx`` column field
    :param phen: name of the phenotype field used as the response
    :param sim_name: simulation name, used in the output file name
    :param subset_idx: index of the sample subset to analyse
    :param param_suffix: suffix used in the output file name
    :param wd: directory the results file is written to / read from
    :param is_logreg: run a Wald logistic regression when True, a linear
        regression otherwise
    :return: Hail Table of GWAS results
    :raises ValueError: if *mt* has neither a ``dosage`` nor a ``GT`` entry field
    """
    # The previous check compared the intersection (a set) against `{}` (an
    # empty dict), which is always unequal, so it could never fail.
    if not {'GT', 'dosage'}.intersection(mt.entry):
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data"
        )
    # 'dosage' takes precedence over 'GT' when both entry fields exist.
    use_dosage = 'dosage' in mt.entry

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(
        f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    alt_count = mt.dosage if use_dosage else mt.GT.n_alt_alleles()
    mt = mt.annotate_rows(EAF=hl.agg.mean(alt_count) / 2)

    gwas_path = f'{wd}/gwas.{"logreg" if is_logreg else "linreg"}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        # Rebuild the genotype expression from the annotated MatrixTable.
        gt_field = mt.dosage if use_dosage else mt.GT.n_alt_alleles()

        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        # Only the selected fields are exported; the table returned below is
        # the full regression result, not the selection.
        gwas_ht.select('EAF', 'beta', 'standard_error',
                       'p_value').export(gwas_path)

    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # Turn the text columns back into key fields: parse the locus string,
        # and strip the leading `["` / trailing `"]` from the alleles string
        # before splitting it on `","`. Raw strings keep the regex backslashes
        # without relying on invalid string escapes.
        gwas_ht = gwas_ht.annotate(locus=hl.parse_locus(gwas_ht.locus),
                                   alleles=gwas_ht.alleles.replace(
                                       r'\["', '').replace(r'"\]',
                                                           '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht
def logistic_regression(mt: hl.MatrixTable,
                        x_expr: str,
                        response: str,
                        covs: list,
                        pass_through: list,
                        extra_fields: dict,
                        add_odd_stats: bool = True) -> hl.Table:
    """
    Run a per-row logistic regression using the Wald test.

    :param mt: Hail MatrixTable
    :param x_expr: name of the genotype field on mt (numeric expression)
    :param response: name of the binary response field on mt
    :param covs: names of the covariate fields to include in the model
    :param pass_through: extra fields to carry over into the output
    :param extra_fields: additional annotations to add to the result (a dict)
    :param add_odd_stats: also derive the odds ratio and its 95% confidence
        interval from the regression statistics
    :return: Hail Table with the logistic regression results
    """
    # The constant 1 always comes first; named covariates are looked up on mt.
    covariates = [1]
    if len(covs) > 0:
        covariates += [mt[name] for name in covs]

    results = hl.logistic_regression_rows(test='wald',
                                          y=mt[response],
                                          x=mt[x_expr],
                                          covariates=covariates,
                                          pass_through=pass_through)

    if add_odd_stats:
        # Odds ratio is exp(beta); the interval bounds are
        # exp(beta -/+ 1.96 * standard_error).
        beta = results.beta
        half_width = 1.96 * results.standard_error
        results = results.annotate(
            odds_ratio=hl.exp(beta),
            lower_ci_95=hl.exp(beta - half_width),
            upper_ci_95=hl.exp(beta + half_width))

    # Extra annotations are only applied when some were supplied.
    if len(extra_fields) > 0:
        results = results.annotate(**extra_fields)
    return results
예제 #3
0
def run_logistic_bool(mt, variable):
    """Run a Firth logistic regression of ``mt[variable]`` on alt-allele count.

    Covariates are the constant 1, ``imputesex.impute_sex.is_female`` and
    ``pca.PC1`` .. ``pca.PC10``. The result is annotated with ``MAC``: the
    smaller of the summed alt-allele count and the summed count obtained from
    1 per het-ref call plus 2 per hom-ref call.

    Note that the regression is built from *mt* as passed in, whereas ``MAC``
    is aggregated only over columns where ``mt[variable]`` is defined.

    :param mt: Hail MatrixTable with ``GT``, ``imputesex`` and ``pca`` fields
    :param variable: name of the response field on mt
    :return: Hail Table of regression results with an added ``MAC`` field
    """
    principal_components = [mt.pca[f'PC{i}'] for i in range(1, 11)]
    ht = hl.logistic_regression_rows(
        test='firth',
        y=mt[variable],
        x=mt.GT.n_alt_alleles(),
        covariates=[1, mt.imputesex.impute_sex.is_female,
                    *principal_components])

    mt = mt.filter_cols(hl.is_defined(mt[variable]))
    alt_total = hl.agg.sum(mt.GT.n_alt_alleles())
    ref_total = hl.agg.sum(
        hl.int64(mt.GT.is_het_ref()) + 2 * hl.int64(mt.GT.is_hom_ref()))
    mt = mt.annotate_rows(MAC=hl.min(alt_total, ref_total))

    # Join the per-row MAC onto the regression table through its key.
    mac_rows = mt.rows()
    ht = ht.annotate(MAC=mac_rows[ht.key].MAC)
    return ht
예제 #4
0
def logistic_regression_rows_wald(mt_path):
    """Exercise a multi-phenotype Wald logistic regression on random data.

    Reads the MatrixTable at *mt_path*, truncates it with ``head(2000)``,
    annotates 5 random boolean phenotypes and 2 random uniform covariates
    (each seeded by its index), regresses every phenotype on ``mt.x`` and
    forces evaluation of the result. Nothing is returned.

    The covariate list holds only the random covariates; no constant 1 is
    passed.

    :param mt_path: path of the MatrixTable to read
    """
    num_phenos = 5
    num_covs = 2
    pheno_names = [f"pheno_{i}" for i in range(num_phenos)]
    cov_names = [f"cov_{i}" for i in range(num_covs)]

    mt = hl.read_matrix_table(mt_path).head(2000)
    mt = mt.annotate_cols(**{
        name: hl.rand_bool(.5, seed=i)
        for i, name in enumerate(pheno_names)
    })
    mt = mt.annotate_cols(**{
        name: hl.rand_unif(0, 1, seed=i)
        for i, name in enumerate(cov_names)
    })

    res = hl.logistic_regression_rows(
        test='wald',
        y=[mt[name] for name in pheno_names],
        x=mt.x,
        covariates=[mt[name] for name in cov_names])
    # The table is lazy; counting it forces the regression to actually run.
    res._force_count()
# Scatter plot of the PCA points; position, colour and legend label are all
# read from columns of `source`.
# NOTE(review): newer Bokeh releases deprecate the `legend=` keyword in favour
# of legend_label / legend_field / legend_group -- confirm the pinned version.
p.circle(source=source, x='x', y='y', color='color', legend='label')
#show(p)
p.legend.label_text_font_size = '15pt'
for _axis in (p.xaxis, p.yaxis):
    _axis.axis_label_text_font_size = "25pt"
p.title.text_font_size = '25pt'

# Save the plot locally (file name has no extension) and upload it to the
# bucket under the same name with an .html suffix.
_pca_out = 'pca-1KGP1-sex'
output_file(_pca_out, title=_pca_out, mode='inline')
save(p)
blob = bucket.blob(f'{_pca_out}.html')
blob.upload_from_filename(_pca_out)

# Wald logistic regression of Gender_Classification on alt-allele count, with
# the constant 1 and the first two PCA scores as covariates.
gwas = hl.logistic_regression_rows(
    test='wald',
    y=mt.Gender_Classification,
    x=mt.GT.n_alt_alleles(),
    covariates=[1, mt.pca.scores[0], mt.pca.scores[1]])

manhattan = hl.plot.manhattan(gwas.p_value, title='Manhattan Plot')
#show(manhattan)
for _axis in (manhattan.xaxis, manhattan.yaxis):
    _axis.axis_label_text_font_size = "25pt"
manhattan.title.text_font_size = '25pt'

# Same save-and-upload steps for the Manhattan plot.
_manhattan_out = 'manhattan-1KGP1-sex'
output_file(_manhattan_out, title=_manhattan_out, mode='inline')
save(manhattan)
blob = bucket.blob(f'{_manhattan_out}.html')
blob.upload_from_filename(_manhattan_out)

# Timestamp taken after the plots have been uploaded.
t2 = time.time()
예제 #6
0
# Attach the row annotations looked up in mt5 and the per-sample phenotype
# record to the dataset.
ds = ds.annotate_rows(**mt5.index(ds.row_key))
ds = ds.annotate_cols(pheno=phenos[ds.col_key])
# Numeric indicator: 1 where pheno.genotyping_array is "UKBB", otherwise 0.
ds = ds.annotate_cols(array=hl.if_else(ds.pheno.genotyping_array == "UKBB", 1, 0))
# Keep only samples with a defined age.
ds = ds.filter_cols(hl.is_defined(ds.pheno.age), keep=True)

### variant qc
mt = hl.variant_qc(ds, name='variant_qc')
# Keep rows with 0.001 < AF[1] < 0.999, info > 0.4 and HWE p-value >= 1e-10.
mt = mt.filter_rows(
    (mt.variant_qc.AF[1] > 0.001)
    & (mt.variant_qc.AF[1] < 0.999)
    & (mt.info > 0.4)
    & (mt.variant_qc.p_value_hwe >= 0.0000000001),
    keep=True)
final = mt.annotate_rows(AF=mt.variant_qc.AF[1],
                         AC=mt.variant_qc.AC[1],
                         AN=mt.variant_qc.AN)
#final_annot = final.annotate_rows(HWE = final.variant_qc.p_value_hwe, callRate = final.variant_qc.call_rate)
#final_annot = final_annot.drop('variant_qc').rows()
### gwas logistic regression wald
# Covariates: the constant 1, age, age2, sex, smoking status, PC1-PC10 and the
# array indicator. No backslash continuations are needed inside parentheses.
gwas = hl.logistic_regression_rows(
    test='wald',
    y=final.pheno.All_Pneumonia,
    x=final.GT.n_alt_alleles(),
    covariates=[
        1, final.pheno.age, final.pheno.age2, final.pheno.Sex_numeric,
        final.pheno.ever_smoked,
        final.pheno.PC1, final.pheno.PC2, final.pheno.PC3, final.pheno.PC4,
        final.pheno.PC5, final.pheno.PC6, final.pheno.PC7, final.pheno.PC8,
        final.pheno.PC9, final.pheno.PC10,
        final.array,
    ],
    pass_through=['rsid', 'Gene', 'Consequence', 'clin_sig', 'metasvm',
                  'LOF_LOFTEE', 'PolyPhen', 'SIFT', 'hgvsp', 'AF', 'AC', 'AN',
                  'info'])

### Writing out the annotated GWAS results:
# Persist the table first and export the TSV from the stored copy: Hail tables
# are lazy, so exporting before writing would run the regression twice.
gwas.write('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas = hl.read_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas.flatten().export('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.tsv.bgz')
gwas_v2 = gwas.filter(gwas.p_value < 0.0001, keep=True)


### Load the SBP and DBP SNP lists that the pneumonia GWAS is to be filtered to:
SBPsnps = hl.import_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/SBP_75SNP_instrument_hg37.txt', impute=True)
DBPsnps = hl.import_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/DBP_75SNP_instrument_hg37.txt', impute=True)

예제 #7
0
show(p)

######## 6 GWAS
# Keep rows whose variant_qc.AF[1] is above 1%.
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

######## 6.1 Firth logistic regression ~ Affection status
# Response: AffectionBool cast to float; predictor: number of alternate alleles.
# Covariates: the constant 1, pack-years of smoking (cast to float) and the
# first ten entries of `scores`.
gwas = hl.logistic_regression_rows(
    test="firth",  # controls false positives
    y=hl.float(mt.AffectionBool),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear),
        *(mt.scores[i] for i in range(10)),
    ])

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# Same predictor and covariates as 6.1. This rebinds `gwas`, so the Firth
# result above is no longer reachable through that name.
gwas = hl.linear_regression_rows(
    y=hl.float(mt.FEV1),
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1,
        hl.float(mt.PackYear),
        *(mt.scores[i] for i in range(10)),
    ])