def run_gwas(mt, phen: str, sim_name: str, subset_idx: int, param_suffix: str, wd: str, is_logreg=True):
    """Run a per-variant GWAS of ``phen`` on one sample subset, or load cached results.

    Columns are restricted to those whose ``subset_idx`` column field equals
    ``subset_idx`` and whose ``phen`` value is defined. A row field ``EAF`` is
    computed as half the mean of ``dosage`` (or of the alternate-allele count
    of ``GT`` when there is no ``dosage`` entry field). Results are exported to
    a TSV under ``wd``; when that file already exists the regression is
    skipped and the file is imported instead.

    :param mt: MatrixTable with a ``dosage`` or ``GT`` entry field, a
        ``subset_idx`` column field and a column field named ``phen``
    :param phen: name of the phenotype column field
    :param sim_name: simulation name, used only in the output file name
    :param subset_idx: subset of columns to analyse
    :param param_suffix: suffix used only in the output file name
    :param wd: directory (or URI prefix) for the exported results
    :param is_logreg: logistic (Wald) regression if true, otherwise linear
    :return: on a fresh run, the regression result table itself (not only the
        exported columns); on a cache hit, the imported table keyed by
        ``locus`` and ``alleles``
    :raises ValueError: if ``mt`` has neither a ``dosage`` nor a ``GT`` entry field
    """
    # An empty set is falsy. The previous `!= {}` comparison was against an
    # empty *dict* and therefore could never fail.
    if not {'GT', 'dosage'}.intersection(mt.entry):
        raise ValueError(
            "mt does not have an entry field named 'dosage' or 'GT' corresponding to genotype data")

    has_dosage = 'dosage' in mt.entry

    mt = mt.filter_cols(mt.subset_idx == subset_idx)
    mt = mt.filter_cols(hl.is_defined(mt[phen]))
    print(f'\n\ngwas sample count (subset {subset_idx}): {mt.count_cols()}\n\n')

    # Effect-allele frequency: mean alternate-allele count divided by two.
    alt_count = mt.dosage if has_dosage else mt.GT.n_alt_alleles()
    mt = mt.annotate_rows(EAF=hl.agg.mean(alt_count) / 2)

    model = "logreg" if is_logreg else "linreg"
    gwas_path = f'{wd}/gwas.{model}.{sim_name}.subset_{subset_idx}.{param_suffix}.tsv.gz'

    if not hl.hadoop_is_file(gwas_path):
        # Re-derive the genotype expression from the re-annotated mt.
        gt_field = mt.dosage if has_dosage else mt.GT.n_alt_alleles()
        if is_logreg:
            gwas_ht = hl.logistic_regression_rows(test='wald',
                                                  y=mt[phen],
                                                  x=gt_field,
                                                  covariates=[1],
                                                  pass_through=['EAF'])
        else:
            gwas_ht = hl.linear_regression_rows(y=mt[phen],
                                                x=gt_field,
                                                covariates=[1],
                                                pass_through=['EAF'])
        gwas_ht.select('EAF', 'beta', 'standard_error', 'p_value').export(gwas_path)
    else:
        print(f'GWAS already run! ({gwas_path})')
        gwas_ht = hl.import_table(gwas_path, impute=True, force=True)
        # The alleles column is read back as text such as ["A","T"]; strip the
        # bracket/quote wrappers and split on the inner "," separator.
        # Raw strings keep the regex escapes without invalid-escape warnings.
        gwas_ht = gwas_ht.annotate(
            locus=hl.parse_locus(gwas_ht.locus),
            alleles=gwas_ht.alleles.replace(r'\["', '').replace(r'"\]', '').split('","'))
        gwas_ht = gwas_ht.key_by('locus', 'alleles')

    return gwas_ht
def logistic_regression(mt: hl.MatrixTable,
                        x_expr: str,
                        response: str,
                        covs: list,
                        pass_through: list,
                        extra_fields: dict,
                        add_odd_stats: bool = True) -> hl.Table:
    """
    Run a per-row logistic regression (Wald test) on a MatrixTable.

    :param mt: Hail MatrixTable
    :param x_expr: name of the field used as the regressor (e.g. genotype)
    :param response: name of the field used as the response
    :param covs: names of the fields used as covariates; an intercept term
        is always prepended
    :param pass_through: extra fields to carry over into the output
    :param extra_fields: additional annotations to add to the result (dict);
        nothing is added when it is empty
    :param add_odd_stats: when true, add the odds ratio and its 95%
        confidence bounds derived from beta and its standard error
    :return: Hail Table with the logistic regression results
    """
    # Intercept first, followed by the requested covariate fields.
    covariates = [1, *(mt[name] for name in covs)]

    results = hl.logistic_regression_rows(test='wald',
                                          y=mt[response],
                                          x=mt[x_expr],
                                          covariates=covariates,
                                          pass_through=pass_through)

    if add_odd_stats:
        # exp(beta) is the odds ratio; beta +/- 1.96 * SE gives the 95% bounds.
        beta = results.beta
        half_width = 1.96 * results.standard_error
        results = results.annotate(odds_ratio=hl.exp(beta),
                                   lower_ci_95=hl.exp(beta - half_width),
                                   upper_ci_95=hl.exp(beta + half_width))

    if len(extra_fields) > 0:
        return results.annotate(**extra_fields)
    return results
def run_logistic_bool(mt, variable):
    """Firth logistic regression of ``mt[variable]`` on alternate-allele count.

    Covariates are an intercept, ``mt.imputesex.impute_sex.is_female`` and
    ``mt.pca.PC1`` .. ``mt.pca.PC10``. The result is annotated with ``MAC``,
    the smaller of the alternate- and reference-allele totals per row,
    counted over columns where ``variable`` is defined.

    :param mt: MatrixTable with a ``GT`` entry field and the column fields above
    :param variable: name of the response column field
    :return: regression result table with an added ``MAC`` field
    """
    principal_components = [mt.pca[f'PC{i}'] for i in range(1, 11)]
    ht = hl.logistic_regression_rows(
        test='firth',
        y=mt[variable],
        x=mt.GT.n_alt_alleles(),
        covariates=[1, mt.imputesex.impute_sex.is_female, *principal_components])

    # Allele counts are taken only over samples with a defined response.
    phenotyped = mt.filter_cols(hl.is_defined(mt[variable]))
    genotype = phenotyped.GT
    alt_total = hl.agg.sum(genotype.n_alt_alleles())
    # Reference alleles: one per het-ref call, two per hom-ref call.
    ref_total = hl.agg.sum(
        hl.int64(genotype.is_het_ref()) + 2 * hl.int64(genotype.is_hom_ref()))
    phenotyped = phenotyped.annotate_rows(MAC=hl.min(alt_total, ref_total))

    return ht.annotate(MAC=phenotyped.rows()[ht.key].MAC)
def logistic_regression_rows_wald(mt_path):
    """Run a multi-phenotype Wald logistic regression on random column data.

    Reads the MatrixTable at ``mt_path``, keeps its first 2000 rows, annotates
    5 seeded random boolean phenotypes and 2 seeded random uniform covariates
    on the columns, regresses every phenotype on the ``x`` entry field and
    forces evaluation of the result. Nothing is returned.

    :param mt_path: path of the MatrixTable to read
    """
    n_phenotypes, n_covariates = 5, 2

    mt = hl.read_matrix_table(mt_path).head(2000)

    phenotypes = {
        f"pheno_{i}": hl.rand_bool(.5, seed=i)
        for i in range(n_phenotypes)
    }
    covariates = {
        f"cov_{i}": hl.rand_unif(0, 1, seed=i)
        for i in range(n_covariates)
    }
    mt = mt.annotate_cols(**phenotypes)
    mt = mt.annotate_cols(**covariates)

    result = hl.logistic_regression_rows(
        test='wald',
        y=[mt[name] for name in phenotypes],
        x=mt.x,
        covariates=[mt[name] for name in covariates])
    # Force the lazy pipeline to execute.
    result._force_count()
# Add circle glyphs to the existing figure `p`, driven by `source`.
p.circle(x='x', y='y', legend='label', color='color', source=source)
# show(p)

# Text sizes for the title, legend and axis labels.
p.title.text_font_size = '25pt'
p.legend.label_text_font_size = '15pt'
p.yaxis.axis_label_text_font_size = "25pt"
p.xaxis.axis_label_text_font_size = "25pt"

# Save the figure as a standalone (inline resources) HTML file and upload it.
pca_html_path = 'pca-1KGP1-sex'
output_file(pca_html_path, title='pca-1KGP1-sex', mode='inline')
save(p)
blob = bucket.blob('pca-1KGP1-sex.html')
blob.upload_from_filename(pca_html_path)

# logistic_regression
# Wald test of Gender_Classification on alternate-allele count, with an
# intercept and the first two PCA scores as covariates.
sex_gwas_covariates = [1, mt.pca.scores[0], mt.pca.scores[1]]
gwas = hl.logistic_regression_rows(
    test='wald',
    y=mt.Gender_Classification,
    x=mt.GT.n_alt_alleles(),
    covariates=sex_gwas_covariates)

manhattan = hl.plot.manhattan(gwas.p_value, title='Manhattan Plot')
# show(manhattan)
manhattan.title.text_font_size = '25pt'
manhattan.yaxis.axis_label_text_font_size = "25pt"
manhattan.xaxis.axis_label_text_font_size = "25pt"

# Same save-and-upload sequence for the Manhattan plot.
manhattan_html_path = 'manhattan-1KGP1-sex'
output_file(manhattan_html_path, title='manhattan-1KGP1-sex', mode='inline')
save(manhattan)
blob = bucket.blob('manhattan-1KGP1-sex.html')
blob.upload_from_filename(manhattan_html_path)

t2 = time.time()
# Attach the row annotations from mt5 and the per-sample phenotype record.
ds = ds.annotate_rows(**mt5.index(ds.row_key))
ds = ds.annotate_cols(pheno=phenos[ds.col_key])
# Indicator covariate: 1 when genotyping_array is "UKBB", 0 otherwise.
ds = ds.annotate_cols(array=hl.if_else((ds.pheno.genotyping_array == "UKBB"), 1, 0))
# Keep only samples with a defined age.
ds = ds.filter_cols(hl.is_defined(ds.pheno.age), keep=True)

### variant qc
mt = hl.variant_qc(ds, name='variant_qc')
mt = mt.filter_rows(
    (mt.variant_qc.AF[1] > 0.001)
    & (mt.variant_qc.AF[1] < 0.999)
    & (mt.info > 0.4)
    & (mt.variant_qc.p_value_hwe >= 0.0000000001),
    keep=True)
final = mt.annotate_rows(AF=mt.variant_qc.AF[1],
                         AC=mt.variant_qc.AC[1],
                         AN=mt.variant_qc.AN)
#final_annot = final.annotate_rows(HWE = final.variant_qc.p_value_hwe, callRate = final.variant_qc.call_rate)
#final_annot = final_annot.drop('variant_qc').rows()

### gwas logistic regression wald
gwas = hl.logistic_regression_rows(
    test='wald',
    y=final.pheno.All_Pneumonia,
    x=final.GT.n_alt_alleles(),
    covariates=[1,
                final.pheno.age, final.pheno.age2,
                final.pheno.Sex_numeric, final.pheno.ever_smoked,
                final.pheno.PC1, final.pheno.PC2, final.pheno.PC3,
                final.pheno.PC4, final.pheno.PC5, final.pheno.PC6,
                final.pheno.PC7, final.pheno.PC8, final.pheno.PC9,
                final.pheno.PC10,
                final.array],
    pass_through=['rsid', 'Gene', 'Consequence', 'clin_sig', 'metasvm',
                  'LOF_LOFTEE', 'PolyPhen', 'SIFT', 'hgvsp',
                  'AF', 'AC', 'AN', 'info'])

### Writing out the annotated GWAS results:
# Hail tables are lazy, so every action re-runs the whole regression.
# Write the table once, read it back, and export the TSV from the stored
# copy instead of computing the GWAS a second time.
gwas.write('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas = hl.read_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas.flatten().export('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.tsv.bgz')

gwas_v2 = gwas.filter(gwas.p_value < 0.0001, keep=True)

### Filtering the pneumonia GWAS to just the SBP and DBP SNPs:
SBPsnps = hl.import_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/SBP_75SNP_instrument_hg37.txt', impute=True)
DBPsnps = hl.import_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/DBP_75SNP_instrument_hg37.txt', impute=True)
show(p)

######## 6 GWAS
# Keep variants whose allele frequency (AF[1]) is above 1%
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

# Both models below regress on the number of alternate alleles and share
# the same covariates:
### 1 (intercept),
### number of pack years smoking,
### population structure by the first 10 PCA scores.
gwas_genotype = mt.GT.n_alt_alleles()
gwas_covariates = [1, hl.float(mt.PackYear)] + [mt.scores[i] for i in range(10)]

######## 6.1 Firth logistic regression ~ Affection status
# Kept under its own name: it was previously assigned to `gwas` and then
# immediately overwritten by the linear model, making it unreachable.
gwas_firth = hl.logistic_regression_rows(
    test="firth",  # controls false positives
    y=hl.float(mt.AffectionBool),
    x=gwas_genotype,
    covariates=gwas_covariates)

######## 6.2 Linear regression ~ FEV1% predicted (lung function variable)
# `gwas` ends up bound to the linear-regression result, as before.
gwas = hl.linear_regression_rows(
    y=hl.float(mt.FEV1),
    x=gwas_genotype,
    covariates=gwas_covariates)