"""Check whether cohort singletons are present in gnomAD.

The distribution of per-sample singletons absent from gnomAD should be
non-zero if samples are genuinely independent of gnomAD.
"""
import hail as hl

hl.init()

from hail.plot import show
from pprint import pprint

hl.plot.output_notebook()

QC_HARDCALLS_MT = 'gs://dalio_bipolar_w1_w2_hail_02/data/mt/17_european.strict.hardcalls.mt'
GNOMAD_TSV = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/19_gnomAD_check.tsv'

# Restrict the hardcalls table to cases and controls only.
dataset = hl.read_matrix_table(QC_HARDCALLS_MT)
dataset = dataset.filter_cols(
    (dataset.phenotype.PHENOTYPE_COARSE == "Bipolar Disorder")
    | (dataset.phenotype.PHENOTYPE_COARSE == "Control"))

# Keep only singleton sites: exactly one alternate allele observed cohort-wide.
dataset = dataset.annotate_rows(
    is_singleton=hl.agg.sum(dataset.GT.n_alt_alleles()) == 1)
dataset = dataset.filter_rows(dataset.is_singleton)

# Per-sample count over all singleton sites (must be computed before the
# gnomAD filter below narrows the row set).
dataset = dataset.annotate_cols(
    singleton_count=hl.agg.count_where(dataset.GT.is_non_ref()))

# Per-sample count restricted to singletons absent from gnomAD (non-psych).
dataset = dataset.filter_rows(
    (~dataset.annotation.inGnomAD_nonpsych) & (dataset.is_singleton))
dataset = dataset.annotate_cols(
    not_inGnomAD_count=hl.agg.count_where(dataset.GT.is_non_ref()))

scatter = hl.plot.scatter(dataset.not_inGnomAD_count, dataset.singleton_count)
show(scatter)

dataset.cols().select(
    'phenotype', 'singleton_count', 'not_inGnomAD_count'
).flatten().export(GNOMAD_TSV)
mt_pruned = mt_common.filter_rows(hl.is_defined(pruned_t[mt_common.row_key])) # Calculate eigenvalues, PC scores (k=10) and loadings eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt_common.GT, k=10, compute_loadings=True) mt = mt.annotate_cols(scores=scores[mt.s].scores) # Plot first 2 PCs pca = hl.plot.scatter(mt_common.scores[0], mt_common.scores[1], label=mt_common.Race, title='PCA Caucasian', xlabel='PC1', ylabel='PC2') show(pca) ######## 5.2 Optional: Project new samples on existing PCs def pc_project( # reference: https://github.com/macarthur-lab/gnomad_hail/blob/master/utils/generic.py#L131 mt: hl.MatrixTable, loadings_ht: hl.Table, loading_location: str = "loadings", af_location: str = "pca_af") -> hl.Table: n_variants = loadings_ht.count() mt = mt.annotate_rows( pca_loadings=loadings_ht[mt.row_key][loading_location], pca_af=loadings_ht[mt.row_key][af_location]) mt = mt.filter_rows( hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af)
# Read the association-results MatrixTable back in and draw Manhattan / QQ
# plots. Note: plotting needs regular workers rather than preemptible ones.
mt = hl.read_matrix_table(
    'gs://ukb-diverse-pops/AdmixedAfrEur/DosageFiles/UKBB_AfEur_QCed_lipids_dosages_admixfrac.mt'
)

# In[23]:

# Manhattan plot: total cholesterol (TC), ancestry component 0.
manhattan_tc_anc0 = hl.plot.manhattan(
    mt.results.TC.p_value[2],
    title='Admixed Afr-Eur UKBB, TC, anc0',
    collect_all=False,
    significance_line=5e-08,
)
show(manhattan_tc_anc0)

# In[24]:

# Manhattan plot: TC, ancestry component 1.
manhattan_tc_anc1 = hl.plot.manhattan(
    mt.results.TC.p_value[3],
    title='Admixed Afr-Eur UKBB, TC, anc1',
    collect_all=False,
    significance_line=5e-08,
)
show(manhattan_tc_anc1)

# In[25]:

# QQ plot: TC, ancestry component 0.
p = hl.plot.qq(mt.results.TC.p_value[2], title="QQ plot, TC, anc0")
'''
Plotting female hwe pval distributions to understand X chrom QC

MOP = Kenya Moi
AAP = Ethiopia
KWP = Kenya Kemri
CTP = South Africa
MAP = Uganda
'''

# One -log10(HWE p-value) histogram per site, in female_list order.
# BUG FIX: female_list[1] is Ethiopia (per the legend above and the original
# "# Plotting Ethiopia" comment), but its title previously said "Uganda".
_SITE_TITLES = [
    "Kenya, Moi -log10 HWE p-val Before Prelim QC",  # female_list[0]
    "Ethiopia -log10 HWE p-val Before Prelim QC",    # female_list[1]
    "KEMRI -log10 HWE p-val Before Prelim QC",       # female_list[2]
]

for site_table, site_title in zip(female_list, _SITE_TITLES):
    p = hl.plot.histogram(site_table.log10_hwe_pval,
                          legend="HWE -log10 p-val",
                          range=(0, 20),
                          title=site_title)
    show(p)
def main(number_of_pcs: int):  # pylint: disable=too-many-locals
    """Query script entry point.

    Reads the combined HGDP+1KG / TOB-WGS matrix table and pre-computed PCA
    scores, then draws scatter plots of consecutive PC pairs labelled by
    study ID, continental population, and subpopulation.

    :param number_of_pcs: number of PC pairs to plot; iteration i plots
        PC(i+1) vs PC(i+2).
    """
    hl.init()

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores

    # Percent variance explained per PC, from the saved eigenvalues.
    eigenvalues = pd.read_csv(EIGENVALUES)
    eigenvalues.columns = ['eigenvalue']
    variance = eigenvalues['eigenvalue'].divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    def _axis_label(pc_index):
        """Axis label 'PC<n> (<var>%)' for a 0-based PC index."""
        return f'PC{pc_index + 1} ({variance[pc_index]}%)'

    def _plot_pc_pairs(labels, title, **extra):
        """Scatter each consecutive PC pair, coloured by `labels`.

        NOTE(review): the final iteration reads index `number_of_pcs` from
        both the scores and `variance` — confirm both hold at least
        number_of_pcs + 1 entries, otherwise this is an off-by-one
        (preserved from the original code).
        """
        for pc1 in range(0, number_of_pcs):
            pc2 = pc1 + 1
            print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
            p = hl.plot.scatter(
                pca_scores[pc1],
                pca_scores[pc2],
                label=labels,
                title=title,
                xlabel=_axis_label(pc1),
                ylabel=_axis_label(pc2),
                **extra,
            )
            show(p)

    print('Making PCA plots labelled by the study ID')
    _plot_pc_pairs(columns.TOB_WGS, 'TOB-WGS')

    print('Making PCA plots labelled by the continental population')
    labels = columns.hgdp_1kg_metadata.population_inference.pop
    pops = list(set(labels.collect()))
    _plot_pc_pairs(
        labels,
        'Continental Population',
        collect_all=True,
        colors=CategoricalColorMapper(palette=turbo(len(pops)), factors=pops),
        hover_fields={'s': columns.s},
    )

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))
    _plot_pc_pairs(
        labels,
        'Sub-Population',
        collect_all=True,
        colors=CategoricalColorMapper(palette=turbo(len(pops)), factors=pops),
    )