import hail as hl
hl.init()

from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# Goal: find out whether any of the samples are in gnomAD.
# The per-sample count of singletons that are not flagged as being in gnomAD
# should be non-zero!

QC_HARDCALLS_MT = 'gs://dalio_bipolar_w1_w2_hail_02/data/mt/17_european.strict.hardcalls.mt'
GNOMAD_TSV = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/19_gnomAD_check.tsv'

mt = hl.read_matrix_table(QC_HARDCALLS_MT)

# Keep only the columns whose coarse phenotype is a bipolar case or a control.
coarse_phenotype = mt.phenotype.PHENOTYPE_COARSE
is_case = coarse_phenotype == "Bipolar Disorder"
is_control = coarse_phenotype == "Control"
mt = mt.filter_cols(is_case | is_control)

# A row is a singleton when the alternate-allele counts of the remaining
# columns add up to exactly one.
alt_allele_total = hl.agg.sum(mt.GT.n_alt_alleles())
mt = mt.annotate_rows(is_singleton=alt_allele_total == 1)
mt = mt.filter_rows(mt.is_singleton)

# Per-column number of singleton rows with a non-reference call.
carries_alt = mt.GT.is_non_ref()
mt = mt.annotate_cols(singleton_count=hl.agg.count_where(carries_alt))

# Drop rows flagged by annotation.inGnomAD_nonpsych, then count again so each
# column also records how many of its singletons were not flagged.
not_in_gnomad = ~mt.annotation.inGnomAD_nonpsych
mt = mt.filter_rows(not_in_gnomad & mt.is_singleton)
carries_alt = mt.GT.is_non_ref()
mt = mt.annotate_cols(not_inGnomAD_count=hl.agg.count_where(carries_alt))

# x axis: singletons not flagged as in gnomAD; y axis: all singletons.
scatter = hl.plot.scatter(mt.not_inGnomAD_count, mt.singleton_count)
show(scatter)

# Write the per-column phenotype and both counts to a flattened TSV.
per_sample = mt.cols().select(
    'phenotype', 'singleton_count', 'not_inGnomAD_count')
per_sample.flatten().export(GNOMAD_TSV)
Пример #2
0
# Keep only the rows of mt_common whose row key is present in pruned_t.
# NOTE(review): mt_pruned is not used by the PCA call below, which runs on
# mt_common.GT - confirm whether the PCA was meant to use the pruned table.
mt_pruned = mt_common.filter_rows(hl.is_defined(pruned_t[mt_common.row_key]))

# Calculate eigenvalues, PC scores (k=10) and loadings
eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt_common.GT,
                                                      k=10,
                                                      compute_loadings=True)
mt = mt.annotate_cols(scores=scores[mt.s].scores)
# The plot below reads `scores` from mt_common, so that table needs the
# annotation as well; without it `mt_common.scores` does not exist. The scores
# table was computed from mt_common, so it is joined on mt_common's column key.
mt_common = mt_common.annotate_cols(scores=scores[mt_common.col_key].scores)

# Plot first 2 PCs, labelling each point by the column field `Race`
pca = hl.plot.scatter(mt_common.scores[0],
                      mt_common.scores[1],
                      label=mt_common.Race,
                      title='PCA Caucasian',
                      xlabel='PC1',
                      ylabel='PC2')
show(pca)


######## 5.2 Optional: Project new samples on existing PCs
def pc_project(
        # reference: https://github.com/macarthur-lab/gnomad_hail/blob/master/utils/generic.py#L131
        mt: hl.MatrixTable,
        loadings_ht: hl.Table,
        loading_location: str = "loadings",
        af_location: str = "pca_af") -> hl.Table:
    n_variants = loadings_ht.count()
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key][loading_location],
        pca_af=loadings_ht[mt.row_key][af_location])
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings) & hl.is_defined(mt.pca_af)
# The MatrixTable can now be read back in to draw the Manhattan and QQ plots.
ADMIX_RESULTS_MT = 'gs://ukb-diverse-pops/AdmixedAfrEur/DosageFiles/UKBB_AfEur_QCed_lipids_dosages_admixfrac.mt'
mt = hl.read_matrix_table(ADMIX_RESULTS_MT)

# Note - for plotting, use regular workers rather than preemptible workers.

# In[23]:

# Manhattan plot for TC, anc0 (p-value index 2).
tc_p_values = mt.results.TC.p_value
manhattan_options = {'collect_all': False, 'significance_line': 5e-08}
p_TC0 = hl.plot.manhattan(
    tc_p_values[2],
    title='Admixed Afr-Eur UKBB, TC, anc0',
    **manhattan_options,
)
# (disabled option) colors=["#030303", "#7F7F7F"]
show(p_TC0)

# In[24]:

# Manhattan plot for TC, anc1 (p-value index 3).
p_TC1 = hl.plot.manhattan(
    tc_p_values[3],
    title='Admixed Afr-Eur UKBB, TC, anc1',
    **manhattan_options,
)
# (disabled option) colors=["#030303", "#7F7F7F"]
show(p_TC1)

# In[25]:

# QQ plot for TC, anc0 (same p-values as the first Manhattan plot).
p = hl.plot.qq(tc_p_values[2], title="QQ plot, TC, anc0")
Пример #4
0
'''
Plotting female hwe pval distributions to understand X chrom QC

 MOP = Kenya Moi
 AAP = Ethiopia
 KWP = Kenya Kemri
 CTP = South Africa
 MAP = Uganda
'''

# Each histogram below bins the `log10_hwe_pval` field of one entry of
# female_list over the range 0-20.
# NOTE(review): assumes female_list follows the cohort order listed in the key
# above (index 0 = MOP, 1 = AAP, 2 = KWP) - confirm where the list is built.

# Plotting Kenya Moi
p = hl.plot.histogram(female_list[0].log10_hwe_pval,
                      legend="HWE -log10 p-val",
                      range=(0, 20),
                      title="Kenya, Moi -log10 HWE p-val Before Prelim QC")
show(p)

# Plotting Ethiopia
# The title used to say "Uganda", which contradicted both this section's
# heading and the cohort at index 1 in the key above.
p = hl.plot.histogram(female_list[1].log10_hwe_pval,
                      legend="HWE -log10 p-val",
                      range=(0, 20),
                      title="Ethiopia -log10 HWE p-val Before Prelim QC")
show(p)

# Plotting KEMRI
p = hl.plot.histogram(female_list[2].log10_hwe_pval,
                      legend="HWE -log10 p-val",
                      range=(0, 20),
                      title="KEMRI -log10 HWE p-val Before Prelim QC")
show(p)
Пример #5
0
def _show_consecutive_pc_plots(pca_scores, variance, number_of_pcs,
                               **scatter_kwargs):
    """Show one scatter plot for each pair of neighbouring PCs.

    For every ``i`` in ``range(number_of_pcs)`` the plot has ``pca_scores[i]``
    on the x axis and ``pca_scores[i + 1]`` on the y axis. Each axis label
    carries the matching entry of ``variance`` as a percentage.

    Args:
        pca_scores: Indexable expression holding the per-sample PC scores.
        variance: Percent variance, indexable by zero-based PC position.
        number_of_pcs: Number of plots to draw.
        **scatter_kwargs: Passed unchanged to ``hl.plot.scatter`` (label,
            title, colors, ...).
    """
    for pc1 in range(number_of_pcs):
        pc2 = pc1 + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            pca_scores[pc1],
            pca_scores[pc2],
            xlabel=f'PC{pc1 + 1} ({variance[pc1]!s}%)',
            ylabel=f'PC{pc2 + 1} ({variance[pc2]!s}%)',
            **scatter_kwargs,
        )
        show(p)


def main(number_of_pcs: int):  # pylint: disable=too-many-locals
    """Query script entry point.

    Reads the matrix table at ``HGDP1KG_TOBWGS``, attaches the PCA scores read
    from ``SCORES`` and shows three series of consecutive-PC scatter plots,
    labelled by:

    1. whether the sample ID contains ``'TOB'``,
    2. ``hgdp_1kg_metadata.population_inference.pop``,
    3. ``hgdp_1kg_metadata.labeled_subpop``.

    Args:
        number_of_pcs: Number of plots drawn per series. Plot ``i`` compares
            PC ``i + 1`` with PC ``i + 2``, so the last plot reads index
            ``number_of_pcs``: the scores and the eigenvalue file must hold at
            least ``number_of_pcs + 1`` entries.
            NOTE(review): if this is meant to be the total number of PCs
            available, the loops overrun by one - confirm against the caller.
    """

    hl.init()

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores

    # get percent variance explained
    eigenvalues = pd.read_csv(EIGENVALUES)
    eigenvalues.columns = ['eigenvalue']
    # Sum the single column directly: calling float() on the one-element
    # Series returned by DataFrame.sum() is deprecated in recent pandas.
    total = float(eigenvalues['eigenvalue'].sum())
    variance = (eigenvalues['eigenvalue'].divide(total) * 100).round(2)

    print('Making PCA plots labelled by the study ID')
    _show_consecutive_pc_plots(
        pca_scores,
        variance,
        number_of_pcs,
        label=columns.TOB_WGS,
        title='TOB-WGS',
    )

    print('Making PCA plots labelled by the continental population')
    labels = columns.hgdp_1kg_metadata.population_inference.pop
    # Distinct label values, used as the categories of the colour mapper.
    pops = list(set(labels.collect()))
    hover_fields = {'s': columns.s}
    _show_consecutive_pc_plots(
        pca_scores,
        variance,
        number_of_pcs,
        label=labels,
        title='Continental Population',
        collect_all=True,
        colors=CategoricalColorMapper(palette=turbo(len(pops)),
                                      factors=pops),
        hover_fields=hover_fields,
    )

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))
    _show_consecutive_pc_plots(
        pca_scores,
        variance,
        number_of_pcs,
        label=labels,
        title='Sub-Population',
        collect_all=True,
        colors=CategoricalColorMapper(palette=turbo(len(pops)),
                                      factors=pops),
    )