Example #1
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)

    # Perform kinship test with pc_relate
    pc_rel_path = output_path('pc_relate_kinship_estimate.ht')
    pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin')
    pc_rel.write(pc_rel_path, overwrite=True)
    pairs = pc_rel.filter(pc_rel['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame({
        'removed_individual':
        related_samples_to_remove.node.s.collect()
    }).to_html()
    plot_filename_html = output_path('removed_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
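
All of the examples in this listing assume a shared preamble that the page omits. Below is a minimal sketch of what it might look like; the imports are the ones the snippets visibly use, while the constants and the output_path() helper are placeholders (assumptions, not the project's real paths or API):

import re

import hail as hl
import numpy as np
import pandas as pd
from bokeh.embed import file_html
from bokeh.io.export import get_screenshot_as_png
from bokeh.models import CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Dark2, turbo
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.transform import factor_cmap

# Placeholder dataset paths; the real constants (TOB_WGS, SNP_CHIP, SCORES,
# EIGENVALUES, LOADINGS, ...) point at matrix tables and tables in GCS.
TOB_WGS = 'gs://<bucket>/tob_wgs.mt'
HGDP1KG_TOBWGS = 'gs://<bucket>/hgdp1kg_tobwgs_joined.mt'


def output_path(filename: str, category: str = 'default') -> str:
    """Sketch: map an output filename into the analysis bucket."""
    return f'gs://<bucket>/{category}/{filename}'

# Helpers such as lgt_to_gt, pc_project and manhattan_loadings are likewise
# assumed to be imported from gnomAD utilities or local plotting modules.
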
Example #2
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')

    # after densifying, keep only loci shared with the SNP-chip data
    # (union_cols below performs an inner join on rows)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #3
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
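    # Note (assumption, not in the original): checkpoint() would write and
    # read the result back in one step, so the PCA below could reuse the
    # on-disk copy instead of recomputing the join:
    # hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.checkpoint(mt_path, overwrite=True)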

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #4
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # Identify related samples (2nd degree or closer) with KING
    king = hl.king(mt.GT)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    king.write(king_path)
    ht = king.entries()
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(i=related_samples.s_1, j=related_samples.s)
    struct = struct.annotate(phi=related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i, struct.j, False  # pylint: disable=E1101
    )
    n_related_samples = related_samples_to_remove.count()
    print(f'related_samples_to_remove.count() = {n_related_samples}')
    # save as html
    html = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()}
    ).to_html()
    plot_filename_html = output_path('related_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
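
The example above computes the maximal independent set but never applies it. A hedged continuation, reusing the names from the example (to_remove and mt_unrelated are hypothetical):

# Drop the related samples identified above from the matrix table.
to_remove = related_samples_to_remove.key_by(s=related_samples_to_remove.node)
mt_unrelated = mt.anti_join_cols(to_remove)
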
Example #5
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get samples from the specified population only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # remove outlier samples, as identified by PCA
    outliers = [
        'TOB1734',
        'TOB1714',
        'TOB1126',
        'TOB1653',
        'TOB1668',
        'TOB1681',
        'TOB1116',
        'TOB1107',
        'TOB1635',
        'HG01628',
        'TOB1675',
        'TOB1125',
        'TOB1762',
        'TOB1263',
        'TOB1640',
        'HG01669',
        'TOB1795',
        'TOB1707',
        'HG01695',
        'HG01694',
        'TOB1673',
        'HG01630',
    ]

    mt = mt.filter_cols(hl.literal(outliers).contains(mt.s), keep=False)

    # Remove related samples at the 2nd degree or closer, as indicated by gnomAD
    mt = mt.filter_cols(mt.hgdp_1kg_metadata.gnomad_release
                        | mt.s.startswith('TOB'))

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #6
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(FILTERED_VARIANTS)
    nrows = mt.count_rows()
    print(f'mt.count_rows() = {nrows}')

    # Plot the allele frequency
    fig = figure(
        title='Variant AF',
        x_axis_label='Allele Frequency',
        y_axis_label='Frequency (%)',
    )
    variant_af = mt.variant_qc.AF[1].collect()
    af_count, edges = np.histogram(variant_af,
                                   bins=100,
                                   weights=np.ones(len(variant_af)) /
                                   len(variant_af))
    variant_af_count = pd.DataFrame({
        'variant_af_count': af_count,
        'left': edges[:-1],
        'right': edges[1:]
    })
    fig.quad(
        bottom=0,
        top=variant_af_count['variant_af_count'],
        left=variant_af_count['left'],
        right=variant_af_count['right'],
        fill_color='blue',
        line_color='black',
    )
    # Add in the cumulative distribution
    cumulative_af = np.cumsum(af_count)
    fig.line(
        x=variant_af_count['right'],
        y=cumulative_af,
        color='gray',
        line_width=1,
        legend_label='Cum dist',
    )
    fig.legend.location = 'top_left'
    fig_filename = output_path('variant_selection_histogram.png', 'web')
    with hl.hadoop_open(fig_filename, 'wb') as f:
        get_screenshot_as_png(fig).save(f, format='PNG')
    html = file_html(fig, CDN, 'my plot')
    fig_filename_html = output_path('variant_selection_histogram.html', 'web')
    with hl.hadoop_open(fig_filename_html, 'w') as f:
        f.write(html)
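
Collecting every allele frequency to the driver, as above, only works while the variant count stays modest. A sketch of an alternative that bins on the cluster with Hail's histogram aggregator (same mt, with variant_qc already annotated; the 100 bins over [0, 1] mirror the np.histogram call):

af_hist = mt.aggregate_rows(hl.agg.hist(mt.variant_qc.AF[1], 0, 1, 100))
# af_hist.bin_edges and af_hist.bin_freq correspond to np.histogram's edges
# and unweighted counts; divide by nrows to recover the fractions plotted above.
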
Example #7
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #8
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings = hl.read_table(LOADINGS)
    loadings = loadings.repartition(100, shuffle=False)
    loadings_path = output_path('gnomad_loadings_90k_liftover_repartitioned.ht')
    loadings.write(loadings_path)
Example #9
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')
    snp_chip = snp_chip.repartition(10000)
    snp_chip_path = output_path('snp_chip_10000_partitions.mt')
    snp_chip.write(snp_chip_path, overwrite=True)
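
A side note on the design choice: repartition() shuffles the whole dataset, which is what you want when raising the partition count as above. If the goal were instead to lower it, a sketch of the cheaper option:

snp_chip = snp_chip.naive_coalesce(100)  # merges adjacent partitions, no shuffle
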
Example #10
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.split_multi_hts(tob_wgs)
    tob_wgs_path = output_path('tob_wgs_plink')
    hl.export_plink(tob_wgs, tob_wgs_path, ind_id=tob_wgs.s)
Example #11
def query(rerun):
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    sample_qc_path = output_path('sample_qc.mt')
    if rerun or not hl.hadoop_exists(sample_qc_path):
        mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
        mt = mt.head(100, n_cols=100)
        mt_qc = hl.sample_qc(mt)
        mt_qc.write(sample_qc_path)
    mt_qc = hl.read_matrix_table(sample_qc_path)

    plot_filename = output_path('call_rate_plot.png', 'web')
    if rerun or not hl.hadoop_exists(plot_filename):
        call_rate_plot = hl.plot.histogram(mt_qc.sample_qc.call_rate,
                                           range=(0, 1),
                                           legend='Call rate')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(call_rate_plot).save(f, format='PNG')
Example #12
def main(mt_path: str):
    """
    Run VEP via the main.py wrapper.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(mt_path)
    # filter to biallelic loci only
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = mt.filter_rows(mt.alleles[1] != '*')
    vep = hl.vep(mt, config='file:///vep_data/vep-gcloud.json')
    vep_path = output_path('vep105_GRCh38.mt')
    vep.write(vep_path)
Example #13
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    # keep biallelic variants only (drops monomorphic and multiallelic sites)
    tob_wgs = tob_wgs.filter_rows(hl.len(tob_wgs.alleles) == 2)
    tob_wgs = tob_wgs.head(30000)
    ld = hl.ld_matrix(tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6)
    ld = pd.DataFrame(ld.to_numpy())
    # save pandas df
    ld_filename = output_path('ld_matrix.csv', 'analysis')
    ld.to_csv(ld_filename, index=False)
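
Materialising the LD matrix as a local pandas DataFrame only scales to small windows like the 30,000 variants kept above. A sketch of the distributed alternative, under the same assumptions:

# Write the BlockMatrix natively instead of collecting it to the driver.
ld_bm = hl.ld_matrix(tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6)
ld_bm.write(output_path('ld_matrix.bm'), overwrite=True)
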
Example #14
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)

    # Filter outliers and related samples
    mt = mt.semi_join_cols(scores)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(
        study=hl.if_else(mt.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))

    # inputs to the PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    labels = columns.study
    sample_names = columns.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    print('Making PCA plots labelled by study')
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        plot = figure(
            title='TOB-WGS + HGDP/1kG Dataset',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=pca_scores[pc1].collect(),
                y=pca_scores[pc2].collect(),
                label=labels.collect(),
                samples=sample_names.collect(),
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop.collect()
    labels = ['TOB-WGS' if x is None else x for x in labels]
    subpopulation = list(set(labels))
    # change ordering of subpopulations
    # so TOB-WGS is at the end and glyphs appear on top
    subpopulation.append(subpopulation.pop(subpopulation.index('TOB-WGS')))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=pca_scores[pc1].collect(),
                y=pca_scores[pc2].collect(),
                label=labels,
                samples=sample_names.collect(),
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(subpopulation)),
                              subpopulation),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpopulation_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpopulation_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #15
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)

    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path('pc_relate_global_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for pc_relate NFE samples
    ht = hl.read_table(PC_RELATE_ESTIMATE_NFE)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_nfe = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_nfe_matrix.csv', 'analysis')
    pc_relate_nfe.to_csv(filename, index=False)
    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path('pc_relate_nfe_maximal_independent_set.csv',
                           'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for KING NFE samples
    mt = hl.read_matrix_table(KING_ESTIMATE_NFE)
    ht = mt.entries()
    # remove entries where samples are identical, and keep kinship > 0.1
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.1))
    king_nfe = pd.DataFrame({
        'i_s': related_samples.s_1.collect(),
        'j_s': related_samples.s.collect(),
        'kin': related_samples.phi.collect(),
    })
    filename = output_path('king_nfe_matrix_90k.csv', 'analysis')
    king_nfe.to_csv(filename, index=False)
    # save KING NFE maximal independent set
    second_degree_related_samples = ht.filter(
        (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(i=second_degree_related_samples.s_1,
                       j=second_degree_related_samples.s)
    struct = struct.annotate(phi=second_degree_related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i,
        struct.j,
        False  # pylint: disable=E1101
    )
    related_samples = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()})
    filename = output_path(
        'king_90k_related_samples_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)
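
The "threshold the kinship, then take a maximal independent set" step repeats several times in this script. A small helper, sketched as an assumption rather than taken from the original (it expects a pc_relate-style pair table with fields i, j and a kinship field):

def samples_to_drop(pairs_ht, kin_field='kin', threshold=0.125):
    """Sample ids whose removal leaves no pair at or above the threshold."""
    pairs = pairs_ht.filter(pairs_ht[kin_field] >= threshold)
    return hl.maximal_independent_set(pairs.i, pairs.j, keep=False)
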
Example #16
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        study=hl.if_else(scores.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))
    sample_names = scores.s.collect()
    labels = scores.study.collect()
    study = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot by study
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Study',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], study),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by continental population
    hgdp1kg_tobwgs = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = scores.annotate(continental_pop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.population_inference.pop)
    labels = scores.continental_pop.collect()
    # Change TOB-WGS None values to 'TOB-NFE'
    labels = ['TOB-NFE' if x is None else x for x in labels]
    continental_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Continental Population',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(continental_population)),
                              continental_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'continental_pop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'continental_pop_pc{pc2}.html',
                                         'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by subpopulation
    scores = scores.annotate(subpop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.labeled_subpop)
    labels = scores.subpop.collect()
    labels = ['TOB-NFE' if x is None else x for x in labels]
    sub_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(sub_population)),
                              sub_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Plot loadings
    loadings_ht = hl.read_table(LOADINGS)
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = output_path(f'loadings_pc{pc}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
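
The eigenvalue-to-percent-variance boilerplate at the top of this example recurs in most of the plotting scripts here. A consolidating sketch, assuming the eigenvalues file is the single-column table these scripts import:

def percent_variance(eigenvalues_path):
    """Each eigenvalue as a percentage of total variance, rounded to 2 d.p."""
    ev = hl.import_table(eigenvalues_path).to_pandas()
    ev.columns = ['eigenvalue']
    ev = pd.to_numeric(ev.eigenvalue)
    return (ev / ev.sum() * 100).round(2)
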
Example #17
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    snp_chip_names = scores.s.collect()
    wgs_names = tob_wgs.s.collect()

    def sample_type(sample_name):
        return 'dual_sample' if sample_name in wgs_names else 'snp_chip_only'

    labels = list(map(sample_type, snp_chip_names))

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='SNP Chip Samples',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=snp_chip_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #18
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png',
                                    'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(
            mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend('null' for _ in range(
                len(heterozygous_samples) - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend('null' for _ in range(
                len(homozygous_alternate_samples) - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples':
            heterozygous_samples,
            'homozygous_alternate_samples':
            homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #19
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    scores = hl.read_table(SCORES)
    mt = mt.annotate_cols(scores=scores[mt.s].scores)
    mt = mt.annotate_cols(TOB_WGS=mt.s.contains('TOB'))

    # inputs to the PCA plot must all come from the same object
    columns = mt.cols()
    pca_scores = columns.scores
    labels = columns.TOB_WGS
    hover_fields = dict([('s', columns.s)])

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    print('Making PCA plots labelled by study')
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            pca_scores[pc1],
            pca_scores[pc2],
            label=labels,
            title='TOB-WGS',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            collect_all=True,
            hover_fields=hover_fields,
        )
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    print('Making PCA plots labelled by the subpopulation')
    labels = columns.hgdp_1kg_metadata.labeled_subpop
    pops = list(set(labels.collect()))

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            pca_scores[pc1],
            pca_scores[pc2],
            label=labels,
            title='Subpopulation',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            collect_all=True,
            colors=CategoricalColorMapper(palette=turbo(len(pops)),
                                          factors=pops),
        )
        plot_filename = output_path(f'subpopulation_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'subpopulation_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #20
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(cohort_sample_codes=hl.if_else(
        scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'))
    labels = scores.cohort_sample_codes
    hover_fields = dict([('s', scores.s)])

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            scores.scores[pc1],
            scores.scores[pc2],
            label=labels,
            title='TOB-WGS + TOB SNP Chip',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            hover_fields=hover_fields,
        )
        plot_filename = output_path('pc' + str(pc2) + '.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Get partner sample information
    sample_names = scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # save as html
    labels = list(map(sample_type, sample_names))
    html = pd.DataFrame({
        'sample_name': sample_names,
        'sample_tech': labels
    }).to_html()
    plot_filename_html = output_path('sample_technology.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Reprocessed Sample Projection',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path('technology_type_pc' + str(pc2) + '.png',
                                    'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'technology_type_pc{pc2}.html',
                                         'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #21
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    snp_chip = snp_chip.semi_join_rows(tob_wgs.rows())
    snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp')
    snp_chip = snp_chip.checkpoint(snp_chip_path)

    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5)

    scores_path = output_path('scores.ht', 'tmp')
    loadings_path = output_path('loadings.ht', 'tmp')
    scores = scores.checkpoint(scores_path)
    loadings = loadings.checkpoint(loadings_path)

    # make tob_wgs rows equivalent to the snp_chip rows
    tob_wgs = tob_wgs.semi_join_rows(snp_chip.rows())
    tob_wgs_path = output_path('tob_wgs_filtered_by_snp_chip.mt', 'tmp')
    tob_wgs = tob_wgs.checkpoint(tob_wgs_path)
    snp_chip = snp_chip.annotate_rows(
        af=hl.agg.mean(snp_chip.GT.n_alt_alleles()) / 2)
    loadings = loadings.annotate(af=snp_chip.rows()[loadings.key].af)
    # project WGS samples onto the SNP-chip PCA (pc_project is assumed to
    # come from the gnomAD utilities or an equivalent local helper)
    ht = pc_project(tob_wgs.GT, loadings.loadings, loadings.af)
    ht_path = output_path('pc_project_tob_wgs.ht', 'tmp')
    ht = ht.checkpoint(ht_path)
    scores = scores.key_by(s=scores.s + '_snp_chip')
    union_scores = ht.union(scores)
    variance = [(x / sum(eigenvalues) * 100) for x in eigenvalues]
    variance = [round(x, 2) for x in variance]

    # Get partner sample information
    sample_names = union_scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # plot
    labels = list(map(sample_type, sample_names))
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='TOB-WGS + TOB SNP Chip',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=union_scores.scores[pc1].collect(),
                y=union_scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        source_filename = output_path(f'source_{pc2}.ht', 'tmp')
        hl.Table.from_pandas(pd.DataFrame(source.data)).export(source_filename)
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
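
Finally, nearly every plotting example above ends with the same save-as-PNG-plus-HTML sequence. A helper sketched under the same assumptions as the preamble near the top (output_path, get_screenshot_as_png, file_html and CDN in scope):

def save_bokeh(fig, basename, category='web'):
    """Save a Bokeh figure as both a PNG screenshot and a standalone HTML page."""
    with hl.hadoop_open(output_path(f'{basename}.png', category), 'wb') as f:
        get_screenshot_as_png(fig).save(f, format='PNG')
    with hl.hadoop_open(output_path(f'{basename}.html', category), 'w') as f:
        f.write(file_html(fig, CDN, basename))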