# Example 1
"""
Estimate kinship coefficient using KING on NFE samples from the HGDP/1KG dataset.
"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path

HGDP1KG_TOBWGS = bucket_path(
    '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt'
)


def query():
    """Query script entry point.

    Filters the HGDP/1kG + TOB-WGS matrix table to NFE and TOB-WGS
    samples, then computes KING kinship estimates and keeps the
    closely-related sample pairs.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Keep inferred NFE (non-Finnish European) samples plus TOB-WGS
    # samples, which are identified by 'TOB' in the sample name.
    mt = mt.filter_cols(
        (mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
        | (mt.s.contains('TOB'))
    )
    # Remove related samples (at the 2nd degree or closer)
    king = hl.king(mt.GT)
    king_path = output_path('king_kinship_estimate_NFE.ht')
    king.write(king_path)
    ht = king.entries()
    # phi > 0.125 is the conventional cutoff for 2nd-degree relatives
    # or closer; the s_1 != s test drops self-comparisons.
    related_samples = ht.filter((ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    # Collect pairs as (i, j, phi) structs for downstream reporting.
    struct = hl.struct(i=related_samples.s_1, j=related_samples.s)
    struct = struct.annotate(phi=related_samples.phi)
"""Create PCA plots for the combined TOB-WGS/SNP-chip data"""

import re
from bokeh.io.export import get_screenshot_as_png
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.transform import factor_cmap
from bokeh.plotting import ColumnDataSource, figure
from bokeh.palettes import Dark2  # pylint: disable=no-name-in-module
import pandas as pd
import hail as hl
import click
from analysis_runner import bucket_path, output_path

SCORES = bucket_path('tob_wgs_snp_chip_variant_pca/v6/scores.ht/')
EIGENVALUES = bucket_path('tob_wgs_snp_chip_variant_pca/v6/eigenvalues.ht')


@click.command()
def query():  # pylint: disable=too-many-locals
    """Query script entry point.

    Loads combined TOB-WGS / SNP-chip PCA scores and labels each sample
    with its cohort of origin, in preparation for plotting.
    """

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    # Samples from the SNP-chip cohort carry 'snp_chip' in their sample
    # name; everything else is assumed to be TOB-WGS.
    scores = scores.annotate(cohort_sample_codes=hl.if_else(
        scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'))
    labels = scores.cohort_sample_codes
    # Show the sample ID on hover in the plots below.
    hover_fields = dict([('s', scores.s)])

    # get percent variance explained
"""QC of newly-selected variants"""

import click
import hail as hl
import numpy as np
import pandas as pd
from analysis_runner import bucket_path, output_path
from bokeh.plotting import figure
from bokeh.io.export import get_screenshot_as_png
from bokeh.resources import CDN
from bokeh.embed import file_html

FILTERED_VARIANTS = bucket_path(
    'tob_wgs_hgdp_1kg_variant_selection/v8/tob_wgs_hgdp_1kg_filtered_variants.mt'
)


@click.command()
def query():  # pylint: disable=too-many-locals
    """Query script entry point.

    QC of the newly-selected variants: reports the variant count and
    plots the allele-frequency distribution.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(FILTERED_VARIANTS)
    # Report how many variants survived the selection step.
    nrows = mt.count_rows()
    print(f'mt.count_rows() = {nrows}')

    # Plot the allele frequency
    fig = figure(
        title='Variant AF',
        x_axis_label='Allele Frequency',
# Example 4
"""Export TOB-WGS joint callset as PLINK format"""

import hail as hl
from analysis_runner import bucket_path, output_path

TOB_WGS = bucket_path('mt/v5.1.mt/')


def query():
    """Query script entry point.

    Densifies the TOB-WGS joint callset, splits multiallelic sites,
    and exports the result in PLINK format.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(TOB_WGS)
    # Densify the sparse callset first; PLINK also requires biallelic
    # records, hence the multiallelic split.
    mt = hl.experimental.densify(mt)
    mt = hl.split_multi_hts(mt)
    plink_out = output_path('tob_wgs_plink')
    hl.export_plink(mt, plink_out, ind_id=mt.s)


if __name__ == '__main__':
    query()
"""
Generate PCA on SNP-chip data only.
"""

import click
import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path

SNP_CHIP = bucket_path('snpchip/v1/snpchip_grch38.mt')


@click.command()
def query():
    """Query script entry point.

    Runs a HWE-normalised PCA (top 5 PCs) on the SNP-chip genotypes and
    writes eigenvalues, scores, and loadings to the output bucket.
    """

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    # Destinations for the three PCA outputs.
    out_eigenvalues = output_path('eigenvalues.ht')
    out_scores = output_path('scores.ht')
    out_loadings = output_path('loadings.ht')
    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5
    )
    # Eigenvalues come back as a plain list; round-trip through pandas
    # so they can be exported as a table.
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(out_eigenvalues)
    scores.write(out_scores, overwrite=True)
    loadings.write(out_loadings, overwrite=True)


if __name__ == '__main__':
# Example 6
"""Plot PCA loadings for HGDP/1kG + TOB-WGS samples"""

from bokeh.models import CategoricalColorMapper, HoverTool
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure
from bokeh.embed import file_html
from bokeh.resources import CDN
from analysis_runner import bucket_path, output_path
import hail as hl
import pandas as pd

LOADINGS = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/loadings.ht/')
GTF_FILE = 'gs://hail-common/references/gencode/gencode.v29.annotation.gtf.bgz'
SCORES = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/scores.ht/')
HGDP1KG_TOBWGS = bucket_path(
    '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt'
)


def manhattan_loadings(
    iteration,
    gtf,
    loadings,
    title=None,
    size=4,
    hover_fields=None,
    collect_all=False,
    n_divisions=500,
):
    """Manhattan-style plot of PCA loadings, adapted from Hail's
    manhattan plot.

    :param iteration: index of the PC whose loadings are plotted —
        TODO confirm against the (truncated) body
    :param gtf: gene annotation table (presumably parsed from GTF_FILE);
        verify against caller
    :param loadings: Hail table of PCA loadings
    :param title: plot title
    :param size: marker size
    :param hover_fields: extra fields to show in the hover tooltip
    :param collect_all: passed through to Hail's plotting downsampler
    :param n_divisions: downsampling resolution
    """
    palette = [
# Example 7
"""Create PCA plots for the combined TOB-WGS/SNP-chip data"""

from bokeh.io.export import get_screenshot_as_png
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.transform import factor_cmap
from bokeh.plotting import ColumnDataSource, figure
import pandas as pd
import hail as hl
import click
from analysis_runner import bucket_path, output_path

SCORES = bucket_path('tob_snp_chip_pca/v0/scores.ht')
EIGENVALUES = bucket_path('tob_snp_chip_pca/v0/eigenvalues.ht')
TOB_WGS = bucket_path('mt/v3-raw.mt')


@click.command()
def query():
    """Query script entry point.

    Loads SNP-chip PCA scores and the TOB-WGS callset, and classifies
    each SNP-chip sample by whether it also appears in the WGS data.
    """

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    # Sample IDs present in each dataset, used for classification below.
    snp_chip_names = scores.s.collect()
    wgs_names = tob_wgs.s.collect()

    def sample_type(sample_name):
        # A sample present in the WGS cohort too is a 'dual_sample';
        # otherwise it exists only on the SNP chip.
        return 'dual_sample' if sample_name in wgs_names else 'snp_chip_only'
# Example 8
"""
Create PCA plots for samples from the HGDP/1kG + TOB-WGS datasets,
removing outliers.
"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path
from bokeh.io.export import get_screenshot_as_png
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.plotting import ColumnDataSource, figure
from bokeh.transform import factor_cmap
from bokeh.palettes import turbo  # pylint: disable=no-name-in-module

HGDP1KG_TOBWGS = bucket_path(
    '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt'
)
SCORES = bucket_path('tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/scores.ht/')
EIGENVALUES = bucket_path(
    'tob_wgs_hgdp_1kg_nfe_pca_new_variants/v9/eigenvalues.ht')


def query():
    """Query script entry point.

    Loads the HGDP/1kG + TOB-WGS matrix table and its PCA scores in
    preparation for removing outliers and related samples.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)

    # Filter outliers and related samples
# Example 9
"""Calculate ld using the ld_matrix function"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path

TOB_WGS = bucket_path('mt/v7.mt/')


def query():
    """Query script entry point.

    Computes a pairwise LD matrix over the first 30,000 biallelic
    variants of the densified TOB-WGS dataset and saves it as CSV.
    """

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    # Keep biallelic sites only; this also drops invariant rows
    # (single-allele sites) that would have zero variance.
    tob_wgs = tob_wgs.filter_rows(hl.len(tob_wgs.alleles) == 2)
    # Limit to the first 30k variants to keep the matrix tractable.
    tob_wgs = tob_wgs.head(30000)
    # LD within a 2 Mb window around each locus.
    ld = hl.ld_matrix(tob_wgs.GT.n_alt_alleles(), tob_wgs.locus, radius=2e6)
    ld = pd.DataFrame(ld.to_numpy())
    # Save pandas df. Plain string literal here: the original used an
    # f-string with no placeholders (pylint W1309).
    ld_filename = output_path('ld_matrix.csv', 'analysis')
    ld.to_csv(ld_filename, index=False)


if __name__ == '__main__':
    query()
# Example 10
"""QC of newly-selected variants"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path
from bokeh.io.export import get_screenshot_as_png
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.transform import factor_cmap
from bokeh.plotting import ColumnDataSource, figure
from bokeh.palettes import turbo  # pylint: disable=no-name-in-module
from bokeh.models import CategoricalColorMapper, HoverTool

HGDP1KG_TOBWGS = bucket_path(
    '1kg_hgdp_densified_pca_new_variants/v0/hgdp1kg_tobwgs_joined_all_samples.mt'
)
SCORES = bucket_path('1kg_hgdp_densified_pca_new_variants/v0/scores.ht/')
EIGENVALUES = bucket_path(
    '1kg_hgdp_densified_pca_new_variants/v0/eigenvalues.ht')
LOADINGS = bucket_path('1kg_hgdp_densified_pca_new_variants/v0/loadings.ht/')


def manhattan_loadings(
    pvals,
    locus=None,
    title=None,
    size=4,
    hover_fields=None,
    collect_all=False,
    n_divisions=500,
):
"""
Perform PCA on densified TOB-WGS data. Reliant on output from
```
hgdp1kg_tobwgs_densified_pca_new_variants/
hgdp_1kg_tob_wgs_densified_pca_new_variants.py
```
"""

import hail as hl
import pandas as pd
from hail.experimental import lgt_to_gt
from analysis_runner import bucket_path, output_path

TOB_WGS = bucket_path('1kg_hgdp_densify_new_variants/v0/tob_wgs_filtered.mt/')
GNOMAD_HGDP_1KG_MT = ('gs://gcp-public-data--gnomad/release/3.1/mt/genomes/'
                      'gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt')


def query():
    """Query script entry point.

    Aligns the gnomAD HGDP/1kG subset with the densified, filtered
    TOB-WGS matrix table so the two can be combined for PCA.
    """

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
"""
Save scores of related individuals after running pc_relate.

"""

import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path

PC_RELATE_ESTIMATE_NFE = bucket_path(
    'tob_wgs_hgdp_1kg_nfe_pc_relate/v0/pc_relate_kinship_estimate.ht')
PC_RELATE_ESTIMATE_GLOBAL = bucket_path(
    'tob_wgs_hgdp_1kg_pc_relate/v0/pc_relate_kinship_estimate.ht')
KING_ESTIMATE_NFE = bucket_path('king/v0/king_kinship_estimate_NFE.ht')


def query():
    """Query script entry point.

    Saves kinship estimates for related sample pairs (pc_relate, global
    populations) to a CSV in the analysis bucket.
    """

    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    # kin > 0.1 keeps pairs related at roughly the 2nd degree or closer
    # — TODO confirm the cutoff against the pipeline's conventions.
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    # Plain string literal: the original used an f-string with no
    # placeholders (pylint W1309).
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)
"""

import re
import hail as hl
import pandas as pd
from analysis_runner import bucket_path, output_path
from hail.experimental import pc_project
from hail.experimental import lgt_to_gt
from bokeh.plotting import ColumnDataSource, figure
from bokeh.palettes import Dark2  # pylint: disable=no-name-in-module
from bokeh.transform import factor_cmap
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.io.export import get_screenshot_as_png

SNP_CHIP = bucket_path(
    'tob_wgs_snp_chip_pca/increase_partitions/v2/snp_chip_10000_partitions.mt')
TOB_WGS = bucket_path('mt/v3-raw.mt')


def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    snp_chip = snp_chip.semi_join_rows(tob_wgs.rows())
    snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp')
    snp_chip = snp_chip.checkpoint(snp_chip_path)