Example #1
    def test_hadoop_exists(self):
        with hadoop_open(f'{BUCKET}/test_exists.txt', 'w') as f:
            f.write("HELLO WORLD")

        r_exists = f'{BUCKET}/test_exists.txt'
        r_not_exists = f'{BUCKET}/not_exists.txt'
        self.assertTrue(hl.hadoop_exists(r_exists))
        self.assertFalse(hl.hadoop_exists(r_not_exists))
Example #2
    def test_hadoop_exists(self, bucket=None):
        if bucket is None:
            bucket = self.remote_bucket

        with hadoop_open(f'{bucket}/test_exists.txt', 'w') as f:
            f.write("HELLO WORLD")

        r_exists = f'{bucket}/test_exists.txt'
        r_not_exists = f'{bucket}/not_exists.txt'
        self.assertTrue(hl.hadoop_exists(r_exists))
        self.assertFalse(hl.hadoop_exists(r_not_exists))
Example #3
    def test_hadoop_exists(self, prefix: Optional[str] = None):
        if prefix is None:
            prefix = self.remote_tmpdir

        with hadoop_open(f'{prefix}/test_exists.txt', 'w') as f:
            f.write("HELLO WORLD")

        r_exists = f'{prefix}/test_exists.txt'
        r_not_exists = f'{prefix}/not_exists.txt'
        self.assertTrue(hl.hadoop_exists(r_exists))
        self.assertFalse(hl.hadoop_exists(r_not_exists))
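
Examples #1 to #3 exercise the same idiom with progressively more configurable path prefixes: write a small object with hadoop_open, then assert hl.hadoop_exists on the written path and on a path that was never created. A minimal standalone sketch of that idiom, where TMP_PREFIX is a hypothetical writable gs:// or local prefix:

import hail as hl

TMP_PREFIX = 'gs://my-bucket/tmp'  # hypothetical writable prefix

path = f'{TMP_PREFIX}/test_exists.txt'
with hl.hadoop_open(path, 'w') as f:
    f.write('HELLO WORLD')

assert hl.hadoop_exists(path)
assert not hl.hadoop_exists(f'{TMP_PREFIX}/not_exists.txt')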
Example #4
def get_liftover_v2_qc_mt(data_type: str,
                          ld_pruned: bool,
                          release_only: bool = False,
                          overwrite: bool = False) -> hl.MatrixTable:
    """
    Returns MatrixTable for sample QC purposes on build 38: can be exomes, genomes, or joint (joint dataset can also be ld_pruned=True)
    Criteria: callrate > 0.99, AF > 0.001, SNPs only, bi-allelics only
    Note: sites where the locus changes chromosome are discarded
    """
    path = qc_mt_path(data_type, ld_pruned, 'GRCh38')
    if not overwrite and hl.hadoop_exists(path):
        grch38_qc_mt = hl.read_matrix_table(path)
    else:
        grch38_qc_mt = hl.read_matrix_table(
            qc_mt_path(data_type, ld_pruned=ld_pruned))
        get_liftover_genome(grch38_qc_mt)
        grch38_qc_mt = grch38_qc_mt.key_rows_by()
        grch38_qc_mt = grch38_qc_mt.transmute_rows(locus=hl.liftover(
            grch38_qc_mt.locus, 'GRCh38'),
                                                   locus37=grch38_qc_mt.locus)
        grch38_qc_mt = grch38_qc_mt.filter_rows(
            grch38_qc_mt.locus.contig == 'chr' + grch38_qc_mt.locus37.contig)
        grch38_qc_mt = grch38_qc_mt.key_rows_by(locus=grch38_qc_mt.locus,
                                                alleles=grch38_qc_mt.alleles)
        grch38_qc_mt = grch38_qc_mt.checkpoint(path, overwrite=overwrite)

    if release_only:
        meta = get_gnomad_meta(data_type)
        grch38_qc_mt = grch38_qc_mt.filter_cols(
            meta[grch38_qc_mt.col_key].release)

    return grch38_qc_mt
Example #5
def to_plink(pops: list,
             subsets_dir,
             mt,
             ht_sample,
             bfile_path,
             export_varid: bool = True,
             overwrite=False):
    r'''
    Exports matrix table to PLINK2 files
    NOTE: These files will need to be split up by chromosome before plink_clump.py
    can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    if not overwrite and all([
            hl.hadoop_exists(f'{bfile_path}.{suffix}')
            for suffix in ['bed', 'bim']
    ]):
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
    else:
        print(f'Saving to bfile prefix {bfile_path}')
        mt_sample = mt.annotate_rows(varid=hl.str(mt.locus) + ':' +
                                     mt.alleles[0] + ':' + mt.alleles[1])
        mt_sample = mt_sample.filter_cols(hl.is_defined(
            ht_sample[mt_sample.s]))
        hl.export_plink(dataset=mt_sample,
                        output=bfile_path,
                        ind_id=mt_sample.s,
                        varid=mt_sample.varid)  # varid used to be rsid
Example #6
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #7
def main(df_x_path, df_y_path, output_path, python_image):
    backend = hb.ServiceBackend()
    b = hb.Batch(name='rf-loo', default_python_image=python_image)

    with hl.hadoop_open(df_y_path) as f:
        local_df_y = pd.read_table(f, header=0, index_col=0)

    df_x_input = b.read_input(df_x_path)
    df_y_input = b.read_input(df_y_path)

    results = []

    for window in local_df_y.index.to_list():
        checkpoint = checkpoint_path(window)
        if hl.hadoop_exists(checkpoint):
            result = b.read_input(checkpoint)
            results.append(result)
            continue

        j = b.new_python_job()

        result = j.call(random_forest, df_x_input, df_y_input, window)
        tsv_result = j.call(as_tsv, result)
        tsv_result = tsv_result.as_str()

        b.write_output(tsv_result, checkpoint)
        results.append(tsv_result)

    output = hb.concatenate(b, results)
    b.write_output(output, output_path)

    b.run(wait=False)
    backend.close()
Example #8
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None:
    """
    Creates a rank file for 2.0.2 RF and writes it to its correct location.

    :param str data_type: One of 'exomes' or 'genomes'
    :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB))
    :return: Nothing
    :rtype: None
    """
    logger.info(
        f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}")

    if not hl.hadoop_exists(
            f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'):
        ht = hl.import_table(get_2_0_2_rf_path(data_type, beta),
                             types={'chrom': hl.tstr},
                             impute=True,
                             min_partitions=1000)
        if 'chrom' in ht.row:
            ht = ht.transmute(locus=hl.locus(ht.chrom, ht.pos),
                              alleles=[ht.ref, ht.alt])
        else:
            ht = ht.transmute(
                v=hl.parse_variant(ht.v),
                rfprob=ht.rf_rpob_tp  # Yes, this is awful
            )
            ht = ht.transmute(locus=ht.v.locus, alleles=ht.v.alleles)

        ht = ht.key_by('locus', 'alleles')

        gnomad_ht = get_gnomad_annotations(data_type)
        ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rfprob)

        ht.write(
            f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')
    ht = hl.read_table(
        f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')
    ht = add_rank(ht,
                  score_expr=1 - ht.score,
                  subrank_expr={
                      'singleton_rank':
                      ht.singleton,
                      'biallelic_rank':
                      ~ht.was_split,
                      'biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton,
                      'adj_rank':
                      ht.ac > 0,
                      'adj_biallelic_rank':
                      ~ht.was_split & (ht.ac > 0),
                      'adj_singleton_rank':
                      ht.singleton & (ht.ac > 0),
                      'adj_biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton & (ht.ac > 0)
                  })

    ht.write(score_ranking_path(data_type,
                                'rf_2.0.2{}'.format('_beta' if beta else '')),
             overwrite=True)
Example #9
def get_files_in_parent_directory(parent_dir,
                                  fname: str = 'variant_results.ht'):
    all_outputs = []
    for directory in parent_dir:
        if not directory['is_dir']:
            continue
        file_path = f'{directory["path"]}/{fname}'
        if hl.hadoop_exists(f'{file_path}/_SUCCESS'):
            all_outputs.append(file_path)
    return all_outputs
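
get_files_in_parent_directory expects the listing produced by hl.utils.hadoop_ls (a list of dicts carrying 'path' and 'is_dir' keys, as also used in Example #25 below) and keeps only sub-directories whose variant_results.ht contains a _SUCCESS marker. A hedged usage sketch with a hypothetical results prefix:

import hail as hl

# hypothetical parent directory containing one sub-directory per analysis run
listing = hl.utils.hadoop_ls('gs://my-bucket/results/')
finished = get_files_in_parent_directory(listing)
print(f'{len(finished)} runs have a completed variant_results.ht')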
Example #10
def query(rerun):
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    sample_qc_path = output_path('sample_qc.mt')
    if rerun or not hl.hadoop_exists(sample_qc_path):
        mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
        mt = mt.head(100, n_cols=100)
        mt_qc = hl.sample_qc(mt)
        mt_qc.write(sample_qc_path)
    mt_qc = hl.read_matrix_table(sample_qc_path)

    plot_filename = output_path('call_rate_plot.png', 'web')
    if rerun or not hl.hadoop_exists(plot_filename):
        call_rate_plot = hl.plot.histogram(mt_qc.sample_qc.call_rate,
                                           range=(0, 1),
                                           legend='Call rate')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(call_rate_plot).save(f, format='PNG')
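
Example #10 uses hl.hadoop_exists as a cheap checkpoint guard: the expensive step only runs when its output path is missing or a rerun is explicitly requested, and the result is then read back from disk. A small generic sketch of that guard, assuming a hypothetical compute callable:

import hail as hl

def checkpoint_mt(path: str, compute, rerun: bool = False) -> hl.MatrixTable:
    """Write compute() to `path` unless it already exists, then read it back."""
    if rerun or not hl.hadoop_exists(path):
        compute().write(path, overwrite=True)
    return hl.read_matrix_table(path)

# hypothetical usage: cache a sample-QC MatrixTable next to the analysis outputs
# mt_qc = checkpoint_mt('gs://my-bucket/sample_qc.mt', lambda: hl.sample_qc(mt))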
Example #11
    def test_hadoop_mkdir_p(self):
        test_text = "HELLO WORLD"

        with hadoop_open(resource('./some/foo/bar.txt'), 'w') as out:
            out.write(test_text)

        self.assertTrue(hl.hadoop_exists(resource('./some/foo/bar.txt')))

        with hadoop_open(resource('./some/foo/bar.txt')) as f:
            assert (f.read() == test_text)

        hl.current_backend().fs.rmtree(resource('./some'))
Example #12
def main(args):
    hl.init(default_reference='GRCh38', log='/load_results.log')
    start_time = time.time()
    all_phenos_ht = hl.import_table('gs://finngen-public-data-r2/summary_stats/r2_manifest.tsv', impute=True)
    # all_phenos_ht = all_phenos_ht.annotate(code=all_phenos_ht.phenocode.split('_', 2)[0])
    all_phenos = all_phenos_ht.collect()

    backend = pipeline.BatchBackend(billing_project='ukb_round2')
    # backend = pipeline.LocalBackend(gsa_key_file='/Users/konradk/.hail/ukb-diverse-pops.json')
    p = pipeline.Pipeline(name='finngen_load', backend=backend,
                          default_image='gcr.io/ukbb-exome-pharma/hail_utils:3.3',
                          default_storage='500Mi', default_cpu=8)

    tasks = []
    for i, pheno in enumerate(all_phenos):
        variant_results_ht_path = f'{results_dir}/ht/{pheno.phenocode}.ht'
        if not args.overwrite_results and hl.hadoop_exists(f'{variant_results_ht_path.replace(".ht", ".mt")}/_SUCCESS'):
            continue
        t: pipeline.pipeline.Task = p.new_task(name='load_pheno', attributes={'pheno': pheno.phenocode}).cpu(args.n_threads)
        t.command(f"""
        PYTHONPATH=$PYTHONPATH:/ PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=24g pyspark-shell"
        python3 /ukb_exomes/hail/load_finngen_results_hail.py
        --input_file {pheno.path_bucket} --n_threads {args.n_threads}
        --load_single --vep_path {vep_path}
        --additional_dict {shq(json.dumps(dict(pheno)))}
        --output_ht {variant_results_ht_path}
        --output_mt {variant_results_ht_path.replace('.ht', '.mt')}
        --overwrite
        """.replace('\n', ' '))
        tasks.append(t)
        if args.limit and i == args.limit:
            break

    t: pipeline.pipeline.Task = p.new_task(name='combine').cpu(args.n_threads)

    t.depends_on(*tasks)
    t.command(f"""
    PYTHONPATH=$PYTHONPATH:/ PYSPARK_SUBMIT_ARGS="--conf spark.driver.memory=4g --conf spark.executor.memory=24g pyspark-shell"
    python3 /ukb_exomes/hail/load_finngen_results_hail.py --combine_all
    --input_directory {results_dir}/ht
    --output_ht {final_results_ht}
    --output_mt {final_results_ht.replace('.ht', '.mt')}
    --overwrite --n_threads {args.n_threads}
    """.replace('\n', ' '))

    logger.info(f'Setup took: {time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))}')
    logger.info(f'Submitting: {get_tasks_from_pipeline(p)}')
    p.run(dry_run=args.dry_run, verbose=True, delete_scratch_on_exit=False)
    logger.info(f'Finished: {get_tasks_from_pipeline(p)}')
Example #13
def file_exists(fname: str) -> bool:
    """
    Check whether a file exists.
    Supports either local or Google cloud (gs://) paths.
    If the file is a Hail file (.ht, .mt extensions), it checks that _SUCCESS is present.

    :param fname: File name
    :return: Whether the file exists
    """
    fext = os.path.splitext(fname)[1]
    if fext in [".ht", ".mt"]:
        fname += "/_SUCCESS"
    if fname.startswith("gs://"):
        return hl.hadoop_exists(fname)
    else:
        return os.path.isfile(fname)
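
A quick usage note for the file_exists helper above: for Hail outputs (.ht/.mt) it checks for the _SUCCESS marker inside the directory, and for everything else it falls back to hl.hadoop_exists on gs:// paths or os.path.isfile locally. The paths below are hypothetical:

print(file_exists('gs://my-bucket/annotations.ht'))  # True only if .../annotations.ht/_SUCCESS exists
print(file_exists('gs://my-bucket/summary.tsv'))     # plain gs:// object, uses hl.hadoop_exists
print(file_exists('/tmp/notes.txt'))                 # local path, uses os.path.isfile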
Example #14
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')

    # filter to loci that are contained in both tables and the loadings after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    hgdp_1kg = hgdp_1kg.filter_rows(
        hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles']))
        & hl.is_defined(
            tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles'])))
    tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt'
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
    hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path)

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #15
def file_exists(fname: str) -> bool:
    """
    Check whether a file exists.
    Supports either local or Google cloud (gs://) paths.
    If the file is a Hail file (.ht, .mt extensions), it checks that _SUCCESS is present.
    :param str fname: File name
    :return: Whether the file exists
    :rtype: bool
    """
    _, fext = os.path.splitext(fname)
    if fext in ['.ht', '.mt']:
        fname = os.path.join(fname, '_SUCCESS')

    if fname.startswith('gs://'):
        return hl.hadoop_exists(fname)
    else:
        return os.path.isfile(fname)
Example #16
def create_rf_rank(data_type: str, run_hash: str) -> None:
    """
    Creates a ranked table for a RF run and writes it to its correct location in annotations.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str run_hash: RF run hash
    :return: Nothing
    :rtype: None
    """
    logger.info(f"Creating rank file for {data_type} RF run {run_hash}")

    if not hl.hadoop_exists(
            f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht/_SUCCESS'):
        gnomad_ht = get_gnomad_annotations(data_type)
        ht = hl.read_table(rf_path(data_type, 'rf_result', run_hash=run_hash))
        ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rf_probability['TP'])

        # Write to temp location as result will be overwritten
        ht.write(f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht',
                 overwrite=True)
    ht = hl.read_table(f'gs://gnomad-tmp/gnomad_{data_type}_rf_{run_hash}.ht')

    ht = add_rank(ht,
                  score_expr=1 - ht.score,
                  subrank_expr={
                      'singleton_rank':
                      ht.singleton,
                      'biallelic_rank':
                      ~ht.was_split,
                      'biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton,
                      'adj_rank':
                      ht.ac > 0,
                      'adj_biallelic_rank':
                      ~ht.was_split & (ht.ac > 0),
                      'adj_singleton_rank':
                      ht.singleton & (ht.ac > 0),
                      'adj_biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton & (ht.ac > 0)
                  })
    ht.write(rf_path(data_type, 'rf_result', run_hash=run_hash),
             overwrite=True)
Example #17
    def test_hadoop_mkdir_p(self):
        with self.assertRaises(Exception):
            hadoop_open(resource('./some2/foo/bar.txt'), 'r')

        self.assertFalse(hl.hadoop_exists(resource('./some2')))
Example #18
    def test_hadoop_exists(self):
        self.assertTrue(hl.hadoop_exists(resource('ls_test')))
        self.assertFalse(hl.hadoop_exists(resource('doesnt.exist')))
Example #19
File: test_utils.py Project: bcajes/hail
    def test_hadoop_exists(self):
        self.assertTrue(hl.hadoop_exists(resource('ls_test')))
        self.assertFalse(hl.hadoop_exists(resource('doesnt.exist')))
Example #20
import jinja2
import numpy as np
import hail as hl
import plotly
import plotly.express as px
import json
from aiohttp import web
import aiohttp_jinja2

app = web.Application()
routes = web.RouteTableDef()

if not hl.hadoop_exists('bn.mt'):
    # Generate data for demonstration purposes, this should already exist
    mt = hl.balding_nichols_model(5,
                                  100,
                                  10000,
                                  pop_dist=[0.1, 0.2, 0.3, 0.2, 0.2],
                                  fst=[.02, .06, .04, .12, .08],
                                  af_dist=hl.rand_beta(a=0.01,
                                                       b=2.0,
                                                       lower=0.05,
                                                       upper=1.0),
                                  mixture=True)
    mt = hl.variant_qc(mt)
    mt.write('bn.mt', overwrite=True)

mt = hl.read_matrix_table('bn.mt')

if not hl.hadoop_exists('scores.t'):
    # Generate data for demonstration purposes, this should already exist
Example #21
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png',
                                    'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(
            mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend('null' for _ in range(
                len(heterozygous_samples) - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend('null' for _ in range(
                len(homozygous_alternate_samples) - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples':
            heterozygous_samples,
            'homozygous_alternate_samples':
            homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #22
    def exists(self, path):  # pylint: disable=no-self-use
        return hl.hadoop_exists(path)
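
The one-liner above just delegates to hl.hadoop_exists, which is useful when a class needs a small filesystem interface that can be stubbed out in tests. A hedged sketch of how such a wrapper might be consumed; the class and function names are hypothetical:

import hail as hl

class HailFS:
    """Thin wrapper so existence checks can be mocked in unit tests (hypothetical)."""

    def exists(self, path):  # pylint: disable=no-self-use
        return hl.hadoop_exists(path)

def read_table_or_fail(fs, path):
    # in tests, `fs` can be a stub whose exists() returns a canned answer
    if not fs.exists(path):
        raise FileNotFoundError(path)
    return hl.read_table(path)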
Example #23
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols((
            mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
                            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Get allele-frequency and loadings for pc_project function
    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings = hl.read_table(LOADINGS)
    loadings = loadings.annotate(af=mt.rows()[loadings.key].af)
    reprocessed_samples = hl.read_matrix_table(REPROCESSED_1KG)
    reprocessed_samples = hl.experimental.densify(reprocessed_samples)
    reprocessed_samples = reprocessed_samples.annotate_entries(
        GT=lgt_to_gt(reprocessed_samples.LGT, reprocessed_samples.LA))
    # Project new genotypes onto loadings
    ht = pc_project(reprocessed_samples.GT, loadings.loadings, loadings.af)
    ht = ht.key_by(s=ht.s + '_reprocessed')
    pcs = hl.read_table(SCORES)
    union_scores = ht.union(pcs)
    union_scores = union_scores.annotate(
        original=(union_scores.s == 'HG01513')
        | (union_scores.s == 'HG02238')
        | (union_scores.s == 'NA12248')
        | (union_scores.s == 'NA20502')
        | (union_scores.s == 'NA20826'),
        reprocessed=union_scores.s.contains('reprocessed'),
    )
    expr = (
        hl.case().when(
            (union_scores.original)
            & (
                union_scores.reprocessed  # pylint: disable=singleton-comparison
                == False  # noqa: E712
            ),
            'original',
        ).when(
            (union_scores.original == False)  # pylint: disable=singleton-comparison
            & (union_scores.reprocessed),
            'reprocessed',
        ).default('unedited'))
    union_scores = union_scores.annotate(cohort_sample_codes=expr)
    # get percentage of variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # plot
    labels = union_scores.cohort_sample_codes
    sample_names = union_scores.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, 10):
        pc1 = i
        pc2 = i + 1
        plot_filename = (f'{output}/reprocessed_sample_projection_pc' +
                         str(i + 1) + '.png')
        if not hl.hadoop_exists(plot_filename):
            plot = figure(
                title='Reprocessed Sample Projection',
                x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) +
                '%)',
                y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) +
                '%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=union_scores.scores[pc1].collect(),
                    y=union_scores.scores[pc2].collect(),
                    label=labels.collect(),
                    samples=sample_names.collect(),
                ))
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=8,
                color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                                  cohort_sample_codes),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(plot).save(f, format='PNG')
            plot_filename_html = ('reprocessed_sample_projection_pc' +
                                  str(i + 1) + '.html')
            output_file(plot_filename_html)
            save(plot)
            subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                           check=False)
Example #24
def run_pca_normal(dirname: str = None,
                   basename: str = None,
                   input_type: str = None,
                   reference: str = 'GRCh38',
                   maf: float = 0.05,
                   hwe: float = 1e-3,
                   call_rate: float = 0.98,
                   ld_cor: float = 0.2,
                   ld_window: int = 250000,
                   n_pcs: int = 20,
                   relatedness_method: str = 'pc_relate',
                   relatedness_thresh: float = 0.98,
                   out_dir: str = None):

    print('\nReading mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=dirname,
                                    basename=basename,
                                    input_type=input_type)
        else:
            print(f'\nFound lifted-over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type,
                         dirname=dirname,
                         basename=basename)

    print('\nFiltering mt')
    mt = pca_filter_mt(in_mt=mt,
                       maf=maf,
                       hwe=hwe,
                       call_rate=call_rate,
                       ld_cor=ld_cor,
                       ld_window=ld_window)

    mt = relatedness_check(in_mt=mt,
                           method=relatedness_method,
                           outdir=out_dir,
                           kin_estimate=relatedness_thresh)

    pca_snps = mt.count_rows()
    if pca_snps > 1000000:
        import warnings
        warnings.warn(
            f'Too many SNPs to be used in PCA: {pca_snps}. This will make PCA run longer'
        )

    print('\nRunning PCA')
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT, k=n_pcs)

    pcs_ht = pcs.transmute(
        **{f'PC{i}': pcs.scores[i - 1]
           for i in range(1, n_pcs + 1)})

    # add phenotype and sex to the output, using information from the mt
    # first check if is_case and is_female fields exist in the mt
    all_column_field_names = list(mt.col)
    # sex status is a MUST but not phenotype status
    if 'is_case' in all_column_field_names:
        ann_cols = ['is_case', 'is_female']
    else:
        ann_cols = ['is_female']

    annotations_ht = mt.cols().select(*ann_cols)

    if 'is_case' in all_column_field_names:
        pcs_ht = pcs_ht.annotate(is_case=annotations_ht[pcs_ht.s].is_case)
    pcs_ht = pcs_ht.annotate(is_female=annotations_ht[pcs_ht.s].is_female)

    print('\nSaving PC scores file')
    out_scores_file = f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.normal.scores.tsv'
    pcs_ht.export(out_scores_file)

    print('\nGenerating PCA plots')
    pcs_scores = pd.read_table(out_scores_file, header=0, sep='\t')

    if 'is_case' in all_column_field_names:
        pcs_scores[['is_case'
                    ]] = pcs_scores[['is_case'
                                     ]].replace([True, False, None],
                                                ['case', 'control', 'unknown'])
    pcs_scores[['is_female'
                ]] = pcs_scores[['is_female'
                                 ]].replace([True, False, None],
                                            ['female', 'male', 'unknown'])

    figs_dict = {}
    for col in ann_cols:
        for i in range(1, n_pcs, 2):
            xpc = f'PC{i}'
            ypc = f'PC{i + 1}'

            figs_dict["fig{}{}".format(col,
                                       i)] = plot_pca(pcs_scores, xpc, ypc,
                                                      col)

    pdf = PdfPages('/tmp/pca.no.ref.plots.pdf')
    for figname, figure in figs_dict.items():
        pdf.savefig(figure)
    pdf.close()
    hl.hadoop_copy(
        'file:///tmp/pca.no.ref.plots.pdf',
        f'{out_dir}GWASpy/PCA/pca_normal/{basename}.pca.no.ref.plots.pdf')
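
A hedged example invocation of run_pca_normal above; every path and the input type are hypothetical. Note that dirname and out_dir need trailing slashes because the function concatenates them directly with file names:

# hypothetical call: PLINK input already on GRCh38, outputs written under out_dir
run_pca_normal(dirname='gs://my-bucket/genotypes/',
               basename='mystudy',
               input_type='plink',
               reference='GRCh38',
               n_pcs=20,
               out_dir='gs://my-bucket/')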
Example #25
File: sex_aut_imp.py Project: atgu/GWASpy
def run_impute(backend: Union[hb.ServiceBackend, hb.LocalBackend] = None,
               input_vcf: str = None,
               females_file: str = None,
               n_samples: int = None,
               n_panel_samples: int = 4099,
               phasing_software: str = None,
               memory: str = 'highmem',
               buffer_region: int = 250,
               out_dir: str = None):

    global phased_bcf
    print(f'\n1. IMPUTATION ON {input_vcf} PHASED CHUNKS\n')
    vcf_filebase = get_vcf_filebase(input_vcf)

    impute_b = hb.Batch(backend=backend,
                        name=f'impute-phased-chunks-{vcf_filebase}')

    # use regions file to update the regions for imputation so that there are no overlaps like in phasing
    regions = pd.read_csv(
        f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/refscatter.bed',
        delim_whitespace=True,
        names=['chrom', 'start', 'end'])
    chroms_dfs = []

    for chrom, df_group in regions.groupby('chrom'):
        # print(df_group.loc[df_group.index[0], 'end'])
        df_group.loc[df_group.index[0],
                     'stop'] = df_group.loc[df_group.index[0], 'end']

        for i in range(1, len(df_group)):
            df_group.loc[df_group.index[i],
                         'stop'] = df_group.loc[df_group.index[i - 1],
                                                'end'] + 1

        df_group['stop'] = df_group['stop'].astype(int)

        # add index column
        df_group['ind'] = df_group.index

        # update the first line to start at 1
        df_group.loc[df_group.index[0], 'stop'] = 1

        # combine the chromosome, start, and end positions into one
        df_group['reg'] = df_group['chrom'].astype(str) + ":" + df_group[
            'stop'].astype(str) + "-" + df_group['end'].astype(str)

        # select only the two needed columns
        regions_to_import_group = df_group[['reg', 'ind']]

        chroms_dfs.append(regions_to_import_group)

    regions_to_import = pd.concat(chroms_dfs, axis=0)
    regions_to_import = regions_to_import.sort_values('ind')
    regions_to_import.to_csv(
        f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputation.regions',
        sep='\t',
        header=False,
        index=False)

    regions_dict = pd.Series(regions_to_import.reg.values,
                             index=regions_to_import.ind).to_dict()

    if phasing_software == 'shapeit':
        phased_vcfs_chunks = hl.utils.hadoop_ls(
            f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.shapeit.bcf'
        )
    else:
        phased_vcfs_chunks = hl.utils.hadoop_ls(
            f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_scatter/*.eagle.bcf'
        )

    for i in range(1, 24):
        if i == 23:
            chrom = 'chrX'
        else:
            chrom = f'chr{i}'

        ref_bcf = f'gs://gcp-public-data--gnomad/resources/hgdp_1kg/phased_haplotypes/hgdp.tgp.gwaspy.merged.{chrom}.merged.bcf'
        ref_size = bytes_to_gb(ref_bcf)
        ref = impute_b.read_input_group(**{
            'bcf': ref_bcf,
            'bcf.csi': f'{ref_bcf}.csi'
        })

        # output is not always bcf
        phased_filename = f'{out_dir}/GWASpy/{vcf_filebase}/Phasing/phased_merged/{vcf_filebase}.{chrom}.phased.{phasing_software}'
        if hl.hadoop_exists(f'{phased_filename}.bcf'):
            phased_bcf = f'{phased_filename}.bcf'
        elif hl.hadoop_exists(f'{phased_filename}.vcf.gz'):
            phased_bcf = f'{phased_filename}.vcf.gz'

        in_vcf = impute_b.read_input_group(**{
            'bcf': phased_bcf,
            'bcf.csi': f'{phased_bcf}.csi'
        })
        vcf_size = bytes_to_gb(input_vcf)

        disk_size = int(
            round(10.0 + 3.0 * vcf_size +
                  ((1.0 + 2.0 * n_samples / n_panel_samples) * ref_size)))
        job_memory = memory
        job_cpu = 16 if job_memory == 'highmem' else 8

        for file in phased_vcfs_chunks:
            f = file['path']
            vcf_basename = get_vcf_filebase(f)
            file_index = int(vcf_basename.split('.')[-3])
            file_region = regions_dict[file_index]
            map_chrom = file_region.split(':')[0]

            imp_out_filename = f'{vcf_basename}.imputed.bcf'
            # file_dir = vcf_basename.split('.')[0]
            output_filepath_name = f'{out_dir}/GWASpy/{vcf_filebase}/Imputation/imputed_chunks/{imp_out_filename}'

            if map_chrom == chrom:
                # check if imputed file already exists
                if hl.hadoop_exists(output_filepath_name):
                    continue

                else:
                    if chrom == 'chrX':
                        females_in = impute_b.read_input(females_file)

                        sex_impute(b=impute_b,
                                   vcf=in_vcf,
                                   females_list=females_in,
                                   vcf_filename_no_ext=vcf_basename,
                                   ref=ref,
                                   region=file_region,
                                   buffer=buffer_region,
                                   storage=disk_size,
                                   memory=job_memory,
                                   cpu=job_cpu,
                                   out_dir=out_dir)
                    else:
                        aut_impute(b=impute_b,
                                   vcf=in_vcf,
                                   vcf_filename_no_ext=vcf_basename,
                                   ref=ref,
                                   region=file_region,
                                   chromosome=chrom,
                                   buffer=buffer_region,
                                   storage=disk_size,
                                   memory=job_memory,
                                   cpu=job_cpu,
                                   out_dir=out_dir)

    impute_b.run()
Example #26
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """

    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus])
            & hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metrics_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(global_counts=hl.struct(
            snvs=metric_snvs, indels=metrics_indels),
                                 counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank', (ht.global_rank + 1) /
                hl.cond(ht.snv, ht.globals.global_counts.snvs,
                        ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank', (ht.rank + 1) / hl.cond(
                    ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0)
                                  & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
Example #27
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols((
            mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
                            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings = hl.read_table(LOADINGS)
    loadings = loadings.annotate(af=mt.rows()[loadings.key].af)
    tob_wgs_snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by(
        'locus', 'alleles')
    ht = pc_project(tob_wgs_snp_chip.GT, loadings.loadings, loadings.af)
    ht = ht.key_by(s=ht.s + '_SNP_CHIP')
    pcs = hl.read_table(SCORES)
    union_scores = ht.union(pcs)
    union_scores = union_scores.annotate(
        snp_chip=(union_scores.s.contains('_SNP_CHIP')),
        tob_wgs=(union_scores.s.contains('_SNP_CHIP')
                 | union_scores.s.contains('TOB')),
    )
    expr = (
        hl.case().when(
            (union_scores.snp_chip),
            'snp_chip',
        ).when(
            (
                union_scores.snp_chip  # noqa: E501; pylint: disable=singleton-comparison;
                == False  # noqa: E712
            )
            & (union_scores.tob_wgs),
            'tob_wgs',
        ).default('hgdp_1kg'))
    union_scores = union_scores.annotate(cohort_sample_codes=expr)
    # get percentage of variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # plot
    labels = union_scores.cohort_sample_codes
    sample_names = union_scores.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    number_of_pcs = len(eigenvalues)
    union_scores = union_scores.persist()
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot_filename = (f'{output}/reprocessed_sample_projection_pc' +
                         str(i + 1) + '.png')
        if not hl.hadoop_exists(plot_filename):
            plot = figure(
                title='SNP-Chip Sample Projection',
                x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) +
                '%)',
                y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) +
                '%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=union_scores.scores[pc1].collect(),
                    y=union_scores.scores[pc2].collect(),
                    label=labels.collect(),
                    samples=sample_names.collect(),
                ))
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=8,
                color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                                  cohort_sample_codes),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(plot).save(f, format='PNG')
            plot_filename_html = 'snp_chip_sample_projection_pc' + str(
                i + 1) + '.html'
            output_file(plot_filename_html)
            save(plot)
            subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                           check=False)
Example #28
def main():

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={
                          "01": "1",
                          "02": "2",
                          "03": "3",
                          "04": "4",
                          "05": "5",
                          "06": "6",
                          "07": "7",
                          "08": "8",
                          "09": "9"
                      },
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={
                                          'f0': hl.tstr
                                      }).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Example #29
File: pca_project.py Project: atgu/GWASpy
def run_pca_project(
        ref_dirname: str = 'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/',
        ref_basename: str = 'unrelated',
        ref_info: str = 'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv',
        data_dirname: str = None,
        data_basename: str = None,
        out_dir: str = None,
        input_type: str = None,
        reference: str = 'GRCh38',
        npcs: int = 20,
        maf: float = 0.05,
        hwe: float = 1e-3,
        call_rate: float = 0.98,
        ld_cor: float = 0.2,
        ld_window: int = 250000,
        relatedness_method: str = 'pc_relate',
        relatedness_thresh: float = 0.98,
        prob_threshold: float = 0.8):
    """
    Project samples into predefined PCA space
    :param ref_dirname: directory name where reference data is
    :param ref_basename: base filename for reference data
    :param ref_info: reference sample information
    :param data_dirname: directory name where the data to project is
    :param data_basename: base filename for the data to project
    :param out_dir: directory and filename prefix for where to put PCA projection output
    :param input_type: input file(s) type: hail, plink, or vcf
    :param reference: reference build
    :param npcs: number of principal components to be used in PCA
    :param maf: minor allele frequency threshold
    :param hwe: Hardy-Weinberg filter threshold
    :param call_rate: variant call rate filter threshold
    :param ld_cor: LD pruning correlation threshold
    :param ld_window: LD pruning window size
    :param relatedness_method: method to use for relatedness filtering
    :param relatedness_thresh: threshold to use for filtering out related individuals
    :param prob_threshold: probability threshold to use for classifying samples
    :return: a pandas DataFrame with data PCA scores projected onto the same PCA space as the Human Genome Diversity Project and 1000 Genomes reference
    """
    print('\nReading data mt')
    if reference.lower() == 'grch37':
        lifted_over = f'{data_dirname}{data_basename}.liftover.grch38.mt'
        if not hl.hadoop_exists(lifted_over):
            from gwaspy.utils.reference_liftover import liftover_to_grch38
            mt = liftover_to_grch38(dirname=data_dirname, basename=data_basename, input_type=input_type)
        else:
            print(f'\nFound lifted-over file: {lifted_over}')
            mt = hl.read_matrix_table(lifted_over)
    else:
        from gwaspy.utils.read_file import read_infile
        mt = read_infile(input_type=input_type, dirname=data_dirname, basename=data_basename)

    print('\nFiltering data mt')
    mt = pca_filter_mt(in_mt=mt, maf=maf, hwe=hwe, call_rate=call_rate, ld_cor=ld_cor, ld_window=ld_window)

    mt = relatedness_check(in_mt=mt, method=relatedness_method, outdir=out_dir, kin_estimate=relatedness_thresh)

    # Intersect data with reference
    intersect_ref(ref_dirname=ref_dirname, ref_basename=ref_basename, data_mt=mt, data_basename=data_basename,
                  out_dir=out_dir)

    ref_in_data = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_intersect_{data_basename}.mt')

    print('\nComputing reference PCs')
    run_ref_pca(mt=ref_in_data, npcs=npcs, out_dir=out_dir, data_basename=data_basename)

    # project data
    pca_loadings = hl.read_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp_loadings.ht')
    project_mt = hl.read_matrix_table(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}_intersect_1kg_hgdp.mt')

    ht_projections = pc_project(mt=project_mt, loadings_ht=pca_loadings)
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, npcs+1)})
    ht_projections.export(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv')

    ref_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/1kg_hgdp.project.pca.scores.txt.bgz'
    data_scores = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.project.pca.scores.tsv'
    data_ref = merge_data_with_ref(ref_scores=ref_scores, ref_info=ref_info, data_scores=data_scores)

    from gwaspy.pca.assign_pop_labels import assign_population_pcs
    pcs_df, clf = assign_population_pcs(pop_pc_pd=data_ref, num_pcs=npcs, min_prob=prob_threshold)

    data_pops = pcs_df.loc[pcs_df['SuperPop'].isnull()]
    data_pops['pop'].value_counts()
    cols = ['s', 'pop'] + [f'prob_{i}' for i in ["AFR", "AMR", "CSA", "EAS", "EUR", "MID", "OCE"]] + [f'PC{i}' for i in
                                                                                                      range(1, npcs+1)]
    data_pops_df = data_pops[cols]

    data_pops_df.to_csv(f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt',
                        sep='\t', index=False)

    print("\nGenerating PCA plots")
    data_scores_prob = f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/pca_sup_pops_{prob_threshold}_probs.project.pca.txt'

    figs_dict = {}
    # plotting more than 10 PCA plots in HTML generates wobbly, large files
    for i in range(1, 10, 2):
        xpc = f'PC{i}'
        ypc = f'PC{i + 1}'
        figs_dict["fig{}{}".format(xpc, ypc)] = plot_pca_ref(data_scores=data_scores_prob,
                                                             ref_scores=ref_scores,
                                                             ref_info=ref_info,
                                                             x_pc=xpc, y_pc=ypc)
    with open('/tmp/pca.project.plots.html', 'a') as f:
        for figname, figure in figs_dict.items():
            f.write(figure.to_html(include_plotlyjs='cdn'))

    hl.hadoop_copy('file:///tmp/pca.project.plots.html',
                   f'{out_dir}GWASpy/PCA/{data_basename}/pca_project/{data_basename}.pca.project.plots.html')
Example #30
def main(args):
    ########################################################################
    ### initialize
    print('Getting started: ' + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # 1. Read in summary stats data
    # 2. Annotate matrix table with effect sizes for each phenotype
    # 3. Compute PRS for each
    start = time.time()

    pheno_gwas = hl.import_table(f'gs://apcdr/pheno_code_ukb_code.txt')
    pheno_ss = dict([(x.pheno_code, x.ukb_code) for x in pheno_gwas.collect()])
    #pheno_ss = dict([(x.ss_code, x.pheno_code) for x in pheno_gwas.collect()])

    # mt = hl.read_matrix_table('gs://apcdr/prs_sumstats_clumps/ukb_holdout/ukb31063.gwas_holdout_sumstats_pheno37_subset.mt')
    mt = hl.read_matrix_table('gs://apcdr/dosage_bgen/apcdr.mt')
    ss_keys = dict(
        zip(['CHR', 'POS', 'REF', 'ALT', 'P', 'BETA'],
            args.chr_pos_ref_alt_p_beta.split(',')))

    for pheno in list(pheno_ss.keys()):
        #for pheno in ['WHR']:
        print('Pheno: ' + pheno + ', Time: ' +
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        suffix_replace = args.ss_suffix.split('.')
        suffix_replace[-2] = 'clumped'
        suffix_replace = '.'.join(suffix_replace)
        if hl.hadoop_exists(args.ss_clump_prefix + pheno + suffix_replace):
            ss_path = args.ss_clump_prefix + pheno + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno + suffix_replace
        elif hl.hadoop_exists(args.ss_clump_prefix + pheno_ss[pheno] +
                              suffix_replace):
            ss_path = args.ss_clump_prefix + pheno_ss[pheno] + args.ss_suffix
            clump_path = args.ss_clump_prefix + pheno_ss[pheno] + suffix_replace
        else:
            continue

        ss = hl.import_table(ss_path,
                             impute=True,
                             delimiter='\s+',
                             min_partitions=1000)
        ss = ss.annotate(locus=hl.locus(hl.str(ss[ss_keys['CHR']]),
                                        ss[ss_keys['POS']]),
                         alleles=[ss[ss_keys['REF']], ss[ss_keys['ALT']]])
        ss = ss.key_by(ss.locus, ss.alleles)

        ## Read in summary statistics and true phenotypes
        mt_annot = mt.annotate_rows(ss=ss[mt.locus,
                                          mt.alleles])  # come back to this
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.gwas_vs_holdout.txt',
        #                              types={'s': hl.tstr}, key='s')
        # ht_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt',
        #                              types={'s': hl.tstr}, key='s')
        # # mt_annot = mt_annot.filter_cols(hl.or_else(ht_samples[mt_annot.s].in_gwas != 'TRUE', True))
        # mt_annot = mt_annot.filter_cols(hl.is_defined(ht_samples[mt_annot.s]))
        # # print(mt.count()) # 13364303, 136265)

        print('Starting ' + pheno + ': ' +
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        p_max = {
            's1': 5e-8,
            's2': 1e-6,
            's3': 1e-4,
            's4': 1e-3,
            's5': 1e-2,
            's6': .05,
            's7': .1,
            's8': .2,
            's9': .5,
            's10': 1
        }

        pheno_clump = specific_clumps(clump_path)

        mt_annot = mt_annot.filter_rows(pheno_clump.get(mt_annot.locus, False))
        # print(mt.count())

        annot_expr = {
            k: hl.agg.sum(
                hl.float(mt_annot.ss[ss_keys['BETA']]) * mt_annot.dosage *
                hl.int(mt_annot.ss[ss_keys['P']] < v))
            for k, v in p_max.items()
        }

        mt_annot = mt_annot.annotate_cols(**annot_expr)

        ht_out = mt_annot.cols()
        #ht_out.describe()
        #covs = hl.read_table('gs://apcdr/ukb_holdout/uk_round2_allSamples_phenos_phesant.ht').select('age', 'sex')  # added
        # need to add in PCs
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))
    fn = "gs://qingbowang/ems_v1_test/ems_p_causal_interpolated_{0}.tsv".format(
        tissue_name)
    with hl.hadoop_open(fn, 'r') as f:
        pcausal = pd.read_csv(f, sep="\t", index_col=0)
    pcausal["rf_score_bin"] = pcausal.index
    del pcausal["rf_score_bin.1"]  #duplicated columns

    pcausal = hl.Table.from_pandas(pcausal)
    pcausal = pcausal.transmute(
        rf_score_bin=hl.format('%.3f', pcausal["rf_score_bin"]))

    #score all chunks:
    #get the max
    for i in range(10000):  #just take the upperbound
        if not hl.hadoop_exists(
                "gs://qingbowang/ems_v1_test/ems_rawscore_gtexvg_all{0}_chunk{1}.tsv.gz"
                .format(tissue_name, i)):
            imax = i
            break
    dfall = []
    for i in range(imax):
        print("starting chunk {0} of {1}, {2}".format(i, imax - 1, tm.ctime()))
        df = hl.import_table(
            "gs://qingbowang/ems_v1_test/ems_rawscore_gtexvg_all{0}_chunk{1}.tsv.gz"
            .format(tissue_name, i),
            force=True,
            impute=True)
        df = df.repartition(80)  #80 partition for 1000mann lines
        df = df.annotate(rf_score_bin=hl.format('%.3f', df["0"]))
        pcausal = pcausal.key_by("rf_score_bin")
        df = df.key_by("rf_score_bin")