Пример #1
0
def read_gwas_table_with_varlist(gwas_table_tsv,
                                 varlist,
                                 type_dic,
                                 checkpoint_path,
                                 gwas_ht=None,
                                 no_return=False):
    """Read a GWAS table, restrict it to a variant list, and checkpoint it.

    :param gwas_table_tsv: path to a GWAS summary-stats TSV (when ``gwas_ht``
        is None), or a phenotype name to look up inside ``gwas_ht``
    :param varlist: path (or list of paths) of variant-list file(s); the
        first column (``f0``) must hold rsids
    :param type_dic: column types for ``hl.import_table`` on ``gwas_table_tsv``
    :param checkpoint_path: where to checkpoint/write the filtered table
    :param gwas_ht: optional pre-built Hail Table with nested ``beta`` /
        ``p_value`` arrays and a ``phenotypes`` global
    :param no_return: if True, write the table and return None; otherwise
        checkpoint it and return the resulting Table
    :return: the filtered, (locus, alleles)-keyed Table, or None
    """
    if gwas_ht is None:
        # Plain-TSV path: import keyed by rsid for the varlist semi-join below.
        gwas_tsv = hl.import_table(gwas_table_tsv,
                                   key=['rsid'],
                                   types=type_dic)
    else:
        # Nested-table path: locate this phenotype's (i, j) slot and pull
        # out its beta / p-value columns.
        phenotypes = gwas_ht['phenotypes'].collect()[0]
        i, j = get_index_in_nested_list(phenotypes, gwas_table_tsv)
        gwas_tsv = gwas_ht.annotate(beta=gwas_ht['beta'][i][j],
                                    pval=gwas_ht['p_value'][i][j])
        gwas_tsv = gwas_tsv.select('beta', 'pval', 'locus', 'alleles')

    # Keep only variants present in the variant list (semi-join on rsid).
    # NOTE(review): this assumes `rsid` is still a row field in the gwas_ht
    # branch too -- confirm against the upstream table schema.
    clump_snp = hl.import_table(varlist, key=['f0'], no_header=True)
    gwas_tsv = gwas_tsv.filter(hl.is_defined(clump_snp[gwas_tsv.rsid]))
    if gwas_ht is None:
        # The TSV branch carries a `variant` string; parse it into
        # locus/alleles so both branches end with the same key fields.
        k = hl.parse_variant(gwas_tsv.variant)
        gwas_tsv = gwas_tsv.annotate(**k)
    gwas_tsv = gwas_tsv.key_by(gwas_tsv.locus, gwas_tsv.alleles)
    if not no_return:  # idiomatic truthiness test instead of `is False`
        gwas_tsv = gwas_tsv.checkpoint(checkpoint_path, overwrite=True)
        return gwas_tsv
    gwas_tsv.write(checkpoint_path, overwrite=True)
    return None
Пример #2
0
def make_clinvar_hail2(clinvar_vcf_path, clinvar_variants_table,
                       clinvar_mt_out_path):
    """
    Import ClinVar vcf file, and turn it into a usable Hail2 mt

    :param str clinvar_vcf_path: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.vcf.bgz"
    :param str clinvar_variants_table: Example : "gs://gnomad-berylc/tx-annotation/hail2/clinvar_alleles_single.b37.variants_table.tsv"
    :param str clinvar_mt_out_path: Output path, e.g. "gs://gnomad-resources/clinvar/hail-0.2/clinvar_alleles.single.b37.hail2.vepped.mt"
    :return: None. The split, VEP'd MatrixTable is written to
        ``clinvar_mt_out_path``; its rows are shown afterwards as a sanity
        check.
    :rtype: None
    """
    clinvar_mt = hl.import_vcf(clinvar_vcf_path)
    variants_table = hl.import_table(clinvar_variants_table, impute=True)
    # Parse the variant-string column `v` into a struct, then promote its
    # locus/alleles so the table can be joined against the VCF rows.
    variants_table = variants_table.annotate(
        v=hl.parse_variant(variants_table.v))
    variants_table = (variants_table.annotate(
        locus=variants_table.v.locus,
        alleles=variants_table.v.alleles).key_by('locus', 'alleles'))

    # Attach the per-variant annotations onto the VCF rows.
    clinvar_mt = clinvar_mt.annotate_rows(
        va=variants_table[clinvar_mt.locus, clinvar_mt.alleles])

    # split_multi_dynamic and vep_config are defined elsewhere in the project.
    clinvar_mt = split_multi_dynamic(clinvar_mt, left_aligned=False)
    clinvar_mt = clinvar_mt.repartition(100)
    clinvar_vep = hl.vep(clinvar_mt, vep_config)
    clinvar_vep.write(clinvar_mt_out_path, overwrite=True)

    # Read back the written table and show a few rows as a sanity check.
    t = hl.read_matrix_table(clinvar_mt_out_path)
    t.rows().show()
def create_rf_2_0_2_rank(data_type: str, beta: bool) -> None:
    """
    Creates a rank file for 2.0.2 RF and writes it to its correct location.

    :param str data_type: One of 'exomes' or 'genomes'
    :param bool beta: If set, then creates the table for the "beta" 2.0.2 RF with QD / max(p(AB))
    :return: Nothing
    :rtype: None
    """
    logger.info(
        f"Creating rank file for {data_type} RF 2.0.2{'beta' if beta else ''}")

    # Build and cache the annotated RF table in a temp location; skip the
    # expensive import/annotation step when the temp table already exists.
    if not hl.hadoop_exists(
            f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht'):
        ht = hl.import_table(get_2_0_2_rf_path(data_type, beta),
                             types={'chrom': hl.tstr},
                             impute=True,
                             min_partitions=1000)
        if 'chrom' in ht.row:
            # Input has chrom/pos/ref/alt columns: build locus + alleles.
            ht = ht.transmute(locus=hl.locus(ht.chrom, ht.pos),
                              alleles=[ht.ref, ht.alt])
        else:
            # Input has a single variant-string column; `rf_rpob_tp` is a
            # typo carried over from the source file's header.
            ht = ht.transmute(
                v=hl.parse_variant(ht.v),
                rfprob=ht.rf_rpob_tp  # Yes, this is awful
            )
            ht = ht.transmute(locus=ht.v.locus, alleles=ht.v.alleles)

        ht = ht.key_by('locus', 'alleles')

        gnomad_ht = get_gnomad_annotations(data_type)
        # NOTE(review): the 'chrom' branch must already carry an `rfprob`
        # column for this to work -- confirm against the input file schema.
        ht = ht.annotate(**gnomad_ht[ht.key], score=ht.rfprob)

        ht.write(
            f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')
    ht = hl.read_table(
        f'gs://gnomad-tmp/gnomad_rf_2_0_2_{data_type}_{str(beta)}_tmp.ht')
    # Rank on (1 - score) plus conditional sub-ranks; `add_rank` is a project
    # helper defined elsewhere. Note `~` binds tighter than `&` in Python, so
    # e.g. `~ht.was_split & ht.singleton` is `(~ht.was_split) & ht.singleton`.
    ht = add_rank(ht,
                  score_expr=1 - ht.score,
                  subrank_expr={
                      'singleton_rank':
                      ht.singleton,
                      'biallelic_rank':
                      ~ht.was_split,
                      'biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton,
                      'adj_rank':
                      ht.ac > 0,
                      'adj_biallelic_rank':
                      ~ht.was_split & (ht.ac > 0),
                      'adj_singleton_rank':
                      ht.singleton & (ht.ac > 0),
                      'adj_biallelic_singleton_rank':
                      ~ht.was_split & ht.singleton & (ht.ac > 0)
                  })

    ht.write(score_ranking_path(data_type,
                                'rf_2.0.2{}'.format('_beta' if beta else '')),
             overwrite=True)
Пример #4
0
    def test_variant_qc(self):
        """Check every hl.variant_qc statistic on one biallelic and one
        multi-allelic site (the first site has one missing genotype)."""
        rows = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:1:A:T', 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:1:A:T', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
            {'v': '1:1:A:T', 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        ]

        schema = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        table = hl.Table.parallelize(rows, schema)
        table = table.transmute(**hl.parse_variant(table.v))
        matrix = table.to_matrix_table(['locus', 'alleles'], ['s'])
        matrix = hl.variant_qc(matrix, 'vqc')
        collected = matrix.rows().collect()
        site1 = collected[0].vqc
        site2 = collected[1].vqc

        # Biallelic site 1:1:A:T.
        self.assertEqual(site1.AF, [0.5, 0.5])
        self.assertEqual(site1.AC, [3, 3])
        self.assertEqual(site1.AN, 6)
        self.assertEqual(site1.homozygote_count, [1, 1])
        self.assertEqual(site1.n_called, 3)
        self.assertEqual(site1.n_not_called, 1)
        self.assertEqual(site1.call_rate, 0.75)
        self.assertEqual(site1.n_het, 1)
        self.assertEqual(site1.n_non_ref, 2)
        self.assertEqual(site1.het_freq_hwe, 0.6)
        self.assertEqual(site1.p_value_hwe, 0.7)
        self.assertEqual(site1.dp_stats.min, 0)
        self.assertEqual(site1.dp_stats.max, 100)
        self.assertEqual(site1.dp_stats.mean, 51.25)
        self.assertAlmostEqual(site1.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(site1.gq_stats.min, 10)
        self.assertEqual(site1.gq_stats.max, 11)
        self.assertAlmostEqual(site1.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(site1.gq_stats.stdev, 0.47140452079103168)

        # Multi-allelic site 1:2:A:T,C (HWE stats are missing here).
        self.assertEqual(site2.AF, [0.125, 0.5, 0.375])
        self.assertEqual(site2.AC, [1, 4, 3])
        self.assertEqual(site2.AN, 8)
        self.assertEqual(site2.homozygote_count, [0, 1, 1])
        self.assertEqual(site2.n_called, 4)
        self.assertEqual(site2.n_not_called, 0)
        self.assertEqual(site2.call_rate, 1.0)
        self.assertEqual(site2.n_het, 2)
        self.assertEqual(site2.n_non_ref, 4)
        self.assertEqual(site2.p_value_hwe, None)
        self.assertEqual(site2.het_freq_hwe, None)
        self.assertEqual(site2.dp_stats.min, 5)
        self.assertEqual(site2.dp_stats.max, 5)
        self.assertEqual(site2.dp_stats.mean, 5)
        self.assertEqual(site2.dp_stats.stdev, 0.0)
        self.assertEqual(site2.gq_stats.min, 10)
        self.assertEqual(site2.gq_stats.max, 10)
        self.assertEqual(site2.gq_stats.mean, 10)
        self.assertEqual(site2.gq_stats.stdev, 0)
Пример #5
0
    def test_variant_qc(self):
        """Check every hl.variant_qc statistic on one biallelic and one
        multi-allelic site (the first site has one missing genotype)."""
        rows = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:1:A:T', 's': '2', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:1:A:T', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 11, 'DP': 100},
            {'v': '1:1:A:T', 's': '4', 'GT': None, 'GQ': None, 'DP': 100},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '2', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '3', 'GT': hl.Call([0, 1]), 'GQ': 10, 'DP': 5},
            {'v': '1:2:A:T,C', 's': '4', 'GT': hl.Call([1, 1]), 'GQ': 10, 'DP': 5},
        ]

        schema = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        table = hl.Table.parallelize(rows, schema)
        table = table.transmute(**hl.parse_variant(table.v))
        matrix = table.to_matrix_table(['locus', 'alleles'], ['s'])
        matrix = hl.variant_qc(matrix, 'vqc')
        collected = matrix.rows().collect()
        site1 = collected[0].vqc
        site2 = collected[1].vqc

        # Biallelic site 1:1:A:T.
        self.assertEqual(site1.AF, [0.5, 0.5])
        self.assertEqual(site1.AC, [3, 3])
        self.assertEqual(site1.AN, 6)
        self.assertEqual(site1.homozygote_count, [1, 1])
        self.assertEqual(site1.n_called, 3)
        self.assertEqual(site1.n_not_called, 1)
        self.assertEqual(site1.call_rate, 0.75)
        self.assertEqual(site1.n_het, 1)
        self.assertEqual(site1.n_non_ref, 2)
        self.assertEqual(site1.het_freq_hwe, 0.6)
        self.assertEqual(site1.p_value_hwe, 0.7)
        self.assertEqual(site1.dp_stats.min, 0)
        self.assertEqual(site1.dp_stats.max, 100)
        self.assertEqual(site1.dp_stats.mean, 51.25)
        self.assertAlmostEqual(site1.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(site1.gq_stats.min, 10)
        self.assertEqual(site1.gq_stats.max, 11)
        self.assertAlmostEqual(site1.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(site1.gq_stats.stdev, 0.47140452079103168)

        # Multi-allelic site 1:2:A:T,C (HWE stats are missing here).
        self.assertEqual(site2.AF, [0.125, 0.5, 0.375])
        self.assertEqual(site2.AC, [1, 4, 3])
        self.assertEqual(site2.AN, 8)
        self.assertEqual(site2.homozygote_count, [0, 1, 1])
        self.assertEqual(site2.n_called, 4)
        self.assertEqual(site2.n_not_called, 0)
        self.assertEqual(site2.call_rate, 1.0)
        self.assertEqual(site2.n_het, 2)
        self.assertEqual(site2.n_non_ref, 4)
        self.assertEqual(site2.p_value_hwe, None)
        self.assertEqual(site2.het_freq_hwe, None)
        self.assertEqual(site2.dp_stats.min, 5)
        self.assertEqual(site2.dp_stats.max, 5)
        self.assertEqual(site2.dp_stats.mean, 5)
        self.assertEqual(site2.dp_stats.stdev, 0.0)
        self.assertEqual(site2.gq_stats.min, 10)
        self.assertEqual(site2.gq_stats.max, 10)
        self.assertEqual(site2.gq_stats.mean, 10)
        self.assertEqual(site2.gq_stats.stdev, 0)
Пример #6
0
 def _write_tmp_mt():
     """Import the HapMap3 variant list, subset UKB imputed dosages to those
     variants, and write the resulting MatrixTable to ``tmp_mt_path``."""
     ## part 1: about an 1 hr with 30 workers (possibly starting with 10 then increasing to 30 if progress stalls)
     variants = hl.import_table(
         'gs://nbaya/hapmap3_variants.tsv.gz', force=True
     )  # download here: https://github.com/nikbaya/split/blob/master/hapmap3_variants.tsv.gz
     variants = variants.key_by(**hl.parse_variant(variants.v))
     mt = get_ukb_imputed_data(
         'all', variant_list=variants,
         entry_fields=('dosage', ))  # 'all' = autosomes only
     # print(mt.count()) # (1089172, 487409)
     # mt = mt.checkpoint(mt_path.replace('nbaya/','nbaya/tmp-'), overwrite=overwrite)
     # NOTE(review): `_read_if_exists` is documented on MatrixTable.checkpoint(),
     # not write() -- confirm the pinned Hail version accepts it here, or switch
     # to mt.checkpoint(tmp_mt_path, _read_if_exists=True).
     mt.write(tmp_mt_path, _read_if_exists=True)
Пример #7
0
def get_ldsim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=None):
    """Build a small simulated Hail MatrixTable for LD tests.

    One row per (variant, contig) pair with random diploid genotypes for
    every sample; rows are keyed by (locus, alleles) and carry a `cm` field.
    """
    rng = np.random.RandomState(seed)
    rows = []
    for variant in range(n_variants):
        for sample in range(n_samples):
            for contig in range(n_contigs):
                rows.append({
                    'v': f'{contig + 1}:{variant + 1}:A:C',
                    's': f's{sample + 1:09d}',
                    'cm': 0.1,
                    'GT': hl.Call([rng.randint(0, 2),
                                   rng.randint(0, 2)]),
                })
    schema = hl.dtype('struct{v: str, s: str, cm: float64, GT: call}')
    ht = hl.Table.parallelize(rows, schema)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['s'],
                            row_fields=['cm'])
    return add_default_plink_fields(mt)
Пример #8
0
    def test_sample_qc(self):
        """Check every hl.sample_qc statistic for a single sample observed
        across six variants (including one missing genotype)."""
        rows = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        schema = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        table = hl.Table.parallelize(rows, schema)
        table = table.transmute(**hl.parse_variant(table.v))
        matrix = table.to_matrix_table(['locus', 'alleles'], ['s'])
        matrix = hl.sample_qc(matrix, 'sqc')
        sqc = matrix.cols().select('sqc').collect()[0].sqc

        self.assertAlmostEqual(sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(sqc.gq_stats.min, 0)
        self.assertAlmostEqual(sqc.gq_stats.max, 20)
        self.assertAlmostEqual(sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(sqc.dp_stats.min, 0)
        self.assertAlmostEqual(sqc.dp_stats.max, 5)
        self.assertAlmostEqual(sqc.call_rate, 0.8333333333)
        self.assertEqual(sqc.n_called, 5)
        self.assertEqual(sqc.n_not_called, 1)
        self.assertEqual(sqc.n_hom_ref, 1)
        self.assertEqual(sqc.n_het, 1)
        self.assertEqual(sqc.n_hom_var, 3)
        self.assertEqual(sqc.n_insertion, 2)
        self.assertEqual(sqc.n_deletion, 0)
        self.assertEqual(sqc.n_singleton, 3)
        self.assertEqual(sqc.n_transition, 1)
        self.assertEqual(sqc.n_transversion, 3)
        self.assertEqual(sqc.n_star, 0)
        self.assertEqual(sqc.n_non_ref, 4)
        self.assertAlmostEqual(sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(sqc.r_het_hom_var, 0.3333333333)
        self.assertAlmostEqual(sqc.r_insertion_deletion, None)
Пример #9
0
    def test_sample_qc(self):
        """Check every hl.sample_qc statistic for a single sample observed
        across six variants (including one missing genotype)."""
        rows = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        schema = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        table = hl.Table.parallelize(rows, schema)
        table = table.transmute(**hl.parse_variant(table.v))
        matrix = table.to_matrix_table(['locus', 'alleles'], ['s'])
        matrix = hl.sample_qc(matrix, 'sqc')
        sqc = matrix.cols().select('sqc').collect()[0].sqc

        self.assertAlmostEqual(sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(sqc.gq_stats.min, 0)
        self.assertAlmostEqual(sqc.gq_stats.max, 20)
        self.assertAlmostEqual(sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(sqc.dp_stats.min, 0)
        self.assertAlmostEqual(sqc.dp_stats.max, 5)
        self.assertAlmostEqual(sqc.call_rate, 0.8333333333)
        self.assertEqual(sqc.n_called, 5)
        self.assertEqual(sqc.n_not_called, 1)
        self.assertEqual(sqc.n_hom_ref, 1)
        self.assertEqual(sqc.n_het, 1)
        self.assertEqual(sqc.n_hom_var, 3)
        self.assertEqual(sqc.n_insertion, 2)
        self.assertEqual(sqc.n_deletion, 0)
        self.assertEqual(sqc.n_singleton, 3)
        self.assertEqual(sqc.n_transition, 1)
        self.assertEqual(sqc.n_transversion, 3)
        self.assertEqual(sqc.n_star, 0)
        self.assertEqual(sqc.n_non_ref, 4)
        self.assertAlmostEqual(sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(sqc.r_het_hom_var, 0.3333333333)
        self.assertAlmostEqual(sqc.r_insertion_deletion, None)
Пример #10
0
def get_plink_sim_dataset(n_variants=16, n_samples=4, n_contigs=2, seed=0):
    """Build a small simulated PLINK-style Hail MatrixTable.

    Variants are spread over contigs by `dividx`; each entry gets a random
    diploid genotype. Rows are keyed by (locus, alleles) and carry `cm`.
    """
    rng = np.random.RandomState(seed)
    contig_index = dividx(n_variants, n_contigs)
    assert contig_index.ndim == 1
    assert contig_index.size == n_variants
    rows = []
    for variant in range(n_variants):
        contig = contig_index[variant]
        for sample in range(n_samples):
            rows.append({
                "v": f"{contig + 1}:{variant + 1}:A:C",
                "s": f"S{sample + 1:07d}",
                "cm": 0.1,
                "GT": hl.Call([rng.randint(0, 2),
                               rng.randint(0, 2)]),
            })
    schema = hl.dtype("struct{v: str, s: str, cm: float64, GT: call}")
    ht = hl.Table.parallelize(rows, schema)
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(row_key=["locus", "alleles"],
                            col_key=["s"],
                            row_fields=["cm"])
    return add_default_plink_fields(mt)
Пример #11
0
def preprocess1(variant_set):
    """Pre-processing step 1: build and write the variants Hail Table.

    :param variant_set: either ``'hm3'`` (HapMap3 variants) or ``'qc_pos'``
        (UKB GWAS QC-passing variants)
    :raises ValueError: if ``variant_set`` is not one of the two known sets
        (previously this fell through and crashed with a NameError)
    """
    print('\n##################')
    print('Starting Pre-processing 1: Creating variants table (variant_set: ' +
          variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')

    if variant_set == 'hm3':
        variants = hl.import_table('gs://nbaya/split/hapmap3_variants.tsv')
    elif variant_set == 'qc_pos':
        variants = hl.import_table(
            'gs://ukb31063-mega-gwas/qc/ukb31063.gwas_variants.autosomes.tsv')
    else:
        # Fail fast with a clear message instead of hitting an unbound
        # `variants` below.
        raise ValueError(
            f"Unknown variant_set {variant_set!r}; expected 'hm3' or 'qc_pos'")

    # Parse the `v` variant-string column and key by (locus, alleles).
    variants = variants.annotate(**hl.parse_variant(variants.v))
    variants = variants.key_by('locus', 'alleles')

    variants.write('gs://nbaya/split/' + variant_set + '_variants.ht')

    print('\n##################')
    print(
        'Finished Pre-processing 1: Creating variants table using variant_set: '
        + variant_set)
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')
Пример #12
0
# Initialize Hail with a dedicated log file and a larger min block size.
hl.init(log='/hail.log', min_block_size=2048)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define files
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# input
vds_file = 'gs://ccdg-qc-multi/vds/raw/hail2_allchr.vds'
onep_file = 'gs://ccdg-qc-multi/out/onep_variants_table.tsv'

# output
vds_onep_file = 'gs://ccdg-qc-multi/vds/raw/hail2_onep.vds'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# create subset
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.read_matrix_table(vds_file)

# Parse the headerless variant list (column `f0`) into (locus, alleles) keys.
# NOTE(review): hl.genetics.GenomeReference is the old Hail 0.2-beta API name
# (later renamed ReferenceGenome) -- confirm against the pinned Hail version.
onep = hl.import_table(onep_file, no_header=True).key_by('f0')
onep2 = onep.transmute(**hl.parse_variant(
    onep.f0, reference_genome=hl.genetics.GenomeReference.GRCh38())).key_by(
        'locus', 'alleles')
# Keep only rows present in the one-percent variant table (semi-join).
vds = vds.filter_rows(hl.is_defined(onep2[vds.locus, vds.alleles]), keep=True)

vds.write(vds_onep_file, overwrite=True)

# print runtime
# `start` is expected to be set by timeit.default_timer() earlier in the file
# (not visible in this chunk).
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
Пример #13
0
import hail as hl
from pprint import pprint
from bokeh.io import output_notebook, show, save
from bokeh.layouts import gridplot
from bokeh.models import Span
import hail.expr.aggregators as agg
from bokeh.plotting import figure, output_file
import numpy as np

# (Removed stray zero-width-space characters that made the original file
# fail to parse.)
hl.init(default_reference='GRCh38', min_block_size=6)

# Annotations: gsutil -m cp /medpop/esp2/mzekavat/CHIP/CHUD/data/variant_annot/somVariants.txt.bgz gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz

# Import the headerless somatic-variant list; column `f0` holds the variant
# strings.
kt = hl.import_table('gs://maryam_lipids/UKBB_CHIP/somVariants.txt.bgz',
                     impute=True, min_partitions=2000, no_header=True)
# BUG FIX: the original line was missing the closing parenthesis on key_by().
kt2 = kt.key_by(**hl.parse_variant(kt.f0))
kt2.describe()
kt2.write('gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht')

kt2 = hl.read_table(
    'gs://maryam_lipids/UKBB_CHIP/all_somatic_var_list.ht').repartition(1000)
kt2 = hl.vep(kt2, 'gs://hail-us-vep/vep95-GRCh38-loftee-gcloud.json')
consequence_in_severity_order = [
  "transcript_ablation"
, "splice_acceptor_variant"
, "splice_donor_variant"
, "stop_gained"
, "frameshift_variant"
, "stop_lost"
, "start_lost"
, "transcript_amplification"
Пример #14
0
def main(args):
    """Simulate spike-and-slab phenotypes on UKB chr22 and run subset GWASes.

    Behaviour is selected by flags on ``args``:
      * ``compute_true_phenotypes`` -- build true PRS values (used as
        phenotypes) from the simulated causal betas for the white-British
        unrelated subset and write them out.
      * ``run_gwas`` -- run linear-regression GWAS on 10 random ~10k-sample
        subsets, writing one sumstat table per subset.
      * ``write_gwas`` -- flatten each sumstat table and export it as TSV.
    """
    # Four effect-size settings simulated in the spike-and-slab file.
    betas = ['beta_01', 'beta_1', 'beta_10', 'beta_100']
    spike_slab = hl.import_table(
        'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.txt',
        impute=True)
    spike_slab = spike_slab.key_by(**hl.parse_variant(spike_slab.v))
    if args.compute_true_phenotypes:
        # get the white british subset
        eur = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv'
        ).key_by('s')

        # read in imputed data, subset to chr22
        mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        mt = mt.annotate_rows(ss=spike_slab[mt.row_key])
        mt = mt.filter_rows(hl.is_defined(mt.ss))

        # compute true PRS (i.e. phenotypes)
        annot_expr = {i: hl.agg.sum(mt.ss[i] * mt.dosage) for i in betas}

        # write out phenos for white British unrelated subset
        mt = mt.annotate_cols(**annot_expr)
        mt = mt.filter_cols(hl.is_defined(eur[mt.s]))
        mt.cols().write(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht',
            stage_locally=True,
            overwrite=True)

    if args.run_gwas:
        # read back in PRS (now true phenotypes)
        phenos = hl.read_table(
            'gs://armartin/mama/spike_slab/BBJ_UKB_hm3.chr22.cm.beta.true_PRS.ht'
        ).key_by('s')
        phenos.show()
        covariates = hl.import_table(
            'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
            impute=True,
            types={
                's': hl.tstr
            }).key_by('s')
        full_mt = hl.read_matrix_table(
            'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
        full_mt = full_mt.annotate_cols(**covariates[full_mt.s])
        full_mt = hl.filter_intervals(full_mt, [hl.parse_locus_interval('22')])

        # annotate and filter imputed data to all sites with causal effects
        full_mt = full_mt.annotate_rows(ss=spike_slab[full_mt.row_key])
        full_mt = full_mt.filter_rows(hl.is_defined(full_mt.ss))

        # subset to white British subset, get 10 sets of 10k and run a gwas for each of these w/ PCs as covs
        for i in range(10):
            # Random order, then keep the first 10k rows as this subset.
            subset_pheno = phenos.annotate(r=hl.rand_unif(0, 1))
            subset_pheno = subset_pheno.order_by(
                subset_pheno.r).add_index('global_idx').key_by('s')
            subset_pheno = subset_pheno.filter(subset_pheno.global_idx < 10000)
            mt = full_mt.annotate_cols(**subset_pheno[full_mt.s])
            mt = mt.annotate_rows(maf=hl.agg.mean(mt.dosage) / 2)
            # NOTE: the comprehensions below rebind `i` inside their own
            # scope; the outer loop variable `i` is unaffected (Python 3).
            result_ht = hl.linear_regression_rows(
                y=[mt[i] for i in betas],
                x=mt.dosage,
                covariates=[1] + [mt['PC' + str(i)] for i in range(1, 21)],
                pass_through=['rsid', 'maf'])

            subset_pheno.export(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_inds_'
                + str(i) + '.tsv.gz')
            result_ht.write(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht',
                overwrite=True)

    if args.write_gwas:
        for i in range(10):
            result_ht = hl.read_table(
                'gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_'
                + str(i) + '.ht')
            result_ht = result_ht.key_by()
            # One flat column per (statistic, beta-setting) pair,
            # e.g. beta_beta_01, p_value_beta_100.
            get_expr = {
                field + '_' + x: result_ht[field][i]
                for i, x in enumerate(betas)
                for field in ['beta', 'standard_error', 'p_value']
            }
            result_ht.select(chr=result_ht.locus.contig, pos=result_ht.locus.position, rsid=result_ht.rsid, ref=result_ht.alleles[0],
                             alt=result_ht.alleles[1], maf=result_ht.maf, n=result_ht.n, **get_expr)\
                .export('gs://armartin/mama/spike_slab/UKB_hm3.chr22.cm.beta.true_PRS.gwas_sumstat_' + str(i) + '.tsv.gz')
Пример #15
0
# Output locations for the sex-imputation step.
IMPUTESEX_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_imputesex.ht'
IMPUTESEX_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_imputesex.tsv'
Y_NCALLED = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/05_ycalled.tsv'

# Inputs: QC-passing samples and LD-pruned chrX variants.
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_CHRX_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_chrX.prune.in'

PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_chrx_variants = hl.import_table(PRUNED_CHRX_VARIANTS, no_header=True)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

# Parse the headerless variant strings (column `f0`) into (locus, alleles)
# against GRCh38, then key the table so it can be joined on row keys.
ht_pruned_chrx_variants = ht_pruned_chrx_variants.annotate(
    **hl.parse_variant(ht_pruned_chrx_variants.f0, reference_genome='GRCh38'))
ht_pruned_chrx_variants = ht_pruned_chrx_variants.key_by(
    ht_pruned_chrx_variants.locus, ht_pruned_chrx_variants.alleles)

# MT_HARDCALLS is defined earlier in the file (not visible in this chunk).
mt = hl.read_matrix_table(MT_HARDCALLS)
# Keep only QC-passing samples and pruned chrX variants (semi-joins).
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_chrx_variants[mt.row_key]))

# count() returns (n_variants, n_samples).
n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

# Impute sex from chrX F-statistics; 0.6 is used for both cutoffs here.
imputed_sex = hl.impute_sex(mt.GT, female_threshold=0.6, male_threshold=0.6)
Пример #16
0
# FINAL_SAMPLE_LIST / FINAL_VARIANT_LIST / FINAL_PRUNED_VARIANTS and the
# *_TABLE paths are defined earlier in the file (not visible in this chunk).
ht_final_samples = hl.import_table(FINAL_SAMPLE_LIST, no_header=True, key='f0')
# The final variant list already has typed locus/alleles columns.
ht_final_variants = hl.import_table(FINAL_VARIANT_LIST,
                                    types={
                                        'locus':
                                        hl.tlocus(reference_genome='GRCh38'),
                                        'alleles':
                                        hl.tarray(hl.tstr)
                                    })
ht_final_variants = ht_final_variants.key_by(ht_final_variants.locus,
                                             ht_final_variants.alleles)

# The pruned list is headerless variant strings: parse column `f0` into
# (locus, alleles) against GRCh38 and key by them.
ht_final_pruned_variants = hl.import_table(FINAL_PRUNED_VARIANTS,
                                           no_header=True)
ht_final_pruned_variants = ht_final_pruned_variants.annotate(
    **hl.parse_variant(ht_final_pruned_variants.f0, reference_genome='GRCh38'))
ht_final_pruned_variants = ht_final_pruned_variants.key_by(
    ht_final_pruned_variants.locus, ht_final_pruned_variants.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
annotation_annotations = hl.read_table(ANNOTATION_TABLE)

mt = hl.read_matrix_table(MT)
# Drop split-multi bookkeeping fields that are no longer needed.
mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split')

# Keep only final samples and final variants (semi-joins on the keys).
mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])
Пример #17
0
def read_gwas_table(gwas_table_tsv, type_dic):
    """Import a GWAS summary-stats TSV and key it by (locus, alleles).

    The table's `variant` string column is parsed into a temporary struct
    `v`, whose locus/alleles become the key; `v` itself is then dropped.
    """
    table = hl.import_table(gwas_table_tsv, types=type_dic)
    table = table.annotate(v=hl.parse_variant(table.variant))
    table = table.key_by(table.v.locus, table.v.alleles)
    return table.drop('v')
Пример #18
0
# Input matrix table and PCA output path.
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
PCA_SCORES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/09_pca_scores.tsv'

# Sample/variant lists from earlier QC steps.
PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'
INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'
IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')

# Parse the headerless variant strings (column `f0`) into (locus, alleles)
# against GRCh38 and key by them so they can be joined on row keys.
ht_pruned_variants = ht_pruned_variants.annotate(**hl.parse_variant(ht_pruned_variants.f0, reference_genome='GRCh38'))
ht_pruned_variants = ht_pruned_variants.key_by(ht_pruned_variants.locus, ht_pruned_variants.alleles)

# Keep QC-passing samples, drop IBD-flagged samples, keep pruned variants.
mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_ibd_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_pruned_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype = sample_annotations[mt.s]).repartition(128).persist()

# count() returns (n_variants, n_samples).
n = mt.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])
Пример #19
0
def main(args):
    """Annotate fine-mapped SNPs with VEP consequences and gnomAD lead-SNP LD.

    Reads the SNP table from ``args.snp``, joins gnomAD v3 VEP annotations,
    derives a most-severe consequence per variant, and for each population
    in ``POPS`` computes r2 between every SNP and the top-probability (lead)
    SNP using the gnomAD v2.1.1 LD Block Matrices. The result is written to
    ``args.out`` as a tab-separated file (local path or ``gs://``).
    """
    ht_snp = hl.import_table(args.snp, impute=True)
    # Build a "chrom:pos:ref:alt" string and parse it into locus/alleles.
    ht_snp = ht_snp.annotate(variant=hl.delimit([
        ht_snp.chromosome,
        hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2
    ],
                                                delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    # idx_snp records the original row order; used below to write r2 values
    # back into the pandas frame at the right positions.
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # Extract the most severe consequence, preferring the canonical
    # transcript when one is available.
    ht_snp = ht_snp.annotate(vep=(hl.case().when(
        hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
        ht_snp.vep.worst_csq_for_variant_canonical).when(
            hl.is_defined(ht_snp.vep.worst_csq_for_variant),
            ht_snp.vep.worst_csq_for_variant).or_missing()),
                             is_canonical_vep=hl.is_defined(
                                 ht_snp.vep.worst_csq_for_variant_canonical))
    # Variants with no VEP record default to 'intergenic_variant'.
    ht_snp = ht_snp.annotate(most_severe=hl.if_else(
        hl.is_defined(ht_snp.vep), ht_snp.vep.most_severe_consequence,
        'intergenic_variant'),
                             gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        # LD tables are GRCh37; lift over and keep only liftable loci.
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        # Lead SNP = highest fine-mapping probability among matched variants.
        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangluar matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        # Row of the lead SNP, squared correlations.
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        col = f'gnomad_lead_r2_{pop}'
        df[col] = np.nan
        # Single .loc assignment instead of chained `df[col].iloc[...] = r2`:
        # chained assignment writes into a possibly-copied Series
        # (SettingWithCopyWarning) and is a no-op under pandas copy-on-write.
        # to_pandas() yields a RangeIndex, so positional indices map to labels.
        df.loc[df.index[idx_snp], col] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
Пример #20
0
    'Cells_Transformed_fibroblasts', 'Colon_Sigmoid', 'Colon_Transverse',
    'Esophagus_Gastroesophageal_Junction', 'Esophagus_Mucosa',
    'Esophagus_Muscularis', 'Heart_Atrial_Appendage', 'Heart_Left_Ventricle',
    'Liver', 'Lung', 'Minor_Salivary_Gland', 'Muscle_Skeletal', 'Nerve_Tibial',
    'Ovary', 'Pancreas', 'Pituitary', 'Prostate',
    'Skin_Not_Sun_Exposed_Suprapubic', 'Skin_Sun_Exposed_Lower_leg',
    'Small_Intestine_Terminal_Ileum', 'Spleen', 'Stomach', 'Testis', 'Thyroid',
    'Uterus', 'V****a', 'Whole_Blood'
]

# Import each tissue's GTEx v7 eQTL "allpairs" file and tag rows with the
# tissue name so the tables can be unioned and pivoted by tissue below.
hts = [(hl.import_table(
    'gs://hail-datasets/raw-data/gtex/v7/single-tissue-eqtl/processed/{}.allpairs.tsv.bgz'
    .format(x)).annotate(tissue='{}'.format(x))) for x in tissues]

# Stack all tissue tables and parse GTEx variant ids
# ("chr_pos_ref_alt_b37") into locus/alleles: strip the "_b37" suffix and
# turn underscores into the colons hl.parse_variant expects.
ht_union = hl.Table.union(*hts)
ht_union = ht_union.annotate(**hl.parse_variant(
    ht_union.variant_id.replace('_b37$', '').replace('_', ':')))
ht_union = ht_union.drop('variant_id')
# Cast the string columns imported from TSV to numeric types.
# GTEx writes '-nan' for missing slope_se; map it to missing before casting.
ht_union = ht_union.annotate(tss_distance=hl.int(ht_union['tss_distance']),
                             maf=hl.float(ht_union['maf']),
                             ma_samples=hl.int(ht_union['ma_samples']),
                             ma_count=hl.int(ht_union['ma_count']),
                             pval_nominal=hl.float(ht_union['pval_nominal']),
                             slope=hl.float(ht_union['slope']),
                             slope_se=hl.float(
                                 hl.or_missing(ht_union['slope_se'] != '-nan',
                                               ht_union['slope_se'])))

# Pivot: rows keyed by (locus, alleles, gene_id), one column per tissue.
mt = ht_union.to_matrix_table(row_key=['locus', 'alleles', 'gene_id'],
                              col_key=['tissue'],
                              row_fields=['tss_distance', 'maf'])
mt = mt.partition_rows_by(['locus'], 'locus', 'alleles', 'gene_id')
Пример #21
0
    #    ukb_snps = ukb_snps.annotate(sort_al=hl.sorted(ukb_snps.alleles))
    if verbose:
        print("\nCount 1: " + str(ukb_snps.count()) + '\n')

    ukb_snps = ukb_snps.filter(
        hl.is_snp(ukb_snps.alleles[0], ukb_snps.alleles[1])
        & (~(ukb_snps.locus.contig == 'X'))
        & (~((ukb_snps.locus.contig == '6') &
             (ukb_snps.locus.position > 25000000) &
             (ukb_snps.locus.position < 34000000))))
    if verbose:
        print("\nCount 2: " + str(ukb_snps.count()) + '\n')

    # merge in, filter on MAF from the UKBB GWAS sample
    ukb_qc = hl.import_table(GWAS_qc)
    ukb_qc = ukb_qc.annotate(vstruct=hl.parse_variant(ukb_qc.variant))
    ukb_qc = ukb_qc.annotate(locus=ukb_qc.vstruct.locus,
                             alleles=ukb_qc.vstruct.alleles).key_by(
                                 'locus', 'alleles')
    ukb_qc2 = ukb_snps.join(ukb_qc.select(ukb_qc.minor_AF))
    if verbose:
        print("\nCount 3: " + str(ukb_qc2.count()) + '\n')

    ukb_qc2 = ukb_qc2.filter((hl.float(ukb_qc2.minor_AF) > 0.01)
                             & (hl.float(ukb_qc2.minor_AF) < 0.99))
    if verbose:
        print("\nCount 4: " + str(ukb_qc2.count()) + '\n')

    # merge in rsid, info (from full UKB sample)
    # and filter to info > 0.9
    ukb_mfi = hl.read_table(GWAS_mfi).key_by('locus', 'alleles').repartition(