Example #1
    def test_reference_genome(self):
        rg = hl.get_reference('GRCh37')
        self.assertEqual(rg.name, "GRCh37")
        self.assertEqual(rg.contigs[0], "1")
        self.assertListEqual(rg.x_contigs, ["X"])
        self.assertListEqual(rg.y_contigs, ["Y"])
        self.assertListEqual(rg.mt_contigs, ["MT"])
        self.assertEqual(rg.par[0], hl.eval(hl.parse_locus_interval("X:60001-2699521")))
        self.assertEqual(rg.contig_length("1"), 249250621)

        name = "test"
        contigs = ["1", "X", "Y", "MT"]
        lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
        x_contigs = ["X"]
        y_contigs = ["Y"]
        mt_contigs = ["MT"]
        par = [("X", 5, 1000)]

        gr2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par)
        self.assertEqual(gr2.name, name)
        self.assertListEqual(gr2.contigs, contigs)
        self.assertListEqual(gr2.x_contigs, x_contigs)
        self.assertListEqual(gr2.y_contigs, y_contigs)
        self.assertListEqual(gr2.mt_contigs, mt_contigs)
        self.assertEqual(gr2.par, [hl.eval(hl.parse_locus_interval("X:5-1000", gr2))])
        self.assertEqual(gr2.contig_length("1"), 10000)
        self.assertDictEqual(gr2.lengths, lengths)
        gr2.write("/tmp/my_gr.json")
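A side note on the pattern above: hl.parse_locus_interval returns an interval expression, not a Python value, so it must go through hl.eval (or .value, as in the next example) before it can be compared with rg.par[0]. A minimal sketch, assuming a running Hail session with the default GRCh37 reference:

import hail as hl

interval_expr = hl.parse_locus_interval("X:60001-2699521")  # lazy expression
par = hl.eval(interval_expr)  # concrete Interval of Locus values
print(par.start.position, par.end.position)  # 60001 2699521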
Example #2
    def test_reference_genome(self):
        rg = hl.get_reference('GRCh37')
        self.assertEqual(rg.name, "GRCh37")
        self.assertEqual(rg.contigs[0], "1")
        self.assertListEqual(rg.x_contigs, ["X"])
        self.assertListEqual(rg.y_contigs, ["Y"])
        self.assertListEqual(rg.mt_contigs, ["MT"])
        self.assertEqual(rg.par[0],
                         hl.parse_locus_interval("X:60001-2699521").value)
        self.assertEqual(rg.contig_length("1"), 249250621)

        name = "test"
        contigs = ["1", "X", "Y", "MT"]
        lengths = {"1": 10000, "X": 2000, "Y": 4000, "MT": 1000}
        x_contigs = ["X"]
        y_contigs = ["Y"]
        mt_contigs = ["MT"]
        par = [("X", 5, 1000)]

        gr2 = ReferenceGenome(name, contigs, lengths, x_contigs, y_contigs,
                              mt_contigs, par)
        self.assertEqual(gr2.name, name)
        self.assertListEqual(gr2.contigs, contigs)
        self.assertListEqual(gr2.x_contigs, x_contigs)
        self.assertListEqual(gr2.y_contigs, y_contigs)
        self.assertListEqual(gr2.mt_contigs, mt_contigs)
        self.assertEqual(gr2.par,
                         [hl.parse_locus_interval("X:5-1000", gr2).value])
        self.assertEqual(gr2.contig_length("1"), 10000)
        self.assertDictEqual(gr2.lengths, lengths)
        gr2.write("/tmp/my_gr.json")
Example #3
    def test_table_filter_intervals(self):
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20).rows()

        self.assertEqual(
            hl.filter_intervals(
                ds, [hl.parse_locus_interval('20:10639222-10644705')]).count(),
            3)

        intervals = [
            hl.parse_locus_interval('20:10639222-10644700'),
            hl.parse_locus_interval('20:10644700-10644705')
        ]
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = hl.array([
            hl.parse_locus_interval('20:10639222-10644700'),
            hl.parse_locus_interval('20:10644700-10644705')
        ])
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = hl.array([
            hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
            hl.parse_locus_interval('20:10644700-10644705')
        ])
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 3)

        intervals = [
            hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
            hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))
        ]
        self.assertEqual(hl.filter_intervals(ds, intervals).count(), 4)
Example #4
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(
            group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation,
            variant=hl.delimit(gene_ht.variants, '\t'),
        ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
Example #5
def setup(path):
    interval = [
        hl.eval(
            hl.parse_locus_interval('chr1:START-END',
                                    reference_genome='GRCh38'))
    ]
    return hl.import_vcfs([path], interval, reference_genome='GRCh38')[0]
Example #6
def annotate_sex(mt: hl.MatrixTable,
                 out_internal_mt_prefix: str,
                 male_threshold: float = 0.8,
                 female_threshold: float = 0.5) -> hl.MatrixTable:
    """
    Imputes sex, exports data, and annotates the MT with this data.
    NOTE: the default cutoffs of F < 0.5 for females and F > 0.8 for males were chosen by evaluating plots in R, for genomes.

    :param MatrixTable mt: MT containing samples to be ascertained for sex
    :param str out_internal_mt_prefix: file path prefix for tsv containing samples and sex imputation annotations
    :return: MatrixTable with imputed sex annotations stashed in column annotation 'sex_check'
    :rtype: MatrixTable
    """
    mt1 = hl.filter_intervals(mt, [hl.parse_locus_interval('chrX')])
    #mt = mt.filter_rows(mt.locus.in_x_nonpar())
    mtx_unphased = mt1.select_entries(
        GT=hl.unphased_diploid_gt_index_call(mt1.GT.n_alt_alleles()))
    #imputed_sex = hl.impute_sex(mtx_unphased.GT)
    sex_ht = hl.impute_sex(mtx_unphased.GT,
                           aaf_threshold=0.05,
                           female_threshold=female_threshold,
                           male_threshold=male_threshold)
    sex_ht.export(out_internal_mt_prefix + '.sex_check.txt.bgz')
    sex_colnames = ['f_stat', 'is_female']
    sex_ht = sex_ht.select(*sex_colnames)
    mt = mt.annotate_cols(**sex_ht[mt.col_key])
    return mt
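A hypothetical invocation of annotate_sex; the bucket path and output prefix below are illustrative only:

mt = hl.read_matrix_table('gs://my-bucket/genomes.mt')  # hypothetical MT with GT entries
mt = annotate_sex(mt, 'gs://my-bucket/qc/sex_imputation')
mt.cols().select('f_stat', 'is_female').show()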
Example #7
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
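Note that hl.parse_locus_interval("chr20", reference_genome='GRCh38') above parses a bare contig name into an interval spanning that whole contig. A quick sketch:

whole_chr20 = hl.parse_locus_interval('chr20', reference_genome='GRCh38')
print(hl.eval(whole_chr20))  # interval from chr20 position 1 through the contig end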
Example #8
def main(args):

    hl.init(log='/liftover.log')

    if args.gnomad:
        gnomad = True
        path = None

        if args.exomes:
            data_type = 'exomes'
        if args.genomes:
            data_type = 'genomes'

        logger.info('Working on gnomAD {} release ht'.format(data_type))
        logger.info('Reading in release ht')
        t = public_release(data_type).ht()
        logger.info('Variants in release ht: {}'.format(t.count()))

    else:
        data_type = None
        gnomad = False

        if args.ht:
            path = args.ht
            t = hl.read_table(args.ht)
        if args.mt:
            path = args.mt
            t = hl.read_matrix_table(args.mt)

    logger.info('Checking if input data has been split')
    if 'was_split' not in t.row:
        t = hl.split_multi(t) if isinstance(
            t, hl.Table) else hl.split_multi_hts(t)

    logger.info('Preparing reference genomes for liftover')
    source, target = get_liftover_genome(t)

    if args.test:
        logger.info('Filtering to chr21 for testing')
        if source.name == 'GRCh38':
            contig = 'chr21'
        else:
            contig = '21'
        t = hl.filter_intervals(
            t, [hl.parse_locus_interval(contig, reference_genome=source.name)])

    logger.info(f'Lifting data to {target.name}')
    t = lift_data(t, gnomad, data_type, path, target, args.overwrite)

    logger.info('Checking SNPs for reference mismatches')
    t = annotate_snp_mismatch(t, data_type, target)

    mismatch = check_mismatch(t) if isinstance(
        t, hl.Table) else check_mismatch(t.rows())
    logger.info('{} total SNPs'.format(mismatch['total_variants']))
    logger.info('{} SNPs on minus strand'.format(mismatch['negative_strand']))
    logger.info('{} reference mismatches in SNPs'.format(
        mismatch['total_mismatch']))
    logger.info('{} mismatches on minus strand'.format(
        mismatch['negative_strand_mismatch']))
Example #9
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.Table:
    """:func:`.impute_sex` as an aggregator."""
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.missing(hl.tbool)))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
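Hypothetical usage of the aggregator above (a sketch, not taken from the original project): aaf must be a row-indexed alternate-allele frequency, for example from hl.variant_qc:

mt = hl.variant_qc(mt)  # adds mt.variant_qc.AF
sex_ht = mt.annotate_cols(
    sex_check=impute_sex_aggregator(mt.GT, mt.variant_qc.AF[1])
).cols()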
Example #10
def genotype_correlation(chr_list):
    """
    Get classic LD correlation matrix from genotypes of white British, using
    variant_set variants.
    """
    starttime = datetime.datetime.now()

    gt1 = hl.read_matrix_table('gs://nbaya/split/ukb31063.' + variant_set +
                               '_variants.gwas_samples_repart.mt')

    print(gt1.count())
    print(gt1.describe())

    for ch in chr_list:
        mt_chr = hl.filter_intervals(gt1, [hl.parse_locus_interval(str(ch))])
        print(mt_chr.count_rows())
        print(mt_chr.describe())
        ld = hl.ld_matrix(mt_chr.dosage, mt_chr.locus, radius=3e7)
        ld_sparse = ld.sparsify_band(lower=1, upper=1000)
        ld_sparse.write('gs://nbaya/sumstats_corr/hm3_gt_correlation_chr' +
                        str(ch) + '.bm',
                        overwrite=True)

    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('\n####################')
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) +
          ' minutes')
    print('####################')
Example #11
def sumstats_correlation(chr_list):
    """
    Get LD correlation matrix of summary-statistic betas for white British,
    using variant_set variants.
    """
    starttime = datetime.datetime.now()

    mt0 = hl.read_matrix_table(
        'gs://phenotype_31063/hail/gwas.imputed_v3.both_sexes.annotated.mt')
    mt1 = mt0.filter_rows(hl.is_defined(
        variants[mt0.locus, mt0.alleles]))  #filter to variant_set variants
    mt1.describe()

    for ch in chr_list:
        mt_chr = hl.filter_intervals(mt1, [hl.parse_locus_interval(str(ch))])
        print(mt_chr.count_rows())
        ld = hl.ld_matrix(mt_chr.beta, mt_chr.locus, radius=3e7)
        ld_sparse = ld.sparsify_band(lower=1, upper=1000)
        ld_sparse.write('gs://nbaya/sumstats_corr/hm3_ss_correlation_chr' +
                        str(ch) + '.bm',
                        overwrite=True)

    endtime = datetime.datetime.now()
    elapsed = endtime - starttime
    print('\n####################')
    print('Iteration time: ' + str(round(elapsed.seconds / 60, 2)) +
          ' minutes')
    print('####################')
Example #12
def test_combiner_run():

    tmpdir = new_temp_file()
    samples = all_samples[:5]

    input_paths = [resource(os.path.join('gvcfs', '1kg_chr22', f'{s}.hg38.g.vcf.gz')) for s in samples]
    final_paths_individual = [os.path.join(tmpdir, f'sample_{s}') for s in samples]
    final_path_1 = os.path.join(tmpdir, 'final1.vds')
    final_path_2 = os.path.join(tmpdir, 'final2.vds')

    parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])

    for input_gvcf, path in zip(input_paths[:2], final_paths_individual[:2]):
        combiner = hl.vds.new_combiner(output_path=path, intervals=parts,
                                       temp_path=tmpdir,
                                       gvcf_paths=[input_gvcf],
                                       reference_genome='GRCh38')
        combiner.run()

    combiner = hl.vds.new_combiner(output_path=final_path_1, intervals=parts, temp_path=tmpdir,
                                   gvcf_paths=input_paths[2:], vds_paths=final_paths_individual[:2],
                                   reference_genome='GRCh38',
                                   branch_factor=2, batch_size=2)
    combiner.run()

    combiner2 = hl.vds.new_combiner(output_path=final_path_2, intervals=parts, temp_path=tmpdir,
                                    gvcf_paths=input_paths,
                                    reference_genome='GRCh38',
                                    branch_factor=2, batch_size=2)
    combiner2.run()

    assert hl.vds.read_vds(final_path_1)._same(hl.vds.read_vds(final_path_2))
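As an aside, the 'chr22:start-end' string above uses parse_locus_interval's special start/end tokens, which expand to the first and last positions of the contig:

parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])
print(parts[0].start.position)  # 1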
Example #13
    def test_liftover_strand(self):
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                            'GRCh38')

        self.assertEqual(
            hl.eval(
                hl.liftover(hl.locus('20', 60001, 'GRCh37'),
                            'GRCh38',
                            include_strand=True)),
            hl.eval(
                hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                          is_negative_strand=False)))

        self.assertEqual(
            hl.eval(
                hl.liftover(hl.locus_interval('20', 37007582, 37007586, True,
                                              True, 'GRCh37'),
                            'GRCh38',
                            include_strand=True)),
            hl.eval(
                hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121,
                                                   True, True, 'GRCh38'),
                          is_negative_strand=True)))

        with self.assertRaises(FatalError):
            hl.eval(
                hl.liftover(
                    hl.parse_locus_interval('1:10000-10000',
                                            reference_genome='GRCh37'),
                    'GRCh38'))

        grch37.remove_liftover("GRCh38")
Example #14
def pre_process_subset_freq(subset: str,
                            global_ht: hl.Table,
                            test: bool = False) -> hl.Table:
    """
    Prepare subset frequency Table by filling in missing frequency fields for loci present only in the global cohort.

    .. note::

        The resulting final `freq` array will be as long as the subset `freq_meta` global (i.e., one `freq` entry for each `freq_meta` entry)

    :param subset: subset ID
    :param global_ht: Hail Table containing all variants discovered in the overall release cohort
    :param test: If True, filter to small region on chr20
    :return: Table containing subset frequencies with missing freq structs filled in
    """

    # Read in subset HTs
    subset_ht_path = get_freq(subset=subset).path
    subset_chr20_ht_path = qc_temp_prefix() + f"chr20_test_freq.{subset}.ht"

    if test:
        if file_exists(subset_chr20_ht_path):
            logger.info(
                "Loading chr20 %s subset frequency data for testing: %s",
                subset,
                subset_chr20_ht_path,
            )
            subset_ht = hl.read_table(subset_chr20_ht_path)

        elif file_exists(subset_ht_path):
            logger.info(
                "Loading %s subset frequency data for testing: %s",
                subset,
                subset_ht_path,
            )
            subset_ht = hl.read_table(subset_ht_path)
            subset_ht = hl.filter_intervals(
                subset_ht, [hl.parse_locus_interval("chr20:1-1000000")])

    elif file_exists(subset_ht_path):
        logger.info("Loading %s subset frequency data: %s", subset,
                    subset_ht_path)
        subset_ht = hl.read_table(subset_ht_path)

    else:
        raise DataException(
            f"Hail Table containing {subset} subset frequencies not found. You may need to run the script generate_freq_data.py to generate frequency annotations first."
        )

    # Fill in missing freq structs
    ht = subset_ht.join(global_ht.select().select_globals(), how="right")
    ht = ht.annotate(freq=hl.if_else(
        hl.is_missing(ht.freq),
        hl.map(lambda x: missing_callstats_expr(),
               hl.range(hl.len(ht.freq_meta))),
        ht.freq,
    ))

    return ht
Example #15
def main(args):

    if args.debug:
        logger.setLevel(logging.DEBUG)

    data_type = 'exomes' if args.exomes else 'genomes'
    path_args = [data_type, True, args.least_consequence, args.max_freq]
    ht = hl.read_table(pbt_trio_et_path(*path_args))

    # Apply relevant filters
    if not args.export_sex_chrom:
        autosomes = hl.parse_locus_interval('1-22')
        ht = ht.filter(autosomes.contains(
            ht.locus1))  # locus1 and locus2 are always on the same contig

    if not args.export_filtered:
        ht = ht.filter((hl.len(ht.filters1) == 0) & (hl.len(ht.filters2) == 0))
        logger.debug(
            f'Rows remaining after keeping non-filtered variants: {ht.count()}'
        )

    if not args.export_raw:
        ht = ht.filter(ht.adj1 & ht.adj2)
        logger.debug(f'Rows remaining after keeping adj-only: {ht.count()}')

    if not args.export_other_pop:
        ht = ht.filter(hl.is_defined(ht.pop) & (ht.pop != 'oth'))
        logger.debug(
            f'Rows remaining after removing oth samples: {ht.count()}')

    pbt_vp_summary = hl.read_table(pbt_phase_count_ht_path(*path_args))
    pbt_vp_summary = pbt_vp_summary.filter(
        pbt_vp_summary.adj.n_same_hap + pbt_vp_summary.adj.n_chet > 0)
    indexed_pbt_vp_summary = pbt_vp_summary[ht.key]
    discordant_expr = (indexed_pbt_vp_summary.adj.n_same_hap >
                       0) & (indexed_pbt_vp_summary.adj.n_chet > 0)
    if args.exclude_discordant_vps:
        ht = ht.filter(discordant_expr)
    else:
        ht = ht.annotate(trio_phase_discordant=discordant_expr)

    ht = ht.filter((ht.pop_freq1.af <= args.max_pop_freq)
                   & (ht.pop_freq2.af <= args.max_pop_freq))
    logger.debug(
        f'Rows remaining after removing sites with freq > {args.max_pop_freq}: {ht.count()}'
    )

    # Annotate phase from gnomAD
    vp_ht = hl.read_table(phased_vp_count_ht_path(*path_args))
    vp_ht = vp_ht.select('em', 'em_plus_one', 'likelihood_model',
                         'singlet_het_ratio')
    ht = ht.annotate(**vp_ht[ht.key])

    ht = ht.flatten()
    ht.export(args.output)
Example #16
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
Example #17
    def get_chr_dp_ann(chrom: str) -> hl.Table:
        """
        Compute the mean depth of the specified chromosome.

        The total depth is the summed DP of both reference and variant data, or of variant data only, depending on
        the value of `use_only_variants` in the outer function.

        If `use_only_variants` is False, this value is computed from the median block coverage (summed over the
        block size). If `use_only_variants` is True, it is computed as the sum of DP across all variants divided
        by the total number of variants.

        Depth is calculated over non-PAR regions only when the contig is an X or Y reference contig, restricted to
        the intervals in `included_calling_intervals` and excluding those in `excluded_calling_intervals` when either
        is defined in the outer function (when `use_only_variants` is not set, this applies only to the contig size
        estimate and is not used when computing chromosome depth).

        :param chrom: Chromosome to compute the mean depth of
        :return: Table of a per sample mean depth of `chrom`
        """
        contig_size = get_contig_size(chrom)
        chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])

        if chrom in ref.x_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
        if chrom in ref.y_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

        if use_only_variants:
            if included_calling_intervals is not None:
                chr_mt = chr_mt.filter_rows(
                    hl.is_defined(included_calling_intervals[chr_mt.row_key]))
            if excluded_calling_intervals is not None:
                chr_mt = chr_mt.filter_rows(
                    hl.is_missing(excluded_calling_intervals[chr_mt.row_key]))
            return chr_mt.select_cols(
                **{
                    f"{chrom}_mean_dp":
                    hl.agg.filter(
                        chr_mt.LGT.is_non_ref(),
                        hl.agg.sum(chr_mt.DP),
                    ) / hl.agg.filter(chr_mt.LGT.is_non_ref(), hl.agg.count())
                }).cols()
        else:
            return chr_mt.select_cols(
                **{
                    f"{chrom}_mean_dp":
                    hl.agg.sum(
                        hl.if_else(
                            chr_mt.LGT.is_hom_ref(),
                            chr_mt.DP *
                            (1 + chr_mt.END - chr_mt.locus.position),
                            chr_mt.DP,
                        )) / contig_size
                }).cols()
Example #18
File: filters.py Project: bw2/hail-utils
def filter_to_autosomes(t):
    """
    Filters the Table or MatrixTable to autosomes only.
    This assumes that the input contains a field named `locus` of type Locus.
    :param MatrixTable or Table t: Input MT/HT
    :return: MT/HT filtered to autosomes
    :rtype: MatrixTable or Table
    """
    reference = get_reference_genome(t.locus)
    autosomes = hl.parse_locus_interval(f'{reference.contigs[0]}-{reference.contigs[21]}', reference_genome=reference)
    return hl.filter_intervals(t, [autosomes])
Example #19
def filter_to_autosomes(
        t: Union[hl.MatrixTable, hl.Table]) -> Union[hl.MatrixTable, hl.Table]:
    """
    Filters the Table or MatrixTable to autosomes only.
    This assumes that the input contains a field named `locus` of type Locus.
    :param t: Input MT/HT
    :return: MT/HT filtered to autosomes
    """
    reference = get_reference_genome(t.locus)
    autosomes = hl.parse_locus_interval(
        f"{reference.contigs[0]}-{reference.contigs[21]}",
        reference_genome=reference)
    return hl.filter_intervals(t, [autosomes])
Example #20
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    """
    Loads the SNV and indel VCFs into a matrix table and returns it.

    :param splice_ai_snvs_path: SNV VCF source location
    :param splice_ai_indels_path: indel VCF source location
    :param genome_version: "37" or "38"
    :return: matrix table
    """

    logger.info("==> reading in splice_ai vcfs: %s, %s" %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37 the interval includes the MT contig; for GRCh38, chrM is not included.
    interval = "1-MT" if genome_version == "37" else "chr1-chrY"
    contig_dict = None
    if genome_version == "38":
        contig_dict = NO_CHR_TO_CHR_CONTIG_RECODING

    mt = hl.import_vcf(
        [splice_ai_snvs_path, splice_ai_indels_path],
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_dict,
        force_bgz=True,
        min_partitions=10000,
        skip_invalid_loci=True,
    )
    interval = [
        hl.parse_locus_interval(interval,
                                reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Split SpliceAI field by | delimiter. Capture delta score entries and map to floats
    delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6]
    splice_split = mt.info.annotate(
        SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores))
    mt = mt.annotate_rows(info=splice_split)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    # delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL
    consequences = hl.literal(
        ["Acceptor gain", "Acceptor loss", "Donor gain", "Donor loss"])
    mt = mt.annotate_rows(info=mt.info.annotate(
        max_DS=hl.max(mt.info.SpliceAI)))
    mt = mt.annotate_rows(info=mt.info.annotate(splice_consequence=hl.if_else(
        mt.info.max_DS > 0,
        consequences[mt.info.SpliceAI.index(mt.info.max_DS)],
        "No consequence",
    )))
    return mt
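To make the delta-score slice above concrete: SpliceAI annotations follow the documented format ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL, so indices [2:6] pick out the four delta scores. A sketch with made-up values:

s = hl.str('T|RYR1|0.07|0.00|0.91|0.08|-12|2|1|5')  # made-up SpliceAI entry
scores = s.split(delim='\\|')[2:6]
print(hl.eval(hl.map(lambda x: hl.float32(x), scores)))  # ~[0.07, 0.0, 0.91, 0.08]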
Example #21
def train_rf(ht, args):
    features = FEATURES
    test_intervals = args.test_intervals
    print("test_intervals")
    print(test_intervals)
    #test_intervals = False

    if args.no_inbreeding_coeff:
        features.remove("InbreedingCoeff")

    fp_expr = ht.fail_hard_filters
    tp_expr = ht.omni & ht.mills & ht.kgp_phase1_hc & ht.hapmap
    if not args.no_transmitted_singletons:
        tp_expr = tp_expr | ht.transmitted_singleton

    if test_intervals:

        if isinstance(test_intervals, str):
            test_intervals = [test_intervals]
        test_intervals = [
            hl.parse_locus_interval(x, reference_genome="GRCh38")
            for x in test_intervals
        ]
        print("Resulting intervals")
        print(hl.eval(test_intervals))

    ht = ht.annotate(tp=tp_expr, fp=fp_expr)
    logger.info("Now runnning train_rf_model method")
    test_expr = hl.literal(test_intervals).any(
        lambda interval: interval.contains(ht.locus))
    print(ht.locus.show())
    print("Test_expr")
    print(test_expr)
    rf_ht, rf_model = train_rf_model(
        ht,
        rf_features=features,
        tp_expr=ht.tp,
        fp_expr=ht.fp,
        fp_to_tp=args.fp_to_tp,
        num_trees=args.num_trees,
        max_depth=args.max_depth,
        test_expr=hl.literal(test_intervals).any(
            lambda interval: interval.contains(ht.locus)),
    )

    logger.info("Joining original RF Table with training information")
    ht = ht.join(rf_ht, how="left")

    return ht, rf_model
Example #22
def filter_snps(mt, maf):
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt = mt.filter_rows(mt.maf > maf)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt = hl.filter_intervals(mt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ],
                             keep=False)

    return mt
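Note the K/M shorthand in the interval strings above ('25M' parses as 25,000,000), and keep=False, which inverts the filter to drop those regions. A quick check of the shorthand:

mhc = hl.parse_locus_interval('chr6:25M-35M', reference_genome='GRCh38')
print(hl.eval(mhc.start.position))  # 25000000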
Example #23
def test_filter_intervals_default():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))

    intervals = [
        hl.parse_locus_interval('chr22:10514784-10517000',
                                reference_genome='GRCh38')
    ]
    filt = hl.vds.filter_intervals(vds, intervals)

    assert hl.vds.to_dense_mt(filt)._same(
        hl.filter_intervals(hl.vds.to_dense_mt(vds), intervals))

    var = filt.variant_data
    assert var.aggregate_rows(hl.agg.all(intervals[0].contains(var.locus)))
Example #24
File: test_misc.py Project: danking/hail
    def test_matrix_filter_intervals(self):
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

        self.assertEqual(
            hl.filter_intervals(ds, [hl.parse_locus_interval('20:10639222-10644705')]).count_rows(), 3)

        intervals = [hl.parse_locus_interval('20:10639222-10644700'),
                     hl.parse_locus_interval('20:10644700-10644705')]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                              hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)

        intervals = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                     hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 4)
Example #25
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):

    print("\nInitial number of SNPs before filtering: {}".format(
        in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)
    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:1e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print(
        '\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]'
    )
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(mt_filt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ],
                                  keep=False)

    # This step is expensive (on local machine)
    print(
        f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}'
    )
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_ld_prune[mt_filt.row_key]))
    print("\nNumber of SNPs after filtering: {}".format(
        mt_ld_pruned.count_rows()))

    return mt_ld_pruned
Example #26
    def test_liftover_strand(self):
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

        self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

        self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                             'GRCh38', include_strand=True)),
                         hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                           is_negative_strand=True)))

        with self.assertRaises(FatalError):
            hl.eval(hl.liftover(hl.parse_locus_interval('1:10000-10000', reference_genome='GRCh37'), 'GRCh38'))

        grch37.remove_liftover("GRCh38")
Example #27
def vds_combiner_chr22(*paths):
    with TemporaryDirectory() as tmpdir:
        with TemporaryDirectory() as outpath:
            parts = hl.eval([
                hl.parse_locus_interval('chr22:start-end',
                                        reference_genome='GRCh38')
            ])

            from hail.vds.combiner import new_combiner
            combiner = new_combiner(output_path=outpath,
                                    intervals=parts,
                                    temp_path=tmpdir,
                                    gvcf_paths=paths,
                                    reference_genome='GRCh38',
                                    branch_factor=16,
                                    target_records=10000000)
            combiner.run()
Example #28
def write_ldsc_hm3_snplist(info_threshold=0.9,
                           maf_threshold=0.01,
                           overwrite=False):
    # Filter variants
    ht = hl.read_table(get_variant_results_qc_path())
    # in autosomes
    ht = ht.filter(ht.locus.in_autosome())
    # no MHC
    ht = ht.filter(
        ~hl.parse_locus_interval('6:28477797-33448354').contains(ht.locus))
    # info > 0.9
    ht = ht.filter(ht.info > info_threshold)
    # SNP only
    ht = ht.filter(hl.is_snp(ht.alleles[0], ht.alleles[1]))
    # no multi-allelic sites
    loc_count = ht.group_by(ht.locus).aggregate(nloc=hl.agg.count())
    loc_count = loc_count.filter(loc_count.nloc > 1)
    multi_sites = loc_count.aggregate(hl.agg.collect_as_set(loc_count.locus),
                                      _localize=False)
    ht = ht.filter(~multi_sites.contains(ht.locus))

    # in HM3
    hm3_snps = hl.read_table(
        'gs://ukbb-ldsc-dev/ukb_hm3_snplist/hm3.r3.b37.auto_bi_af.ht')
    hm3_snps = hm3_snps.select()
    ht = ht.join(hm3_snps, 'right')
    # no strand ambiguity
    ht = ht.filter(~hl.is_strand_ambiguous(ht.alleles[0], ht.alleles[1]))

    ht = checkpoint_tmp(ht)

    def get_maf(af):
        return 0.5 - hl.abs(0.5 - af)

    # MAF > 1% in UKB & gnomad genome/exome (if defined) for each population
    for pop in POPS:
        snplist = ht.filter(
            hl.rbind(
                ht.freq[ht.freq.index(lambda x: x.pop == pop)], lambda y:
                (get_maf(y.af) > maf_threshold) &
                (hl.is_missing(y.gnomad_genomes_af) |
                 (get_maf(y.gnomad_genomes_af) > maf_threshold)) &
                (hl.is_missing(y.gnomad_exomes_af) |
                 (get_maf(y.gnomad_exomes_af) > maf_threshold))))
        snplist = snplist.select('rsid')
        snplist.write(get_hm3_snplist_path(pop), overwrite=overwrite)
Example #29
def test_filter_intervals_default_table():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))

    intervals = [
        hl.parse_locus_interval('chr22:10514784-10517000',
                                reference_genome='GRCh38')
    ]
    intervals_table = hl.Table.parallelize(
        hl.array(intervals).map(lambda x: hl.struct(interval=x)),
        key='interval')
    filt = hl.vds.filter_intervals(vds, intervals_table)

    assert hl.vds.to_dense_mt(filt)._same(
        hl.filter_intervals(hl.vds.to_dense_mt(vds), intervals))

    var = filt.variant_data
    assert var.aggregate_rows(hl.agg.all(intervals[0].contains(var.locus)))
Example #30
    def get_chr_dp_ann(chrom: str) -> hl.Table:
        contig_size = get_contig_size(chrom)
        chr_mt = hl.filter_intervals(mt, [hl.parse_locus_interval(chrom)])

        if chrom in ref.x_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_x_nonpar())
        if chrom in ref.y_contigs:
            chr_mt = chr_mt.filter_rows(chr_mt.locus.in_y_nonpar())

        return chr_mt.select_cols(
            **{
                f"{chrom}_mean_dp":
                hl.agg.sum(
                    hl.if_else(
                        chr_mt.LGT.is_hom_ref(),
                        chr_mt.DP * (1 + chr_mt.END - chr_mt.locus.position),
                        chr_mt.DP,
                    )) / contig_size
            }).cols()
Example #31
File: test_api.py Project: shulik7/hail
    def test_constructors(self):
        rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})

        schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32, d=hl.tint32)
        rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}]
        kt = hl.Table.parallelize(rows, schema)
        kt = kt.annotate(d=hl.int64(kt.d))

        kt = kt.annotate(l1=hl.parse_locus("1:51"),
                         l2=hl.locus("1", 51, reference_genome=rg),
                         i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
                         i2=hl.interval(hl.locus("1", 51, reference_genome=rg),
                                        hl.locus("1", 56, reference_genome=rg)))

        expected_schema = {'a': hl.tfloat64, 'b': hl.tfloat64, 'c': hl.tint32, 'd': hl.tint64,
                           'l1': hl.tlocus(), 'l2': hl.tlocus(rg),
                           'i1': hl.tinterval(hl.tlocus(rg)), 'i2': hl.tinterval(hl.tlocus(rg))}

        self.assertTrue(all([expected_schema[f] == t for f, t in kt.row.dtype.items()]))
Example #32
def vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version):
    '''
    Loads the SNV and indel VCFs into a matrix table and returns it.

    :param splice_ai_snvs_path: SNV VCF source location
    :param splice_ai_indels_path: indel VCF source location
    :param genome_version: '37' or '38'
    :return: matrix table
    '''

    logger.info('==> reading in splice_ai vcfs: %s, %s' %
                (splice_ai_snvs_path, splice_ai_indels_path))

    # For GRCh37 the interval includes the MT contig; for GRCh38, chrM is not included.
    interval = '1-MT' if genome_version == '37' else 'chr1-chrY'
    contig_dict = None
    if genome_version == '38':
        rg = hl.get_reference('GRCh37')
        grch37_contigs = [
            x for x in rg.contigs
            if not x.startswith('GL') and not x.startswith('M')
        ]
        contig_dict = dict(
            zip(grch37_contigs, ['chr' + x for x in grch37_contigs]))

    mt = hl.import_vcf([splice_ai_snvs_path, splice_ai_indels_path],
                       reference_genome=f"GRCh{genome_version}",
                       contig_recoding=contig_dict,
                       force_bgz=True,
                       min_partitions=10000,
                       skip_invalid_loci=True)
    interval = [
        hl.parse_locus_interval(interval,
                                reference_genome=f"GRCh{genome_version}")
    ]
    mt = hl.filter_intervals(mt, interval)

    # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info.
    info = mt.info.annotate(max_DS=hl.max(
        [mt.info.DS_AG, mt.info.DS_AL, mt.info.DS_DG, mt.info.DS_DL]))
    mt = mt.annotate_rows(info=info)

    return mt
Example #33
def test_conversion_equivalence():
    gvcfs = [
        os.path.join(resource('gvcfs'), '1kg_chr22', path) for path in [
            'HG00187.hg38.g.vcf.gz', 'HG00190.hg38.g.vcf.gz',
            'HG00308.hg38.g.vcf.gz', 'HG00313.hg38.g.vcf.gz',
            'HG00320.hg38.g.vcf.gz'
        ]
    ]

    tmpdir = new_temp_file()
    mt_path = new_temp_file()
    vds_path = new_temp_file()

    hl.experimental.run_combiner(
        gvcfs,
        mt_path,
        tmpdir,
        use_exome_default_intervals=True,
        reference_genome='GRCh38',
        overwrite=True,
        intervals=[hl.eval(hl.parse_locus_interval('chr22', 'GRCh38'))],
        key_by_locus_and_alleles=True)

    svcr = hl.read_matrix_table(mt_path)

    vds = hl.vds.VariantDataset.from_merged_representation(svcr).checkpoint(
        vds_path)
    ref = vds.reference_data
    var = vds.variant_data

    assert svcr.aggregate_entries(hl.agg.count_where(hl.is_defined(
        svcr.END))) == ref.aggregate_entries(hl.agg.count())
    assert svcr.aggregate_entries(hl.agg.count()) == ref.aggregate_entries(
        hl.agg.count()) + var.aggregate_entries(hl.agg.count())

    svcr_readback = hl.vds.to_merged_sparse_mt(vds)

    assert svcr._same(svcr_readback)
Example #34
File: conftest.py Project: bcajes/hail
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)