def test_vcf_vds_combiner_equivalence():
    # Check that the legacy VCF combiner and the VDS combiner produce the same
    # sparse MatrixTable for the same pair of GVCF inputs.
    # `hl` and `resource` come from the test module's top-level imports/helpers.
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds
    from hail.vds.combiner.combine import defined_entry_fields

    _paths = ['gvcfs/HG00096.g.vcf.gz', 'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    # These test GVCFs lack MQ_DP/VarDP/QUALapprox; add them as missing so
    # both pipelines see identical inputs.
    vcfs = [mt.annotate_rows(info=mt.info.annotate(MQ_DP=hl.missing(hl.tint32),
                                                   VarDP=hl.missing(hl.tint32),
                                                   QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    entry_to_keep = defined_entry_fields(
        vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vds = vds.combine_variant_datasets(
        [vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep) for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)

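# `resource` above is a helper from the Hail test suite that resolves a file
# name against the test resource directory. A minimal stand-in sketch, assuming
# inputs live under a local `resources/` directory (the directory name is an
# assumption for illustration, not Hail's actual layout):
import os

def resource(filename):
    # Hypothetical: resolve a test input relative to a resources directory.
    return os.path.join('resources', filename)
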
def compile_2k_merge(path):
    vcf = setup(path)
    vcfs = [vc_all.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
    combined = [vc_all.combine_gvcfs(vcfs)] * 20
    with TemporaryDirectory() as tmpdir:
        hl.experimental.write_matrix_tables(
            combined, os.path.join(tmpdir, 'combiner-multi-write'), overwrite=True)

def compile_2k_merge(path):
    # Same benchmark as above, but with IR logging disabled for the duration
    # of the run; the previous flag value is restored on exit.
    flagname = 'no_ir_logging'
    prev_flag_value = hl._get_flags(flagname).get(flagname)
    try:
        hl._set_flags(**{flagname: '1'})
        vcf = setup(path)
        vcfs = [vc_all.transform_gvcf(vcf)] * COMBINE_GVCF_MAX
        combined = [vc_all.combine_gvcfs(vcfs)] * 20
        with TemporaryDirectory() as tmpdir:
            hl.experimental.write_matrix_tables(
                combined, os.path.join(tmpdir, 'combiner-multi-write'), overwrite=True)
    finally:
        hl._set_flags(**{flagname: prev_flag_value})

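# The save/set/restore dance around `hl._set_flags` generalizes to a reusable
# context manager. A sketch, built only on the private `hl._get_flags` /
# `hl._set_flags` calls used above (`hail_flags` itself is a hypothetical
# helper, not part of Hail's API):
from contextlib import contextmanager

@contextmanager
def hail_flags(**flags):
    # Record current values, apply the new ones, restore on exit.
    prev = {name: hl._get_flags(name).get(name) for name in flags}
    hl._set_flags(**flags)
    try:
        yield
    finally:
        hl._set_flags(**prev)

# Usage: with hail_flags(no_ir_logging='1'): ...
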
def main(args):
    hl.init(default_reference='GRCh38')

    # Split the combined manifest into HGDP and 1000 Genomes (TGP) inputs.
    hgdp_inputs = []
    tgp_inputs = []
    with hl.hadoop_open('gs://hgdp_tgp/misc/tgp_plus_hgdp_30x_reblocked_gvcfs.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if 'HGDP' in line:
                hgdp_inputs.append(line)
            else:
                tgp_inputs.append(line)

    temp_bucket = 'gs://gnomad-tmp/tgp_hgdp'

    if args.get_sample_names:
        get_sample_names_from_list_of_files(tgp_inputs, get_samples_path('tgp'))
        get_sample_names_from_list_of_files(hgdp_inputs, get_samples_path('hgdp'))

    if args.create_sparse_mt:
        # Run the GVCF combiner separately for each cohort.
        sample_names = get_sample_list_in_order(get_samples_path('tgp'), tgp_inputs)
        hl.experimental.run_combiner(tgp_inputs,
                                     out_file=get_reference_mt_path('tgp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('tgp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)

        sample_names = get_sample_list_in_order(get_samples_path('hgdp'), hgdp_inputs)
        hl.experimental.run_combiner(hgdp_inputs,
                                     out_file=get_reference_mt_path('hgdp', sparse=True),
                                     tmp_path=temp_bucket,
                                     overwrite=args.overwrite,
                                     header=get_header_path('hgdp'),
                                     sample_names=sample_names,
                                     use_genome_default_intervals=True)

        # Harmonize gvcf_info schemas across the two cohorts, then merge them
        # into a single sparse MatrixTable.
        tgp_mt = hl.read_matrix_table(get_reference_mt_path('tgp', sparse=True))
        tgp_mt = tgp_mt.annotate_entries(
            gvcf_info=tgp_mt.gvcf_info.drop('MQ0', 'VariantType')).drop('AB', 'MQ0')
        hgdp_mt = hl.read_matrix_table(get_reference_mt_path('hgdp', sparse=True))
        hgdp_mt = hgdp_mt.annotate_entries(
            gvcf_info=hgdp_mt.gvcf_info.select(*tgp_mt.gvcf_info))
        mt = combine_gvcfs([tgp_mt, hgdp_mt])
        mt.write(get_reference_mt_path(sparse=True), overwrite=args.overwrite)

    if args.densify_mt:
        # Densify the sparse MatrixTable, keep only variant rows, and export.
        mt = hl.read_matrix_table(
            get_reference_mt_path(sparse=True)).key_rows_by('locus', 'alleles')
        mt = hl.experimental.densify(hl.experimental.sparse_split_multi(mt))
        mt = mt.filter_rows(hl.len(mt.alleles) > 1)
        mt.naive_coalesce(5000).write(get_reference_mt_path(), args.overwrite)

        mt = hl.read_matrix_table(get_reference_mt_path()).drop('gvcf_info')
        hl.export_vcf(mt, get_reference_mt_path(extension='vcf.bgz'),
                      parallel='header_per_shard')

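# `main` expects an `args` namespace with `get_sample_names`, `create_sparse_mt`,
# `densify_mt`, and `overwrite` attributes. A minimal sketch of the wiring,
# assuming boolean command-line switches (the option names are an assumption):
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--get-sample-names', action='store_true')
    parser.add_argument('--create-sparse-mt', action='store_true')
    parser.add_argument('--densify-mt', action='store_true')
    parser.add_argument('--overwrite', action='store_true')
    main(parser.parse_args())
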
def python_only_10k_combine(path):
    vcf = setup(path)
    mt = vc_all.transform_gvcf(vcf)
    mts = [mt] * 10_000
    # Merge in batches of at most COMBINE_GVCF_MAX inputs at a time.
    _ = [vc_all.combine_gvcfs(chunk) for chunk in chunks(mts, COMBINE_GVCF_MAX)]

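# `chunks` is a small helper from the benchmark utilities that splits a list
# into fixed-size batches. A minimal sketch of the assumed behavior:
def chunks(seq, size):
    # Yield successive slices of `seq` of at most `size` elements.
    return (seq[i:i + size] for i in range(0, len(seq), size))
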
def combine_variant_datasets(vdss: List[VariantDataset]) -> VariantDataset:
    # Merge the reference-block and variant components separately, then
    # reassemble a single VariantDataset.
    reference = combine_references([vds.reference_data for vds in vdss])
    # combine_gvcfs expects rows keyed by locus only; re-key by
    # (locus, alleles) afterwards, asserting sortedness to avoid a shuffle.
    no_variant_key = [vds.variant_data.key_rows_by('locus') for vds in vdss]
    variants = combine_gvcfs(no_variant_key)
    return VariantDataset(reference, variants._key_rows_by_assert_sorted('locus', 'alleles'))

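# A sketch of calling the combiner on already-loaded datasets; the paths are
# hypothetical, and `hl.vds.read_vds` / `VariantDataset.write` are the standard
# Hail entry points for VDS I/O:
import hail as hl

vdss = [hl.vds.read_vds(p) for p in ['gs://bucket/a.vds', 'gs://bucket/b.vds']]
merged = combine_variant_datasets(vdss)
merged.write('gs://bucket/merged.vds')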