def test_combiner_plan_round_trip_serialization():
    """A combiner plan saved to JSON and reloaded must compare equal to the original.

    Builds a small combiner over 5 gVCF samples, serializes it with ``plan.save()``,
    reloads it via ``load_combiner``, and asserts equality — exercising the plan's
    JSON round-trip and ``__eq__`` implementation.
    """
    sample_names = all_samples[:5]
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
        for s in sample_names
    ]
    plan_path = new_temp_file(extension='json')
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(
        gvcf_paths=paths,
        output_path=out_file,
        temp_path=Env.hc()._tmpdir,
        save_path=plan_path,
        reference_genome='GRCh38',
        use_exome_default_intervals=True,
        # small branch factor / batch size so the plan has nontrivial structure
        # to serialize even with only 5 inputs
        branch_factor=2,
        batch_size=2,
    )
    plan.save()
    plan_loaded = load_combiner(plan_path)
    assert plan == plan_loaded
def vds_combiner_chr22(*paths):
    """Run the VDS combiner over ``paths`` restricted to the whole of chr22.

    Uses temporary directories for both the combiner scratch space and the
    output path, so all artifacts are cleaned up when the context managers exit.
    """
    with TemporaryDirectory() as tmpdir:
        with TemporaryDirectory() as outpath:
            # Single interval covering all of chr22 on GRCh38.
            parts = hl.eval([
                hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')
            ])

            from hail.vds.combiner import new_combiner

            combiner = new_combiner(
                output_path=outpath,
                intervals=parts,
                temp_path=tmpdir,
                gvcf_paths=paths,
                reference_genome='GRCh38',
                branch_factor=16,
                target_records=10000000,
            )
            combiner.run()
def test_combiner_manual_filtration():
    """Field-filtering options must propagate from the combiner into the output VDS.

    Requests that only the 'GQ' reference entry field and the 'ExcessHet' gvcf_info
    field be kept, runs the combiner over 2 samples, and checks the resulting VDS
    schema reflects exactly those choices ('END' is always retained in reference data).
    """
    sample_names = all_samples[:2]
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
        for s in sample_names
    ]
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(
        gvcf_paths=paths,
        output_path=out_file,
        temp_path=Env.hc()._tmpdir,
        reference_genome='GRCh38',
        use_exome_default_intervals=True,
        gvcf_reference_entry_fields_to_keep=['GQ'],
        gvcf_info_to_keep=['ExcessHet'],
        # force=True: skip the interactive confirmation for manual field filtration
        force=True,
    )
    # The combiner normalizes the info-to-keep list into a set.
    assert plan.gvcf_info_to_keep == {'ExcessHet'}

    plan.run()
    vds = hl.vds.read_vds(out_file)
    assert list(vds.variant_data.gvcf_info) == ['ExcessHet']
    assert list(vds.reference_data.entry) == ['END', 'GQ']