def test_1kg_chr22():
    """Combine five 1kg chr22 gVCFs and compare per-sample counts with the inputs.

    For every sample, the entry count and the non-ref call count computed from
    the combiner output must equal the values obtained by importing that
    sample's gVCF on its own.
    """
    combined_path = new_temp_file(extension='mt')

    sample_names = all_samples[:5]
    gvcf_dir = os.path.join(resource('gvcfs'), '1kg_chr22')
    paths = [os.path.join(gvcf_dir, f'{s}.hg38.g.vcf.gz') for s in sample_names]
    vc.run_combiner(
        paths,
        out_file=combined_path,
        tmp_path=Env.hc()._tmpdir,
        branch_factor=2,
        batch_size=2,
        reference_genome='GRCh38',
    )

    # Expected (count, non-ref count) per sample, taken from each input gVCF.
    expected = {}
    for name, gvcf_path in zip(sample_names, paths):
        single = hl.import_vcf(gvcf_path, force_bgz=True, reference_genome='GRCh38')
        single = single.localize_entries('entries')
        # Each gVCF is read one sample at a time here, so index 0 is its only entry.
        is_variant = single.entries[0].GT.is_non_ref()
        n_records, n_non_ref = single.aggregate(
            (hl.agg.count(), hl.agg.count_where(is_variant))
        )
        expected[name] = (n_records, n_non_ref)

    mt = hl.read_matrix_table(combined_path)
    # annotate the number of non-missing records
    mt = mt.annotate_cols(
        n=hl.agg.count(),
        n_variant=hl.agg.count_where(mt.LGT.is_non_ref()),
    )

    observed = hl.tuple([mt.s, mt.n, mt.n_variant]).collect()
    assert len(observed) == len(sample_names)

    for name, n, n_variant in observed:
        true_n, true_n_variant = expected[name]
        assert n == true_n, name
        assert n_variant == true_n_variant, name
def test_contig_recoding():
    """Check that combining with ``contig_recoding`` matches the un-recoded run.

    ``HG00187.hg38.g.vcf.gz`` is combined as-is, while the ``.recoded`` variant
    is combined with ``contig_recoding={'22': 'chr22'}``. Both outputs must
    have the same dimensions and the same contents.
    """
    recoding_dir = os.path.join(resource('gvcfs'), 'recoding')
    plain_gvcf = os.path.join(recoding_dir, 'HG00187.hg38.g.vcf.gz')
    recoded_gvcf = os.path.join(recoding_dir, 'HG00187.hg38.recoded.g.vcf.gz')

    plain_out = new_temp_file(extension='mt')
    recoded_out = new_temp_file(extension='mt')

    # Options common to both combiner runs.
    shared_opts = {
        'reference_genome': 'GRCh38',
        'use_exome_default_intervals': True,
    }

    # Each run combines the same gVCF with itself.
    vc.run_combiner(
        [plain_gvcf, plain_gvcf],
        plain_out,
        Env.hc()._tmpdir,
        **shared_opts,
    )
    vc.run_combiner(
        [recoded_gvcf, recoded_gvcf],
        recoded_out,
        Env.hc()._tmpdir,
        contig_recoding={'22': 'chr22'},
        **shared_opts,
    )

    plain_mt = hl.read_matrix_table(plain_out)
    recoded_mt = hl.read_matrix_table(recoded_out)

    assert plain_mt.count() == recoded_mt.count()
    assert plain_mt._same(recoded_mt)
def full_combiner_chr22(*paths):
    """Run the combiner over ``paths``, writing into a throwaway directory.

    The output goes to a ``TemporaryDirectory`` that is removed when the
    function exits, so only the act of running the combiner is exercised.
    Nothing is returned.
    """
    gvcf_paths = list(paths)
    with TemporaryDirectory() as scratch_dir:
        # The temporary directory already exists at this point, which is
        # presumably why overwrite=True is passed.
        vc_all.run_combiner(
            gvcf_paths,
            out_file=scratch_dir,
            tmp_path='/tmp',
            branch_factor=16,
            reference_genome='GRCh38',
            overwrite=True,
        )
def test_key_by_locus_alleles():
    """Check that ``key_by_locus_and_alleles=True`` keys rows by locus and alleles."""
    combined_path = new_temp_file(extension='mt')

    gvcf_dir = os.path.join(resource('gvcfs'), '1kg_chr22')
    paths = [
        os.path.join(gvcf_dir, f'{name}.hg38.g.vcf.gz')
        for name in all_samples[:5]
    ]
    vc.run_combiner(
        paths,
        out_file=combined_path,
        tmp_path=Env.hc()._tmpdir,
        reference_genome='GRCh38',
        key_by_locus_and_alleles=True,
    )

    mt = hl.read_matrix_table(combined_path)
    row_key_fields = list(mt.row_key)
    assert row_key_fields == ['locus', 'alleles']
    # Presumably forces the rows to be read so a broken output fails here.
    mt._force_count_rows()
def test_non_ref_alleles_set_to_missing():
    """Check that every genotype index in the combined output is in range.

    After combining ``non_ref_call.g.vcf.gz`` with itself, each entry's
    unphased diploid genotype index (computed from ``LGT`` and ``LA``) must be
    smaller than the number of genotypes possible for the row's alleles.
    """
    gvcf = os.path.join(resource('gvcfs'), 'non_ref_call.g.vcf.gz')
    combined_path = new_temp_file(extension='mt')
    vc.run_combiner(
        [gvcf, gvcf],
        out_file=combined_path,
        tmp_path=Env.hc()._tmpdir,
        branch_factor=2,
        batch_size=2,
        reference_genome='GRCh38',
    )

    mt = hl.read_matrix_table(combined_path)

    n_alleles = hl.len(mt.alleles)
    # k alleles admit k * (k + 1) / 2 distinct unphased diploid genotypes.
    n_genotypes = (n_alleles * (n_alleles + 1)) / 2

    global_gt = hl.experimental.lgt_to_gt(mt.LGT, mt.LA)
    gt_idx = global_gt.unphased_diploid_gt_index()

    all_in_range = mt.aggregate_entries(hl.agg.all(gt_idx < n_genotypes))
    assert all_in_range
def test_sample_override():
    """Check that ``sample_names`` overrides the sample IDs of the inputs.

    Five 1kg chr22 gVCFs are combined with an explicit ``header`` (the first
    gVCF) and replacement names ``S0`` .. ``S4``. The column IDs of the
    output must equal the replacement names, in the same order.
    """
    out_file = new_temp_file(extension='mt')
    sample_names = all_samples[:5]
    new_names = [f'S{i}' for i, _ in enumerate(sample_names)]
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
        for s in sample_names
    ]
    header_path = paths[0]
    vc.run_combiner(
        paths,
        out_file=out_file,
        tmp_path=Env.hc()._tmpdir,
        reference_genome='GRCh38',
        header=header_path,
        sample_names=new_names,
        key_by_locus_and_alleles=True,
        use_exome_default_intervals=True,
    )

    # De-key the columns before cols() so the resulting table keeps the
    # matrix table's column order instead of being sorted by key.
    mt_cols = hl.read_matrix_table(out_file).key_cols_by().cols()
    # Collect the field directly: hl.agg.collect documents its element order
    # as not guaranteed, which makes an ordered list comparison fragile.
    mt_names = mt_cols.s.collect()
    assert new_names == mt_names