Exemplo n.º 1
0
def test_1kg_chr22():
    """Combine five 1kg chr22 GVCFs and compare per-sample counts with the inputs.

    For every input GVCF two numbers are computed straight from the imported
    VCF: the record count and the count of records whose first entry has a
    non-reference ``GT``.  The combined matrix table must yield the same two
    numbers per column, with ``LGT`` taking the place of ``GT``.
    """
    combined_path = new_temp_file(extension='mt')

    samples = all_samples[:5]
    gvcf_paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{name}.hg38.g.vcf.gz')
        for name in samples
    ]
    vc.run_combiner(
        gvcf_paths,
        out_file=combined_path,
        tmp_path=Env.hc()._tmpdir,
        branch_factor=2,
        batch_size=2,
        reference_genome='GRCh38',
    )

    def input_counts(gvcf_path):
        # Localize entries so the single sample's call is reachable as
        # entries[0] from a plain table aggregation.
        imported = hl.import_vcf(gvcf_path, force_bgz=True, reference_genome='GRCh38')
        table = imported.localize_entries('entries')
        non_ref = table.entries[0].GT.is_non_ref()
        n, n_variant = table.aggregate((hl.agg.count(), hl.agg.count_where(non_ref)))
        return (n, n_variant)

    # Expected (n, n_variant) for each sample, keyed by sample name.
    expected = {name: input_counts(path) for name, path in zip(samples, gvcf_paths)}

    combined = hl.read_matrix_table(combined_path)
    # Per column: number of (non-missing) records and number of non-ref LGT calls.
    combined = combined.annotate_cols(
        n=hl.agg.count(),
        n_variant=hl.agg.count_where(combined.LGT.is_non_ref()),
    )

    per_sample = hl.tuple([combined.s, combined.n, combined.n_variant]).collect()
    assert len(per_sample) == len(samples)

    for sample, n, n_variant in per_sample:
        expected_n, expected_n_variant = expected[sample]
        assert n == expected_n, sample
        assert n_variant == expected_n_variant, sample
Exemplo n.º 2
0
def test_contig_recoding():
    """Check that ``contig_recoding`` makes the recoded GVCF combine identically.

    The combiner is run once on the plain GVCF and once on the ``recoded``
    GVCF with ``contig_recoding={'22': 'chr22'}``; both outputs must have the
    same dimensions and compare equal via ``_same``.
    NOTE(review): presumably the recoded file names the contig ``22`` instead
    of ``chr22`` -- confirm against the test resources.
    """
    plain_gvcf = os.path.join(
        resource('gvcfs'), 'recoding', 'HG00187.hg38.g.vcf.gz')
    recoded_gvcf = os.path.join(
        resource('gvcfs'), 'recoding', 'HG00187.hg38.recoded.g.vcf.gz')

    plain_out = new_temp_file(extension='mt')
    recoded_out = new_temp_file(extension='mt')

    # Options common to both combiner runs.
    shared_options = {
        'reference_genome': 'GRCh38',
        'use_exome_default_intervals': True,
    }

    # Each run combines the same GVCF with itself.
    vc.run_combiner([plain_gvcf] * 2,
                    plain_out,
                    Env.hc()._tmpdir,
                    **shared_options)
    vc.run_combiner([recoded_gvcf] * 2,
                    recoded_out,
                    Env.hc()._tmpdir,
                    contig_recoding={'22': 'chr22'},
                    **shared_options)

    plain_mt = hl.read_matrix_table(plain_out)
    recoded_mt = hl.read_matrix_table(recoded_out)

    assert plain_mt.count() == recoded_mt.count()
    assert plain_mt._same(recoded_mt)
Exemplo n.º 3
0
def full_combiner_chr22(*paths):
    """Run the combiner over ``paths``, writing the result to a throwaway directory.

    Nothing is returned; the output directory only exists inside the
    ``TemporaryDirectory`` context (presumably ``tempfile.TemporaryDirectory``).

    :param paths: GVCF paths, passed to ``run_combiner`` as a list.
    """
    gvcf_paths = [*paths]
    with TemporaryDirectory() as scratch_out:
        # overwrite=True because the temporary directory already exists
        # when the combiner is asked to write into it.
        vc_all.run_combiner(
            gvcf_paths,
            out_file=scratch_out,
            tmp_path='/tmp',
            branch_factor=16,
            reference_genome='GRCh38',
            overwrite=True,
        )
0
def test_key_by_locus_alleles():
    """Check that ``key_by_locus_and_alleles=True`` keys rows by locus and alleles.

    Combines five 1kg chr22 GVCFs, verifies the row key of the output, then
    forces a row count so the written table is actually read through.
    """
    destination = new_temp_file(extension='mt')

    samples = all_samples[:5]
    gvcf_paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{name}.hg38.g.vcf.gz')
        for name in samples
    ]
    vc.run_combiner(
        gvcf_paths,
        out_file=destination,
        tmp_path=Env.hc()._tmpdir,
        reference_genome='GRCh38',
        key_by_locus_and_alleles=True,
    )

    combined = hl.read_matrix_table(destination)
    row_key_fields = list(combined.row_key)
    assert row_key_fields == ['locus', 'alleles']
    combined._force_count_rows()
Exemplo n.º 5
0
def test_non_ref_alleles_set_to_missing():
    """Check that every combined genotype indexes into the row's allele list.

    The local genotype (``LGT`` with ``LA``) is converted to a global genotype
    and its unphased diploid index is compared with ``n * (n + 1) / 2``, the
    number of unphased diploid genotypes over ``n`` alleles.
    NOTE(review): per the test name, calls involving the non-ref allele are
    expected to be missing rather than out of range -- confirm.
    """
    source_gvcf = os.path.join(resource('gvcfs'), 'non_ref_call.g.vcf.gz')
    destination = new_temp_file(extension='mt')
    # The same GVCF is combined with itself.
    vc.run_combiner(
        [source_gvcf] * 2,
        out_file=destination,
        tmp_path=Env.hc()._tmpdir,
        branch_factor=2,
        batch_size=2,
        reference_genome='GRCh38',
    )

    combined = hl.read_matrix_table(destination)
    allele_count = hl.len(combined.alleles)
    global_gt = hl.experimental.lgt_to_gt(combined.LGT, combined.LA)
    gt_index = global_gt.unphased_diploid_gt_index()
    # Exclusive upper bound on a valid unphased diploid genotype index.
    index_bound = (allele_count * (allele_count + 1)) / 2
    all_in_range = combined.aggregate_entries(hl.agg.all(gt_index < index_bound))
    assert all_in_range
Exemplo n.º 6
0
def test_sample_override():
    """Check that ``sample_names`` replaces the column names in the combined output.

    Five 1kg chr22 GVCFs are combined with the first GVCF supplied as the
    ``header`` and replacement names ``S0`` .. ``S4``; the column ``s`` values
    of the result must equal those names, in order.
    """
    destination = new_temp_file(extension='mt')

    original_names = all_samples[:5]
    replacement_names = [f'S{idx}' for idx in range(len(original_names))]
    gvcf_paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{name}.hg38.g.vcf.gz')
        for name in original_names
    ]
    vc.run_combiner(
        gvcf_paths,
        out_file=destination,
        tmp_path=Env.hc()._tmpdir,
        reference_genome='GRCh38',
        header=gvcf_paths[0],  # the first GVCF doubles as the header file
        sample_names=replacement_names,
        key_by_locus_and_alleles=True,
        use_exome_default_intervals=True,
    )

    # Unkey the columns before collecting the sample names from the cols table.
    cols = hl.read_matrix_table(destination).key_cols_by().cols()
    observed_names = cols.aggregate(hl.agg.collect(cols.s))
    assert replacement_names == observed_names