Example #1
def test_contig_recoding():
    path1 = os.path.join(resource('gvcfs'), 'recoding',
                         'HG00187.hg38.g.vcf.gz')
    path2 = os.path.join(resource('gvcfs'), 'recoding',
                         'HG00187.hg38.recoded.g.vcf.gz')

    out_file_1 = new_temp_file(extension='mt')
    out_file_2 = new_temp_file(extension='mt')

    vc.run_combiner([path1, path1],
                    out_file_1,
                    Env.hc()._tmpdir,
                    reference_genome='GRCh38',
                    use_exome_default_intervals=True)
    vc.run_combiner([path2, path2],
                    out_file_2,
                    Env.hc()._tmpdir,
                    reference_genome='GRCh38',
                    contig_recoding={'22': 'chr22'},
                    use_exome_default_intervals=True)

    mt1 = hl.read_matrix_table(out_file_1)
    mt2 = hl.read_matrix_table(out_file_2)

    assert mt1.count() == mt2.count()
    assert mt1._same(mt2)
Example #2
    def test_write_stage_locally(self):
        mt = self.get_vds()
        f = new_temp_file(suffix='mt')
        mt.write(f, stage_locally=True)

        mt2 = hl.read_matrix_table(f)
        self.assertTrue(mt._same(mt2))
Example #3
 def test_read_stored_globals(self):
     ds = self.get_vds()
     ds = ds.annotate_globals(x=5, baz='foo')
     f = new_temp_file(suffix='vds')
     ds.write(f)
     t = hl.read_table(f + '/globals')
     self.assertTrue(ds.globals_table()._same(t))
Example #4
def test_1kg_chr22():
    out_file = new_temp_file(suffix='mt')

    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    vc.run_combiner(paths,
                    out_file=out_file,
                    tmp_path=Env.hc()._tmpdir,
                    branch_factor=2,
                    batch_size=2,
                    reference_genome='GRCh38')

    sample_data = dict()
    for sample, path in zip(sample_names, paths):
        ht = hl.import_vcf(path, force_bgz=True, reference_genome='GRCh38').localize_entries('entries')
        n, n_variant = ht.aggregate((hl.agg.count(), hl.agg.count_where(ht.entries[0].GT.is_non_ref())))
        sample_data[sample] = (n, n_variant)

    mt = hl.read_matrix_table(out_file)
    mt = mt.annotate_cols(n=hl.agg.count(), n_variant=hl.agg.count_where(
        mt.LGT.is_non_ref()))  # annotate the number of non-missing records

    combined_results = hl.tuple([mt.s, mt.n, mt.n_variant]).collect()
    assert len(combined_results) == len(sample_names)

    for sample, n, n_variant in combined_results:
        true_n, true_n_variant = sample_data[sample]
        assert n == true_n, sample
        assert n_variant == true_n_variant, sample
Example #5
def test_combiner_plan_round_trip_serialization():
    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    plan_path = new_temp_file(extension='json')
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        save_path=plan_path,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        branch_factor=2,
                        batch_size=2)
    plan.save()
    plan_loaded = load_combiner(plan_path)
    assert plan == plan_loaded
Example #6
def test_combiner_run():

    tmpdir = new_temp_file()
    samples = all_samples[:5]

    input_paths = [resource(os.path.join('gvcfs', '1kg_chr22', f'{s}.hg38.g.vcf.gz')) for s in samples]
    final_paths_individual = [os.path.join(tmpdir, f'sample_{s}') for s in samples]
    final_path_1 = os.path.join(tmpdir, 'final1.vds')
    final_path_2 = os.path.join(tmpdir, 'final2.vds')

    parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])

    for input_gvcf, path in zip(input_paths[:2], final_paths_individual[:2]):
        combiner = hl.vds.new_combiner(output_path=path, intervals=parts,
                                       temp_path=tmpdir,
                                       gvcf_paths=[input_gvcf],
                                       reference_genome='GRCh38')
        combiner.run()

    combiner = hl.vds.new_combiner(output_path=final_path_1, intervals=parts, temp_path=tmpdir,
                                   gvcf_paths=input_paths[2:], vds_paths=final_paths_individual[:2],
                                   reference_genome='GRCh38',
                                   branch_factor=2, batch_size=2)
    combiner.run()

    combiner2 = hl.vds.new_combiner(output_path=final_path_2, intervals=parts, temp_path=tmpdir,
                                    gvcf_paths=input_paths,
                                    reference_genome='GRCh38',
                                    branch_factor=2, batch_size=2)
    combiner2.run()

    assert hl.vds.read_vds(final_path_1)._same(hl.vds.read_vds(final_path_2))
Example #7
 def test_codecs_table(self):
     from hail.utils.java import Env, scala_object
     codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
     rt = self.get_vds().rows()
     temp = new_temp_file(suffix='ht')
     for codec in codecs:
         rt.write(temp, overwrite=True, _codec_spec=codec.toString())
         rt2 = hl.read_table(temp)
         self.assertTrue(rt._same(rt2))
Example #8
 def test_codecs_matrix(self):
     from hail.utils.java import Env, scala_object
     codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
     ds = self.get_vds()
     temp = new_temp_file(suffix='hmt')
     for codec in codecs:
         ds.write(temp, overwrite=True, _codec_spec=codec.toString())
         ds2 = hl.read_matrix_table(temp)
         self.assertTrue(ds._same(ds2))
Example #9
 def test_multi_write(self):
     mt = self.get_vds()
     f = new_temp_file()
     hl.experimental.write_matrix_tables([mt, mt], f)
     path1 = f + '0.mt'
     path2 = f + '1.mt'
     mt1 = hl.read_matrix_table(path1)
     mt2 = hl.read_matrix_table(path2)
     self.assertTrue(mt._same(mt1))
     self.assertTrue(mt._same(mt2))
     self.assertTrue(mt1._same(mt2))
Example #10
def test_key_by_locus_alleles():
    out_file = new_temp_file(extension='mt')

    sample_names = all_samples[:5]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    vc.run_combiner(paths,
                    out_file=out_file,
                    tmp_path=Env.hc()._tmpdir,
                    reference_genome='GRCh38',
                    key_by_locus_and_alleles=True)

    mt = hl.read_matrix_table(out_file)
    assert list(mt.row_key) == ['locus', 'alleles']
    mt._force_count_rows()
Example #11
def test_non_ref_alleles_set_to_missing():
    path = os.path.join(resource('gvcfs'), 'non_ref_call.g.vcf.gz')
    out_file = new_temp_file(extension='mt')
    vc.run_combiner([path, path],
                    out_file=out_file,
                    tmp_path=Env.hc()._tmpdir,
                    branch_factor=2,
                    batch_size=2,
                    reference_genome='GRCh38')

    mt = hl.read_matrix_table(out_file)
    n_alleles = hl.len(mt.alleles)
    gt_idx = hl.experimental.lgt_to_gt(mt.LGT, mt.LA).unphased_diploid_gt_index()
    assert mt.aggregate_entries(
        hl.agg.all(gt_idx < (n_alleles * (n_alleles + 1)) / 2))
Example #12
    def test_head(self):
        # no empty partitions
        mt1 = hl.utils.range_matrix_table(10, 10)

        # empty partitions at front
        mt2 = hl.utils.range_matrix_table(20, 10, 20)
        mt2 = mt2.filter_rows(mt2.row_idx > 9)
        mts = [mt1, mt2]

        for mt in mts:
            tmp_file = new_temp_file(suffix='mt')

            mt.write(tmp_file)
            mt_readback = hl.read_matrix_table(tmp_file)
            for mt_ in [mt, mt_readback]:
                assert mt_.head(1).count_rows() == 1
                assert mt_.head(1)._force_count_rows() == 1
                assert mt_.head(100).count_rows() == 10
                assert mt_.head(100)._force_count_rows() == 10
Example #13
def test_combiner_manual_filtration():
    sample_names = all_samples[:2]
    paths = [os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz') for s in sample_names]
    out_file = new_temp_file(extension='vds')
    plan = new_combiner(gvcf_paths=paths,
                        output_path=out_file,
                        temp_path=Env.hc()._tmpdir,
                        reference_genome='GRCh38',
                        use_exome_default_intervals=True,
                        gvcf_reference_entry_fields_to_keep=['GQ'],
                        gvcf_info_to_keep=['ExcessHet'],
                        force=True)

    assert plan.gvcf_info_to_keep == {'ExcessHet'}

    plan.run()
    vds = hl.vds.read_vds(out_file)
    assert list(vds.variant_data.gvcf_info) == ['ExcessHet']
    assert list(vds.reference_data.entry) == ['END', 'GQ']
Example #14
def test_sample_override():
    out_file = new_temp_file(extension='mt')

    sample_names = all_samples[:5]
    new_names = [f'S{i}' for i, _ in enumerate(sample_names)]
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', f'{s}.hg38.g.vcf.gz')
        for s in sample_names
    ]
    header_path = paths[0]
    vc.run_combiner(paths,
                    out_file=out_file,
                    tmp_path=Env.hc()._tmpdir,
                    reference_genome='GRCh38',
                    header=header_path,
                    sample_names=new_names,
                    key_by_locus_and_alleles=True,
                    use_exome_default_intervals=True)
    mt_cols = hl.read_matrix_table(out_file).key_cols_by().cols()
    mt_names = mt_cols.aggregate(hl.agg.collect(mt_cols.s))
    assert new_names == mt_names
Example #15
def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals,
                                 normalization_contig: str) -> hl.Table:
    """Impute sex chromosome ploidy from depth of reference data within calling intervals.

    Returns a :class:`.Table` keyed by sample ID, with the following fields:

     -  ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on the normalization contig.
     -  ``x_mean_dp`` (*float64*): Mean depth on calling intervals on the X chromosome.
     -  ``x_ploidy`` (*float64*): Estimated ploidy of the X chromosome. Equal to ``2 * x_mean_dp / autosomal_mean_dp``.
     -  ``y_mean_dp`` (*float64*): Mean depth on calling intervals on the Y chromosome.
     -  ``y_ploidy`` (*float64*): Estimated ploidy of the Y chromosome. Equal to ``2 * y_mean_dp / autosomal_mean_dp``.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset.
    calling_intervals : :class:`.Table` or :class:`.ArrayExpression`
        Calling intervals with consistent read coverage (for exomes, trim the capture intervals).
    normalization_contig : str
        Autosomal contig for depth comparison.

    Returns
    -------
    :class:`.Table`
    """

    if not isinstance(calling_intervals, Table):
        calling_intervals = hl.Table.parallelize(
            hl.map(lambda i: hl.struct(interval=i), calling_intervals),
            schema=hl.tstruct(interval=calling_intervals.dtype.element_type),
            key='interval')
    else:
        key_dtype = calling_intervals.key.dtype
        if (len(key_dtype) != 1
                or not isinstance(calling_intervals.key[0].dtype, hl.tinterval)
                or calling_intervals.key[0].dtype.point_type != vds.reference_data.locus.dtype):
            raise ValueError(
                "'impute_sex_chromosome_ploidy': expected calling_intervals to be a list of intervals"
                f" or a table with a single key of type interval<locus>, found table with key: {key_dtype}"
            )

    rg = vds.reference_data.locus.dtype.reference_genome

    par_boundaries = []
    for par_interval in rg.par:
        par_boundaries.append(par_interval.start)
        par_boundaries.append(par_interval.end)

    # segment on PAR interval boundaries
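    # (e.g. with GRCh38, a calling interval crossing the chrX PAR1 boundary at
    # chrX:2,781,479 is split there, so the PAR portion can be removed by the
    # filter below while the non-PAR remainder is kept)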
    calling_intervals = hl.segment_intervals(calling_intervals, par_boundaries)

    # remove intervals overlapping PAR
    calling_intervals = calling_intervals.filter(
        hl.all(lambda x: ~x.overlaps(calling_intervals.interval),
               hl.literal(rg.par)))

    # checkpoint for efficient multiple downstream usages
    info("'impute_sex_chromosome_ploidy': checkpointing calling intervals")
    calling_intervals = calling_intervals.checkpoint(
        new_temp_file(extension='ht'))

    interval = calling_intervals.key[0]
    (any_bad_intervals, chrs_represented) = calling_intervals.aggregate(
        (hl.agg.any(interval.start.contig != interval.end.contig),
         hl.agg.collect_as_set(interval.start.contig)))
    if any_bad_intervals:
        raise ValueError(
            "'impute_sex_chromosome_ploidy' does not support calling intervals that span chromosome boundaries"
        )

    if len(rg.x_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple X contigs, this is not supported in 'impute_sex_chromosome_ploidy'"
        )
    chr_x = rg.x_contigs[0]
    if len(rg.y_contigs) != 1:
        raise NotImplementedError(
            f"reference genome {rg.name!r} has multiple Y contigs, this is not supported in 'impute_sex_chromosome_ploidy'"
        )
    chr_y = rg.y_contigs[0]

    kept_contig_filter = hl.array(chrs_represented).map(
        lambda x: hl.parse_locus_interval(x, reference_genome=rg))
    vds = VariantDataset(
        hl.filter_intervals(vds.reference_data, kept_contig_filter),
        hl.filter_intervals(vds.variant_data, kept_contig_filter))

    coverage = interval_coverage(vds, calling_intervals,
                                 gq_thresholds=()).drop('gq_thresholds')

    coverage = coverage.annotate_rows(contig=coverage.interval.start.contig)
    coverage = coverage.annotate_cols(__mean_dp=hl.agg.group_by(
        coverage.contig,
        hl.agg.sum(coverage.sum_dp) / hl.agg.sum(coverage.interval_size)))

    mean_dp_dict = coverage.__mean_dp
    auto_dp = mean_dp_dict.get(normalization_contig)
    x_dp = mean_dp_dict.get(chr_x)
    y_dp = mean_dp_dict.get(chr_y)
    per_sample = coverage.transmute_cols(autosomal_mean_dp=auto_dp,
                                         x_mean_dp=x_dp,
                                         x_ploidy=2 * x_dp / auto_dp,
                                         y_mean_dp=y_dp,
                                         y_ploidy=2 * y_dp / auto_dp)
    info(
        "'impute_sex_chromosome_ploidy': computing and checkpointing coverage and karyotype metrics"
    )
    return per_sample.cols().checkpoint(
        new_temp_file('impute_sex_karyotype', extension='ht'))
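
A minimal usage sketch for the function above (the VDS path and the contig
list here are hypothetical, and a GRCh38 dataset is assumed; exome data would
use trimmed capture intervals rather than whole contigs):

    import hail as hl

    vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')  # hypothetical path
    # pass calling intervals as an array expression, as the docstring permits
    calling_intervals = hl.literal(hl.eval(
        [hl.parse_locus_interval(c, reference_genome='GRCh38')
         for c in ['chr20', 'chrX', 'chrY']]))
    ploidy = impute_sex_chromosome_ploidy(vds, calling_intervals,
                                          normalization_contig='chr20')
    ploidy.show()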