def generate_5_sample_vds():
    paths = [
        os.path.join(resource('gvcfs'), '1kg_chr22', path)
        for path in [
            'HG00187.hg38.g.vcf.gz',
            'HG00190.hg38.g.vcf.gz',
            'HG00308.hg38.g.vcf.gz',
            'HG00313.hg38.g.vcf.gz',
            'HG00320.hg38.g.vcf.gz'
        ]
    ]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr22', 1, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr22',
                                                 hl.get_reference('GRCh38').contig_length('chr22') - 1,
                                                 reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000)
    vds = hl.vds.combiner.combine_variant_datasets(
        [hl.vds.combiner.transform_gvcf(mt, to_keep) for mt in vcfs])
    vds.variant_data = vds.variant_data._key_rows_by_assert_sorted('locus', 'alleles')
    vds.write(os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'), overwrite=True)
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    x_contigs = wrap_to_list(x_contigs)
    y_contigs = wrap_to_list(y_contigs)
    mt_contigs = wrap_to_list(mt_contigs)

    self._config = {
        'name': name,
        'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': [{'start': {'contig': c, 'position': s},
                 'end': {'contig': c, 'position': e}}
                for (c, s, e) in par]
    }

    self._contigs = contigs
    self._lengths = lengths
    self._par_tuple = par
    self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self)) for (c, s, e) in par]
    self._global_positions = None

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)
        hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()
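# A minimal usage sketch (not taken from the source above) of constructing a custom
# reference genome through this constructor's public entry point, hl.ReferenceGenome.
# The name, contig lengths, and PAR tuple below are illustrative values only; they
# mirror the expectations checked by assert_rg_loaded_correctly further down.
custom_rg = hl.ReferenceGenome(
    name="my_custom_rg",               # hypothetical name
    contigs=["1", "X", "Y", "MT"],
    lengths={"1": 5, "X": 4, "Y": 3, "MT": 2},
    x_contigs=["X"],
    y_contigs=["Y"],
    mt_contigs=["MT"],
    par=[("X", 2, 4)])                 # one pseudoautosomal interval, X:2-4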
def test_filter_intervals_compound_partition_key(self):
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    ds = (ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
          .key_rows_by('locus', 'alleles'))

    intervals = [hl.Interval(hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T']),
                             hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T']))]
    self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), 3)
def locus_interval(start, end):
    return hl.Interval(start=hl.Locus(contig=contig,
                                      position=start,
                                      reference_genome=reference_genome),
                       end=hl.Locus(contig=contig,
                                    position=end,
                                    reference_genome=reference_genome),
                       includes_end=True)
def assert_rg_loaded_correctly(name):
    rg = hl.get_reference(name)
    self.assertEqual(rg.contigs, ["1", "X", "Y", "MT"])
    self.assertEqual(rg.lengths, {"1": 5, "X": 4, "Y": 3, "MT": 2})
    self.assertEqual(rg.x_contigs, ["X"])
    self.assertEqual(rg.y_contigs, ["Y"])
    self.assertEqual(rg.mt_contigs, ["MT"])
    self.assertEqual(rg.par, [hl.Interval(start=hl.Locus("X", 2, name),
                                          end=hl.Locus("X", 4, name))])
def test_gvcfs(spark, tmp_path):
    # GVCF MatrixTables are not keyed by locus and alleles, just by locus
    input_vcf = 'test-data/tabix-test-vcf/combined.chr20_18210071_18210093.g.vcf.gz'
    partitions = [
        hl.Interval(hl.Locus("chr20", 1, reference_genome='GRCh38'),
                    hl.Locus("chr20", 20000000, reference_genome='GRCh38'),
                    includes_end=True)
    ]
    hail_df = functions.from_matrix_table(
        hl.import_gvcfs([input_vcf], partitions, force_bgz=True, reference_genome='GRCh38')[0])
    _assert_lossless_adapter(spark, tmp_path, hail_df, input_vcf, 'vcf', 'bigvcf')
def default_exome_intervals(reference_genome) -> List[hl.utils.Interval]:
    """Create a list of locus intervals suitable for importing and merging exome
    GVCFs. Because exomes are small, one partition per chromosome works well here.

    Parameters
    ----------
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use. NOTE: only GRCh37 and GRCh38 references
        are supported.

    Returns
    -------
    :obj:`List[Interval]`
    """
    if reference_genome.name == 'GRCh37':
        contigs = [f'{i}' for i in range(1, 23)] + ['X', 'Y', 'MT']
    elif reference_genome.name == 'GRCh38':
        contigs = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    else:
        raise ValueError(
            f"Invalid reference genome '{reference_genome.name}', only 'GRCh37' and 'GRCh38' are supported"
        )
    return [hl.Interval(start=hl.Locus(contig=contig, position=1, reference_genome=reference_genome),
                        end=hl.Locus.parse(f'{contig}:END', reference_genome=reference_genome),
                        includes_end=True)
            for contig in contigs]
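# A brief usage sketch for the helper above; the variable names and the downstream
# import call in the comment are illustrative assumptions, not part of the source.
rg38 = hl.get_reference('GRCh38')
exome_parts = default_exome_intervals(rg38)  # 25 intervals, one per contig
# These intervals could then serve as the partitioning for a GVCF import, e.g.:
# mts = hl.import_gvcfs(gvcf_paths, exome_parts, reference_genome='GRCh38')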
def test_vcf_vds_combiner_equivalence():
    import hail.experimental.vcf_combiner.vcf_combiner as vcf
    import hail.vds.combiner as vds
    _paths = ['gvcfs/HG00096.g.vcf.gz',
              'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = [mt.annotate_rows(info=mt.info.annotate(MQ_DP=hl.missing(hl.tint32),
                                                   VarDP=hl.missing(hl.tint32),
                                                   QUALapprox=hl.missing(hl.tint32)))
            for mt in hl.import_gvcfs(paths, parts, reference_genome='GRCh38',
                                      array_elements_required=False)]
    entry_to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vds = vds.combine_variant_datasets(
        [vds.transform_gvcf(mt, reference_entry_fields_to_keep=entry_to_keep) for mt in vcfs])
    smt = vcf.combine_gvcfs([vcf.transform_gvcf(mt) for mt in vcfs])
    smt_from_vds = hl.vds.to_merged_sparse_mt(vds).drop('RGQ')
    smt = smt.select_entries(*smt_from_vds.entry)  # harmonize fields and order
    smt = smt.key_rows_by('locus', 'alleles')
    assert smt._same(smt_from_vds)
def test_multiple_files_variant_filtering(self):
    bgen_file = [resource('random-b.bgen'), resource('random-c.bgen'), resource('random-a.bgen')]
    hl.index_bgen(bgen_file)

    alleles = ['A', 'G']

    desired_variants = [
        hl.Struct(locus=hl.Locus('20', 11), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 13), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 29), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 28), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 1), alleles=alleles),
        hl.Struct(locus=hl.Locus('20', 12), alleles=alleles),
    ]

    actual = hl.import_bgen(bgen_file, ['GT'], n_partitions=10, variants=desired_variants)
    self.assertEqual(actual.count_rows(), 6)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (30, 10))

    expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

    self.assertTrue(expected._same(actual))
def test_combiner_works():
    _paths = ['gvcfs/HG00096.g.vcf.gz',
              'gvcfs/HG00268.g.vcf.gz']
    paths = [resource(p) for p in _paths]
    parts = [
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 17821257, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 18708366, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 18708367, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 19776611, reference_genome='GRCh38')),
                    includes_end=True),
        hl.Interval(start=hl.Struct(locus=hl.Locus('chr20', 19776612, reference_genome='GRCh38')),
                    end=hl.Struct(locus=hl.Locus('chr20', 21144633, reference_genome='GRCh38')),
                    includes_end=True)
    ]
    vcfs = hl.import_gvcfs(paths, parts, reference_genome='GRCh38', array_elements_required=False)
    entry_to_keep = defined_entry_fields(vcfs[0].filter_rows(hl.is_defined(vcfs[0].info.END)), 100_000) - {'GT', 'PGT', 'PL'}
    vcfs = [transform_gvcf(mt.annotate_rows(info=mt.info.annotate(MQ_DP=hl.missing(hl.tint32),
                                                                  VarDP=hl.missing(hl.tint32),
                                                                  QUALapprox=hl.missing(hl.tint32))),
                           reference_entry_fields_to_keep=entry_to_keep)
            for mt in vcfs]
    comb = combine_variant_datasets(vcfs)
    assert len(parts) == comb.variant_data.n_partitions()
    comb.variant_data._force_count_rows()
    comb.reference_data._force_count_rows()
def test_summarize_variants(self):
    mt = hl.utils.range_matrix_table(3, 3)
    variants = hl.literal({0: hl.Struct(locus=hl.Locus('1', 1), alleles=['A', 'T', 'C']),
                           1: hl.Struct(locus=hl.Locus('2', 1), alleles=['A', 'AT', '@']),
                           2: hl.Struct(locus=hl.Locus('2', 1), alleles=['AC', 'GT'])})
    mt = mt.annotate_rows(**variants[mt.row_idx]).key_rows_by('locus', 'alleles')
    r = hl.summarize_variants(mt, show=False)
    self.assertEqual(r.n_variants, 3)
    self.assertEqual(r.contigs, {'1': 1, '2': 2})
    self.assertEqual(r.allele_types, {'SNP': 2, 'MNP': 1, 'Unknown': 1, 'Insertion': 1})
    self.assertEqual(r.allele_counts, {2: 1, 3: 2})
def test_import_vcf_missing_format_field_elements(self):
    mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows().select_entries('AD', 'PL')

    expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      's': 'C1046::HG02024', 'AD': [None, None], 'PL': [0, None, 180]},
                                     {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      's': 'C1046::HG02025', 'AD': [None, 6], 'PL': [70, None]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      's': 'C1046::HG02024', 'AD': [0, 0, None],
                                      'PL': [396, None, None, 33, None, 0]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      's': 'C1046::HG02025', 'AD': [0, 0, 9], 'PL': [None, None, None]}],
                                    hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), s=hl.tstr,
                                               AD=hl.tarray(hl.tint), PL=hl.tarray(hl.tint)),
                                    key=['locus', 'alleles', 's'])

    self.assertTrue(mt.entries()._same(expected))
def test_import_vcf_missing_info_field_elements(self):
    mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
    expected = hl.Table.parallelize([{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                                      'FOO': [1, None], 'BAR': [2, None, None]},
                                     {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
                                      'FOO': [None, 2, None], 'BAR': [None, 1.0, None]}],
                                    hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
                                               FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64)),
                                    key=['locus', 'alleles'])
    self.assertTrue(mt.rows()._same(expected))
def calculate_new_intervals(ht, n, reference_genome):
    """Takes a table keyed by ['locus', ...] and produces a list of intervals
    suitable for repartitioning a combiner matrix table.

    Parameters
    ----------
    ht : :class:`.Table`
        Table / rows table to compute new intervals for.
    n : :obj:`int`
        Number of rows each partition should have (the last partition may be smaller).
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()

    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(hl.locus_from_global_position(ht.locus.global_position() + 1,
                                                            reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    last_st = hl.eval(
        hl.locus_from_global_position(hl.literal(intervals[-1].end).global_position() + 1,
                                      reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
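# A brief usage sketch for the helper above, with illustrative values only.
# `combined_mt` is a hypothetical combiner output keyed by ['locus', 'alleles'];
# the target of 50,000 rows per partition is likewise an assumed example value.
rows = combined_mt.rows().key_by('locus')  # the helper asserts a ['locus'] key
new_intervals = calculate_new_intervals(rows, 50_000, hl.get_reference('GRCh38'))
# The intervals could then drive a repartitioned re-read of the combiner output.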
def values(self):
    values = [(hl.tbool, True),
              (hl.tint32, 0),
              (hl.tint64, 0),
              (hl.tfloat32, 0.5),
              (hl.tfloat64, 0.5),
              (hl.tstr, "foo"),
              (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
              (hl.tarray(hl.tint32), [0, 1, 4]),
              (hl.tset(hl.tint32), {0, 1, 4}),
              (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
              (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
              (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
              (hl.tcall, hl.Call([0, 1]))]
    return values
def test_json_encoder(self):
    self.assertEqual(
        json.dumps(frozendict({"foo": "bar"}), cls=hl.utils.JSONEncoder),
        '{"foo": "bar"}')
    self.assertEqual(
        json.dumps(Struct(foo="bar"), cls=hl.utils.JSONEncoder),
        '{"foo": "bar"}')
    self.assertEqual(
        json.dumps(Interval(start=1, end=10), cls=hl.utils.JSONEncoder),
        '{"start": 1, "end": 10, "includes_start": true, "includes_end": false}')
    self.assertEqual(
        json.dumps(hl.Locus(1, 100, "GRCh38"), cls=hl.utils.JSONEncoder),
        '{"contig": "1", "position": 100, "reference_genome": "GRCh38"}')
def test_impute_sex_chromosome_ploidy():
    x_par_end = 2699521
    y_par_end = 2649521
    rg = hl.get_reference('GRCh37')
    ref_blocks = [
        hl.Struct(s='sample_xx', ref_allele='A', locus=hl.Locus('22', 1000000, rg), END=2000000, GQ=15, DP=5),
        hl.Struct(s='sample_xx', ref_allele='A', locus=hl.Locus('X', x_par_end - 10, rg), END=x_par_end + 9, GQ=18, DP=6),
        hl.Struct(s='sample_xx', ref_allele='A', locus=hl.Locus('X', x_par_end + 10, rg), END=x_par_end + 29, GQ=15, DP=5),
        hl.Struct(s='sample_xy', ref_allele='A', locus=hl.Locus('22', 1000000, rg), END=2000000, GQ=15, DP=5),
        hl.Struct(s='sample_xy', ref_allele='A', locus=hl.Locus('X', x_par_end - 10, rg), END=x_par_end + 9, GQ=9, DP=3),
        hl.Struct(s='sample_xy', ref_allele='A', locus=hl.Locus('X', x_par_end + 10, rg), END=x_par_end + 29, GQ=6, DP=2),
        hl.Struct(s='sample_xy', ref_allele='A', locus=hl.Locus('Y', y_par_end - 10, rg), END=y_par_end + 9, GQ=12, DP=4),
        hl.Struct(s='sample_xy', ref_allele='A', locus=hl.Locus('Y', y_par_end + 10, rg), END=y_par_end + 29, GQ=9, DP=3),
    ]

    ref_mt = hl.Table.parallelize(
        ref_blocks,
        schema=hl.dtype('struct{s:str,locus:locus<GRCh37>,ref_allele:str,END:int32,GQ:int32,DP:int32}')) \
        .to_matrix_table(row_key=['locus'], row_fields=['ref_allele'], col_key=['s'])
    var_mt = hl.Table.parallelize(
        [],
        schema=hl.dtype('struct{locus:locus<GRCh37>,alleles:array<str>,s:str,LA:array<int32>,LGT:call,GQ:int32}')) \
        .to_matrix_table(row_key=['locus', 'alleles'], col_key=['s'])

    vds = hl.vds.VariantDataset(ref_mt, var_mt)

    calling_intervals = [
        hl.parse_locus_interval('22:1000010-1000020', reference_genome='GRCh37'),
        hl.parse_locus_interval(f'X:{x_par_end}-{x_par_end + 20}', reference_genome='GRCh37'),
        hl.parse_locus_interval(f'Y:{y_par_end}-{y_par_end + 20}', reference_genome='GRCh37'),
    ]

    r = hl.vds.impute_sex_chromosome_ploidy(vds, calling_intervals, normalization_contig='22')

    assert r.collect() == [
        hl.Struct(s='sample_xx', autosomal_mean_dp=5.0, x_mean_dp=5.5, x_ploidy=2.2, y_mean_dp=0.0, y_ploidy=0.0),
        hl.Struct(s='sample_xy', autosomal_mean_dp=5.0, x_mean_dp=2.5, x_ploidy=1.0, y_mean_dp=3.5, y_ploidy=1.4)
    ]
def test_import_bgen_variant_filtering_from_literals(self):
    bgen_file = resource('example.8bits.bgen')

    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    alleles = ['A', 'G']

    desired_variants = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    expected_result = [
        hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),  # duplicated variant
        hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
    ]

    part_1 = hl.import_bgen(bgen_file,
                            ['GT'],
                            n_partitions=1,  # forcing seek to be called
                            variants=desired_variants)
    self.assertTrue(part_1.rows().key_by('locus', 'alleles').select().collect() == expected_result)

    part_199 = hl.import_bgen(bgen_file,
                              ['GT'],
                              # forcing each variant to be its own partition for testing duplicates work properly
                              n_partitions=199,
                              variants=desired_variants)
    self.assertTrue(part_199.rows().key_by('locus', 'alleles').select().collect() == expected_result)

    everything = hl.import_bgen(bgen_file, ['GT'])
    self.assertEqual(everything.count(), (199, 500))

    expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

    self.assertTrue(expected._same(part_1))
def test_mendel_errors(self):
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))

    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               s=hl.tstr))
    self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               s=hl.tstr,
                                               fam_id=hl.tstr,
                                               mendel_code=hl.tint))
    self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                               mat_id=hl.tstr))
    self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                               mat_id=hl.tstr,
                                               fam_id=hl.tstr,
                                               children=hl.tint,
                                               errors=hl.tint64,
                                               snp_errors=hl.tint64))
    self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
    self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                               fam_id=hl.tstr,
                                               errors=hl.tint64,
                                               snp_errors=hl.tint64))
    self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr)))
    self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                               alleles=hl.tarray(hl.tstr),
                                               errors=hl.tint64))

    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    self.assertEqual(set(fam.select('errors', 'snp_errors').collect()),
                     {
                         hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', errors=41, snp_errors=39),
                         hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', errors=0, snp_errors=0)
                     })

    self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                     {
                         hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                         hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                         hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                         hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                         hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                         hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                         hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                     })

    to_keep = hl.set([
        (hl.Locus("1", 1), ['C', 'CT']),
        (hl.Locus("1", 2), ['C', 'T']),
        (hl.Locus("X", 1), ['C', 'T']),
        (hl.Locus("X", 3), ['C', 'T']),
        (hl.Locus("Y", 1), ['C', 'T']),
        (hl.Locus("Y", 3), ['C', 'T'])
    ])
    self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                     .order_by('locus')
                     .select('locus', 'alleles', 'errors').collect(),
                     [
                         hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                         hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                         hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                         hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                     ])

    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

    self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
def test_concordance_n_discordant(self):
    dataset = get_dataset()
    _, cols_conc, rows_conc = hl.concordance(dataset, dataset)
    assert cols_conc.aggregate(hl.agg.count_where(cols_conc.n_discordant != 0)) == 0

    rows1 = [
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='1', GT=hl.Call([0, 0])),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='2', GT=hl.Call([0, 0])),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='3', GT=hl.Call([1, 1])),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='4', GT=hl.Call([1, 1])),
        hl.Struct(locus=hl.Locus('1', 101), alleles=['A', 'T'], s='1', GT=hl.Call([1, 1])),
    ]

    rows2 = [
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='1', GT=None),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='2', GT=hl.Call([0, 1])),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='3', GT=hl.Call([0, 1])),
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'], s='4', GT=hl.Call([1, 1])),
    ]

    def make_mt(rows):
        ht = hl.Table.parallelize(rows, schema='struct{locus:locus<GRCh37>,alleles:array<str>,s:str,GT:call}')
        return ht.to_matrix_table(row_key=['locus', 'alleles'], col_key=['s'])

    global_conc_2, cols_conc_2, rows_conc_2 = hl.concordance(make_mt(rows1), make_mt(rows2))

    assert cols_conc_2.collect() == [
        hl.Struct(s='1',
                  concordance=[[0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 1, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
        hl.Struct(s='2',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0]],
                  n_discordant=1),
        hl.Struct(s='3',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 0]],
                  n_discordant=1),
        hl.Struct(s='4',
                  concordance=[[1, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 1]],
                  n_discordant=0),
    ]

    assert global_conc_2 == [[3, 0, 0, 0, 0],
                             [0, 0, 0, 0, 0],
                             [0, 1, 0, 1, 0],
                             [0, 0, 0, 0, 0],
                             [1, 0, 0, 1, 1]]

    assert rows_conc_2.collect() == [
        hl.Struct(locus=hl.Locus('1', 100), alleles=['A', 'T'],
                  concordance=[[0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 1, 0, 1, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 1, 1]],
                  n_discordant=2),
        hl.Struct(locus=hl.Locus('1', 101), alleles=['A', 'T'],
                  concordance=[[3, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [0, 0, 0, 0, 0],
                               [1, 0, 0, 0, 0]],
                  n_discordant=0),
    ]
def test_locus_windows(self):
    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(
        mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
    self.assertTrue('ascending order' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
    self.assertTrue('different source' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
    self.assertTrue("no source" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
    self.assertTrue("no source" in str(cm.exception))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.x, 1.0)
    self.assertTrue("row-indexed" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
    self.assertTrue("row-indexed" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'coord_expr'" in str(cm.exception))