def _parameterized_filter_intervals(vds: 'VariantDataset', intervals, keep: bool, mode: str) -> 'VariantDataset':
    """Filter a VariantDataset to (or excluding) a set of intervals.

    ``intervals`` may be a Table keyed by a single locus-interval field, or an
    array of intervals.  ``mode`` selects the strategy: 'variants_only' leaves
    reference data untouched; 'split_at_boundaries' segments reference blocks
    at interval boundaries (keep=True only); any other value filters both
    components directly.
    """
    intervals_table = None
    if isinstance(intervals, Table):
        # The table must be keyed by exactly one interval<locus<rg>> field.
        expected = hl.tinterval(hl.tlocus(vds.reference_genome))
        if len(intervals.key) != 1 or intervals.key[0].dtype != hl.tinterval(
                hl.tlocus(vds.reference_genome)):
            raise ValueError(
                f"'filter_intervals': expect a table with a single key of type {expected}; "
                f"found {list(intervals.key.dtype.values())}")
        intervals_table = intervals
        # Collect the key intervals into a local value for hl.filter_intervals.
        intervals = intervals.aggregate(hl.agg.collect(intervals.key[0]))

    if mode == 'variants_only':
        variant_data = hl.filter_intervals(vds.variant_data, intervals, keep)
        return VariantDataset(vds.reference_data, variant_data)
    if mode == 'split_at_boundaries':
        if not keep:
            raise ValueError(
                "filter_intervals mode 'split_at_boundaries' not implemented for keep=False"
            )
        # Reuse the caller's table when one was provided; otherwise build one
        # from the collected intervals so reference blocks can be segmented.
        par_intervals = intervals_table or hl.Table.parallelize(
            intervals.map(lambda x: hl.struct(interval=x)),
            schema=hl.tstruct(interval=intervals.dtype.element_type),
            key='interval')
        ref = segment_reference_blocks(vds.reference_data, par_intervals).drop(
            'interval_end',
            list(par_intervals.key)[0])
        return VariantDataset(
            ref, hl.filter_intervals(vds.variant_data, intervals, keep))

    # Default: filter reference and variant components identically.
    return VariantDataset(
        hl.filter_intervals(vds.reference_data, intervals, keep),
        hl.filter_intervals(vds.variant_data, intervals, keep))
def validate_variant_results_table(ds):
    """Assert that ``ds`` conforms to the variant-results table schema.

    Checks the (locus, alleles) key, the required string annotation fields,
    and that every field of the 'group_results' struct values and of 'info'
    uses a type from ALLOWED_RESULT_TYPES.  Raises AssertionError on the
    first violation.
    """
    # Key must be exactly (locus, alleles) with standard types.
    assert ds.key.dtype.fields == ("locus", "alleles"), "Table must be keyed by locus and alleles"
    assert ds.locus.dtype in (hl.tlocus("GRCh37"), hl.tlocus("GRCh38")), "'locus' must be a locus type"
    assert ds.alleles.dtype == hl.tarray(hl.tstr), "'alleles' must be an array of strings"

    # Required top-level string annotations.
    required_fields = {
        "gene_id": hl.tstr,
        "consequence": hl.tstr,
        "hgvsc": hl.tstr,
        "hgvsp": hl.tstr,
    }
    for field, typ in required_fields.items():
        assert field in ds.row_value.dtype.fields, f"Missing required field '{field}'"
        assert ds[field].dtype == typ, f"{field} should be type {typ}"

    # group_results: dict<str, struct> whose struct fields are allowed types.
    assert "group_results" in ds.row_value.dtype.fields, "Table must have a 'group_results' field"
    assert isinstance(ds.group_results.dtype, hl.tdict), "'group_results' must be a dict"
    assert ds.group_results.dtype.key_type == hl.tstr, "'group_results' keys must be strings"
    assert isinstance(ds.group_results.dtype.value_type, hl.tstruct), "'group_results' value must be a struct"
    for typ in ds.group_results.dtype.value_type.types:
        assert (
            typ in ALLOWED_RESULT_TYPES
        ), f"'group_results' fields may only be one of {', '.join(map(str, ALLOWED_RESULT_TYPES))}"

    # info: struct whose fields are allowed types.
    assert isinstance(ds.info.dtype, hl.tstruct), "'info' must be a struct"
    for typ in ds.info.dtype.types:
        assert (
            typ in ALLOWED_RESULT_TYPES
        ), f"'info' fields may only be one of {', '.join(map(str, ALLOWED_RESULT_TYPES))}"
def impute_sex_aggregator(call,
                          aaf,
                          aaf_threshold=0.0,
                          include_par=False,
                          female_threshold=0.4,
                          male_threshold=0.8) -> hl.Table:
    """:func:`.impute_sex` as an aggregator.

    Aggregates the inbreeding coefficient over X-contig sites and classifies
    samples: F below ``female_threshold`` -> female (True), F above
    ``male_threshold`` -> male (False), otherwise missing.  Sites in
    pseudoautosomal regions (unless ``include_par``) and sites outside the
    open AAF window (``aaf_threshold``, 1 - ``aaf_threshold``) are excluded.
    """
    mt = call._indices.source
    rg = mt.locus.dtype.reference_genome
    # Pre-evaluate the X-contig intervals as a literal for fast containment.
    x_contigs = hl.literal(
        hl.eval(
            hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg),
                   rg.x_contigs)))
    inbreeding = hl.agg.inbreeding(call, aaf)
    # BUG FIX: the original used hl.is_missing('tbool'), which tests the
    # string literal 'tbool' for missingness and always yields False.  The
    # intent is a *missing* boolean when F is between the two thresholds.
    is_female = hl.if_else(
        inbreeding.f_stat < female_threshold, True,
        hl.if_else(inbreeding.f_stat > male_threshold, False,
                   hl.null(hl.tbool)))
    expression = hl.struct(is_female=is_female, **inbreeding)
    if not include_par:
        # Exclude sites that fall inside a pseudoautosomal region.
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        par_intervals = hl.literal(rg.par, interval_type)
        expression = hl.agg.filter(
            ~par_intervals.any(
                lambda par_interval: par_interval.contains(mt.locus)),
            expression)
    # Restrict to the AAF window and to loci on an X contig.
    expression = hl.agg.filter(
        (aaf > aaf_threshold) & (aaf < (1 - aaf_threshold)), expression)
    expression = hl.agg.filter(
        x_contigs.any(lambda contig: contig.contains(mt.locus)), expression)

    return expression
def test_import_bgen(self):
    """Import BGEN v1.1 and v1.2 (8- and 10-bit) files and check schemas."""
    hl.index_bgen(resource('example.v11.bgen'))
    v11_rows = hl.import_bgen(resource('example.v11.bgen'),
                              entry_fields=['GT', 'GP'],
                              sample_file=resource('example.sample'),
                              contig_recoding={'01': '1'},
                              reference_genome='GRCh37').rows()
    self.assertTrue(v11_rows.all(v11_rows.locus.contig == '1'))
    self.assertEqual(v11_rows.count(), 199)

    hl.index_bgen(resource('example.8bits.bgen'))
    dosage_only = hl.import_bgen(resource('example.8bits.bgen'),
                                 entry_fields=['dosage'],
                                 contig_recoding={'01': '1'},
                                 reference_genome='GRCh37')
    self.assertEqual(dosage_only.entry.dtype, hl.tstruct(dosage=hl.tfloat64))

    gt_gp = hl.import_bgen(resource('example.8bits.bgen'),
                           entry_fields=['GT', 'GP'],
                           sample_file=resource('example.sample'),
                           contig_recoding={'01': '1'},
                           reference_genome='GRCh37')
    self.assertEqual(gt_gp.entry.dtype,
                     hl.tstruct(GT=hl.tcall, GP=hl.tarray(hl.tfloat64)))
    self.assertEqual(gt_gp.count_rows(), 199)

    hl.index_bgen(resource('example.10bits.bgen'))
    all_fields = hl.import_bgen(resource('example.10bits.bgen'),
                                entry_fields=['GT', 'GP', 'dosage'],
                                contig_recoding={'01': '1'},
                                reference_genome='GRCh37')
    self.assertEqual(
        all_fields.entry.dtype,
        hl.tstruct(GT=hl.tcall,
                   GP=hl.tarray(hl.tfloat64),
                   dosage=hl.tfloat64))
    self.assertEqual(all_fields.locus.dtype, hl.tlocus('GRCh37'))
def test_import_gen(self):
    """Importing a GEN file recodes contigs and yields GRCh37 loci."""
    gen_rows = hl.import_gen(resource('example.gen'),
                             sample_file=resource('example.sample'),
                             contig_recoding={"01": "1"},
                             reference_genome='GRCh37').rows()
    self.assertTrue(gen_rows.all(gen_rows.locus.contig == "1"))
    self.assertEqual(gen_rows.count(), 199)
    self.assertEqual(gen_rows.locus.dtype, hl.tlocus('GRCh37'))
def test_import_vcf(self):
    """Import a VCF with contig recoding onto GRCh38 and verify the result."""
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))
    vcf_table = vcf.rows()
    self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
    # BUG FIX: the original called assertTrue(dtype, expected), which treats
    # the second argument as a failure message and always passes; it also
    # named the wrong genome — the import above uses GRCh38, not GRCh37.
    self.assertEqual(vcf.locus.dtype, hl.tlocus('GRCh38'))
def test_import_bgen_row_fields(self):
    """_row_fields controls which of rsid/varid appear in the row schema."""
    bgen_path = resource('example.8bits.bgen')
    common_args = dict(entry_fields=['dosage'],
                       contig_recoding={'01': '1'},
                       reference_genome='GRCh37')

    default_row_fields = hl.import_bgen(bgen_path, **common_args)
    self.assertEqual(
        default_row_fields.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'),
                   alleles=hl.tarray(hl.tstr),
                   rsid=hl.tstr,
                   varid=hl.tstr))

    no_row_fields = hl.import_bgen(bgen_path, _row_fields=[], **common_args)
    self.assertEqual(
        no_row_fields.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr)))

    varid_only = hl.import_bgen(bgen_path, _row_fields=['varid'],
                                **common_args)
    self.assertEqual(
        varid_only.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'),
                   alleles=hl.tarray(hl.tstr),
                   varid=hl.tstr))

    rsid_only = hl.import_bgen(bgen_path, _row_fields=['rsid'], **common_args)
    self.assertEqual(
        rsid_only.row.dtype,
        hl.tstruct(locus=hl.tlocus('GRCh37'),
                   alleles=hl.tarray(hl.tstr),
                   rsid=hl.tstr))

    # Dropping fields from the default import must match the subset imports.
    self.assertTrue(default_row_fields.drop('varid')._same(rsid_only))
    self.assertTrue(default_row_fields.drop('rsid')._same(varid_only))
    self.assertTrue(
        default_row_fields.drop('varid', 'rsid')._same(no_row_fields))
def test_constructors(self):
    """Locus/interval constructors produce the expected field types."""
    rg = hl.ReferenceGenome("foo", ["1"], {"1": 100})
    schema = hl.tstruct(a=hl.tfloat64, b=hl.tfloat64, c=hl.tint32,
                        d=hl.tint32)
    kt = hl.Table.parallelize([{'a': 2.0, 'b': 4.0, 'c': 1, 'd': 5}], schema)
    kt = kt.annotate(d=hl.int64(kt.d))
    kt = kt.annotate(
        l1=hl.parse_locus("1:51"),
        l2=hl.locus("1", 51, reference_genome=rg),
        i1=hl.parse_locus_interval("1:51-56", reference_genome=rg),
        i2=hl.interval(hl.locus("1", 51, reference_genome=rg),
                       hl.locus("1", 56, reference_genome=rg)))
    expected_schema = {
        'a': hl.tfloat64,
        'b': hl.tfloat64,
        'c': hl.tint32,
        'd': hl.tint64,
        'l1': hl.tlocus(),  # parse_locus without rg uses the default genome
        'l2': hl.tlocus(rg),
        'i1': hl.tinterval(hl.tlocus(rg)),
        'i2': hl.tinterval(hl.tlocus(rg)),
    }
    self.assertTrue(
        all(expected_schema[name] == typ
            for name, typ in kt.row.dtype.items()))
def test_import_vcf_missing_info_field_elements(self):
    """Missing INFO array elements import as nulls when not required."""
    mt = hl.import_vcf(resource('missingInfoArray.vcf'),
                       reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
    row_type = hl.tstruct(locus=hl.tlocus('GRCh37'),
                          alleles=hl.tarray(hl.tstr),
                          FOO=hl.tarray(hl.tint),
                          BAR=hl.tarray(hl.tfloat64))
    expected_rows = [
        {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
         'FOO': [1, None], 'BAR': [2, None, None]},
        {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
         'FOO': [None, 2, None], 'BAR': [None, 1.0, None]},
    ]
    expected = hl.Table.parallelize(expected_rows, row_type,
                                    key=['locus', 'alleles'])
    self.assertTrue(mt.rows()._same(expected))
def par(self):
    """Pseudoautosomal regions.

    Lazily converts the JVM-side PAR intervals into Python ``Interval``
    objects on first access and caches the result on ``self._par``.

    Returns
    -------
    :obj:`list` of :class:`.Interval`
    """
    from hail.utils.interval import Interval
    if self._par is None:
        # Each interval's point type is a locus on this reference genome.
        self._par = [Interval._from_java(jrep, hl.tlocus(self))
                     for jrep in self._jrep.par()]
    return self._par
def create_all_values():
    """Build a struct exercising every Hail value type, including missing
    values of each container and scalar type."""
    fields = dict(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    )
    return hl.struct(**fields)
def test_import_locus_intervals(self):
    """Interval count matches the non-blank lines of the source file."""
    interval_file = resource('annotinterall.interval_list')
    ht = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    with open(interval_file) as f:
        expected = sum(1 for line in f if line.strip())
    self.assertEqual(ht.count(), expected)
    self.assertEqual(ht.interval.dtype.point_type, hl.tlocus('GRCh37'))
def calculate_new_intervals(ht, n, reference_genome):
    """takes a table, keyed by ['locus', ...] and produces a list of intervals suitable
    for repartitioning a combiner matrix table

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for
    n : :obj:`int`
        Number of rows each partition should have, (last partition may be smaller)
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    # NOTE(review): .contigs/.lengths are attribute accesses, so despite the
    # docstring this appears to require a ReferenceGenome object rather than
    # a str — confirm with callers.
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)
    n_rows = ht.count()
    if n_rows == 0:
        raise ValueError('empty table!')
    ht = ht.select()
    # x is the 0-based scan index of each row; a row is kept when the
    # partition number (index // n) changes at that row, or it is last.
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    # Each interval starts one global position after the previous boundary
    # locus (or at global position 0 for the first interval) and ends
    # inclusively at its own boundary locus.
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    # Append a final interval covering everything after the last boundary
    # through the end of the last contig.
    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
def create_all_values_datasets():
    """Return a (Table, MatrixTable) pair annotated with all value types."""
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)))

    def add_prefix(s, p):
        # Return a struct with every field of `s` renamed to p + name.
        return hl.struct(**{p + name: s[name] for name in s})

    table = (hl.utils.range_table(5, n_partitions=3)
             .annotate_globals(**add_prefix(all_values, 'global_'))
             .annotate(**all_values)
             .cache())
    matrix = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
              .annotate_globals(**add_prefix(all_values, 'global_'))
              .annotate_rows(**add_prefix(all_values, 'row_'))
              .annotate_cols(**add_prefix(all_values, 'col_'))
              .annotate_entries(**add_prefix(all_values, 'entry_'))
              .cache())
    return table, matrix
def create_all_values_datasets():
    """Create a Table and a MatrixTable carrying one field per Hail type,
    duplicated as global/row/col/entry annotations."""
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5,
                   hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)))

    def with_prefix(struct, prefix):
        # Copy `struct`, renaming each field to prefix + field name.
        return hl.struct(**{prefix + field: struct[field]
                            for field in struct})

    table = hl.utils.range_table(5, n_partitions=3)
    table = table.annotate_globals(**with_prefix(all_values, 'global_'))
    table = table.annotate(**all_values)
    table = table.cache()

    matrix = hl.utils.range_matrix_table(3, 2, n_partitions=2)
    matrix = matrix.annotate_globals(**with_prefix(all_values, 'global_'))
    matrix = matrix.annotate_rows(**with_prefix(all_values, 'row_'))
    matrix = matrix.annotate_cols(**with_prefix(all_values, 'col_'))
    matrix = matrix.annotate_entries(**with_prefix(all_values, 'entry_'))
    matrix = matrix.cache()
    return table, matrix
def test_import_bed(self):
    """BED import keeps one interval per numeric-contig data line."""
    bed_file = resource('example1.bed')
    bed = hl.import_bed(bed_file, reference_genome='GRCh37')
    nbed = bed.count()
    i = 0
    with open(bed_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                try:
                    # Count only lines whose first column is an integer
                    # contig (skips header lines such as 'browser'/'track').
                    int(line.split()[0])
                    i += 1
                except ValueError:
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # SystemExit/KeyboardInterrupt; int() on a non-numeric
                    # token raises exactly ValueError.
                    pass
    self.assertEqual(nbed, i)
    self.assertEqual(bed.interval.dtype.point_type, hl.tlocus('GRCh37'))

    bed_file = resource('example2.bed')
    t = hl.import_bed(bed_file, reference_genome='GRCh37')
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))
    self.assertTrue(list(t.key.dtype) == ['interval'])
    self.assertTrue(list(t.row.dtype) == ['interval', 'target'])
def test_import_bed(self):
    """BED import keeps one interval per numeric-contig data line."""
    bed_file = resource('example1.bed')
    bed = hl.import_bed(bed_file, reference_genome='GRCh37')
    nbed = bed.count()
    i = 0
    with open(bed_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                try:
                    # Only data lines start with an integer contig; header
                    # lines ('browser', 'track', ...) are skipped.
                    int(line.split()[0])
                    i += 1
                except ValueError:
                    # BUG FIX: narrowed the original bare `except:` — int()
                    # raises ValueError on non-numeric tokens, and a bare
                    # except also hides unrelated failures.
                    pass
    self.assertEqual(nbed, i)
    self.assertEqual(bed.interval.dtype.point_type, hl.tlocus('GRCh37'))

    bed_file = resource('example2.bed')
    t = hl.import_bed(bed_file, reference_genome='GRCh37')
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))
    self.assertTrue(list(t.key.dtype) == ['interval'])
    self.assertTrue(list(t.row.dtype) == ['interval', 'target'])
def test_import_bgen_row_fields(self):
    """_row_fields selects which of rsid/varid are present after import."""
    bgen_path = resource('example.8bits.bgen')
    hl.index_bgen(bgen_path,
                  contig_recoding={'01': '1'},
                  reference_genome='GRCh37')

    key_fields = dict(locus=hl.tlocus('GRCh37'),
                      alleles=hl.tarray(hl.tstr))

    default_row_fields = hl.import_bgen(bgen_path, entry_fields=['dosage'])
    self.assertEqual(
        default_row_fields.row.dtype,
        hl.tstruct(**key_fields, rsid=hl.tstr, varid=hl.tstr))

    no_row_fields = hl.import_bgen(bgen_path,
                                   entry_fields=['dosage'],
                                   _row_fields=[])
    self.assertEqual(no_row_fields.row.dtype, hl.tstruct(**key_fields))

    varid_only = hl.import_bgen(bgen_path,
                                entry_fields=['dosage'],
                                _row_fields=['varid'])
    self.assertEqual(varid_only.row.dtype,
                     hl.tstruct(**key_fields, varid=hl.tstr))

    rsid_only = hl.import_bgen(bgen_path,
                               entry_fields=['dosage'],
                               _row_fields=['rsid'])
    self.assertEqual(rsid_only.row.dtype,
                     hl.tstruct(**key_fields, rsid=hl.tstr))

    # Dropping fields from the default import must match the subset imports.
    self.assertTrue(default_row_fields.drop('varid')._same(rsid_only))
    self.assertTrue(default_row_fields.drop('rsid')._same(varid_only))
    self.assertTrue(
        default_row_fields.drop('varid', 'rsid')._same(no_row_fields))
def values(self):
    """Return (type, example value) pairs covering the basic Hail types."""
    fixtures = [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
    return fixtures
def test_reference_genome_liftover(self):
    """Spot-check locus and interval liftover between GRCh37 and GRCh38
    using chr20-only chain files."""
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')
    # Chains must not be loaded yet; add chr20-only chains in both directions.
    self.assertTrue(not grch37.has_liftover('GRCh38')
                    and not grch38.has_liftover('GRCh37'))
    grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                        'GRCh38')
    grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'),
                        'GRCh37')
    assert grch37.has_liftover('GRCh38')
    assert grch38.has_liftover('GRCh37')

    # Lifting GRCh37 -> GRCh38 -> GRCh37 should be the identity here.
    ds = hl.import_vcf(resource('sample.vcf'))
    t = ds.annotate_rows(liftover=hl.liftover(
        hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
    assert t.all(t.locus == t.liftover)

    # Expected per-locus mappings; null where the chain has no target.
    null_locus = hl.null(hl.tlocus('GRCh38'))
    rows = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 60001, 'GRCh37'),
         'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'),
         'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'),
         'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
        {'l37': hl.locus('20', 278691, 'GRCh37'),
         'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'),
         'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'),
         'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
    ]
    schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(
        t.all(hl.cond(hl.is_defined(t.l38),
                      hl.liftover(t.l37, 'GRCh38') == t.l38,
                      hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

    # Six of the rows above have a defined GRCh38 target.
    t = t.filter(hl.is_defined(t.l38))
    self.assertTrue(t.count() == 6)
    t = t.key_by('l38')
    t.count()
    self.assertTrue(list(t.key) == ['l38'])

    # Interval liftover preserves endpoints and inclusivity flags.
    null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    rows = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
         'i38': null_locus_interval},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True,
                                  'GRCh38')}
    ]
    schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)),
                        i38=hl.tinterval(hl.tlocus(grch38)))
    t = hl.Table.parallelize(rows, schema)
    self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))

    # Clean up so later tests see no liftover chains loaded.
    grch37.remove_liftover("GRCh38")
    grch38.remove_liftover("GRCh37")
def man_plot(infile, outfile):
    """Render a Manhattan plot of fastGWA p-values and export it as a PNG.

    Parameters
    ----------
    infile : str
        Tab-delimited results file with a float 'P' column and a GRCh37
        'locus' column.
    outfile : str
        Destination path for the exported PNG.
    """
    mlm = hl.import_table(infile,
                          types={
                              'P': hl.tfloat64,
                              'locus': hl.tlocus(reference_genome='GRCh37')
                          })
    fig = hl.plot.manhattan(mlm.P, title='Man_Plot')
    export_png(fig, filename=outfile)
    # (Removed the dead trailing `pass`; the function returns None.)
def QQ_plot(infile, outfile):
    """Render a Q-Q plot of fastGWA p-values and export it as a PNG.

    Parameters
    ----------
    infile : str
        Tab-delimited results file with a float 'P' column and a GRCh37
        'locus' column.
    outfile : str
        Destination path for the exported PNG.
    """
    mlm = hl.import_table(infile,
                          types={
                              'P': hl.tfloat64,
                              'locus': hl.tlocus(reference_genome='GRCh37')
                          })
    fig = hl.plot.qq(mlm.P, title='QQ_plot')
    export_png(fig, filename=outfile)
    # (Removed the dead trailing `pass`; the function returns None.)
def test_reference_genome_liftover(self):
    """Check locus and interval liftover between GRCh37 and GRCh38."""
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    self.assertTrue(not rg37.has_liftover('GRCh38')
                    and not rg38.has_liftover('GRCh37'))
    rg37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                      'GRCh38')
    rg38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'),
                      'GRCh37')
    self.assertTrue(rg37.has_liftover('GRCh38')
                    and rg38.has_liftover('GRCh37'))

    # Lift GRCh37 loci to GRCh38 and back; the round trip is the identity.
    dataset = hl.import_vcf(resource('sample.vcf'))
    round_trip = dataset.annotate_rows(liftover=hl.liftover(
        hl.liftover(dataset.locus, 'GRCh38'), 'GRCh37')).rows()
    self.assertTrue(round_trip.all(round_trip.locus == round_trip.liftover))

    # Known per-locus mappings; missing where the chain has no target.
    missing38 = hl.null(hl.tlocus('GRCh38'))
    locus_cases = [
        {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': missing38},
        {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': missing38},
        {'l37': hl.locus('20', 60001, 'GRCh37'),
         'l38': hl.locus('chr20', 79360, 'GRCh38')},
        {'l37': hl.locus('20', 278686, 'GRCh37'),
         'l38': hl.locus('chr20', 298045, 'GRCh38')},
        {'l37': hl.locus('20', 278687, 'GRCh37'),
         'l38': hl.locus('chr20', 298046, 'GRCh38')},
        {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': missing38},
        {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': missing38},
        {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': missing38},
        {'l37': hl.locus('20', 278691, 'GRCh37'),
         'l38': hl.locus('chr20', 298047, 'GRCh38')},
        {'l37': hl.locus('20', 37007586, 'GRCh37'),
         'l38': hl.locus('chr12', 32563117, 'GRCh38')},
        {'l37': hl.locus('20', 62965520, 'GRCh37'),
         'l38': hl.locus('chr20', 64334167, 'GRCh38')},
        {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': missing38},
    ]
    locus_schema = hl.tstruct(l37=hl.tlocus(rg37), l38=hl.tlocus(rg38))
    cases = hl.Table.parallelize(locus_cases, locus_schema)
    self.assertTrue(cases.all(
        hl.cond(hl.is_defined(cases.l38),
                hl.liftover(cases.l37, 'GRCh38') == cases.l38,
                hl.is_missing(hl.liftover(cases.l37, 'GRCh38')))))

    cases = cases.filter(hl.is_defined(cases.l38))
    self.assertTrue(cases.count() == 6)
    cases = cases.key_by('l38')
    cases.count()
    self.assertTrue(list(cases.key) == ['l38'])

    # Interval liftover preserves endpoints and inclusivity flags.
    missing_interval38 = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
    interval_cases = [
        {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'),
         'i38': missing_interval38},
        {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
         'i38': hl.locus_interval('chr20', 79360, 101815, True, True,
                                  'GRCh38')},
    ]
    interval_schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(rg37)),
                                 i38=hl.tinterval(hl.tlocus(rg38)))
    icases = hl.Table.parallelize(interval_cases, interval_schema)
    self.assertTrue(
        icases.all(hl.liftover(icases.i37, 'GRCh38') == icases.i38))

    # Clean up so later tests see no liftover chains loaded.
    rg37.remove_liftover("GRCh38")
    rg38.remove_liftover("GRCh37")
def test_import_vcf_missing_format_field_elements(self):
    """Missing FORMAT array elements become nulls, not errors."""
    mt = hl.import_vcf(resource('missingFormatArray.vcf'),
                       reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows().select_entries('AD', 'PL')
    rows = []
    rows.append({'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                 's': 'C1046::HG02024', 'AD': [None, None],
                 'PL': [0, None, 180]})
    rows.append({'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
                 's': 'C1046::HG02025', 'AD': [None, 6], 'PL': [70, None]})
    rows.append({'locus': hl.Locus('X', 16061250),
                 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02024',
                 'AD': [0, 0, None], 'PL': [396, None, None, 33, None, 0]})
    rows.append({'locus': hl.Locus('X', 16061250),
                 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02025',
                 'AD': [0, 0, 9], 'PL': [None, None, None]})
    expected = hl.Table.parallelize(
        rows,
        hl.tstruct(locus=hl.tlocus('GRCh37'),
                   alleles=hl.tarray(hl.tstr),
                   s=hl.tstr,
                   AD=hl.tarray(hl.tint),
                   PL=hl.tarray(hl.tint)),
        key=['locus', 'alleles', 's'])
    self.assertTrue(mt.entries()._same(expected))
def test_import_plink_contig_recoding_w_reference(self):
    """Export a GRCh38 dataset to PLINK and re-import it as GRCh37."""
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))
    hl.export_plink(vcf, '/tmp/sample_plink')

    bfile = '/tmp/sample_plink'
    plink = hl.import_plink(
        bfile + '.bed', bfile + '.bim', bfile + '.fam',
        a2_reference=True,
        contig_recoding={'chr22': '22'},
        reference_genome='GRCh37').rows()
    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # BUG FIX: assertTrue(dtype, expected) treated the expected type as the
    # failure message and always passed; assertEqual compares them.
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def values(self):
    """Return (dtype, sample value) pairs for scalar and container types."""
    scalar_cases = [
        (hl.tbool, True),
        (hl.tint32, 0),
        (hl.tint64, 0),
        (hl.tfloat32, 0.5),
        (hl.tfloat64, 0.5),
        (hl.tstr, "foo"),
    ]
    container_cases = [
        (hl.tstruct(x=hl.tint32), hl.Struct(x=0)),
        (hl.tarray(hl.tint32), [0, 1, 4]),
        (hl.tset(hl.tint32), {0, 1, 4}),
        (hl.tdict(hl.tstr, hl.tint32), {"a": 0, "b": 1, "c": 4}),
        (hl.tinterval(hl.tint32), hl.Interval(0, 1, True, False)),
        (hl.tlocus(hl.default_reference()), hl.Locus("1", 1)),
        (hl.tcall, hl.Call([0, 1])),
    ]
    return scalar_cases + container_cases
def _object_hook(obj):
    """JSON object hook that revives VariantDatasetCombiner instances.

    Dicts without a matching 'name' marker pass through unchanged.
    """
    if 'name' not in obj:
        return obj
    if obj['name'] != VariantDatasetCombiner.__name__:
        return obj
    del obj['name']
    # Rebuild the members that were flattened for JSON serialization.
    obj['vdses'] = [VDSMetadata(*fields) for fields in obj['vdses']]
    genome = hl.get_reference(obj['reference_genome'])
    obj['reference_genome'] = genome
    interval_array_type = hl.tarray(hl.tinterval(hl.tlocus(genome)))
    obj['gvcf_import_intervals'] = interval_array_type._convert_from_json(
        obj['gvcf_import_intervals'])
    return VariantDatasetCombiner(**obj)
def test_import_plink(self):
    """Round-trip a GRCh38 dataset through PLINK export/import as GRCh37."""
    vcf = hl.split_multi_hts(
        hl.import_vcf(resource('sample2.vcf'),
                      reference_genome=hl.get_reference('GRCh38'),
                      contig_recoding={"22": "chr22"}))
    hl.export_plink(vcf, '/tmp/sample_plink')

    bfile = '/tmp/sample_plink'
    plink = hl.import_plink(
        bfile + '.bed', bfile + '.bim', bfile + '.fam',
        a2_reference=True,
        contig_recoding={'chr22': '22'},
        reference_genome='GRCh37').rows()
    self.assertTrue(plink.all(plink.locus.contig == "22"))
    self.assertEqual(vcf.count_rows(), plink.count())
    # BUG FIX: assertTrue(dtype, expected) treated the expected type as the
    # failure message and always passed; assertEqual compares them.
    self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
def test_import_bgen_locus_filtering_from_table(self):
    """Filtering by a variants table returns every matching row (dups too)."""
    bgen_file = resource('example.8bits.bgen')
    hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

    wanted = hl.Table.parallelize([{'locus': hl.Locus('1', 10000)}],
                                  schema=hl.tstruct(locus=hl.tlocus()),
                                  key='locus')
    imported = hl.import_bgen(bgen_file, ['GT'], variants=wanted)
    actual = imported.rows().key_by('locus', 'alleles').select().collect()
    # The source file contains this variant twice, so both copies appear.
    expected = [
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
        hl.Struct(locus=hl.Locus('1', 10000), alleles=['A', 'G']),
    ]
    self.assertTrue(actual == expected)
def test_skat(self):
    """Run linear and logistic SKAT on sample2.vcf annotated with gene
    intervals, per-variant weights, and phenotype/covariate tables."""
    ds2 = hl.import_vcf(resource('sample2.vcf'))

    covariatesSkat = (hl.import_table(resource("skat.cov"), impute=True)
                      .key_by("Sample"))

    phenotypesSkat = (hl.import_table(resource("skat.pheno"),
                                      types={"Pheno": hl.tfloat64},
                                      missing="0")
                      .key_by("Sample"))

    intervalsSkat = (hl.import_locus_intervals(resource("skat.interval_list")))

    weightsSkat = (hl.import_table(resource("skat.weights"),
                                   types={"locus": hl.tlocus(),
                                          "weight": hl.tfloat64})
                   .key_by("locus"))

    ds = hl.split_multi_hts(ds2)
    # Rows: gene interval and per-variant weight; cols: phenotype/covariates.
    ds = ds.annotate_rows(gene=intervalsSkat[ds.locus],
                          weight=weightsSkat[ds.locus].weight)
    ds = ds.annotate_cols(pheno=phenotypesSkat[ds.s].Pheno,
                          cov=covariatesSkat[ds.s])
    # Recode the 1.0/2.0 phenotype encoding to False/True (else missing).
    ds = ds.annotate_cols(pheno=hl.cond(ds.pheno == 1.0,
                                        False,
                                        hl.cond(ds.pheno == 2.0,
                                                True,
                                                hl.null(hl.tbool))))

    # Linear SKAT on alt-allele counts.
    hl.skat(ds,
            key_expr=ds.gene,
            weight_expr=ds.weight,
            y=ds.pheno,
            x=ds.GT.n_alt_alleles(),
            covariates=[ds.cov.Cov1, ds.cov.Cov2],
            logistic=False).count()

    # Logistic SKAT on PL-derived dosages.
    hl.skat(ds,
            key_expr=ds.gene,
            weight_expr=ds.weight,
            y=ds.pheno,
            x=hl.pl_dosage(ds.PL),
            covariates=[ds.cov.Cov1, ds.cov.Cov2],
            logistic=True).count()
def main():
    """Entry point: parse CLI arguments and run the gVCF combiner."""
    parser = argparse.ArgumentParser(
        description="Driver for hail's gVCF combiner")
    parser.add_argument(
        '--sample-map',
        help='path to the sample map (must be filesystem local). '
        'The sample map should be tab separated with two columns. '
        'The first column is the sample ID, and the second column '
        'is the gVCF path.\n'
        'WARNING: the sample names in the gVCFs will be overwritten',
        required=True)
    parser.add_argument(
        '--tmp-path',
        help='path to folder for temp output (can be a cloud bucket)',
        default='/tmp')
    parser.add_argument('--out-file',
                        '-o',
                        help='path to final combiner output',
                        required=True)
    parser.add_argument('--json',
                        help='json to use for the import of the gVCFs'
                        '(must be filesystem local)',
                        required=True)
    parser.add_argument('--header',
                        help='external header, must be cloud based',
                        required=False)
    args = parser.parse_args()
    hl.init(default_reference=DEFAULT_REF,
            log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') +
            '.log')
    # Partition intervals are serialized in JSON as an array of
    # interval<struct{locus}> values on GRCh38.
    with open(args.json) as j:
        ty = hl.tarray(
            hl.tinterval(
                hl.tstruct(locus=hl.tlocus(reference_genome='GRCh38'))))
        intervals = ty._from_json(j.read())
    # Sample map: one "<sample-id>\t<gvcf-path>" pair per line.
    with open(args.sample_map) as m:
        samples = [l.strip().split('\t') for l in m]
    run_combiner(samples,
                 intervals,
                 args.out_file,
                 args.tmp_path,
                 args.header,
                 overwrite=True)
def to_dict(self) -> dict:
    """Serialize this combiner's configuration to a JSON-compatible dict."""
    interval_array_type = hl.tarray(
        hl.tinterval(hl.tlocus(self.reference_genome)))
    info_fields = self.gvcf_info_to_keep
    ref_entry_fields = self.gvcf_reference_entry_fields_to_keep
    # Flatten the size-bucketed VDS metadata, largest buckets first.
    vds_metadata = [
        md for bucket in sorted(self.vdses, reverse=True)
        for md in self.vdses[bucket]
    ]
    return {
        'name': self.__class__.__name__,
        'save_path': self.save_path,
        'output_path': self.output_path,
        'temp_path': self.temp_path,
        'reference_genome': str(self.reference_genome),
        'branch_factor': self.branch_factor,
        'target_records': self.target_records,
        'gvcf_batch_size': self.gvcf_batch_size,
        'gvcf_external_header': self.gvcf_external_header,  # put this here for humans
        'contig_recoding': self.contig_recoding,
        'gvcf_info_to_keep':
            None if info_fields is None else list(info_fields),
        'gvcf_reference_entry_fields_to_keep':
            None if ref_entry_fields is None else list(ref_entry_fields),
        'vdses': vds_metadata,
        'gvcfs': self.gvcfs,
        'gvcf_sample_names': self.gvcf_sample_names,
        'gvcf_import_intervals':
            interval_array_type._convert_to_json(self.gvcf_import_intervals),
    }
def main():
    """Entry point: parse CLI args, build the sample list, run the combiner."""
    parser = argparse.ArgumentParser(
        description="Driver for hail's gVCF combiner")
    parser.add_argument(
        '--sample-map',
        help='path to the sample map (must be filesystem local)',
        required=True)
    parser.add_argument('--sample-file',
                        help='path to a file containing a line separated list'
                        'of samples to combine (must be filesystem local)')
    parser.add_argument(
        '--tmp-path',
        help='path to folder for temp output (can be a cloud bucket)',
        default='/tmp')
    parser.add_argument('--out-file',
                        '-o',
                        help='path to final combiner output',
                        required=True)
    parser.add_argument(
        '--summarize',
        help='if defined, run summarize, placing the rows table '
        'of the output at the argument value')
    parser.add_argument('--json',
                        help='json to use for the import of the gVCFs'
                        '(must be filesystem local)',
                        required=True)
    args = parser.parse_args()
    samples = build_sample_list(args.sample_map, args.sample_file)
    # Partition intervals are serialized in JSON as an array of
    # interval<struct{locus}> values on GRCh38.
    with open(args.json) as j:
        ty = hl.tarray(
            hl.tinterval(
                hl.tstruct(locus=hl.tlocus(reference_genome='GRCh38'))))
        intervals = ty._from_json(j.read())
    hl.init(default_reference=DEFAULT_REF,
            log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') +
            '.log')
    run_combiner(samples,
                 intervals,
                 args.out_file,
                 args.tmp_path,
                 summary_path=args.summarize,
                 overwrite=True)
def test_import_vcf_missing_info_field_elements(self):
    """INFO arrays with missing elements import as arrays containing nulls."""
    mt = hl.import_vcf(resource('missingInfoArray.vcf'),
                       reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
    first = {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
             'FOO': [1, None], 'BAR': [2, None, None]}
    second = {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
              'FOO': [None, 2, None], 'BAR': [None, 1.0, None]}
    expected = hl.Table.parallelize(
        [first, second],
        hl.tstruct(locus=hl.tlocus('GRCh37'),
                   alleles=hl.tarray(hl.tstr),
                   FOO=hl.tarray(hl.tint),
                   BAR=hl.tarray(hl.tfloat64)),
        key=['locus', 'alleles'])
    self.assertTrue(mt.rows()._same(expected))
def test_import_vcf_missing_format_field_elements(self):
    """FORMAT arrays with missing elements import as arrays with nulls."""
    mt = hl.import_vcf(resource('missingFormatArray.vcf'),
                       reference_genome='GRCh37',
                       array_elements_required=False)
    mt = mt.select_rows().select_entries('AD', 'PL')
    entry_type = hl.tstruct(locus=hl.tlocus('GRCh37'),
                            alleles=hl.tarray(hl.tstr),
                            s=hl.tstr,
                            AD=hl.tarray(hl.tint),
                            PL=hl.tarray(hl.tint))
    expected_entries = [
        {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
         's': 'C1046::HG02024', 'AD': [None, None], 'PL': [0, None, 180]},
        {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
         's': 'C1046::HG02025', 'AD': [None, 6], 'PL': [70, None]},
        {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
         's': 'C1046::HG02024', 'AD': [0, 0, None],
         'PL': [396, None, None, 33, None, 0]},
        {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
         's': 'C1046::HG02025', 'AD': [0, 0, 9], 'PL': [None, None, None]},
    ]
    expected = hl.Table.parallelize(expected_entries, entry_type,
                                    key=['locus', 'alleles', 's'])
    self.assertTrue(mt.entries()._same(expected))
def test_import_locus_intervals(self):
    """Round-trip test for import_locus_intervals: row count matches the
    non-blank lines of the source file, loci are GRCh37, and
    export-then-reimport preserves the intervals.

    NOTE(review): an identically-named test with the same logic appears
    later in this file; if both live in the same test class, the later
    definition silently shadows this one — confirm and deduplicate.
    """
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = t.count()

    # Count the non-blank lines of the raw file as the expected row count.
    i = 0
    with open(interval_file) as f:
        for line in f:
            if len(line.strip()) != 0:
                i += 1
    self.assertEqual(nint, i)
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

    tmp_file = new_temp_file(prefix="test", suffix="interval_list")
    start = t.interval.start
    end = t.interval.end
    # Re-key with explicitly inclusive endpoints, export without a header,
    # and verify the re-imported table matches the original.
    (t
     .key_by(interval=hl.locus_interval(start.contig, start.position, end.position, True, True))
     .select()
     .export(tmp_file, header=False))
    t2 = hl.import_locus_intervals(tmp_file)
    self.assertTrue(t.select()._same(t2))
def test_import_locus_intervals(self):
    """Round-trip test for import_locus_intervals: row count matches the
    non-blank lines of the source file, loci are GRCh37, and
    export-then-reimport preserves the intervals.

    NOTE(review): an identically-named test with the same logic appears
    earlier in this file; if both live in the same test class, this
    definition shadows the earlier one — confirm and deduplicate.
    """
    interval_file = resource('annotinterall.interval_list')
    t = hl.import_locus_intervals(interval_file, reference_genome='GRCh37')
    nint = t.count()

    # Expected row count == non-blank lines in the raw file.
    with open(interval_file) as f:
        non_blank = sum(1 for line in f if len(line.strip()) != 0)
    self.assertEqual(nint, non_blank)
    self.assertEqual(t.interval.dtype.point_type, hl.tlocus('GRCh37'))

    tmp_file = new_temp_file(prefix="test", suffix="interval_list")
    start = t.interval.start
    end = t.interval.end
    # Re-key with explicitly inclusive endpoints, export headerless, and
    # verify the re-imported table matches the original.
    rekeyed = t.key_by(interval=hl.locus_interval(
        start.contig, start.position, end.position, True, True))
    rekeyed.select().export(tmp_file, header=False)
    t2 = hl.import_locus_intervals(tmp_file)
    self.assertTrue(t.select()._same(t2))
def visit_locus(self, node, visited_children):
    """Build an ``hl.tlocus`` from a parsed ``locus<GR>`` grammar node.

    ``visited_children`` is expected to be the 5 children
    (tlocus keyword, optional whitespace, '<', reference genome, '>');
    only the reference-genome child is used.
    """
    # FIX: the original unpacking bound the name `angle_bracket` twice and
    # created unused locals; use `_` placeholders instead. The 5-element
    # unpack is kept deliberately as an arity check on the parse node.
    _, _, _, gr, _ = visited_children
    return hl.tlocus(gr)
# --- Final-QC inputs/outputs for the Dalio bipolar (waves 1+2) pipeline ---
# All paths are Google Cloud Storage URIs; the numeric prefix in each file
# name tracks the pipeline stage that produced (or consumes) the artifact.
FINAL_SAMPLE_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/15_final_qc.keep.sample_list'
FINAL_VARIANT_LIST = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/14_final_qc.keep.variant_list'
FINAL_PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/16_prune.final_qc.keep.variant_list'
QC_MT = 'gs://dalio_bipolar_w1_w2_hail_02/data/mt/17_european.strict.mt'
QC_HARDCALLS_MT = 'gs://dalio_bipolar_w1_w2_hail_02/data/mt/17_european.strict.hardcalls.mt'
FINAL_VARIANT_QC_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/17_final_qc.variants.tsv.bgz'
FINAL_SAMPLE_QC_FILE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/17_final_qc.samples.tsv.bgz'

# Samples that passed final QC; headerless single-column file keyed by 'f0'.
ht_final_samples = hl.import_table(FINAL_SAMPLE_LIST, no_header=True, key='f0')

# Variants that passed final QC, typed as (locus, alleles) on GRCh38,
# then keyed by that pair for joins downstream.
ht_final_variants = hl.import_table(FINAL_VARIANT_LIST,
                                    types={
                                        'locus': hl.tlocus(reference_genome='GRCh38'),
                                        'alleles': hl.tarray(hl.tstr)
                                    })
ht_final_variants = ht_final_variants.key_by(ht_final_variants.locus,
                                             ht_final_variants.alleles)

# LD-pruned variant list arrives as headerless variant strings in 'f0';
# parse each into (locus, alleles) and key by that pair.
ht_final_pruned_variants = hl.import_table(FINAL_PRUNED_VARIANTS, no_header=True)
ht_final_pruned_variants = ht_final_pruned_variants.annotate(
    **hl.parse_variant(ht_final_pruned_variants.f0, reference_genome='GRCh38'))
ht_final_pruned_variants = ht_final_pruned_variants.key_by(
    ht_final_pruned_variants.locus, ht_final_pruned_variants.alleles)

# Pre-computed annotation tables (paths defined elsewhere in this file).
sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
def test_locus_windows(self):
    """Exercise hl.linalg.utils.locus_windows: window starts/stops measured
    in base pairs and via a coordinate expression, then every documented
    error path (unsorted coordinates, foreign/absent/non-row-indexed
    expressions, missing locus or coordinate values)."""
    def assert_eq(a, b):
        # numpy equality for the returned (starts, stops) index arrays
        self.assertTrue(np.array_equal(a, np.array(b)))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    # windows by base-pair distance
    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    # windows by centimorgan coordinate
    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    # coord_expr may be an arbitrary expression over the same source;
    # doubling both the coordinates and the radius gives the same windows
    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0}]
    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    # windows must not cross contig boundaries
    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    # --- error paths ---------------------------------------------------

    # coordinates must be in ascending order within each contig
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
    self.assertTrue('ascending order' in str(cm.exception))

    # coord_expr from a different table than the locus expression
    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=hl.utils.range_table(1).idx)
    self.assertTrue('different source' in str(cm.exception))

    # literal expressions have no source table
    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
    self.assertTrue("no source" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
    self.assertTrue("no source" in str(cm.exception))

    # global (non-row-indexed) expressions are rejected for both arguments
    ht = ht.annotate_globals(x = hl.locus('1', 1), y = 1.0)
    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.x, 1.0)
    self.assertTrue("row-indexed" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
    self.assertTrue("row-indexed" in str(cm.exception))

    # missing locus values are rejected, with or without coord_expr
    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    # missing coordinate values are rejected
    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'coord_expr'" in str(cm.exception))
def generate_datasets(doctest_namespace):
    """Populate ``doctest_namespace`` with the datasets, tables, and
    expressions referenced by the documentation doctests.

    Builds a sampled/annotated copy of data/sample.vcf.bgz plus a variety of
    derived matrix tables, key tables, and literal expressions, checkpointing
    the expensive ones under output/.
    """
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    # Core example dataset: a small sample of the VCF with row, column, and
    # global annotations used throughout the docs.
    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    # Checkpoint so downstream doctests reuse one materialization.
    ds = ds.checkpoint(f'output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    # Row/column metadata tables used by filter/join doctests.
    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions: literals of every flavor referenced by expression doctests.
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    # Burden-test example: per-sample covariates joined onto the VCF, row
    # weights from position, genes from an interval list.
    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    # LD-score regression fixtures: one pre-typed sumstats table, and an
    # all-phenotypes matrix table reconstructed from a string matrix.
    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    # Entries arrive as "chi_squared,n" strings; split and retype them.
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")