def combine(ts):
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        info=hl.struct(
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB_TABLE=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                        .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            lambda old_to_new: tmp.data[i].__entries.map(
                                lambda e: renumber_entry(e, old_to_new)),
                            hl.array([0]).extend(
                                hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                    lambda j: combined_allele_index[tmp.data[i].alleles[j]]))))),
            hl.dict(hl.range(1, hl.len(tmp.alleles) + 1).map(
                lambda j: hl.tuple([tmp.alleles[j - 1], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))
    return tmp.drop('data', 'g')
def vep_protein_domain_ann_expr(
        s: hl.expr.StringExpression) -> hl.expr.DictExpression:
    """
    Parse and annotate protein domain(s) from VEP annotation.
    Expects a StringExpression as input (e.g. 'Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262').
    It will generate a dict<k,v> where keys (k) represent the source/database
    and values (v) the annotated domain_id.

    :param s: hl.expr.StringExpression
    :return: hl.expr.DictExpression
    """
    a1 = s.split(delim="&")

    # keep only well-annotated domain(s) (i.e. <source:domain_id>)
    a2 = a1.map(lambda x: x.split(delim=":"))
    a2 = a2.filter(lambda x: x.length() == 2)

    d = (hl.case()
         .when(hl.len(a2) > 0,
               hl.dict(hl.zip(a2.map(lambda x: x[0]),  # TODO: Optimize by scanning the array just once.
                              a2.map(lambda x: x[1]))))
         .or_missing())

    return d
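# A minimal usage sketch of vep_protein_domain_ann_expr (assumes `import hail as hl`
# and an initialized Hail session; the literal below is illustrative, not from a real VCF):
_domains = vep_protein_domain_ann_expr(
    hl.str('Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262'))
# hl.eval(_domains) -> {'Pfam': 'PF13853', 'Prints': 'PR00237', 'PROSITE_profiles': 'PS50262'}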
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("ASC", "gene_results_path"),
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    ds = ds.group_by("gene_id").aggregate(
        group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(group_results=hl.dict(
        ds.group_results.map(lambda group_result:
                             (group_result.analysis_group,
                              group_result.drop("analysis_group")))))

    return ds
def combine(ts):
    def merge_alleles(alleles):
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], '')).fold(
                lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r: hl.array([ref]).extend(
                            al[1:].map(
                                lambda a: hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at: hl.cond(
                                        (_allele_ints['SNP'] == at)
                                        | (_allele_ints['Insertion'] == at)
                                        | (_allele_ints['Deletion'] == at)
                                        | (_allele_ints['MNP'] == at)
                                        | (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal: hl.struct(
                    globl=hl.array([ref]).extend(
                        hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl: hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles: hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i: hl.cond(
                                hl.is_missing(row.data[i].__entries),
                                hl.range(0, hl.len(gbl.g[i].__cols)).map(
                                    lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                hl.bind(
                                    lambda old_to_new: row.data[i].__entries.map(
                                        lambda e: renumber_entry(e, old_to_new)),
                                    hl.range(0, hl.len(alleles.local[i])).map(
                                        lambda j: combined_allele_index[alleles.local[i][j]])))),
                        hl.dict(
                            hl.range(0, hl.len(alleles.globl)).map(
                                lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(
        TableMapRows(
            ts._tir,
            Apply(merge_function._name,
                  merge_function._ret_type,
                  TopLevelReference('row'),
                  TopLevelReference('global'))))
    return ts.transmute_globals(
        __cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
def combine(ts):
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        info=hl.struct(
            DP=hl.sum(ts.data.map(lambda d: d.info.DP)),
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                        .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            lambda old_to_new: tmp.data[i].__entries.map(
                                lambda e: renumber_entry(e, old_to_new)),
                            hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                lambda j: combined_allele_index[tmp.data[i].alleles[j]])))),
            hl.dict(hl.range(0, hl.len(tmp.alleles)).map(
                lambda j: hl.tuple([tmp.alleles[j], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))
    return tmp.drop('data', 'g')
def specific_clumps(filename):
    # raw string avoids the invalid-escape warning for the '\s+' regex delimiter
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(
        (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)), True)
    )), _localize=False)
    return clump_dict
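# The aggregate-to-local-dict pattern above in miniature (assumes `import hail as hl`
# and an initialized session; toy loci on the default GRCh37 build):
_ht = hl.utils.range_table(3)
_ht = _ht.annotate(locus=hl.locus('1', 100 + _ht.idx))
_keep = _ht.aggregate(hl.dict(hl.agg.collect((_ht.locus, True))), _localize=False)
# _keep.get(some_locus, False) can then drive mt.filter_rows(...) without a join.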
def join_hts(datasets, coverage_datasets=[], reference_genome='37'):
    # Get a list of hail tables and combine them into an outer join.
    hts = [get_ht(dataset, reference_genome) for dataset in datasets]
    joined_ht = reduce((lambda joined_ht, ht: joined_ht.join(ht, 'outer')), hts)

    # Annotate coverages.
    for coverage_dataset in coverage_datasets:
        joined_ht = annotate_coverages(joined_ht, coverage_dataset, reference_genome)

    # Track the datasets we've added, as well as their source paths.
    included_dataset = {
        k: v[reference_genome]['path']
        for k, v in CONFIG.items() if k in datasets + coverage_datasets
    }
    # Add metadata; this also removes previous globals.
    joined_ht = joined_ht.select_globals(date=datetime.now().isoformat(),
                                         datasets=hl.dict(included_dataset),
                                         version=VERSION)
    joined_ht.describe()

    output_path = os.path.join(
        OUTPUT_TEMPLATE.format(genome_version=reference_genome, version=VERSION))
    print('Writing to %s' % output_path)
    joined_ht.write(os.path.join(output_path))
def _coerce(self, x: Expression):
    assert isinstance(x, hl.expr.DictExpression)
    if not self.kc._requires_conversion(x.dtype.key_type):
        # fast path: only the values need conversion
        return x.map_values(self.vc.coerce)
    else:
        return hl.dict(hl.map(lambda e: (self.kc.coerce(e[0]), self.vc.coerce(e[1])),
                              hl.array(x)))
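# The else-branch above written out on a concrete literal (assumes `import hail as hl`):
# rebuild the dict after coercing keys and values; toy types chosen for illustration.
_d = hl.literal({1: 2, 3: 4})  # dict<int32, int32>
_coerced = hl.dict(hl.map(lambda e: (hl.int64(e[0]), hl.float64(e[1])), hl.array(_d)))
# hl.eval(_coerced) -> {1: 2.0, 3: 4.0}, now dict<int64, float64>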
def parse_attributes(unparsed_attributes):
    def parse_attribute(attribute):
        key_and_value = attribute.split(' ')
        key = key_and_value[0]
        value = key_and_value[1]
        return (key, value.replace('"|;\\$', ''))

    return hl.dict(unparsed_attributes.split('; ').map(parse_attribute))
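# parse_attributes in miniature on a GTF-style attribute string (assumes
# `import hail as hl`; the literal is illustrative):
_attrs = parse_attributes(hl.str('gene_id "ENSG00000223972"; gene_name "DDX11L1"'))
# hl.eval(_attrs) -> {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}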
def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(hl.struct(x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
                                y=hl.array([3, 4, 5]),
                                z=hl.set([3, 4, 5, 3])))
def filter_samples(vds: 'VariantDataset', samples_table: 'Table', *,
                   keep: bool = True,
                   remove_dead_alleles: bool = False) -> 'VariantDataset':
    """Filter samples in a :class:`.VariantDataset`.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    samples_table : :class:`.Table`
        Samples to filter on.
    keep : :obj:`bool`
        Whether to keep (default), or filter out the samples from `samples_table`.
    remove_dead_alleles : :obj:`bool`
        If true, remove alleles observed in no samples. Alleles with AC == 0 will be
        removed, and LA values recalculated.

    Returns
    -------
    :class:`.VariantDataset`
    """
    if not list(samples_table[x].dtype for x in samples_table.key) == [hl.tstr]:
        raise TypeError(f'invalid key: {samples_table.key.dtype}')
    samples_to_keep = samples_table.aggregate(
        hl.agg.collect_as_set(samples_table.key[0]), _localize=False)._persist()
    reference_data = vds.reference_data.filter_cols(
        samples_to_keep.contains(vds.reference_data.col_key[0]), keep=keep)
    reference_data = reference_data.filter_rows(hl.agg.count() > 0)
    variant_data = vds.variant_data.filter_cols(
        samples_to_keep.contains(vds.variant_data.col_key[0]), keep=keep)

    if remove_dead_alleles:
        vd = variant_data
        vd = vd.annotate_rows(
            __allele_counts=hl.agg.explode(lambda x: hl.agg.counter(x), vd.LA),
            __n=hl.agg.count())
        vd = vd.filter_rows(vd.__n > 0)

        vd = vd.annotate_rows(__kept_indices=hl.dict(
            hl.enumerate(
                hl.range(hl.len(vd.alleles)).filter(
                    lambda idx: (idx == 0) | (vd.__allele_counts.get(idx, 0) > 0)),
                index_first=False)))

        vd = vd.annotate_rows(
            __old_to_new_LA=hl.range(hl.len(vd.alleles)).map(
                lambda idx: vd.__kept_indices.get(idx, -1)))

        def new_la_index(old_idx):
            raw_idx = vd.__old_to_new_LA[old_idx]
            return hl.case().when(raw_idx >= 0, raw_idx) \
                .or_error("'filter_samples': unexpected local allele: old index=" + hl.str(old_idx))

        vd = vd.annotate_entries(LA=vd.LA.map(lambda la: new_la_index(la)))
        vd = vd.key_rows_by('locus')
        vd = vd.annotate_rows(alleles=vd.__kept_indices.keys().map(lambda i: vd.alleles[i]))
        vd = vd._key_rows_by_assert_sorted('locus', 'alleles')
        vd = vd.drop('__allele_counts', '__kept_indices', '__old_to_new_LA')
        return VariantDataset(reference_data, vd)

    variant_data = variant_data.filter_rows(hl.agg.count() > 0)
    return VariantDataset(reference_data, variant_data)
def test(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                        f=hl.tarray(hl.tint32),
                        g=hl.tarray(hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                        i=hl.tbool,
                        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
             'e': "hello", 'f': [1, 2, 3],
             'g': [hl.Struct(x=1, y=5, z='banana')],
             'h': hl.Struct(a=5, b=3, c='winter'),
             'i': True,
             'j': hl.Struct(x=3, y=2, z='summer')}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(kt.annotate(
        chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
        ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        dpois=hl.dpois(4, kt.a),
        drop=kt.h.drop('b', 'c'),
        exp=hl.exp(kt.c),
        fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(kt.g, 'z'),
        is_defined=hl.is_defined(kt.i),
        is_missing=hl.is_missing(kt.i),
        is_nan=hl.is_nan(hl.float64(kt.a)),
        json=hl.json(kt.g),
        log=hl.log(kt.a, kt.b),
        log10=hl.log10(kt.c),
        or_else=hl.or_else(kt.a, 5),
        or_missing=hl.or_missing(kt.i, kt.j),
        pchisqtail=hl.pchisqtail(kt.a, kt.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** kt.b,
        ppois=hl.ppois(kt.a, kt.b),
        qchisqtail=hl.qchisqtail(kt.a, kt.b),
        range=hl.range(0, 5, kt.b),
        rnorm=hl.rand_norm(0.0, kt.b),
        rpois=hl.rand_pois(kt.a),
        runif=hl.rand_unif(kt.b, kt.a),
        select=kt.h.select('c', 'b'),
        sqrt=hl.sqrt(kt.a),
        to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        where=hl.cond(kt.i, 5, 10)
    ).take(1)[0])
def create_broadcast_dict(key, value=None):
    """
    Create a broadcast join (local dictionary from key -> value) from a Hail Table.

    :param Expression key: Key Expression
    :param Expression value: Value Expression
    :return: Hail DictExpression (without an index)
    :rtype: DictExpression
    """
    if isinstance(key, hl.Table):
        key = key.key
    ht = key._indices.source
    if value is None:
        value = ht.row_value
    return hl.dict(ht.aggregate(hl.agg.collect((key, value)), _localize=False))
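# create_broadcast_dict in miniature on a toy table (assumes `import hail as hl`
# and an initialized session; names prefixed with `_` are illustrative):
_ht = hl.utils.range_table(3)
_ht = _ht.annotate(doubled=_ht.idx * 2)
_bd = create_broadcast_dict(_ht.idx)  # idx -> Struct of non-key row fields
# hl.eval(_bd) -> {0: Struct(doubled=0), 1: Struct(doubled=2), 2: Struct(doubled=4)}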
def prepare_variant_results():
    results = hl.read_table(pipeline_config.get("BipEx", "variant_results_path"))

    # Get unique variants from results table
    variants = results.group_by(results.locus, results.alleles).aggregate()

    # Select AC/AF numbers for the alternate allele
    results = results.annotate(ac_case=results.ac_case[1], ac_ctrl=results.ac_ctrl[1])
    results = results.drop("af_case", "af_ctrl")

    results = results.filter((results.ac_case > 0) | (results.ac_ctrl > 0))

    # Annotate variants with a struct for each analysis group
    results = results.group_by("locus", "alleles").aggregate(
        group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = variants.annotate(**results[variants.locus, variants.alleles])

    # Merge variant annotations for canonical transcripts
    annotations = hl.read_table(pipeline_config.get("BipEx", "variant_annotations_path"))
    annotations = annotations.filter(
        annotations.transcript_id == annotations.canonical_transcript_id)

    annotations = annotations.select(
        "gene_id",
        consequence=annotations.csq_analysis,
        hgvsc=annotations.hgvsc_canonical.split(":")[-1],
        hgvsp=annotations.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=annotations.cadd,
                       mpc=annotations.mpc,
                       polyphen=annotations.polyphen),
    )

    variants = variants.annotate(**annotations[variants.locus, variants.alleles])

    return variants
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA", "variant_annotations_path")

    results = hl.read_table(results_path)

    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    results = results.annotate(
        source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by("locus", "alleles").aggregate(
        group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = hl.read_table(annotations_path)

    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case()
        .when((variants.canonical_term == "missense_variant") & (variants.mpc >= 3),
              "missense_variant_mpc_>=3")
        .when((variants.canonical_term == "missense_variant") & (variants.mpc >= 2),
              "missense_variant_mpc_2-3")
        .when(variants.canonical_term == "missense_variant",
              "missense_variant_mpc_<2")
        .default(variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd, mpc=variants.mpc, polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("Epi25", "gene_results_path"),
        delimiter=",",
        missing="NA",
        quote='"',
        types={
            "gene_id": hl.tstr,
            "gene_name": hl.tstr,
            "description": hl.tstr,
            "pval_meta": hl.tfloat,
            "analysis_group": hl.tstr,
            # LoF
            "xcase_lof": hl.tint,
            "xctrl_lof": hl.tint,
            "pval_lof": hl.tfloat,
            # MPC
            "xcase_mpc": hl.tint,
            "xctrl_mpc": hl.tint,
            "pval_mpc": hl.tfloat,
            # Inframe indel
            "xcase_infrIndel": hl.tint,
            "xctrl_infrIndel": hl.tint,
            "pval_infrIndel": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    # Rename EE group to DEE
    ds = ds.annotate(analysis_group=hl.if_else(ds.analysis_group == "EE", "DEE",
                                               ds.analysis_group))

    # "Meta" p-val was carried over from SCHEMA's data format but isn't descriptive of Epi25
    ds = ds.rename({"pval_meta": "pval"})

    ds = ds.group_by("gene_id").aggregate(group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(
        group_results=hl.dict(
            ds.group_results.map(
                lambda group_result: (group_result.analysis_group,
                                      group_result.drop("gene_id", "analysis_group"))
            )
        )
    )

    return ds
def get_annot_ht():
    t = hl.import_table(f'{wd_data}/gencode.v31lift37.annotation.gff3.gz',
                        no_header=True, impute=True, comment=('#'), force=True)
    # t = hl.import_table('/Users/nbaya/Downloads/gencode.v31lift37.annotation.gtf',
    #                     no_header=True, impute=True, comment=('#'))
    t2 = t.annotate(GFF_Columns=t.f8.split(";").map(lambda x: x.split("=")))
    t2 = t2.filter(t2.f2 == "CDS")  # only want coding sequences, not entire genes
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f3, 'GRCh37'))
    t2 = t2.filter(hl.is_valid_locus(t2.f0[3:], t2.f4, 'GRCh37'))
    t2 = t2.annotate(interval=hl.interval(hl.locus(t2.f0[3:], t2.f3, 'GRCh37'),
                                          hl.locus(t2.f0[3:], t2.f4, 'GRCh37')))
    t2 = t2.annotate(GFF_Columns=hl.dict(t2.GFF_Columns.map(lambda x: (x[0], x[1]))))
    t2 = t2.annotate(ID=t2.GFF_Columns["ID"],
                     gene_id=t2.GFF_Columns["gene_id"],
                     gene_name=t2.GFF_Columns["gene_name"],
                     gene_type=t2.GFF_Columns["gene_type"],
                     level=t2.GFF_Columns["level"])
    t2 = t2.annotate(type=t2.f2, gene_score=t2.f5, gene_strand=t2.f6, gene_phase=t2.f7)
    t2 = t2.drop(t2.GFF_Columns, t2.f8, t2.f0, t2.f1, t2.f2, t2.f3, t2.f4, t2.f5, t2.f6, t2.f7)
    t2 = t2.key_by(t2.interval)
    return t2
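# The split-then-dict pattern above on a literal GFF3 attribute column
# (assumes `import hail as hl`; the string is illustrative):
_s = hl.str('ID=CDS:ENST00000327044;gene_id=ENSG00000188976;level=2')
_gff = hl.dict(_s.split(";").map(lambda x: x.split("=")).map(lambda x: (x[0], x[1])))
# hl.eval(_gff)['gene_id'] -> 'ENSG00000188976'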
def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )
def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)))

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(
        hl.dict(hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
        _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT, 0.01, k=10, min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(relatedness_ht.kin > kin_estimate)

        # get call rates for both samples so we remove the one with the lower
        # call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        relatedness_ht = hl.identity_by_descent(
            in_mt, maf=in_mt['maf'])  # this returns a Hail Table with the sample pairs
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        # get call rates for both samples so we remove the one with the lower
        # call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception("\nThe maximum kinship coefficient for KING is 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s)
            & (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    samples = samples_list.sample_to_remove.collect()

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']), keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            for sample in samples:
                f.write(sample + "\n")
    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
def default_generate_gene_lof_summary(
    mt: hl.MatrixTable,
    collapse_indels: bool = False,
    tx: bool = False,
    lof_csq_set: Set[str] = LOF_CSQ_SET,
    meta_root: str = "meta",
    pop_field: str = "pop",
    filter_loftee: bool = False,
) -> hl.Table:
    """
    Generate summary counts for loss-of-function (LoF), missense, and synonymous variants.

    Also calculates p, the proportion of haplotypes carrying a putative LoF (pLoF) variant,
    and the observed/expected (OE) ratio of samples with homozygous pLoF variant calls.

    Summary counts are (all per gene):
        - Number of samples with no pLoF variants.
        - Number of samples with heterozygous pLoF variants.
        - Number of samples with homozygous pLoF variants.
        - Total number of sites with genotype calls.
        - All of the above stats grouped by population.

    Assumes MT was created using `default_generate_gene_lof_matrix`.

    .. note::
        Assumes LoF variants in MT were filtered (LOFTEE pass and no LoF flag only).
        If LoF variants have not been filtered and `filter_loftee` is True,
        expects MT has the row annotation `vep`.

    :param mt: Input MatrixTable.
    :param collapse_indels: Whether to collapse indels. Default is False.
    :param tx: Whether input MT has transcript expression data. Default is False.
    :param lof_csq_set: Set containing LoF transcript consequence strings. Default is LOF_CSQ_SET.
    :param meta_root: String indicating top level name for sample metadata. Default is 'meta'.
    :param pop_field: String indicating field with sample population assignment information. Default is 'pop'.
    :param filter_loftee: Filters to LOFTEE pass variants (and no LoF flags) only. Default is False.
    :return: Table with het/hom summary counts.
    """
    if collapse_indels:
        grouping = ["gene_id", "gene", "most_severe_consequence"]
        if tx:
            grouping.append("expressed")
        else:
            grouping.extend(["transcript_id", "canonical"])
        mt = (
            mt.group_rows_by(*grouping)
            .aggregate_rows(
                n_sites=hl.agg.sum(mt.n_sites),
                n_sites_array=hl.agg.array_sum(mt.n_sites_array),
                classic_caf=hl.agg.sum(mt.classic_caf),
                max_af=hl.agg.max(mt.max_af),
                classic_caf_array=hl.agg.array_sum(mt.classic_caf_array),
            )
            .aggregate_entries(
                num_homs=hl.agg.sum(mt.num_homs),
                num_hets=hl.agg.sum(mt.num_hets),
                defined_sites=hl.agg.sum(mt.defined_sites),
            )
            .result()
        )

    if filter_loftee:
        lof_ht = get_most_severe_consequence_for_summary(mt.rows())
        mt = mt.filter_rows(
            hl.is_defined(lof_ht[mt.row_key].lof)
            & (lof_ht[mt.row_key].lof == "HC")
            & (lof_ht[mt.row_key].no_lof_flags)
        )

    ht = mt.annotate_rows(
        lof=hl.struct(
            **get_het_hom_summary_dict(
                csq_set=lof_csq_set,
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
        missense=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"missense_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
        synonymous=hl.struct(
            **get_het_hom_summary_dict(
                csq_set={"synonymous_variant"},
                most_severe_csq_expr=mt.most_severe_consequence,
                defined_sites_expr=mt.defined_sites,
                num_homs_expr=mt.num_homs,
                num_hets_expr=mt.num_hets,
                pop_expr=mt[meta_root][pop_field],
            ),
        ),
    ).rows()
    ht = ht.annotate(
        p=(1 - hl.sqrt(hl.float64(ht.lof.no_alt_calls) / ht.lof.defined)),
        pop_p=hl.dict(
            hl.array(ht.lof.pop_defined).map(
                lambda x: (
                    x[0],
                    1 - hl.sqrt(hl.float64(ht.lof.pop_no_alt_calls.get(x[0])) / x[1]),
                )
            )
        ),
    )
    ht = ht.annotate(exp_hom_lof=ht.lof.defined * ht.p * ht.p)
    return ht.annotate(oe=ht.lof.obs_hom / ht.exp_hom_lof)
def import_structural_variants(vcf_path):
    ds = hl.import_vcf(vcf_path, force_bgz=True, min_partitions=32).rows()

    ds = ds.annotate(
        **{field.lower(): ds.info[field] for field in TOP_LEVEL_INFO_FIELDS})

    ds = ds.annotate(
        variant_id=ds.rsid.replace("^gnomAD-SV_v2.1_", ""),
        reference_genome="GRCh37",
        # Start
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        xpos=x_position(ds.locus.contig, ds.locus.position),
        # End
        end=ds.info.END,
        xend=x_position(ds.locus.contig, ds.info.END),
        # Start 2
        chrom2=ds.info.CHR2,
        pos2=ds.info.POS2,
        xpos2=x_position(ds.info.CHR2, ds.info.POS2),
        # End 2
        end2=ds.info.END2,
        xend2=x_position(ds.info.CHR2, ds.info.END2),
        # Other
        length=ds.info.SVLEN,
        type=ds.info.SVTYPE,
        alts=ds.alleles[1:],
    )

    # MULTIALLELIC should not be used as a quality filter in the browser
    ds = ds.annotate(filters=ds.filters.difference(hl.set(["MULTIALLELIC"])))

    # Group gene lists for all consequences in one field
    ds = ds.annotate(consequences=hl.array([
        hl.struct(
            consequence=csq.lower(),
            genes=hl.or_else(ds.info[f"PROTEIN_CODING__{csq}"], hl.empty_array(hl.tstr)),
        )
        for csq in RANKED_CONSEQUENCES
        if csq not in ("INTERGENIC", "NEAREST_TSS")
    ]).filter(lambda csq: hl.len(csq.genes) > 0))
    ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)

    ds = ds.annotate(major_consequence=hl.rbind(
        ds.consequences.find(lambda csq: hl.len(csq.genes) > 0),
        lambda csq: hl.or_else(csq.consequence,
                               hl.or_missing(ds.intergenic, "intergenic")),
    ))

    # Collect set of all genes for which a variant has a consequence
    ds = ds.annotate(genes=hl.set(ds.consequences.flatmap(lambda c: c.genes)))

    # Group per-population frequency values
    ds = ds.annotate(freq=hl.struct(
        **{field.lower(): ds.info[field] for field in FREQ_FIELDS},
        populations=[
            hl.struct(id=pop,
                      **{field.lower(): ds.info[f"{pop}_{field}"] for field in FREQ_FIELDS})
            for pop in DIVISIONS
        ],
    ))

    # For MCNVs, store per-copy number allele counts
    ds = ds.annotate(freq=ds.freq.annotate(copy_numbers=hl.or_missing(
        ds.type == "MCNV",
        hl.zip_with_index(ds.alts).map(lambda pair: hl.rbind(
            pair[0],
            pair[1],
            lambda index, alt: hl.struct(
                # Extract copy number. Example: get 2 from "CN=<2>"
                copy_number=hl.int(alt[4:-1]),
                ac=ds.freq.ac[index],
            ),
        )),
    )))

    # For MCNVs, sum AC/AF for all alt alleles except CN=2
    ds = ds.annotate(freq=ds.freq.annotate(
        ac=hl.if_else(ds.type == "MCNV",
                      sum_mcnv_ac_or_af(ds.alts, ds.freq.ac),
                      ds.freq.ac[0]),
        af=hl.if_else(ds.type == "MCNV",
                      sum_mcnv_ac_or_af(ds.alts, ds.freq.af),
                      ds.freq.af[0]),
        populations=hl.if_else(
            ds.type == "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                ac=sum_mcnv_ac_or_af(ds.alts, pop.ac),
                af=sum_mcnv_ac_or_af(ds.alts, pop.af),
            )),
            ds.freq.populations.map(
                lambda pop: pop.annotate(ac=pop.ac[0], af=pop.af[0])),
        ),
    ))

    # Add hemizygous frequencies
    ds = ds.annotate(hemizygote_count=hl.dict(
        [(
            pop_id,
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS]
        + [(f"{pop_id}_FEMALE", 0) for pop_id in POPULATIONS]
        + [(
            f"{pop_id}_MALE",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS]
        + [("FEMALE", 0)]
        + [("MALE",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info.MALE_N_HEMIALT, 0))]))

    ds = ds.annotate(freq=ds.freq.annotate(
        hemizygote_count=hl.or_missing(
            ds.type != "MCNV",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y")) & ~ds.par,
                       ds.info.MALE_N_HEMIALT, 0),
        ),
        populations=hl.if_else(
            ds.type != "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                hemizygote_count=ds.hemizygote_count[pop.id])),
            ds.freq.populations.map(
                lambda pop: pop.annotate(hemizygote_count=hl.null(hl.tint))),
        ),
    ))

    ds = ds.drop("hemizygote_count")

    # Rename n_homalt
    ds = ds.annotate(freq=ds.freq.annotate(
        homozygote_count=ds.freq.n_homalt,
        populations=ds.freq.populations.map(lambda pop: pop.annotate(
            homozygote_count=pop.n_homalt).drop("n_homalt")),
    ).drop("n_homalt"))

    # Re-key
    ds = ds.key_by("variant_id")

    ds = ds.drop("locus", "alleles", "info", "rsid")

    return ds
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    # vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # getting annotated VEP field names from the VCF header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate fields in the info field (use allele index)
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1] for field in INFO_FIELDS}))

    # parse/annotate the CSQ field in a different structure
    tb_csq = mt.rows()
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field]))

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # The transcript(s) are represented as a dict<k,v>, where the keys are the field names
    # extracted from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching with the allele index (only used if variants were split with split_multi_hts).
    # It requires having the flag "ALLELE_NUM" annotated by VEP.
    # Applied only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all([x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]):
        tb_csq = (tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(
                lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.a_index)),
            tb_csq.csq_raw)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(
        ht=tb_csq,
        csq_array='csq_raw',
    )

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the most severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0])))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS']))))

    # drop redundant/temp fields
    tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500))

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    # (tb_csq
    #  .write(output=args.tb_output_path,
    #         overwrite=args.overwrite)
    #  )
    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(f'{output_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()
def prepare_mitochondrial_variants(path, mnvs_path=None):
    ds = hl.read_table(path)

    haplogroups = hl.eval(ds.globals.hap_order)

    ds = ds.annotate(hl_hist=ds.hl_hist.annotate(
        bin_edges=ds.hl_hist.bin_edges.map(
            lambda n: hl.float(hl.format("%.2f", n)))))

    filter_names = hl.dict({
        "artifact_prone_site": "Artifact-prone site",
        "indel_stack": "Indel stack",
        "npg": "No passing genotype"
    })

    ds = ds.select(
        # ID
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome=ds.locus.dtype.reference_genome.name,
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        # Quality
        filters=ds.filters.map(lambda f: filter_names.get(f, f)),
        qual=ds.qual,
        genotype_quality_metrics=[
            hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all)
        ],
        genotype_quality_filters=[
            hl.struct(
                name="Base Quality",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.base_qual_hist),
            ),
            hl.struct(
                name="Contamination",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.contamination_hist),
            ),
            hl.struct(
                name="Heteroplasmy below 10%",
                filtered=hl.struct(
                    bin_edges=ds.hl_hist.bin_edges,
                    bin_freq=ds.heteroplasmy_below_10_percent_hist),
            ),
            hl.struct(name="Position",
                      filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                         bin_freq=ds.position_hist)),
            hl.struct(
                name="Strand Bias",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.strand_bias_hist),
            ),
            hl.struct(
                name="Weak Evidence",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges,
                                   bin_freq=ds.weak_evidence_hist),
            ),
        ],
        site_quality_metrics=[
            hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)),
            hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)),
            hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)),
        ],
        # Frequency
        an=ds.AN,
        ac_hom=ds.AC_hom,
        ac_het=ds.AC_het,
        excluded_ac=ds.excluded_AC,
        # Heteroplasmy
        common_low_heteroplasmy=ds.common_low_heteroplasmy,
        heteroplasmy_distribution=ds.hl_hist,
        max_heteroplasmy=ds.max_hl,
        # Populations
        populations=hl.sorted(
            hl.range(hl.len(ds.globals.pop_order)).map(
                lambda pop_index: hl.struct(
                    id=ds.globals.pop_order[pop_index],
                    an=ds.pop_AN[pop_index],
                    ac_het=ds.pop_AC_het[pop_index],
                    ac_hom=ds.pop_AC_hom[pop_index],
                    heteroplasmy_distribution=hl.struct(
                        bin_edges=ds.hl_hist.bin_edges,
                        bin_freq=ds.pop_hl_hist[pop_index],
                        n_smaller=0,
                        n_larger=0,
                    ),
                )),
            key=lambda pop: pop.id,
        ),
        # Haplogroups
        hapmax_af_hom=ds.hapmax_AF_hom,
        hapmax_af_het=ds.hapmax_AF_het,
        faf_hapmax_hom=ds.faf_hapmax_hom,
        haplogroup_defining=ds.hap_defining_variant,
        haplogroups=[
            hl.struct(
                id=haplogroup,
                an=ds.hap_AN[i],
                ac_het=ds.hap_AC_het[i],
                ac_hom=ds.hap_AC_hom[i],
                faf_hom=ds.hap_faf_hom[i],
                heteroplasmy_distribution=ds.hap_hl_hist[i],
            ) for i, haplogroup in enumerate(haplogroups)
        ],
        # Other
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom),
        flags=hl.set([
            hl.or_missing(ds.common_low_heteroplasmy, "common_low_heteroplasmy")
        ]).filter(hl.is_defined),
        mitotip_score=ds.mitotip_score,
        mitotip_trna_prediction=ds.mitotip_trna_prediction,
        pon_ml_probability_of_pathogenicity=ds.pon_ml_probability_of_pathogenicity,
        pon_mt_trna_prediction=ds.pon_mt_trna_prediction,
        variant_collapsed=ds.variant_collapsed,
        vep=ds.vep,
    )

    if mnvs_path:
        mnvs = hl.import_table(mnvs_path,
                               types={
                                   "pos": hl.tint,
                                   "ref": hl.tstr,
                                   "alt": hl.tstr,
                                   "AC_hom_MNV": hl.tint
                               })
        mnvs = mnvs.key_by(
            locus=hl.locus("chrM", mnvs.pos,
                           reference_genome=ds.locus.dtype.reference_genome),
            alleles=[mnvs.ref, mnvs.alt],
        )
        ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0))
        ds = ds.annotate(
            flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags))

    return ds
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----
    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------
    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----
    This function will return an ``interval`` field of type :class:`.tinterval`
    constructed from the ``seqname``, ``start``, and ``end`` fields in the
    GTF file. This interval is inclusive of both the start and end positions
    in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be of
    type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
    ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the reference
    genome specified.

    Example
    -------
    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +SKIP_OUTPUT_CHECK
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={
                             'f3': hl.tint,
                             'f4': hl.tint,
                             'f5': hl.tfloat,
                             'f7': hl.tint
                         },
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({
        'f0': 'seqname',
        'f1': 'source',
        'f2': 'feature',
        'f3': 'start',
        'f4': 'end',
        'f5': 'score',
        'f6': 'strand',
        'f7': 'frame',
        'f8': 'attribute'
    })

    ht = ht.annotate(attribute=hl.dict(
        hl.map(
            lambda x: (x.split(' ')[0],
                       x.split(' ')[1].replace('"', '').replace(';$', '')),
            ht['attribute'].split('; '))))

    attributes = ht.aggregate(
        hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                       ht['attribute'].keys()))

    ht = ht.transmute(
        **{
            x: hl.or_missing(ht['attribute'].contains(x), ht['attribute'][x])
            for x in attributes if x
        })

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case().when(
                ht['seqname'].startswith('HLA'), ht['seqname']).when(
                    ht['seqname'].startswith('chrHLA'),
                    ht['seqname'].replace('^chr', '')).when(
                        ht['seqname'].startswith('chr'),
                        ht['seqname']).default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(
                set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(
            interval=hl.locus_interval(ht['seqname'],
                                       ht['start'],
                                       ht['end'],
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
mnvs = mnvs.annotate(
    related_mnvs=component_2bp_mnvs[mnvs.variant_id].related_mnvs)
mnvs = mnvs.annotate(related_mnvs=mnvs.related_mnvs.map(
    lambda related_mnv: related_mnv.select(
        "combined_variant_id",
        "n_individuals",
        "other_constituent_snvs",
        changes_amino_acids=hl.bind(
            lambda mnv_consequences, related_mnv_consequences:
            mnv_consequences.key_set().union(related_mnv_consequences.key_set()).any(
                lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
            hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
            hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
        ),
    )))

mnvs_3bp = mnvs_3bp.annotate(
    related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))

mnvs = mnvs.union(mnvs_3bp)

mnvs = mnvs.repartition(8, shuffle=True)

mnvs = mnvs.key_by()
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)
        if not found_expr:
            return e
        else:
            exprs = [new_fields[i] if isinstance(new_fields[i], Expression)
                     else hl.literal(new_fields[i], dtype[i])
                     for i in range(len(new_fields))]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)
    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [element if isinstance(element, Expression)
                     else hl.literal(element, dtype.element_type)
                     for element in elements]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([e._ir for e in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [element if isinstance(element, Expression)
                     else hl.literal(element, dtype.element_type)
                     for element in elements]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [elements[i] if isinstance(elements[i], Expression)
                     else hl.literal(elements[i], dtype.types[i])
                     for i in range(len(elements))]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Here I use `to_expr` to call `lit` on the keys and values separately.
            # I anticipate a common mode is statically-known keys and Expression values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
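# The tdict branch above in miniature: statically-known keys, Expression values,
# rebuilt as hl.dict(hl.zip(keys, values)) (assumes `import hail as hl`; toy data).
_keys = hl.literal(['a', 'b'])
_vals = hl.array([hl.int32(1), hl.int32(2)])
_d = hl.dict(hl.zip(_keys, _vals))
# hl.eval(_d) -> {'a': 1, 'b': 2}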
def annotate_rows_db(self, rel, *names):
    """Add annotations from datasets specified by name.

    List datasets with :meth:`.available_databases`. An interactive query
    builder is available in the
    `Hail Annotation Database documentation </docs/0.2/annotation_database_ui.html>`_.

    Examples
    --------
    Annotate a matrix table with the `gnomad_lof_metrics`:

    >>> db = hl.experimental.DB()
    >>> mt = db.annotate_rows_db(mt, 'gnomad_lof_metrics')  # doctest: +SKIP

    Annotate a table with `clinvar_gene_summary`, `CADD`, and `DANN`:

    >>> db = hl.experimental.DB()
    >>> mt = db.annotate_rows_db(mt, 'clinvar_gene_summary', 'CADD', 'DANN')  # doctest: +SKIP

    Notes
    -----
    If a dataset is gene-keyed, the annotation will be a dictionary mapping
    from gene name to the annotation value. There will be one entry for each
    gene overlapping the given locus.

    If a dataset does not have unique rows for each key (consider the
    `gencode` genes, which may overlap; and `clinvar_variant_summary`, which
    contains many overlapping multiple nucleotide variants), then the result
    will be an array of annotation values, one for each row.

    Parameters
    ----------
    rel : :class:`.MatrixTable` or :class:`.Table`
        The relational object to which to add annotations.
    names : varargs of :obj:`str`
        The names of the datasets with which to annotate `rel`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        The original dataset with new annotations added.
    """
    rel = self._row_lens(rel)
    if len(set(names)) != len(names):
        raise ValueError(
            f'cannot annotate same dataset twice, please remove duplicates from: {names}')
    datasets = [self.dataset_by_name(name) for name in names]
    if any(dataset.is_gene_keyed() for dataset in datasets):
        gene_field, rel = self._annotate_gene_name(rel)
    else:
        gene_field = None
    for dataset in datasets:
        if dataset.is_gene_keyed():
            genes = rel.select(gene_field).explode(gene_field)
            genes = genes.annotate(
                **{dataset.name: dataset.index_compatible_version(genes[gene_field])})
            genes = genes.group_by(*genes.key) \
                .aggregate(**{
                    dataset.name: hl.dict(
                        hl.agg.filter(hl.is_defined(genes[dataset.name]),
                                      hl.agg.collect((genes[gene_field],
                                                      genes[dataset.name]))))})
            rel = rel.annotate(**{dataset.name: genes.index(rel.key)[dataset.name]})
        else:
            indexed_value = dataset.index_compatible_version(rel.key)
            if isinstance(indexed_value.dtype, hl.tstruct) and len(indexed_value.dtype) == 0:
                indexed_value = hl.is_defined(indexed_value)
            rel = rel.annotate(**{dataset.name: indexed_value})
    if gene_field:
        rel = rel.drop(gene_field)
    return rel.unlens()
def main(args):
    # Init Hail with hg38 genome build as default
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=vcf_path, force_bgz=args.force_bgz)

    # getting annotated VEP field names from the VCF header
    vep_fields = get_vep_fields(vcf_path=vcf_path, vep_csq_field=args.csq_field)

    if args.exclude_multi_allelic:
        # TODO: This option should skip the split_multi step...
        # Filter out multi-allelic variants. Keep only bi-allelic
        mt = filter_biallelic(mt)

    # split multi-allelic variants
    mt = hl.split_multi_hts(mt)

    # flatten nested structure (e.g. 'info') and get a HailTable with all row fields
    tb_csq = (mt.rows().flatten().key_by('locus', 'alleles'))

    # rename info[CSQ] field to 'csq_array'.
    # Simpler field names are easier to parse later...
    tb_csq = (tb_csq.rename({'info.' + args.csq_field: 'csq_array'}))

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # The transcript(s) are represented as a dict<k,v>, where the keys are the field names
    # extracted from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_array=tb_csq.csq_array.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching with the allele index.
    # It requires having the flag "ALLELE_NUM" annotated by VEP.
    # Applied only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    tb_csq = (tb_csq.annotate(csq_array=hl.cond(
        tb_csq.was_split,
        tb_csq.csq_array.filter(
            lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.a_index)),
        tb_csq.csq_array)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_array')

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx')

    # Parse the "Consequence" field. Keep only the most severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.transmute(Consequence=tb_csq.Consequence.split('&')[0]))

    # print fields overview
    tb_csq.describe()

    # drop unnecessary fields
    tb_csq = (tb_csq.drop('csq_array', 'tx'))

    # write table as HailTable to disk
    (tb_csq.write(output=args.tb_output_path))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(args.tb_output_path + '.tsv.bgz'))

    # Stop Hail
    hl.stop()
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("SCHEMA", "gene_results_path"),
        delimiter="\t",
        missing="NA",
        types={
            "Gene ID": hl.tstr,
            "Gene Symbol": hl.tstr,
            "Gene Name": hl.tstr,
            "Case PTV": hl.tint,
            "Ctrl PTV": hl.tint,
            "Case mis3": hl.tint,
            "Ctrl mis3": hl.tint,
            "Case mis2": hl.tint,
            "Ctrl mis2": hl.tint,
            "P ca/co (Class 1)": hl.tfloat,
            "P ca/co (Class 2)": hl.tfloat,
            "P ca/co (comb)": hl.tfloat,
            "De novo PTV": hl.tint,
            "De novo mis3": hl.tint,
            "De novo mis2": hl.tint,
            "P de novo": hl.tfloat,
            "P meta": hl.tfloat,
            "Q meta": hl.tfloat,
            "OR (PTV)": hl.tstr,
            "OR (Class I)": hl.tstr,
            "OR (Class II)": hl.tstr,
        },
    )

    # Parse upper and lower bounds out of odds ratio columns
    def _parse_odds_ratio(field_name):
        return hl.rbind(
            ds[field_name].split(" ", n=2),
            lambda parts: hl.rbind(
                parts[0],
                parts[1][1:-1].split("-", 2),
                lambda value, bounds: hl.struct(
                    **{
                        field_name: hl.float(value),
                        field_name + " lower bound": hl.float(bounds[0]),
                        field_name + " upper bound": hl.float(bounds[1]),
                    }),
            ),
        )

    ds = ds.transmute(**_parse_odds_ratio("OR (PTV)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class I)"))
    ds = ds.transmute(**_parse_odds_ratio("OR (Class II)"))

    ds = ds.drop("Gene Symbol", "Gene Name")
    ds = ds.rename({"Gene ID": "gene_id"})
    ds = ds.key_by("gene_id")

    ds = ds.select(group_results=hl.dict([(
        "meta",
        hl.struct(**{field: ds[field] for field in ds.row_value.dtype.fields}))]))

    return ds
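# The odds-ratio parser above in miniature (assumes `import hail as hl`; the
# literal "1.32 (0.89-1.96)" is illustrative of the column format being parsed):
_parts = hl.str("1.32 (0.89-1.96)").split(" ", n=2)
_bounds = _parts[1][1:-1].split("-", 2)
_or = hl.struct(value=hl.float(_parts[0]),
                lower=hl.float(_bounds[0]),
                upper=hl.float(_bounds[1]))
# hl.eval(_or) -> Struct(value=1.32, lower=0.89, upper=1.96)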
import hail as hl

from hail_scripts.v02.utils.hail_utils import import_vcf

CLINVAR_FTP_PATH = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh{genome_version}/clinvar.vcf.gz"
CLINVAR_HT_PATH = "gs://seqr-reference-data/GRCh{genome_version}/clinvar/clinvar.GRCh{genome_version}.ht"

CLINVAR_GOLD_STARS_LOOKUP = hl.dict({
    "no_interpretation_for_the_single_variant": 0,
    "no_assertion_provided": 0,
    "no_assertion_criteria_provided": 0,
    "criteria_provided,_single_submitter": 1,
    "criteria_provided,_conflicting_interpretations": 1,
    "criteria_provided,_multiple_submitters,_no_conflicts": 2,
    "reviewed_by_expert_panel": 3,
    "practice_guideline": 4,
})


def download_and_import_latest_clinvar_vcf(genome_version: str) -> hl.MatrixTable:
    """Downloads the latest clinvar VCF from the NCBI FTP server, imports it to a MT and returns that.

    Args:
def combine(ts):
    def merge_alleles(alleles):
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], ''))
            .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r: hl.array([ref]).extend(
                            al[1:].map(
                                lambda a: hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at: hl.cond(
                                        (_allele_ints['SNP'] == at)
                                        | (_allele_ints['Insertion'] == at)
                                        | (_allele_ints['Deletion'] == at)
                                        | (_allele_ints['MNP'] == at)
                                        | (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal: hl.struct(
                    globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl: hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles: hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i:
                            hl.cond(hl.is_missing(row.data[i].__entries),
                                    hl.range(0, hl.len(gbl.g[i].__cols))
                                    .map(lambda _: hl.null(row.data[i].__entries.dtype.element_type)),
                                    hl.bind(
                                        lambda old_to_new: row.data[i].__entries.map(
                                            lambda e: renumber_entry(e, old_to_new)),
                                        hl.range(0, hl.len(alleles.local[i])).map(
                                            lambda j: combined_allele_index[alleles.local[i][j]])))),
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(TableMapRows(ts._tir, Apply(merge_function._name,
                                           TopLevelReference('row'),
                                           TopLevelReference('global'))))
    return ts.transmute_globals(__cols=hl.flatten(ts.g.map(lambda g: g.__cols)))