def phase_diploid_proband(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a diploid proband
    (autosomes, PAR regions of sex chromosomes, or non-PAR regions of a female proband)

    :param LocusExpression locus: Locus in the trio MatrixTable
    :param ArrayExpression alleles: Alleles in the trio MatrixTable
    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_v = proband_call.one_hot_alleles(alleles)
    father_v = hl.cond(
        locus.in_x_nonpar() | locus.in_y_nonpar(),
        hl.or_missing(father_call.is_haploid(), hl.array([father_call.one_hot_alleles(alleles)])),
        call_to_one_hot_alleles_array(father_call, alleles)
    )
    mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

    combinations = hl.flatmap(
        lambda f:
        hl.zip_with_index(mother_v)
        .filter(lambda m: m[1] + f[1] == proband_v)
        .map(lambda m: hl.struct(m=m[0], f=f[0])),
        hl.zip_with_index(father_v)
    )

    return (
        hl.or_missing(
            hl.is_defined(combinations) & (hl.len(combinations) == 1),
            hl.array([
                hl.call(father_call[combinations[0].f], mother_call[combinations[0].m], phased=True),
                hl.cond(father_call.is_haploid(),
                        hl.call(father_call[0], phased=True),
                        phase_parent_call(father_call, combinations[0].f)),
                phase_parent_call(mother_call, combinations[0].m)
            ])
        )
    )
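# Hedged usage sketch (not from the original source): the names `trio_mt`,
# `proband_entry`, `father_entry`, and `mother_entry` assume a MatrixTable
# produced by hl.trio_matrix(); the helpers call_to_one_hot_alleles_array
# and phase_parent_call are assumed to be defined alongside this function.
def _example_phase_diploid(trio_mt: hl.MatrixTable) -> hl.MatrixTable:
    """Annotate each trio entry with phased [proband, father, mother] calls."""
    return trio_mt.annotate_entries(
        phased_calls=phase_diploid_proband(
            trio_mt.locus,
            trio_mt.alleles,
            trio_mt.proband_entry.GT,
            trio_mt.father_entry.GT,
            trio_mt.mother_entry.GT,
        )
    )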
def with_local_a_index(local_a_index):
    new_pl = hl.or_missing(
        hl.is_defined(old_entry.LPL),
        hl.or_missing(
            hl.is_defined(local_a_index),
            hl.range(0, 3).map(lambda i: hl.min(
                hl.range(0, hl.triangle(hl.len(old_entry.LA)))
                .filter(lambda j: hl.downcode(hl.unphased_diploid_gt_index_call(j), local_a_index)
                        == hl.unphased_diploid_gt_index_call(i))
                .map(lambda idx: old_entry.LPL[idx])))))
    fields = set(old_entry.keys())

    def with_pl(pl):
        new_exprs = {}
        dropped_fields = ['LA']
        if 'LGT' in fields:
            new_exprs['GT'] = hl.downcode(old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LGT')
        if 'LPGT' in fields:
            new_exprs['PGT'] = hl.downcode(old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
            dropped_fields.append('LPGT')
        if 'LAD' in fields:
            new_exprs['AD'] = hl.or_missing(
                hl.is_defined(old_entry.LAD),
                [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
            dropped_fields.append('LAD')
        if 'LPL' in fields:
            new_exprs['PL'] = pl
            if 'GQ' in fields:
                new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
            dropped_fields.append('LPL')
        return hl.cond(hl.len(ds.alleles) == 1,
                       old_entry.annotate(**{f[1:]: old_entry[f]
                                             for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                             if f in fields}).drop(*dropped_fields),
                       old_entry.annotate(**new_exprs).drop(*dropped_fields))

    if 'LPL' in fields:
        return hl.bind(with_pl, new_pl)
    else:
        return with_pl(None)
def test(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                        f=hl.tarray(hl.tint32),
                        g=hl.tarray(hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                        i=hl.tbool,
                        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
             'e': "hello", 'f': [1, 2, 3],
             'g': [hl.Struct(x=1, y=5, z='banana')],
             'h': hl.Struct(a=5, b=3, c='winter'),
             'i': True,
             'j': hl.Struct(x=3, y=2, z='summer')}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(kt.annotate(
        chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
        ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        dpois=hl.dpois(4, kt.a),
        drop=kt.h.drop('b', 'c'),
        exp=hl.exp(kt.c),
        fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(kt.g, 'z'),
        is_defined=hl.is_defined(kt.i),
        is_missing=hl.is_missing(kt.i),
        is_nan=hl.is_nan(hl.float64(kt.a)),
        json=hl.json(kt.g),
        log=hl.log(kt.a, kt.b),
        log10=hl.log10(kt.c),
        or_else=hl.or_else(kt.a, 5),
        or_missing=hl.or_missing(kt.i, kt.j),
        pchisqtail=hl.pchisqtail(kt.a, kt.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** kt.b,
        ppois=hl.ppois(kt.a, kt.b),
        qchisqtail=hl.qchisqtail(kt.a, kt.b),
        range=hl.range(0, 5, kt.b),
        rnorm=hl.rand_norm(0.0, kt.b),
        rpois=hl.rand_pois(kt.a),
        runif=hl.rand_unif(kt.b, kt.a),
        select=kt.h.select('c', 'b'),
        sqrt=hl.sqrt(kt.a),
        to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        where=hl.cond(kt.i, 5, 10)
    ).take(1)[0])
def add_rank(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None,
) -> hl.Table:
    """
    Adds a rank based on the `score_expr`. Ranks are added for SNVs and indels separately.

    If one or more `subrank_expr` are provided, then a subrank is added based on all sites for which the boolean expression is true.

    In addition, variant counts (SNVs and indels separately) are added as a global (`rank_variant_counts`).

    :param ht: Input Hail Table containing variants (with QC annotations) to be ranked
    :param score_expr: The Table annotation by which ranking should be scored
    :param subrank_expr: Any subranking to be added, in the form name_of_subrank: subrank_filtering_expr
    :return: Table with rankings added
    """
    key = ht.key
    if subrank_expr is None:
        subrank_expr = {}

    temp_expr = {"_score": score_expr}
    temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()})
    rank_ht = ht.select(**temp_expr, is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1]))

    rank_ht = rank_ht.key_by("_score").persist()
    scan_expr = {
        "rank": hl.if_else(
            rank_ht.is_snv,
            hl.scan.count_where(rank_ht.is_snv),
            hl.scan.count_where(~rank_ht.is_snv),
        )
    }
    scan_expr.update(
        {
            name: hl.or_missing(
                rank_ht[f"_{name}"],
                hl.if_else(
                    rank_ht.is_snv,
                    hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
                    hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
                ),
            )
            for name in subrank_expr
        }
    )
    rank_ht = rank_ht.annotate(**scan_expr)

    rank_ht = rank_ht.key_by(*key).persist()
    rank_ht = rank_ht.select(*scan_expr.keys())

    ht = ht.annotate(**rank_ht[key])
    return ht
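# Hedged usage sketch (the fields `AS_VQSLOD` and `was_split` are hypothetical
# annotations, not from the original source): rank variants by a score, with a
# subrank restricted to bi-allelic sites.
def _example_add_rank(ht: hl.Table) -> hl.Table:
    return add_rank(
        ht,
        score_expr=-ht.AS_VQSLOD,  # negate so higher scores rank first
        subrank_expr={"biallelic_rank": ~ht.was_split},
    )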
def compute_last_ref_block_end(mt: hl.MatrixTable) -> hl.Table:
    """
    Compute the genomic position of the most upstream reference block overlapping each row on a sparse MT.

    Note that since reference blocks do not extend beyond contig boundaries, only the position is kept.

    This function returns a Table with that annotation (`last_END_position`).

    :param mt: Input MatrixTable
    :return: Output Table with `last_END_position` annotation
    """
    mt = mt.select_entries("END")

    # Localize entries, so that they can be viewed as an array and scanned over using hl.scan.array_agg
    ht = mt._localize_entries("__entries", "__cols")

    # Compute the position by using hl.scan._prev_nonnull.
    # This was inspired by hl.experimental.densify.
    # _prev_nonnull is an aggregator that keeps the previous record in memory
    # and updates it with the given value at the row if it's not null (missing).
    # The following code computes the following annotation for each row:
    # 1. Keep a scan of the entries using _prev_nonnull, keeping the start (ht.locus) and end (entry.END) of each ref block (1.1)
    # 2. For the current row locus, record the start of the block that starts the furthest away,
    #    that is the minimum position in the current scan for any block that overlaps the current locus (2.1)
    ht = ht.select(
        last_END_position=hl.or_else(
            hl.min(  # 2. For the current row locus, record the start of the block that starts the furthest away
                hl.scan.array_agg(
                    lambda entry: hl.scan._prev_nonnull(  # 1. Keep a scan of the entries using _prev_nonnull
                        hl.or_missing(
                            hl.is_defined(entry.END),  # Update the scan whenever a new ref block is encountered
                            hl.tuple([  # 1.1 keep the start (ht.locus) and end (entry.END) of each ref block
                                ht.locus,
                                entry.END,
                            ]),
                        )
                    ),
                    ht.__entries,
                ).map(
                    lambda x: hl.or_missing(  # 2.1 get the start position of blocks that overlap the current locus
                        (x[1] >= ht.locus.position) & (x[0].contig == ht.locus.contig),
                        x[0].position,
                    )
                )
            ),
            ht.locus.position,
        )
    )
    return ht.select_globals().key_by("locus")
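# Hedged usage sketch (assumes `sparse_mt` is a sparse MatrixTable with an
# `END` entry field, e.g. as produced by the gVCF combiner; names are
# illustrative, not from the original source):
def _example_last_ref_block_end(sparse_mt: hl.MatrixTable) -> hl.MatrixTable:
    last_end_ht = compute_last_ref_block_end(sparse_mt)
    return sparse_mt.annotate_rows(
        last_END_position=last_end_ht[sparse_mt.locus].last_END_position)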
def import_vcf(
        vcf_path: str,
        genome_version: str,
        min_partitions: int = None,
        force_bgz: bool = True,
        drop_samples: bool = False,
        skip_invalid_loci: bool = False,
        split_multi_alleles: bool = True):
    """Import vcf and return MatrixTable.

    :param str vcf_path: path of the VCF file to import
    :param str genome_version: "37" or "38"
    :param int min_partitions: min partitions
    :param bool force_bgz: read .gz as a bgzipped file
    :param bool drop_samples: if True, discard genotype info
    :param bool skip_invalid_loci: if True, skip loci that are not consistent with the reference_genome
    :param bool split_multi_alleles: if True, split multi-allelic variants into bi-allelic rows
    """

    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome_version: {genome_version}")

    logger.info(f"\n==> import vcf: {vcf_path}")

    # add (or remove) "chr" prefix from vcf chroms so they match the reference
    ref = hl.get_reference(f"GRCh{genome_version}")
    contig_recoding = {
        **{ref_contig.replace("chr", ""): ref_contig for ref_contig in ref.contigs if "chr" in ref_contig},
        **{f"chr{ref_contig}": ref_contig for ref_contig in ref.contigs if "chr" not in ref_contig}}

    mt = hl.import_vcf(
        vcf_path,
        reference_genome=f"GRCh{genome_version}",
        contig_recoding=contig_recoding,
        min_partitions=min_partitions,
        force_bgz=force_bgz,
        drop_samples=drop_samples,
        skip_invalid_loci=skip_invalid_loci)

    mt = mt.annotate_globals(sourceFilePath=vcf_path, genomeVersion=genome_version)

    mt = mt.annotate_rows(
        original_alt_alleles=hl.or_missing(hl.len(mt.alleles) > 2, get_expr_for_variant_ids(mt.locus, mt.alleles))
    )

    if split_multi_alleles:
        mt = hl.split_multi_hts(mt)
        mt = mt.key_rows_by(**hl.min_rep(mt.locus, mt.alleles))

    return mt
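# Hedged usage sketch (the bucket path below is hypothetical, not from the
# original source):
# mt = import_vcf("gs://my-bucket/cohort.vcf.bgz", genome_version="38",
#                 min_partitions=500)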
def phase_haploid_proband_x_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """

    transmitted_allele = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]])).find(lambda m: m[1] == proband_call[0])
    return hl.or_missing(
        hl.is_defined(transmitted_allele),
        hl.array([
            hl.call(proband_call[0], phased=True),
            hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True)),
            phase_parent_call(mother_call, transmitted_allele[0])
        ])
    )
def densify(sparse_mt):
    """Convert sparse MatrixTable to a dense one.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.
    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(
            sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError(
            "first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError(
            "'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    contigs = sparse_mt.locus.dtype.reference_genome.contigs
    contig_idx_map = hl.literal({contigs[i]: i for i in range(len(contigs))},
                                'dict<str, int32>')
    mt = sparse_mt.annotate_rows(
        __contig_idx=contig_idx_map[sparse_mt.locus.contig])
    mt = mt.annotate_entries(__contig=mt.__contig_idx)

    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan._prev_nonnull(
                hl.or_missing(hl.is_defined(entry.END), entry)),
            t.__entries),
        lambda prev_entries: hl.map(
            lambda i: hl.rbind(
                prev_entries[i], t.__entries[i],
                lambda prev_entry, entry: hl.cond(
                    (~hl.is_defined(entry)
                     & (prev_entry.END >= t.locus.position)
                     & (prev_entry.__contig == t.__contig_idx)),
                    prev_entry,
                    entry)),
            hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig_idx', '__contig', 'END')
    return mt
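# Hedged usage sketch (the path is hypothetical; assumes a sparse MatrixTable
# written by the gVCF combiner, with `locus` as first row key and an `END`
# entry field):
# sparse_mt = hl.read_matrix_table("gs://my-bucket/cohort.sparse.mt")
# dense_mt = densify(sparse_mt)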
def test_validate():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_chr22_5_samples.vds'))
    vds.validate()

    with pytest.raises(ValueError):
        hl.vds.VariantDataset(
            vds.reference_data.annotate_rows(arr=[0, 1]).explode_rows('arr'),
            vds.variant_data).validate()

    with pytest.raises(ValueError):
        hl.vds.VariantDataset(
            vds.reference_data.annotate_entries(
                END=hl.or_missing(vds.reference_data.locus.position % 2 == 0,
                                  vds.reference_data.END)),
            vds.variant_data).validate()
def filter(self, mt):
    row_filter = mt[self._row_filter].filters if self._row_filter else mt.exclude_row
    col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col

    mt = mt.annotate_rows(
        cr=hl.or_missing(
            row_filter == False,
            hl.agg.group_by(
                mt.is_case,
                hl.agg.filter(col_filter == False, variant_qc_aggregator(mt).call_rate))))
    mt = mt.annotate_rows(diff=hl.abs(mt.cr[False] - mt.cr[True]))
    mt = mt.annotate_rows(**{
        'cr_diff': hl.struct(
            filters=hl.agg.any((mt.diff > self._cr_thresh)
                               & (mt[self._initial_row_filter].filters == False)))})

    return mt
def get_alt_count(locus, gt, is_female):
    """
    Helper method to calculate alt allele count with sex info if present
    """
    if is_female is None:
        return hl.or_missing(locus.in_autosome(), gt.n_alt_alleles())
    return (
        hl.case()
        .when(locus.in_autosome_or_par(), gt.n_alt_alleles())
        .when(
            ~is_female & (locus.in_x_nonpar() | locus.in_y_nonpar()),
            hl.min(1, gt.n_alt_alleles()),
        )
        .when(is_female & locus.in_y_nonpar(), 0)
        .default(0)
    )
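# Hedged usage sketch (assumes a boolean `is_female` column annotation on
# `mt`; names are illustrative, not from the original source):
def _example_alt_counts(mt: hl.MatrixTable) -> hl.Table:
    """Aggregate a sex-aware alt allele count per variant."""
    return mt.select_rows(
        AC=hl.agg.sum(get_alt_count(mt.locus, mt.GT, mt.is_female))
    ).rows()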
def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.downcode(
            old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(
            old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return (hl.case()
            .when(hl.len(ds.alleles) == 1,
                  old_entry.annotate(**{f[1:]: old_entry[f]
                                        for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                        if f in fields}).drop(*dropped_fields))
            .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                  old_entry.annotate(**{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                                        for f, e in new_exprs.items()}).drop(*dropped_fields))
            .default(old_entry.annotate(**new_exprs).drop(*dropped_fields)))
def densify(sparse_mt):
    """Convert sparse MatrixTable to a dense one.

    Parameters
    ----------
    sparse_mt : :class:`.MatrixTable`
        Sparse MatrixTable to densify.  The first row key field must
        be named ``locus`` and have type ``locus``.  Must have an
        ``END`` entry field of type ``int32``.

    Returns
    -------
    :class:`.MatrixTable`
        The densified MatrixTable.  The ``END`` entry field is dropped.
    """
    if list(sparse_mt.row_key)[0] != 'locus' or not isinstance(sparse_mt.locus.dtype, hl.tlocus):
        raise ValueError("first row key field must be named 'locus' and have type 'locus'")
    if 'END' not in sparse_mt.entry or sparse_mt.END.dtype != hl.tint32:
        raise ValueError("'densify' requires 'END' entry field of type 'int32'")
    col_key_fields = list(sparse_mt.col_key)

    mt = sparse_mt.annotate_entries(__contig=sparse_mt.locus.contig)
    t = mt._localize_entries('__entries', '__cols')
    t = t.annotate(
        __entries=hl.rbind(
            hl.scan.array_agg(
                lambda entry: hl.scan._prev_nonnull(hl.or_missing(hl.is_defined(entry.END), entry)),
                t.__entries),
            lambda prev_entries: hl.map(
                lambda i: hl.rbind(
                    prev_entries[i], t.__entries[i],
                    lambda prev_entry, entry: hl.cond(
                        (~hl.is_defined(entry)
                         & (prev_entry.END >= t.locus.position)
                         & (prev_entry.__contig == t.locus.contig)),
                        prev_entry,
                        entry)),
                hl.range(0, hl.len(t.__entries)))))
    mt = t._unlocalize_entries('__entries', '__cols', col_key_fields)
    mt = mt.drop('__contig', 'END')
    return mt
def get_phesant_reassignments(phesant_summary):
    """
    Helper function for add_coding_information.
    Parse PHESANT phenotype description data to get any coding reassignments.

    :param Table phesant_summary: Summary hail Table with PHESANT metadata
    :return: Table with reassignments
    :rtype: Table
    """
    phesant_summary = phesant_summary.annotate(
        reassign=phesant_summary['PHESANT.reassignments'].split(' ')[1]
        .split(r'\|').map(lambda x: x.split('=')))
    ht = phesant_summary.explode('reassign')
    ht = ht.filter(ht.reassign[1] != 'NA')
    ht = ht.transmute(reassign_from=ht.reassign[0], reassign_to=ht.reassign[1])
    ht = ht.key_by(
        pheno=ht.FieldID.split('_')[0],
        coding=hl.or_missing(hl.len(ht.FieldID.split('_')) > 1, ht.FieldID.split('_')[1])
    )
    return ht.filter(ht.reassign_to == ht.coding)
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.ArrayExpression:
    """
    Calculates popmax (adds an additional entry into freq with popmax: pop)

    :param ArrayExpression freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Frequency data with annotated popmax
    :rtype: ArrayExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj')
        & pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0),
        freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
def get_adj_missing_mt(data_type: str, pbt: bool) -> hl.MatrixTable:
    mt = get_gnomad_data(data_type).select_cols() if not pbt else hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
    mt = mt.select_rows()
    mt = mt.select_entries(
        GT=hl.or_missing(mt.GT.is_non_ref(), mt.GT),
        missing=hl.is_missing(mt.GT),
        adj=mt.adj
    ).select_cols().select_rows()

    if pbt:
        mt = mt.key_cols_by('s', trio_id=mt.source_trio.id)
        mt = extract_pbt_probands(mt, data_type)
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))
        mt = mt.key_cols_by(s=mt.s, trio_id=mt.source_trio.id)
    else:
        meta = get_gnomad_meta('exomes')
        mt = mt.filter_cols(meta[mt.col_key].high_quality)

    return mt
def all_and_leave_one_out(x, pop_array, all_f=hl.sum, loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each leave-one-out population.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function applied over all populations. It takes the input array and returns a new value
    :param loo_f: Function applied for each leave-one-out population. It takes the index of the left-out population and the input array, and returns a new value
    :return: Array of new values for all populations and for each leave-one-out population
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x), hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
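# Hedged worked example (values are illustrative, not from the original
# source): with per-population ACs [10, 20, missing] and the default sum
# functions, the result is [30, 20, 10, 30] -- the overall total, then the
# total leaving out each population in turn. This assumes hl.sum skips
# missing elements (its default) and relies on the hl.or_else in loo_f.
def _example_loo_ac():
    acs = hl.literal([10, 20, None], dtype='array<int32>')
    pops = hl.literal(['afr', 'eas', 'nfe'])
    return hl.eval(all_and_leave_one_out(acs, pops))  # [30, 20, 10, 30]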
def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase)

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    return hl.or_missing(
        proband_call.is_haploid() & father_call.is_haploid() & (father_call[0] == proband_call[0]),
        hl.array([
            hl.call(proband_call[0], phased=True),
            hl.call(father_call[0], phased=True),
            hl.null(hl.tcall)
        ])
    )
def main(args):
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636,
        #  21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        scores.write(exome_callrate_scores_ht_path)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        scores = scores.transmute(scores=[
            scores[ann] for ann in sorted(
                [ann for ann in scores.row if ann.startswith("PC")],
                key=lambda x: int(x[2:])
            )
        ])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
def pop_max_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    pops_to_exclude: Optional[Set[str]] = None,
) -> hl.expr.StructExpression:
    """
    Creates an expression containing popmax: the frequency information about the population
    that has the highest AF from the populations provided in `freq_meta`,
    excluding those specified in `pops_to_exclude`.
    Only frequencies from adj populations are considered.

    This resulting struct contains the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - pop: str

    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
    :param pops_to_exclude: Set of populations to skip for popmax calculation
    :return: Popmax struct
    """
    # Use an explicit dtype so the default (None) exclusion set can still be lifted to a literal
    _pops_to_exclude = hl.literal(
        pops_to_exclude if pops_to_exclude else set(), dtype=hl.tset(hl.tstr))

    # pylint: disable=invalid-unary-operand-type
    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (hl.set(freq_meta[i].keys()) == {"group", "pop"})
        & (freq_meta[i]["group"] == "adj")
        & (~_pops_to_exclude.contains(freq_meta[i]["pop"]))
    )
    freq_filtered = popmax_freq_indices.map(
        lambda i: freq[i].annotate(pop=freq_meta[i]["pop"])
    ).filter(lambda f: f.AC > 0)

    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)

    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])
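# Hedged usage sketch (assumes `ht` carries gnomAD-style `freq` / `freq_meta`
# annotations; names are illustrative, not from the original source):
def _example_popmax(ht: hl.Table) -> hl.Table:
    return ht.annotate(
        popmax=pop_max_expr(ht.freq, ht.freq_meta, pops_to_exclude={"oth"}))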
def set_female_y_metrics_to_na(
    t: Union[hl.Table, hl.MatrixTable],
) -> Dict[str, hl.expr.Int32Expression]:
    """
    Set AC, AN, and nhomalt chrY variant annotations for females to NA (instead of 0).

    :param t: Table/MatrixTable containing female variant annotations.
    :return: Dictionary with reset annotations
    """
    metrics = list(t.row.info)
    female_metrics = [x for x in metrics if "_female" in x or "_XX" in x]

    female_metrics_dict = {}
    for metric in female_metrics:
        female_metrics_dict.update(
            {
                f"{metric}": hl.or_missing(
                    # Keep the value only off chrY; anywhere on chrY (PAR or
                    # non-PAR) the metric becomes missing. (The original
                    # `~in_y_nonpar() | ~in_y_par()` was always true.)
                    ~t.locus.in_y_nonpar() & ~t.locus.in_y_par(),
                    t.info[f"{metric}"],
                )
            }
        )
    return female_metrics_dict
def load_final_sumstats_mt(filter_phenos: bool = True,
                           filter_variants: bool = True,
                           filter_sumstats: bool = True,
                           separate_columns_by_pop: bool = True,
                           annotate_with_nearest_gene: bool = True):
    mt = hl.read_matrix_table(get_variant_results_path('full', 'mt'))
    variant_qual_ht = hl.read_table(get_variant_results_qc_path())
    mt = mt.annotate_rows(**variant_qual_ht[mt.row_key])
    pheno_qual_ht = hl.read_table(
        get_analysis_data_path('lambda', 'lambdas', 'full', 'ht'))
    mt = mt.annotate_cols(**pheno_qual_ht[mt.col_key])

    if filter_phenos:
        keep_phenos = hl.zip_with_index(
            mt.pheno_data).filter(lambda x: filter_lambda_gc(x[1].lambda_gc))

        mt = mt.annotate_cols(pheno_indices=keep_phenos.map(lambda x: x[0]),
                              pheno_data=keep_phenos.map(lambda x: x[1]))
        mt = mt.annotate_entries(
            summary_stats=hl.zip_with_index(mt.summary_stats).filter(
                lambda x: mt.pheno_indices.contains(x[0])).map(lambda x: x[1]))
        mt = mt.filter_cols(hl.len(mt.pheno_data) > 0)

    if filter_sumstats:
        mt = mt.annotate_entries(summary_stats=mt.summary_stats.map(
            lambda x: hl.or_missing(~x.low_confidence, x)))
        mt = mt.filter_entries(
            ~mt.summary_stats.all(lambda x: hl.is_missing(x.Pvalue)))

    if filter_variants:
        mt = mt.filter_rows(mt.high_quality)

    if annotate_with_nearest_gene:
        mt = annotate_nearest_gene(mt)

    if separate_columns_by_pop:
        mt = separate_results_mt_by_pop(mt)

    return mt
def add_coding_information(mt: hl.MatrixTable, coding_ht: hl.Table, phesant_phenotype_info_path: str,
                           download_missing_codings: bool = False) -> hl.MatrixTable:
    """
    Add coding information from coding_ht as column annotations into mt

    :param MatrixTable mt: Input MT
    :param Table coding_ht: HT with coding information
    :param str phesant_phenotype_info_path: PHESANT phenotype metadata path
    :param bool download_missing_codings: Whether to download missing coding data
    :return: MT with coding information in column data
    :rtype: MatrixTable
    """
    mt = mt.annotate_cols(**coding_ht[(mt.coding_id, hl.str(mt.coding))])
    if download_missing_codings:
        get_missing_codings(mt.cols())
    phesant_summary = hl.import_table(phesant_phenotype_info_path, impute=True, missing='', key='FieldID')
    phesant_reassign = get_phesant_reassignments(phesant_summary)
    mt = mt.annotate_cols(recoding=hl.or_missing(
        hl.is_missing(mt.meaning),
        phesant_reassign[mt.col_key.select('phenocode', 'coding')].reassign_from
    ))
    return mt.annotate_cols(**hl.cond(
        hl.is_defined(mt.meaning),
        hl.struct(**{x: mt[x] for x in list(coding_ht.row_value)}),
        coding_ht[(mt.coding_id, hl.str(mt.recoding))]),
    )
def hemi_expr(
    locus: hl.expr.LocusExpression,
    sex_expr: hl.expr.StringExpression,
    gt: hl.expr.CallExpression,
    male_str: str = "XY",
) -> hl.expr.BooleanExpression:
    """
    Return whether genotypes are hemizygous.

    Returns a missing expression if the locus is not in the chrX/chrY non-PAR regions.

    :param locus: Input locus.
    :param sex_expr: Input StringExpression indicating whether sample is XX or XY.
    :param gt: Input genotype.
    :param male_str: String indicating whether sample is XY. Default is "XY".
    :return: BooleanExpression indicating whether genotypes are hemizygous.
    """
    return hl.or_missing(
        locus.in_x_nonpar() | locus.in_y_nonpar(),
        # Haploid genotypes have a single integer, so checking if
        # gt[0] is the alternate allele
        gt.is_haploid() & (sex_expr == male_str) & (gt[0] == 1),
    )
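# Hedged usage sketch (the `sex_karyotype` column annotation is illustrative,
# not from the original source):
def _example_hemi(mt: hl.MatrixTable) -> hl.MatrixTable:
    return mt.annotate_entries(
        is_hemi=hemi_expr(mt.locus, mt.sex_karyotype, mt.GT))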
def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.downcode(
            old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(
            old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return hl.cond(
        hl.len(ds.alleles) == 1,
        old_entry.annotate(**{f[1:]: old_entry[f]
                              for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                              if f in fields}).drop(*dropped_fields),
        old_entry.annotate(**new_exprs).drop(*dropped_fields))
def apply_filter_flags_expr(
    mt: hl.MatrixTable, data_type: str, metric_thresholds: dict
) -> hl.expr.SetExpression:
    """
    Annotates the MatrixTable with flags for elevated contamination and chimera rates, as well as low coverage and call rate

    :param MatrixTable mt: Input MatrixTable
    :param str data_type: 'WES' or 'WGS' for selecting coverage threshold
    :param dict metric_thresholds: Dictionary where each key is a metric and each value is its threshold
    :return: Set of sequencing metric flags
    :rtype: SetExpression
    """
    flags = {
        "callrate": mt.filtered_callrate < metric_thresholds["callrate_thres"],
        # TODO revisit current thresholds and rename once we have Kristen's script output
        "contamination": mt.PCT_CONTAMINATION > metric_thresholds["contam_thres"],
        "chimera": mt.AL_PCT_CHIMERAS > metric_thresholds["chimera_thres"],
    }
    if data_type == "WES":
        flags.update(
            {"coverage": mt.HS_PCT_TARGET_BASES_20X < metric_thresholds["wes_cov_thres"]}
        )
    else:
        flags.update(
            {"coverage": mt.WGS_MEAN_COVERAGE < metric_thresholds["wgs_cov_thres"]}
        )

    return hl.set(
        hl.filter(
            lambda x: hl.is_defined(x),
            [hl.or_missing(filter_expr, name) for name, filter_expr in flags.items()],
        )
    )
def load_activity_monitor_data(first_exposure_and_activity_monitor_data_path):
    ht = hl.import_table(first_exposure_and_activity_monitor_data_path,
                         delimiter=',', quote='"', missing='', impute=True,
                         key='eid')  # , min_partitions=500
    quality_fields = ['90015-0.0', '90016-0.0', '90017-0.0']
    qual_ht = ht.select(
        hq=hl.is_missing(ht['90002-0.0'])
        & hl.all(lambda x: x == 1, [ht[x] for x in quality_fields]))
    mt = filter_and_annotate_ukb_data(
        ht,
        lambda x, v: x.startswith('90') and x.endswith('-0.0')
        and v.dtype in {hl.tint32, hl.tfloat64})
    mt = mt.filter_cols(mt.ValueType == 'Continuous')
    mt = mt.annotate_rows(**qual_ht[mt.row_key])
    mt = mt.annotate_entries(
        value=hl.or_missing(hl.is_defined(mt.hq), mt.value))
    mt = mt.key_cols_by(trait_type='continuous',
                        phenocode=mt.phenocode,
                        pheno_sex='both_sexes',
                        coding=NULL_STR_KEY,
                        modifier=NULL_STR_KEY)
    return mt
def process_consequences(
    mt: Union[hl.MatrixTable, hl.Table],
    vep_root: str = "vep",
    penalize_flags: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Adds most_severe_consequence (worst consequence for a transcript) into [vep_root].transcript_consequences,
    and worst_csq_by_gene, any_lof into [vep_root]

    :param mt: Input MT
    :param vep_root: Root for vep annotation (probably vep)
    :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC
    :return: MT with better formatted consequences
    """
    csqs = hl.literal(CSQ_ORDER)
    csq_dict = hl.literal(dict(zip(CSQ_ORDER, range(len(CSQ_ORDER)))))

    def find_worst_transcript_consequence(
        tcl: hl.expr.ArrayExpression,
    ) -> hl.expr.StructExpression:
        """
        Gets the worst transcript consequence from an array of transcript consequences
        """
        flag_score = 500
        no_flag_score = flag_score * (1 + penalize_flags)

        def csq_score(tc):
            return csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)]

        tcl = tcl.map(
            lambda tc: tc.annotate(
                csq_score=hl.case(missing_false=True)
                .when((tc.lof == "HC") & (tc.lof_flags == ""), csq_score(tc) - no_flag_score)
                .when((tc.lof == "HC") & (tc.lof_flags != ""), csq_score(tc) - flag_score)
                .when(tc.lof == "OS", csq_score(tc) - 20)
                .when(tc.lof == "LC", csq_score(tc) - 10)
                .when(tc.polyphen_prediction == "probably_damaging", csq_score(tc) - 0.5)
                .when(tc.polyphen_prediction == "possibly_damaging", csq_score(tc) - 0.25)
                .when(tc.polyphen_prediction == "benign", csq_score(tc) - 0.1)
                .default(csq_score(tc))
            )
        )
        return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0])

    transcript_csqs = mt[vep_root].transcript_consequences.map(
        add_most_severe_consequence_to_consequence
    )

    gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol)
    worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values()
    sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score)

    canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1)
    gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol)
    worst_csq_gene_canonical = gene_canonical_dict.map_values(
        find_worst_transcript_consequence
    ).values()
    sorted_canonical_scores = hl.sorted(worst_csq_gene_canonical, key=lambda tc: tc.csq_score)

    vep_data = mt[vep_root].annotate(
        transcript_consequences=transcript_csqs,
        worst_consequence_term=csqs.find(
            lambda c: transcript_csqs.map(lambda csq: csq.most_severe_consequence).contains(c)
        ),
        worst_csq_by_gene=sorted_scores,
        worst_csq_for_variant=hl.or_missing(hl.len(sorted_scores) > 0, sorted_scores[0]),
        worst_csq_by_gene_canonical=sorted_canonical_scores,
        worst_csq_for_variant_canonical=hl.or_missing(
            hl.len(sorted_canonical_scores) > 0, sorted_canonical_scores[0]
        ),
    )

    return (
        mt.annotate_rows(**{vep_root: vep_data})
        if isinstance(mt, hl.MatrixTable)
        else mt.annotate(**{vep_root: vep_data})
    )
def vep_struct_to_csq(
    vep_expr: hl.expr.StructExpression, csq_fields: str = VEP_CSQ_FIELDS
) -> hl.expr.ArrayExpression:
    """
    Given a VEP Struct, returns an array of VEP VCF CSQ strings (one per consequence in the struct).

    The fields and their order will correspond to those passed in `csq_fields`, which corresponds
    to the VCF header that is required to interpret the VCF CSQ INFO field.

    Note that the order is flexible and that all fields that are in the default value are supported.
    These fields will be formatted in the same way that their VEP CSQ counterparts are.

    Other fields can also be added, as long as their names match those in the struct; their values
    will be the result of calling hl.str() on them, so they may differ from their usual VEP CSQ
    representation.

    :param vep_expr: The input VEP Struct
    :param csq_fields: The | delimited list of fields to include in the CSQ (in that order)
    :return: The corresponding CSQ strings
    """
    _csq_fields = [f.lower() for f in csq_fields.split("|")]

    def get_csq_from_struct(
        element: hl.expr.StructExpression, feature_type: str
    ) -> hl.expr.StringExpression:
        # Most fields are 1-1, just lowercase
        fields = dict(element)

        # Add general exceptions
        fields.update(
            {
                "allele": element.variant_allele,
                "consequence": hl.delimit(element.consequence_terms, delimiter="&"),
                "feature_type": feature_type,
                "feature": (
                    element.transcript_id
                    if "transcript_id" in element
                    else element.regulatory_feature_id
                    if "regulatory_feature_id" in element
                    else element.motif_feature_id
                    if "motif_feature_id" in element
                    else ""
                ),
                "variant_class": vep_expr.variant_class,
            }
        )

        # Add exception for transcripts
        if feature_type == "Transcript":
            fields.update(
                {
                    "canonical": hl.cond(element.canonical == 1, "YES", ""),
                    "ensp": element.protein_id,
                    "gene": element.gene_id,
                    "symbol": element.gene_symbol,
                    "symbol_source": element.gene_symbol_source,
                    "cdna_position": hl.str(element.cdna_start)
                    + hl.cond(
                        element.cdna_start == element.cdna_end,
                        "",
                        "-" + hl.str(element.cdna_end),
                    ),
                    "cds_position": hl.str(element.cds_start)
                    + hl.cond(
                        element.cds_start == element.cds_end,
                        "",
                        "-" + hl.str(element.cds_end),
                    ),
                    "protein_position": hl.str(element.protein_start)
                    + hl.cond(
                        element.protein_start == element.protein_end,
                        "",
                        "-" + hl.str(element.protein_end),
                    ),
                    "sift": element.sift_prediction
                    + "("
                    + hl.format("%.3f", element.sift_score)
                    + ")",
                    "polyphen": element.polyphen_prediction
                    + "("
                    + hl.format("%.3f", element.polyphen_score)
                    + ")",
                    "domains": hl.delimit(
                        element.domains.map(lambda d: d.db + ":" + d.name), "&"
                    ),
                }
            )
        elif feature_type == "MotifFeature":
            fields["motif_score_change"] = hl.format("%.3f", element.motif_score_change)

        return hl.delimit(
            [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields], "|"
        )

    csq = hl.empty_array(hl.tstr)
    for feature_field, feature_type in [
        ("transcript_consequences", "Transcript"),
        ("regulatory_feature_consequences", "RegulatoryFeature"),
        ("motif_feature_consequences", "MotifFeature"),
        ("intergenic_consequences", "Intergenic"),
    ]:
        csq = csq.extend(
            hl.or_else(
                vep_expr[feature_field].map(
                    lambda x: get_csq_from_struct(x, feature_type=feature_type)
                ),
                hl.empty_array(hl.tstr),
            )
        )

    return hl.or_missing(hl.len(csq) > 0, csq)
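# Hedged usage sketch (assumes `ht` has a `vep` row annotation produced by
# hl.vep; the `info` field name is illustrative, not from the original source):
def _example_csq(ht: hl.Table) -> hl.Table:
    return ht.annotate(info=hl.struct(CSQ=vep_struct_to_csq(ht.vep)))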
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as
    well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type
    :class:`.tinterval` constructed from the ``seqname``, ``start``, and
    ``end`` fields in the GTF file. This interval is inclusive of both the
    start and end positions in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be
    of type :class:`.tstruct` with fields ``seqname`` (type :class:`str`)
    and ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the
    reference genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x),
                                             ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
def prepare_mitochondrial_variants(path, mnvs_path=None):
    ds = hl.read_table(path)

    haplogroups = hl.eval(ds.globals.hap_order)

    ds = ds.annotate(
        hl_hist=ds.hl_hist.annotate(
            bin_edges=ds.hl_hist.bin_edges.map(lambda n: hl.float(hl.format("%.2f", n)))))

    filter_names = hl.dict({
        "artifact_prone_site": "Artifact-prone site",
        "indel_stack": "Indel stack",
        "npg": "No passing genotype"
    })

    ds = ds.select(
        # ID
        variant_id=variant_id(ds.locus, ds.alleles),
        reference_genome=ds.locus.dtype.reference_genome.name,
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
        rsid=ds.rsid,
        # Quality
        filters=ds.filters.map(lambda f: filter_names.get(f, f)),
        qual=ds.qual,
        genotype_quality_metrics=[
            hl.struct(name="Depth", alt=ds.dp_hist_alt, all=ds.dp_hist_all)
        ],
        genotype_quality_filters=[
            hl.struct(
                name="Base Quality",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.base_qual_hist),
            ),
            hl.struct(
                name="Contamination",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.contamination_hist),
            ),
            hl.struct(
                name="Heteroplasmy below 10%",
                filtered=hl.struct(
                    bin_edges=ds.hl_hist.bin_edges,
                    bin_freq=ds.heteroplasmy_below_10_percent_hist),
            ),
            hl.struct(
                name="Position",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.position_hist)),
            hl.struct(
                name="Strand Bias",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.strand_bias_hist),
            ),
            hl.struct(
                name="Weak Evidence",
                filtered=hl.struct(bin_edges=ds.hl_hist.bin_edges, bin_freq=ds.weak_evidence_hist),
            ),
        ],
        site_quality_metrics=[
            hl.struct(name="Mean Depth", value=nullify_nan(ds.dp_mean)),
            hl.struct(name="Mean MQ", value=nullify_nan(ds.mq_mean)),
            hl.struct(name="Mean TLOD", value=nullify_nan(ds.tlod_mean)),
        ],
        # Frequency
        an=ds.AN,
        ac_hom=ds.AC_hom,
        ac_het=ds.AC_het,
        excluded_ac=ds.excluded_AC,
        # Heteroplasmy
        common_low_heteroplasmy=ds.common_low_heteroplasmy,
        heteroplasmy_distribution=ds.hl_hist,
        max_heteroplasmy=ds.max_hl,
        # Populations
        populations=hl.sorted(
            hl.range(hl.len(ds.globals.pop_order)).map(
                lambda pop_index: hl.struct(
                    id=ds.globals.pop_order[pop_index],
                    an=ds.pop_AN[pop_index],
                    ac_het=ds.pop_AC_het[pop_index],
                    ac_hom=ds.pop_AC_hom[pop_index],
                    heteroplasmy_distribution=hl.struct(
                        bin_edges=ds.hl_hist.bin_edges,
                        bin_freq=ds.pop_hl_hist[pop_index],
                        n_smaller=0,
                        n_larger=0,
                    ),
                )),
            key=lambda pop: pop.id,
        ),
        # Haplogroups
        hapmax_af_hom=ds.hapmax_AF_hom,
        hapmax_af_het=ds.hapmax_AF_het,
        faf_hapmax_hom=ds.faf_hapmax_hom,
        haplogroup_defining=ds.hap_defining_variant,
        haplogroups=[
            hl.struct(
                id=haplogroup,
                an=ds.hap_AN[i],
                ac_het=ds.hap_AC_het[i],
                ac_hom=ds.hap_AC_hom[i],
                faf_hom=ds.hap_faf_hom[i],
                heteroplasmy_distribution=ds.hap_hl_hist[i],
            )
            for i, haplogroup in enumerate(haplogroups)
        ],
        # Other
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom),
        flags=hl.set([
            hl.or_missing(ds.common_low_heteroplasmy, "common_low_heteroplasmy")
        ]).filter(hl.is_defined),
        mitotip_score=ds.mitotip_score,
        mitotip_trna_prediction=ds.mitotip_trna_prediction,
        pon_ml_probability_of_pathogenicity=ds.pon_ml_probability_of_pathogenicity,
        pon_mt_trna_prediction=ds.pon_mt_trna_prediction,
        variant_collapsed=ds.variant_collapsed,
        vep=ds.vep,
    )

    if mnvs_path:
        mnvs = hl.import_table(mnvs_path,
                               types={"pos": hl.tint, "ref": hl.tstr, "alt": hl.tstr, "AC_hom_MNV": hl.tint})
        mnvs = mnvs.key_by(
            locus=hl.locus("chrM", mnvs.pos, reference_genome=ds.locus.dtype.reference_genome),
            alleles=[mnvs.ref, mnvs.alt],
        )
        ds = ds.annotate(ac_hom_mnv=hl.or_else(mnvs[ds.key].AC_hom_MNV, 0))
        ds = ds.annotate(
            flags=hl.if_else(ds.ac_hom_mnv > 0, ds.flags.add("mnv"), ds.flags))

    return ds
if pop == 'AFRICAN_AMERICAN':
    vds = vdsor.filter_cols((vdsor.race_ethnicity == pop))
if pop == 'HISPANIC':
    vds = vdsor.filter_cols((vdsor.race_ethnicity == 'HISPANIC')
                            | (vdsor.race_ethnicity == 'PUERTO_RICAN'))
# vds.cols().flatten().export(samples_file)

for disease in ['meta', 'cd']:
    if disease == 'meta':
        vds = hl.logistic_regression(
            test='wald',
            # Cases are 'uc' or 'cd', controls are 'control'; everything else
            # is dropped via the always-missing hl.or_missing(False, 1)
            y=hl.cond((vds.diagnosis_specific == 'uc') | (vds.diagnosis_specific == 'cd'), 1,
                      hl.cond(vds.diagnosis_specific == 'control', 0,
                              hl.or_missing(False, 1))),
            x=vds.GT.n_alt_alleles(),
            covariates=[vds.sex == 'male'],
            root='logreg_' + disease)  # , vds.PC1, vds.PC2, vds.PC3, vds.PC4, vds.PC5, vds.PC6, vds.PC7, vds.PC8, vds.PC9, vds.PC10
        vds = vds.annotate_rows(
            metahomref=hl.agg.count_where(((vds.diagnosis_specific == 'uc')
                                           | (vds.diagnosis_specific == 'cd')) & vds.GT.is_hom_ref()),
            metahet=hl.agg.count_where(((vds.diagnosis_specific == 'uc')
                                        | (vds.diagnosis_specific == 'cd')) & vds.GT.is_het()),
            metahomvar=hl.agg.count_where(((vds.diagnosis_specific == 'uc')
                                           | (vds.diagnosis_specific == 'cd')) & vds.GT.is_hom_var()),
            metamissing=hl.agg.count_where(((vds.diagnosis_specific == 'uc')
def fs_from_sb(
    sb: Union[hl.expr.ArrayNumericExpression, hl.expr.ArrayExpression],
    normalize: bool = True,
    min_cell_count: int = 200,
    min_count: int = 4,
    min_p_value: float = 1e-320,
) -> hl.expr.Float64Expression:
    """
    Computes `FS` (Fisher strand balance) annotation from the `SB` (strand balance table) field.

    `FS` is the phred-scaled value of the double-sided Fisher exact test on strand balance.

    Using default values will have the same behavior as the GATK implementation, that is:
    - If sum(counts) > 2 * `min_cell_count` (default to GATK value of 200), they are normalized
    - If sum(counts) < `min_count` (default to GATK value of 4), returns missing
    - Any p-value < `min_p_value` (default to GATK value of 1e-320) is truncated to that value

    In addition to the default GATK behavior, setting `normalize` to `False` will perform a
    chi-squared test for large counts (> `min_cell_count`) instead of normalizing the cell values.

    .. note::

        This function can either take
        - an array of length four containing the forward and reverse strands' counts of ref and alt alleles: [ref fwd, ref rev, alt fwd, alt rev]
        - a two dimensional array with arrays of length two, containing the counts: [[ref fwd, ref rev], [alt fwd, alt rev]]

    GATK code here: https://github.com/broadinstitute/gatk/blob/master/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FisherStrand.java

    :param sb: Count of ref/alt reads on each strand
    :param normalize: Whether to normalize counts if sum(counts) > min_cell_count (normalize=True), or to use a chi-squared test instead of the FET (normalize=False)
    :param min_cell_count: Maximum count for performing a FET
    :param min_count: Minimum total count to output FS (otherwise missing is returned)
    :return: FS value
    """
    if not isinstance(sb, hl.expr.ArrayNumericExpression):
        sb = hl.bind(lambda x: hl.flatten(x), sb)

    sb_sum = hl.bind(lambda x: hl.sum(x), sb)

    # Normalize table if counts get too large
    if normalize:
        fs_expr = hl.bind(
            lambda sb, sb_sum: hl.cond(
                sb_sum <= 2 * min_cell_count,
                sb,
                sb.map(lambda x: hl.int(x / (sb_sum / min_cell_count))),
            ),
            sb,
            sb_sum,
        )

        # FET
        fs_expr = to_phred(
            hl.max(
                hl.fisher_exact_test(
                    fs_expr[0], fs_expr[1], fs_expr[2], fs_expr[3]
                ).p_value,
                min_p_value,
            )
        )
    else:
        fs_expr = to_phred(
            hl.max(
                hl.contingency_table_test(
                    sb[0], sb[1], sb[2], sb[3], min_cell_count=min_cell_count
                ).p_value,
                min_p_value,
            )
        )

    # Return missing if counts <= `min_count`
    return hl.or_missing(
        sb_sum > min_count,
        hl.max(0, fs_expr),  # Needed to avoid -0.0 values
    )
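# Hedged usage sketch (assumes an `SB` row annotation holding
# [ref fwd, ref rev, alt fwd, alt rev] counts, and that `to_phred` is defined
# alongside this function; names are illustrative):
def _example_fs(ht: hl.Table) -> hl.Table:
    return ht.annotate(FS=fs_from_sb(ht.SB))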