def flatten_phased_ht(phased_ht: hl.Table) -> hl.Table:
    phased_ht = phased_ht.key_by()

    def flatten_phase_dict(
            expr: hl.expr.StructExpression) -> hl.expr.StructExpression:
        return hl.struct(
            raw=flatten_gt_counts(expr.gt_counts.raw),
            adj=flatten_gt_counts(expr.gt_counts.adj),
            em_p_chet_raw=expr.em.raw.p_chet,
            em_p_chet_adj=expr.em.adj.p_chet,
            # em1_p_chet_raw=expr.em_plus_one.raw.p_chet,
            # em1_p_chet_adj=expr.em_plus_one.adj.p_chet
        )

    return phased_ht.transmute(
        chrom=phased_ht.locus1.contig,
        pos1=phased_ht.locus1.position,
        ref1=phased_ht.alleles1[0],
        alt1=phased_ht.alleles1[1],
        pos2=phased_ht.locus2.position,
        ref2=phased_ht.alleles2[0],
        alt2=phased_ht.alleles2[1],
        **{
            k: v
            for k, v in flatten_phase_dict(phased_ht.phase_info).items()
        }).flatten()
def check_sex(
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> None:
    """
    Compare inferred to given sex and output file with column added for discrepancies.

    Output directory and name here are used to locate the functioning pedigree with given sexes.

    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: None
    """
    # Read in functioning pedigree with given sexes
    ped_ht = hl.import_table(
        f"{output_dir}/{output_name}_functioning_pedigree.ped")
    ped_ht = ped_ht.key_by(s=ped_ht.Individual_ID).select("Sex")
    ped_ht = ped_ht.annotate(
        given_sex=hl.case()
        .when(ped_ht.Sex == "M", "male")
        .when(ped_ht.Sex == "F", "female")
        .default(ped_ht.Sex)).drop("Sex")

    sex_ht = sex_ht.join(ped_ht, how="outer")
    sex_ht = sex_ht.annotate(discrepant_sex=sex_ht.sex != sex_ht.given_sex)
    sex_ht.export(f"{output_dir}/{output_name}_sex_check.txt")
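# Usage sketch for check_sex (bucket and prefix are hypothetical). Assumes the
# inferred-sex table carries a `sex` column and that
# f"{output_dir}/{output_name}_functioning_pedigree.ped" already exists:
#
#     sex_ht = hl.read_table("gs://my-bucket/qc/impute_sex.ht")
#     check_sex(sex_ht, output_dir="gs://my-bucket/qc", output_name="my_cohort")
#
# The exported TSV then contains `sex`, `given_sex`, and `discrepant_sex` per sample.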
def apply_rf_model(ht: hl.Table,
                   rf_model: pyspark.ml.PipelineModel,
                   features: List[str],
                   label: str,
                   probability_col_name: str = 'rf_probability',
                   prediction_col_name: str = 'rf_prediction') -> hl.Table:
    """
    Applies a Random Forest (RF) pipeline model to a Table and annotates the RF probabilities and predictions.

    :param Table ht: Input HT
    :param PipelineModel rf_model: Random Forest pipeline model
    :param list of str features: List of feature columns in the pipeline. !Should match the model list of features!
    :param str label: Column containing the labels. !Should match the model labels!
    :param str probability_col_name: Name of the column that will store the RF probabilities
    :param str prediction_col_name: Name of the column that will store the RF predictions
    :return: Table with RF columns
    :rtype: Table
    """
    logger.info("Applying RF model.")

    check_ht_fields_for_spark(ht, features + [label])

    # Generate a unique index column so rows can be rejoined after Spark scoring
    index_name = 'rf_idx'
    while index_name in ht.row:
        index_name += '_tmp'
    ht = ht.add_index(name=index_name)

    ht_keys = ht.key
    ht = ht.key_by(index_name)

    df = ht_to_rf_df(ht, features, label, index_name)

    rf_df = rf_model.transform(df)

    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    rf_ht = hl.Table.from_spark(
        rf_df.withColumn("probability", to_array(col("probability"))).select(
            [index_name, 'probability', 'predictedLabel'])).persist()

    rf_ht = rf_ht.key_by(index_name)

    ht = ht.annotate(
        **{
            probability_col_name: {
                label: rf_ht[ht[index_name]]["probability"][i]
                for i, label in enumerate(get_labels(rf_model))
            },
            prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"]
        })

    ht = ht.key_by(*ht_keys)
    ht = ht.drop(index_name)

    return ht
def annotate_from_dict(ht: hl.Table, dict_field: str,
                       output_field: str) -> hl.Table:
    """
    Expand a dict field and add its keys as new fields.

    :param ht: HailTable
    :param dict_field: The dict field to be expanded
    :param output_field: The output field name (annotated as a struct)
    :return: Annotated HailTable
    """
    # retrieve dict keys (from the first row) to be annotated as fields
    dict_keys = ht[dict_field].keys().take(1)[0]

    # structure annotation expression
    struct_expr = hl.struct(
        **{
            dict_keys[i]: ht[dict_field].get(dict_keys[i])
            for i in range(len(dict_keys))
        })

    ht = ht.annotate(_tmp_field_=struct_expr)
    ht = ht.rename({'_tmp_field_': output_field})

    return ht
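# Runnable sketch for annotate_from_dict on a toy table (field names are
# hypothetical). Note the keys are sampled from the first row, so every row is
# assumed to carry the same key set.
ht_demo = hl.Table.parallelize(
    [{'s': 'sample1', 'metrics': {'call_rate': 0.99, 'dp_mean': 30.5}}],
    hl.tstruct(s=hl.tstr, metrics=hl.tdict(hl.tstr, hl.tfloat64)),
    key='s')
ht_demo = annotate_from_dict(ht_demo, dict_field='metrics',
                             output_field='metrics_struct')
# ht_demo.metrics_struct.call_rate and ht_demo.metrics_struct.dp_mean now exist.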
def compute_phase(variants_ht: hl.Table,
                  least_consequence: str = LEAST_CONSEQUENCE,
                  max_freq: float = MAX_FREQ) -> hl.Table:
    n_variant_pairs = variants_ht.count()
    logger.info(f"Looking up phase for {n_variant_pairs} variant pair(s).")

    # Join with gnomAD phased variants
    vp_ht = hl.read_table(phased_vp_count_ht_path('exomes'))
    phased_ht = vp_ht.semi_join(variants_ht)
    n_phased = phased_ht.count()
    phased_ht = explode_phase_info(phased_ht)  # explodes phase_info by pop
    phased_ht = phased_ht.transmute(
        phase_info=phased_ht.phase_info.select('gt_counts', 'em')).repartition(
            ceil(n_variant_pairs / 10000), shuffle=True)
    phased_ht = phased_ht.persist()  # .checkpoint("gs://gnomad-tmp/vp_ht.ht")

    # If not all pairs had at least one carrier of both, then compute phase estimate from single variants
    logger.info(
        f"{n_phased}/{n_variant_pairs} variant pair(s) found with carriers of both in gnomAD."
    )
    if n_phased < n_variant_pairs:
        unphased_ht = variants_ht.anti_join(vp_ht)
        unphased_ht = annotate_unphased_pairs(unphased_ht, n_variant_pairs,
                                              least_consequence, max_freq)
        phased_ht = phased_ht.union(unphased_ht, unify=True)

    return phased_ht
def filter_kin_ht(
    ht: hl.Table,
    out_summary: io.TextIOWrapper,
    first_degree_pi_hat: float = 0.40,
    grandparent_pi_hat: float = 0.20,
    grandparent_ibd1: float = 0.25,
    grandparent_ibd2: float = 0.15,
) -> hl.Table:
    """
    Filter the kinship table to relationships of grandparents and above.

    :param ht: hl.Table
    :param out_summary: Summary file with summary statistics and notes
    :param first_degree_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to first degree relatives
    :param grandparent_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd1: Minimum IBD1 threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd2: Maximum IBD2 threshold to use to filter the kinship table to grandparents
    :return: Table containing only relationships of grandparents and above
    """
    # Filter to anything above the relationship of a grandparent
    ht = ht.filter((ht.pi_hat > first_degree_pi_hat)
                   | ((ht.pi_hat > grandparent_pi_hat)
                      & (ht.ibd1 > grandparent_ibd1)
                      & (ht.ibd2 < grandparent_ibd2)))
    ht = ht.annotate(pair=hl.sorted([ht.i, ht.j]))

    out_summary.write(
        f"NOTE: kinship table was filtered to:\n(kin > {first_degree_pi_hat}) or "
        f"((kin > {grandparent_pi_hat}) and (IBD1 > {grandparent_ibd1}) and (IBD2 < {grandparent_ibd2}))\n"
    )
    out_summary.write(
        "Relationships not meeting these criteria were not evaluated.\n\n")

    return ht
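# Usage sketch for filter_kin_ht: `kin_ht` is assumed to be a pairwise kinship
# table with columns i, j, pi_hat, ibd1, ibd2 (e.g. a flattened
# hl.identity_by_descent result):
#
#     with open("kinship_summary.txt", "w") as out_summary:
#         kin_ht = filter_kin_ht(kin_ht, out_summary)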
def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), "snv")
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), "ins")
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), "del")
                   .default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
def combine(ts):
    def merge_alleles(alleles):
        from hail.expr.functions import _num_allele_type, _allele_ints
        return hl.rbind(
            alleles.map(lambda a: hl.or_else(a[0], ''))
            .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
            lambda ref: hl.rbind(
                alleles.map(
                    lambda al: hl.rbind(
                        al[0],
                        lambda r: hl.array([ref]).extend(
                            al[1:].map(
                                lambda a: hl.rbind(
                                    _num_allele_type(r, a),
                                    lambda at: hl.cond(
                                        (_allele_ints['SNP'] == at)
                                        | (_allele_ints['Insertion'] == at)
                                        | (_allele_ints['Deletion'] == at)
                                        | (_allele_ints['MNP'] == at)
                                        | (_allele_ints['Complex'] == at),
                                        a + ref[hl.len(r):],
                                        a)))))),
                lambda lal: hl.struct(
                    globl=hl.array([ref]).extend(
                        hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                    local=lal)))

    def renumber_entry(entry, old_to_new) -> StructExpression:
        # global index of alternate (non-ref) alleles
        return entry.annotate(LA=entry.LA.map(lambda lak: old_to_new[lak]))

    if (ts.row.dtype, ts.globals.dtype) not in _merge_function_map:
        f = hl.experimental.define_function(
            lambda row, gbl: hl.rbind(
                merge_alleles(row.data.map(lambda d: d.alleles)),
                lambda alleles: hl.struct(
                    locus=row.locus,
                    alleles=alleles.globl,
                    rsid=hl.find(hl.is_defined, row.data.map(lambda d: d.rsid)),
                    __entries=hl.bind(
                        lambda combined_allele_index:
                        hl.range(0, hl.len(row.data)).flatmap(
                            lambda i: hl.cond(
                                hl.is_missing(row.data[i].__entries),
                                hl.range(0, hl.len(gbl.g[i].__cols))
                                .map(lambda _: hl.null(
                                    row.data[i].__entries.dtype.element_type)),
                                hl.bind(
                                    lambda old_to_new: row.data[i].__entries.map(
                                        lambda e: renumber_entry(e, old_to_new)),
                                    hl.range(0, hl.len(alleles.local[i])).map(
                                        lambda j: combined_allele_index[
                                            alleles.local[i][j]])))),
                        hl.dict(hl.range(0, hl.len(alleles.globl)).map(
                            lambda j: hl.tuple([alleles.globl[j], j])))))),
            ts.row.dtype, ts.globals.dtype)
        _merge_function_map[(ts.row.dtype, ts.globals.dtype)] = f
    merge_function = _merge_function_map[(ts.row.dtype, ts.globals.dtype)]
    ts = Table(
        TableMapRows(
            ts._tir,
            Apply(merge_function._name,
                  merge_function._ret_type,
                  TopLevelReference('row'),
                  TopLevelReference('global'))))
    return ts.transmute_globals(
        __cols=hl.flatten(ts.g.map(lambda g: g.__cols)))
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.
    Requires a field `filtered` whose element type should be the same as the input duplicated samples Table key.

    :param dups_ht: Input HT
    :return: Flattened HT
    """

    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) and (
                dups_ht.filtered.dtype.element_type == dups_ht.key[0].dtype):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(
        dups=hl.array([get_dups_to_keep_expr()]).extend(
            dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by("s")
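# Runnable sketch of the expected input shape: a table keyed by a single string
# field `s` with a `filtered` array of the same type, exercising the second
# branch of get_dups_to_keep_expr.
dups_demo = hl.Table.parallelize(
    [{'s': 'kept_sample', 'filtered': ['dup1', 'dup2']}],
    hl.tstruct(s=hl.tstr, filtered=hl.tarray(hl.tstr)),
    key='s')
exploded = explode_duplicate_samples_ht(dups_demo)
# Three rows keyed by `s`: ('kept_sample', False), ('dup1', True), ('dup2', True).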
def join_tables(ht: hl.Table, exomes: bool) -> hl.Table:
    '''
    Joins seqr variant table to gnomAD table.

    NOTE: code was written assuming most recent gnomAD release is v3

    :param Table ht: Table with variants downloaded from seqr
    :param bool exomes: Whether to join with gnomAD exomes table or genomes table
    :return: seqr variants Table joined with gnomAD table
    :rtype: hl.Table
    '''
    if exomes:
        # read in exomes table
        gnomad_ht = hl.read_table(
            get_gnomad_liftover_data_path('exomes', version='2.1.1'))
        gnomad_ht = gnomad_ht.select('freq', 'popmax')
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht = gnomad_ht.transmute(
            gnomad_exomes_AC=gnomad_ht.freq[0].AC,
            gnomad_exomes_AN=gnomad_ht.freq[0].AN,
            gnomad_exomes_popmax_AF=gnomad_ht.popmax[0].AF,
            gnomad_exomes_popmax_pop=gnomad_ht.popmax[0].pop)
        gnomad_ht.describe()
    else:
        # read in genomes table
        gnomad_ht = hl.read_table(
            'gs://gnomad-public/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
        )
        gnomad_ht = gnomad_ht.select('freq')
        gnomad_ht = gnomad_ht.transmute(
            gnomad_genomes_AC=gnomad_ht.freq[0].AC,
            gnomad_genomes_AN=gnomad_ht.freq[0].AN)
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht.describe()

    ht = ht.annotate(**gnomad_ht[ht.key])
    ht.describe()
    return ht
def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make a table ranking samples by retention priority (a lower rank has higher priority).
    It mainly uses two bits of information:
    - cases are prioritised over controls
    - samples are preferred based on the cohort info as follows: chd > ddd > ukbb

    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """
    phe_ht = (phe_ht
              .annotate(
                  case_control_rank=hl.int(
                      phe_ht['phe.is_case']),  # 0: control, 1: case
                  cohort_rank=hl.case()
                  .when(phe_ht.is_ukbb, 10)
                  .when(phe_ht.is_ddd, 100)
                  .when(phe_ht.is_chd, 1000)
                  .or_missing())
              .key_by())

    phe_ht = phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank')

    # sort table (descending)
    tb_rank = phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                              hl.desc(phe_ht.cohort_rank))
    tb_rank = tb_rank.add_index(name='rank').key_by('ega_id')
    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)  # make ranks 1-based

    return tb_rank
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table.

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC),
                                    ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC),
                                     ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN),
                                    ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN),
                                     ht.gnomad_genomes_AN, 0),
    )

    ht = ht.annotate(gnomad_global_AF=hl.if_else(
        (ht.gnomad_exomes_AN == 0) & (ht.gnomad_genomes_AN == 0),
        0.0,
        hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                 (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN))))
    ht.describe()
    return ht
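# Worked example of the gnomad_global_AF formula above:
# AF = (AC_exomes + AC_genomes) / (AN_exomes + AN_genomes), or 0.0 when both AN
# values are 0 (variant absent from both callsets). For instance:
#     AC_exomes=5, AN_exomes=1000, AC_genomes=15, AN_genomes=3000
#     -> (5 + 15) / (1000 + 3000) = 20 / 4000 = 0.005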
def prepare_exomes(exome_ht: hl.Table,
                   groupings: List,
                   impose_high_af_cutoff_upfront: bool = True) -> hl.Table:
    # Manipulate VEP annotations and explode by them
    exome_ht = add_most_severe_csq_to_tc_within_ht(exome_ht)
    exome_ht = exome_ht.transmute(
        transcript_consequences=exome_ht.vep.transcript_consequences)
    exome_ht = exome_ht.explode(exome_ht.transcript_consequences)

    # Annotate variants with grouping variables
    exome_ht, groupings = annotate_constraint_groupings(
        exome_ht, groupings)  # This function needs to be adapted
    exome_ht = exome_ht.select('context', 'ref', 'alt', 'methylation_level',
                               'freq', 'pass_filters', *groupings)

    # Filter by allele count
    # Likely to need to adapt this function as well
    af_cutoff = 0.001

    # NOTE: `dataset` (e.g. 'gnomad') is assumed to be defined in the enclosing
    # scope; it selects the right entry of the freq index dictionary.
    freq_index = exome_ht.freq_index_dict.collect()[0][dataset]

    def keep_criteria(ht):
        crit = (ht.freq[freq_index].AC > 0) & ht.pass_filters & (ht.coverage > 0)
        if impose_high_af_cutoff_upfront:
            crit &= (ht.freq[freq_index].AF <= af_cutoff)
        return crit

    exome_ht = exome_ht.filter(keep_criteria(exome_ht))
    return exome_ht
def annotate_relatedness(
    relatedness_ht: hl.Table,
    first_degree_kin_thresholds: Tuple[float, float] = (0.1767767, 0.4),
    second_degree_kin_cutoff: float = 0.1,
    ibd0_0_max: float = 0.05,
) -> hl.Table:
    """
    Annotate each sample pair with an inferred relationship and record the cutoffs used as globals.

    :param relatedness_ht: Table of kinship and IBD estimates for sample pairs
    :param first_degree_kin_thresholds: (min, max) kinship bounds used for first degree relatives
    :param second_degree_kin_cutoff: Minimum kinship for second degree relatives
    :param ibd0_0_max: Maximum IBD0 used to distinguish parent-child pairs (IBD0 near 0) from siblings
    :return: Table with a `relationship` annotation and threshold globals
    """
    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
            first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
            second_degree_min_kin=second_degree_kin_cutoff,
            ibd0_0_max=ibd0_0_max,
        )
    )
    relatedness_ht = relatedness_ht.annotate_globals(
        min_individual_maf=0.01,
        min_emission_kinship=0.05,
        ibd0_0_max=ibd0_0_max,
        second_degree_kin_cutoff=second_degree_kin_cutoff,
        first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
    )
    return relatedness_ht
def add_project_and_family_annotations(ht: hl.Table, seqr_projects: dict,
                                       family_ids: dict) -> hl.Table:
    """
    Add seqr project and family ID annotations to the kinship table.

    :param ht: Hail Table of kinship values
    :param seqr_projects: Dictionary of seqr projects for each sample
    :param family_ids: Dictionary of family ids for each sample
    :return: Table with seqr project and family id annotations added
    """
    # Add annotation for seqr projects of sample i and sample j
    hl_seqr_projects = hl.literal(seqr_projects)
    ht = ht.annotate(
        seqr_proj_i=hl_seqr_projects.get(ht.i),
        seqr_proj_j=hl_seqr_projects.get(ht.j),
    )

    # Add annotation for family ids of sample i and sample j
    hl_family_ids = hl.literal(family_ids)
    ht = ht.annotate(
        fam_id_i=hl_family_ids.get(ht.i),
        fam_id_j=hl_family_ids.get(ht.j),
    )
    return ht
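# Usage sketch for add_project_and_family_annotations with toy dicts keyed by
# sample ID (the kinship table is assumed to carry string sample columns i and j):
#
#     seqr_projects = {"s1": "projA", "s2": "projB"}
#     family_ids = {"s1": "fam1", "s2": "fam2"}
#     kin_ht = add_project_and_family_annotations(kin_ht, seqr_projects, family_ids)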
def assign_platform_from_pcs(
    platform_pca_scores_ht: hl.Table,
    pc_scores_ann: str = "scores",
    hdbscan_min_cluster_size: Optional[int] = None,
    hdbscan_min_samples: Optional[int] = None,
) -> hl.Table:
    """
    Assigns platforms using HDBSCAN on the results of call rate PCA.

    :param platform_pca_scores_ht: Input table with the PCA score for each sample
    :param pc_scores_ann: Field containing the scores
    :param hdbscan_min_cluster_size: HDBSCAN `min_cluster_size` parameter. If not specified the smallest of 500 and 0.1*n_samples will be used.
    :param hdbscan_min_samples: HDBSCAN `min_samples` parameter
    :return: A Table with a `qc_platform` annotation containing the platform based on HDBSCAN clustering
    """
    logger.info("Assigning platforms based on platform PCA clustering")

    # Read and format data for clustering
    data = platform_pca_scores_ht.to_pandas()
    callrate_data = np.matrix(data[pc_scores_ann].tolist())
    logger.info("Assigning platforms to {} samples.".format(
        len(callrate_data)))

    # Cluster data
    if hdbscan_min_cluster_size is None:
        hdbscan_min_cluster_size = min(500, 0.1 * data.shape[0])
    clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                min_samples=hdbscan_min_samples)
    cluster_labels = clusterer.fit_predict(callrate_data)
    n_clusters = len(set(cluster_labels)) - (
        -1 in cluster_labels
    )  # NOTE: -1 is the label for noisy (un-classifiable) data points
    logger.info("Found {} unique platforms during platform imputation.".format(
        n_clusters))

    data["qc_platform"] = cluster_labels

    # NOTE: write the pandas dataframe to disk and re-import it as a Hail Table.
    # This is a temporary workaround for an issue with `hl.Table.from_pandas`
    # when driver and executors run different Python versions.
    # `nfs_dir` is assumed to be defined at module scope and is used for both
    # the write and the read so the paths match.
    (data.drop(axis=1, labels=pc_scores_ann).to_csv(
        f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv', index=False, sep='\t'))
    ht_tmp = (hl.import_table(f'{nfs_dir}/tmp/data_tmp_hdbscan.tsv',
                              impute=True)
              .key_by(*platform_pca_scores_ht.key))
    ht = platform_pca_scores_ht.join(ht_tmp)

    # original/elegant solution (TODO: sort out the issue with 'from_pandas')
    # ht = hl.Table.from_pandas(data, key=[*platform_pca_scores_ht.key])

    # Expand the array structure and annotate scores (PCs) as individual fields;
    # drop the array scores field before exporting the results.
    n_pcs = len(ht[pc_scores_ann].take(1)[0])
    ht = (ht.annotate(
        **{f'platform_PC{i + 1}': ht[pc_scores_ann][i]
           for i in range(n_pcs)}).drop(pc_scores_ann))

    ht = ht.annotate(qc_platform="platform_" + hl.str(ht.qc_platform))

    return ht
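# Usage sketch for assign_platform_from_pcs, assuming `scores_ht` is the output
# of a call-rate PCA (e.g. hl.pca) with an array<float64> `scores` row field:
#
#     platform_ht = assign_platform_from_pcs(
#         scores_ht, pc_scores_ann="scores", hdbscan_min_cluster_size=100)
#     platform_ht.aggregate(hl.agg.counter(platform_ht.qc_platform))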
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. This table has to be keyed by interval.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    callrate_mt = mt
    if autosomes_only:
        callrate_mt = filter_to_autosomes(callrate_mt)

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(
        _interval_key=intervals_ht.index(
            callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(interval_info=hl.struct(
        **intervals_ht[callrate_mt.row_key]))
    return callrate_mt
def generate_final_rf_ht(
    ht: hl.Table,
    snp_cutoff: Union[int, float],
    indel_cutoff: Union[int, float],
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    determine_cutoff_from_bin: bool = False,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[hl.expr.Int32Expression] = None,
) -> hl.Table:
    """
    Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering.

    If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF
    probabilities to use as cutoffs from an aggregated quantile bin Table like one created by
    `compute_grouped_binned_ht` in combination with `score_bin_agg`.

    :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table
    :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter
    :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht`
    :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :return: Finalized random forest Table annotated with variant filters
    """
    # Determine SNP and indel RF cutoffs if given bin instead of RF probability
    # NOTE: bin-based determination (`determine_cutoff_from_bin`) and the
    # InbreedingCoeff filter are not implemented in this excerpt; the cutoffs
    # are used as RF probabilities directly.
    snp_cutoff_global = hl.struct(min_score=snp_cutoff)
    indel_cutoff_global = hl.struct(min_score=indel_cutoff)

    # Add filters to RF HT
    filters = dict()

    if ht.any(hl.is_missing(ht.rf_probability["TP"])):
        raise ValueError("Missing RF probability!")

    filters["RF"] = (
        hl.is_snp(ht.alleles[0], ht.alleles[1])
        & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | (
            ~hl.is_snp(ht.alleles[0], ht.alleles[1])
            & (ht.rf_probability["TP"] < indel_cutoff_global.min_score))

    # Fix annotations for release
    annotations_expr = {
        "rf_positive_label": hl.or_else(ht.tp, False),
        "rf_negative_label": ht.fail_hard_filters,
        "rf_probability": ht.rf_probability["TP"],
    }

    ht = ht.transmute(filters=add_filters_expr(filters=filters),
                      **annotations_expr)

    ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global,
                             rf_indel_cutoff=indel_cutoff_global)

    return ht
def default_generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    sex_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Table:
    """
    This is meant as a default wrapper for `generate_sib_stats_expr`.

    It returns a hail table with counts of variants shared by pairs of siblings in `relatedness_ht`.

    This function takes a hail Table with a row for each pair of individuals i,j in the data that are related
    (it's OK to have unrelated samples too). The `relationship_col` should be a column specifying the relationship
    between each two samples as defined by the constants in `gnomad.utils.relatedness`. This relationship_col will
    be used to filter to only pairs of samples that are annotated as `SIBLINGS`.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param sex_ht: A Table containing sex information for the samples
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module constants.
    :return: A Table with the sibling shared variant counts
    """
    sex_ht = sex_ht.annotate(
        is_female=hl.case()
        .when(sex_ht.sex_karyotype == "XX", True)
        .when(sex_ht.sex_karyotype == "XY", False)
        .or_missing()
    )

    # TODO: Change to use SIBLINGS constant when relatedness PR goes in
    sib_ht = relatedness_ht.filter(
        relatedness_ht[relationship_col] == "Siblings")
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda s: hl.agg.collect_as_set(s),
            [sib_ht[i_col].s, sib_ht[j_col].s]
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))
    mt = annotate_adj(mt)

    mt = mt.annotate_cols(is_female=sex_ht[mt.s].is_female)

    sib_stats_ht = mt.select_rows(
        **generate_sib_stats_expr(
            mt,
            sib_ht,
            i_col=i_col,
            j_col=j_col,
            strata={"raw": True, "adj": mt.adj},
            is_female=mt.is_female,
        )
    ).rows()

    return sib_stats_ht
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    ht = ht.transmute(phase_info=hl.array(ht.phase_info))
    ht = ht.explode('phase_info')
    ht = ht.transmute(pop=ht.phase_info[0], phase_info=ht.phase_info[1])

    if remove_all_ref:
        ht = ht.filter(hl.sum(ht.phase_info.gt_counts.raw[1:]) > 0)

    return ht
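# Shape sketch for explode_phase_info: a row with
#     phase_info = {'afr': struct, 'nfe': struct}
# becomes two rows with pop='afr' / pop='nfe' and the corresponding struct as
# the new top-level phase_info. With remove_all_ref=True, rows whose raw
# genotype counts beyond index 0 (all hom-ref) sum to zero are dropped.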
def get_known_populations(ht: hl.Table, pop: str):
    # TODO: bring data into separate file and load here
    if pop == 'eur' or pop == 'nfe':
        known_pops = hl.literal({
            'ICR1000': 'nwe',  # gb, ICR
            'ICR142': 'nwe',  # gb, ICR
            'C1017': 'seu',  # it, ATVB
            'C1568': 'seu',  # es, Regicor
            'Bulgarian_Trios': 'bgr',  # Bulgarians
            'C533': 'bgr',  # Bulgarians
            'C821': 'bgr',  # Bulgarians
            'C952': 'bgr',  # Bulgarians
            'G89634': 'est',  # Estonians
            'G94980': 'est',  # Estonians
            # 'C1830': 'neu',  # gb, Leicester
            'C1972': 'nwe',  # gb, Tayside region of Scotland
            # 'G94051': 'seu',  # es, "United States and Spain"
            # 'C1708': 'deu',  # Germans?
            'C1508': 'swe',  # Swedes
            'C1509': 'swe',  # Swedes
        })
    elif pop == 'eas':
        known_pops = hl.literal({
            'C1397': 'oea',  # 'twn',  # Taiwanese trios
            'C1443': 'oea',  # 'twn',  # Taiwanese trios
            'C1506': 'oea',  # 'twn',  # Taiwanese trios
            'C1867': 'oea',  # 'twn',  # Taiwanese trios
            'C978': 'oea',  # 'twn',  # Taiwanese trios
            'C774': 'kor',  # Korean T2D project
            'C1982': 'kor',  # Korean
            'C1940': 'oea',  # 'sgp',  # Singapore
            'C1980': 'oea',  # 'hkg',  # Hong Kong
            '1kg_JPT': 'jpn'
        })
    elif pop == 'afr':
        known_pops = hl.literal({
            'C773': 't2d',  # African American T2D
            'C1002': 't2d',  # African American T2D
            'C1567': 'jhs',  # African American JHS
            'C1956': 'biome',  # African American BioMe
            # TODO: Add 1kg populations here
        })
    else:
        raise ValueError('pop must be one of eur, nfe, eas, afr')

    ht = ht.annotate(known_pop=known_pops.get(ht.meta.project_id))
    if pop == 'eur':
        finns = hl.import_table(
            'gs://gnomad/sample_qc/input_meta/source/99percent_finns_plus_AD_IBD_NFID.tsv.bgz',
            impute=True)
        finns = finns.filter(
            finns.percent_finnish > 0.99).key_by('sample_name_in_vcf')
        ht = ht.annotate(
            known_pop=hl.cond(hl.is_defined(finns[ht.s]), 'fin', ht.known_pop))
    return ht
def add_release_annotations(ht: hl.Table) -> hl.Table:
    """
    Add final 'high_quality' and 'release' sample status annotations to the meta Table.

    :param Table ht: Table containing meta column annotations for the dataset
    :return: Table containing final 'high_quality' and 'release' sample status annotations
    :rtype: Table
    """
    ht = ht.annotate(high_quality=(hl.len(ht.hard_filters) == 0)
                     & (hl.len(ht.pop_platform_filters) == 0))
    return ht.annotate(release=ht.high_quality
                       & (hl.len(ht.perm_filters) == 0)
                       & ~ht.related)
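# Usage sketch for add_release_annotations: the meta table is assumed to carry
# `hard_filters`, `pop_platform_filters`, and `perm_filters` (collections of
# failed filter names) plus a boolean `related` flag:
#
#     meta_ht = add_release_annotations(meta_ht)
#     meta_ht.aggregate(hl.agg.fraction(meta_ht.release))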
def add_rank(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None,
) -> hl.Table:
    """
    Adds rank based on the `score_expr`. Rank is added for snvs and indels separately.

    If one or more `subrank_expr` are provided, then subrank is added based on all sites for which the boolean
    expression is true.

    In addition, variant counts (snv, indel separately) are added as a global (`rank_variant_counts`).

    :param ht: input Hail Table containing variants (with QC annotations) to be ranked
    :param score_expr: the Table annotation by which ranking should be scored
    :param subrank_expr: Any subranking to be added in the form name_of_subrank: subrank_filtering_expr
    :return: Table with rankings added
    """
    key = ht.key
    if subrank_expr is None:
        subrank_expr = {}

    temp_expr = {"_score": score_expr}
    temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()})
    rank_ht = ht.select(**temp_expr,
                        is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1]))

    rank_ht = rank_ht.key_by("_score").persist()
    scan_expr = {
        "rank": hl.cond(
            rank_ht.is_snv,
            hl.scan.count_where(rank_ht.is_snv),
            hl.scan.count_where(~rank_ht.is_snv),
        )
    }
    scan_expr.update(
        {
            name: hl.or_missing(
                rank_ht[f"_{name}"],
                hl.cond(
                    rank_ht.is_snv,
                    hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
                    hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
                ),
            )
            for name in subrank_expr
        }
    )
    rank_ht = rank_ht.annotate(**scan_expr)

    rank_ht = rank_ht.key_by(*key).persist()
    rank_ht = rank_ht.select(*scan_expr.keys())

    ht = ht.annotate(**rank_ht[key])
    return ht
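# Usage sketch for add_rank, ranking variants by an RF score with a subrank
# restricted to bi-allelic sites (the `was_split` column is an assumption):
#
#     ht = add_rank(
#         ht,
#         score_expr=1 - ht.rf_probability["TP"],
#         subrank_expr={"biallelic_rank": ~ht.was_split},
#     )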
def pheno_ht_to_mt(pheno_ht: hl.Table,
                   data_type: str,
                   special_fields: Tuple[str, ...] = ('age', 'sex'),
                   rekey: bool = True):
    """
    Input Hail Table with lots of phenotype row fields, distill into
    MatrixTable with either categorical or continuous data types as entries

    :param Table pheno_ht: Input hail Table with phenotypes as row fields
    :param str data_type: one of "categorical" or "continuous"
    :param special_fields: row fields (e.g. age, sex) to keep as row fields rather than entries
    :param bool rekey: whether to re-key columns by trait_type, phenocode, pheno_sex, coding, and modifier
    :return: Hail MatrixTable with phenotypes as entries
    :rtype: MatrixTable
    """
    if data_type == 'categorical':
        filter_type = {hl.tbool}
        value_type = hl.bool
    else:
        filter_type = {hl.tint, hl.tfloat}
        value_type = hl.float

    special_fields_to_include = []
    fields = set(pheno_ht.row_value)
    for field in special_fields:
        if field in fields:
            fields.remove(field)
            special_fields_to_include.append(field)

    select_fields = {
        x: value_type(pheno_ht[x])
        for x in fields if pheno_ht[x].dtype in filter_type
    }
    pheno_ht = pheno_ht.select(*special_fields_to_include, **select_fields)

    mt = pheno_ht.to_matrix_table_row_major(columns=list(select_fields),
                                            entry_field_name='value',
                                            col_field_name='phesant_pheno')
    if rekey:
        mt = mt.key_cols_by(
            trait_type=data_type,
            phenocode=mt.phesant_pheno.split('_')[0],
            pheno_sex='both_sexes',
            coding=hl.case().when(
                (data_type == 'categorical') &
                (hl.len(mt.phesant_pheno.split('_')) > 1),
                mt.phesant_pheno.split('_', 2)[1]
            )  # TODO: fix to 1 when https://github.com/hail-is/hail/issues/7893 is fixed
            .default(NULL_STR_KEY),
            modifier=hl.case().when(
                (data_type == 'continuous') &
                (hl.len(mt.phesant_pheno.split('_')) > 1),
                mt.phesant_pheno.split('_', 2)[1]
            )  # TODO: fix to 1 when https://github.com/hail-is/hail/issues/7893 is fixed
            .default(NULL_STR_KEY))
    return mt
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht`, reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(
            hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )

        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
def compare_row_counts(ht1: hl.Table, ht2: hl.Table) -> bool:
    """
    Check if the row counts in two Tables are the same.

    :param ht1: First Table to be checked
    :param ht2: Second Table to be checked
    :return: Whether the row counts are the same
    """
    r_count1 = ht1.count()
    r_count2 = ht2.count()
    logger.info(
        f"{r_count1} rows in left table; {r_count2} rows in right table")
    return r_count1 == r_count2
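# Runnable sketch for compare_row_counts on toy tables (assumes `logger` is
# configured and hl.init() has been called):
ht_a = hl.utils.range_table(100)
ht_b = hl.utils.range_table(100)
assert compare_row_counts(ht_a, ht_b)  # both have 100 rows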
def make_hist_bin_edges_expr(
    ht: hl.Table,
    hists: List[str] = HISTS,
    prefix: str = "",
    label_delimiter: str = "_",
    include_age_hists: bool = True,
) -> Dict[str, str]:
    """
    Create dictionaries containing variant histogram annotations and their associated bin edges, formatted into a string separated by pipe delimiters.

    :param ht: Table containing histogram variant annotations.
    :param hists: List of variant histogram annotations. Default is HISTS.
    :param prefix: Prefix text for age histogram bin edges. Default is empty string.
    :param label_delimiter: String used as delimiter between prefix and histogram annotation.
    :param include_age_hists: Include age histogram annotations.
    :return: Dictionary keyed by histogram annotation name, with corresponding reformatted bin edges for values.
    """
    # Add underscore to prefix if it isn't empty
    if prefix != "":
        prefix += label_delimiter

    edges_dict = {}

    if include_age_hists:
        edges_dict.update(
            {
                f"{prefix}{call_type}": "|".join(
                    map(
                        lambda x: f"{x:.1f}",
                        ht.head(1)[f"age_hist_{call_type}"]
                        .collect()[0].bin_edges,
                    )
                )
                for call_type in ["het", "hom"]
            }
        )

    for hist in hists:
        # Parse hists calculated on both raw and adj-filtered data
        for hist_type in [f"{prefix}raw_qual_hists", f"{prefix}qual_hists"]:
            hist_name = hist
            if "raw" in hist_type:
                hist_name = f"{prefix}{hist}_raw"

            edges_dict[hist_name] = "|".join(
                map(
                    lambda x: f"{x:.2f}" if "ab" in hist else str(int(x)),
                    ht.head(1)[hist_type][hist].collect()[0].bin_edges,
                )
            )

    return edges_dict
def aggregate_contig(ht: hl.Table, contigs: Set[str] = None):
    """
    Aggregates all contigs together and computes the totals for each bin across the contigs.
    """
    if contigs:
        ht = ht.filter(hl.literal(contigs).contains(ht.contig))

    return ht.group_by(*[k for k in ht.key if k != 'contig']).aggregate(
        min_score=hl.agg.min(ht.min_score),
        max_score=hl.agg.max(ht.max_score),
        **{
            x: hl.agg.sum(ht[x])
            for x in ht.row_value if x not in ['min_score', 'max_score']
        })
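# Usage sketch for aggregate_contig, assuming a per-contig binned-score table
# keyed by (contig, bin) with min_score/max_score plus count columns:
#
#     genome_wide_ht = aggregate_contig(binned_ht)  # all contigs
#     autosomes_ht = aggregate_contig(binned_ht, {f"chr{i}" for i in range(1, 23)})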
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Flattens the result of `filter_duplicate_samples`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    dups_ht = dups_ht.annotate(
        dups=hl.array([(dups_ht.key, False)]).extend(
            dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode('dups')
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by('s')
def annotate_related_pairs(related_pairs: hl.Table, index_col: str) -> hl.Table:
    # NOTE: `case_parents`, `meta_ht`, and `sample_qc_ht` are assumed to be
    # defined in the enclosing scope.
    related_pairs = related_pairs.key_by(**related_pairs[index_col])
    related_pairs = related_pairs.filter(
        hl.is_missing(case_parents[related_pairs.key]))
    return related_pairs.annotate(
        **{
            index_col: related_pairs[index_col].annotate(
                case_rank=hl.or_else(
                    hl.int(meta_ht[related_pairs.key].is_case), -1),
                dp_mean=hl.or_else(
                    sample_qc_ht[related_pairs.key].sample_qc.dp_stats.mean,
                    -1.0))
        }).key_by()