def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(nonsplit_alleles=ht.alleles,
                            has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(allele_data=allele_data.annotate(
        **add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), "snv")
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), "ins")
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), "del")
                   .default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
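
# Hedged usage sketch for generate_allele_data; the paths below are
# placeholders, not confirmed parts of any pipeline. Assumes the input is a
# full, unsplit sites HT as the docstring requires.
def _example_generate_allele_data(raw_ht_path: str, out_path: str) -> None:
    raw_ht = hl.read_table(raw_ht_path)
    allele_ht = generate_allele_data(raw_ht)
    # Each split site now carries allele_data with nonsplit_alleles, has_star,
    # variant_type, n_alt_alleles, allele_type and was_mixed.
    allele_ht.write(out_path, overwrite=True)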
def add_project_and_family_annotations(ht: hl.Table, seqr_projects: dict,
                                       family_ids: dict) -> hl.Table:
    """
    Add seqr project and family ID annotations to the kinship table.

    :param ht: Hail Table of kinship values
    :param seqr_projects: Dictionary of seqr projects for each sample
    :param family_ids: Dictionary of family ids for each sample
    :return: Table with seqr project and family id annotations added
    """
    # Add annotation for seqr projects of sample i and sample j
    hl_seqr_projects = hl.literal(seqr_projects)
    ht = ht.annotate(
        seqr_proj_i=hl_seqr_projects.get(ht.i),
        seqr_proj_j=hl_seqr_projects.get(ht.j),
    )
    # Add annotation for family ids of sample i and sample j
    hl_family_ids = hl.literal(family_ids)
    ht = ht.annotate(
        fam_id_i=hl_family_ids.get(ht.i),
        fam_id_j=hl_family_ids.get(ht.j),
    )
    return ht
def get_known_populations(ht: hl.Table, pop: str):
    # TODO: bring data into separate file and load here
    if pop == 'eur' or pop == 'nfe':
        known_pops = hl.literal({
            'ICR1000': 'nwe',  # gb, ICR
            'ICR142': 'nwe',  # gb, ICR
            'C1017': 'seu',  # it, ATVB
            'C1568': 'seu',  # es, Regicor
            'Bulgarian_Trios': 'bgr',  # Bulgarians
            'C533': 'bgr',  # Bulgarians
            'C821': 'bgr',  # Bulgarians
            'C952': 'bgr',  # Bulgarians
            'G89634': 'est',  # Estonians
            'G94980': 'est',  # Estonians
            # 'C1830': 'neu',  # gb, Leicester
            'C1972': 'nwe',  # gb, Tayside region of Scotland
            # 'G94051': 'seu',  # es, "United States and Spain"
            # 'C1708': 'deu',  # Germans?
            'C1508': 'swe',  # Swedes
            'C1509': 'swe',  # Swedes
        })
    elif pop == 'eas':
        known_pops = hl.literal({
            'C1397': 'oea',  # 'twn',  # Taiwanese trios
            'C1443': 'oea',  # 'twn',  # Taiwanese trios
            'C1506': 'oea',  # 'twn',  # Taiwanese trios
            'C1867': 'oea',  # 'twn',  # Taiwanese trios
            'C978': 'oea',  # 'twn',  # Taiwanese trios
            'C774': 'kor',  # Korean T2D project
            'C1982': 'kor',  # Korean
            'C1940': 'oea',  # 'sgp',  # Singapore
            'C1980': 'oea',  # 'hkg',  # Hong Kong
            '1kg_JPT': 'jpn'
        })
    elif pop == 'afr':
        known_pops = hl.literal({
            'C773': 't2d',  # African American T2D
            'C1002': 't2d',  # African American T2D
            'C1567': 'jhs',  # African American JHS
            'C1956': 'biome',  # African American BioMe
            # TODO: Add 1kg populations here
        })
    else:
        raise ValueError('pop must be one of eur, nfe, eas, afr')
    ht = ht.annotate(known_pop=known_pops.get(ht.meta.project_id))
    if pop == 'eur':
        finns = hl.import_table(
            'gs://gnomad/sample_qc/input_meta/source/99percent_finns_plus_AD_IBD_NFID.tsv.bgz',
            impute=True)
        finns = finns.filter(
            finns.percent_finnish > 0.99).key_by('sample_name_in_vcf')
        ht = ht.annotate(
            known_pop=hl.cond(hl.is_defined(finns[ht.s]), 'fin', ht.known_pop))
    return ht
def add_release_annotations(ht: hl.Table) -> hl.Table:
    """
    Add final 'high_quality' and 'release' sample status annotations.

    :param Table ht: Table containing meta column annotations for the dataset
    :return: Table containing final 'high_quality' and 'release' sample status annotations
    :rtype: Table
    """
    ht = ht.annotate(high_quality=(hl.len(ht.hard_filters) == 0) &
                     (hl.len(ht.pop_platform_filters) == 0))
    return ht.annotate(release=ht.high_quality &
                       (hl.len(ht.perm_filters) == 0) &
                       ~ht.related)
def check_sex(
    sex_ht: hl.Table,
    output_dir: str,
    output_name: str,
) -> None:
    """
    Compare inferred to given sex and output file with column added for discrepancies.

    Output directory and name here are used to locate the functioning pedigree with given sexes.

    :param sex_ht: Table of inferred sexes for each sample
    :param output_dir: Path to directory to output results
    :param output_name: Output prefix to use for results
    :return: None
    """
    # Read in functioning pedigree with given sexes
    ped_ht = hl.import_table(
        f"{output_dir}/{output_name}_functioning_pedigree.ped")
    ped_ht = ped_ht.key_by(s=ped_ht.Individual_ID).select("Sex")
    ped_ht = ped_ht.annotate(
        given_sex=hl.case()
        .when(ped_ht.Sex == "M", "male")
        .when(ped_ht.Sex == "F", "female")
        .default(ped_ht.Sex)
    ).drop("Sex")

    sex_ht = sex_ht.join(ped_ht, how="outer")
    sex_ht = sex_ht.annotate(discrepant_sex=sex_ht.sex != sex_ht.given_sex)
    sex_ht.export(f"{output_dir}/{output_name}_sex_check.txt")
def annotate_from_dict(ht: hl.Table, dict_field: str,
                       output_field: str) -> hl.Table:
    """
    Expand a dict field and add its keys as new fields, nested in a struct.

    :param ht: HailTable
    :param dict_field: The dict field to be expanded
    :param output_field: The output field name (annotated as a struct)
    :return: Annotated HailTable
    """
    # retrieve dict keys to be annotated as fields
    dict_keys = ht[dict_field].keys().take(1)[0]

    # struct annotation expression
    struct_expr = hl.struct(
        **{k: ht[dict_field].get(k) for k in dict_keys})

    ht = ht.annotate(_tmp_field_=struct_expr)
    ht = ht.rename({'_tmp_field_': output_field})
    return ht
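
# Hedged sketch of annotate_from_dict on a toy Table; the field names are
# illustrative only. Assumes every row's dict has the same key set, which is
# what the take(1) key lookup above relies on.
def _example_annotate_from_dict() -> None:
    ht = hl.Table.parallelize(
        [{'s': 'a', 'metrics': {'dp': 30.0, 'gq': 99.0}},
         {'s': 'b', 'metrics': {'dp': 25.0, 'gq': 80.0}}],
        hl.tstruct(s=hl.tstr, metrics=hl.tdict(hl.tstr, hl.tfloat64)),
    )
    ht = annotate_from_dict(ht, 'metrics', 'metrics_struct')
    ht.show()  # metrics_struct.dp / metrics_struct.gq as separate fields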
def apply_rf_model(ht: hl.Table,
                   rf_model: pyspark.ml.PipelineModel,
                   features: List[str],
                   label: str,
                   probability_col_name: str = 'rf_probability',
                   prediction_col_name: str = 'rf_prediction') -> hl.Table:
    """
    Applies a Random Forest (RF) pipeline model to a Table and annotates the RF probabilities and predictions.

    :param Table ht: Input HT
    :param PipelineModel rf_model: Random Forest pipeline model
    :param list of str features: List of feature columns in the pipeline. !Should match the model list of features!
    :param str label: Column containing the labels. !Should match the model labels!
    :param str probability_col_name: Name of the column that will store the RF probabilities
    :param str prediction_col_name: Name of the column that will store the RF predictions
    :return: Table with RF columns
    :rtype: Table
    """
    logger.info("Applying RF model.")

    check_ht_fields_for_spark(ht, features + [label])

    # Key by a unique index so rows can be joined back after the Spark round-trip
    index_name = 'rf_idx'
    while index_name in ht.row:
        index_name += '_tmp'
    ht = ht.add_index(name=index_name)

    ht_keys = ht.key
    ht = ht.key_by(index_name)

    df = ht_to_rf_df(ht, features, label, index_name)

    rf_df = rf_model.transform(df)

    # Convert the Spark ML probability vector into a plain array column
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)

    rf_ht = hl.Table.from_spark(
        rf_df.withColumn("probability", to_array(col("probability")))
        .select([index_name, 'probability', 'predictedLabel'])).persist()

    rf_ht = rf_ht.key_by(index_name)

    ht = ht.annotate(
        **{
            probability_col_name: {
                label: rf_ht[ht[index_name]]["probability"][i]
                for i, label in enumerate(get_labels(rf_model))
            },
            prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"]
        })

    ht = ht.key_by(*ht_keys)
    ht = ht.drop(index_name)

    return ht
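
# Hedged usage sketch for apply_rf_model. The model path, feature list and
# label column are placeholders; the trained pipeline comes from wherever the
# RF model was persisted in your run.
def _example_apply_rf(ht: hl.Table, model_path: str) -> hl.Table:
    from pyspark.ml import PipelineModel
    rf_model = PipelineModel.load(model_path)  # trained and saved elsewhere
    return apply_rf_model(
        ht,
        rf_model,
        features=['info_QD', 'info_FS', 'info_MQ'],  # hypothetical features
        label='rf_label',  # hypothetical label column
    )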
def create_quantile_bin_ht(ht: hl.Table,
                           model_id: str,
                           n_bins: int,
                           vqsr: bool = False,
                           overwrite: bool = False) -> hl.Table:
    """
    Creates a table with quantile bin annotations added for a RF run.

    :param ht: Input HT with the scores to bin
    :param model_id: Which data/run hash is being created
    :param n_bins: Number of bins to bin the data into
    :param vqsr: Set to True if `model_id` refers to a VQSR filtering model
    :param overwrite: Should output files be overwritten if present
    :return: Table with quantile bin annotations
    """
    logger.info(f"Annotating {model_id} HT with quantile bins using {n_bins} bins")
    ht = ht.annotate(
        positive_train_site=ht.tp,
        negative_train_site=ht.fp,
        score=ht.rf_probability["TP"],
    )
    # ht = ht.filter(ht.ac_raw > 0)
    bin_ht = create_binned_ht(ht, n_bins)

    return bin_ht
def join_tables(ht: hl.Table, exomes: bool) -> hl.Table:
    '''
    Joins seqr variant table to gnomAD table.

    NOTE: code was written assuming most recent gnomAD release is v3

    :param Table ht: Table with variants downloaded from seqr
    :param bool exomes: Whether to join with gnomAD exomes table or genomes table
    :return: seqr variants Table joined with gnomAD table
    :rtype: hl.Table
    '''
    if exomes:
        # read in exomes table
        gnomad_ht = hl.read_table(
            get_gnomad_liftover_data_path('exomes', version='2.1.1'))
        gnomad_ht = gnomad_ht.select('freq', 'popmax')
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht = gnomad_ht.transmute(
            gnomad_exomes_AC=gnomad_ht.freq[0].AC,
            gnomad_exomes_AN=gnomad_ht.freq[0].AN,
            gnomad_exomes_popmax_AF=gnomad_ht.popmax[0].AF,
            gnomad_exomes_popmax_pop=gnomad_ht.popmax[0].pop)
        gnomad_ht.describe()
    else:
        # read in genomes table
        gnomad_ht = hl.read_table(
            'gs://gnomad-public/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
        )
        gnomad_ht = gnomad_ht.select('freq')
        gnomad_ht = gnomad_ht.transmute(
            gnomad_genomes_AC=gnomad_ht.freq[0].AC,
            gnomad_genomes_AN=gnomad_ht.freq[0].AN)
        gnomad_ht = gnomad_ht.select_globals()
        gnomad_ht.describe()

    ht = ht.annotate(**gnomad_ht[ht.key])
    ht.describe()
    return ht
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.
    Requires a `filtered` field whose element type is the same as the input duplicated samples Table key.

    :param dups_ht: Input HT
    :return: Flattened HT
    """

    def get_dups_to_keep_expr():
        if dups_ht.filtered.dtype.element_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        elif (len(dups_ht.key) == 1) & (
                dups_ht.filtered.dtype.element_type == dups_ht.key[0].dtype):
            return (dups_ht.key[0], False)
        else:
            raise TypeError(
                f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
            )

    dups_ht = dups_ht.annotate(
        dups=hl.array([get_dups_to_keep_expr()]).extend(
            dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by("s")
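
# Hedged toy example of explode_duplicate_samples_ht: one row per duplicate
# group, where the key is the kept sample and `filtered` lists the removed
# ones. Sample names are made up.
def _example_explode_dups() -> None:
    dups_ht = hl.Table.parallelize(
        [{'s': 's1', 'filtered': ['s2', 's3']}],
        hl.tstruct(s=hl.tstr, filtered=hl.tarray(hl.tstr)),
        key='s',
    )
    exploded = explode_duplicate_samples_ht(dups_ht)
    exploded.show()  # s1/False (kept), s2/True, s3/True (filtered)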
def add_global_af(ht: hl.Table, temp: str) -> hl.Table:
    '''
    Adds gnomAD global AF annotation to Table

    :param Table ht: Input Table
    :param str temp: Path to temp bucket (to store intermediary files)
    :return: Table with gnomAD global AF annotation
    :rtype: Table
    '''
    # checkpoint table after completing both gnomAD exomes and gnomAD genomes join
    temp_path = f'{temp}/join.ht'
    ht = ht.checkpoint(temp_path)

    # set gnomAD ACs and ANs to 0 if they are missing after the join
    ht = ht.transmute(
        gnomad_exomes_AC=hl.if_else(hl.is_defined(ht.gnomad_exomes_AC),
                                    ht.gnomad_exomes_AC, 0),
        gnomad_genomes_AC=hl.if_else(hl.is_defined(ht.gnomad_genomes_AC),
                                     ht.gnomad_genomes_AC, 0),
        gnomad_exomes_AN=hl.if_else(hl.is_defined(ht.gnomad_exomes_AN),
                                    ht.gnomad_exomes_AN, 0),
        gnomad_genomes_AN=hl.if_else(hl.is_defined(ht.gnomad_genomes_AN),
                                     ht.gnomad_genomes_AN, 0),
    )
    ht = ht.annotate(gnomad_global_AF=hl.if_else(
        (ht.gnomad_exomes_AN == 0) & (ht.gnomad_genomes_AN == 0),
        0.0,
        hl.float((ht.gnomad_exomes_AC + ht.gnomad_genomes_AC) /
                 (ht.gnomad_exomes_AN + ht.gnomad_genomes_AN))))
    ht.describe()
    return ht
def make_sample_rank_table(phe_ht: hl.Table) -> hl.Table:
    """
    Make table with rank of sample sorted by retention priority
    (lower rank has higher priority).
    It mainly uses two bits of information:
    - cases are prioritised over controls
    - samples are preferred based on the cohort info as follows: chd > ddd > ukbb

    :param phe_ht: Table with sample meta-data annotations (e.g. phenotype, cohort info...)
    :return: Hail Table
    """
    phe_ht = (phe_ht
              .annotate(case_control_rank=hl.int(phe_ht['phe.is_case']),  # 0: control, 1: cases
                        cohort_rank=hl.case()
                        .when(phe_ht.is_ukbb, 10)
                        .when(phe_ht.is_ddd, 100)
                        .when(phe_ht.is_chd, 1000)
                        .or_missing())
              .key_by())

    phe_ht = phe_ht.select('ega_id', 'case_control_rank', 'cohort_rank')

    # sort table (descending)
    tb_rank = phe_ht.order_by(hl.desc(phe_ht.case_control_rank),
                              hl.desc(phe_ht.cohort_rank))

    tb_rank = tb_rank.add_index(name='rank').key_by('ega_id')
    tb_rank = tb_rank.annotate(rank=tb_rank.rank + 1)

    return tb_rank
def filter_kin_ht(
    ht: hl.Table,
    out_summary: io.TextIOWrapper,
    first_degree_pi_hat: float = 0.40,
    grandparent_pi_hat: float = 0.20,
    grandparent_ibd1: float = 0.25,
    grandparent_ibd2: float = 0.15,
) -> hl.Table:
    """
    Filter the kinship table to relationships of grandparents and above.

    :param ht: Table of kinship values (sample pairs i, j with pi_hat, ibd1, ibd2)
    :param out_summary: Summary file with summary statistics and notes
    :param first_degree_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to first degree relatives
    :param grandparent_pi_hat: Minimum pi_hat threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd1: Minimum IBD1 threshold to use to filter the kinship table to grandparents
    :param grandparent_ibd2: Maximum IBD2 threshold to use to filter the kinship table to grandparents
    :return: Table containing only relationships of grandparents and above
    """
    # Filter to anything above the relationship of a grandparent
    ht = ht.filter((ht.pi_hat > first_degree_pi_hat)
                   | ((ht.pi_hat > grandparent_pi_hat)
                      & (ht.ibd1 > grandparent_ibd1)
                      & (ht.ibd2 < grandparent_ibd2)))
    ht = ht.annotate(pair=hl.sorted([ht.i, ht.j]))

    out_summary.write(
        f"NOTE: kinship table was filtered to:\n"
        f"(kin > {first_degree_pi_hat}) or "
        f"(kin > {grandparent_pi_hat} and IBD1 > {grandparent_ibd1} and IBD2 < {grandparent_ibd2})\n"
    )
    out_summary.write(
        "relationships not meeting these criteria were not evaluated\n\n")

    return ht
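
# Hedged usage sketch for filter_kin_ht with the default thresholds; the paths
# are placeholders. The input is assumed to carry i, j, pi_hat, ibd1 and ibd2.
def _example_filter_kin(kin_ht_path: str, summary_path: str) -> hl.Table:
    with open(summary_path, "w") as out_summary:
        kin_ht = hl.read_table(kin_ht_path)
        return filter_kin_ht(kin_ht, out_summary)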
def annotate_relatedness(
    relatedness_ht: hl.Table,
    first_degree_kin_thresholds: Tuple[float, float] = (0.1767767, 0.4),
    second_degree_kin_cutoff: float = 0.1,
    ibd0_0_max: float = 0.05,
) -> hl.Table:
    """
    Annotate a kinship Table with a `relationship` classification and record the thresholds used as globals.

    :param relatedness_ht: Table with kin, ibd0, ibd1 and ibd2 annotations for each sample pair
    :param first_degree_kin_thresholds: (min, max) kinship bounds used to classify first-degree relatives
    :param second_degree_kin_cutoff: Minimum kinship used to classify second-degree relatives
    :param ibd0_0_max: Maximum IBD0 used when distinguishing parent-child pairs
    :return: Table with `relationship` annotation and threshold globals
    """
    relatedness_ht = relatedness_ht.annotate(
        relationship=get_relationship_expr(
            kin_expr=relatedness_ht.kin,
            ibd0_expr=relatedness_ht.ibd0,
            ibd1_expr=relatedness_ht.ibd1,
            ibd2_expr=relatedness_ht.ibd2,
            first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
            second_degree_min_kin=second_degree_kin_cutoff,
            ibd0_0_max=ibd0_0_max,
        )
    )
    relatedness_ht = relatedness_ht.annotate_globals(
        min_individual_maf=0.01,
        min_emission_kinship=0.05,
        ibd0_0_max=ibd0_0_max,
        second_degree_kin_cutoff=second_degree_kin_cutoff,
        first_degree_kin_thresholds=tuple(first_degree_kin_thresholds),
    )
    return relatedness_ht
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. This table has to be keyed by an interval.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    # Start from the full MT so callrate_mt is defined even when no filter applies
    callrate_mt = mt
    if autosomes_only:
        callrate_mt = filter_to_autosomes(callrate_mt)

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(
        _interval_key=intervals_ht.index(
            callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(interval_info=hl.struct(
        **intervals_ht[callrate_mt.row_key]))
    return callrate_mt
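
# Hedged usage sketch for compute_callrate_mt, e.g. as input to platform
# imputation. The paths are placeholders; the interval Table is assumed to be
# keyed by an `interval` field of type Interval, per the note above.
def _example_callrate(mt_path: str, intervals_path: str, out_path: str) -> None:
    mt = hl.read_matrix_table(mt_path)
    intervals_ht = hl.read_table(intervals_path)
    callrate_mt = compute_callrate_mt(mt, intervals_ht)
    callrate_mt.write(out_path, overwrite=True)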
def default_generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    sex_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Table:
    """
    This is meant as a default wrapper for `generate_sib_stats_expr`. It returns a hail
    table with counts of variants shared by pairs of siblings in `relatedness_ht`.

    This function takes a hail Table with a row for each pair of individuals i, j in the
    data that are related (it's OK to have unrelated samples too). The `relationship_col`
    should be a column specifying the relationship between each pair of samples as defined
    by the constants in `gnomad.utils.relatedness`. This relationship_col will be used to
    filter to only pairs of samples that are annotated as `SIBLINGS`.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param sex_ht: A Table containing sex information for the samples
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module constants.
    :return: A Table with the sibling shared variant counts
    """
    sex_ht = sex_ht.annotate(
        is_female=hl.case()
        .when(sex_ht.sex_karyotype == "XX", True)
        .when(sex_ht.sex_karyotype == "XY", False)
        .or_missing()
    )

    # TODO: Change to use SIBLINGS constant when relatedness PR goes in
    sib_ht = relatedness_ht.filter(relatedness_ht[relationship_col] == "Siblings")
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda s: hl.agg.collect_as_set(s), [sib_ht[i_col].s, sib_ht[j_col].s]
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))
    mt = annotate_adj(mt)

    mt = mt.annotate_cols(is_female=sex_ht[mt.s].is_female)

    sib_stats_ht = mt.select_rows(
        **generate_sib_stats_expr(
            mt,
            sib_ht,
            i_col=i_col,
            j_col=j_col,
            strata={"raw": True, "adj": mt.adj},
            is_female=mt.is_female,
        )
    ).rows()

    return sib_stats_ht
def compute_stratified_metrics_filter(ht: hl.Table,
                                      qc_metrics: List[str],
                                      strata: List[str] = None) -> hl.Table:
    """
    Compute median, MAD, and upper and lower thresholds for each metric used in pop- and platform-specific outlier filtering

    :param Table ht: HT containing relevant sample QC metric annotations
    :param list qc_metrics: list of metrics for which to compute the critical values for filtering outliers
    :param list of str strata: List of annotations used for stratification. These metrics should be discrete types!
    :return: Table grouped by pop and platform, with upper and lower threshold values computed for each sample QC metric
    :rtype: Table
    """

    def make_pop_filters_expr(ht: hl.Table,
                              qc_metrics: List[str]) -> hl.expr.SetExpression:
        return hl.set(
            hl.filter(lambda x: hl.is_defined(x), [
                hl.or_missing(ht[f'fail_{metric}'], metric)
                for metric in qc_metrics
            ]))

    # Guard against strata=None so the select below doesn't fail
    if strata is None:
        strata = []

    ht = ht.select(*strata,
                   **ht.sample_qc.select(*qc_metrics)).key_by('s').persist()

    def get_metric_expr(ht, metric):
        metric_values = hl.agg.collect(ht[metric])
        metric_median = hl.median(metric_values)
        metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
        return hl.struct(
            median=metric_median,
            mad=metric_mad,
            # callrate gets fixed bounds instead of median +/- 4 * MAD
            upper=metric_median + 4 * metric_mad if metric != 'callrate' else 1,
            lower=metric_median - 4 * metric_mad if metric != 'callrate' else 0.99)

    agg_expr = hl.struct(
        **{metric: get_metric_expr(ht, metric) for metric in qc_metrics})
    if strata:
        ht = ht.annotate_globals(metrics_stats=ht.aggregate(
            hl.agg.group_by(hl.tuple([ht[x] for x in strata]), agg_expr)))
    else:
        ht = ht.annotate_globals(metrics_stats={(): ht.aggregate(agg_expr)})

    strata_exp = hl.tuple([ht[x] for x in strata]) if strata else hl.tuple([])

    fail_exprs = {
        f'fail_{metric}':
        (ht[metric] >= ht.metrics_stats[strata_exp][metric].upper) |
        (ht[metric] <= ht.metrics_stats[strata_exp][metric].lower)
        for metric in qc_metrics
    }
    ht = ht.transmute(**fail_exprs)
    pop_platform_filters = make_pop_filters_expr(ht, qc_metrics)
    return ht.annotate(pop_platform_filters=pop_platform_filters)
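
# Hedged usage sketch: flag outliers on a few sample QC metrics, stratified by
# inferred population and platform. The field names are placeholders; assumes
# the metrics live in `ht.sample_qc` and the strata fields are top-level.
def _example_stratified_filter(ht: hl.Table) -> hl.Table:
    return compute_stratified_metrics_filter(
        ht,
        qc_metrics=['n_snp', 'r_ti_tv', 'r_insertion_deletion'],
        strata=['qc_pop', 'qc_platform'],
    )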
def add_rank(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    subrank_expr: Optional[Dict[str, hl.expr.BooleanExpression]] = None,
) -> hl.Table:
    """
    Adds rank based on the `score_expr`. Rank is added for snvs and indels separately.

    If one or more `subrank_expr` are provided, then subrank is added based on all
    sites for which the boolean expression is true.

    :param ht: input Hail Table containing variants (with QC annotations) to be ranked
    :param score_expr: the Table annotation by which ranking should be scored
    :param subrank_expr: Any subranking to be added in the form name_of_subrank: subrank_filtering_expr
    :return: Table with rankings added
    """
    key = ht.key
    if subrank_expr is None:
        subrank_expr = {}

    temp_expr = {"_score": score_expr}
    temp_expr.update({f"_{name}": expr for name, expr in subrank_expr.items()})
    rank_ht = ht.select(
        **temp_expr, is_snv=hl.is_snp(ht.alleles[0], ht.alleles[1]))

    rank_ht = rank_ht.key_by("_score").persist()
    scan_expr = {
        "rank": hl.cond(
            rank_ht.is_snv,
            hl.scan.count_where(rank_ht.is_snv),
            hl.scan.count_where(~rank_ht.is_snv),
        )
    }
    scan_expr.update(
        {
            name: hl.or_missing(
                rank_ht[f"_{name}"],
                hl.cond(
                    rank_ht.is_snv,
                    hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
                    hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
                ),
            )
            for name in subrank_expr
        }
    )
    rank_ht = rank_ht.annotate(**scan_expr)

    rank_ht = rank_ht.key_by(*key).persist()
    rank_ht = rank_ht.select(*scan_expr.keys())

    ht = ht.annotate(**rank_ht[key])
    return ht
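
# Hedged usage sketch of add_rank: rank variants by an RF TP probability with a
# subrank over bi-allelic sites only. `rf_probability` and `was_split` are
# assumed annotations, not requirements of the function itself.
def _example_add_rank(ht: hl.Table) -> hl.Table:
    return add_rank(
        ht,
        score_expr=1 - ht.rf_probability['TP'],  # lower score = better rank
        subrank_expr={'biallelic_rank': ~ht.was_split},
    )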
def densify_sites(
    mt: hl.MatrixTable,
    sites_ht: hl.Table,
    last_END_positions_ht: hl.Table,
    semi_join_rows: bool = True,
) -> hl.MatrixTable:
    """
    Creates a dense version of the input sparse MT at the sites in `sites_ht`, reading the minimal amount of data required.

    Note that only rows that appear both in `mt` and `sites_ht` are returned.

    :param mt: Input sparse MT
    :param sites_ht: Desired sites to densify
    :param last_END_positions_ht: Table storing positions of the furthest ref block (END tag)
    :param semi_join_rows: Whether to filter the MT rows based on semi-join (default, better if sites_ht is large) or based on filter_intervals (better if sites_ht only contains a few sites)
    :return: Dense MT filtered to the sites in `sites_ht`
    """
    logger.info("Computing intervals to densify from sites Table.")
    sites_ht = sites_ht.key_by("locus")
    sites_ht = sites_ht.annotate(
        interval=hl.locus_interval(
            sites_ht.locus.contig,
            last_END_positions_ht[sites_ht.key].last_END_position,
            end=sites_ht.locus.position,
            includes_end=True,
            reference_genome=sites_ht.locus.dtype.reference_genome,
        )
    )
    sites_ht = sites_ht.filter(hl.is_defined(sites_ht.interval))

    if semi_join_rows:
        mt = mt.filter_rows(hl.is_defined(sites_ht.key_by("interval")[mt.locus]))
    else:
        logger.info("Collecting intervals to densify.")
        intervals = sites_ht.interval.collect()

        print(
            "Found {0} intervals, totalling {1} bp in the dense Matrix.".format(
                len(intervals),
                sum(
                    [
                        interval_length(interval)
                        for interval in union_intervals(intervals)
                    ]
                ),
            )
        )

        mt = hl.filter_intervals(mt, intervals)

    mt = hl.experimental.densify(mt)

    return mt.filter_rows(hl.is_defined(sites_ht[mt.locus]))
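
# Hedged usage sketch for densify_sites. The paths are placeholders; the
# last-END-position Table is assumed to have been precomputed from the same
# sparse MT (e.g. via a compute-last-ref-block-end step in the pipeline).
def _example_densify(sparse_mt_path: str, sites_path: str,
                     last_end_path: str) -> hl.MatrixTable:
    mt = hl.read_matrix_table(sparse_mt_path)
    sites_ht = hl.read_table(sites_path)
    last_END_ht = hl.read_table(last_end_path)
    return densify_sites(mt, sites_ht, last_END_ht)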
def compute_fisher_exact(tb: hl.Table,
                         n_cases_col: str,
                         n_control_col: str,
                         total_cases_col: str,
                         total_controls_col: str,
                         correct_total_counts: bool,
                         extra_fields: dict,
                         root_col_name: str = 'fet') -> hl.Table:
    """
    Perform two-sided Fisher Exact test. Add extra annotations (if any).

    :param tb: Hail Table
    :param n_cases_col: field name with number of (affected) cases
    :param n_control_col: field name with number of (affected) controls
    :param total_cases_col: field name with total number of cases
    :param total_controls_col: field name with total number of controls
    :param correct_total_counts: should the total numbers (case/control) be corrected to avoid duplicated counting?
    :param extra_fields: Extra fields (must be a dict) to be annotated
    :param root_col_name: field to be annotated with test results
    :return: Hail Table with Fisher Exact test results.
    """
    # compute fisher exact
    if correct_total_counts:
        fet = hl.fisher_exact_test(
            c1=hl.int32(tb[n_cases_col]),
            c2=hl.int32(tb[n_control_col]),
            c3=hl.int32(tb[total_cases_col]) - hl.int32(tb[n_cases_col]),
            c4=hl.int32(tb[total_controls_col]) - hl.int32(tb[n_control_col]))
    else:
        fet = hl.fisher_exact_test(
            c1=hl.int32(tb[n_cases_col]),
            c2=hl.int32(tb[n_control_col]),
            c3=hl.int32(tb[total_cases_col]),
            c4=hl.int32(tb[total_controls_col]))

    tb = tb.annotate(**{root_col_name: fet}).flatten()

    if len(extra_fields) == 0:
        return tb
    else:
        return tb.annotate(**extra_fields)
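
# Hedged toy example of compute_fisher_exact (the counts are made up). With
# correct_total_counts=True the carrier counts are subtracted from the totals
# so the 2x2 table cells don't double-count carriers.
def _example_fisher() -> None:
    tb = hl.Table.parallelize(
        [{'gene': 'G1', 'n_cases': 8, 'n_controls': 2,
          'total_cases': 100, 'total_controls': 100}],
        hl.tstruct(gene=hl.tstr, n_cases=hl.tint32, n_controls=hl.tint32,
                   total_cases=hl.tint32, total_controls=hl.tint32),
    )
    tb = compute_fisher_exact(tb, 'n_cases', 'n_controls', 'total_cases',
                              'total_controls', correct_total_counts=True,
                              extra_fields={})
    tb.show()  # flattened fields: fet.p_value, fet.odds_ratio, ...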
def load_covid_data(all_samples_ht: hl.Table, covid_data_path: str, wave: str = '01'):
    print(f'Loading COVID wave {wave}...')
    covid_ht = hl.import_table(covid_data_path, delimiter='\t', missing='',
                               impute=True, key='eid')
    covid_ht = covid_ht.group_by('eid').aggregate(
        origin=hl.agg.any(covid_ht.origin == 1),
        result=hl.agg.any(covid_ht.result == 1),
        inpatient=hl.agg.any(covid_ht.reqorg == 1),
    )
    # TODO: add aoo parse to separate trait_type (covid_quantitative?)
    # dob = load_dob_ht(pre_phesant_tsv_path)[ht.key].date_of_birth
    # ht = ht.annotate(aoo=hl.or_missing(ht.result == 1, hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT') - dob),
    #                  specdate=hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')).drop('specdate')
    ht = all_samples_ht.annotate(**covid_ht[all_samples_ht.key])

    centers = hl.literal(ENGLAND_RECRUITMENT_CENTERS)
    analyses = {
        'B1_v2': hl.or_missing(ht.result, ht.inpatient),  # fka ANA2
        'B1_v2_origin': hl.or_missing(ht.result, ht.origin),  # fka ANA2
        'C2_v2': hl.or_else(ht.result, False),  # fka ANA5
        'C2_v2_england_controls': hl.or_missing(  # fka ANA5_england_controls
            centers.contains(ht.recruitment_center),
            hl.or_else(ht.result, False)),
        'C1_v2': ht.result,  # fka ANA5_strict
        'B2_v2': hl.or_else(ht.result & ht.inpatient, False),  # fka ANA6
        'B2_v2_origin': hl.or_else(ht.result & ht.origin, False)  # fka ANA6
    }
    analysis_names = {
        'B1_v2': 'Hospitalized vs non-hospitalized (among COVID-19 positive)',  # fka ANA2
        'B1_v2_origin': 'Hospitalized vs non-hospitalized (among COVID-19 positive; old definition using "origin" field)',  # fka ANA2
        'C2_v2': 'COVID-19 positive (controls include untested)',  # fka ANA5
        'C2_v2_england_controls': 'COVID-19 positive (controls include untested), only patients from centers in England',  # fka ANA5_england_controls
        'C1_v2': 'COVID-19 positive (controls only COVID-19 negative)',  # fka ANA5_strict
        'B2_v2': 'Hospitalized vs non-hospitalized (controls include untested)',  # ANA6
        'B2_v2_origin': 'Hospitalized vs non-hospitalized (controls include untested; old definition using "origin" field)'  # ANA6
    }
    assert set(analyses.keys()) == set(analysis_names.keys())
    ht = ht.select(**analyses)

    mt = filter_and_annotate_ukb_data(ht, lambda k, v: True,
                                      annotate_with_showcase=False,
                                      format_col_name=lambda x: x)
    mt = mt.key_cols_by(trait_type='categorical', phenocode='COVID19',
                        pheno_sex='both_sexes', coding=mt.phenocode,
                        modifier=wave)
    mt = mt.annotate_cols(description=hl.literal(analysis_names)[mt.coding])
    mt.annotate_cols(
        n_cases=hl.agg.count_where(mt.value == 1.0),
        n_controls=hl.agg.count_where(mt.value == 0.0)
    ).cols().show()
    return mt
def prepare_ht_export(ht: hl.Table) -> hl.Table:
    """
    Prepare the Table for VCF export: build the INFO dictionary and annotate the `info` struct.

    :param ht: Input Table
    :return: Table with `info`, `filters`, `rsid` and `qual` fields selected
    """
    subset_list = ['gnomad']
    for subset in subset_list:
        INFO_DICT.update(make_info_dict(subset, dict(group=GROUPS)))
        INFO_DICT.update(make_info_dict(subset, dict(group=GROUPS, pop=POPS)))

    new_info_dict = {
        i.replace('gnomad_', '').replace('_adj', ''): j
        for i, j in INFO_DICT.items()
    }

    ht = ht.annotate(info=hl.struct(**make_info_expr(ht)))
    ht = ht.annotate(info=ht.info.annotate(**unfurl_nested_annotations(ht)))
    # ht = ht.select('info', 'filters', 'rsid', 'qual', 'vep')
    ht = ht.select('info', 'filters', 'rsid', 'qual')

    # NOTE: header_dict is built here for VCF export metadata but is not returned
    header_dict = {'info': new_info_dict}
    # 'filter': make_filter_dict(ht)}

    return ht
def pick_transcript(ht: hl.Table, csq_array: str) -> hl.Table:
    # TODO: This function could be improved by scanning the array (just once) and sorting it as suggested here:
    # TODO: https://hail.zulipchat.com/#narrow/stream/123010-Hail-0.2E2.20support/topic/pick.20transcript.20from.20array
    # TODO: /near/190400193
    """
    Annotate an extra field (tx) with the selected transcript.

    This function will pick one transcript per variant/consequence based on the impact
    of the variant in the transcript (from more severe to less severe).

    :param ht: Hail table with VEP annotations
    :param csq_array: Parsed CSQ field name. Expected to be an array of dicts; each transcript is a dict.
    :return: Hail table with an annotated extra field (tx). The transcript is selected from the array based on a set of pre-defined criteria.
    """
    # Set the transcript (tx) field initially to missing and update it
    # sequentially based on a set of pre-defined criteria (order matters)
    ht = ht.annotate(tx=ht[csq_array].find(lambda x: False))

    # getting current keys from dict
    keys = ht[csq_array].take(1)[0][0]

    # select tx if LoF == 'HC'
    if 'LoF' in keys:
        ht = ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx),
                       ht[csq_array].find(lambda x: x['LoF'] == 'HC'),
                       ht.tx))

    # select transcript based on the consequence impact (high -> moderate -> low)
    if 'IMPACT' in keys:
        # select tx if IMPACT == HIGH
        ht = ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx),
                       ht[csq_array].find(lambda x: x['IMPACT'] == 'HIGH'),
                       ht.tx))
        # select tx if IMPACT == MODERATE
        ht = ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx),
                       ht[csq_array].find(lambda x: x['IMPACT'] == 'MODERATE'),
                       ht.tx))
        # select tx if IMPACT == LOW
        ht = ht.annotate(
            tx=hl.cond(hl.is_missing(ht.tx),
                       ht[csq_array].find(lambda x: x['IMPACT'] == 'LOW'),
                       ht.tx))

    # select tx if CANONICAL
    ht = ht.annotate(
        tx=hl.cond(hl.is_missing(ht.tx),
                   ht[csq_array].find(lambda x: x['CANONICAL'] == 'YES'),
                   ht.tx))

    # if tx is still missing, set tx as the first annotated transcript
    ht = ht.annotate(
        tx=hl.cond(hl.is_missing(ht.tx), ht[csq_array][0], ht.tx))

    return ht
def annotate_related_pairs(related_pairs: hl.Table, index_col: str) -> hl.Table:
    """
    Annotate one side of a related-pairs Table with case rank and mean depth.

    NOTE: relies on `case_parents`, `meta_ht` and `sample_qc_ht` from the enclosing scope.

    :param related_pairs: Table of related sample pairs
    :param index_col: Which side of the pair to annotate (e.g. 'i' or 'j')
    :return: Annotated Table
    """
    related_pairs = related_pairs.key_by(**related_pairs[index_col])
    related_pairs = related_pairs.filter(
        hl.is_missing(case_parents[related_pairs.key]))
    return related_pairs.annotate(
        **{
            index_col: related_pairs[index_col].annotate(
                case_rank=hl.or_else(
                    hl.int(meta_ht[related_pairs.key].is_case), -1),
                dp_mean=hl.or_else(
                    sample_qc_ht[related_pairs.key].sample_qc.dp_stats.mean,
                    -1.0))
        }).key_by()
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Flattens the result of `filter_duplicate_samples`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered` indicating which of the duplicated samples was kept.

    :param dups_ht: Input HT
    :return: Flattened HT
    """
    dups_ht = dups_ht.annotate(
        dups=hl.array([(dups_ht.key, False)]).extend(
            dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode('dups')
    dups_ht = dups_ht.key_by()
    return dups_ht.select(s=dups_ht.dups[0],
                          dup_filtered=dups_ht.dups[1]).key_by('s')
def compute_grouped_binned_ht(
    bin_ht: hl.Table,
    checkpoint_path: Optional[str] = None,
) -> hl.GroupedTable:
    """
    Group a Table that has been annotated with bins (`compute_ranked_bin` or `create_binned_ht`).

    The table will be grouped by bin_id (bin, biallelic, etc.), contig, snv, bi_allelic and singleton.

    .. note::

        If performing an aggregation following this grouping (such as `score_bin_agg`) then the aggregation
        function will need to use `ht._parent` to get the origin Table from the GroupedTable for the aggregation

    :param bin_ht: Input Table with a `bin_id` annotation
    :param checkpoint_path: If provided an intermediate checkpoint table is created with all required annotations before shuffling.
    :return: Table grouped by bin(s)
    """
    # Explode the rank table by bin_id
    bin_ht = bin_ht.annotate(
        bin_groups=hl.array(
            [
                hl.Struct(bin_id=bin_name, bin=bin_ht[bin_name])
                for bin_name in bin_ht.bin_group_variant_counts
            ]
        )
    )
    bin_ht = bin_ht.explode(bin_ht.bin_groups)
    bin_ht = bin_ht.transmute(
        bin_id=bin_ht.bin_groups.bin_id, bin=bin_ht.bin_groups.bin
    )
    bin_ht = bin_ht.filter(hl.is_defined(bin_ht.bin))

    if checkpoint_path is not None:
        bin_ht = bin_ht.checkpoint(checkpoint_path, overwrite=True)
    else:
        bin_ht = bin_ht.persist()

    # Group by bin_id, bin and additional stratification desired and compute QC metrics per bin
    return bin_ht.group_by(
        bin_id=bin_ht.bin_id,
        contig=bin_ht.locus.contig,
        snv=hl.is_snp(bin_ht.alleles[0], bin_ht.alleles[1]),
        bi_allelic=~bin_ht.was_split,
        singleton=bin_ht.singleton,
        release_adj=bin_ht.ac > 0,
        bin=bin_ht.bin,
    )._set_buffer_size(20000)
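
# Hedged sketch of aggregating after compute_grouped_binned_ht. Per the note in
# the docstring, expressions over row fields must go through the grouped
# table's `_parent`; `score` is an assumed annotation from the binning step.
def _example_grouped_agg(bin_ht: hl.Table) -> hl.Table:
    grouped = compute_grouped_binned_ht(bin_ht)
    return grouped.aggregate(
        n=hl.agg.count(),
        min_score=hl.agg.min(grouped._parent.score),
    )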
def filter_ht_for_plink(ht: hl.Table,
                        n_samples: int,
                        min_call_rate: float = 0.95,
                        variants_per_mac_category: int = 2000,
                        variants_per_maf_category: int = 10000):
    """
    Downsample a sites Table for PLINK export: filter to well-called autosomal
    variants, then keep roughly a fixed number of variants per MAC/MAF category.
    """
    from gnomad.utils.filtering import filter_to_autosomes

    ht = filter_to_autosomes(ht)
    ht = ht.filter((ht.call_stats.AN >= n_samples * 2 * min_call_rate) &
                   (ht.call_stats.AC > 0))
    ht = ht.annotate(mac_category=mac_category_case_builder(ht.call_stats))
    category_counter = ht.aggregate(hl.agg.counter(ht.mac_category))
    print(category_counter)
    ht = ht.annotate_globals(category_counter=category_counter)
    # positive categories are MAC bins; the rest are MAF bins
    return ht.filter(
        hl.rand_unif(0, 1) < hl.cond(ht.mac_category >= 1,
                                     variants_per_mac_category,
                                     variants_per_maf_category) /
        ht.category_counter[ht.mac_category])
def filter_mt_to_trios(mt: hl.MatrixTable, fam_ht: hl.Table) -> hl.MatrixTable:
    """
    Filters a MatrixTable to a set of trios in `fam_ht`, filters to autosomes, and annotates with adj.

    :param mt: A Matrix Table to filter to only trios
    :param fam_ht: A Table of trios to filter to, loaded using `hl.import_fam`
    :return: A MT filtered to trios and adj annotated
    """
    # Filter MT to samples present in any of the trios
    fam_ht = fam_ht.annotate(
        fam_members=[fam_ht.id, fam_ht.pat_id, fam_ht.mat_id])
    fam_ht = fam_ht.explode("fam_members", name="s")
    fam_ht = fam_ht.key_by("s").select().distinct()

    mt = mt.filter_cols(hl.is_defined(fam_ht[mt.col_key]))
    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)

    return mt
def collapse_small_pops(ht: hl.Table, min_pop_size: int) -> hl.Table:
    """
    Collapses (sub)populations that are too small for release into others.

    When collapsing subpops, the name for the other category is composed of "o" + 2 first letters of the superpop.
    The original RF population assignments are kept in the `rf_pop` and `rf_subpop` columns.

    :param ht: Input Table
    :param min_pop_size: Minimum number of release samples a (sub)population needs in order to be kept
    :return: Table with small populations collapsed
    :rtype: Table
    """

    def get_subpop_oth(pop: str):
        for superpop, subpops in SUBPOPS.items():
            if pop.upper() in subpops:
                return "o" + superpop[:2].lower()
        raise ValueError(
            f"Subpopulation {pop} not found in possible subpopulations.")

    ht = ht.persist()
    pop_counts = ht.aggregate(hl.agg.filter(ht.release, hl.agg.counter(ht.pop)))
    pop_collapse = {
        pop: "oth"
        for pop, n in pop_counts.items() if n < min_pop_size
    }
    pop_collapse = hl.literal(pop_collapse) if pop_collapse else hl.empty_dict(
        hl.tstr, hl.tstr)

    subpop_counts = ht.aggregate(
        hl.agg.filter(ht.release, hl.agg.counter(ht.subpop)))
    subpop_collapse = {
        subpop: get_subpop_oth(subpop)
        for subpop, n in subpop_counts.items() if n < min_pop_size
    }
    subpop_collapse = hl.literal(
        subpop_collapse) if subpop_collapse else hl.empty_dict(
            hl.tstr, hl.tstr)

    return ht.annotate(pop=pop_collapse.get(ht.pop, ht.pop),
                       subpop=subpop_collapse.get(ht.subpop, ht.subpop),
                       rf_pop=ht.pop,
                       rf_subpop=ht.subpop)
def annotate_from_dict(ht: hl.Table, dict_field: str) -> hl.Table:
    """
    Expand a dict field and add its keys as new top-level fields.

    :param ht: HailTable
    :param dict_field: The dict field to be expanded
    :return: Annotated HailTable
    """
    # retrieve dict keys to be annotated as fields
    dict_keys = ht[dict_field].keys().take(1)[0]
    # print(dict_keys)

    ht = ht.annotate(
        **{k: ht[dict_field].get(k) for k in dict_keys})

    return ht