def make_perm_filters_expr(ht: hl.Table, data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: syndip will remain dropped with respect to permissions, but all possible
    QC measures will still be calculated

    :param Table ht: input Table
    :param str data_type: 'exomes' or 'genomes'
    :return: Set of permission filter names that the sample fails
    :rtype: SetExpression
    """
    if data_type == 'genomes':
        perm_filters = {'not_releasable': ~ht.releasable_2_1}
    else:
        perm_filters = {
            'tcga_tumor': ht.tcga_tumor,
            'tcga_barcode': ht.tcga_weird_barcode,
            'tcga_below_30': ht.tcga_below_30,
            'specific_exclusion': ht.specific_exclusion,
            'esp': ht.esp,
            'not_releasable': ht.non_releasable,
            'syndip': ht.syndip
        }
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in perm_filters.items()
        ]))
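A minimal usage sketch (the toy `meta_ht` and its fields below are illustrative; only the genomes branch, which needs just `releasable_2_1`, is exercised):

import hail as hl

meta_ht = hl.Table.parallelize(
    [{'s': 'S1', 'releasable_2_1': True}, {'s': 'S2', 'releasable_2_1': False}],
    hl.tstruct(s=hl.tstr, releasable_2_1=hl.tbool), key='s')
meta_ht = meta_ht.annotate(perm_filters=make_perm_filters_expr(meta_ht, 'genomes'))
# samples passing all permission filters have an empty set
meta_ht.filter(hl.len(meta_ht.perm_filters) == 0).show()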
def set_female_y_metrics_to_na_expr(
        t: Union[hl.Table, hl.MatrixTable]) -> hl.expr.ArrayExpression:
    """
    Set Y-variant frequency callstats for female-specific metrics to missing structs.

    .. note:: Requires freq, freq_meta, and freq_index_dict annotations to be
        present in the Table or MatrixTable

    :param t: Table or MatrixTable for which to adjust female metrics
    :return: Hail array expression to set female Y-variant metrics to missing values
    """
    female_idx = hl.map(
        lambda x: t.freq_index_dict[x],
        hl.filter(lambda x: x.contains("XX"), t.freq_index_dict.keys()),
    )
    freq_idx_range = hl.range(hl.len(t.freq_meta))

    new_freq_expr = hl.if_else(
        (t.locus.in_y_nonpar() | t.locus.in_y_par()),
        hl.map(
            lambda x: hl.if_else(female_idx.contains(x),
                                 missing_callstats_expr(), t.freq[x]),
            freq_idx_range,
        ),
        t.freq,
    )
    return new_freq_expr
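A hedged usage sketch: it assumes a Table `ht` that already carries the `freq`, `freq_meta`, `freq_index_dict`, and `locus` annotations required by the note above, with `missing_callstats_expr` in scope:

# replace `freq` in place; female entries at Y variants become missing structs
ht = ht.annotate(freq=set_female_y_metrics_to_na_expr(ht))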
def make_hard_filters_expr(ht: hl.Table, data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: additional metadata in Kristen's import file is hard-coded

    :param Table ht: input Table
    :param str data_type: 'exomes' or 'genomes'
    :return: Set of hard filter names that the sample fails
    :rtype: SetExpression
    """
    hard_filters = {
        'contamination': ht.freemix > 0.05,
        'callrate': ht.callrate < 0.85,
        'chimera': ht.pct_chimeras > 0.05,
        'ambiguous_sex': ht.ambiguous_sex
    }
    if data_type == 'exomes':
        hard_filters.update({
            'coverage': ht.mean_chr20_coverage == 0,
            'sex_aneuploidy': ht.sex_aneuploidy
        })
    else:
        hard_filters.update({
            'coverage': ht.mean_dp < 15,
            'insert_size': ht.median_insert_size < 250
        })
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in hard_filters.items()
        ]))
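A runnable sketch against a toy genomes metadata table (field values chosen so that exactly one filter fires):

import hail as hl

ht = hl.Table.parallelize(
    [{'s': 'S1', 'freemix': 0.01, 'callrate': 0.99, 'pct_chimeras': 0.01,
      'ambiguous_sex': False, 'mean_dp': 10.0, 'median_insert_size': 300.0}],
    hl.tstruct(s=hl.tstr, freemix=hl.tfloat64, callrate=hl.tfloat64,
               pct_chimeras=hl.tfloat64, ambiguous_sex=hl.tbool,
               mean_dp=hl.tfloat64, median_insert_size=hl.tfloat64), key='s')
ht = ht.annotate(hard_filters=make_hard_filters_expr(ht, 'genomes'))
ht.show()  # S1 fails only 'coverage' (mean_dp < 15)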
def make_pop_filters_expr(ht: hl.Table,
                          qc_metrics: List[str]) -> hl.expr.SetExpression:
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(ht[f'fail_{metric}'], metric)
            for metric in qc_metrics
        ]))
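The `fail_<metric>` boolean columns are expected to exist upstream; a sketch of how they are typically produced (the metric name and `n_snp_upper_bound` below are illustrative, not from this codebase):

# hypothetical: flag samples whose n_snp exceeds a bound computed elsewhere
ht = ht.annotate(fail_n_snp=ht.sample_qc.n_snp > n_snp_upper_bound)
ht = ht.annotate(pop_platform_filters=make_pop_filters_expr(ht, ['n_snp']))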
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.StructExpression:
    """
    Calculates popmax: the population with the highest adj AF among the given populations

    :param ArrayExpression freq: ArrayExpression of Structs with ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Popmax struct (missing if no population qualifies)
    :rtype: StructExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj')
        & pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0), freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
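A hedged usage sketch: `ht` is assumed to carry gnomAD-style `freq`/`freq_meta` annotations, and the population labels are illustrative:

POPS = {'afr', 'amr', 'eas', 'nfe', 'sas'}  # illustrative population set
ht = ht.annotate(popmax=add_popmax_expr(ht.freq, ht.freq_meta, POPS))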
def make_filters_expr(ht: hl.Table,
                      qc_metrics: Iterable[str]) -> hl.expr.SetExpression:
    return hl.set(
        hl.filter(
            lambda x: hl.is_defined(x),
            [
                hl.or_missing(ht[f"fail_{metric}"], metric)
                for metric in qc_metrics
            ],
        ))
def get_expression_proportion(tx_table, tissues_to_filter, gene_maximum_ht):
    if tissues_to_filter:
        print("Filtering tissues:", tissues_to_filter)
        tx_table = tx_table.drop(*tissues_to_filter)

    remaining_tissue_columns = list(
        set(tx_table.row) -
        {'locus', 'alleles', 'csq', 'ensg', 'symbol', 'lof', 'lof_flag'})

    tx_table = tx_table.annotate(tx_expression={
        tissue_id: tx_table[tissue_id]
        for tissue_id in remaining_tissue_columns
    })

    tx_table = tx_table.key_by('ensg').join(gene_maximum_ht.key_by("ensg"))

    expression_proportion_table = tx_table.annotate(
        expression_proportion_dict={
            tissue_id: tx_table.tx_expression[tissue_id] /
            tx_table.gene_maximum_dict[tissue_id]
            for tissue_id in remaining_tissue_columns
        })

    columns_to_drop = list(
        set(expression_proportion_table.row) - {
            'locus', 'alleles', 'csq', 'ensg', 'symbol', 'lof', 'lof_flag',
            'expression_proportion_dict'
        })
    expression_proportion_table = expression_proportion_table.drop(
        *columns_to_drop)

    expression_proportion_table = expression_proportion_table.annotate(
        **{
            tissue_id:
            expression_proportion_table.expression_proportion_dict[tissue_id]
            for tissue_id in remaining_tissue_columns
        })

    expression_proportion_table = expression_proportion_table.annotate(
        mean_proportion=hl.mean(hl.filter(
            lambda e: ~hl.is_nan(e), [
                expression_proportion_table[tissue_id]
                for tissue_id in remaining_tissue_columns
            ]),
            filter_missing=True))

    expression_proportion_table = expression_proportion_table.drop(
        expression_proportion_table.expression_proportion_dict).key_by(
            'locus', 'alleles', 'ensg')

    return expression_proportion_table
def add_variant_type(alt_alleles: hl.expr.ArrayExpression) -> hl.expr.StructExpression:
    """
    Get Struct of variant_type and n_alt_alleles from an ArrayExpression of Strings
    containing all alleles: the reference allele first, followed by the alternates
    """
    ref = alt_alleles[0]
    alts = alt_alleles[1:]
    non_star_alleles = hl.filter(lambda a: a != '*', alts)
    return hl.struct(
        variant_type=hl.cond(
            hl.all(lambda a: hl.is_snp(ref, a), non_star_alleles),
            hl.cond(hl.len(non_star_alleles) > 1, "multi-snv", "snv"),
            hl.cond(
                hl.all(lambda a: hl.is_indel(ref, a), non_star_alleles),
                hl.cond(hl.len(non_star_alleles) > 1, "multi-indel", "indel"),
                "mixed")),
        n_alt_alleles=hl.len(non_star_alleles))
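A small runnable example (assumes the function above is in scope):

import hail as hl

alleles = hl.literal(['A', 'T', 'C'])  # reference followed by two SNV alternates
print(hl.eval(add_variant_type(alleles)))
# Struct(variant_type='multi-snv', n_alt_alleles=2)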
def annotate_fields(mt, gencode_release, gencode_path):
    genotypes = hl.agg.collect(
        hl.struct(sample_id=mt.s,
                  gq=mt.GQ,
                  cn=mt.RD_CN,
                  num_alt=hl.if_else(hl.is_defined(mt.GT),
                                     mt.GT.n_alt_alleles(), -1)))
    rows = mt.annotate_rows(genotypes=genotypes).rows()

    rows = rows.annotate(**{k: v(rows) for k, v in CORE_FIELDS.items()})

    gene_id_mapping = hl.literal(
        load_gencode(gencode_release, download_path=gencode_path))

    rows = rows.annotate(
        sortedTranscriptConsequences=hl.flatmap(
            lambda x: x,
            hl.filter(lambda x: hl.is_defined(x), [
                rows.info[col].map(lambda gene: hl.struct(
                    gene_symbol=gene,
                    gene_id=gene_id_mapping[gene],
                    predicted_consequence=col.split('__')[-1]))
                for col in [
                    gene_col for gene_col in rows.info
                    if gene_col.startswith('PROTEIN_CODING__')
                    and rows.info[gene_col].dtype == hl.dtype('array<str>')
                ]
            ])),
        sv_type=rows.alleles[1].replace('[<>]', '').split(':', 2),
    )

    DERIVED_FIELDS.update({
        'filters':
        lambda rows: hl.if_else(hl.len(rows.filters) > 0, rows.filters,
                                hl.missing(hl.dtype('array<str>')))
    })
    rows = rows.annotate(**{k: v(rows) for k, v in DERIVED_FIELDS.items()})

    rows = rows.rename({'rsid': 'variantId'})
    return rows.key_by().select(*FIELDS)
def add_popmax_expr(freq: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
    """
    Calculates popmax (add an additional entry into freq with popmax: pop)

    :param ArrayExpression freq: ArrayExpression of Structs with ['ac', 'an', 'hom', 'meta']
    :return: Frequency data with annotated popmax
    :rtype: ArrayExpression
    """
    freq_filtered = hl.filter(
        lambda x: (x.meta.keys() == ['population']) &
        (x.meta['population'] != 'oth'), freq)
    sorted_freqs = hl.sorted(freq_filtered,
                             key=lambda x: x.ac / x.an,
                             reverse=True)
    return hl.cond(
        hl.len(sorted_freqs) > 0,
        freq.append(
            hl.struct(ac=sorted_freqs[0].ac,
                      an=sorted_freqs[0].an,
                      hom=sorted_freqs[0].hom,
                      meta={'popmax': sorted_freqs[0].meta['population']})),
        freq)
def apply_filter_flags_expr(
    mt: hl.MatrixTable, data_type: str, metric_thresholds: dict
) -> hl.expr.SetExpression:
    """
    Annotates table with flags for elevated contamination and chimera as well as
    low coverage and call rate

    :param MatrixTable mt: input MatrixTable
    :param str data_type: 'WES' or 'WGS' for selecting coverage threshold
    :param dict metric_thresholds: dictionary where key is metric and value is threshold value
    :return: Set of sequencing metric flags
    :rtype: SetExpression
    """
    flags = {
        "callrate": mt.filtered_callrate < metric_thresholds["callrate_thres"],
        "contamination": mt.PCT_CONTAMINATION > metric_thresholds[
            "contam_thres"
        ],  # TODO: revisit current thresholds and rename once we have Kristen's script output
        "chimera": mt.AL_PCT_CHIMERAS > metric_thresholds["chimera_thres"],
    }
    if data_type == "WES":
        flags.update(
            {
                "coverage": mt.HS_PCT_TARGET_BASES_20X
                < metric_thresholds["wes_cov_thres"]
            }
        )
    else:
        flags.update(
            {"coverage": mt.WGS_MEAN_COVERAGE < metric_thresholds["wgs_cov_thres"]}
        )

    return hl.set(
        hl.filter(
            lambda x: hl.is_defined(x),
            [hl.or_missing(filter_expr, name) for name, filter_expr in flags.items()],
        )
    )
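A usage sketch; the threshold values below are placeholders showing the expected dict shape, not this project's canonical cut-offs:

metric_thresholds = {
    'callrate_thres': 0.85,  # illustrative values only
    'contam_thres': 0.05,
    'chimera_thres': 0.05,
    'wes_cov_thres': 0.85,   # fraction of target bases at 20x
    'wgs_cov_thres': 15,     # mean genome coverage
}
mt = mt.annotate_cols(
    filter_flags=apply_filter_flags_expr(mt, 'WES', metric_thresholds))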
def main(args):
    # Initializing Hail on cluster mode
    init_hail_on_cluster(tmp_dir=HAIL_TMP_DIR,
                         log_file=HAIL_LOG_PATH,
                         local_mode=True)

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix

    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if fields exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD': (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })
    if args.mpc:
        csq_group_rules.update(
            {'MPC': (mt.csq_type == 'PAV') & (mt.mpc >= args.mpc_threshold)})

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # 2- Annotate gene set information

    # Import/parsing gene cluster table
    clusters = hl.import_table(args.gene_set_path, no_header=True)

    # parsing gene set column
    clusters = (clusters.transmute(genes=hl.set(clusters['f1'].split(
        delim='[|]'))))
    clusters = (clusters.explode(clusters.genes))
    clusters = (clusters.group_by('genes').partition_hint(100).aggregate(
        cluster_name=hl.agg.collect_as_set(clusters['f0'])).key_by('genes'))

    # annotate gene set info
    mt_grouped = (mt_grouped.annotate_rows(
        cluster_name=clusters[mt_grouped.symbol].cluster_name))

    # 3- Aggregate per gene set and consequences

    # Group mt by gene set/csq_group.
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_name))
    mt_grouped = (mt_grouped.group_rows_by(
        mt_grouped.cluster_name,
        mt_grouped.csq_group).partition_hint(100).aggregate(
            n_het=hl.agg.sum(mt_grouped.n_het)))

    # force to eval all aggregation operation by writing mt to disk
    mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)

        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        # TODO: implement Fisher Exact-based burden gene set test
        pass

    hl.stop()
def main(args):
    # Initialize Hail
    hl.init()

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix

    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if fields exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD': (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # force to eval all aggregation operation by writing mt to disk
    # mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)

        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        # TODO: implement gene-based Fisher Exact burden test
        pass

    hl.stop()
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds, part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt', overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt', overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()
        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external maf to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt', overwrite=True)

    # 4. ##### Burden Test ######
    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2)
        | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())

    mts = []

    if args.homs:
        # select homs genotypes.
        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Join MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                                    hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')

        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'

        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'
    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
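`compute_fisher_exact` is defined elsewhere in this codebase. A minimal sketch of what a helper with this signature might look like, built on `hl.fisher_exact_test` (which expects int32 counts); the 2x2 layout and the meaning of `correct_total_counts` are inferred from the call site, so treat this as illustrative:

import hail as hl

def compute_fisher_exact(tb, n_cases_col, n_control_col, total_cases_col,
                         total_controls_col, correct_total_counts,
                         root_col_name, extra_fields):
    if correct_total_counts:
        # subtract carriers so the four cells of the 2x2 table are disjoint
        non_case = tb[total_cases_col] - tb[n_cases_col]
        non_ctrl = tb[total_controls_col] - tb[n_control_col]
    else:
        non_case = tb[total_cases_col]
        non_ctrl = tb[total_controls_col]
    fet = hl.fisher_exact_test(hl.int32(tb[n_cases_col]), hl.int32(non_case),
                               hl.int32(tb[n_control_col]), hl.int32(non_ctrl))
    return tb.annotate(**{root_col_name: fet}, **extra_fields)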
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x),
                           -hl.abs(mt.META_BETA / mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(
        META_Q=hl.map(
            lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
            mt.META_BETA),
        variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                              mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N,
                           mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) /
        mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops',
        'AF_Allele2', 'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })

    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
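`all_and_leave_one_out` is defined elsewhere; from the call sites above (default aggregation, plus the custom `all_f`/`loo_f` overrides used for `pop`), a minimal reconstruction of its semantics might look like the sketch below. Treat it as illustrative, not the canonical definition:

import hail as hl

def all_and_leave_one_out(x, pop_array, all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """Return [f(all pops), f(all but pop 0), f(all but pop 1), ...]."""
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    # missing if no population contributed a defined value
    return hl.or_missing(hl.any(lambda e: hl.is_defined(e), x), arr)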
def merge_alleles(alleles) -> ArrayExpression:
    # alleles is tarray(tarray(tstruct(ref=tstr, alt=tstr)))
    return hl.rbind(
        hl.array(hl.set(hl.flatten(alleles))),
        lambda arr: hl.filter(lambda a: a.alt != '<NON_REF>', arr)
                      .extend(hl.filter(lambda a: a.alt == '<NON_REF>', arr)))
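A small eval example (the ordering within the non-`<NON_REF>` prefix comes from a set conversion and is not guaranteed, so only the tail position is asserted):

import hail as hl

alleles = hl.literal(
    [[hl.Struct(ref='A', alt='T'), hl.Struct(ref='A', alt='<NON_REF>')],
     [hl.Struct(ref='A', alt='C'), hl.Struct(ref='A', alt='<NON_REF>')]])
result = hl.eval(merge_alleles(alleles))
# three distinct alleles survive deduplication; '<NON_REF>' is moved to the end
assert len(result) == 3 and result[-1].alt == '<NON_REF>'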
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])

    Run the method on a table with summary statistics for a single phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                   ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants (rows) by
       phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields ``locus`` of
       type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field of type
       :py:data:`.tstr` that uniquely identifies phenotypes represented in
       the matrix table. The column key must be a single expression;
       compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field (not a list
       of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field (not a
       list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test
       statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of the
          standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of the
          standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with
    missing values in either field are removed prior to fitting the LD
    score regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used to derive variant
        weights in the model.
    ld_score_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used as covariates in
        the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more row-indexed (if table) or entry-indexed (if matrix
        table) expressions for chi-squared statistics resulting from
        genome-wide association studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`
        One or more row-indexed (if table) or entry-indexed (if matrix
        table) expressions indicating the number of samples used in the
        studies that generated the test statistics supplied to
        ``chi_sq_exprs``.
    n_blocks : :obj:`int`
        The number of blocks used in the jackknife approach to estimating
        standard errors.
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit
        the model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability
        :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability
        estimates for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs))
            or (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds['locus'],
                                       '__alleles': ds['alleles'],
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds['__w_initial']})

        ds = ds.filter_rows(hl.is_defined(ds['__locus'])
                            & hl.is_defined(ds['__alleles'])
                            & hl.is_defined(ds['__w_initial'])
                            & hl.is_defined(ds['__x']))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y,
                    ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n,
                    ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds['locus'],
                                 '__alleles': ds['alleles'],
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds['__locus'], ds['__alleles'])

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds['__w_initial'],
                            '__w_initial_floor': hl.max(ds['__w_initial'],
                                                        1.0),
                            '__x': ds['__x'],
                            '__x_floor': hl.max(ds['__x'], 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x', '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds['__locus'])
                            & hl.is_defined(ds['__alleles'])
                            & hl.is_defined(ds['__w_initial'])
                            & hl.is_defined(ds['__x']))

    ds_tmp_file = new_temp_file()
    ds.write(ds_tmp_file)
    mt = hl.read_matrix_table(ds_tmp_file)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt['__y'])
                                         & (mt['__y'] < two_step_threshold)),
                             __in_step2=hl.is_defined(mt['__y']))
    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt['__in_step1']),
                          __m_step2=hl.agg.count_where(mt['__in_step2']))

    mt = mt.annotate_rows(
        __step1_array=hl.agg.collect(
            hl.struct(__col_idx=mt['__col_idx'],
                      __in_step1=mt['__in_step1'])),
        __step2_array=hl.agg.collect(
            hl.struct(__col_idx=mt['__col_idx'],
                      __in_step2=mt['__in_step2'])))
    mt = mt.annotate_rows(
        __step1_idx_array=[
            hl.struct(
                __col_idx=mt['__step1_array'][i]['__col_idx'],
                __step1_idx=hl.scan.count_where(
                    mt['__step1_array'][i]['__in_step1']))
            for i in range(n_phenotypes)],
        __step2_idx_array=[
            hl.struct(
                __col_idx=mt['__step2_array'][i]['__col_idx'],
                __step2_idx=hl.scan.count_where(
                    mt['__step2_array'][i]['__in_step2']))
            for i in range(n_phenotypes)])

    mt = mt.annotate_entries(
        __step1_idx=hl.filter(lambda x: x['__col_idx'] == mt['__col_idx'],
                              mt['__step1_idx_array'])[0]['__step1_idx'],
        __step2_idx=hl.filter(lambda x: x['__col_idx'] == mt['__col_idx'],
                              mt['__step2_idx_array'])[0]['__step2_idx'])

    mt = mt.annotate_cols(__step2_maplist=hl.sorted(
        hl.agg.filter(mt['__in_step1'], hl.agg.collect(mt['__step2_idx']))))

    mt_tmp_file = new_temp_file()
    mt.write(mt_tmp_file)
    mt = hl.read_matrix_table(mt_tmp_file)

    step1_dict = {x['__col_idx']: x['__m_step1']
                  for x in mt.cols().collect()}
    step2_dict = {x['__col_idx']: (x['__m_step2'], x['__step2_maplist'])
                  for x in mt.cols().collect()}

    step1_separators = {}
    for k, v in step1_dict.items():
        s = np.floor(np.linspace(0, v, n_blocks + 1))
        step1_separators[k] = [int(x) for x in s]

    step2_separators = {}
    for k, v in step2_dict.items():
        s = [0]
        s.extend([v[1][x] for x in step1_separators[k][1:-1]])
        s.append(v[0])
        step2_separators[k] = [int(x) for x in s]

    mt = mt.annotate_cols(
        __step1_separators=hl.literal(step1_separators)[mt['__col_idx']],
        __step2_separators=hl.literal(step2_separators)[mt['__col_idx']])

    mt = mt.annotate_entries(
        __step1_block=hl.sum(hl.map(lambda x: mt['__step1_idx'] >= x,
                                    mt['__step1_separators'])) - 1,
        __step2_block=hl.sum(hl.map(lambda x: mt['__step2_idx'] >= x,
                                    mt['__step2_separators'])) - 1)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt['__y']) - 1.0) / hl.agg.mean(mt['__x'])])
    mt = mt.annotate_cols(__step1_betas=mt['__initial_betas'],
                          __step2_betas=mt['__initial_betas'])

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt['__in_step1'],
            1.0/(mt['__w_initial_floor'] * 2.0
                 * (mt['__step1_betas'][0]
                    + mt['__step1_betas'][1] * mt['__x_floor'])**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt['__in_step1'],
            hl.agg.linreg(y=mt['__y'],
                          x=[1.0, mt['__x']],
                          weight=mt['__w']).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt['__step1_betas'][1] * M / hl.agg.mean(mt['__n']), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt['__step1_betas'][0],
            mt['__step1_h2'] * hl.agg.mean(mt['__n']) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt['__step1_block'] != i) & mt['__in_step1'],
                      hl.agg.linreg(y=mt['__y'],
                                    x=[1.0, mt['__x']],
                                    weight=mt['__w']).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt['__step1_betas'] - (n_blocks - 1) * x,
        mt['__step1_block_betas']))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt['__step1_block_betas_bias_corrected'])),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt['__step1_block_betas_bias_corrected']))
                       - hl.sum(
                hl.map(lambda x: x[i],
                       mt['__step1_block_betas_bias_corrected']))**2
                       / n_blocks)
            / (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt['__in_step2'],
            1.0/(mt['__w_initial_floor'] * 2.0
                 * (mt['__step2_betas'][0]
                    + mt['__step2_betas'][1] * mt['__x_floor'])**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt['__step1_betas'][0],
            hl.agg.filter(mt['__in_step2'],
                          hl.agg.linreg(y=mt['__y'] - mt['__step1_betas'][0],
                                        x=[mt['__x']],
                                        weight=mt['__w']).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt['__step2_betas'][1] * M/hl.agg.mean(mt['__n']), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt['__step1_betas'][0],
            mt['__step2_h2'] * hl.agg.mean(mt['__n'])/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt['__step2_block'] != i) & mt['__in_step2'],
                      hl.agg.linreg(y=mt['__y'] - mt['__step1_betas'][0],
                                    x=[mt['__x']],
                                    weight=mt['__w']).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt['__step2_betas'][1] - (n_blocks - 1) * x,
        mt['__step2_block_betas']))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt['__step2_block_betas_bias_corrected']),
        __step2_jackknife_variance=(
            hl.sum(mt['__step2_block_betas_bias_corrected']**2)
            - hl.sum(mt['__step2_block_betas_bias_corrected'])**2
            / n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt['__w_initial_floor'] * 2.0
                               * (mt['__initial_betas'][0]
                                  + mt['__initial_betas'][1]
                                  * mt['__x_floor'])**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt['__step1_betas'][0],
            mt['__step2_betas'][1]],
        __c=(hl.agg.sum(mt['__step2_initial_w'] * mt['__x'])
             / hl.agg.sum(mt['__step2_initial_w'] * mt['__x']**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt['__step2_block_betas'][i] - mt['__c']
                   * (mt['__step1_block_betas'][i][0]
                      - mt['__final_betas'][0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(
            n_blocks * mt['__final_betas'][1]
            - (n_blocks - 1) * mt['__final_block_betas']))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt['__step1_jackknife_mean'][0],
            hl.mean(mt['__final_block_betas_bias_corrected'])],
        __final_jackknife_variance=[
            mt['__step1_jackknife_variance'][0],
            (hl.sum(mt['__final_block_betas_bias_corrected']**2)
             - hl.sum(mt['__final_block_betas_bias_corrected'])**2
             / n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt['__y_name'],
        mean_chi_sq=hl.agg.mean(mt['__y']),
        intercept=hl.struct(
            estimate=mt['__final_betas'][0],
            standard_error=hl.sqrt(mt['__final_jackknife_variance'][0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt['__n'])) * mt['__final_betas'][1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt['__n']))**2
                                   * mt['__final_jackknife_variance'][1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht['phenotype'])
    ht = ht.select(ht['mean_chi_sq'],
                   ht['intercept'],
                   ht['snp_heritability'])

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
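For intuition, the delete-one-block jackknife variance used repeatedly above can be restated in plain numpy; this is an illustrative re-statement of the same formula, not code from this module:

import numpy as np

def jackknife_se(estimate, block_estimates):
    # bias-corrected pseudovalues: n * full-data estimate - (n - 1) * delete-one estimate
    n = len(block_estimates)
    pseudovalues = n * estimate - (n - 1) * np.asarray(block_estimates)
    # matches (sum(x^2) - sum(x)^2 / n) / (n - 1) / n in the Hail expressions above
    return np.sqrt(np.var(pseudovalues, ddof=1) / n)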
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds, part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt', overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt', overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:
        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()
        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt', overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######
    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group and gene clusters before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample per gene/variant_type (binary) matrix aggregating genotypes as follow:
    #
    # a) entry: hets
    # b) entry: homs
    # c) entry: chets (compound hets)
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested csq_group before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # 2. Second-step aggregation
    # Generate a sample per gene-sets/variant type matrix aggregating genotypes as follow:
    # if dominant -> sum hets (default)
    # if recessive -> sum (homs)
    # if recessive (a) -> sum (chets)
    # if recessive (b) -> sum (chets and/or homs)

    mts = []

    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(
                    hl.agg.count_where(mt_grouped.chets
                                       | mt_grouped.homs))).repartition(100).
                         persist().annotate_rows(agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Join MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add samples annotations

    # annotate sample covs
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        mt_tmp = mt_joint
        if proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        if proband == 'nonsyndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })
        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
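The `logistic_regression` helper called above is defined elsewhere in this codebase. A minimal sketch of what a helper with this signature might look like, built on `hl.logistic_regression_rows` (Wald test); the field handling is inferred from the call sites, so treat it as illustrative rather than the project's actual implementation:

import hail as hl

def logistic_regression(mt, x_expr, response, covs, pass_through,
                        extra_fields):
    # hl.logistic_regression_rows expects a numeric 0/1 response, so cast
    # the boolean case/control flag to float
    tb = hl.logistic_regression_rows(
        test='wald',
        y=hl.float(mt[response]),
        x=mt[x_expr],
        covariates=[1.0] + [mt[c] for c in covs],  # 1.0 is the intercept term
        pass_through=pass_through)
    # attach run metadata (analysis label, maf cut-off, covariate list, ...)
    return tb.annotate(**extra_fields)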