def generate_interval_list_ht(genome_ref: str = 'GRCh38') -> hl.Table:
    """
    Generate a joint list of intervals (union) from all capture platforms.

    :param genome_ref: Genome reference of the input interval tables
    :return: A joint table (union) of intervals
    """
    intervals = [
        get_ssv2_intervals_ht(),
        get_ssv3_intervals_ht(),
        get_ssv4_intervals_ht(),
        get_ssv5_intervals_ht(),
        get_idt_xgen_intervals_ht()
    ]

    # get global annotation(s) from input tables
    sources = [t.source.collect()[0] for t in intervals]
    platform_labels = [t.platform_label.collect()[0] for t in intervals]
    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), sources, genome_ref, platform_labels)))

    # keep only the interval <key> field for all tables
    intervals = [ht.key_by('interval').select() for ht in intervals]

    ht_interval = (hl.Table.union(*intervals)
                   .select_globals())
    ht_interval = ht_interval.annotate_globals(**global_ann_expr)

    assert ht_interval.key.interval.dtype == hl.dtype(
        f'interval<locus<{genome_ref}>>')

    return ht_interval
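
# A minimal usage sketch: build the joint interval table and inspect it.
# The output path is hypothetical, and the `get_*_intervals_ht` helpers are
# assumed to return tables produced by `import_intervals_from_bed` below.
def _example_generate_interval_list():
    ht = generate_interval_list_ht(genome_ref='GRCh38')
    ht.describe()  # keyed by interval<locus<GRCh38>>; globals carry metadata
    ht.write('/tmp/joint_intervals.ht', overwrite=True)  # hypothetical path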
def import_intervals_from_bed(bed_path: str,
                              platform_label: str,
                              genome_ref: str) -> hl.Table:
    """
    Handle importing BED files as intervals. Recode contigs if necessary and
    annotate global meta-info.

    Note: `platform_label` and `genome_ref` are required, since this
    information is used as global annotations.

    :param bed_path: Path to capture interval BED file
    :param platform_label: Unique capture interval identifier (e.g. 'ssv3')
    :param genome_ref: Either 'GRCh37' or 'GRCh38'
    :return: HailTable keyed by interval
    """
    # genome references
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')

    # contig recoding dict from rg38 -> rg37
    # (autosomal and sex chromosomes only)
    CONTIG_RECODING_HG38_TO_HG37 = {
        contig: contig.replace('chr', '')
        for contig in rg38.contigs[:24]
    }

    # contig recoding dict from rg37 -> rg38
    # (autosomal and sex chromosomes only)
    CONTIG_RECODING_HG37_TO_HG38 = {
        v: k for k, v in CONTIG_RECODING_HG38_TO_HG37.items()
    }

    # Recode contigs if the chromosome field in the BED file mismatches the
    # genome reference.
    if genome_ref == 'GRCh37':
        contig_recoding = CONTIG_RECODING_HG38_TO_HG37
    elif genome_ref == 'GRCh38':
        contig_recoding = CONTIG_RECODING_HG37_TO_HG38
    else:
        contig_recoding = None

    ht_intervals = hl.import_bed(bed_path,
                                 reference_genome=genome_ref,
                                 contig_recoding=contig_recoding)

    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), bed_path, genome_ref, platform_label)))

    ht_intervals = (ht_intervals
                    .annotate_globals(**global_ann_expr)
                    .key_by('interval')
                    .repartition(100))

    return ht_intervals
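
# Usage sketch (hypothetical BED path): importing a GRCh38 capture kit. If
# the BED contigs are in GRCh37 style ('1', '2', ...), they are recoded to
# 'chr1', 'chr2', ... to match the requested reference.
def _example_import_bed():
    ht = import_intervals_from_bed(
        bed_path='/path/to/ssv5_capture.bed',  # hypothetical path
        platform_label='ssv5',
        genome_ref='GRCh38')
    ht.show(5)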
    'gnomAD_FIN_AF'
]

gnomad_exomes_af_expr = {
    f: hl.parse_float(variant_ht.vep[f])
    for f in gnomad_exomes_af_fields
}

# add gnomAD exomes AF expressions to the annotation dict
af_ann_expr.update(gnomad_exomes_af_expr)

## annotate AFs
variant_ht = (variant_ht.annotate(**af_ann_expr))

af_fields = list(af_ann_expr.keys())
variant_ht = (variant_ht.select(*af_fields))

## add global annotations
date = current_date()
global_ann_expr = {'date': date, 'af_fields': af_fields}
variant_ht = (variant_ht.annotate_globals(**global_ann_expr))

## export AF table
# write to Hail table
output_path_ht = f'{nfs_dir}/hail_data/hts/chd_ukbb.variants.af.annotations.external.{date}.ht'
variant_ht = (variant_ht.checkpoint(output_path_ht, overwrite=True))

# write to TSV file
(variant_ht.export(f'{output_path_ht}.tsv.bgz'))
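
# Note on `hl.parse_float` above: unlike `hl.float`, it returns missing (NA)
# for strings it cannot parse, so empty or malformed VEP AF fields become
# missing values rather than raising an error, e.g.:
#
#   hl.eval(hl.parse_float('0.01'))  # ~0.01 (float32)
#   hl.eval(hl.parse_float(''))      # None (missing)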
def liftover_intervals(t: hl.Table,
                       keep_missing_interval: bool = False) -> hl.Table:
    """
    Liftover loci in intervals from one coordinate system (GRCh37) to
    another (GRCh38).

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------

    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted)
           intervals in the output Table.
    :return: Table with intervals lifted over to GRCh38.
    """
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz',
            rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )
    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))
    t = t.key_by()
    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # count non-lifted (missing) intervals; default to 0 if none are missing
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    n_missing = missing.get(True, 0)
    logger.info(
        f"Number of missing intervals: {n_missing} out of {t.count()}...")

    # update global annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {n_missing} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
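
# Usage sketch: lifting a GRCh37 interval table over to GRCh38. The BED path
# is hypothetical; any table keyed by interval<locus<GRCh37>> works.
def _example_liftover_intervals():
    ht37 = hl.import_bed('/path/to/targets.grch37.bed',  # hypothetical path
                         reference_genome='GRCh37')
    ht38 = liftover_intervals(ht37, keep_missing_interval=False)
    ht38.show(5)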
def main(args):

    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # import MT
    mt = hl.read_matrix_table(args.mt_input_path)

    n_variants, n_samples = mt.count()

    # Get the variant table. Basically, a table keyed by <locus> or
    # <locus, alleles> with all variants in the dataset and no extra fields
    # (a.k.a. reference table).
    tb_variants = (mt.select_rows().rows())

    # compute overall coverage
    if args.compute_overall_coverage:
        logger.info(
            f"Computing coverage stats for {n_variants} variants over {n_samples} samples..."
        )
        ht_cov_overall = compute_coverage_stats(mt=mt,
                                                reference_ht=tb_variants)

        tb_variants = (tb_variants.annotate(
            overall=ht_cov_overall[tb_variants.key]))

    # compute coverage stratified by phenotype status (expected binary);
    # forces the input MT to have a case/control bool field (is_case)
    if args.compute_phe_coverage:
        logger.info(
            "Computing coverage stats stratified by phenotype status...")

        # Annotate sample meta info
        # Note: temporary solution, better to import an annotated MT
        mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key]))

        mt = (mt.annotate_cols(
            case_control=hl.if_else(mt[args.phe_field], 'case', 'control')))

        strata = (mt.aggregate_cols(
            hl.agg.collect_as_set(mt['case_control'])))

        dict_strata_ht = {
            s: compute_coverage_stats(
                mt=mt.filter_cols(mt['case_control'] == s),
                reference_ht=tb_variants)
            for s in strata
        }

        for k in dict_strata_ht.keys():
            _tb = dict_strata_ht.get(k)
            tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]})

        if args.run_binomial_test:
            logger.info("Running binomial test...")

            # perform a binomial test on coverage and case/control status
            # DOI: https://doi.org/10.1002/acn3.582
            tb_binomial = (tb_variants.annotate(
                n_cases_over_10=hl.int(tb_variants.case.over_10 * 100),
                n_controls_over_10=hl.int(tb_variants.control.over_10 * 100),
                total_cases=tb_variants.case.n_samples,
                total_controls=tb_variants.control.n_samples,
            ).select('n_cases_over_10', 'n_controls_over_10',
                     'total_cases', 'total_controls'))

            binomial_expr = {
                'p_value':
                hl.binom_test(
                    x=tb_binomial.n_cases_over_10,
                    n=tb_binomial.n_cases_over_10 +
                    tb_binomial.n_controls_over_10,
                    p=tb_binomial.total_cases /
                    (tb_binomial.total_cases + tb_binomial.total_controls),
                    alternative='two.sided')
            }

            tb_binomial = (tb_binomial.annotate(**binomial_expr))

            tb_variants = (tb_variants.annotate(
                binomial_stats=tb_binomial[tb_variants.key]))

    # make coverage filter expressions
    # Note: the default number of reads is set to 10X
    logger.info("Assigning per site coverage filters...")

    significant_level = args.pvalue_threshold
    min_sample_prop = args.min_sample_proportion

    coverage_filter_dict_expr = {}

    if args.compute_overall_coverage:
        coverage_filter_dict_expr.update({
            'overall_hard_cutoff':
            hl.if_else((tb_variants.overall.over_10 >= min_sample_prop),
                       "pass", "fail")
        })
    if args.compute_phe_coverage:
        # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016
        coverage_filter_dict_expr.update({
            'phe_hard_cutoff':
            hl.if_else((tb_variants.case.over_10 >= min_sample_prop) &
                       (tb_variants.control.over_10 >= min_sample_prop),
                       "concordant", "discordant")
        })
    if args.run_binomial_test:
        coverage_filter_dict_expr.update({
            'phe_binomial':
            hl.if_else(
                tb_variants.binomial_stats.p_value < significant_level,
                'dependent', 'independent')
        })

    # annotate coverage filters
    tb_variants = (tb_variants.annotate(
        coverage_filter=hl.struct(**coverage_filter_dict_expr)))

    # add useful global annotations to the final coverage stats HT,
    # as well as affected/non-affected summary counts per filter
    global_ann_dict_expr = {
        'date': current_date(),
        'mt_path': args.mt_input_path,
        'min_sample_prop': min_sample_prop
    }
    if args.compute_overall_coverage:
        global_ann_dict_expr.update({
            'overall_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(
                    tb_variants.coverage_filter.overall_hard_cutoff))
        })
    if args.compute_phe_coverage:
        global_ann_dict_expr.update({
            'phe_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff))
        })
    if args.run_binomial_test:
        global_ann_dict_expr.update({
            'phe_binomial':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_binomial)),
            # this block is already gated by args.run_binomial_test
            'binomial_pvalue_cutoff': significant_level
        })

    tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr))

    # check
    tb_variants.globals.show()
    tb_variants.describe()

    # write HT
    tb_variants = tb_variants.checkpoint(output=args.ht_output_path,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (tb_variants.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
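
# Example invocation (a sketch: the script name and flag spellings are
# inferred from the argparse attributes used above and may differ from the
# actual parser definition):
#
#   python coverage_stats.py \
#       --mt_input_path gs://bucket/cohort.mt \
#       --ht_output_path gs://bucket/coverage_stats.ht \
#       --default_ref_genome GRCh38 \
#       --compute_overall_coverage \
#       --compute_phe_coverage --phe_field phe.is_case \
#       --run_binomial_test --pvalue_threshold 0.001 \
#       --min_sample_proportion 0.9 \
#       --write_to_file --overwrite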
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered MT with rare variants (internal maf 0.01) to disk...'
        )
        # Note: MatrixTable.write returns None, so do not re-assign mt here;
        # the written MT is read back in the next step.
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard-filtered MT to disk
        logger.info(
            'Writing variant qc-filtered MT with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external sources,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external maf to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add case/control sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update the dict of annotation expressions with combinations of
    # variant consequence categories
    score_expr_ann.update({
        'missC':
        (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                 (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                 (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to a set of groups where the
    # group is defined (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group the MT by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2)
        | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())

    mts = []

    if args.homs:
        # select homs genotypes
        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # select compound het (chets) genotypes
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Join MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                                    hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run FET stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')

        col_cases = None
        col_total_cases = None
        col_controls = 'n_controls'
        col_total_controls = 'n_total_controls'

        if proband == 'all_cases':
            col_cases = 'n_cases'
            col_total_cases = 'n_total_cases'
        elif proband == 'syndromic':
            col_cases = 'n_syndromic'
            col_total_cases = 'n_total_syndromic'
        elif proband == 'nonsyndromic':
            col_cases = 'n_nonsyndromic'
            col_total_cases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=col_cases,
                                      n_control_col=col_controls,
                                      total_cases_col=col_total_cases,
                                      total_controls_col=col_total_controls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[col_cases], tb_fet[col_controls]]) > 0,
            keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as a TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
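
# Minimal sketch of the per-gene test `compute_fisher_exact` is assumed to
# perform: a 2x2 contingency of carrier vs. non-carrier counts in cases and
# controls via Hail's built-in Fisher's exact test. Field names follow the
# count table built above; the actual helper may differ.
def _example_fisher_exact(tb: hl.Table) -> hl.Table:
    fet = hl.fisher_exact_test(
        c1=hl.int32(tb.n_cases),                           # case carriers
        c2=hl.int32(tb.n_total_cases - tb.n_cases),        # case non-carriers
        c3=hl.int32(tb.n_controls),                        # control carriers
        c4=hl.int32(tb.n_total_controls - tb.n_controls))  # control non-carriers
    # returns a struct with p_value, odds_ratio and 95% CI bounds
    return tb.annotate(fet=fet)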
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered MT with rare variants (internal maf 0.01) to disk...'
        )
        # Note: MatrixTable.write returns None, so do not re-assign mt here;
        # the written MT is read back in the next step.
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard-filtered MT to disk
        logger.info(
            'Writing variant qc-filtered MT with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external sources,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######

    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update the dict of annotation expressions with combinations of
    # variant consequence categories
    score_expr_ann.update({
        'missC':
        (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                 (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                 (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to a set of groups where the
    # group is defined (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample per gene/variant_type (binary) matrix, aggregating
    # genotypes as follows:
    #
    # a) entry: hets
    # b) entry: homs
    # c) entry: chets (compound hets)
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested cluster_id before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # Second-step aggregation:
    # Generate a sample per gene-set/variant type matrix, aggregating
    # genotypes as follows:
    #
    # if dominant      -> sum hets (default)
    # if recessive     -> sum homs
    # if recessive (a) -> sum chets
    # if recessive (b) -> sum chets and/or homs
    mts = []

    if args.homs:
        # Group the MT by gene-set/csq_group, aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # Group the MT by gene-set/csq_group, aggregating compound het
        # (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # Group the MT by gene-set/csq_group, aggregating chets and/or homs
        # genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(
                    hl.agg.count_where(
                        mt_grouped.chets
                        | mt_grouped.homs))).repartition(100).persist().
                         annotate_rows(agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # Group the MT by gene-set/csq_group, aggregating hets genotypes
        # (default).
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Join MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add sample annotations

    # annotate sample covariates
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        mt_tmp = None

        if proband == 'all_cases':
            mt_tmp = mt_joint
        elif proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        elif proband == 'nonsyndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })

        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as a TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
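
# Minimal sketch of what the `logistic_regression` wrapper is assumed to do:
# a Wald test of case/control status against the aggregated per-row minor
# allele count (mac), adjusting for the given covariates, using Hail's
# hl.logistic_regression_rows. The wrapper's extra bookkeeping (extra_fields,
# output formatting) is omitted here.
def _example_logistic_regression(mt: hl.MatrixTable, covs: list) -> hl.Table:
    return hl.logistic_regression_rows(
        test='wald',
        y=hl.float(mt['phe.is_case']),             # coerce bool to 0/1
        x=mt.mac,
        covariates=[1.0] + [mt[c] for c in covs],  # 1.0 adds the intercept
        pass_through=['agg_genotype'])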