def compute_sample_qc(mt: hl.MatrixTable) -> hl.Table:
    """
    Perform sample QC on the raw split matrix table using `compute_stratified_sample_qc`.

    :param mt: Raw split MatrixTable
    :return: Table containing sample QC metrics
    :rtype: hl.Table
    """
    logger.info("Computing sample QC")

    # mt = mt.select_entries("GT")

    # Remove low-confidence regions (LCR, segmental duplications) in case they were included
    mt = filter_low_conf_regions(
        mt,
        filter_lcr=True,  # TODO: include also decoy and low coverage exome regions
        filter_segdup=True)

    # filter to autosomes
    mt = filter_to_autosomes(mt)

    # filter telomeres/centromeres
    mt = remove_telomeres_centromes(mt)

    # filter coding variants
    # mt = filter_cds_regions(mt)

    sample_qc_ht = compute_stratified_sample_qc(
        mt,
        strata={
            "bi_allelic": bi_allelic_expr(mt),
            "multi_allelic": ~bi_allelic_expr(mt),
        },
        tmp_ht_prefix=None,
        gt_expr=None)

    # Remove annotations that cannot be computed from the sparse format
    # sample_qc_ht = sample_qc_ht.annotate(
    #     **{
    #         x: sample_qc_ht[x].drop(
    #             "n_called", "n_not_called", "n_filtered", "call_rate"
    #         )
    #         for x in sample_qc_ht.row_value
    #     }
    # )

    return sample_qc_ht.repartition(100)
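# Hedged usage sketch (not part of the pipeline above): it assumes a raw, split
# MatrixTable is already on disk and that `hl` and `compute_sample_qc` are in
# scope. `raw_mt_path` and `out_ht_path` are illustrative placeholders, not
# paths from this repository.
def example_compute_sample_qc(raw_mt_path: str, out_ht_path: str) -> None:
    """Run sample QC on a raw split MT and persist the resulting metrics HT."""
    mt = hl.read_matrix_table(raw_mt_path)
    sample_qc_ht = compute_sample_qc(mt)
    sample_qc_ht.write(out_ht_path, overwrite=True)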
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Computes a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. This table has to be keyed by an interval.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            f"Call rate matrix computation expects `intervals_ht` with a key of type Interval. "
            f"Found: {intervals_ht.key}")

    callrate_mt = filter_to_autosomes(mt) if autosomes_only else mt

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(
        _interval_key=intervals_ht.index(
            callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(
        interval_info=hl.struct(**intervals_ht[callrate_mt.row_key]))
    return callrate_mt
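# Hedged usage sketch: `capture_bed_path` is an illustrative placeholder for a
# capture-kit BED file. hl.import_bed returns a Table keyed by `interval`,
# which matches the key type `compute_callrate_mt` expects; the resulting
# callrate MT can then feed downstream platform imputation.
def example_capture_callrate(mt: hl.MatrixTable,
                             capture_bed_path: str,
                             reference_genome: str = 'GRCh38') -> hl.MatrixTable:
    """Build the sample-by-interval callrate MT for one capture kit."""
    intervals_ht = hl.import_bed(capture_bed_path,
                                 reference_genome=reference_genome)
    return compute_callrate_mt(mt, intervals_ht)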
def main(args):
    # nfs_dir = 'file:///home/ubuntu/data'

    hl.init(default_reference=args.default_reference)

    logger.info("Importing data...")

    # import unfiltered MT
    mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # keep bi-allelic variants
    mt = (mt
          .filter_rows(bi_allelic_expr(mt), keep=True)
          )

    # read intervals for filtering variants (used mainly for exomes)
    def _get_interval_table(interval: str) -> Union[None, hl.Table]:
        return get_capture_interval_ht(
            name=interval,
            reference=args.default_reference) if interval is not None else interval

    ht = compute_mean_coverage(
        mt=mt,
        normalization_contig=args.normalization_contig,
        included_calling_intervals=_get_interval_table(args.interval_to_include),
        excluded_calling_intervals=_get_interval_table(args.interval_to_exclude),
        chr_x=args.chr_x,
        chr_y=args.chr_y)

    logger.info("Exporting data...")

    # write HT
    output_ht_path = get_sample_qc_ht_path(part='sex_chrom_coverage')
    ht.write(output=output_ht_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht
         .export(f'{output_ht_path}.tsv.bgz')
         )

    hl.stop()

    print("Done!")
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1kgenome and keep only locus defined in interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99))
            .naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
            overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
            overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table.write(output=output_ht_path)

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("Done!")
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external maf to disk...')
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Burden Test ######
    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))
    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2)
        | (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())

    mts = []

    if args.homs:
        # select homs genotypes.
        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                                    hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')

        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'

        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'
    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
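# Hedged sketch of what the `compute_fisher_exact` helper used above could do;
# the helper itself is defined elsewhere in the repository. This version builds
# the per-gene 2x2 test with Hail's built-in hl.fisher_exact_test. Parameter
# names mirror the call above, but the `correct_total_counts` adjustment and
# `extra_fields` handling are not reproduced here.
def compute_fisher_exact_sketch(tb: hl.Table,
                                n_cases_col: str,
                                n_control_col: str,
                                total_cases_col: str,
                                total_controls_col: str,
                                root_col_name: str = 'fet') -> hl.Table:
    """Annotate `tb` with a Fisher exact test on carrier vs non-carrier counts."""
    fet = hl.fisher_exact_test(
        hl.int32(tb[n_cases_col]),
        hl.int32(tb[total_cases_col] - tb[n_cases_col]),
        hl.int32(tb[n_control_col]),
        hl.int32(tb[total_controls_col] - tb[n_control_col]))
    # hl.fisher_exact_test returns a struct with p_value, odds_ratio and 95% CI bounds
    return tb.annotate(**{root_col_name: fet})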
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters))
        mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key])))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        mt = (mt.filter_rows(
            bi_allelic_expr(mt)
            & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(
                500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
            overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key])))

        logger.info("Writing filtered MT with ld-pruned variants...")
        (mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                 part='high_callrate_common_snp_biallelic',
                                 split=True,
                                 ld_pruned=True),
                  overwrite=args.overwrite))

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues}))

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = args.output_ht
    pca_table = (pca_table.checkpoint(output=output_ht_path,
                                      overwrite=args.overwrite))

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("PCA pipeline finalised...")
def main(args):
    ## Init Hail
    hl.init(default_reference=args.default_ref_genome)

    ## Import unfiltered MT with adjusted genotypes
    ds = args.exome_cohort
    mt = hl.read_matrix_table(get_qc_mt_path(dataset=ds,
                                             part='unphase_adj_genotypes',
                                             split=True))

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt
          .annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                         Consequence=vep_ht[mt.row_key].vep.Consequence,
                         DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                         SYMBOL=vep_ht[mt.row_key].vep.SYMBOL)
          )

    ## Parse geneset
    geneset = parse_geneset(args.geneset_file)

    ## Filter to geneset
    mt = (mt
          .filter_rows(hl.set(geneset).contains(mt.SYMBOL))
          .checkpoint(f'{nfs_tmp}/tmp.mt', overwrite=True)
          )

    ## Sample-QC filtering
    if args.apply_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info('Writing sample qc-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                          overwrite=True)
              )

    ## Variant-QC filtering
    if args.apply_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                          overwrite=True)
              )

    ## Filtering by AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if args.apply_af_filtering:
        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt
              .annotate_rows(**af_ht[mt.row_key])
              )

        filter_expressions = [af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
                              ]

        mt = (mt
              .filter_rows(functools.reduce(operator.iand, filter_expressions),
                           keep=True)
              )

        logger.info('Writing AF-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                          overwrite=True)
              )

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Generate blind sample IDs
    mt = mt.add_col_index()
    mt = (mt
          .annotate_cols(BIID=hl.str('BLIND_ID_') + hl.str(mt.col_idx))
          )

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt
          .annotate_cols(**tb_sample[mt.s])
          )
    mt = (mt
          .filter_cols(mt['phe.is_case'] | mt['phe.is_control'])
          )

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Annotate variants ID
    mt = annotate_variant_id(mt)

    # annotate samples
    ann_expr = {
        'n_het_cases': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'],
                                     hl.agg.count()),
        'n_hom_cases': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'],
                                     hl.agg.count()),
        'n_het_syndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_syndromic'],
                                         hl.agg.count()),
        'n_hom_syndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_syndromic'],
                                         hl.agg.count()),
        'n_het_nonsyndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_nonsyndromic'],
                                            hl.agg.count()),
        'n_hom_nonsyndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_nonsyndromic'],
                                            hl.agg.count()),
        'n_het_controls': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'],
                                        hl.agg.count()),
        'n_hom_controls': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'],
                                        hl.agg.count()),
        'het_case_ids': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'],
                                      hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
        'hom_case_ids': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'],
                                      hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
        'het_control_ids': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'],
                                         hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
        'hom_control_ids': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'],
                                         hl.delimit(hl.agg.collect_as_set(mt.BIID), '|'))
    }

    ht = (mt
          .annotate_rows(**ann_expr)
          .rows()
          .key_by()
          .select(*list(['vid', 'Consequence', 'SYMBOL', 'internal_af',
                         'gnomAD_AF', 'vep.MVP_score', 'vep.REVEL_score',
                         'vep.MPC_score', 'vep.CADD_PHRED']) + list(ann_expr.keys()))
          )

    # export results
    (ht
     .export(args.output_file)
     )
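# Hedged sketch of the `af_filter_expr` helper used in the AF-filtering steps
# above; the real helper is defined elsewhere in the repository. The assumption
# made here is that variants with a missing AF annotation are kept (treated as
# rare), and all others must be at or below `af_cutoff`.
def af_filter_expr_sketch(mt: hl.MatrixTable,
                          af_field: str,
                          af_cutoff: float) -> hl.expr.BooleanExpression:
    """Keep variants whose AF annotation is missing or at most `af_cutoff`."""
    return hl.or_else(hl.float64(mt[af_field]), 0.0) <= af_cutoff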
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######
    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()
    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and grouping later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))
    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group and gene clusters before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample per gene/variant_type (binary) matrix aggregating genotypes as follows:
    #
    # a) entry: hets
    # b) entry: homs
    # c) entry: chets (compound hets)
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested cluster_id before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # 2. Second-step aggregation
    # Generate a sample per gene-sets/variant type matrix aggregating genotypes as follows:
    # if dominant -> sum hets (default)
    # if recessive -> sum (homs)
    # if recessive (a) -> sum (chets)
    # if recessive (b) -> sum (chets and/or homs)
    mts = []

    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))
        mts.append(mt_homs)

    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))
        mts.append(mt_chets)

    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.count_where(
                    mt_grouped.chets | mt_grouped.homs))).repartition(
                        100).persist().annotate_rows(agg_genotype='homs_chets'))
        mts.append(mt_homs_chets)

    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))
        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add samples annotations

    # annotate sample covs
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']
    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        mt_tmp = hl.MatrixTable

        if proband == 'all_cases':
            mt_tmp = mt_joint
        if proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        if proband == 'nonsyndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })
        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
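# Hedged sketch of the `logistic_regression` helper called above; the real
# helper lives elsewhere in the repository. The assumption is that it wraps
# hl.logistic_regression_rows with a Wald test, an intercept term, and the
# listed sample covariates, then annotates the run metadata from `extra_fields`.
def logistic_regression_sketch(mt: hl.MatrixTable,
                               x_expr: str,
                               response: str,
                               covs: list,
                               pass_through: list,
                               extra_fields: dict) -> hl.Table:
    """Per-row (gene-set/csq_group) logistic regression of case status on `mac`."""
    tb = hl.logistic_regression_rows(
        test='wald',
        y=mt[response],
        x=mt[x_expr],
        covariates=[1.0] + [mt[c] for c in covs],  # leading 1.0 is the intercept
        pass_through=pass_through)
    return tb.annotate(**extra_fields)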