def test_pcrelate_paths():
    """Exercise `hl.pc_relate` through its `k` and `scores_expr` entry points.

    Kinship computed with `k=2` must match kinship computed from the first two
    columns of separately computed PCA scores, an unfiltered run must report
    every unordered pair of distinct samples, and runs with `min_kinship` must
    contain no pair below the requested threshold.
    """
    n_samples = 50
    mt = hl.balding_nichols_model(3, n_samples, 100)
    _, pca_scores, _ = hl.hwe_normalized_pca(mt.GT, k=3, compute_loadings=False)

    kin_from_k = hl.pc_relate(mt.GT, 0.10, k=2, statistics='kin', block_size=64)
    kin_loose = hl.pc_relate(
        mt.GT, 0.05, k=2, min_kinship=0.01, statistics='kin2', block_size=128
    ).cache()
    kin_strict = hl.pc_relate(
        mt.GT, 0.02, k=3, min_kinship=0.1, statistics='kin20', block_size=64
    ).cache()
    first_two_pcs = pca_scores[mt.col_key].scores[:2]
    kin_from_scores = hl.pc_relate(
        mt.GT, 0.10, scores_expr=first_two_pcs, statistics='kin', block_size=32
    )

    assert kin_from_k._same(kin_from_scores, tolerance=1e-4)
    # No min_kinship filter: one row per unordered pair of distinct samples.
    assert kin_from_k.count() == n_samples * (n_samples - 1) / 2

    for filtered, threshold in ((kin_loose, 0.01), (kin_strict, 0.1)):
        assert filtered.count() > 0
        assert filtered.filter(filtered.kin < threshold).count() == 0
def test_pcrelate_issue_5263():
    """Check that `pc_relate` uses the call expression it is handed.

    The original calls are copied to an entry field named `GT2` while `GT` is
    overwritten with random calls; running on `GT2` must reproduce the result
    obtained from the original `GT`.
    """
    original = hl.balding_nichols_model(3, 50, 100)
    expected = hl.pc_relate(original.GT, 0.10, k=2, statistics='all')

    random_call = hl.call(hl.rand_bool(0.5), hl.rand_bool(0.5))
    shuffled = original.select_entries(GT2=original.GT, GT=random_call)
    actual = hl.pc_relate(shuffled.GT2, 0.10, k=2, statistics='all')

    assert expected._same(actual, tolerance=1e-4)
def test_self_kinship():
    """Check the effect of `include_self_kinship` on the pairs `pc_relate` emits.

    With 10 samples, the default output must hold the 45 distinct pairs and no
    self pairs; with `include_self_kinship=True` it must additionally hold the
    10 self pairs, and its non-self rows must equal the default output.
    """

    def is_self_pair(ht):
        # A row is a self pair when both key structs name the same sample.
        return ht.i.sample_idx == ht.j.sample_idx

    mt = hl.balding_nichols_model(3, 10, 50)
    shared = dict(k=2, statistics='kin20', block_size=16)
    with_self = hl.pc_relate(mt.GT, 0.10, include_self_kinship=True, **shared)
    without_self = hl.pc_relate(mt.GT, 0.10, **shared)

    assert with_self.count() == 55
    assert without_self.count() == 45

    self_rows = with_self.filter(is_self_pair(with_self))
    assert self_rows.count() == 10, self_rows.collect()

    non_self_rows = with_self.filter(~is_self_pair(with_self))
    assert non_self_rows.count() == 45, non_self_rows.collect()
    assert non_self_rows._same(without_self)

    stray_self_rows = without_self.filter(is_self_pair(without_self))
    assert stray_self_rows.count() == 0, stray_self_rows.collect()
def query():
    """Query script entry point.

    Runs PC-Relate (kinship only, 10 PCs) on the matrix table at
    `HGDP1KG_TOBWGS`, stores the estimates, selects pairs with kinship >= 0.125,
    and writes the samples chosen for removal by
    `hl.maximal_independent_set(..., False)` as an HTML table.
    """
    hl.init(default_reference='GRCh38')
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)

    # Perform kinship test with pc_relate.
    # `checkpoint` writes the table and hands back the on-disk copy, so the
    # filter below reads stored estimates instead of re-running PC-Relate.
    pc_rel_path = output_path('pc_relate_kinship_estimate.ht')
    pc_rel = hl.pc_relate(mt.GT, 0.01, k=10, statistics='kin')
    pc_rel = pc_rel.checkpoint(pc_rel_path, overwrite=True)

    pairs = pc_rel.filter(pc_rel['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)

    # Collect once: the row count and the HTML report share one evaluation of
    # the independent-set computation.
    removed_individuals = related_samples_to_remove.node.s.collect()
    n_related_samples = len(removed_individuals)
    print(f'related_samples_to_remove.count() = {n_related_samples}')

    # save as html
    html = pd.DataFrame({'removed_individual': removed_individuals}).to_html()
    plot_filename_html = output_path('removed_samples.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)
def test_pc_relate_simple_example():
    """Compare `pc_relate` on a 4-sample, 8-variant toy dataset to stored values.

    Genotypes and two-column scores are supplied as literals indexed by
    `col_idx`; the resulting table must equal the hard-coded expectations.
    """
    genotype_indices = hl.literal(
        [[0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 1, 1, 0, 0, 1, 1],
         [0, 1, 0, 1, 0, 1, 0, 1],
         [0, 0, 1, 1, 0, 0, 1, 1]])
    sample_scores = hl.literal([[0, 1], [1, 1], [1, 0], [0, 0]])

    mt = hl.utils.range_matrix_table(n_rows=8, n_cols=4)
    mt = mt.annotate_entries(
        GT=hl.unphased_diploid_gt_index_call(genotype_indices[mt.col_idx][mt.row_idx]))
    mt = mt.annotate_cols(scores=sample_scores[mt.col_idx])
    pcr = hl.pc_relate(mt.GT, min_individual_maf=0, scores_expr=mt.scores)

    # Four of the six pairs share exactly the same expected statistics.
    shared_stats = dict(kin=-0.14570713364640647,
                        ibd0=1.4823511628401964,
                        ibd1=-0.38187379109476693,
                        ibd2=-0.10047737174542953)
    stats_by_pair = {
        (0, 1): shared_stats,
        (0, 2): dict(kin=0.16530591922102378,
                     ibd0=0.5234783206257841,
                     ibd1=0.2918196818643366,
                     ibd2=0.18470199750987923),
        (0, 3): shared_stats,
        (1, 2): shared_stats,
        (1, 3): dict(kin=0.14285714285714285,
                     ibd0=0.7027734170591313,
                     ibd1=0.02302459445316596,
                     ibd2=0.2742019884877027),
        (2, 3): shared_stats,
    }
    expected = [hl.Struct(i=i, j=j, **stats) for (i, j), stats in stats_by_pair.items()]

    ht_expected = hl.Table.parallelize(expected)
    ht_expected = ht_expected.key_by(i=hl.struct(col_idx=ht_expected.i),
                                     j=hl.struct(col_idx=ht_expected.j))
    assert ht_expected._same(pcr)
def test_pc_relate_against_R_truth():
    """Compare `pc_relate` output with the values in `pc_relate_r_truth.tsv.bgz`.

    Each statistic is compared on its own with its own absolute tolerance.
    """
    genotypes = hl.import_vcf(resource('pc_relate_bn_input.vcf.bgz'))
    hail_kin = hl.pc_relate(genotypes.GT, 0.00, k=2).checkpoint(
        utils.new_temp_file(extension='ht'))

    sample_key_type = 'struct{s:str}'
    column_types = {'i': sample_key_type, 'j': sample_key_type}
    column_types.update({stat: 'float' for stat in ('kin', 'ibd0', 'ibd1', 'ibd2')})
    r_kin = hl.import_table(resource('pc_relate_r_truth.tsv.bgz'),
                            types=column_types,
                            key=['i', 'j'])

    absolute_tolerances = (
        ("kin", 1e-3),
        ("ibd0", 1.3e-2),
        ("ibd1", 2.6e-2),
        ("ibd2", 1.3e-2),
    )
    for stat, tolerance in absolute_tolerances:
        assert r_kin.select(stat)._same(hail_kin.select(stat),
                                        tolerance=tolerance,
                                        absolute=True)
def test_pcrelate(self):
    """Smoke-test `hl.pc_relate` on a dataset whose column key is a string.

    Uses the call-expression signature
    `pc_relate(call_expr, min_individual_maf, *, k=..., statistics=...)`;
    the previous call passed the MatrixTable itself and the retired
    `statistics="phi"` value, for which `'kin'` (kinship only) is the
    replacement.
    """
    dataset = hl.balding_nichols_model(3, 100, 100)
    # `sample_idx` is the column key of the simulated data, so it is re-keyed
    # (not annotated) to turn it into a string.
    dataset = dataset.key_cols_by(sample_idx=hl.str(dataset.sample_idx))
    t = hl.pc_relate(dataset.GT, 0.05, k=2, block_size=64, statistics="kin")
    self.assertIsInstance(t, hl.Table)
    # Force evaluation so failures inside the pipeline surface in the test.
    t.count()
def run_pc_relate(mt: hl.MatrixTable, pca_prefix: str, overwrite: bool = False):
    """
    Runs PC-relate to identify relatives in a matrix table

    Writes three outputs under `pca_prefix`: `relatedness.ht` (kinship pairs
    with kinship >= 0.05), `unrel.mt` (samples not selected for removal) and
    `rel.mt` (samples selected for removal by `hl.maximal_independent_set`).

    :param mt: Matrix table to run PC-relate on
    :param pca_prefix: Prefix to path to output relatedness information
    :param overwrite: if True, overwrites existing data
    :return: None
    """
    relatedness_path = pca_prefix + 'relatedness.ht'
    relatedness_ht = hl.pc_relate(mt.GT, min_individual_maf=0.05, min_kinship=0.05,
                                  statistics='kin', k=20).key_by()
    # Use the `overwrite` parameter; the previous code read a global `args`,
    # silently ignoring what the caller passed.
    relatedness_ht.write(relatedness_path, overwrite)
    relatedness_ht = hl.read_table(relatedness_path)

    # identify individuals in pairs to remove
    related_samples_to_remove = hl.maximal_independent_set(
        relatedness_ht.i, relatedness_ht.j, False)
    flagged_for_removal = hl.is_defined(related_samples_to_remove[mt.col_key])
    mt_unrel = mt.filter_cols(flagged_for_removal, keep=False)
    mt_rel = mt.filter_cols(flagged_for_removal, keep=True)

    mt_unrel.write(pca_prefix + 'unrel.mt', overwrite)
    mt_rel.write(pca_prefix + 'rel.mt', overwrite)
def compute_relatedness(
    data_type: str = "genomes",
    overwrite: bool = False,
) -> hl.Table:
    """
    Compute pairwise relatedness on the CCDG QC VDS with PC-Relate.

    The QC VDS is densified, restricted to the variants in the PCA variants
    table, and used to compute 10 HWE-normalized PCs. The PC scores are
    checkpointed and passed to `hl.pc_relate` (all statistics, pairs with
    kinship below 0.05 dropped).

    :param data_type: Whether data is from genomes or exomes, default is genomes
    :param overwrite: Whether to overwrite existing checkpoints; when False,
        existing checkpoint files are read instead of recomputed
    :return: Checkpointed table produced by `hl.pc_relate`
    :rtype: hl.Table
    """
    logger.info("Computing relatedness table on CCDG %s VDS", data_type)
    pca_var_ht = hl.read_table(get_pca_variants_path())
    mt = hl.vds.to_dense_mt(get_qc_vds(data_type))
    mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key]))

    # Only the scores are needed; eigenvalues and loadings are discarded.
    _, scores, _ = hl.hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
    scores = scores.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="pc_scores"),
        overwrite=overwrite,
        _read_if_exists=not overwrite,
    )

    relatedness_ht = hl.pc_relate(
        mt.GT,
        min_individual_maf=0.01,
        scores_expr=scores[mt.col_key].scores,
        block_size=4096,
        min_kinship=0.05,
        statistics="all",
    )
    return relatedness_ht.checkpoint(
        get_ccdg_results_path(data_type=data_type, result="relatedness"),
        overwrite=overwrite,
        _read_if_exists=(not overwrite),
    )
def pc_relate_5k_5k(mt_path):
    """Force a kinship-only `pc_relate` run on the matrix table at `mt_path`.

    Two uniform-random values per sample stand in for PCA scores.
    """
    dataset = hl.read_matrix_table(mt_path)
    random_scores = hl.range(2).map(lambda x: hl.rand_unif(0, 1))
    dataset = dataset.annotate_cols(scores=random_scores)
    kinship = hl.pc_relate(
        dataset.GT,
        0.05,
        scores_expr=dataset.scores,
        statistics='kin',
        min_kinship=0.05,
    )
    kinship._force_count()
def pc_relate_big():
    """Force a kinship-only `pc_relate` run on a simulated 8192 x 8192 dataset.

    The simulated matrix table is checkpointed to a temporary file first, and
    two uniform-random values per sample stand in for PCA scores.
    """
    dimension = 2 * 4096
    tmp_path = hl.utils.new_temp_file(extension='mt')
    dataset = hl.balding_nichols_model(3, dimension, dimension).checkpoint(tmp_path)
    random_scores = hl.range(2).map(lambda x: hl.rand_unif(0, 1))
    dataset = dataset.annotate_cols(scores=random_scores)
    kinship = hl.pc_relate(
        dataset.GT,
        0.05,
        scores_expr=dataset.scores,
        statistics='kin',
        min_kinship=0.05,
    )
    kinship._force_count()
def main(args):
    """Build the joint v2-exomes / v3-genomes QC matrix table and run PC-Relate.

    With ``args.join_qc_mt`` the two QC matrix tables are re-keyed by
    ``(s, data_type)``, unioned by columns and written to a temporary bucket.
    With ``args.run_pc_relate`` that table is read back, rows are subsampled,
    PCA scores are checkpointed, and the PC-Relate result is written.

    :param args: Parsed arguments; reads ``join_qc_mt``, ``run_pc_relate`` and
        ``overwrite``.
    """
    joint_mt_path = "gs://gnomad-tmp/v2_exomes_v3_joint_qc.mt"

    if args.join_qc_mt:
        exomes_mt = get_liftover_v2_qc_mt('exomes', ld_pruned=True, release_only=True)
        exomes_mt = exomes_mt.key_cols_by(s=exomes_mt.s, data_type="v2_exomes")

        genomes_mt = qc.mt()
        genomes_mt = genomes_mt.filter_cols(meta.ht()[genomes_mt.col_key].release)
        # Drop all row and column annotations before the union.
        genomes_mt = genomes_mt.select_rows().select_cols()
        genomes_mt = genomes_mt.key_cols_by(s=genomes_mt.s, data_type="v3_genomes")

        combined_mt = exomes_mt.union_cols(genomes_mt)
        combined_mt.write(joint_mt_path, overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PC-Relate')
        logger.warning("PC-relate requires SSDs and doesn't work with preemptible workers!")
        combined_mt = hl.read_matrix_table(joint_mt_path)
        combined_mt = combined_mt.sample_rows(0.1)

        _, pc_scores, _ = hl.hwe_normalized_pca(combined_mt.GT, k=10, compute_loadings=False)
        pc_scores = pc_scores.checkpoint(
            v2_v3_pc_relate_pca_scores.path,
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )

        kinship_ht = hl.pc_relate(
            combined_mt.GT,
            min_individual_maf=0.01,
            scores_expr=pc_scores[combined_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.1,
            statistics='all',
        )
        kinship_ht.write(v2_v3_relatedness.path, args.overwrite)
def main(args):
    """Run the joint exome/genome sample-QC pipeline.

    Stages run in order; each flag on ``args`` skips the compute part of a
    stage, in which case the stage's output is read from disk:

    1. Join exome and genome QC matrix tables, filter sites and LD-prune
       (skipped when ``args.load_joint_pruned_qc_mt``).
    2. PCA followed by PC-Relate (skipped when ``args.skip_pc_relate``).
    3. Pedigree inference and ranked selection of related samples to drop
       (skipped when ``args.skip_relatedness``).
    4. Population PCA on the unrelated samples (skipped when
       ``args.skip_pop_pca``), projection of related samples and population
       assignment via ``run_assign_population_pcs``.
    5. Per-sample QC metrics (skipped when
       ``args.skip_calculate_sample_metrics``) and filters stratified by
       ``qc_pop`` / ``qc_platform``, written per data type.

    :param args: Parsed command-line arguments; reads ``overwrite`` and the
        flags named above.
    """
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True), args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        _, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
        scores.write(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht', args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no
        # pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the
        # callset by running KING on the output file below
        # Both replacements act on the sample-ID string expression; the second
        # one used to be chained onto the MatrixTable returned by
        # `annotate_cols`, which has no `replace` method.
        uid = pca_mt.data_type + '_' + pca_mt.s.replace(" ", "_").replace("/", "_")
        plink_mt = pca_mt.annotate_cols(uid=uid).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        _, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    # FIXME: temporarily doing the underscore thing until
    # known_population_annotations is fixed
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_'
                                + joint_ht.s.replace(' ', '_')].known_pop)
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(qc_pop=hl.case(missing_false=True).when(
        hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1'
    ).when(
        hl.is_defined(joint_ht.pop) & (joint_ht.batch == 2), 'est_b2'
    ).default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes', adj=False, split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes', adj=False, split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples
        # are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    # `annotate_globals` only takes keyword arguments, so the evaluated
    # globals struct is unpacked field by field.
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes', s=exome_ht.s).write(
        qc_ht_path('exomes', 'pop_platform'), args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
mt_rows = mt_rows.annotate(var_cr_flag=mt_rows.var_cr_flag_1) mt_rows = mt_rows.drop(mt_rows.var_cr_flag_1) mt_rows = mt_rows.drop(mt_rows.var_cr_flag_2) var_cr_counts = mt_rows.aggregate( hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.var_cr_flag)) maf_counts = mt_rows.aggregate( hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.maf_flag)) hwe_counts = mt_rows.aggregate( hl.agg.array_agg(lambda x: hl.agg.counter(x), mt_rows.hwe_pval_flag)) # Calculates relatedness using pc_relate for all samples in a matrix table # Annotates a column which flags those who failed the relatedness filter as True pc_rel = hl.pc_relate(mt_auto.GT, 0.001, k=10, statistics='kin') pairs = pc_rel.filter(pc_rel['kin'] > 0.125) related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False) mt_auto = mt_auto.annotate_cols( related_filter=hl.is_defined(related_samples_to_remove[mt_auto.col_key])) ''' --- Conducting QC --- QC Steps: snp call rate sample call rate sex violations maf hwe
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    """Drop one sample from every pair whose relatedness exceeds a threshold.

    For each pair above the threshold, the sample with the lower (or equal)
    call rate is removed. The removed sample IDs are written, one per line, to
    ``outdir + 'relatedness_removed_samples.tsv'``.

    :param in_mt: Matrix table to check; `hl.variant_qc` and `hl.sample_qc`
        annotations are added to it.
    :param method: ``'pc_relate'``, ``'ibd'``, or anything else for KING.
    :param outdir: Prefix prepended verbatim to the output file name (no path
        separator is inserted); ``None`` is treated as an empty prefix.
    :param kin_estimate: Threshold applied to ``kin`` (PC-Relate, strictly
        greater), ``ibd.PI_HAT`` (IBD, strictly greater) or ``phi`` (KING,
        greater or equal).
    :return: `in_mt` with the selected samples removed.
    :raises ValueError: if KING is used with ``kin_estimate`` above 0.5.
    """
    # Kept for backward compatibility: `samples_to_remove` is still published
    # as a module-level global, as before.
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(
        hl.dict(hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
        _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT, 0.01, k=10, min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)

        # get call rates for both samples so we remove the one with lower call
        # rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])

        # `i` / `j` are key structs here, so take the `s` field: the code
        # below needs plain sample-ID strings for filtering and file output.
        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i.s, samples_to_remove.j.s))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        # this returns a Hail Table with the sample pairs
        relatedness_ht = hl.identity_by_descent(in_mt, maf=in_mt['maf'])
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        # get call rates for both samples so we remove the one with lower call
        # rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError(
                "\nThe maximum kinship coefficient for KING is 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s)
            & (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    # A sample can be picked from several pairs; keep each ID once (first
    # occurrence order) so the count and the output file list samples, not pairs.
    samples = list(dict.fromkeys(samples_list.sample_to_remove.collect()))

    if samples:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open((outdir or '') + 'relatedness_removed_samples.tsv', 'w') as f:
            f.writelines(sample + "\n" for sample in samples)
    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
def main(args):
    """LD-prune a matrix table, remove related samples, and run population PCA.

    All tables are written under ``args.output_dir``; a PC1/PC2 scatter plot
    is saved under ``args.plot_dir``.

    :param args: Parsed arguments; reads ``matrixtable``, ``output_dir`` and
        ``plot_dir``.
    """
    out = args.output_dir
    input_mt = hl.read_matrix_table(args.matrixtable)

    # --- LD pruning ---
    ld_pruned_variants = hl.ld_prune(input_mt.GT, r2=0.1)
    pruned_mt = input_mt.filter_rows(hl.is_defined(ld_pruned_variants[input_mt.row_key]))
    pruned_mt.write(f"{out}/mt_ldpruned.mt", overwrite=True)

    # --- PC-Relate ---
    unphased_gt = hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles())
    pruned_mt = pruned_mt.select_entries(GT=unphased_gt)
    _, relate_scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False)
    relate_scores.write(f"{out}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(
        pruned_mt.GT,
        min_individual_maf=0.05,
        scores_expr=relate_scores[pruned_mt.col_key].scores,
        block_size=4096,
        min_kinship=0.05,
        statistics='kin2',
    )
    relatedness_ht.write(f"{out}/mt_relatedness.ht", overwrite=True)

    close_pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        close_pairs.i, close_pairs.j, keep=False)
    related_samples_to_remove.write(
        f"{out}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=False)
    related_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=True)

    _, n_unrelated = pca_mt.count()
    print(f"{n_unrelated} samples after relatedness step.")

    # --- Population PCA on the retained samples ---
    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{out}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)

    _, pop_scores, pop_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    allele_freq_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pop_loadings = pop_loadings.annotate(
        pca_af=allele_freq_ht[pop_loadings.key].pca_af)
    pop_scores.write(f"{out}/mt_pca_scores.ht", overwrite=True)
    pop_loadings.write(f"{out}/mt_pca_loadings.ht", overwrite=True)
    pca_mt = pca_mt.annotate_cols(scores=pop_scores[pca_mt.col_key].scores)

    _, n_related = related_mt.count()
    print('Projecting population PCs for {} related samples...'.format(n_related))
    # NOTE: the projection itself (pc_project on related_mt with the loadings)
    # is currently disabled; only the count above is reported.

    pca_mt.write(f"{out}/mt_pca.mt", overwrite=True)

    pc_plot = hl.plot.scatter(pca_mt.scores[0],
                              pca_mt.scores[1],
                              title='PCA',
                              xlabel='PC1',
                              ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(pc_plot)
def main(args):
    """Run the stages of the UKBB PCA workflow selected by flags on ``args``.

    Each ``if args.<flag>`` section below is an independent stage:
    ``load_ref``, ``load_ukbb``, ``intersect_ref``, ``pca_project``,
    ``ukbb_pop_pca``, ``ukb_prune_pca_project`` and ``generate_covariates``.
    Later stages read what earlier stages wrote to disk.

    :param args: Parsed arguments; besides the stage flags, reads ``dirname``,
        ``basename``, ``out_prefix`` and ``overwrite``.
    """
    if args.load_ref:
        load_ref(args.dirname, args.basename)

    if args.load_ukbb:
        samples = hl.read_table(
            'gs://ukb-diverse-pops/pigmentation_phenos_covs_pops.ht')
        ukbb = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')
        ukbb = ukbb.annotate_cols(**samples[ukbb.s])

    # NOTE(review): `ukbb` is only bound in the `load_ukbb` section above, so
    # this stage appears to require that flag as well - confirm.
    if args.intersect_ref:
        intersect_ref(args.dirname, args.basename, ukbb)

    if args.pca_project:
        """ Compute PCA in global reference panel, project UKBB individuals into PCA space """
        ref_in_ukbb = hl.read_matrix_table(args.dirname + 'intersect_' +
                                           args.basename + 'ukbb.mt')
        print('Computing reference PCs')
        run_pca(ref_in_ukbb, args.out_prefix + args.basename + '_ukbb_')

        # project ukbb
        pca_loadings = hl.read_table(
            f'{args.out_prefix}{args.basename}_ukbb_loadings.ht')
        project_mt = hl.read_matrix_table(args.dirname + 'intersect_ukbb_' +
                                          args.basename + '.mt')
        ht = project_individuals(pca_loadings, project_mt)
        ht.export(args.out_prefix + 'ukbb_' + args.basename +
                  '_scores.txt.bgz')

    # if args.continental_pca:
    #     """
    #     Compute PCA within reference panel super pops, project UKBB individuals into PCA space
    #     1. Filter UKBB to individuals in continental population
    #     2. Run PCA on continental ref
    #     3. Project UKBB inds
    #     """
    #     pass

    if args.ukbb_pop_pca:
        """ Compute PCA in each UKBB population (unrelateds), project reference individuals and relateds into PCA space 1. Filter UKBB to individuals in continental population 2. Run PC-relate on these individuals # New 2.5 Filter to pruned set of individuals # 3. Filter UKBB population to unrelated individuals 4. Run PCA on UKBB unrelateds within population 5. 
Project relateds """
        for pop in POPS:
            mt = hl.read_matrix_table(get_ukb_grm_mt_path(pop))
            pruned_ht = hl.read_table(get_ukb_grm_pruned_ht_path(pop))
            mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))

            # run PC-relate
            # The PCA scores are recomputed only when overwriting or when the
            # scores table's _SUCCESS marker is absent.
            if args.overwrite or not hl.hadoop_exists(
                    get_relatedness_path(pop, extension='all_scores.ht/_SUCCESS')):
                _, scores, _ = hl.hwe_normalized_pca(mt.GT,
                                                     k=10,
                                                     compute_loadings=False)
                scores.write(
                    get_relatedness_path(pop, extension='all_scores.ht'),
                    args.overwrite)
            scores = hl.read_table(
                get_relatedness_path(pop, extension='all_scores.ht'))
            mt = mt.annotate_cols(scores=scores[mt.col_key].scores)
            # For EUR, required highmem machines with SSDs (Needed ~6T of hdfs
            # space, so 20 workers + 100 pre-emptibles ran in ~7 hours)
            relatedness_ht = hl.pc_relate(
                mt.GT,
                min_individual_maf=0.05,
                scores_expr=mt.scores,
                min_kinship=0.05,
                statistics='kin',
                block_size=4096 if pop == 'EUR' else 512).key_by()
            relatedness_ht.write(get_relatedness_path(pop, extension='ht'),
                                 args.overwrite)
            relatedness_ht = hl.read_table(
                get_relatedness_path(pop, extension='ht'))

            # identify individuals in pairs to remove
            related_samples_to_remove = hl.maximal_independent_set(
                relatedness_ht.i, relatedness_ht.j, False)
            mt_unrel = mt.filter_cols(hl.is_defined(
                related_samples_to_remove[mt.col_key]), keep=False)
            mt_rel = mt.filter_cols(hl.is_defined(
                related_samples_to_remove[mt.col_key]), keep=True)

            mt_unrel.write(get_relatedness_path(pop, True, 'mt'),
                           args.overwrite)
            mt_rel.write(get_relatedness_path(pop, extension='mt'),
                         args.overwrite)

    if args.ukb_prune_pca_project:
        for pop in POPS:
            mt_unrel = hl.read_matrix_table(
                get_relatedness_path(pop, True, 'mt'))
            mt_rel = hl.read_matrix_table(
                get_relatedness_path(pop, extension='mt'))

            # Removing individuals
            # (only samples present in the pruned-individuals TSV are kept)
            pruned_inds = hl.import_table(get_pruned_tsv_path(), key='s')
            mt_rel = mt_rel.filter_cols(
                hl.is_defined(pruned_inds[mt_rel.col_key]))
            mt_unrel = mt_unrel.filter_cols(
                hl.is_defined(pruned_inds[mt_unrel.col_key]))

            # Removing sites
            window = '1e6' if pop != 'EUR' else '1e7'
            pruned_ht = hl.read_table(get_ukb_grm_pruned_ht_path(pop, window))
            mt_unrel = mt_unrel.filter_rows(
                hl.is_defined(pruned_ht[mt_unrel.row_key]))
            mt_unrel = mt_unrel.repartition(500).checkpoint(
                hl.utils.new_temp_file())

            # From here on `pop` doubles as the output path label: it gains a
            # window suffix whenever the window is not '1e6'.
            pop = pop if window == '1e6' else f'{pop}_{window}'
            run_pca(
                mt_unrel,
                get_relatedness_path(pop, unrelated=True, extension='') + '.',
                args.overwrite)
            pca_loadings = hl.read_table(
                get_relatedness_path(pop,
                                     unrelated=True,
                                     extension='loadings.ht'))
            ht = project_individuals(pca_loadings, mt_rel)
            ht.write(
                get_relatedness_path(pop, extension='scores_projected.ht'),
                args.overwrite)
            hl.read_table(
                get_relatedness_path(
                    pop, extension='scores_projected.ht')).export(
                        get_relatedness_path(
                            pop, extension='scores_projected.txt.bgz'))

    if args.generate_covariates:
        hts = []
        for pop in POPS:
            # EUR outputs were written under the window-suffixed label above.
            pop_path = pop if pop != 'EUR' else f'EUR_1e7'
            ht = hl.read_table(
                get_relatedness_path(pop_path,
                                     extension='scores_projected.ht'))
            hts.append(ht.annotate(pop=pop, related=True))
            ht = hl.read_table(
                get_relatedness_path(pop_path, True, extension='scores.ht'))
            # Spread the first 20 entries of `scores` into PC1..PC20 fields.
            ht = ht.transmute(
                **{f'PC{i}': ht.scores[i - 1] for i in range(1, 21)})
            hts.append(ht.annotate(pop=pop, related=False))

        ht = hts[0].union(*hts[1:])
        cov_ht = hl.import_table(get_age_sex_tsv_path(),
                                 impute=True,
                                 force=True,
                                 quote='"',
                                 key='userId').select('age', 'sex')
        cov_ht = cov_ht.annotate(age_sex=cov_ht.age * cov_ht.sex,
                                 age2=hl.int32(cov_ht.age**2),
                                 age2_sex=hl.int32(cov_ht.age**2) * cov_ht.sex)
        # The covariate key is converted to a string before the join.
        ht = ht.annotate(**cov_ht.key_by(userId=hl.str(cov_ht.userId))[ht.key])
        ht.write(get_covariates_ht_path(), args.overwrite)

        get_filtered_mt(imputed=False).cols().export(get_final_sample_set())
# Preparing for PCA for_pca = filter_to_autosomes(mt) for_pca = for_pca.filter_rows(for_pca.n_alleles == 2) # Performing the PCA sample_num = for_pca.cols().count() _, scores, _ = hl.hwe_normalized_pca( for_pca.GT, k=max(1, min(sample_num // 3, 10)), compute_loadings=False ) relatedness_ht = hl.pc_relate( for_pca.GT, min_individual_maf=0.01, scores_expr=scores[for_pca.col_key].scores, block_size=4096, min_kinship=0.05, statistics="kin", ) pairs = relatedness_ht.filter(relatedness_ht["kin"] > RELATEDNESS) related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False) mt = mt.filter_cols(hl.is_defined(related_samples_to_remove[mt.col_key]), keep=False) # Wrapping up: saving relatednsess Table and dataset MatrixTable to disk relatedness_ht.write("relatedness.ht", overwrite=True) mt.write("sampleqc_pass.mt", overwrite=True)
def main(args):
    """Compute PC-Relate kinship and select related samples to remove.

    Unless ``args.skip_compute_pc_relate`` is set, the input matrix table is
    filtered to bi-allelic, high-callrate, common SNPs (``skip_filter_data``
    reuses the stored result), LD-pruned (``skip_prune_ld`` reuses the stored
    result), and run through PCA and PC-Relate. The stored relatedness table
    is then split into pair groups and `get_related_samples_to_drop` is run
    per group; the union is checkpointed and optionally exported as TSV.

    :param args: Parsed arguments; reads ``default_reference``,
        ``mt_input_path``, ``maf_threshold``, ``r2``, ``min_individual_maf``,
        ``min_kinship``, ``overwrite``, ``write_to_file`` and the skip flags.
    """
    # Init Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_compute_pc_relate:

        if not args.skip_filter_data:
            # Read MatrixTable
            mt = hl.read_matrix_table(args.mt_input_path)

            # filter variants (bi-allelic, high-callrate, common SNPs)
            logger.info(
                f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..."
            )
            mt = (mt.filter_rows(
                (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) &
                (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold) &
                (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99) &
                ~mt.was_split).repartition(500, shuffle=False))

            # keep only GT entry field and force to evaluate expression
            (mt.select_entries(mt.GT).write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt',
                overwrite=args.overwrite))

        mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt'
        )

        if not args.skip_prune_ld:
            # LD pruning
            # Avoid filtering / missingness entries (genotypes) before run LP pruning
            # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate"
            # mt = mt.unfilter_entries()

            # Prune variants in linkage disequilibrium.
            # Return a table with nearly uncorrelated variants
            logger.info(
                f'Pruning variants in LD from MT with {mt.count_rows()} variants...'
            )
            pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2)

            # Keep LD-pruned variants
            pruned_mt = (mt.filter_rows(hl.is_defined(
                pruned_variant_table[mt.row_key]), keep=True))
            pruned_mt.write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt',
                overwrite=args.overwrite)

        pruned_mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt')
        v, s = pruned_mt.count()
        logger.info(f'{s} samples, {v} variants found in LD-pruned MT')

        # Rebuild GT as an unphased diploid call from the alt-allele count.
        pruned_mt = pruned_mt.select_entries(
            GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

        # run pc_relate method...compute all stats
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht',
            overwrite=args.overwrite)

        logger.info(f'Running PC-Relate...')
        scores = hl.read_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht'
        )
        relatedness_ht = hl.pc_relate(
            call_expr=pruned_mt.GT,
            min_individual_maf=args.min_individual_maf,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=args.min_kinship,
            statistics='all')

        logger.info(f'Writing relatedness table...')
        # Write/export table to file
        relatedness_ht.write(
            output=
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht',
            overwrite=args.overwrite)

        # Write PCs table to file (if specified)
        # if args.write_to_file:
        #     # Export table to file
        #     relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz')

    # retrieve maximal independent set of related samples
    logger.info('Getting optimal set of related samples to prune...')
    relatedness_ht = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht')

    # Flatten the `i` / `j` key structs so the sample IDs become plain fields.
    relatedness_ht = (relatedness_ht.flatten().rename({
        'i.s': 'i',
        'j.s': 'j'
    }).repartition(100))

    # import trios info
    fam = import_fam_ht()
    mat_ids = hl.set(fam.mat_id.collect())
    fat_ids = hl.set(fam.pat_id.collect())

    # rank samples by retention priority (e.g. cases over controls)
    tb_rank = make_sample_rank_table(get_sample_meta_data())

    # apply min kinship to consider related pairs
    relatedness_ht = (relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP))

    # run maximal_independent_set stratified by groups
    # Note: This method fails when considering all pairs together (e.g. it removes most of the index in trios, we want
    # keep them (index) since they are mostly affected individuals rather than parents).

    # defining pairs group
    # TODO: check groups with updated fam file
    # Cases are evaluated in order, so a pair matching several conditions is
    # labelled by the first one that applies.
    relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when(
        relatedness_ht.kin > 0.40, 'twins_or_dups').when(
            mat_ids.contains(relatedness_ht.i) |
            mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when(
                fat_ids.contains(relatedness_ht.i) |
                fat_ids.contains(relatedness_ht.j),
                'pairs_child_fat').default('pairs_others')))

    groups = (relatedness_ht.aggregate(
        hl.agg.collect_as_set(relatedness_ht['pairs_group'])))

    # One table of samples to drop per pair group, unioned afterwards.
    tbs = []
    for pair_group in groups:
        pair_ht = relatedness_ht.filter(
            relatedness_ht.pairs_group == pair_group)
        tb = get_related_samples_to_drop(rank_table=tb_rank,
                                         relatedness_ht=pair_ht)
        tbs.append(tb)

    # NOTE(review): with no pairs above MIN_KINSHIP `tbs` is empty and this
    # call has no table to union - confirm that case cannot occur.
    related_samples_to_remove = hl.Table.union(*tbs)

    related_samples_to_remove.describe()

    related_samples_to_remove = related_samples_to_remove.checkpoint(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht',
        overwrite=args.overwrite)

    if args.write_to_file:
        (related_samples_to_remove.flatten().export(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv'
        ))

    hl.stop()
def main(args):
    """Run the sample-QC pipeline stages selected by boolean flags on ``args``.

    Every stage below is guarded by its own ``args.<flag>`` and is otherwise
    independent: stages exchange data only through tables written to / read
    from ``path(...)`` locations, so a later stage can be run on its own as
    long as the files it reads already exist.

    Side effects: sets the module-level ``output_prefix`` (built from
    ``args.output_dir`` and the base name of ``args.input_mt`` without its
    extension) and writes Hail tables / matrix tables, a pedigree file and
    pickled models.

    NOTE(review): ``path`` is defined elsewhere in this file; it presumably
    derives file locations from ``output_prefix`` -- confirm.

    :param args: parsed command-line arguments (stage flags, thresholds,
        ``input_mt``, ``output_dir``, ``meta``, ``overwrite``, ...).
    """
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(
        basename(args.input_mt))[0]

    # --- QC matrix table ---------------------------------------------------
    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    # --- Sample QC metrics, stratified by variant class --------------------
    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(
                mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'),
                                     overwrite=args.overwrite)
        # Re-read the per-stratum tables from disk, then merge them into one:
        # the last table is used as the base and the others are joined by key.
        strat_hts = [
            hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats
        ]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr([sample_qc_ht.sample_qc] + [
                strat_ht[sample_qc_ht.key].sample_qc
                for strat_ht in strat_hts
            ]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    # --- Call-rate matrix table over the exome calling intervals -----------
    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), overwrite=args.overwrite)

    # --- Platform PCA --------------------------------------------------------
    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'),
                        overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'),
                          overwrite=args.overwrite)

    # --- Platform assignment from platform PCs -----------------------------
    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        # NOTE(review): written with an explicit f-string but read back via
        # path('platform_pca_results.ht') below -- confirm both resolve to
        # the same location.
        platform_ht.write(f'{output_prefix}.platform_pca_results.ht',
                          overwrite=args.overwrite)

    # --- Sex imputation ------------------------------------------------------
    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold, args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    # --- Relatedness (PCA scores, then PC-Relate) --------------------------
    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), overwrite=args.overwrite)

        logger.info('Running PC-Relate')
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        # Use the scores as written to disk rather than the in-memory result.
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), overwrite=args.overwrite)

    # --- Duplicate samples ---------------------------------------------------
    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        # Rank is the negated mean depth.
        samples_rankings_ht = sample_qc_ht.select(
            rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')), samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    # --- Family inference ----------------------------------------------------
    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        # Collect the sample IDs listed in the `filtered` field of every row.
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s),
                           duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')), dups_to_remove)
        ped.write(path('pedigree.ped'))

    # --- Related samples to drop ---------------------------------------------
    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')), hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))

        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i,
            related_pairs_ht.j,
            keep=False,
            tie_breaker=related_pairs_tie_breaker)
        # Unpack the `node` struct into top-level fields and re-key by `s`.
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(
            **related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'),
                                         overwrite=args.overwrite)

    # --- Population PCA ------------------------------------------------------
    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')), args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'),
                                  overwrite=args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'),
                                overwrite=args.overwrite)

    # --- Global population assignment ----------------------------------------
    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[
            pop_pca_scores_ht.key]
        # Samples labelled "oth" get a missing known_pop.
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(known_pop=hl.or_missing(
            gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)

        pop_ht.write(path('pop.ht'), overwrite=args.overwrite)

        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)

    # --- Sub-population assignment -------------------------------------------
    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)
        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        qc_mt = hl.filter_intervals(qc_mt,
                                    platform_specific_intervals,
                                    keep=False)

        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    # --- PCA jointly with 1000 Genomes ---------------------------------------
    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(
            path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr),
                                  known_subpop=hl.null(hl.tstr))
        # The extra `_kgp` column key marks which dataset a column came from.
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(
            kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(known_pop=kgp_mt.super_pops.get(
            kgp_mt.population, "oth").lower(),
                                    known_subpop=kgp_mt.population.lower())
        # Keep only rows that are also present in the QC matrix table.
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(
            qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'),
                              overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        # Re-key with `_kgp=False` so the keys line up with the union columns.
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'),
                                        overwrite=args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'),
                                      overwrite=args.overwrite)

    # --- Population assignment using 1000 Genomes labels ---------------------
    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(
            path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)

        union_kgp_pop_ht.write(path('union_kgp_pop.ht'),
                               overwrite=args.overwrite)

        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    # --- Sub-population assignment using 1000 Genomes labels -----------------
    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt,
                                              platform_specific_intervals,
                                              keep=False)

        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)

        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    # --- Stratified metric filters -------------------------------------------
    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            # NOTE(review): this reads 'pops.ht', while the assign_pops stage
            # above writes 'pop.ht' -- confirm which file name is intended.
            pop_ht = hl.read_table(path('pops.ht'))
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht, args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)

    # --- Full metadata table -------------------------------------------------
    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        # NOTE(review): 'pca_scores.ht', 'pops.ht', 'nfe.pca_scores.ht' and
        # 'subpops.nfe.ht' are not written directly by any stage in this
        # function (the stages above write 'pop_pca_scores.ht' and 'pop.ht')
        # -- confirm these names against the helpers that produce them.
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename(
                {'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(
                related_filtered=True),
            hl.read_table(path('pca_scores.ht')).rename(
                {'scores': 'pop_pc_scores'}),
            hl.read_table(path('pops.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename(
                {'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                # Prefix the field names of the per-class tables so they do
                # not collide with the un-prefixed ('') tables when joined.
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename(
                    {
                        f: f'{variant_class_prefix}{f}'
                        for f in list(stratified_metrics_filters_ht.globals) +
                        list(stratified_metrics_filters_ht.row_value)
                    })
            meta_annotation_hts.extend(
                [sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        # Copy every global and every row field of each annotation table onto
        # the base metadata table (row fields are joined on meta_ht's key).
        meta_ht = meta_ht.annotate_globals(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht.index_globals().items()
            })
        meta_ht = meta_ht.annotate(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht[meta_ht.key].items()
            })

        # 'all' selects the un-prefixed filter columns; any other variant
        # class selects the columns carrying that class as a prefix.
        filtering_col_prefix = '' if args.filtering_variant_class == 'all' else args.filtering_variant_class + "_"
        meta_ht = meta_ht.annotate_globals(
            filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate':
                meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate':
                hl.is_defined(meta_ht.dup_filtered) & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[
                f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
# f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt", overwrite=True) pruned_mt = hl.read_matrix_table( f"{temp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt") # PC relate pruned_mt = pruned_mt.select_entries( GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles())) eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10, compute_loadings=False) # scores.write( # f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_pruned.pca_scores.ht", overwrite=True) relatedness_ht = hl.pc_relate(pruned_mt.GT, min_individual_maf=0.05, scores_expr=scores[pruned_mt.col_key].scores, block_size=4096, min_kinship=0.05, statistics='kin2') # relatedness_ht.write( # f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_relatedness.ht", overwrite=True) pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125) related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, keep=False) # related_samples_to_remove.write( # f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht", overwrite=True) pca_mt = pruned_mt.filter_cols(hl.is_defined( related_samples_to_remove[pruned_mt.col_key]), keep=False) related_mt = pruned_mt.filter_cols(hl.is_defined(
# Persist the matrix table (with its annotated filter information) so the
# QC steps that follow read from disk and run faster.
mt.write('path/for/joint/matrix')

# In[9]:

# Load the joint matrix table (all site mts combined) back from disk.
joint_data = 'path/for/joint/matrix'
mt_joint = hl.read_matrix_table(joint_data)

# In[12]:

# Estimate pairwise kinship for all samples with pc_relate, pick the samples
# to remove from pairs above the kinship cutoff, and flag those samples in a
# boolean `related_filter` column (True = failed the relatedness filter).
pc_rel = hl.pc_relate(
    mt_joint.GT,
    0.001,
    k=10,
    statistics='kin',
)
pairs = pc_rel.filter(pc_rel.kin > 0.125)
related_samples_to_remove = hl.maximal_independent_set(
    pairs.i,
    pairs.j,
    keep=False,
)
mt_joint = mt_joint.annotate_cols(
    related_filter=hl.is_defined(
        related_samples_to_remove[mt_joint.col_key]
    )
)

# NOTE(review): the next notebook cell is cut off in this view; its text is
# kept verbatim on one comment line below.
# In[11]: ''' --- Conducting QC --- QC Steps: snp call rate sample call rate sex violations
@author: nbaya """ import hail as hl hl.init(log='/tmp/foo.log') wd = 'gs://qc-nbaya/spark/array_May2019/preimputation/spark_preimp7/' bfile = wd + 'SPARK.27K.genotype.20190501.hg19_preimp7.founders' #print(f'Using bfile: {bfile}') #mt = hl.import_plink(bed=bfile+'.bed', # bim=bfile+'.bim', # fam=bfile+'.fam') # #mt = mt.checkpoint(bfile+'.mt') mt = hl.read_matrix_table(bfile + '.mt') min_kinship = 0.09375 / 2 pcrelate = hl.pc_relate(call_expr=mt.GT, min_individual_maf=0.01, k=20, min_kinship=min_kinship, statistics='kin') ct = pcrelate.count() print('\n############\ncount:{ct}\n############') pcrelate.export(bfile + '.pc_relate.v2.tsv.bgz')