def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr, 'alleles': hl.tstr, 'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus), alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]), n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
pop_file = 'gs://rcstorage/population/ccdgf2_predicted_ethnicity_PC1-15.tsv'

# define output files
sample_qc_info_preqc_file = 'gs://rcstorage/qced/' + chrom + '/ccdgf2_sample_qc_info_preqc.txt'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Annotate samples with population
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("annotating ethnicity")
table = hl.import_table(pop_file, impute=True).key_by('Sample')
vds = vds.annotate_cols(**table[vds.s])

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Perform sample QC on remaining variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("sample QC...")
vds = hl.sample_qc(vds)
print("writing sample QC results...")
vds.cols().flatten().export(sample_qc_info_preqc_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
f"{temp_dir}/ddd-elgh-ukbb/new_labels/chr1_chr20_ldpruned_updated.mt") # pca_scores_pop pca_scores_pop = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020.ht") # pca_scores_superpop pca_scores_superpop = hl.read_table( f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020_superpops.ht") # annotate mt with pop and superpop mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop) mt = mt.annotate_cols(assigned_superpop=pca_scores_superpop[mt.s].pop) # do sample_qc # calculate and annotate with metric heterozygosity mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc') mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate( heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het/mt_with_sampleqc.sample_qc.n_called)) # save sample_qc and heterozygosity table as ht table mt_with_sampleqc.write( f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.mt", overwrite=True) mt_with_sampleqc.cols().write( f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht", overwrite=True) pop_ht = hl.read_table( f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht") # run function on metrics including heterozygosity first for pops: qc_metrics = ['heterozygosity_rate', 'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var'] pop_filter_ht = compute_stratified_metrics_filter( pop_ht, qc_metrics, ['assigned_pop'])
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"), cov=hl.struct(PC1=hl.rand_norm(0, 1)), cov1=hl.rand_norm(0, 1), cov2=hl.rand_norm(0, 1), cohort="SIGMA") ds = ds.annotate_globals( global_field_1=5, global_field_2=10, pli={
intervals = [
    hl.parse_locus_interval(x, reference_genome='GRCh38')
    for x in ['chr1:START-chr22:END', 'chrX:START-chrX:END', 'chrY:START-chrY:END']
]
mt = hl.filter_intervals(mt, intervals)

# Filter out the invariant rows.
mt = hl.variant_qc(mt, name='qc')
mt = mt.filter_rows((mt.qc.AF[0] > 0.0) & (mt.qc.AF[0] < 1.0))

mt.rows().select().export(PADDED_150_INITIAL_VARIANT_LIST)
n_variants = hl.import_table(PADDED_150_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)
mt = hl.sample_qc(mt, name='qc_150')

mt = mt.filter_rows(mt.not_in_padded_target_intervals_100, keep=False)
mt.rows().select().export(PADDED_100_INITIAL_VARIANT_LIST)
n_variants = hl.import_table(PADDED_100_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)
mt = hl.sample_qc(mt, name='qc_100')

mt = mt.filter_rows(mt.not_in_padded_target_intervals_50, keep=False)
mt.rows().select().export(PADDED_50_INITIAL_VARIANT_LIST)
n_variants = hl.import_table(PADDED_50_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)
num2 = vds_post.count()
print(num2)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IV. Filtering variants without PASS
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing variants...")
vds_post = vds_post.filter_rows(vds_post.label == 'PASS', keep=True)
num3 = vds_post.count()
print(num3)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# V. Performing sample QC on remaining variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("sample QC...")
vds_post = hl.sample_qc(vds_post)
print("writing sample QC results...")
vds_post.cols().flatten().export(sample_qc_info_postqc_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
def sample_qc():
    hl.sample_qc(get_mt()).cols()._force_count()
def variant_and_sample_qc():
    mt = get_mt()
    hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"] for f in files: if os.path.isdir(f): shutil.rmtree(f) ds = hl.import_vcf('data/sample.vcf.bgz') ds = ds.sample_rows(0.03) ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5), panel_maf=0.1, anno1=5, anno2=0, consequence="LOF", gene="A", score=5.0) ds = ds.annotate_rows(a_index=1) ds = hl.sample_qc(hl.variant_qc(ds)) ds = ds.annotate_cols(is_case=True, pheno=hl.struct(is_case=hl.rand_bool(0.5), is_female=hl.rand_bool(0.5), age=hl.rand_norm(65, 10), height=hl.rand_norm(70, 10), blood_pressure=hl.rand_norm(120, 20), cohort_name="cohort1"), cov=hl.struct(PC1=hl.rand_norm(0, 1)), cov1=hl.rand_norm(0, 1), cov2=hl.rand_norm(0, 1), cohort="SIGMA") ds = ds.annotate_globals(global_field_1=5, global_field_2=10, pli={'SCN1A': 0.999, 'SONIC': 0.014}, populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
"Other")))) mt = mt.checkpoint( f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt", overwrite=True) print("Finished splitting and writing mt. ") intersection_table = hl.import_bed( intersection_bed, reference_genome='GRCh38') union_table = hl.import_bed(union_bed, reference_genome='GRCh38') mt_intersection = mt.filter_rows( hl.is_defined(intersection_table[mt.locus])) mt_union = mt.filter_rows(hl.is_defined(union_table[mt.locus])) mt_intersection = hl.sample_qc(mt_intersection, name='sample_QC_Hail') pandadf1 = mt_intersection.cols().flatten() print("Outputting table of sample qc") pandadf1.export( f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_intersection_BED_sampleQC.tsv.bgz", header=True) mt_intersection = mt_intersection.checkpoint( f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-intersection_BED.mt", overwrite=True) mt = mt.checkpoint( f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_sex_annotated.mt", overwrite=True) mt_union = hl.sample_qc(mt_union, name='sample_QC_Hail') pandadf2 = mt_union.cols().flatten() print("Outputting table of sample qc") pandadf2.export(
###################### UNFILTERED SAMPLE AND VARIANT QC #############
#####################################################################
print('Annotating rows with snp and indel info')
mt = mt_split.annotate_rows(Variant_Type=hl.cond(
    (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
    hl.cond(
        hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                "INDEL", "Other"))))

# Unfiltered data summary stats:
print("Finished annotating rows, annotating columns now")
mt_sqc1_unfiltered = mt.annotate_cols(
    sample_QC_nonHail=sample_QC_nonHail.key_by("ID")[mt.s])
mt_sqc2_unfiltered = hl.sample_qc(mt_sqc1_unfiltered, name='sample_QC_Hail')
panda_df_unfiltered_table = mt_sqc2_unfiltered.cols().flatten()
print("Outputting table of sample qc")
panda_df_unfiltered_table.export(
    f"{BUCKET}/output-tables/{CHROMOSOME}/{CHROMOSOME}_sampleQC_unfiltered.tsv.bgz",
    header=True)

# Variant QC
mt2 = hl.variant_qc(mt_sqc2_unfiltered, name='variant_QC_Hail')
print('Exporting variant qc pandas table to disk')
mt_rows = mt2.rows()
mt_rows.select(mt_rows.variant_QC_Hail).flatten().export(
    f"{BUCKET}/output-tables/{CHROMOSOME}/{CHROMOSOME}_variantQC_unfiltered.tsv.bgz",
# The purpose of this script is to format and write out a matrix table which
# will be used to create 'table_x' for our resource manuscript.
# author: Zan Koenig

import hail as hl

hl.init()

# Read in the post-QC version of the merged dataset (with metadata).
mt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/hgdp_tgp_postQC.mt')

# Run sample_qc to get the n_snp and n_singleton counts.
mt = hl.sample_qc(mt, name="new_sample_qc")

# Grab only the columns from the matrix table (outputs a table of just the columns).
col_table = mt.cols()

# Keep only the fields needed for table x.
col_table = col_table.select(col_table.hgdp_tgp_meta.Study.region,
                             col_table.hgdp_tgp_meta.Population,
                             col_table.new_sample_qc.n_snp,
                             col_table.new_sample_qc.n_singleton,
                             col_table.bam_metrics.mean_coverage)

# Write out col_table as a checkpoint to make the downstream steps run faster.
col_table.checkpoint('gs://african-seq-data/hgdp_tgp/table_x_checkpoint.ht')

# Re-read the checkpointed column table.
col_table = hl.read_table('gs://african-seq-data/hgdp_tgp/table_x_checkpoint.ht')
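# Hypothetical follow-up, not part of the original script: the checkpointed
# column table could be flattened and exported to TSV for the manuscript with
# Table.export. The output path below is illustrative only.
col_table.flatten().export('gs://african-seq-data/hgdp_tgp/table_x.tsv')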
        global_ADhet_25=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.25),
        global_ADhet_30=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.30),
        global_ADhet_35=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.35),
        global_ADhom=hl.agg.stats((mt_het.AD[1] + mt_het.AD[1]) / mt_het.DP)))
print(het_struct)

mt_hom_var = mt.filter_entries(mt.GT.is_hom_var())
hom_struct = mt_hom_var.aggregate_entries(
    hl.struct(
        global_ADhet=hl.agg.stats(mt_hom_var.AD[1] / mt_hom_var.DP),
        global_ADhom=hl.agg.stats(
            (mt_hom_var.AD[0] + mt_hom_var.AD[1]) / mt_hom_var.DP),
        global_ADhom_80=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.8),
        global_ADhom_85=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.85),
        global_ADhom_90=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.9),
        global_ADhom_95=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.95)))
print(hom_struct)

# Also examine the Ti/Tv ratio within the calling intervals (excluding the padding).
mt = mt.filter_rows(~mt.not_in_target_intervals)
mt = hl.sample_qc(mt, name='sample_qc_in_target')
mt.cols().select("imputesex", "sample_qc",
                 "sample_qc_in_target").flatten().export(SAMPLE_QC_IN_TARGET)
    'locus').distinct_by_row().key_rows_by('locus', 'alleles')
mt_split = hl.split_multi_hts(mt_annotated, keep_star=False, left_aligned=False)
mt = mt_split.annotate_rows(Variant_Type=hl.cond(
    (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
    hl.cond(
        hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                "INDEL", "Other"))))
mt = mt.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",
    overwrite=True)
print("Finished splitting and writing mt.")

agilent_table = hl.import_bed(agilent, reference_genome='GRCh38')
mt_agilent = mt.filter_rows(hl.is_defined(agilent_table[mt.locus]))
mt_agilent = hl.sample_qc(mt_agilent, name='sample_QC_Hail')
pandadf1 = mt_agilent.cols().flatten()
print("Outputting table of sample qc")
pandadf1.export(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_agilent_sampleQC.tsv.bgz",
    header=True)

mt = mt.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_annotated.mt",
    overwrite=True)
def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
import hail as hl
from bokeh.models import CategoricalColorMapper  # needed for the PCA plot below
from bokeh.palettes import turbo

hl.init()

mt = hl.read_matrix_table(
    'gs://gcp-public-data--gnomad/release/3.1/mt/genomes/'
    'gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt')

# mt exploration - explore rows and columns
mt.count_rows()
mt.count_cols()
mt.cols().show()
mt.rows().show()

# mt qc check
mt_qc = hl.sample_qc(mt)
p = hl.plot.histogram(mt_qc.sample_qc.call_rate, range=(0.88, 1), legend='Call Rate')
p_2 = hl.plot.histogram(mt_qc.sample_qc.gq_stats.mean, legend='Mean Sample GQ')

# PCA
columns = mt.cols()
pca_scores = columns.population_inference.pca_scores
labels = columns.population_inference.pop
pops = list(set(labels.collect()))
mapper = CategoricalColorMapper(palette=turbo(8), factors=pops)

# plot the first 5 PCs
p = hl.plot.scatter(
    pca_scores[0],
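# The two histograms above are built but never rendered. A hedged way to
# persist one of them, assuming bokeh is available (hl.plot returns bokeh
# figures); the output filename is illustrative only:
from bokeh.io import output_file, save

output_file('mean_sample_gq_hist.html')
save(p_2)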
mt_before = mt_before.annotate_cols(phenotype=sample_annotations[mt_before.col_key])
mt_before = mt_before.annotate_cols(imputesex=impute_sex_annotations[mt_before.col_key])

mt_before = hl.variant_qc(mt_before, name='qc')
mt_before = mt_before.annotate_rows(qc=mt_before.qc.annotate(
    AC=mt_before.qc.AC[1],
    AF=mt_before.qc.AF[1],
    homozygote_count=mt_before.qc.homozygote_count[1]))
mt_before = mt_before.filter_rows((mt_before.qc.AF > 0) & (mt_before.qc.AF < 1))
mt_before = hl.sample_qc(mt_before)

n = mt_before.count()
print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

mt_before = mt_before.annotate_cols(sex=hl.case().when(
    mt_before.imputesex.impute_sex.is_female, "Female").default("Male"))

mt_after = mt_before.filter_rows(hl.is_defined(ht_final_variants[mt_before.row_key]))
mt_after = hl.sample_qc(mt_after)
# This is a Python script loosely based on Kumar and Konrad's effort here:
# https://github.com/mkveerapen/covid19_sequencing
# Again, some of the QC at our institution was done by our genome center, so
# refer to the link above for more thorough QC. In particular, variant
# recalibration should still be done even though it is not shown here; you
# can discuss with me how to do it using gatk.

import hail as hl

# tmp_dir is where some of the temporary computations are done. Make sure to
# assign it to a folder that does not have a strict data cap.
hl.init(spark_conf=None, tmp_dir='/path/to/tmp_dir/')

# Import the data and run sample QC.
hl.import_vcf('/path/to/sequence.file.normID.noChrM.vcf.gz',
              min_partitions=4,
              reference_genome='GRCh38',
              force_bgz=True).write('/hailFiles/hail.full.normID.noChrM.mt',
                                    overwrite=True)

# Read back the matrix table written above (the original read used a path
# missing the 'normID' component, which could never exist).
mtAll = hl.read_matrix_table('/hailFiles/hail.full.normID.noChrM.mt')
mtAll = mtAll.annotate_entries(AB=(mtAll.AD[1] / hl.sum(mtAll.AD)))
mtAll = hl.sample_qc(mtAll)
mtAll = mtAll.filter_cols((mtAll.sample_qc.call_rate >= 0.97)
                          & (mtAll.sample_qc.dp_stats.mean >= 20))
mtAll = mtAll.filter_entries(
    (mtAll.GQ >= 20)
    & (mtAll.DP >= 10)
    & ((mtAll.GT.is_hom_ref() & (mtAll.AB <= 0.1))
       | (mtAll.GT.is_het() & (mtAll.AB >= 0.25) & (mtAll.AB <= 0.75))
       | (mtAll.GT.is_hom_var() & (mtAll.AB >= 0.9))))

hl.export_vcf(mtAll, '/path/to/sequence.file.normID.GTflt.AB.noChrM.vcf.gz')
# In[6]:

# mt_paths is a list of file paths for each of the datasets to be merged and QC'd
mt_paths = ['file/path1', 'file/path2']

# In[7]:

# Reading in and creating a list of all of the site matrix tables
mt_list = [hl.import_vcf(mt_path, force_bgz=True) for mt_path in mt_paths]
# Importing the metadata file as a hail table
meta = hl.import_table(meta_data)

# In[8]:

# Annotating the matrix tables with sample QC data
mt_list = [hl.sample_qc(mt, name='sample_qc') for mt in mt_list]

# In[89]:

# Annotating the matrix tables with variant QC data
mt_list = [hl.variant_qc(mt, name='variant_qc') for mt in mt_list]

# In[90]:

# Annotating matrix tables with metadata from the meta table (see annotateMeta for details)
mt_list = [annotateMeta(mt, meta, 'chip_well_barcode') for mt in mt_list]

# In[91]:

# Annotating matrix tables with sex filter results (see checkSex for details)
mt_list = [checkSex(mt) for mt in mt_list]
INITIAL_SAMPLE_QC_FILE_INV_REMOVED = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_sample_qc_b37_callset_inv_removed.tsv'

variants_to_filter = hl.import_table(
    INITIAL_VARIANT_AUTO_LIST,
    types={'locus': hl.tlocus(), 'alleles': hl.tarray(hl.tstr)})
variants_to_filter = variants_to_filter.key_by(
    locus=variants_to_filter.locus, alleles=variants_to_filter.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)
pprint(sample_annotations.describe())

mt = hl.read_matrix_table(MT)
mt = mt.filter_rows(hl.is_defined(variants_to_filter[mt.row_key]))
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])

mt_invariant_included = hl.sample_qc(mt, name='qc')
mt_invariant_included.cols().select(
    'phenotype', 'qc').flatten().export(output=INITIAL_SAMPLE_QC_FILE)

mt = hl.variant_qc(mt, name='qc')
mt_invariant_removed = mt.filter_rows((mt.qc.AF[0] > 0.0) & (mt.qc.AF[0] < 1.0))
mt_invariant_removed = hl.sample_qc(mt_invariant_removed, name='qc_sample')
mt_invariant_removed.cols().select(
    'phenotype',
    'qc_sample').flatten().export(output=INITIAL_SAMPLE_QC_FILE_INV_REMOVED)
def sample_qc(mt_path):
    hl.sample_qc(hl.read_matrix_table(mt_path)).cols()._force_count()
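# A hedged harness for timing this benchmark body outside whatever benchmark
# framework it normally runs in, mirroring the timeit.default_timer pattern
# used elsewhere in this corpus. 'data/example.mt' is a placeholder path.
import timeit

start = timeit.default_timer()
sample_qc('data/example.mt')
print("sample_qc runtime: " + str(timeit.default_timer() - start) + " seconds")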
def samples_qc(mt, mt_to_annotate, args):
    """
    Performs sample QC on a matrix table, flagging samples that fail on
    chimera or contamination percentage, or that fall more than the
    configured number of standard deviations from the mean on Ti/Tv,
    het/hom-var, and insertion/deletion ratios or n_singletons for a specific
    batch or cohort.

    :param mt: matrix table, low-pass failing variants and genotypes filtered out
    :param mt_to_annotate: matrix table to annotate with failing-samples
        information after calculating it on the filtered mt
    :param args: command-line arguments carrying thresholds and column names
    :return: returns the annotated, unfiltered matrix table
    """
    datestr = time.strftime("%Y.%m.%d")

    # Run sample QC to get up-to-date sample QC metrics
    mt = hl.sample_qc(mt)

    # Pull data to cols and checkpoint
    mt_cols = mt.cols()
    mt_cols = mt_cols.checkpoint("samples_qc_cols_tmp.ht", overwrite=True)

    # Instantiate empty array for failing samples QC tags
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.empty_array(hl.tstr))

    ############################################################
    # Find samples failing on chimeras or contamination values #
    ############################################################
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.chimeras_col] > args.chimeras_max)
        & hl.is_defined(mt_cols[args.chimeras_col]),
        mt_cols.failing_samples_qc.append("failing_chimeras"),
        mt_cols.failing_samples_qc))
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.contamination_col] > args.contamination_max)
        & hl.is_defined(mt_cols[args.contamination_col]),
        mt_cols.failing_samples_qc.append("failing_contamination"),
        mt_cols.failing_samples_qc))

    failing_chim = mt_cols.aggregate(
        hl.agg.count_where(mt_cols.failing_samples_qc.contains("failing_chimeras")))
    miss_chim = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.chimeras_col]))))
    failing_contam = mt_cols.aggregate(
        hl.agg.count_where(mt_cols.failing_samples_qc.contains("failing_contamination")))
    miss_contam = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.contamination_col]))))

    logging.info(f"Number of samples failing on chimeras % > {args.chimeras_max}: {failing_chim}")
    logging.info(f"Number of samples missing chimeras %: {miss_chim}")
    logging.info(f"Number of samples failing on contamination % > {args.contamination_max}: {failing_contam}")
    logging.info(f"Number of samples missing contamination %: {miss_contam}")

    chim_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.chimeras_col]))
    cont_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.contamination_col]))
    logging.info(f"Chimeras statistics: {chim_stats}")
    logging.info(f"Contamination statistics: {cont_stats}")

    ###############################################
    # Find samples failing on sex-aware call rate #
    ###############################################
    if args.sample_call_rate is not None:
        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            (mt_cols.sexaware_sample_call_rate < args.sample_call_rate)
            & hl.is_defined(mt_cols.sexaware_sample_call_rate),
            mt_cols.failing_samples_qc.append("failing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))
        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sexaware_sample_call_rate)),
            mt_cols.failing_samples_qc.append("missing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        failing_cr = mt_cols.aggregate(hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_sexaware_sample_call_rate")))
        missing_cr = mt_cols.aggregate(hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("missing_sexaware_sample_call_rate")))

        logging.info(f"Number of samples failing on sex-aware call rate > {args.sample_call_rate}: {failing_cr}")
        logging.info(f"Number of samples missing sex-aware call rate: {missing_cr}")

        cr_stats = mt_cols.aggregate(hl.agg.stats(mt_cols.sexaware_sample_call_rate))
        logging.info(f"Sex-aware call rate statistics: {cr_stats}")

    #####################################################################################
    # Find samples failing per-cohort on titv, het_homvar ratio, indels, and singletons #
    #####################################################################################
    if args.batch_col_name is not None:
        batch_none = mt_cols.aggregate(
            hl.agg.count_where(~(hl.is_defined(mt_cols[args.batch_col_name]))))
        mt_cols = mt_cols.annotate(**{
            args.batch_col_name:
                hl.or_else(mt_cols[args.batch_col_name], "no_batch_info")})

        if batch_none > 0:
            logging.info(
                f"Warning- {batch_none} samples have batch undefined. These samples will be grouped in one "
                f"batch for sample QC (named no_batch_info).")
            # mt_cols is a Table, so the row filter is Table.filter (the
            # original called the nonexistent Table.filter_cols)
            mt_cols.filter(mt_cols[args.batch_col_name] == "no_batch_info").s.show(batch_none + 1)

        batch_set = mt_cols.aggregate(hl.agg.collect_as_set(mt_cols[args.batch_col_name]))
    else:
        args.batch_col_name = "mock_batch_col"
        mt_cols = mt_cols.annotate(mock_batch_col="all")
        batch_set = ["all"]

    # Convert batch strings to numeric values, create label for plotting
    batch_set_numeric = list(range(len(batch_set)))
    batch_key = list(zip(batch_set, batch_set_numeric))

    mt_cols = mt_cols.annotate(plot_batch=0)
    for batch in batch_key:
        mt_cols = mt_cols.annotate(plot_batch=hl.cond(
            mt_cols[args.batch_col_name] == batch[0], batch[1], mt_cols.plot_batch))
    mt_cols = mt_cols.annotate(plot_batch_jitter=mt_cols.plot_batch + hl.rand_unif(-0.3, 0.3))

    batch_thresholds = {}
    batch_statistics = {}
    for measure in ['r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton']:
        logging.info(f"Performing sample QC for measure {measure}")
        # Instantiate/reset box plot label
        mt_cols = mt_cols.annotate(boxplot_label=mt_cols[args.batch_col_name])

        batch_thresholds[measure] = {}
        batch_statistics[measure] = {}

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sample_qc[measure])),
            mt_cols.failing_samples_qc.append(f"missing_{measure}"),
            mt_cols.failing_samples_qc))

        for batch in batch_set:
            # See if any values are defined at all
            defined_values = mt_cols.aggregate(
                hl.agg.count_where(hl.is_defined(mt_cols.sample_qc[measure])))

            if defined_values > 0:
                # Get mean and standard deviation for each measure, for each batch's samples
                stats = mt_cols.aggregate(hl.agg.filter(
                    mt_cols[args.batch_col_name] == batch,
                    hl.agg.stats(mt_cols.sample_qc[measure])))

                # Get cutoffs for each measure
                cutoff_upper = stats.mean + (args.sampleqc_sd_threshold * stats.stdev)
                cutoff_lower = stats.mean - (args.sampleqc_sd_threshold * stats.stdev)

                if measure == "n_singleton":
                    logging.info(f"Max number of singletons for batch {batch}: {stats.max}")

                mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    mt_cols.failing_samples_qc.append(f"failing_{measure}"),
                    mt_cols.failing_samples_qc))

                mt_cols = mt_cols.annotate(boxplot_label=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    "outlier", mt_cols.boxplot_label))

                # Collect thresholds and statistics for each batch
                batch_thresholds[measure][batch] = {
                    'min_thresh': cutoff_lower, 'max_thresh': cutoff_upper}
                batch_statistics[measure][batch] = stats
            else:
                logging.error(
                    f"Error- no defined values for measure {measure}. NAs can be introduced by division by "
                    f"zero. Samples not filtered on {measure}!")

        # Create plot for measure for each batch
        output_file(f"{datestr}_samples_qc_plots_{measure}.html")
        p = hl.plot.scatter(mt_cols.plot_batch_jitter,
                            mt_cols.sample_qc[measure],
                            label=mt_cols.boxplot_label,
                            title=f"{measure} values split by batch.")
        save(p)

    ##########################
    # Report failing samples #
    ##########################
    for measure in ['r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton']:
        failing_count = mt_cols.aggregate(hl.agg.count_where(
            mt_cols.failing_samples_qc.contains(f"failing_{measure}")))
        missing_count = mt_cols.aggregate(hl.agg.count_where(
            mt_cols.failing_samples_qc.contains(f"missing_{measure}")))
        logging.info(f"Number of samples failing on {measure}: {failing_count}")
        logging.info(f"Number of samples missing {measure}: {missing_count}")

    failing_any = mt_cols.aggregate(
        hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0))
    logging.info(f"Number of samples failing samples QC on any measure: {failing_any}")

    if args.pheno_col is not None:
        cases_failing = mt_cols.aggregate(hl.agg.filter(
            mt_cols[args.pheno_col] == True,
            hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        controls_failing = mt_cols.aggregate(hl.agg.filter(
            mt_cols[args.pheno_col] == False,
            hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        logging.info(f"Cases failing QC: {cases_failing}")
        logging.info(f"Controls failing QC: {controls_failing}")

    #######################################################################################################
    # Annotate original (unfiltered) matrix table with failing samples QC information + sample QC measure #
    #######################################################################################################
    mt_to_annotate = mt_to_annotate.annotate_cols(
        sample_qc=mt_cols[mt_to_annotate.s].sample_qc)
    mt_to_annotate = mt_to_annotate.annotate_cols(
        failing_samples_qc=mt_cols[mt_to_annotate.s].failing_samples_qc)

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_batches=batch_statistics)
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_chim_cont={'chimeras': chim_stats,
                                    'contamination': cont_stats})
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_thresholds={
            'chimeras_max': str(args.chimeras_max),
            'contamination_max': str(args.contamination_max),
            'deviation_multiplier_threshold': str(args.sampleqc_sd_threshold),
            'batches': str(batch_set),
            'batch_cohort_name': str(args.batch_col_name)})
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_batch_thresholds=batch_thresholds)

    return mt_to_annotate
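# Illustrative call only: samples_qc expects an argparse-style namespace.
# The attribute names below are taken from the function body; the column
# names, thresholds, and the mt_filtered/mt_raw inputs are made-up examples.
from argparse import Namespace

example_args = Namespace(
    chimeras_col='pct_chimeras', chimeras_max=0.05,
    contamination_col='freemix', contamination_max=0.05,
    sample_call_rate=0.95, batch_col_name='batch',
    sampleqc_sd_threshold=4, pheno_col=None)
mt_annotated = samples_qc(mt_filtered, mt_raw, example_args)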
def variant_and_sample_qc(mt_path):
    mt = hl.read_matrix_table(mt_path)
    hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
def test_sample_qc(self):
    data = [
        {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
        {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
        {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
        {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
        {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
        {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
    ]

    ht = hl.Table.parallelize(
        data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
    ht = ht.transmute(**hl.parse_variant(ht.v))
    mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
    mt = hl.sample_qc(mt, 'sqc')
    r = mt.cols().select('sqc').collect()

    self.assertAlmostEqual(r[0].sqc.gq_stats.mean, 11)
    self.assertAlmostEqual(r[0].sqc.gq_stats.stdev, 6.6332495807)
    self.assertAlmostEqual(r[0].sqc.gq_stats.min, 0)
    self.assertAlmostEqual(r[0].sqc.gq_stats.max, 20)
    self.assertAlmostEqual(r[0].sqc.dp_stats.mean, 3.399999999)
    self.assertAlmostEqual(r[0].sqc.dp_stats.stdev, 1.8547236990)
    self.assertAlmostEqual(r[0].sqc.dp_stats.min, 0)
    self.assertAlmostEqual(r[0].sqc.dp_stats.max, 5)
    self.assertAlmostEqual(r[0].sqc.call_rate, 0.8333333333)
    self.assertEqual(r[0].sqc.n_called, 5)
    self.assertEqual(r[0].sqc.n_not_called, 1)
    self.assertEqual(r[0].sqc.n_hom_ref, 1)
    self.assertEqual(r[0].sqc.n_het, 1)
    self.assertEqual(r[0].sqc.n_hom_var, 3)
    self.assertEqual(r[0].sqc.n_insertion, 2)
    self.assertEqual(r[0].sqc.n_deletion, 0)
    self.assertEqual(r[0].sqc.n_singleton, 3)
    self.assertEqual(r[0].sqc.n_transition, 1)
    self.assertEqual(r[0].sqc.n_transversion, 3)
    self.assertEqual(r[0].sqc.n_star, 0)
    self.assertEqual(r[0].sqc.n_non_ref, 4)
    self.assertAlmostEqual(r[0].sqc.r_ti_tv, 0.333333333)
    self.assertAlmostEqual(r[0].sqc.r_het_hom_var, 0.3333333333)
    self.assertAlmostEqual(r[0].sqc.r_insertion_deletion, None)
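# A quick back-of-the-envelope check of the expected values in the test
# above, independent of Hail: the gq_stats/dp_stats assertions match the
# population (not sample) standard deviation over the defined calls, and
# r_ti_tv is n_transition / n_transversion.
from statistics import pstdev

gq = [10, 15, 10, 20, 0]  # GQ values with the missing entry (variant 1:4) dropped
dp = [0, 5, 4, 5, 3]      # DP values with the missing entry (variant 1:6) dropped
assert sum(gq) / len(gq) == 11
assert abs(pstdev(gq) - 6.6332495807) < 1e-9
assert abs(sum(dp) / len(dp) - 3.4) < 1e-9
assert abs(pstdev(dp) - 1.8547236990) < 1e-9
assert abs(1 / 3 - 0.333333333) < 1e-8  # 1 transition (1:4 G>A) vs 3 transversions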
def main(args):
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(basename(args.input_mt))[0]

    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'), overwrite=args.overwrite)
        strat_hts = [hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr([sample_qc_ht.sample_qc] + [
                strat_hts[i][sample_qc_ht.key].sample_qc
                for i in range(0, len(strat_hts))
            ]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), args.overwrite)

    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'), overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'), overwrite=args.overwrite)

    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        platform_ht.write(f'{output_prefix}.platform_pca_results.ht', overwrite=args.overwrite)

    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold,
                           args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT, k=10, compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), args.overwrite)

        logger.info('Running PC-Relate')
        logger.warn("PC-relate requires SSDs and doesn't work with preemptible workers!")
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), args.overwrite)

    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        samples_rankings_ht = sample_qc_ht.select(
            rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')), samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s), duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')),
                             dups_to_remove)
        ped.write(path('pedigree.ped'))

    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')),
            hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))
        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i, related_pairs_ht.j,
            keep=False, tie_breaker=related_pairs_tie_breaker)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(
            **related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'),
                                         overwrite=args.overwrite)

    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')),
            args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'), args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'), args.overwrite)

    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[pop_pca_scores_ht.key]
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(
            known_pop=hl.or_missing(gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)
        pop_ht.write(path('pop.ht'), args.overwrite)
        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)
        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.')
        qc_mt = hl.filter_intervals(qc_mt, platform_specific_intervals, keep=False)
        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr),
                                  known_subpop=hl.null(hl.tstr))
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(
            known_pop=kgp_mt.super_pops.get(kgp_mt.population, "oth").lower(),
            known_subpop=kgp_mt.population.lower())
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'), overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'), args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'), args.overwrite)

    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)
        union_kgp_pop_ht.write(path('union_kgp_pop.ht'), args.overwrite)
        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)
        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.')
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt,
                                              platform_specific_intervals,
                                              keep=False)
        related_samples_to_drop_ht = hl.read_table(path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(path(f'{variant_class_prefix}sample_qc.ht'))
            pop_ht = hl.read_table(path('pops.ht'))
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht,
                args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)

    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename(
                {'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(
                related_filtered=True),
            hl.read_table(path('pca_scores.ht')).rename({'scores': 'pop_pc_scores'}),
            hl.read_table(path('pops.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename({'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename({
                    f: f'{variant_class_prefix}{f}'
                    for f in list(stratified_metrics_filters_ht.globals)
                    + list(stratified_metrics_filters_ht.row_value)
                })
            meta_annotation_hts.extend([sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        meta_ht = meta_ht.annotate_globals(**{
            name: expr
            for ann_ht in meta_annotation_hts
            for name, expr in ann_ht.index_globals().items()
        })
        meta_ht = meta_ht.annotate(**{
            name: expr
            for ann_ht in meta_annotation_hts
            for name, expr in ann_ht[meta_ht.key].items()
        })

        filtering_col_prefix = ('' if args.filtering_variant_class == 'all'
                                else args.filtering_variant_class + "_")
        meta_ht = meta_ht.annotate_globals(
            filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate': meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate': hl.is_defined(meta_ht.dup_filtered) & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
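# A hedged sketch of the command-line entry point implied by main(args).
# The flag names mirror the attribute accesses in the function body, but the
# wiring is an assumption, not the original script's parser; per-step numeric
# options (e.g. --n-pcs, thresholds) are omitted and would need to be added
# for the corresponding steps to run.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-mt', dest='input_mt', required=True)
    parser.add_argument('--output-dir', dest='output_dir', required=True)
    parser.add_argument('--meta', dest='meta')
    parser.add_argument('--overwrite', action='store_true')
    # One boolean flag per pipeline step, as consumed by main() above.
    for flag in ['compute_qc_mt', 'compute_qc_metrics', 'compute_callrate_mt',
                 'run_platform_pca', 'assign_platforms', 'impute_sex',
                 'run_pc_relate', 'filter_dups', 'infer_families',
                 'filter_related_samples', 'run_pca', 'assign_pops',
                 'assign_subpops', 'run_kgp_pca', 'assign_pops_kgp',
                 'assign_subpops_kgp', 'apply_stratified_filters',
                 'write_full_meta']:
        parser.add_argument(f"--{flag.replace('_', '-')}", dest=flag,
                            action='store_true')
    main(parser.parse_args())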
                                       mt.alleles[1]))  # ref (0) to alternates (1)

# Check SNPs: unique possible reference (ref) and alternate (alt) allele calls
# from the entire dataset (all samples)
unique_allelecalls = mt_snp.aggregate_rows(
    hl.struct(ref=hl.agg.collect_as_set(mt_snp.alleles[0]),
              alt=hl.agg.collect_as_set(mt_snp.alleles[1])))
pprint(unique_allelecalls)

# Check SNPs: show all lengths of vectors with possible alleles (including ref and alternate)
a = mt_snp.aggregate_rows(hl.agg.collect_as_set(hl.len(mt_snp.alleles)))
pprint(a)

mt_AF = mt.filter_rows(mt.variant_qc.AF[1] >= 0.01)

######## 3. QUALITY CONTROL SAMPLES

######## 3.1 Filter samples for outliers more than (6 * SD) from mean (Part 1)
# Calculate sample statistics
mt = hl.sample_qc(mt)

# Calculate statistics on sample statistics (the filter itself is sketched below)
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Determine sex from GT calls in sex chromosomes
t = hl.impute_sex(mt.GT)
# Only keep samples whose genetic sex matches their self-reported Sex
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate identity-by-descent matrix
mt_relatedness = hl.identity_by_descent(mt)
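# Section 3.1 above computes the distributions but stops short of the actual
# outlier removal. A minimal sketch of the corresponding filter, using the
# 6-SD window named in the comment and the stats structs already computed
# (within_6sd is a hypothetical helper, not part of the original script):
def within_6sd(metric, stats):
    # hl.agg.stats returns a struct with .mean and .stdev fields
    return ((metric >= stats.mean - 6 * stats.stdev)
            & (metric <= stats.mean + 6 * stats.stdev))

mt = mt.filter_cols(within_6sd(mt.sample_qc.n_singleton, stats_singleton)
                    & within_6sd(mt.sample_qc.r_ti_tv, stats_ti_tv)
                    & within_6sd(mt.sample_qc.r_het_hom_var, stats_het_hom_var)
                    & within_6sd(mt.sample_qc.n_het, stats_het))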
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(number_of_pcs):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png', 'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        # Pad the shorter list with 'null' so both columns have equal length
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend(
                'null' for _ in range(len(heterozygous_samples)
                                      - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend(
                'null' for _ in range(len(homozygous_alternate_samples)
                                      - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples': heterozygous_samples,
            'homozygous_alternate_samples': homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1],
    DP=mt_split.info.DP,
    AA=mt_split.info.AA,
    VT=(hl.case()
        .when((mt_split.alleles[0].length() == 1)
              & (mt_split.alleles[1].length() == 1), 'SNP')
        .when(mt_split.alleles[0].matches('<CN*>')
              | mt_split.alleles[1].matches('<CN*>'), 'SV')
        .default('INDEL')),
    EX_TARGET=mt_split.info.EX_TARGET,
    MULTI_ALLELIC=mt_split.info.MULTI_ALLELIC))

n_rows, n_cols = mt_split.count()
n_partitions = mt_split.n_partitions()

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)

mt_split = mt_split.annotate_globals(
    metadata=hl.struct(name='1000_Genomes_phase3_autosomes',
                       reference_genome='GRCh37',
                       n_rows=n_rows,
                       n_cols=n_cols,
                       n_partitions=n_partitions))

mt_split.write(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt',
    overwrite=True)
mt = hl.read_matrix_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt')
# 3. Split multi
print("3. Split multi")
mt_split = hl.split_multi_hts(mt_result, keep_star=False)

# 4. Annotate SNPs, indels
print('Annotating rows with snp and indel info')
mt = mt_split.annotate_rows(Variant_Type=hl.cond(
    (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
    hl.cond(
        hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                "INDEL", "Other"))))

# 4. Sample qc and variant qc
print("4. Sample qc and variant qc ")
mt_sampleqc = hl.sample_qc(mt, name='sample_QC_Hail')
mt2 = hl.variant_qc(mt_sampleqc, name='variant_QC_Hail')

# 5. Annotate COMMON AND RARE VARIANTS to apply separate filters
print("Annotate COMMON AND RARE VARIANTS to apply separate filters")
# mt_common = mt_filtered.filter_rows(mt_filtered.variant_qc.AF[1] > 0.05)
mt2 = mt2.annotate_rows(
    maf=hl.cond(mt2.variant_QC_Hail.AF[1] < 0.01, "< 1%",
                hl.cond(mt2.variant_QC_Hail.AF[1] < 0.05, "1%-5%", ">5%")))

# 6. Common variants filtering:
print("6. Common variants filtering:")
mt = mt2
mt_filtered_variants_common = mt.filter_rows(
    (mt.maf == "< 1%") |  # let all rare variants pass
    ((mt.maf != "< 1%") & ((mt.variant_QC_Hail.p_value_hwe > 10**-5) &
    locus=variants_to_filter.locus, alleles=variants_to_filter.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)

mt = hl.read_matrix_table(MT)
mt = mt.filter_rows(hl.is_defined(variants_to_filter[mt.row_key]))
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])

n = mt.count()
pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])

mt = hl.sample_qc(mt, name='qc_padded_ice')

TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'
# Import the target interval list.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS, reference_genome='GRCh38')
mt = mt.annotate_rows(
    not_in_target_intervals=~hl.is_defined(target_intervals[mt.locus]))
mt = mt.filter_rows(mt.not_in_target_intervals, keep=False)

n = mt.count()
pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])