def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in snp-chip data after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

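# The scripts above and below call `lgt_to_gt` to convert the sparse matrix
# table's local genotype (LGT) and local alleles (LA) fields into a global GT
# call. Its definition is not shown in this section; below is a minimal sketch
# of the assumed helper (recent Hail releases ship an equivalent as
# hl.vds.lgt_to_gt).


def lgt_to_gt(lgt, la):
    """Assumed helper: map each local allele index in the diploid call `lgt`
    through the local-alleles array `la` to recover global allele indices."""
    return hl.call(la[lgt[0]], la[lgt[1]])
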
def query(): """Query script entry point.""" hl.init(default_reference='GRCh38') tob_wgs = hl.read_matrix_table(TOB_WGS) hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT) # keep loci that are contained in the densified, filtered tob-wgs mt hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows()) # Entries and columns must be identical tob_wgs_select = tob_wgs.select_entries( GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols() hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols() # Join datasets hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select) # Add in metadata information hgdp_1kg_metadata = hgdp_1kg.cols() hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols( hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]) # save this for population-level PCAs mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt') if not hl.hadoop_exists(mt_path): hgdp1kg_tobwgs_joined.write(mt_path) # Perform PCA eigenvalues_path = output_path('eigenvalues.ht') scores_path = output_path('scores.ht') loadings_path = output_path('loadings.ht') eigenvalues, scores, loadings = hl.hwe_normalized_pca( hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20) hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path) scores.write(scores_path, overwrite=True) loadings.write(loadings_path, overwrite=True)
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols(
            (mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
            | (mt.s.contains('TOB'))
        )
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20
    )
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

    # get TOB-WGS allele frequencies
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    tob_wgs = tob_wgs.annotate_rows(
        gt_stats=hl.agg.call_stats(tob_wgs.GT, tob_wgs.alleles)
    )

    # Get gnomAD allele frequency of variants that aren't in TOB-WGS
    loadings_gnomad = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles'
    )
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    hgdp_1kg_row = hgdp_1kg.rows()[loadings_gnomad.locus, loadings_gnomad.alleles]
    tob_wgs_row = tob_wgs.rows()[loadings_gnomad.locus, loadings_gnomad.alleles]
    loadings_gnomad = loadings_gnomad.annotate(
        gnomad_AF=hgdp_1kg_row.gnomad_freq.AF,
        gnomad_popmax_AF=hgdp_1kg_row.gnomad_popmax.AF,
        TOB_WGS_AF=tob_wgs_row.gt_stats.AF,
    )
    population_af_metadata = hgdp_1kg.gnomad_freq_meta.collect()
    loadings_gnomad = loadings_gnomad.annotate_globals(
        gnomad_freq_meta=population_af_metadata
    )
    gnomad_variants = loadings_gnomad.drop('loadings')
    gnomad_variants_path = f'{output}/gnomad_annotated_variants.mt'
    gnomad_variants.write(gnomad_variants_path)

def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    )
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)

    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]
    )

    # choose variants based on gnomAD v3 QC parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(
            hgdp1kg_tobwgs_joined.GT, hgdp1kg_tobwgs_joined.variant_qc.AF[1]
        )
    )
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25)
    )

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')

    # downsample the remaining variants, then LD-prune them
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        NUM_ROWS_BEFORE_LD_PRUNE / nrows, seed=12345
    )
    pruned_variant_table = hl.ld_prune(
        hgdp1kg_tobwgs_joined.GT, r2=0.1, bp_window_size=500000
    )
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key])
    )

    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)

def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by('locus', 'alleles')

    # filter to loci that are contained in both tables and the loadings after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    hgdp_1kg = hgdp_1kg.filter_rows(
        hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles']))
        & hl.is_defined(
            tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles'])
        )
    )
    tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    )
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)

    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s]
    )

    mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt'
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
    hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path)

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20
    )
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as hail tables
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols(
            (mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
            | (mt.s.contains('TOB'))
        )
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings = hl.read_table(LOADINGS)
    loadings = loadings.annotate(af=mt.rows()[loadings.key].af)
    reprocessed_samples = hl.read_matrix_table(REPROCESSED_1KG)
    reprocessed_samples = hl.experimental.densify(reprocessed_samples)
    reprocessed_samples = reprocessed_samples.annotate_entries(
        GT=lgt_to_gt(reprocessed_samples.LGT, reprocessed_samples.LA)
    )
    ht = pc_project(reprocessed_samples.GT, loadings.loadings, loadings.af)
    ht = ht.key_by(s=ht.s + '_reprocessed')
    pcs = hl.read_table(SCORES)
    union_scores = ht.union(pcs)
    union_scores = union_scores.annotate(
        original=(union_scores.s == 'HG02238')
        | (union_scores.s == 'NA12248')
        | (union_scores.s == 'NA20502'),
        reprocessed=union_scores.s.contains('reprocessed'),
    )
    expr = (
        hl.case()
        .when(union_scores.original & ~union_scores.reprocessed, 'original')
        .when(~union_scores.original & union_scores.reprocessed, 'reprocessed')
        .default('unedited')
    )
    union_scores = union_scores.annotate(cohort_sample_codes=expr)

    # get percentage of variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # plot
    labels = union_scores.cohort_sample_codes
    sample_names = union_scores.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, 10):
        pc1 = i
        pc2 = i + 1
        plot_filename = f'{output}/reprocessed_sample_projection_pc{i + 1}.png'
        if not hl.hadoop_exists(plot_filename):
            plot = figure(
                title='Reprocessed Sample Projection',
                x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
                y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=union_scores.scores[pc1].collect(),
                    y=union_scores.scores[pc2].collect(),
                    label=labels.collect(),
                    samples=sample_names.collect(),
                )
            )
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=8,
                color=factor_cmap(
                    'label', Dark2[len(cohort_sample_codes)], cohort_sample_codes
                ),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(plot).save(f, format='PNG')
            plot_filename_html = f'reprocessed_sample_projection_pc{i + 1}.html'
            output_file(plot_filename_html)
            save(plot)
            subprocess.run(['gsutil', 'cp', plot_filename_html, output], check=False)

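# `pc_project`, used in the script above and the one below, is assumed to
# behave like the gnomAD sample-QC helper of the same name
# (gnomad.sample_qc.ancestry.pc_project): it projects new samples onto
# precomputed PCA loadings using the allele frequencies the loadings were
# derived from. A simplified sketch of the underlying maths, taking a matrix
# table and a loadings table directly instead of Hail expressions:


def pc_project_sketch(mt, loadings_ht):
    """Sketch: project samples in `mt` onto the PCs defined by `loadings_ht`,
    assumed keyed by variant with `loadings` (per-PC array) and `af` fields."""
    mt = mt.annotate_rows(
        pca_loadings=loadings_ht[mt.row_key].loadings,
        pca_af=loadings_ht[mt.row_key].af,
    )
    # keep variants present in the loadings with informative frequencies
    mt = mt.filter_rows(
        hl.is_defined(mt.pca_loadings)
        & hl.is_defined(mt.pca_af)
        & (mt.pca_af > 0)
        & (mt.pca_af < 1)
    )
    n_variants = mt.count_rows()
    # HWE-normalise each genotype, mirroring hl.hwe_normalized_pca
    gt_norm = (mt.GT.n_alt_alleles() - 2 * mt.pca_af) / hl.sqrt(
        n_variants * 2 * mt.pca_af * (1 - mt.pca_af)
    )
    # a sample's score on each PC is the sum over variants of loading * normalised GT
    mt = mt.annotate_cols(scores=hl.agg.array_sum(mt.pca_loadings * gt_norm))
    return mt.cols().select('scores')
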
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    snp_chip = snp_chip.semi_join_rows(tob_wgs.rows())
    snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp')
    snp_chip = snp_chip.checkpoint(snp_chip_path)

    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5
    )
    scores_path = output_path('scores.ht', 'tmp')
    loadings_path = output_path('loadings.ht', 'tmp')
    scores = scores.checkpoint(scores_path)
    loadings = loadings.checkpoint(loadings_path)

    # make tob_wgs rows equivalent to the snp_chip rows
    tob_wgs = tob_wgs.semi_join_rows(snp_chip.rows())
    tob_wgs_path = output_path('tob_wgs_filtered_by_snp_chip.mt', 'tmp')
    tob_wgs = tob_wgs.checkpoint(tob_wgs_path)
    snp_chip = snp_chip.annotate_rows(
        af=hl.agg.mean(snp_chip.GT.n_alt_alleles()) / 2
    )
    loadings = loadings.annotate(af=snp_chip.rows()[loadings.key].af)

    # project WGS samples onto the SNP-chip PCs
    ht = pc_project(tob_wgs.GT, loadings.loadings, loadings.af)
    ht_path = output_path('pc_project_tob_wgs.ht', 'tmp')
    ht = ht.checkpoint(ht_path)
    scores = scores.key_by(s=scores.s + '_snp_chip')
    union_scores = ht.union(scores)
    variance = [(x / sum(eigenvalues) * 100) for x in eigenvalues]
    variance = [round(x, 2) for x in variance]

    # Get partner sample information
    sample_names = union_scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'
        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''
        return prefix + tech

    # plot
    labels = list(map(sample_type, sample_names))
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # Get number of PCs
    number_of_pcs = len(eigenvalues)
    for i in range(0, number_of_pcs - 1):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='TOB-WGS + TOB SNP Chip',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=union_scores.scores[pc1].collect(),
                y=union_scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            )
        )
        source_filename = output_path(f'source_{pc2}.ht', 'tmp')
        hl.Table.from_pandas(pd.DataFrame(source.data)).export(source_filename)
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap(
                'label', Dark2[len(cohort_sample_codes)], cohort_sample_codes
            ),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)