def main(args):
    """Infer each sample's sequencing platform from per-interval call rates.

    Steps: read the input MatrixTable and the capture-interval Table, build
    an interval x sample call-rate MatrixTable, run the platform PCA, keep
    only the leading principal components needed to reach 99% of the
    variance, cluster the samples on those components, then show and write
    the resulting Table (optionally exporting it as a bgzipped TSV too).

    :param args: parsed command-line namespace. Attributes read here:
        ``default_ref_genome``, ``mt_input_path``, ``ht_intervals``,
        ``binarization_threshold``, ``hdbscan_min_cluster_size``,
        ``hdbscan_min_samples`` (optional; falls back to
        ``hdbscan_min_cluster_size`` when absent or ``None``),
        ``ht_output_path``, ``overwrite`` and ``write_to_file``.
    :return: None. Results are written to ``args.ht_output_path``.
    """
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    try:
        # input MT
        mt = hl.read_matrix_table(args.mt_input_path)

        # NOTE: high-quality genotype filtering is currently disabled.
        # mt = filter_genotypes_ab(mt)

        # import capture interval table (intersect)
        intervals = hl.read_table(args.ht_intervals)

        # generate an interval x sample MT by computing per intervals callrate
        mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

        # run pca
        eigenvalues, ht_pca, _ = run_platform_pca(
            callrate_mt=mt_callrate,
            binarization_threshold=args.binarization_threshold)

        # normalize eigenvalues (0-100); the total is computed once rather
        # than once per element
        total_variance = sum(eigenvalues)
        eigenvalues_norm = [x / total_variance * 100 for x in eigenvalues]

        # Cumulative explained variance, seeded with 0 so that entry k is the
        # variance explained by the first k PCs. The eigenvalues are a small
        # local list, so this is done in plain Python instead of building and
        # evaluating a Hail expression.
        ev_cumsum = [0.0]
        for explained in eigenvalues_norm:
            ev_cumsum.append(ev_cumsum[-1] + explained)

        # Counting the entries below 99 (the leading 0 included) gives the
        # smallest number of leading PCs whose cumulative explained variance
        # reaches 99% (assuming non-negative eigenvalues).
        n_optimal_pcs = sum(1 for cumulative in ev_cumsum if cumulative < 99.0)
        logger.info(
            f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
        )

        # filter out uninformative PCs
        ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

        # The original passed the cluster size as min_samples as well. Use a
        # dedicated ``hdbscan_min_samples`` argument when the namespace has
        # one, otherwise keep the previous behaviour.
        # NOTE(review): confirm the argument parser defines
        # --hdbscan_min_samples.
        min_samples = getattr(args, 'hdbscan_min_samples', None)
        if min_samples is None:
            min_samples = args.hdbscan_min_cluster_size

        # apply unsupervised clustering on PCs to infer samples platform
        ht_platform = assign_platform_from_pcs(
            platform_pca_scores_ht=ht_pca,
            pc_scores_ann='scores',
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=min_samples)

        ht_platform.show()

        # write HT
        ht_platform.write(output=args.ht_output_path,
                          overwrite=args.overwrite)

        # export to file if true
        if args.write_to_file:
            ht_platform.export(f'{args.ht_output_path}.tsv.bgz')
    finally:
        # Always shut the Hail context down, even when a step above fails.
        hl.stop()
def gnomad_coverage_stats_optimized(mt_path):
    """Compute per-row coverage statistics from a capped depth histogram.

    For every row of the MatrixTable at ``mt_path`` this annotates:

    * ``mean`` - mean of the entry field ``x``;
    * ``count_array`` - histogram of ``x`` with one bucket per value
      0..100, where the last bucket holds every entry with ``x >= 100``;
    * ``median`` - the smallest depth whose cumulative count exceeds half of
      the counted entries (so it is capped at 100, and missing when the row
      has no defined entries);
    * ``above_N`` for N in 1, 5, 10, 15, 20, 25, 30, 50, 100 - number of
      entries with ``x >= N``.

    The row table is then force-counted so the annotations are actually
    evaluated; nothing is returned or written.

    :param mt_path: path of the MatrixTable to read. It is expected to have
        an integer entry field ``x`` (presumably read depth - confirm
        against the dataset).
    :return: None.
    """
    # Depths are capped at this value, so the histogram needs buckets
    # 0..max_depth inclusive. The original built only 0..99, which dropped
    # the capped bucket and made ``above_100`` always 0.
    max_depth = 100
    thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]

    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_rows(
        mean=hl.agg.mean(mt.x),
        count_array=hl.rbind(
            # hl.min skips missing arguments by default, which would turn a
            # missing x into max_depth; count defined entries only.
            hl.agg.filter(
                hl.is_defined(mt.x),
                hl.agg.counter(hl.min(max_depth, mt.x))),
            lambda counts: hl.range(0, max_depth + 1).map(
                lambda depth: counts.get(depth, 0))))

    mt = mt.annotate_rows(
        median=hl.rbind(
            # The scan keeps its initial 0, so cumulative[k] is the number
            # of entries with depth < k and the last element is the total.
            hl.array_scan(lambda acc, n: acc + n, hl.int64(0),
                          mt.count_array),
            lambda cumulative: hl.find(
                # Return the depth itself, not the cumulative count at
                # which the halfway point is crossed.
                lambda depth: (cumulative[depth + 1]
                               > cumulative[max_depth + 1] / 2),
                hl.range(0, max_depth + 1))),
        **{
            f'above_{threshold}': hl.sum(mt.count_array[threshold:])
            for threshold in thresholds
        })

    # Force evaluation of the row annotations; the count itself is discarded.
    mt.rows()._force_count()