예제 #1
0
def main(args):
    """Infer a platform label per sample from per-interval call rates.

    Steps: read the input MatrixTable and the capture-interval Table, build
    the interval x sample call-rate MatrixTable, run the platform PCA, keep
    the leading principal components selected by the 99% cumulative-variance
    cut-off, cluster the samples on those components, then show, write and
    (optionally) export the resulting Table.

    :param args: Parsed command-line namespace. Attributes read here:
        ``default_ref_genome``, ``mt_input_path``, ``ht_intervals``,
        ``binarization_threshold``, ``hdbscan_min_cluster_size``,
        ``ht_output_path``, ``overwrite``, ``write_to_file`` and, when
        present and not None, ``hdbscan_min_samples``.
    """
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # Always release the Hail context, even when a step below fails, so a
    # caller can initialise Hail again afterwards.
    try:
        # input MT
        mt = hl.read_matrix_table(args.mt_input_path)

        # NOTE: genotype-quality filtering (filter_genotypes_ab) is
        # currently disabled.

        # import capture interval table (intersect)
        intervals = hl.read_table(args.ht_intervals)

        # generate an interval x sample MT by computing per-interval callrate
        mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

        # run pca
        eigenvalues, ht_pca, _ = run_platform_pca(
            callrate_mt=mt_callrate,
            binarization_threshold=args.binarization_threshold)

        # normalize eigenvalues (0-100); the total is computed once instead
        # of once per eigenvalue
        total_variance = sum(eigenvalues)
        eigenvalues_norm = [x / total_variance * 100 for x in eigenvalues]

        # compute eigenvalues cumulative sum; the seed is a float so that it
        # has the same type as the array elements
        ev_cumsum = hl.array_scan(lambda i, j: i + j, 0.0,
                                  hl.array(eigenvalues_norm))

        # getting optimal number of PCs (those which explain 99% of the
        # variance). The scan output starts with the seed value, so the
        # number of entries below 99 equals the number of leading PCs needed
        # for the running total to reach 99%.
        n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))

        logger.info(
            f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
        )

        # filter out uninformative PCs
        ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

        # Use a dedicated min_samples option when the namespace provides one;
        # otherwise fall back to the cluster size, which is what was passed
        # before.
        hdbscan_min_samples = getattr(args, 'hdbscan_min_samples', None)
        if hdbscan_min_samples is None:
            hdbscan_min_samples = args.hdbscan_min_cluster_size

        # apply unsupervised clustering on PCs to infer samples platform
        ht_platform = assign_platform_from_pcs(
            platform_pca_scores_ht=ht_pca,
            pc_scores_ann='scores',
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=hdbscan_min_samples)

        ht_platform.show()

        # write HT
        ht_platform.write(output=args.ht_output_path,
                          overwrite=args.overwrite)

        # export to file if true
        if args.write_to_file:
            ht_platform.export(f'{args.ht_output_path}.tsv.bgz')
    finally:
        hl.stop()
예제 #2
0
def gnomad_coverage_stats_optimized(mt_path):
    """Annotate per-row summaries of field ``x`` and force their evaluation.

    Reads the MatrixTable at ``mt_path`` and adds these row fields:

    * ``mean`` -- aggregated mean of ``x``.
    * ``count_array`` -- 100 counts, one per value 0..99, looked up in a
      counter of ``min(100, x)`` (missing values count as 0).
    * ``median`` -- the first running total of ``count_array`` that exceeds
      half of its sum.
    * ``above_<t>`` -- sum of ``count_array[t:]`` for each listed threshold.

    Nothing is returned; the rows are only counted at the end.

    :param mt_path: Path of the MatrixTable to read.
    """
    thresholds = [1, 5, 10, 15, 20, 25, 30, 50, 100]

    mt = hl.read_matrix_table(mt_path)

    # Histogram of x capped at 100, expanded into a dense array.
    # NOTE(review): the array covers indices 0..99 only, so the bucket for
    # values capped to 100 is not part of it -- confirm this is intended.
    capped_counter = hl.agg.counter(hl.min(100, mt.x))
    dense_counts = hl.rbind(
        capped_counter,
        lambda c: hl.range(0, 100).map(lambda i: c.get(i, 0)))
    mt = mt.annotate_rows(mean=hl.agg.mean(mt.x), count_array=dense_counts)

    histogram = mt.count_array

    # NOTE(review): hl.find yields an element of the running-total array,
    # not its index -- confirm that this is the value wanted for `median`.
    running_totals = hl.array_scan(lambda i, j: i + j, 0, histogram)
    median_expr = hl.rbind(
        hl.sum(histogram) / 2,
        lambda s: hl.find(lambda x: x > s, running_totals))

    # NOTE(review): with a 100-element array the `[100:]` slice looks empty.
    above_exprs = {}
    for t in thresholds:
        above_exprs[f'above_{t}'] = hl.sum(histogram[t:])

    mt = mt.annotate_rows(median=median_expr, **above_exprs)

    # Count the rows so that the annotations above are actually computed.
    mt.rows()._force_count()