def cluster_PETs(args):
    """Read PET (paired-end tag) files, iteratively merge overlapping PETs
    per chromosome into clusters, and save the clusters to a TSV file.

    Expected attributes on ``args``: ``pets_filename`` (iterable of paths),
    ``nrows``, ``extension``, ``self_ligation``, ``pet_cutoff``,
    ``peaks_filename`` (optional), ``cluster_cutoff``, ``clusters_filename``.

    Returns the resulting clusters as a pandas DataFrame.
    """
    # reading the file(s); all inputs are concatenated into one frame
    logging.info(f"Reading PETs from {args.pets_filename} ...")
    columns = ["chrom1", "start1", "end1", "chrom2", "start2", "end2", "cnt"]
    pets = pd.concat([pd.read_csv(f, sep="\t", header=None, names=columns,
                                  low_memory=False, nrows=args.nrows)
                      for f in args.pets_filename])
    logging.info(f"Read {len(pets):,} PETs.")

    # pre-process
    logging.info(f"Preprocessing (Extension: {args.extension}bp, Self-ligation genomic span: {args.self_ligation}bp, "
                 f"PET cutoff: {args.pet_cutoff}) ...")

    # check the data integrity: drop inter-chromosomal and misordered PETs
    invalid = (pets.chrom1 != pets.chrom2) | (pets.start1 > pets.end1) | (pets.start2 > pets.end2)
    invalid_pets = pets[invalid]
    if len(invalid_pets) > 0:
        logging.info(f"{len(invalid_pets)} inter-chromosomal or misordered PETs are ignored:")
        logging.info(invalid_pets.head())
    pets = pets[~invalid]

    # keep only non-self-ligating PETs (anchor gap must be at least the span)
    pets = pets[pets.start2 - pets.end1 >= args.self_ligation]

    # keep PETs with count >= PET cutoff
    pets = pets[pets.cnt >= args.pet_cutoff]

    # extend both anchors by args.extension on each side, clipping starts at 0
    pets.start1 = (pets.start1 - args.extension).clip(0)
    pets.end1 = pets.end1 + args.extension
    pets.start2 = (pets.start2 - args.extension).clip(0)
    pets.end2 = pets.end2 + args.extension

    # optionally keep only PETs whose anchors both intersect a peak
    peaks = None
    if args.peaks_filename:
        peaks = pd.read_csv(args.peaks_filename, sep="\t", header=None, usecols=[0, 1, 2, 6],
                            names=["Chromosome", "Start", "End", "Score"])
        peaks = PyRanges(peaks)
        # filter on anchor 1, then on anchor 2
        pets["Chromosome"], pets["Start"], pets["End"] = pets.chrom1, pets.start1, pets.end1
        pets = PyRanges(pets).intersect(peaks).df
        pets["Chromosome"], pets["Start"], pets["End"] = pets.chrom2, pets.start2, pets.end2
        pets = PyRanges(pets).intersect(peaks).df
        peaks = peaks.df
        peaks["Center"] = peaks["Start"] + (peaks["End"] - peaks["Start"]) // 2

    logging.info(f"Done. {len(pets):,} PETs left.")

    # split the columns into per-chromosome numpy arrays for the numba kernel
    chroms = pets.groupby(["chrom1"]).size().to_dict()

    start1s = numba.typed.List([pets[pets.chrom1 == chrom].start1.to_numpy() for chrom in chroms.keys()])
    end1s = numba.typed.List([pets[pets.chrom1 == chrom].end1.to_numpy() for chrom in chroms.keys()])
    start2s = numba.typed.List([pets[pets.chrom1 == chrom].start2.to_numpy() for chrom in chroms.keys()])
    end2s = numba.typed.List([pets[pets.chrom1 == chrom].end2.to_numpy() for chrom in chroms.keys()])
    cnts = numba.typed.List([pets[pets.chrom1 == chrom].cnt.to_numpy() for chrom in chroms.keys()])

    # Defined once (outside the while loop) so numba compiles it only once.
    @numba.jit(nopython=True, parallel=True)
    def cluster(sizes, old_changes, orders, start1s, end1s, start2s, end2s, cnts):
        """Merge overlapping PETs in place; return per-chromosome merge counts.

        A PET absorbed into another gets cnt == 0 and is skipped afterwards.
        """
        # FIX: was len(sizes)-1, which skipped the last chromosome entirely
        # and returned an array shorter than the old_changes it replaces.
        changes = np.zeros(shape=(len(sizes),), dtype=np.uint64)

        for idx in numba.prange(len(sizes)):
            if old_changes[idx] > 0:  # chromosome converged -> nothing to do
                order, start1, end1, start2, end2, cnt =\
                    orders[idx], start1s[idx], end1s[idx], start2s[idx], end2s[idx], cnts[idx]
                for _i in range(sizes[idx]):
                    i = order[_i]
                    if cnt[i] == 0:
                        continue
                    _j = _i + 1
                    while _j < sizes[idx]:
                        j = order[_j]
                        # sorted by start1: once start1[j] passes end1[i],
                        # no later candidate can overlap anchor 1
                        if start1[j] > end1[i]:
                            break
                        if cnt[j] == 0:
                            _j += 1
                            continue
                        # merge j into i when both anchors overlap
                        if ((start1[i] <= start1[j] and start1[j] <= end1[i]) or (start1[i] <= end1[j] and end1[j] <= end1[i])) and\
                           ((start2[i] <= start2[j] and start2[j] <= end2[i]) or (start2[i] <= end2[j] and end2[j] <= end2[i])):
                            start1[i] = min(start1[i], start1[j])
                            end1[i] = max(end1[i], end1[j])
                            start2[i] = min(start2[i], start2[j])
                            end2[i] = max(end2[i], end2[j])
                            cnt[i] += cnt[j]
                            cnt[j] = 0
                            changes[idx] += 1
                        _j += 1
        return changes

    # iterate sort + merge until no chromosome changes any more
    step = 0
    changes = np.ones(shape=(len(chroms),))
    sizes = numba.typed.List(chroms.values())
    while np.sum(changes) > 0:
        logging.info(f"Sorting (step: #{step+1}, PETs: {len(pets):,}) ...")
        orders = numba.typed.List([np.lexsort((end2s[i], start2s[i], end1s[i], start1s[i]))
                                   for i in range(len(chroms))])
        logging.info("Done.")

        logging.info(f"Clustering (step: #{step+1}, PETs: {len(pets):,}) ...")
        changes = cluster(sizes, changes, orders, start1s, end1s, start2s, end2s, cnts)
        logging.info(f"Done. Changes: {int(sum(changes)):,}")
        step += 1

    # save to file; absorbed PETs (cnt == 0) fall below the cluster cutoff
    logging.info(f"Saving to {args.clusters_filename} (cluster cutoff: {args.cluster_cutoff})... ")
    pets = pd.DataFrame()
    for i, (chrom, size) in enumerate(chroms.items()):
        pets = pd.concat([
            pets, pd.DataFrame(data={"chrom1": itertools.repeat(chrom, size),
                                     "start1": start1s[i][orders[i]],
                                     "end1": end1s[i][orders[i]],
                                     "chrom2": itertools.repeat(chrom, size),
                                     "start2": start2s[i][orders[i]],
                                     "end2": end2s[i][orders[i]],
                                     "cnt": cnts[i][orders[i]]})])
    pets = pets[pets.cnt >= args.cluster_cutoff]
    if peaks is not None:
        # annotate each anchor with the center of the best-scoring overlapping
        # peak (chrom1 == chrom2 after the integrity filter above)
        pets["Center1"] = pets.apply(lambda row: peaks.iloc[
                            peaks[(peaks.Chromosome == row.chrom1) &
                                  (peaks.Start <= row.end1) & (row.start1 <= peaks.End)]
                                 ["Score"].idxmax()]["Center"], axis=1)
        pets["Center2"] = pets.apply(lambda row: peaks.iloc[
                            peaks[(peaks.Chromosome == row.chrom1) &
                                  (peaks.Start <= row.end2) & (row.start2 <= peaks.End)]
                                 ["Score"].idxmax()]["Center"], axis=1)
    pets.to_csv(args.clusters_filename, sep="\t", index=False, header=False)
    logging.info(f"Done. Saved {len(pets):,} clusters.")

    return pets
# Example #2 (paste artifact — "예제 #2" / "0" — converted to a comment so the file parses)
def compute_peaks_and_zscores(cvg, center, left, right, chip, background_sum,
                              ratios, ratio, args):
    """Post-process candidate peaks with z-score based p-values and an
    enrichment filter, returning a dict of peak collections keyed by type.

    Delegates peak calling to ``_compute_peaks_and_zscores`` and then, per
    peak type (1-based): converts max z-scores to negative log10 p-values
    (NLP), computes peak center locations, filters peaks by an enrichment
    threshold, and attaches coverage columns (CVG, SURL, SURR).

    NOTE(review): cvg/left/right/ratio/background_sum appear to be PyRles-like
    strand-keyed coverage objects and ``peaks`` PyRanges-like — confirm
    against callers; their exact semantics are not visible here.
    """

    print("peaks and zscores")
    # project helper: returns per-type peak collections and their z-scores
    all_peaks, zs = _compute_peaks_and_zscores(cvg, center, left, right, chip,
                                               background_sum, ratios, ratio,
                                               args)
    print("peaks and zscores done")

    # minimum enrichment quantile (fraction in [0, 1]); thresholds are
    # derived from peak_type 1 and reused for the later types (see below)
    min_er = args["min_enrichment"]

    peaks_with_info = {}
    # peak_type is 1-based; zs is indexed 0-based, hence zs[peak_type - 1]
    for peak_type, peaks in enumerate(all_peaks, 1):

        # print("find max start")
        # print(list(len(v) for v in zs[peak_type - 1].values()))
        # print(peaks)
        # print(zs[peak_type - 1].values())
        # t1 = list(zs[peak_type - 1].values())[0]
        # print(t1)
        # print(max(t1[1]))
        # per key (presumably chromosome/strand — verify), take the max
        # z-score of each peak; v2[1] looks like the per-peak score array
        max_zs = {}
        for k, v in zs[peak_type - 1].items():
            max_zs[k] = np.array([max(v2[1]) for v2 in v])

        # max_zs = np.array(max_zs)
        # print("find max end")

        # print("len max_zs:", sum(len(v) for v in max_zs.values()))

        # convert z-scores to -log10 p-values; pnorm presumably returns a
        # natural-log (survival) probability — confirm its definition
        result = {k: -(pnorm(v) / np.log(10)) for k, v in max_zs.items()}
        # print(len(peaks))
        # print(len(np.concatenate([result[k] for k in natsorted(result)])))
        # natsorted keeps key order consistent with the peaks' row order
        peaks.NLP = np.around(
            np.concatenate([result[k] for k in natsorted(result)]), 3)

        # peak center (midpoint, rounded up)
        # NOTE(review): np.long was removed in NumPy 1.24 — this requires an
        # older NumPy; modern equivalent would be np.int_
        peaks.Location = np.array(np.ceil((peaks.Start + peaks.End) / 2),
                                  dtype=np.long)

        peaks.Type = peak_type

        # 1-bp intervals at each peak center, used to sample coverage
        peaks_loc = PyRanges(seqnames=peaks.Chromosome,
                             starts=peaks.Location,
                             ends=peaks.Location + 1,
                             strands=peaks.Strand)
        loc_cvg = peaks_loc.coverage()

        # restrict the signals to the peak-center positions
        chip_cvg = loc_cvg * cvg
        bg_cvg = loc_cvg * background_sum

        # per-strand enrichment, +1 to avoid division by zero below
        peak_enrich_cvg_f = 1 + (ratio["+"] * chip_cvg["+"])
        peak_enrich_cvg_r = 1 + (ratio["-"] * chip_cvg["-"])
        # NOTE(review): ``.items() + .items()`` fails on plain Py3 dicts —
        # this assumes PyRles.items() returns lists; verify
        peak_enrich_cvg = PyRles({
            k: v
            for k, v in list(peak_enrich_cvg_r.items() +
                             peak_enrich_cvg_f.items())
        })

        peak_enrich_ref = 1 + (bg_cvg)
        peak_enrich = peak_enrich_cvg / peak_enrich_ref

        # collect enrichment values per strand, dropping NaN/inf
        vals_f = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["+"].keys()])
        vals_r = np.concatenate(
            [peak_enrich[k].values for k in peak_enrich["-"].keys()])
        vals_f = vals_f[np.isfinite(vals_f)]
        vals_r = vals_r[np.isfinite(vals_r)]

        # print(len(vals_f))
        vals_f = vals_f[vals_f > 1]
        vals_r = vals_r[vals_r > 1]

        # thresholds are fixed from the first peak type and reused for
        # peak types 2 and 3 (relies on iteration order starting at 1)
        if peak_type == 1:
            min_er_f = np.percentile(vals_f, min_er * 100)
            min_er_r = np.percentile(vals_r, min_er * 100)

        # boolean masks: which peaks pass the enrichment threshold
        vals_f = vals_f > min_er_f
        vals_r = vals_r > min_er_r

        # print(np.sum(vals_f))
        # print(len(vals_f))
        # print(peaks["+"])

        peaks["+"].Enrichment = vals_f
        peaks["-"].Enrichment = vals_r

        peaks_loc["+"].Enrichment = vals_f
        peaks_loc["-"].Enrichment = vals_r

        # keep only enriched peaks, then drop the temporary mask column
        peaks = peaks.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc = peaks_loc.apply(
            lambda df, _: df[df.Enrichment].drop("Enrichment", axis=1))
        peaks_loc.Start += 1
        peaks_loc.End += 1

        # sample raw coverage at the surviving peak centers
        # (np.long: same NumPy-version caveat as above)
        chip_cvg = np.array(np.concatenate([
            cvg[k][peaks[k].Location] for k in cvg.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.long)
        left_cvg = np.array(np.concatenate([
            left[k][peaks[k].Location] for k in left.keys()
            if not peaks[k].empty()
        ]),
                            dtype=np.long)
        right_cvg = np.array(np.concatenate([
            right[k][peaks[k].Location] for k in right.keys()
            if not peaks[k].empty()
        ]),
                             dtype=np.long)

        peaks.CVG = chip_cvg
        peaks.SURL = left_cvg
        peaks.SURR = right_cvg

        peaks.drop_empty()

        peaks_with_info[peak_type] = peaks

    return peaks_with_info