Пример #1
0
def getAnchorPETs(jdf, loops, pre, cut=0):
    """
    Select the PETs that have at least one end inside a loop anchor and dump
    them as a new .jd file.

    @param jdf: str, file of .jd
    @param loops: dict, 'chr8-chr8-605': ['chr8', 61242502, 61242734, 'chr8', 61244107, 61244150]
    @param pre: str, output directory; the result is written to
                <pre>/<key joined with '-'>.jd
    @param cut: forwarded unchanged to parseJd
    @return: tuple, (number of loops, number of anchors, number of raw PETs,
             number of PETs kept)
    """
    anchors = getAnchors(loops)
    key, mat = parseJd(jdf, cut)
    logger.info("%s:%s & %s loops,merged %s anchors" %
                (key, jdf, len(loops), len(anchors)))
    # One (coordinates, coordinate -> row indices) pair per PET end, left end
    # first. The coordinates are fed to np.searchsorted, so they are
    # presumably returned sorted by getCorLink -- verify against that helper.
    ends = (getCorLink(mat[:, 1]), getCorLink(mat[:, 2]))
    hits = set()
    for anchor in anchors:
        start, end = anchor[0], anchor[1]
        for coords, links in ends:
            lo = np.searchsorted(coords, start, side="left")
            hi = np.searchsorted(coords, end, side="right")
            # every coordinate within [start, end] contributes its rows
            for i in range(lo, hi):
                hits.update(links[coords[i]])
    nmat = mat[list(hits), ]
    target = os.path.join(pre, "-".join(key) + ".jd")
    joblib.dump(nmat, target)
    n_raw, n_kept = mat.shape[0], nmat.shape[0]
    logger.info("%s:%s raw PETs %s PETs in anchors" % (key, n_raw, n_kept))
    return len(loops), len(anchors), n_raw, n_kept
Пример #2
0
def singleDBSCAN(f, eps, minPts, cut=0):
    """
    Run DBSCAN to detect interactions for one chromosome.
    #mat is list, every is [ pointId,x,y ]

    @param f: .jd file, loaded through parseJd
    @param eps: eps value handed to DBSCAN
    @param minPts: minPts value handed to DBSCAN
    @param cut: when > 0, PETs with distance (y - x) < cut are removed before
                clustering and their distances are put into dss
    @return: tuple (key, f, dataI, dataS, dis, dss)
             dataI: [chrom, minX, maxX, chrom, minY, maxY] for clusters whose
                    maxX < minY (reported as inter-ligation)
             dataS: same records for the remaining clusters (reported as
                    self-ligation)
             dis / dss: lists of y - x for the reads of dataI / dataS, dss
                    also holding the distances filtered out by cut
    """
    dataI, readI, dataS, readS, dis, dss = [], [], [], [], [], []
    # The file is loaded without cutoff; the filtering is done here so that
    # the short distances can be recorded.
    key, mat = parseJd(f, cut=0)
    if cut > 0:
        d = mat[:, 2] - mat[:, 1]
        dss.extend(list(d[d < cut]))
        mat = mat[np.where(d >= cut)[0], :]
    if len(mat) == 0:
        return key, f, dataI, dataS, list(dis), list(dss)
    logger.info(
        "Clustering %s and %s using eps as %s, minPts as %s,pre-set distance cutoff as > %s"
        % (key[0], key[1], eps, minPts, cut))
    db = DBSCAN(mat, eps, minPts)
    labels = pd.Series(db.labels)
    # re-index the coordinates by point id so clusters can be looked up with
    # the index of labels
    mat = np.array(mat)
    mat = pd.DataFrame(mat[:, 1:].astype("float"),
                       index=mat[:, 0],
                       columns=["X", "Y"])
    #collect clusters
    for label in set(labels.values):
        members = list(labels[labels == label].index)
        sub = mat.loc[members, :]
        x_lo, x_hi = int(np.min(sub["X"])), int(np.max(sub["X"]))
        y_lo, y_hi = int(np.min(sub["Y"])), int(np.max(sub["Y"]))
        #2017-05-18, interactions with exactly the same start and end are removed
        if x_lo == x_hi or y_lo == y_hi:
            continue
        record = [key[0], x_lo, x_hi, key[1], y_lo, y_hi]
        # anchors that do not overlap are inter-ligation, the rest self-ligation
        records, reads = (dataI, readI) if x_hi < y_lo else (dataS, readS)
        records.append(record)
        reads.extend(members)
    logger.info(
        "Clustering %s and %s finished. Estimated %s self-ligation reads and %s inter-ligation reads"
        % (key[0], key[1], len(readS), len(readI)))
    if dataI:
        dis = mat.loc[readI, "Y"] - mat.loc[readI, "X"]
    if dataS:
        dss.extend(list(mat.loc[readS, "Y"] - mat.loc[readS, "X"]))
    return key, f, dataI, dataS, list(dis), list(dss)
Пример #3
0
def getGenomeCoverage(f, cut=0):
    """
    Build the genomic model for random access. Could use a lot of memory.

    The model is made of the getCorLink result for each PET end: column 1
    first, column 2 second.

    @param f:.jd file
    @param cut: distance cutoff for self-ligation PETs, forwarded to parseJd.
    @return: ([[left keys, left links], [right keys, right links]], number of
             PETs), or (None, 0) when fewer than 2 PETs were loaded
    """
    _, pets = parseJd(f, cut)
    n_pets = pets.shape[0]
    if n_pets < 2:
        return None, 0
    left_keys, left_links = getCorLink(pets[:, 1])
    right_keys, right_links = getCorLink(pets[:, 2])
    model = [[left_keys, left_links], [right_keys, right_links]]
    return model, n_pets
Пример #4
0
def getGenomeCoverage(f, cut=0):
    """
    Build the genomic model for random access. Could use a lot of memory.

    The model is a list indexed by genomic coordinate. A slot stays False
    while no PET end falls on that coordinate; otherwise it holds a list with
    the id (column 0) of every PET whose left end (column 1) is there and the
    negated id of every PET whose right end (column 2) is there.

    @param f:.jd file
    @param cut: distance cutoff for self-ligation PETs, forwarded to parseJd.
    @return: (model, number of PET ends), or (None, 0) when no PET was loaded
    """
    key, mat = parseJd(f, cut)
    j = mat.shape[0]
    if j == 0:
        return None, 0
    # Plain int, so that the list repetition below is an ordinary sequence
    # repeat and does not depend on how NumPy scalars handle `list * scalar`.
    m = int(max(np.max(mat[:, 1]), np.max(mat[:, 2])))
    # 1000000 spare slots past the largest coordinate, just in case of
    # boundary escape
    model = [False] * (m + 1000000)
    for t in mat:
        pid, left, right = t[0], t[1], t[2]
        # Identity test: a slot is either the False placeholder or a list, and
        # comparing a list with `== False` only worked by accident.
        if model[left] is False:
            model[left] = []
        if model[right] is False:
            model[right] = []
        model[left].append(pid)
        # the negative sign marks a right end
        model[right].append(0 - pid)
    return model, j * 2