Exemplo n.º 1
0
def _cluster(self, strand=False, maxdist=0, minnb=1):
    """Cluster the intervals in ``self.df`` per chromosome (and strand).

    The rows of ``self.df`` are grouped by ``Chromosome`` (and by ``Strand``
    as well when ``strand`` is truthy), each group is sorted by ``Start`` and
    its ``Start``/``End`` arrays are handed to ``find_clusters``.
    NOTE(review): ``find_clusters`` is defined elsewhere; it is assumed to
    return the start and end arrays of the resulting clusters - confirm.

    Args:
        strand: When truthy, cluster each (chromosome, strand) pair separately
            and include a ``Strand`` column in the result.
        maxdist: Accepted for interface compatibility; not used here.
        minnb: Accepted for interface compatibility; not used here.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Chromosome``, ``Start``,
        ``End`` (plus ``Strand`` when ``strand`` is truthy) and a fresh
        0..n-1 index. An empty frame with those columns is returned when
        ``self.df`` contains no groups.
    """
    columns = ["Chromosome", "Start", "End"]
    if strand:
        columns.append("Strand")
        group_key = ["Chromosome", "Strand"]
    else:
        # Group on the scalar column name, not a one-element list: with a
        # list of length one, pandas >= 2.0 yields 1-tuples as group labels,
        # which would end up in the Chromosome column instead of the name.
        group_key = "Chromosome"

    dfs = []
    for key, cdf in self.df.groupby(group_key):
        cdf = cdf.sort_values("Start")
        starts, ends = find_clusters(cdf.Start.values, cdf.End.values)

        data = {
            "Chromosome": key[0] if strand else key,
            "Start": starts,
            "End": ends,
        }
        if strand:
            data["Strand"] = key[1]
        dfs.append(pd.DataFrame(data))

    if not dfs:
        # pd.concat refuses an empty list; hand back an empty, well-formed
        # result instead of raising.
        return pd.DataFrame(columns=columns)

    return pd.concat(dfs, ignore_index=True)[columns]
Exemplo n.º 2
0
def _merge(df, kwargs):
    """Cluster the intervals of ``df`` into one frame of merged intervals.

    ``df`` is sorted by ``Start`` and its ``Start``/``End`` arrays are passed
    to ``find_clusters``; the returned starts and ends become the rows of the
    result.
    NOTE(review): ``find_clusters`` is defined elsewhere; its exact merging
    rule is not visible from here.

    Args:
        df: Frame with at least ``Start`` and ``End`` columns.
        kwargs: Plain dict of options. ``kwargs["chromosome"]`` is required;
            ``kwargs.get("strand")`` is optional and, when truthy, adds a
            ``Strand`` column holding that value.

    Returns:
        ``None`` when ``df`` is empty, otherwise a ``pandas.DataFrame`` with
        categorical ``Chromosome`` (and optional categorical ``Strand``)
        columns around ``Start`` and ``End``.
    """
    if df.empty:
        return None

    chromosome = kwargs["chromosome"]
    strand = kwargs.get("strand", None)

    ordered = df.sort_values("Start")
    starts, ends = find_clusters(ordered.Start.values, ordered.End.values)

    # One shared index so the scalar chromosome/strand values are broadcast
    # to exactly one entry per cluster.
    row_index = pd.Index(range(len(starts)))

    data = {
        "Chromosome": pd.Series(chromosome, dtype="category", index=row_index),
        "Start": starts,
        "End": ends,
    }
    if strand:
        data["Strand"] = pd.Series(strand, dtype="category", index=row_index)

    return pd.DataFrame(data)
Exemplo n.º 3
0
def _set_union(scdf, ocdf, **kwargs):
    """Cluster the combined intervals of ``scdf`` and ``ocdf``.

    Args:
        scdf: Frame with ``Start``/``End`` columns (plus ``Chromosome`` and,
            for same-strand unions, ``Strand``).
        ocdf: Second frame with the same layout.
        **kwargs: Must contain ``key`` (passed to ``parse_grpby_key``) and
            ``strandedness``; the union is strand-aware only when
            ``strandedness == "same"``.
            NOTE(review): ``parse_grpby_key`` is defined elsewhere; it is
            assumed to return ``(chromosome, strand)`` for the group key.

    Returns:
        Whatever ``_cluster`` / ``_create_df_from_starts_ends`` build from the
        clustered starts and ends (both helpers are defined elsewhere).
    """
    chromosome, key_strand = parse_grpby_key(kwargs["key"])
    same_strand = kwargs["strandedness"] == "same"

    if len(scdf) == 0 or len(ocdf) == 0:
        # Only one side can contribute intervals. Forward the strand *label*
        # (or False), matching what the two-sided path below hands to
        # _create_df_from_starts_ends; forwarding a bare True here would be
        # treated as the strand value itself.
        populated = ocdf if len(scdf) == 0 else scdf
        label = key_strand if same_strand else False
        return _cluster(populated, chromosome, strand=label)

    _starts = np.concatenate([scdf.Start.values, ocdf.Start.values])
    _ends = np.concatenate([scdf.End.values, ocdf.End.values])

    # Sort start/end pairs together by start before clustering.
    cdf = pd.DataFrame({"Start": _starts, "End": _ends}).sort_values("Start")
    starts, ends = find_clusters(cdf.Start.values, cdf.End.values)

    # Chromosome and strand labels are read from the first row of scdf.
    chromosome = scdf["Chromosome"].iloc[0]
    _strand = scdf["Strand"].iloc[0] if same_strand else False

    return _create_df_from_starts_ends(starts, ends, chromosome, _strand)
Exemplo n.º 4
0
def _cluster(df, chromosome, strand=False, **kwargs):
    """Cluster the intervals of a single frame.

    ``df`` is sorted by ``Start``; its ``Start`` and ``End`` arrays go to
    ``find_clusters`` and the resulting starts/ends are turned into a frame by
    ``_create_df_from_starts_ends`` together with ``chromosome`` and
    ``strand``. Both helpers are defined elsewhere.

    Args:
        df: Frame with ``Start`` and ``End`` columns.
        chromosome: Forwarded unchanged to ``_create_df_from_starts_ends``.
        strand: Forwarded unchanged to ``_create_df_from_starts_ends``.
        **kwargs: Accepted and ignored.

    Returns:
        The value produced by ``_create_df_from_starts_ends``.
    """
    ordered = df.sort_values("Start")
    cluster_starts, cluster_ends = find_clusters(
        ordered["Start"].values, ordered["End"].values)

    return _create_df_from_starts_ends(
        cluster_starts, cluster_ends, chromosome, strand)
Exemplo n.º 5
0
def _merge(df, **kwargs):
    """Cluster the intervals of ``df``, optionally counting members.

    ``df`` is sorted by ``Start`` and its ``Start``/``End`` arrays are passed
    to ``find_clusters`` together with ``slack``; the three returned sequences
    are used as cluster starts, cluster ends and a per-cluster number.
    NOTE(review): ``find_clusters`` is defined elsewhere; how ``slack`` is
    interpreted and what exactly ``number`` counts is not visible here.

    Keyword Args:
        chromosome: Required. Value for the categorical ``Chromosome`` column.
        strand: Optional. When truthy, a categorical ``Strand`` column with
            this value is added.
        slack: Optional, defaults to 0. Forwarded to ``find_clusters``.
        count: Optional, defaults to False. When truthy, the per-cluster
            number is appended as the last column.
        count_col: Name of that column; required only when ``count`` is
            truthy.

    Returns:
        ``None`` when ``df`` is empty, otherwise a ``pandas.DataFrame`` with
        ``Chromosome``, ``Start``, ``End`` and the optional ``Strand`` and
        count columns.
    """
    if df.empty:
        return None

    slack = kwargs.get("slack", 0)
    chromosome, strand = kwargs["chromosome"], kwargs.get("strand", None)

    cdf = df.sort_values("Start")

    starts, ends, number = find_clusters(cdf.Start.values, cdf.End.values,
                                         slack)

    # One shared index so the scalar chromosome/strand values are broadcast
    # to exactly one entry per cluster.
    nidx = pd.Index(range(len(starts)))
    data = {
        "Chromosome": pd.Series(chromosome, dtype="category", index=nidx),
        "Start": starts,
        "End": ends,
    }
    if strand:
        data["Strand"] = pd.Series(strand, dtype="category", index=nidx)

    cluster_df = pd.DataFrame(data)

    # "count" is an optional flag like "slack" and "strand"; leaving it out
    # means "no count column" rather than a KeyError.
    if kwargs.get("count", False):
        cluster_df.insert(cluster_df.shape[1], kwargs["count_col"], number)

    return cluster_df
Exemplo n.º 6
0
def _set_union(self, other, strand):
    """Return the clustered union of the intervals in ``self`` and ``other``.

    The frames ``self.df`` and ``other.df`` are grouped by chromosome (and by
    strand for a stranded union); for every group key the ``Start``/``End``
    values of both sides are pooled, sorted by ``Start`` and passed to
    ``find_clusters``.
    NOTE(review): ``find_clusters`` and ``_cluster`` are defined elsewhere.
    The keys of ``self.__ncls__`` are presumably ``(chromosome, strand)``
    pairs for stranded data - confirm.

    Args:
        self: Object exposing ``df``, ``stranded``, ``__len__`` and
            ``__ncls__``.
        other: Second object with the same interface.
        strand: When truthy, request a strand-aware union.

    Returns:
        The result of ``_cluster`` on the non-empty operand when either side
        has length 0; otherwise a ``pandas.DataFrame`` with the columns
        ``Chromosome``, ``Start``, ``End`` (plus ``Strand`` for a stranded
        union) and a fresh 0..n-1 index. Groups appear in the order of
        ``self``'s groups followed by groups found only in ``other``.

    Raises:
        AssertionError: If ``strand`` is truthy but either operand is not
            stranded.
    """
    if strand and not (self.stranded and other.stranded):
        # Raised explicitly instead of via ``assert`` so the check survives
        # running Python with -O; exception type and message are unchanged.
        raise AssertionError(
            "Can only do stranded set union when both PyRanges contain strand info.")

    if len(self) == 0:
        return _cluster(other, strand=strand)
    elif len(other) == 0:
        return _cluster(self, strand=strand)

    # Decide once whether groups are keyed by (chromosome, strand). The same
    # flag drives both the grouping and how each key is unpacked below, so a
    # plain chromosome key is never indexed as if it were a pair.
    by_strand = bool(strand) and len(list(self.__ncls__.keys())[0]) == 2

    if by_strand:
        grpby_key = "Chromosome Strand".split()
        columns = "Chromosome Start End Strand".split()
    else:
        grpby_key = "Chromosome"
        columns = "Chromosome Start End".split()

    self_dfs = {k: d for k, d in self.df.groupby(grpby_key)}
    other_dfs = {k: d for k, d in other.df.groupby(grpby_key)}

    # Deterministic key order (a set union would vary between runs).
    keys = list(self_dfs) + [k for k in other_dfs if k not in self_dfs]

    dfs = []
    for key in keys:
        # Pool the intervals from whichever side(s) contain this group.
        parts = [grp[key] for grp in (self_dfs, other_dfs) if key in grp]
        _starts = np.concatenate([p.Start.values for p in parts])
        _ends = np.concatenate([p.End.values for p in parts])

        cdf = pd.DataFrame({"Start": _starts, "End": _ends}).sort_values("Start")
        starts, ends = find_clusters(cdf.Start.values, cdf.End.values)

        if by_strand:
            chromosome, strand_label = key
            _df = pd.DataFrame({"Chromosome": chromosome, "Start": starts,
                                "End": ends, "Strand": strand_label})
        else:
            _df = pd.DataFrame({"Chromosome": key, "Start": starts, "End": ends})

        dfs.append(_df)

    # ignore_index gives the same consecutive 0..n-1 labels the per-group
    # index bookkeeping used to produce.
    return pd.concat(dfs, ignore_index=True)[columns]