示例#1
0
def test_get_different_ties(data):

    df = data

    k = 2
    result = get_different_ties(df.index.values, df.ids.values, df.dist.values, k)
    print(df.reindex(result))
    print(result)
    assert list(result) == [0, 1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23]
示例#2
0
    def k_nearest(self, other, k=1, **kwargs):

        from pyranges.methods.k_nearest import _nearest
        from sorted_nearest import get_all_ties, get_different_ties

        kwargs = fill_kwargs(kwargs)
        kwargs["stranded"] = self.stranded and other.stranded

        overlap = kwargs.get("overlap", True)
        ties = kwargs.get("ties", False)

        self = pr.PyRanges({k: v.copy() for k, v in self.dfs.items()})

        try: # if k is an array
            k = k.values
        except:
            pass

        self.__k__ = k
        self.__IX__ = np.arange(len(self))


        # from time import time
        # start = time()
        dfs = pyrange_apply(_nearest, self, other, **kwargs)
        # end = time()
        # print("nearest", end - start)

        nearest = PyRanges(dfs)
        # nearest.msp()
        # raise
        # print("nearest len", len(nearest))

        if not overlap:
            # self = self.drop(like="__k__|__IX__")
            result = nearest#.drop(like="__k__|__IX__")
        else:
            from collections import defaultdict
            overlap_kwargs = {k: v for k, v in kwargs.items()}
            # print("kwargs ties:", kwargs.get("ties"))
            overlap_kwargs["how"] = defaultdict(lambda: None, {"first": "first", "last": "last"})[kwargs.get("ties")]
            # start = time()
            overlaps = self.join(other, **overlap_kwargs)
            # end = time()
            # print("overlaps", end - start)
            overlaps.Distance = 0
            # print("overlaps len", len(overlaps))

            result = pr.concat([overlaps, nearest])

        if not len(result):
            return pr.PyRanges()
        # print(result)
        # print(overlaps.drop(like="__").df)
        # raise

        # start = time()
        new_result = {}
        if ties in ["first", "last"]:
            # method = "tail" if ties == "last" else "head"
            # keep = "last" if ties == "last" else "first"

            for c, df in result:
                # start = time()
                # print(c)
                # print(df)

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)
                dfs = []
                for k, kdf in grpby:
                    # print("k", k)
                    # print(kdf)
                    # dist_bool = ~kdf.Distance.duplicated(keep=keep)
                    # print(dist_bool)
                    # kdf = kdf[dist_bool]
                    grpby2 = kdf.groupby("__IX__", sort=False)
                    # f = getattr(grpby2, method)
                    _df = grpby2.head(k)
                    # print(_df)
                    dfs.append(_df)
                # raise

                if dfs:
                    new_result[c] = pd.concat(dfs)
                # print(new_result[c])
        elif ties == "different" or not ties:
            for c, df in result:

                # print(df)

                if df.empty:
                    continue
                dfs = []

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)

                # for each index
                # want to keep until we have k
                # then keep all with same distance
                for k, kdf in grpby:
                    # print("kdf " * 10)
                    # print("k " * 5, k)
                    # print(kdf["__IX__ Distance".split()])
                    # print(kdf.dtypes)
                    # print(kdf.index.dtypes)
                    # if ties:
                    if ties:
                        lx = get_different_ties(kdf.index.values, kdf.__IX__.values, kdf.Distance.astype(np.int64).values, k)
                    else:
                        lx = get_all_ties(kdf.index.values, kdf.__IX__.values, kdf.Distance.astype(np.int64).values, k)
                    # print(lx)


                    # else:
                    #     lx = get_all_ties(kdf.index.values, kdf.__IX__.values, kdf.Distance.astype(np.int64).values, k)
                    _df = kdf.reindex(lx)
                    # print("_df", _df)
                    dfs.append(_df)

                if dfs:
                    new_result[c] = pd.concat(dfs)

        result = pr.PyRanges(new_result)

        if not result.__IX__.is_monotonic:
            result = result.sort("__IX__")

        result = result.drop(like="__IX__|__k__")

        self = self.drop(like="__k__|__IX__")

        def prev_to_neg(df, kwargs):

            strand = df.Strand.iloc[0] if "Strand" in df else "+"

            suffix = kwargs["suffix"]

            bools = df["End" + suffix] < df.Start
            if not strand == "+":
                bools = ~bools

            df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
            return df

        # print(result)
        result = result.apply(prev_to_neg, suffix=kwargs["suffix"])
        # print(result)

        # end = time()
        # print("final stuff", end - start)

        return result