Пример #1
0
def main(args):
    """Write bigwig tracks for treatment (and optionally control) coverage.

    ``args`` is a dict-like of options. The keys read here are ``treatment``,
    ``control`` (optional), ``chromsizes_``, ``bigwig``,
    ``individual_log2fc_bigwigs``, ``log2fc_bigwig``, ``chip_bigwig`` and
    ``input_bigwig``. Each output option is skipped when its value is falsy.

    Raises:
        Exception: if an output that needs control data
            (``individual_log2fc_bigwigs``, ``input_bigwig`` or
            ``log2fc_bigwig``) is requested but no control files were given.
    """

    # TODO: need to create coverage of file if raw
    # else cluster Scores of binned

    # Look at the option *values*: an args dict that merely contains the keys
    # (set to None/empty) has not actually requested those outputs.
    control_outputs = ("individual_log2fc_bigwigs", "input_bigwig",
                       "log2fc_bigwig")
    requires_control = any(args.get(n) for n in control_outputs)
    has_control = args.get("control")
    if requires_control and not has_control:
        raise Exception("Missing control data!")

    treatment_ranges = files_to_coverage(args["treatment"], args)

    control_ranges = {}
    control_sum = None
    if has_control:
        control_ranges = files_to_coverage(args["control"], args)
        control_sum = pr.concat(control_ranges.values())

    treatment_sum = pr.concat(treatment_ranges.values())

    chromsizes = args["chromsizes_"]

    def _stem(name):
        # File name without directory and extension.
        return splitext(basename(name))[0]

    if args["bigwig"]:
        path = args["bigwig"]
        _create_path(path)

        # One bigwig per input file: treatment files first, then control files
        # (control_ranges is empty when no control was given).
        per_file = list(treatment_ranges.items()) + list(control_ranges.items())
        for name, ranges in per_file:
            ranges.to_bigwig(join(path, _stem(name) + ".bw"), chromsizes)

    if args["individual_log2fc_bigwigs"]:
        path = args["individual_log2fc_bigwigs"]
        _create_path(path)
        for name, ranges in treatment_ranges.items():
            bw_name = join(path, _stem(name) + "_log2fc.bw")
            ranges.to_bigwig(bw_name, chromsizes, divide_by=control_sum)

    if args["log2fc_bigwig"]:
        path = args["log2fc_bigwig"]
        _create_path(dirname(path))
        treatment_sum.to_bigwig(path, chromsizes, divide_by=control_sum)

    if args["chip_bigwig"]:
        path = args["chip_bigwig"]
        _create_path(dirname(path))
        treatment_sum.to_bigwig(path, chromsizes)

    if args["input_bigwig"]:
        path = args["input_bigwig"]
        _create_path(dirname(path))
        control_sum.to_bigwig(path, chromsizes)
Пример #2
0
def main(args):
    """Compute coverage for the treatment (and optional control) files and print it.

    Reads ``args["treatment"]`` and, when present and truthy,
    ``args["control"]``. The control coverage is computed but not used
    further in this function.
    """

    # TODO: need to create coverage of file if raw
    # else cluster Scores of binned

    coverage_by_file = files_to_coverage(args["treatment"], args)
    print(coverage_by_file)

    control_files = args.get("control")
    if control_files:
        control_ranges = files_to_coverage(control_files, args)
        control_sum = pr.concat(control_ranges.values())

    treatment_sum = pr.concat(coverage_by_file.values())
    print(treatment_sum)
Пример #3
0
def test_cluster_by(gr, strand):
    """Check ``cluster(by="ID")`` against clustering every ID group on its own.

    The expected frame is built by grouping ``gr.df`` on chromosome (plus
    strand when ``strand`` is truthy) and ID, clustering each group
    separately, and numbering the groups 1, 2, ... in natsorted group order.
    """

    result = gr.cluster(by="ID", strand=strand).df
    print(result)
    df = gr.df

    if strand:
        groupby = ["Chromosome", "Strand", "ID"]
    else:
        groupby = ["Chromosome", "ID"]

    grs = [pr.PyRanges(gdf) for _, gdf in natsorted(df.groupby(groupby))]

    clusters = [group.cluster(strand=strand) for group in grs]
    new_clusters = []
    for i, c in enumerate(clusters, start=1):
        print("c")
        print(c)
        c.Cluster = i
        new_clusters.append(c)

    expected = pr.concat(new_clusters).df
    # Assign the column directly: ``expected.loc[:, "Cluster"] = ...`` writes
    # into the existing array on pandas >= 2.0 and would leave the dtype as is.
    expected["Cluster"] = expected.Cluster.astype(np.int32)

    print(expected)
    print(result)

    assert_df_equal(result, expected)
Пример #4
0
    def set_union(self, other, **kwargs):
        """Concatenate ``self`` and ``other`` and merge the combined intervals.

        Strand is taken into account only when the (filled-in) ``strandedness``
        keyword is truthy.
        """
        kwargs = fill_kwargs(kwargs)
        use_strand = bool(kwargs["strandedness"])

        combined = pr.concat([self, other], use_strand)
        return combined.merge(strand=use_strand, **kwargs)
Пример #5
0
    def unstrand(self):
        """Return the ranges without a Strand column.

        An object that is not stranded is returned unchanged (the same object).
        """
        if self.stranded:

            def _without_strand(df):
                return df.drop("Strand", axis=1)

            both_strands = pr.concat([self["+"], self["-"]])
            return both_strands.apply(_without_strand)

        return self
Пример #6
0
    def unstrand(self):
        """Return the ranges with the Strand column dropped.

        An object that is not stranded is returned unchanged (the same object).
        """
        if self.stranded:
            stacked = pr.concat([self["+"], self["-"]])
            return stacked.drop("Strand", drop_strand=True)

        return self
Пример #7
0
    def unstrand(self):
        """Return a new PyRanges without the Strand column.

        Each per-key frame loses its Strand column and gets a fresh 0..n-1
        index. An object that is not stranded is returned unchanged.
        """
        if not self.stranded:
            return self

        def _strip_strand(df):
            unstranded = df.drop("Strand", axis=1)
            return unstranded.reset_index(drop=True)

        stacked = pr.concat([self["+"], self["-"]])
        stripped = stacked.apply(_strip_strand)

        return pr.PyRanges(stripped.dfs)
Пример #8
0
def assert_equal_length_before_after(gr1, gr2):
    """Assert that concatenating two PyRanges keeps every row.

    Also asserts that the concatenation is unstranded whenever at least one
    of the inputs is unstranded.
    """
    print("in test")
    expected_total = len(gr1) + len(gr2)
    combined = pr.concat([gr1, gr2])

    if not (gr1.stranded and gr2.stranded):
        assert not combined.stranded

    assert expected_total == len(combined)
Пример #9
0
def update_pr(changed_id, removed_id):
    """Replace the ERVs rows of two merged elements with one rebuilt row.

    Rows whose ID is ``changed_id`` or ``removed_id`` are removed from the
    global ``ERVs`` and a fresh range built from ``elements[changed_id]``
    (its span, ID and meta string) is appended.
    """
    global ERVs

    element = elements[changed_id]
    new_elem = element.span().pr()
    new_elem.ID = changed_id
    new_elem.Struct = element.meta_str()

    print("Merging {} into {}".format(removed_id, changed_id))

    frame = ERVs.df
    stale = frame['ID'].isin([changed_id, removed_id])
    untouched = pr.PyRanges(frame.loc[~stale])
    ERVs = pr.concat([untouched, new_elem])
Пример #10
0
    def to_example(self, nrows=10):
        """Return the object's columns as a dict of lists.

        When the object has more than ``nrows`` rows, only the first and last
        ``int(nrows / 2)`` rows are kept; otherwise all rows are used.
        """
        total = len(self)
        half = int(min(nrows, total) / 2)

        if nrows < total:
            example = pr.concat([self.head(half), self.tail(half)])
        else:
            example = self

        return {column: list(getattr(example, column))
                for column in example.columns}
Пример #11
0
    def unstrand(self):
        """Return the ranges with the Strand column dropped.

        An object that is not stranded is returned unchanged (the same object).
        """
        if not self.stranded:
            return self

        plus, minus = self["+"], self["-"]
        return pr.concat([plus, minus]).drop("Strand", drop_strand=True)
Пример #12
0
    def get_target_proximal_ranges(self):
        """Return the flanking regions on both sides of every target range.

        Each flank is ``get_target_proximity() + 1`` positions wide and sits
        directly next to its target. The combined flanks are clipped to the
        reference ranges with ``genome_bounds(..., clip=True)``.
        """
        downstream = self.get_target_ranges()
        upstream = self.get_target_ranges()

        # Flank on the Start side: ends one position before the target starts.
        # there may be edge exceptions where the Start coordinate < 0?
        upstream.End = upstream.Start - 1
        upstream.Start -= self.get_target_proximity() + 1

        # Flank on the End side: starts one position after the target ends.
        # there may be edge exceptions where End coordinate drops off chromo.
        downstream.Start = downstream.End + 1
        downstream.End += self.get_target_proximity() + 1

        flanks = pr.concat([upstream, downstream])
        reference = self.ref.get_reference_ranges()
        return pr.gf.genome_bounds(flanks, reference, clip=True)
Пример #13
0
def test_merge_by(gr, strand):
    """Check ``merge(by="ID")`` against merging every ID group separately."""
    print(gr)
    result = gr.merge(by="ID").df.drop("ID", axis=1)

    per_id = [pr.PyRanges(group_df) for _, group_df in gr.df.groupby("ID")]
    merged_per_id = [group.merge() for group in per_id]
    expected = pr.concat(merged_per_id).df

    print(expected)
    print(result)

    assert_df_equal(result, expected)
Пример #14
0
def lojs_overlap(feature_files, compare_pr):
    """
    Flag the regions of compare_pr that overlap a consensus of the feature files.

    Every feature file is read as BED and merged. The merged files are then
    concatenated and merged again with a count, and only intervals whose count
    reaches the consensus threshold n are kept:

        * 1-2 files: n = 1 (all intervals are kept)
        * 3-7 files: n = 2
        * 8 or more files: n = int(number of files / 4)

    Args:
            :param feature_files: list of paths to BED files to intersect with compare_pr
            :param compare_pr: pyranges object containing all regions of interest. Should have column
                'idx'. Added in function epitome.functions.bed2Pyranges.

    :return arr: array ordered by 'idx' holding 1 where a region overlaps a
        consensus interval and 0 otherwise. With no feature files, an
        all-zero array of length len(compare_pr) is returned.
    """

    n_files = len(feature_files)

    if n_files == 0:
        # Logger.warn is a deprecated alias; warning() emits the same record.
        logger.warning("WARN: lojs_overlap failed for all files %s with 0 lines",
                       ','.join(feature_files))
        return np.zeros(len(compare_pr))

    #### Number of files that must share a consensus ####
    if n_files <= 2:
        n = 1  # if there are 1-2 files just include all
    elif n_files <= 7:
        n = 2
    else:
        n = int(n_files / 4)  # in 25% of files

    # Very slow: concatenate all bed files and only take regions with n overlap
    group_pr = pr.concat([pr.read_bed(i).merge() for i in feature_files])
    group_df = group_pr.merge(count=True).df

    # Keep consensus intervals and drop the helper column. drop() returns a
    # new frame, which avoids mutating a filtered slice in place.
    group_df = group_df[group_df['Count'] >= n].drop('Count', axis=1)

    # Build the consensus ranges with the same integer width as compare_pr.
    type_ = (compare_pr.Start.dtype == 'int64')
    pr1 = pr.PyRanges(group_df, int64=type_)

    intersected = compare_pr.count_overlaps(pr1)
    arr = intersected.df.sort_values(by='idx')['NumberOverlaps'].values
    arr[arr > 0] = 1  # binarize: any overlap counts as a hit
    return arr
Пример #15
0
    def set_union(self, other, **kwargs):
        """Concatenate ``self`` and ``other`` and merge the combined intervals.

        When the (filled-in) ``strandedness`` keyword is falsy, both inputs
        are unstranded first and the merge ignores strand.
        """
        kwargs = fill_kwargs(kwargs)
        keep_strand = bool(kwargs["strandedness"])

        left, right = self, other
        if not keep_strand:
            left = left.unstrand()
            right = right.unstrand()

        combined = pr.concat([left, right], keep_strand)
        return combined.merge(strand=keep_strand, **kwargs)
Пример #16
0
def count_overlaps(grs, features=None, how=None, nb_cpu=1, strandedness=None):
    """Add one overlap-count column per PyRanges in ``grs`` to ``features``.

    ``grs`` maps a column name to a PyRanges. When ``features`` is None it is
    built from the split concatenation of all the PyRanges in ``grs``.
    Missing counts are filled with 0. ``how`` is accepted but is not
    forwarded to the counting call.
    """
    if features is None:
        features = pr.concat(grs.values()).split()

    from pyranges.methods.intersection import _count_overlaps

    for column, query in grs.items():
        counts = features.apply_pair(query.drop(),
                                     _count_overlaps,
                                     as_pyranges=False,
                                     nb_cpu=nb_cpu,
                                     strandedness=strandedness)

        # Store the raw counts first, then read the column back to fill gaps.
        setattr(features, column, counts)
        filled = getattr(features, column).fillna(0)
        setattr(features, column, filled)

    return features
Пример #17
0
def parse_bed_files(bed_files):
    """Creates one unstranded PyRanges, keeping only Name, from the BED files.

    Returns None when ``bed_files`` is empty.

    Raises:
        AssertionError: if a BED file has no Name column (column 4) or the
            names are not unique across all files. Raised explicitly so the
            validation also runs under ``python -O``.
    """

    # Skip if no BED files are provided
    if len(bed_files) == 0:
        return None

    # Load BED files
    beds = [pr.read_bed(b) for b in bed_files]

    # Check that all BED files have the first four columns
    for bed_file, bed in zip(bed_files, beds):
        if "Name" not in bed.columns:
            raise AssertionError(f"Name (column 4) missing from {bed_file}.")

    # Concatenate BED files and only keep Name column
    bed = pr.concat(beds)
    bed = bed.unstrand()
    bed = bed[["Name"]]

    # Ensure unique names
    if not bed.Name.is_unique:
        raise AssertionError("Names (column 4) not unique across BED files.")

    return bed
Пример #18
0
def _mcc_row(t_label, f_label, tp, fp, tn, fn, score, strand_name=None):
    """Build one result row; the Strand key is only added for stranded runs."""
    row = {"T": t_label, "F": f_label}
    if strand_name is not None:
        row["Strand"] = strand_name
    row["TP"] = tp
    row["FP"] = fp
    row["TN"] = tn
    row["FN"] = fn
    row["MCC"] = score
    return row


def mcc(grs, genome, labels=None, strand=False, verbose=False):
    """Compute pairwise confusion counts and MCC between PyRanges.

    Parameters
    ----------
    grs : list of PyRanges
        The ranges to compare. Each is merged before the computation.
    genome : int or PyRanges
        Either the genome length, or an object whose ``End`` column is summed
        to obtain the genome length.
    labels : list, default None
        One label per entry in ``grs``. Defaults to 0, 1, 2, ...
    strand : bool, default False
        Whether to compute the statistics separately for "+" and "-".
    verbose : bool, default False
        Print progress, and chromosomes present in ``genome`` but in none of
        ``grs``, to stderr.

    Returns
    -------
    pandas.DataFrame
        One row per ordered label pair (and per strand when ``strand`` is
        truthy) with columns T, F, [Strand,] TP, FP, TN, FN and MCC, sorted
        by T and F.

    Raises
    ------
    AssertionError
        If ``labels`` is given and its length differs from ``len(grs)``.
    """
    import sys
    from itertools import combinations_with_replacement, chain

    try:
        genome_length = int(genome)
    except (TypeError, ValueError):
        genome_length = int(genome.End.sum())

    if labels is None:
        labels = list(range(len(grs)))
    elif len(labels) != len(grs):
        raise AssertionError("labels must have the same length as grs")
    label_pairs = combinations_with_replacement(labels, r=2)

    # The chromosome check only makes sense when genome is a PyRanges-like
    # object; a plain genome length has no chromosomes to compare against.
    if verbose and hasattr(genome, "chromosomes"):
        # check that genome definition does not have many more
        # chromosomes than datafiles
        gr_cs = set(chain(*[gr.chromosomes for gr in grs]))

        g_cs = set(genome.chromosomes)
        surplus = g_cs - gr_cs
        if len(surplus):
            print(
                "The following chromosomes are in the genome, but not the PyRanges:",
                ", ".join(surplus),
                file=sys.stderr)

    # remove all non-loc columns before computation
    grs = [gr.merge(strand=strand) for gr in grs]

    strandedness = "same" if strand else None

    # None stands for "no strand split"; the strand names get their own
    # variable so the ``strand`` parameter is never overwritten.
    strand_names = ("+", "-") if strand else (None,)

    def _select(gr, strand_name):
        return gr if strand_name is None else gr[strand_name]

    rowdicts = []
    gr_pairs = combinations_with_replacement(grs, r=2)
    for (lt, lf), (t, f) in zip(label_pairs, gr_pairs):
        if verbose:
            print(lt, lf, file=sys.stderr)

        if lt == lf:
            # Compared with itself everything is a true positive. TN always
            # uses genome_length, so an integer genome works in every branch.
            for strand_name in strand_names:
                tp = _select(t, strand_name).length
                tn = genome_length - tp
                rowdicts.append(
                    _mcc_row(lt, lf, tp, 0, tn, 0, 1, strand_name))
            continue

        c = pr.concat([t, f]).merge(strand=strand)
        j = t.join(f, strandedness=strandedness)
        tp_gr = j.new_position("intersection").merge(strand=strand)

        for strand_name in strand_names:
            tp = _select(tp_gr, strand_name).length
            fp = _select(f, strand_name).length - tp
            fn = _select(t, strand_name).length - tp
            tn = genome_length - _select(c, strand_name).length
            score = _mcc(tp, fp, tn, fn)

            rowdicts.append(
                _mcc_row(lt, lf, tp, fp, tn, fn, score, strand_name))
            # Mirrored comparison: the roles of FP and FN swap.
            rowdicts.append(
                _mcc_row(lf, lt, tp, fn, tn, fp, score, strand_name))

    df = pd.DataFrame.from_dict(rowdicts).sort_values(["T", "F"])

    return df
Пример #19
0
def _covered_bases(ranges):
    """Sum ``length + 1`` over every interval of ``ranges``."""
    return sum(length + 1 for length in ranges.lengths())


# Per-panel footprint: total size plus the parts overlapping CDS and exons.
total_sizes = []
cds_sizes = []
exon_sizes = []
panel_prs = []

for panel in panels:
    print(panel)
    panel_pr = pr.PyRanges(
        genie.loc[
            (genie['SEQ_ASSAY_ID'] == panel) & genie['Chromosome'].isin(chromosomes),
            'Chromosome':'End_Position',
        ].rename(columns={'Start_Position': 'Start', 'End_Position': 'End'})
    ).merge()
    total_sizes.append(_covered_bases(panel_pr))
    cds_sizes.append(_covered_bases(panel_pr.intersect(gff_cds_pr)))
    exon_sizes.append(_covered_bases(panel_pr.intersect(gff_exon_pr)))
    panel_prs.append(panel_pr)


# Count, for every MAF interval, the overlaps with CDS, exons and each panel.
grs = dict(zip(['CDS', 'exon'] + list(panels), [gff_cds_pr, gff_exon_pr] + panel_prs))
result = pr.count_overlaps(grs, pr.concat({'maf': maf_pr}.values()))
result = result.df

tcga_maf = pd.merge(tcga_maf, result.iloc[:, 3:], how='left', on='index')


panel_df['total'] = total_sizes
panel_df['cds'] = cds_sizes
panel_df['exon'] = exon_sizes

##get assumed size of the most common kit: https://bitbucket.org/cghub/cghub-capture-kit-info/src/master/BI/vendor/Agilent/whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed
agilent_df = pd.read_csv(
    file_path / 'whole_exome_agilent_1.1_refseq_plus_3_boosters.targetIntervals.bed',
    sep='\t',
    low_memory=False,
    header=None,
)
kit_pr = pr.PyRanges(agilent_df.rename(columns={0: 'Chromosome', 1: 'Start', 2: 'End'})).merge()
kit_total = _covered_bases(kit_pr)
kit_cds = _covered_bases(kit_pr.intersect(gff_cds_pr).merge())
kit_exon = _covered_bases(kit_pr.intersect(gff_exon_pr).merge())
Пример #20
0
def count_overlaps(grs, features=None, strandedness=None, how=None, nb_cpu=1):
    """Count overlaps in multiple pyranges.

    Parameters
    ----------
    grs : dict of PyRanges

        The PyRanges to use as queries.

    features : PyRanges, default None

        The PyRanges to use as subject in the query. If None, the PyRanges themselves are used as a query.

    strandedness : {None, "same", "opposite", False}, default None, i.e. auto

        Whether to compare PyRanges on the same strand, the opposite or ignore strand
        information. The default, None, means use "same" if both PyRanges are stranded,
        otherwise ignore the strand information.

    how : {None, "all", "containment"}, default None, i.e. all

        What intervals to report. By default reports all overlapping intervals. "containment"
        reports intervals where the overlapping is contained within it.

    nb_cpu : int, default 1

        How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple.
        Will only lead to speedups on large datasets.

    Examples
    --------

    >>> a = '''Chromosome Start End
    ... chr1    6    12
    ... chr1    10    20
    ... chr1    22    27
    ... chr1    24    30'''

    >>> b = '''Chromosome Start End
    ... chr1    12    32
    ... chr1    14    30'''

    >>> c = '''Chromosome Start End
    ... chr1    8    15
    ... chr1    10    14
    ... chr1    32    34'''

    >>> grs = {n: pr.from_string(s) for n, s in zip(["a", "b", "c"], [a, b, c])}
    >>> for k, v in grs.items():
    ...     print("Name: " + k)
    ...     print(v)
    Name: a
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         6 |        12 |
    | chr1         |        10 |        20 |
    | chr1         |        22 |        27 |
    | chr1         |        24 |        30 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    Name: b
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |        12 |        32 |
    | chr1         |        14 |        30 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 2 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    Name: c
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         8 |        15 |
    | chr1         |        10 |        14 |
    | chr1         |        32 |        34 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 3 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.count_overlaps(grs)
    +--------------+-----------+-----------+-----------+-----------+-----------+
    | Chromosome   | Start     | End       | a         | b         | c         |
    | (object)     | (int32)   | (int32)   | (int32)   | (int32)   | (int32)   |
    |--------------+-----------+-----------+-----------+-----------+-----------|
    | chr1         | 6         | 8         | 1         | 0         | 0         |
    | chr1         | 8         | 10        | 1         | 0         | 1         |
    | chr1         | 10        | 12        | 2         | 0         | 2         |
    | chr1         | 12        | 14        | 1         | 1         | 2         |
    | ...          | ...       | ...       | ...       | ...       | ...       |
    | chr1         | 24        | 27        | 2         | 2         | 0         |
    | chr1         | 27        | 30        | 1         | 2         | 0         |
    | chr1         | 30        | 32        | 0         | 1         | 0         |
    | chr1         | 32        | 34        | 0         | 0         | 1         |
    +--------------+-----------+-----------+-----------+-----------+-----------+
    Unstranded PyRanges object has 12 rows and 6 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> gr = pr.PyRanges(chromosomes=["chr1"] * 4, starts=[0, 10, 20, 30], ends=[10, 20, 30, 40])
    >>> gr
    +--------------+-----------+-----------+
    | Chromosome   |     Start |       End |
    | (category)   |   (int32) |   (int32) |
    |--------------+-----------+-----------|
    | chr1         |         0 |        10 |
    | chr1         |        10 |        20 |
    | chr1         |        20 |        30 |
    | chr1         |        30 |        40 |
    +--------------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 3 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.

    >>> pr.count_overlaps(grs, gr)
    +--------------+-----------+-----------+-----------+-----------+-----------+
    | Chromosome   |     Start |       End |         a |         b |         c |
    | (category)   |   (int32) |   (int32) |   (int32) |   (int32) |   (int32) |
    |--------------+-----------+-----------+-----------+-----------+-----------|
    | chr1         |         0 |        10 |         1 |         0 |         1 |
    | chr1         |        10 |        20 |         2 |         2 |         2 |
    | chr1         |        20 |        30 |         2 |         2 |         0 |
    | chr1         |        30 |        40 |         0 |         1 |         1 |
    +--------------+-----------+-----------+-----------+-----------+-----------+
    Unstranded PyRanges object has 4 rows and 6 columns from 1 chromosomes.
    For printing, the PyRanges was sorted on Chromosome.
    """

    # Options forwarded unchanged to every pairwise counting call.
    kwargs = {
        "as_pyranges": False,
        "nb_cpu": nb_cpu,
        "strandedness": strandedness,
        "how": how,
    }
    names = list(grs.keys())

    if features is None:
        features = pr.concat(grs.values()).split(between=True)

    from pyranges.methods.intersection import _count_overlaps

    for name, gr in grs.items():

        gr = gr.drop()

        res = features.apply_pair(gr, _count_overlaps, **kwargs)

        # Store the raw counts, then read the column back to fill the gaps
        # with 0.
        setattr(features, name, res)

        setattr(features, name, getattr(features, name).fillna(0))

    def to_int(df):
        # astype returns a frame with the count columns really converted;
        # ``df.loc[:, names] = ...`` sets in place on pandas >= 2.0 and would
        # keep the old dtype.
        return df.astype({name: np.int32 for name in names})

    features = features.apply(to_int)

    return features
Пример #21
0
    def k_nearest(self, other, k=1, **kwargs):
        """Find the k nearest intervals in ``other`` for each interval in ``self``.

        ``k`` may be a scalar or an object with a ``.values`` attribute (one k
        per interval). Keywords read here, after ``fill_kwargs``:

        * ``overlap`` (default True): when truthy, overlapping intervals found
          with ``join`` are included with Distance 0.
        * ``ties`` (default False): "first"/"last" keep at most k rows per
          interval; "different" and falsy values delegate the tie handling to
          ``get_different_ties`` / ``get_all_ties`` respectively.
        * ``suffix``: suffix of the joined End column used to sign Distance.

        Returns a PyRanges with a signed Distance column, or an empty PyRanges
        when nothing was found.
        """

        from pyranges.methods.k_nearest import _nearest
        from sorted_nearest import get_all_ties, get_different_ties

        kwargs = fill_kwargs(kwargs)
        kwargs["stranded"] = self.stranded and other.stranded

        overlap = kwargs.get("overlap", True)
        ties = kwargs.get("ties", False)

        # Work on a copy so the helper columns below never reach the caller's
        # object.
        self = pr.PyRanges({key: df.copy() for key, df in self.dfs.items()})

        # if k is an array-like, use its underlying values; scalars have no
        # ``.values`` and are used as they are
        try:
            k = k.values
        except AttributeError:
            pass

        # Helper columns: requested k per row and the original row order.
        self.__k__ = k
        self.__IX__ = np.arange(len(self))

        dfs = pyrange_apply(_nearest, self, other, **kwargs)

        nearest = PyRanges(dfs)

        if not overlap:
            result = nearest
        else:
            overlap_kwargs = dict(kwargs)
            # Only "first"/"last" are passed on as the join's ``how``.
            overlap_kwargs["how"] = {"first": "first", "last": "last"}.get(ties)
            overlaps = self.join(other, **overlap_kwargs)
            overlaps.Distance = 0

            result = pr.concat([overlaps, nearest])

        if not len(result):
            return pr.PyRanges()

        new_result = {}
        if ties in ["first", "last"]:

            for c, df in result:

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)
                dfs = []
                for group_k, kdf in grpby:
                    # keep the group_k closest rows of every query interval
                    grpby2 = kdf.groupby("__IX__", sort=False)
                    _df = grpby2.head(group_k)
                    dfs.append(_df)

                if dfs:
                    new_result[c] = pd.concat(dfs)

        elif ties == "different" or not ties:
            # Loop invariant: which tie resolver applies.
            pick_ties = get_different_ties if ties else get_all_ties

            for c, df in result:

                if df.empty:
                    continue
                dfs = []

                df = df.sort_values(["__IX__", "Distance"])
                grpby = df.groupby("__k__", sort=False)

                # for each index
                # want to keep until we have k
                # then keep all with same distance
                for group_k, kdf in grpby:
                    lx = pick_ties(kdf.index.values, kdf.__IX__.values,
                                   kdf.Distance.astype(np.int64).values,
                                   group_k)
                    _df = kdf.reindex(lx)
                    dfs.append(_df)

                if dfs:
                    new_result[c] = pd.concat(dfs)

        result = pr.PyRanges(new_result)

        # Series.is_monotonic was removed in pandas 2.0;
        # is_monotonic_increasing is the same check.
        if not result.__IX__.is_monotonic_increasing:
            result = result.sort("__IX__")

        result = result.drop(like="__IX__|__k__")

        def prev_to_neg(df, kwargs):

            strand = df.Strand.iloc[0] if "Strand" in df else "+"

            suffix = kwargs["suffix"]

            # Rows whose match ends before the query starts get a negative
            # distance; for any strand other than "+" the complementary rows
            # are negated instead.
            bools = df["End" + suffix] < df.Start
            if not strand == "+":
                bools = ~bools

            df.loc[bools, "Distance"] = -df.loc[bools, "Distance"]
            return df

        result = result.apply(prev_to_neg, suffix=kwargs["suffix"])

        return result