예제 #1
0
def test_inverse_div_mul_rles(df):
    """Check that dividing by an Rle and then multiplying by it is the identity.

    Two Rles are built from ``df``: one from ``Runs``/``Values`` and one from a
    random permutation of ``Runs`` paired with ``Values2``. Small integers are
    used because, with small floating point values, multiplying after dividing
    may not reproduce the input exactly.
    """

    print(df)
    total_length = df.Runs.sum()

    dividend = Rle(df.Runs.values, df.Values.values)

    permuted_runs = np.random.permutation(df.Runs.values)
    print("newruns", permuted_runs)
    divisor = Rle(permuted_runs, df.Values2.values)

    print("cv\n", dividend)
    print("cv2\n", divisor)

    # Both operands must report the same total run length as the input frame.
    assert total_length == np.sum(dividend.runs)
    assert total_length == np.sum(divisor.runs)

    quotient = dividend / divisor
    restored = quotient * divisor

    print("result\n", quotient)
    print("result2\n", restored)

    assert np.equal(restored.runs, dividend.runs).all()
    assert np.allclose(restored.values, dividend.values)
예제 #2
0
def test_rle(runlengths, runlengths2, operation):
    """Compare an arithmetic operation on two Rles with an external result.

    The operation is applied in-process to Rles built from ``runlengths`` and
    ``runlengths2``. The same inputs are written to tab-separated files and
    passed to the shell command built from ``rle_operation_cmd``; the file it
    writes is read back as an Rle and compared with the in-process result.

    Args:
        runlengths: frame with ``Runs`` and ``Values`` columns (left operand).
        runlengths2: frame with ``Runs`` and ``Values`` columns (right operand).
        operation: one of ``"+"``, ``"-"``, ``"*"`` or ``"/"``.

    Raises:
        subprocess.CalledProcessError: if the external command exits non-zero.
    """

    # It is only compared against bioc with integers because float equality is hard,
    # for both libraries, sometimes end up with slightly different runlengths
    # [257.492105948544, 257.492075430654] gives
    # pyranges result
    #  +--------+---------+---------+---------+
    # | Runs   |       1 |       1 |       1 |
    # |--------+---------+---------+---------|
    # | Values | 257.492 | 257.492 | 257.492 |
    # +--------+---------+---------+---------+
    # Rle of length 3 containing 3 elements
    # s4vectors result
    #  +--------+---------+---------+
    # | Runs   |       2 |       1 |
    # |--------+---------+---------|
    # | Values | 257.492 | 257.492 |
    # +--------+---------+---------+
    # Rle of length 3 containing 2 elements

    # Map the operator symbol to the dunder method implementing it.
    pyop = {"+": "__add__", "-": "__sub__", "*": "__mul__", "/": "__truediv__"}[operation]

    print("runlengths", runlengths)
    print("runlengths2", runlengths2)

    r = Rle(runlengths.Runs, runlengths.Values)
    r2 = Rle(runlengths2.Runs, runlengths2.Values)

    print("r\n", r)
    print("r2\n", r2)

    result_pyranges = getattr(r, pyop)(r2)

    print("pyranges result\n", result_pyranges)

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        f2 = "{}/f2.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)
        runlengths.to_csv(f1, sep="\t", index=False)
        runlengths2.to_csv(f2, sep="\t", index=False)

        # stderr of the external command is discarded.
        cmd = rle_operation_cmd.format(f1, f2, operation, outfile) + " 2>/dev/null"

        subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode()

        # The output must be read before the temporary directory is removed.
        result = pd.read_csv(outfile, sep="\t")
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False)
    assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True)
예제 #3
0
def expected_result():
    """Return a PyRles holding one hard-coded Rle each for "chr1" and "chr2"."""

    chr1_rle = Rle([1, 2, 3, 10], [2, 4, 5, 3])
    chr2_rle = Rle([4, 1, 4, 2, 9], [-0.09, -0.08, -0.18, -0.28, 0.03])

    return PyRles({"chr1": chr1_rle, "chr2": chr2_rle})
예제 #4
0
def grle2():
    """Return a PyRles with an integer-valued "chr1" Rle and a negative-float "chr2" Rle."""

    chr1_rle = Rle([1, 2, 3], [1, 2, 3])
    chr2_rle = Rle([5, 4, 2], [-0.1, -0.2, -0.3])

    return PyRles({"chr1": chr1_rle, "chr2": chr2_rle})
예제 #5
0
def d1():
    """Return a plain dict mapping "chr1" and "chr2" to hard-coded Rles.

    The "chr1" Rle is printed before the dict is returned.
    """

    chr1_rle = Rle([1, 5, 10], [1, 2, 3])
    chr2_rle = Rle([4, 7, 9], [0.01, 0.02, 0.03])
    print(chr1_rle)

    return {"chr1": chr1_rle, "chr2": chr2_rle}
예제 #6
0
def grle1():
    """Return a PyRles built from hard-coded "chr1" and "chr2" Rles.

    Both the intermediate dict and the resulting PyRles are printed.
    """

    chr1_rle = Rle([1, 5, 10], [1, 2, 3])
    chr2_rle = Rle([4, 7, 9], [0.01, 0.02, 0.03])

    by_chromosome = {"chr1": chr1_rle, "chr2": chr2_rle}
    print(by_chromosome)

    pyrles = PyRles(by_chromosome)
    print(pyrles)
    return pyrles
예제 #7
0
def test_rle(runlengths, runlengths2, operation):
    """Compare an arithmetic operation on two Rles with an external result.

    The operation is applied in-process to Rles built from ``runlengths`` and
    ``runlengths2``. The same inputs are written to tab-separated files and
    passed to the shell command built from ``rle_operation_cmd``; the file it
    writes is read back as an Rle and compared with the in-process result.

    Args:
        runlengths: frame with ``Runs`` and ``Values`` columns (left operand).
        runlengths2: frame with ``Runs`` and ``Values`` columns (right operand).
        operation: one of ``"+"``, ``"-"``, ``"*"`` or ``"/"``.

    Raises:
        subprocess.CalledProcessError: if the external command exits non-zero.
    """

    # Only compared against bioc with integers because float equality is hard,
    # for both libraries, sometimes end up with slightly different runlengths
    # when consecutive values are almost equal

    # Map the operator symbol to the dunder method implementing it.
    pyop = {
        "+": "__add__",
        "-": "__sub__",
        "*": "__mul__",
        "/": "__truediv__"
    }[operation]

    print("runlengths", runlengths)
    print("runlengths2", runlengths2)

    r = Rle(runlengths.Runs, runlengths.Values)
    r2 = Rle(runlengths2.Runs, runlengths2.Values)

    print("r\n", r)
    print("r2\n", r2)

    result_pyranges = getattr(r, pyop)(r2)

    print("pyranges result\n", result_pyranges)

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        f2 = "{}/f2.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)
        runlengths.to_csv(f1, sep="\t", index=False)
        runlengths2.to_csv(f2, sep="\t", index=False)

        # stderr of the external command is not redirected here; append
        # " 2>/dev/null" to the command to silence it.
        cmd = rle_operation_cmd.format(f1, f2, operation, outfile)

        subprocess.check_output(cmd, shell=True,
                                executable="/bin/bash").decode()

        # The output must be read before the temporary directory is removed.
        result = pd.read_csv(outfile, sep="\t")
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs,
                       s4vectors_result.runs,
                       equal_nan=False)
    assert np.allclose(result_pyranges.values,
                       s4vectors_result.values,
                       equal_nan=True)
예제 #8
0
def init_chip(coverage, flank_distance):
    """Build left-shifted, right-shifted and unshifted coverage per replicate.

    For every replicate in ``coverage`` three PyRles are produced:

    * ``"left"``: the rightmost ``flank_distance`` positions of each Rle are
      chopped off and a zero-valued run of length ``flank_distance`` is
      prepended, i.e. the remaining coverage is shifted to the right.
    * ``"right"``: the leftmost ``flank_distance`` positions are chopped off
      and a zero-valued run of length ``flank_distance`` is appended.
    * ``"center"``: the Rles are used unchanged.

    Args:
        coverage: mapping from replicate to a mapping of Rles. The inner
            mapping is indexed with ``(chromosome, strand)`` pairs and the
            first element of each of its keys is taken as the chromosome.
        flank_distance: number of positions to shift by.

    Returns:
        dict mapping ``(replicate, location)`` to a PyRles.
    """

    replicates = list(coverage)

    outdict = {}
    for replicate, location in product(replicates,
                                       ["left", "right", "center"]):

        cvg = coverage[replicate]

        # Each chromosome may occur once per strand among the keys; looking it
        # up once per strand is enough, so duplicates are dropped (keeping the
        # first-seen order).
        chromosomes = list(dict.fromkeys(k[0] for k in cvg.keys()))

        _cvg = {}
        for direction in ["+", "-"]:
            for c in chromosomes:

                try:
                    rle = cvg[c, direction]
                except Exception:
                    # Best effort: skip chromosome/strand pairs that cannot be
                    # looked up, without swallowing KeyboardInterrupt/SystemExit.
                    continue

                if location == "left":
                    length = np.sum(rle.runs) - flank_distance
                    newrle = rle[:length]
                    # np.int64 instead of the np.long alias, which NumPy 1.24
                    # removed.
                    newruns = np.concatenate([
                        np.array([flank_distance], dtype=np.int64), newrle.runs
                    ])
                    newvals = np.concatenate(
                        [np.array([0], dtype=np.double), newrle.values])
                    _cvg[c, direction] = Rle(newruns, newvals)
                elif location == "right":
                    newrle = Rle(rle.runs, rle.values)
                    newrle = newrle[flank_distance:]
                    # could be optimized by ending getitem after found start...
                    newruns = np.concatenate([
                        newrle.runs,
                        np.array([flank_distance], dtype=np.int64)
                    ])
                    newvals = np.concatenate(
                        [newrle.values,
                         np.array([0], dtype=np.double)])
                    _cvg[c, direction] = Rle(newruns, newvals)
                else:
                    _cvg[c, direction] = rle

        outdict[replicate, location] = PyRles(_cvg)

    return outdict
예제 #9
0
def test_inverse_add_sub_rles(df):
    """Check that adding an Rle and then subtracting it is the identity.

    The first Rle is built from ``Runs``/``Values`` of ``df``; the second from
    a random permutation of ``Runs`` paired with ``Values2``. Small integers
    are used because floating point round-off could otherwise make the
    round trip differ from the input.
    """

    original = Rle(df.Runs.values, df.Values.values)

    permuted_runs = np.random.permutation(df.Runs.values)
    addend = Rle(permuted_runs, df.Values2.values)

    total = original + addend
    restored = total - addend

    assert np.equal(restored.runs, original.runs).all()
    assert np.allclose(restored.values, original.values)
예제 #10
0
def test_coverage_simple(simple):
    """Check the "Score"-weighted coverage of ``simple`` against a fixed Rle."""

    observed = coverage(simple, value_col="Score")

    print(observed)

    expected = Rle([1, 2], [-1, 1])
    assert observed == expected
예제 #11
0
def coverage(df, **kwargs):
    """Compute the coverage of the intervals in ``df`` as an Rle.

    Every interval contributes its value at ``Start`` and the negated value at
    ``End``; the position-sorted events are handed to ``_coverage``, whose
    runs and values are wrapped in an Rle.

    Args:
        df: frame with ``Start`` and ``End`` columns.
        **kwargs: ``value_col`` may name a column of ``df`` to use as the
            per-interval value (converted to float64). Without it every
            interval counts as 1.

    Returns:
        Rle built from the output of ``_coverage``.
    """

    value_col = kwargs.get("value_col")

    if value_col:
        weights = df[value_col].astype(np.float64).values
    else:
        weights = np.ones(len(df))

    column_order = "Position Value".split()
    starts_df = pd.DataFrame({"Position": df.Start, "Value": weights})
    starts_df = starts_df[column_order]
    ends_df = pd.DataFrame({"Position": df.End, "Value": -1 * weights})
    ends_df = ends_df[column_order]

    # mergesort is stable: events at the same position keep the concatenation
    # order (starts before ends).
    events = pd.concat([starts_df, ends_df], ignore_index=True)
    events = events.sort_values("Position", kind="mergesort")

    # int32 positions are widened to int64 before calling _coverage.
    if events.Position.dtype.name == "int32":
        events.Position = events.Position.astype(np.int64)

    runs, values = _coverage(events.Position.values, events.Value.values)

    return Rle(runs, values)
예제 #12
0
def rle2():
    """Return an Rle with runs [2, 1] and values [0, 1], both given as int16 Series."""

    runs = pd.Series([2, 1], dtype=np.int16)
    values = pd.Series([0, 1], dtype=np.int16)

    return Rle(runs, values)
예제 #13
0
def shorter_rle():
    """Return an Rle with runs [1, 2, 3] and values [1, 0, 1], given as Series."""

    return Rle(pd.Series([1, 2, 3]), pd.Series([1, 0, 1]))
예제 #14
0
def test_coverage(df):
    """Compare the PyRanges coverage of "chr1" with an external result.

    The coverage is computed in-process from ``df``. A copy of ``df`` with
    ``End`` decreased by one is written to a tab-separated file and passed to
    the shell command built from ``coverage_cmd``; the file it writes is read
    back as an Rle and compared with the in-process coverage.

    Args:
        df: frame accepted by ``pr.PyRanges`` that has an ``End`` column.

    Raises:
        subprocess.CalledProcessError: if the external command exits non-zero.
    """

    print("---" * 10)
    p = pr.PyRanges(df)
    print("pyranges\n", p)

    c = p.coverage()["chr1"]

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)

        # Adjust End on a copy so the caller's frame is left untouched.
        # NOTE(review): the -1 presumably converts half-open ends to the
        # closed ends expected by the external tool - confirm.
        R_df = df.copy()
        R_df.End = R_df.End - 1
        R_df.to_csv(f1, sep="\t", index=False)

        # stderr of the external command is discarded.
        cmd = coverage_cmd.format(f1, outfile) + " 2>/dev/null"

        subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode()

        result = pd.read_csv(outfile, sep="\t")[["Runs.value", "Values.value"]]
        result.columns = "Runs Values".split()

        # Prepend a run of length 1 with value 0 to the external result.
        # NOTE(review): presumably compensates for a coordinate offset
        # between the two tools - confirm.
        leading_run = pd.DataFrame(index=[0], data={"Runs": 1, "Values": 0})
        result = pd.concat([leading_run, result], ignore_index=True)
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", c)
    print("s4vectors result\n", s4vectors_result)
    print(str(c == s4vectors_result) + " " * 10, c == s4vectors_result)

    assert np.all(np.equal(c.runs, s4vectors_result.runs))
    assert np.all(np.equal(c.values, s4vectors_result.values))
예제 #15
0
def merge_runs(s):
    """Binarize every Rle in ``s`` and return the results as a PyRles.

    Negative values are set to 0 and positive values to 1, after which a new
    Rle is constructed from the runs and the modified values.

    Note: the values of the Rles in ``s`` are modified in place.
    """
    binarized = {}
    for key, rle in s.items():
        rle.values[rle.values < 0] = 0
        rle.values[rle.values > 0] = 1
        binarized[key] = Rle(rle.runs, rle.values)

    return PyRles(binarized)
예제 #16
0
def test_subtract_result_same_start(chip, background):
    """Check that ``chip - background`` equals a fixed expected Rle."""

    print(chip)
    print(background)

    difference = chip - background

    print(difference)

    expected = Rle([1, 2], [-1, 1])
    assert difference == expected
예제 #17
0
def sum_background(background):
    """Add up all values of ``background`` into a single PyRles.

    The accumulator starts as a PyRles holding one zero-valued run of length 1
    under the first key of the first entry of ``background``; every value of
    ``background`` is then added to it with ``+=``.

    Raises:
        IndexError: if ``background`` or its first entry has no keys.
    """

    file_names = list(background.keys())
    rle_keys = list(background[file_names[0]].keys())

    # Zero-valued seed so the additions below have something to start from.
    total = PyRles({rle_keys[0]: Rle([1], [0])})

    for rles in background.values():
        total += rles

    return total
예제 #18
0
def sum_chip(chip, where):
    """Add up the entries that ``get_locs(chip, where)`` returns into one PyRles.

    The accumulator starts as a PyRles holding one zero-valued run of length 1
    under the first key of the first entry; every entry is then added to it
    with ``+=``.

    Raises:
        IndexError: if ``get_locs`` returns nothing or the first entry has no
            keys.
    """

    selected = list(get_locs(chip, where).values())

    rle_keys = list(selected[0].keys())

    # Zero-valued seed so the additions below have something to start from.
    total = PyRles({rle_keys[0]: Rle([1], [0])})

    for rles in selected:
        total += rles

    return total
예제 #19
0
def binary_operation(operation, self, other, nb_cpu=1):
    """Apply an arithmetic operation per chromosome to two Rle collections.

    Chromosomes present in only one operand are combined with a single
    zero-valued run as long as the existing Rle.

    Args:
        operation: one of ``"div"``, ``"mul"``, ``"add"`` or ``"sub"``.
        self: left operand; must expose ``stranded`` and ``rles``.
        other: right operand; must expose ``stranded`` and ``rles``.
        nb_cpu: when greater than 1, ``ray`` is imported and initialised with
            that many CPUs.

    Returns:
        ``rd.RleDict`` mapping each chromosome to the result of the operation.
    """

    implementations = {"div": __div, "mul": __mul, "add": __add, "sub": __sub}
    func, get = rd.get_multithreaded_funcs(implementations[operation], nb_cpu)

    if nb_cpu > 1:
        import ray
        with suppress_stdout_stderr():
            ray.init(num_cpus=nb_cpu)

    if self.stranded != other.stranded:
        self, other = ensure_both_or_none_stranded(self, other)

    in_both, only_in_self, only_in_other = chromosomes_in_both_self_other(
        self, other)

    pending = []

    for chromosome in in_both:
        pending.append(
            func.remote(self.rles[chromosome], other.rles[chromosome]))

    for chromosome in only_in_self:
        present = self.rles[chromosome]
        filler = Rle([np.sum(present.runs)], [0])
        pending.append(func.remote(present, filler))

    for chromosome in only_in_other:
        present = other.rles[chromosome]
        filler = Rle([np.sum(present.runs)], [0])
        pending.append(func.remote(filler, present))

    # Results come back in submission order, which matches this key order.
    chromosomes = in_both + only_in_self + only_in_other
    rles = dict(zip(chromosomes, get(pending)))

    return rd.RleDict(rles)
예제 #20
0
def test_subset_coverage(runlengths, interval):
    """Compare slicing an Rle with the result of an external command.

    The Rle built from ``runlengths`` is sliced with ``[start:end]``. The same
    run lengths are written to a tab-separated file and passed, together with
    ``start + 1`` and ``end``, to the shell command built from
    ``rle_operation_cmd``; the file it writes is read back as an Rle and
    compared with the slice.

    Args:
        runlengths: frame with ``Runs`` and ``Values`` columns.
        interval: ``(start, end)`` pair used for the slice.

    Raises:
        subprocess.CalledProcessError: if the external command exits non-zero.
    """

    start, end = interval

    print("runlengths\n", runlengths)

    r = Rle(runlengths.Runs, runlengths.Values)

    result_pyranges = r[start:end]

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)
        runlengths.to_csv(f1, sep="\t", index=False)

        # NOTE(review): start + 1 presumably converts the 0-based slice start
        # to the 1-based position expected by the external tool - confirm.
        # stderr is not redirected; append " 2>/dev/null" to silence it.
        cmd = rle_operation_cmd.format(f1, start + 1, end, outfile)
        print(cmd)

        subprocess.check_output(cmd, shell=True,
                                executable="/bin/bash").decode()

        # The output must be read before the temporary directory is removed.
        result = pd.read_csv(outfile, sep="\t")
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs,
                       s4vectors_result.runs,
                       equal_nan=False)
    assert np.allclose(result_pyranges.values,
                       s4vectors_result.values,
                       equal_nan=True)
예제 #21
0
def zscores(x, y, ratio=1):
    """Compute per-key z-scores of ``x`` against ``y``.

    For every key ``k`` of ``x + y`` the score is
    ``(ratio * x[k] - y[k]) / sqrt(ratio * (x + y)[k])``. The result is passed
    through ``numbers_only()``, negative values are set to 0, and it is then
    passed through ``defragment()``.

    Args:
        x: first Rle collection.
        y: second Rle collection.
        ratio: scaling factor. Either a single number, or a dict that is
            indexed with the second element of each key
            (NOTE(review): presumably the strand - confirm against callers).

    Returns:
        PyRles mapping each key of ``x + y`` to its z-score Rle.
    """

    new_pyrle = {}
    for k, v in (x + y).items():

        _ratio = ratio[k[1]] if isinstance(ratio, dict) else ratio

        difference = (_ratio * x[k]) - y[k]

        # Silence floating point warnings (e.g. division by zero) only for
        # this computation; errstate restores the caller's settings
        # afterwards, even if an exception is raised.
        with np.errstate(all="ignore"):
            denominator = Rle(v.runs,
                              np.nan_to_num(np.sqrt((_ratio * v).values)))
            zs = difference / denominator

        zs = zs.numbers_only()
        zs.values[zs.values < 0] = 0
        zs = zs.defragment()
        new_pyrle[k] = zs

    return PyRles(new_pyrle)
예제 #22
0
파일: test.py 프로젝트: pyranges/pyrle
# Scratch script: print the beginning of the chr1 coverage of the
# "epigenome_roadmap" dataset, then stop.
import pyranges as pr
gr = pr.load_dataset("epigenome_roadmap")
rle = gr["chr1"].coverage()
# Show the first 20 run lengths and the first 20 values of the coverage.
print(list(rle.runs)[:20])
print(list(rle.values)[:20])

# A bare `raise` with no active exception raises RuntimeError, so the script
# stops here and nothing below this line is executed.
raise
import numpy as np
from pyrle import Rle
import pandas as pd

# r1: runs and values are both [1, 2, 3, 4].
r = pd.Series([1, 2, 3, 4], dtype=np.int16)
# v = pd.Series([-1, 2.3, 3, 4.976], dtype=np.float)
r1 = Rle(r, r)

# r2: runs and values are both [2, 4, 6, 8].
r2 = Rle(r * 2, r * 2)

# Reference console output for the addition below (NOTE(review): presumably
# from an R session - confirm). Its r2 has the same runs and values as r2
# here; its r4 has the runs and values of r1 followed by a zero-valued run
# of length 10.
# > r2
# numeric-Rle of length 20 with 4 runs
#   Lengths: 2 4 6 8
#   Values : 2 4 6 8
# > r4
# numeric-Rle of length 20 with 5 runs
#   Lengths:  1  2  3  4 10
#   Values :  1  2  3  4  0
# > r2 + r4
# numeric-Rle of length 20 with 7 runs
#   Lengths:  1  1  1  3  4  2  8
#   Values :  3  4  6  7 10  6  8

r3 = r1 + r2
예제 #23
0
def long_rle():
    """Return an Rle whose runs and values are both the int16 Series [6, 5, 4, 3, 2, 1]."""

    series = pd.Series([6, 5, 4, 3, 2, 1], dtype=np.int16)

    return Rle(series, series)
예제 #24
0
def simple_rle2():
    """Return an Rle whose runs and values are both twice the int16 Series [1, 2, 3, 4]."""

    base = pd.Series([1, 2, 3, 4], dtype=np.int16)
    runs = base * 2
    values = base * 2

    return Rle(runs, values)
예제 #25
0
def simple_rle():
    """Return an Rle whose runs and values are both the int16 Series [1, 2, 3, 4]."""

    series = pd.Series([1, 2, 3, 4], dtype=np.int16)

    return Rle(series, series)
예제 #26
0
def weird_rle():
    """Return an Rle with runs [10, 20, 30, 40] and values [1, 2, 3, 4]."""

    runs = [10, 20, 30, 40]
    values = [1, 2, 3, 4]

    return Rle(runs, values)
예제 #27
0
def expected_result():
    """Return an Rle parsed from whitespace-separated run and value strings."""

    run_text = "3    5    4    4    4    1    2"
    value_text = "21 16.8   32   64 58.4  7.3 21.9"

    runs = list(map(int, run_text.split()))
    values = list(map(float, value_text.split()))

    return Rle(runs, values)
예제 #28
0
def rle2():
    """Return an Rle with runs [8, 8, 7] and values [4.2, 8.0, 7.3]."""

    runs = [8, 8, 7]
    values = [4.2, 8.0, 7.3]

    return Rle(runs, values)
예제 #29
0
def rle1():
    """Return an Rle with runs [3, 9, 8, 1, 2] and values [5, 4, 8, 1, 3]."""

    runs = [3, 9, 8, 1, 2]
    values = [5, 4, 8, 1, 3]

    return Rle(runs, values)
예제 #30
0
def expected_result_weird():
    """Return an Rle with runs [1, 99] and values [1, 0]."""

    runs = [1, 99]
    values = [1, 0]

    return Rle(runs, values)