def test_inverse_div_mul_rles(df):
    """Check that dividing by an Rle and multiplying it back is the identity.

    Small integers are used on purpose: with small floating point values,
    ``(cv / cv2) * cv2`` may fail to compare equal to ``cv`` purely because
    of float rounding.

    ``df`` must provide the columns ``Runs``, ``Values`` and ``Values2``.
    """
    print(df)

    total_length = df.Runs.sum()
    cv = Rle(df.Runs.values, df.Values.values)

    # The second Rle reuses the same run lengths in a shuffled order, so both
    # operands cover the same total length.
    newruns = np.random.permutation(df.Runs.values)
    print("newruns", newruns)
    cv2 = Rle(newruns, df.Values2.values)

    print("cv\n", cv)
    print("cv2\n", cv2)

    assert total_length == np.sum(cv.runs)
    assert total_length == np.sum(cv2.runs)

    quotient = cv / cv2
    restored = quotient * cv2

    print("result\n", quotient)
    print("result2\n", restored)

    runs_match = np.equal(restored.runs, cv.runs)
    assert np.all(runs_match)
    assert np.allclose(restored.values, cv.values)
def test_rle(runlengths, runlengths2, operation):
    """Compare an Rle arithmetic operation against an external reference.

    The two inputs are combined with the pyranges ``Rle`` operator selected
    by ``operation`` and, independently, by the shell command built from
    ``rle_operation_cmd`` (labelled "s4vectors" in the output). Both results
    must agree on runs and values.

    Args:
        runlengths: DataFrame with ``Runs`` and ``Values`` columns.
        runlengths2: DataFrame with ``Runs`` and ``Values`` columns.
        operation: One of ``"+"``, ``"-"``, ``"*"``, ``"/"``.
    """
    # It is only compared against bioc with integers because float equality
    # is hard for both libraries; they sometimes end up with slightly
    # different runlengths. For example
    # [257.492105948544, 257.492075430654] gives
    #
    # pyranges result
    # +--------+---------+---------+---------+
    # | Runs   | 1       | 1       | 1       |
    # |--------+---------+---------+---------|
    # | Values | 257.492 | 257.492 | 257.492 |
    # +--------+---------+---------+---------+
    # Rle of length 3 containing 3 elements
    #
    # s4vectors result
    # +--------+---------+---------+
    # | Runs   | 2       | 1       |
    # |--------+---------+---------|
    # | Values | 257.492 | 257.492 |
    # +--------+---------+---------+
    # Rle of length 3 containing 2 elements

    pyop = {"+": "__add__", "-": "__sub__", "*": "__mul__", "/": "__truediv__"}[operation]

    print("runlengths", runlengths)
    print("runlengths2", runlengths2)

    r = Rle(runlengths.Runs, runlengths.Values)
    r2 = Rle(runlengths2.Runs, runlengths2.Values)

    print("r\n", r)
    print("r2\n", r2)

    m = getattr(r, pyop)
    result_pyranges = m(r2)

    print("pyranges result\n", result_pyranges)

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        f2 = "{}/f2.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)

        runlengths.to_csv(f1, sep="\t", index=False)
        runlengths2.to_csv(f2, sep="\t", index=False)

        cmd = rle_operation_cmd.format(f1, f2, operation, outfile) + " 2>/dev/null"

        # Only the exit status matters (non-zero raises CalledProcessError);
        # the reference result is read back from ``outfile``.
        subprocess.check_output(cmd, shell=True, executable="/bin/bash")

        result = pd.read_table(outfile)
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False)
    assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True)
def expected_result():
    """Return the expected PyRles with one Rle each for chr1 and chr2."""
    per_chromosome = {
        "chr1": Rle([1, 2, 3, 10], [2, 4, 5, 3]),
        "chr2": Rle([4, 1, 4, 2, 9], [-0.09, -0.08, -0.18, -0.28, 0.03]),
    }
    return PyRles(per_chromosome)
def grle2():
    """Return a PyRles with an integer Rle on chr1 and a negative float Rle on chr2."""
    per_chromosome = {
        "chr1": Rle([1, 2, 3], [1, 2, 3]),
        "chr2": Rle([5, 4, 2], [-0.1, -0.2, -0.3]),
    }
    return PyRles(per_chromosome)
def d1():
    """Return a plain dict mapping chr1/chr2 to Rle objects (chr1 is printed)."""
    chr1_rle = Rle([1, 5, 10], [1, 2, 3])
    chr2_rle = Rle([4, 7, 9], [0.01, 0.02, 0.03])

    print(chr1_rle)

    return {"chr1": chr1_rle, "chr2": chr2_rle}
def grle1():
    """Return a PyRles for chr1/chr2, printing the source dict and the result."""
    chr1_rle = Rle([1, 5, 10], [1, 2, 3])
    chr2_rle = Rle([4, 7, 9], [0.01, 0.02, 0.03])
    per_chromosome = {"chr1": chr1_rle, "chr2": chr2_rle}
    print(per_chromosome)

    pyrles = PyRles(per_chromosome)
    print(pyrles)

    return pyrles
def test_rle(runlengths, runlengths2, operation):
    """Compare an Rle arithmetic operation against an external reference.

    The two inputs are combined with the pyranges ``Rle`` operator selected
    by ``operation`` and, independently, by the shell command built from
    ``rle_operation_cmd`` (labelled "s4vectors" in the output). Both results
    must agree on runs and values.

    Args:
        runlengths: DataFrame with ``Runs`` and ``Values`` columns.
        runlengths2: DataFrame with ``Runs`` and ``Values`` columns.
        operation: One of ``"+"``, ``"-"``, ``"*"``, ``"/"``.
    """
    # Only compared against bioc with integers because float equality is
    # hard for both libraries; they sometimes end up with slightly different
    # runlengths when consecutive values are almost equal.

    pyop = {
        "+": "__add__",
        "-": "__sub__",
        "*": "__mul__",
        "/": "__truediv__",
    }[operation]

    print("runlengths", runlengths)
    print("runlengths2", runlengths2)

    r = Rle(runlengths.Runs, runlengths.Values)
    r2 = Rle(runlengths2.Runs, runlengths2.Values)

    print("r\n", r)
    print("r2\n", r2)

    m = getattr(r, pyop)
    result_pyranges = m(r2)

    print("pyranges result\n", result_pyranges)

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        f2 = "{}/f2.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)

        runlengths.to_csv(f1, sep="\t", index=False)
        runlengths2.to_csv(f2, sep="\t", index=False)

        # stderr is not redirected to /dev/null, so diagnostics from the
        # command stay visible in the test output.
        cmd = rle_operation_cmd.format(f1, f2, operation, outfile)

        # Only the exit status matters (non-zero raises CalledProcessError);
        # the reference result is read back from ``outfile``.
        subprocess.check_output(cmd, shell=True, executable="/bin/bash")

        result = pd.read_csv(outfile, sep="\t")
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False)
    assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True)
def init_chip(coverage, flank_distance):
    """Build left-shifted, right-shifted and unshifted coverage per replicate.

    For every replicate three variants of each run-length encoding are made:

    * ``"left"``: the rightmost ``flank_distance`` positions are chopped off
      and a zero-valued run of length ``flank_distance`` is prepended, i.e.
      the remaining signal is shifted ``flank_distance`` to the right.
    * ``"right"``: the leftmost ``flank_distance`` positions are chopped off
      and a zero-valued run of length ``flank_distance`` is appended.
    * ``"center"``: the run-length encoding is used unchanged.

    Args:
        coverage: Mapping from replicate to a mapping that is indexed by
            ``(chromosome, strand)`` and yields ``Rle`` objects.
        flank_distance: Number of positions to shift by.

    Returns:
        dict mapping ``(replicate, location)`` to a ``PyRles``, where
        ``location`` is one of ``"left"``, ``"right"``, ``"center"``.
    """
    replicates = list(coverage)

    outdict = {}
    for replicate, location in product(replicates, ["left", "right", "center"]):
        cvg = coverage[replicate]

        # Every chromosome shows up once per strand key; deduplicate (keeping
        # first-seen order) so each (chromosome, strand) is processed once.
        chromosomes = list(dict.fromkeys(k[0] for k in cvg.keys()))

        _cvg = {}
        for direction in ["+", "-"]:
            for c in chromosomes:
                try:
                    rle = cvg[c, direction]
                except LookupError:
                    # No coverage for this chromosome/strand combination.
                    continue

                if location == "left":
                    length = np.sum(rle.runs) - flank_distance
                    newrle = rle[:length]
                    # np.int64 instead of np.long: the latter does not exist
                    # in NumPy 1.24-1.26.
                    newruns = np.concatenate(
                        [np.array([flank_distance], dtype=np.int64), newrle.runs])
                    newvals = np.concatenate(
                        [np.array([0], dtype=np.double), newrle.values])
                    _cvg[c, direction] = Rle(newruns, newvals)
                elif location == "right":
                    newrle = Rle(rle.runs, rle.values)
                    # could be optimized by ending getitem after found start...
                    newrle = newrle[flank_distance:]
                    newruns = np.concatenate(
                        [newrle.runs, np.array([flank_distance], dtype=np.int64)])
                    newvals = np.concatenate(
                        [newrle.values, np.array([0], dtype=np.double)])
                    _cvg[c, direction] = Rle(newruns, newvals)
                else:
                    _cvg[c, direction] = rle

        outdict[replicate, location] = PyRles(_cvg)

    return outdict
def test_inverse_add_sub_rles(df):
    """Check that adding an Rle and subtracting it again is the identity.

    Small integers are used so that float rounding cannot make
    ``(cv + cv2) - cv2`` differ from ``cv``.

    ``df`` must provide the columns ``Runs``, ``Values`` and ``Values2``.
    """
    cv = Rle(df.Runs.values, df.Values.values)

    # Same run lengths in shuffled order, so both operands have equal length.
    shuffled_runs = np.random.permutation(df.Runs.values)
    cv2 = Rle(shuffled_runs, df.Values2.values)

    summed = cv + cv2
    restored = summed - cv2

    runs_match = np.equal(restored.runs, cv.runs)
    assert np.all(runs_match)
    assert np.allclose(restored.values, cv.values)
def test_coverage_simple(simple):
    """coverage() weighted by the ``Score`` column gives runs [1, 2], values [-1, 1]."""
    observed = coverage(simple, value_col="Score")
    print(observed)

    expected = Rle([1, 2], [-1, 1])
    assert observed == expected
def coverage(df, **kwargs):
    """Compute a run-length encoded coverage from interval starts and ends.

    Every interval contributes ``+value`` at its ``Start`` and ``-value`` at
    its ``End``; the position-sorted events are handed to ``_coverage``.

    Args:
        df: DataFrame with ``Start`` and ``End`` columns.
        **kwargs: ``value_col`` optionally names a column of ``df`` whose
            values (cast to float64) weight the intervals. Without it every
            interval has weight 1.

    Returns:
        An ``Rle`` built from the runs and values returned by ``_coverage``.
    """
    value_col = kwargs.get("value_col")

    if value_col:
        weights = df[value_col].astype(np.float64).values
    else:
        weights = np.ones(len(df))

    column_order = "Position Value".split()
    starts_df = pd.DataFrame({"Position": df.Start, "Value": weights})[column_order]
    ends_df = pd.DataFrame({"Position": df.End, "Value": -1 * weights})[column_order]

    # mergesort is stable: events at equal positions keep their concat order
    # (all starts first, then all ends).
    events = pd.concat([starts_df, ends_df], ignore_index=True)
    events = events.sort_values("Position", kind="mergesort")

    # Positions are widened from int32 to int64 before calling _coverage.
    if events.Position.dtype.name == "int32":
        events["Position"] = events["Position"].astype(np.int64)

    runs, values = _coverage(events.Position.values, events.Value.values)

    return Rle(runs, values)
def rle2():
    """Return an Rle with int16 runs [2, 1] and int16 values [0, 1]."""
    run_lengths = pd.Series([2, 1], dtype=np.int16)
    run_values = pd.Series([0, 1], dtype=np.int16)
    return Rle(run_lengths, run_values)
def shorter_rle():
    """Return an Rle with runs [1, 2, 3] and values [1, 0, 1]."""
    return Rle(pd.Series([1, 2, 3]), pd.Series([1, 0, 1]))
def test_coverage(df):
    """Compare PyRanges coverage on chr1 against an external reference.

    The intervals in ``df`` are written to a temporary TSV file, the shell
    command built from ``coverage_cmd`` is run on it, and its output
    (labelled "s4vectors") must match the pyranges coverage exactly.

    Args:
        df: DataFrame of intervals accepted by ``pr.PyRanges``; must have an
            ``End`` column.
    """
    print("---" * 10)
    p = pr.PyRanges(df)
    print("pyranges\n", p)

    c = p.coverage()["chr1"]

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)

        # Work on a copy: the End adjustment must not leak into the caller's
        # DataFrame.
        # NOTE(review): End - 1 presumably converts half-open ends to the
        # closed ends the reference tool expects -- confirm.
        R_df = df.copy()
        R_df.End = R_df.End - 1
        R_df.to_csv(f1, sep="\t", index=False)

        cmd = coverage_cmd.format(f1, outfile) + " 2>/dev/null"

        # Only the exit status matters (non-zero raises CalledProcessError);
        # the reference result is read back from ``outfile``.
        subprocess.check_output(cmd, shell=True, executable="/bin/bash")

        result = pd.read_table(outfile)[["Runs.value", "Values.value"]]
        result.columns = "Runs Values".split()

        # A leading run of length 1 with value 0 is prepended to the
        # reference before it is compared.
        leading_run = pd.DataFrame(index=[0], data={"Runs": 1, "Values": 0})
        result = pd.concat([leading_run, result], ignore_index=True)
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", c)
    print("s4vectors result\n", s4vectors_result)

    print(str(c == s4vectors_result) + " " * 10, c == s4vectors_result)

    assert np.all(np.equal(c.runs, s4vectors_result.runs))
    assert np.all(np.equal(c.values, s4vectors_result.values))
def merge_runs(s):
    """Binarize every Rle in ``s`` and rebuild it from the result.

    Negative values become 0 and positive values become 1. The value arrays
    of the incoming Rle objects are modified in place before a new ``Rle`` is
    constructed from the same runs.

    Args:
        s: Mapping (anything with ``.items()``) from key to ``Rle``.

    Returns:
        A ``PyRles`` holding the rebuilt Rle for every key of ``s``.
    """
    binarized = {}

    for key, rle in s.items():
        # Order matters only for readability; the two masks are disjoint.
        rle.values[rle.values < 0] = 0
        rle.values[rle.values > 0] = 1

        binarized[key] = Rle(rle.runs, rle.values)

    return PyRles(binarized)
def test_subtract_result_same_start(chip, background):
    """``chip - background`` must equal the Rle with runs [1, 2], values [-1, 1]."""
    print(chip)
    print(background)

    difference = chip - background
    print(difference)

    expected = Rle([1, 2], [-1, 1])
    assert difference == expected
def sum_background(background):
    """Accumulate all entries of ``background`` into a single PyRles.

    The accumulator starts as a PyRles holding one zero-valued run of length
    1 under the first key of the first entry; every value of ``background``
    is then added to it with ``+=``.

    Args:
        background: Non-empty mapping whose values support ``.keys()`` and
            can be added to a ``PyRles``.

    Returns:
        The accumulated ``PyRles``.
    """
    some_file = list(background.keys())[0]
    seed_key = list(background[some_file].keys())[0]

    # Zero-valued seed so the first addition has something to add to.
    total = PyRles({seed_key: Rle([1], [0])})

    for file_coverage in background.values():
        total += file_coverage

    return total
def sum_chip(chip, where):
    """Accumulate the entries selected by ``get_locs(chip, where)`` into one PyRles.

    The accumulator starts as a PyRles holding one zero-valued run of length
    1 under the first key of the first selected entry; every selected entry
    is then added to it with ``+=``.

    Args:
        chip: Passed through to ``get_locs``.
        where: Passed through to ``get_locs``.

    Returns:
        The accumulated ``PyRles``.
    """
    selected = list(get_locs(chip, where).values())

    seed_key = list(selected[0].keys())[0]

    # Zero-valued seed so the first addition has something to add to.
    total = PyRles({seed_key: Rle([1], [0])})

    for loc_coverage in selected:
        total += loc_coverage

    return total
def binary_operation(operation, self, other, nb_cpu=1):
    """Apply an arithmetic operation chromosome-wise to two Rle collections.

    Chromosomes present in only one operand are paired with a single
    zero-valued run spanning the full length of the existing Rle.

    Args:
        operation: One of ``"div"``, ``"mul"``, ``"add"``, ``"sub"``.
        self: Left operand; must expose ``.stranded`` and ``.rles``.
        other: Right operand; must expose ``.stranded`` and ``.rles``.
        nb_cpu: When greater than 1, ``ray`` is imported and initialised with
            this many CPUs.

    Returns:
        An ``rd.RleDict`` with one result per chromosome key.
    """
    dispatch = {"div": __div, "mul": __mul, "add": __add, "sub": __sub}
    func, get = rd.get_multithreaded_funcs(dispatch[operation], nb_cpu)

    if nb_cpu > 1:
        import ray
        with suppress_stdout_stderr():
            ray.init(num_cpus=nb_cpu)

    if self.stranded != other.stranded:
        self, other = ensure_both_or_none_stranded(self, other)

    in_both, only_in_self, only_in_other = chromosomes_in_both_self_other(
        self, other)

    def zeros_like(rle):
        # One zero-valued run as long as ``rle``.
        return Rle([np.sum(rle.runs)], [0])

    both_results = [
        func.remote(self.rles[c], other.rles[c]) for c in in_both
    ]
    self_results = [
        func.remote(self.rles[c], zeros_like(self.rles[c]))
        for c in only_in_self
    ]
    other_results = [
        func.remote(zeros_like(other.rles[c]), other.rles[c])
        for c in only_in_other
    ]

    keys = in_both + only_in_self + only_in_other
    rles = dict(zip(keys, get(both_results + self_results + other_results)))

    return rd.RleDict(rles)
def test_subset_coverage(runlengths, interval):
    """Compare slicing an Rle against an external reference.

    ``Rle[start:end]`` must give the same runs and values as the output of
    the shell command built from ``rle_operation_cmd`` (labelled "s4vectors").

    Args:
        runlengths: DataFrame with ``Runs`` and ``Values`` columns.
        interval: ``(start, end)`` pair used as the slice bounds.
    """
    start, end = interval

    print("runlengths\n", runlengths)

    r = Rle(runlengths.Runs, runlengths.Values)
    result_pyranges = r[start:end]

    with tempfile.TemporaryDirectory() as temp_dir:
        f1 = "{}/f1.txt".format(temp_dir)
        outfile = "{}/result.txt".format(temp_dir)
        runlengths.to_csv(f1, sep="\t", index=False)

        # The reference command receives start + 1.
        # NOTE(review): presumably because it uses 1-based, inclusive
        # coordinates -- confirm. stderr is not redirected, so diagnostics
        # from the command stay visible in the test output.
        cmd = rle_operation_cmd.format(f1, start + 1, end, outfile)
        print(cmd)

        # Only the exit status matters (non-zero raises CalledProcessError);
        # the reference result is read back from ``outfile``.
        subprocess.check_output(cmd, shell=True, executable="/bin/bash")

        result = pd.read_csv(outfile, sep="\t")
        s4vectors_result = Rle(result.Runs, result.Values)

    print("pyranges result\n", result_pyranges)
    print("s4vectors result\n", s4vectors_result)

    assert np.allclose(result_pyranges.runs, s4vectors_result.runs, equal_nan=False)
    assert np.allclose(result_pyranges.values, s4vectors_result.values, equal_nan=True)
def zscores(x, y, ratio=1):
    """Compute non-negative z-like scores of ``x`` against ``y`` per key.

    For every key ``k`` of ``x + y`` the score is::

        (ratio * x[k] - y[k]) / sqrt(ratio * (x + y)[k])

    Non-finite results are removed with ``numbers_only()``, negative scores
    are clamped to 0 and the result is defragmented.

    Args:
        x: Rle collection supporting ``+`` and indexing by key.
        y: Rle collection supporting ``+`` and indexing by key.
        ratio: Scalar applied to every key, or a dict looked up with ``k[1]``
            (the second element of each key).

    Returns:
        A ``PyRles`` with one score Rle per key of ``x + y``.
    """
    new_pyrle = {}

    for k, v in (x + y).items():
        _ratio = ratio[k[1]] if isinstance(ratio, dict) else ratio

        difference = (_ratio * x[k]) - y[k]

        # Silence floating point warnings (sqrt of negatives, division by
        # zero) only for this computation; errstate restores the caller's
        # NumPy error settings afterwards, even if an exception is raised.
        with np.errstate(all="ignore"):
            denominator = Rle(v.runs, np.nan_to_num(np.sqrt((_ratio * v).values)))
            zs = difference / denominator

        zs = zs.numbers_only()
        zs.values[zs.values < 0] = 0
        zs = zs.defragment()

        new_pyrle[k] = zs

    return PyRles(new_pyrle)
# Scratch script: prints the first 20 runs and values of the chr1 coverage of
# the "epigenome_roadmap" example dataset, then stops.
import pyranges as pr

gr = pr.load_dataset("epigenome_roadmap")
rle = gr["chr1"].coverage()
print(list(rle.runs)[:20])
print(list(rle.values)[:20])

# NOTE(review): a bare `raise` with no active exception fails with a
# RuntimeError, so nothing below this line runs. It looks like a deliberate
# "stop here" marker left in while debugging -- confirm before removing.
raise

import numpy as np
from pyrle import Rle
import pandas as pd

r = pd.Series([1, 2, 3, 4], dtype=np.int16)
# v = pd.Series([-1, 2.3, 3, 4.976], dtype=np.float)
r1 = Rle(r, r)
r2 = Rle(r * 2, r * 2)

# Reference session output (R console format) for the same kind of addition:
# > r2
# numeric-Rle of length 20 with 4 runs
#   Lengths: 2 4 6 8
#   Values : 2 4 6 8
# > r4
# numeric-Rle of length 20 with 5 runs
#   Lengths: 1 2 3 4 10
#   Values : 1 2 3 4 0
# > r2 + r4
# numeric-Rle of length 20 with 7 runs
#   Lengths: 1 1 1 3 4 2 8
#   Values : 3 4 6 7 10 6 8

r3 = r1 + r2
def long_rle():
    """Return an Rle whose int16 runs and values are both [6, 5, 4, 3, 2, 1]."""
    descending = pd.Series([6, 5, 4, 3, 2, 1], dtype=np.int16)
    # The same Series serves as run lengths and as values.
    return Rle(descending, descending)
def simple_rle2():
    """Return an Rle whose runs and values are both the int16 series [1, 2, 3, 4] doubled."""
    base = pd.Series([1, 2, 3, 4], dtype=np.int16)
    doubled_runs = base * 2
    doubled_values = base * 2
    return Rle(doubled_runs, doubled_values)
def simple_rle():
    """Return an Rle whose int16 runs and values are both [1, 2, 3, 4]."""
    ascending = pd.Series([1, 2, 3, 4], dtype=np.int16)
    # The same Series serves as run lengths and as values.
    return Rle(ascending, ascending)
def weird_rle():
    """Return an Rle with runs [10, 20, 30, 40] and values [1, 2, 3, 4]."""
    run_lengths = [10, 20, 30, 40]
    run_values = [1, 2, 3, 4]
    return Rle(run_lengths, run_values)
def expected_result():
    """Return the expected Rle, parsed from space-separated runs and values."""
    run_lengths = list(map(int, "3 5 4 4 4 1 2".split()))
    run_values = list(map(float, "21 16.8 32 64 58.4 7.3 21.9".split()))
    return Rle(run_lengths, run_values)
def rle2():
    """Return an Rle with runs [8, 8, 7] and values [4.2, 8.0, 7.3]."""
    run_lengths = [8, 8, 7]
    run_values = [4.2, 8.0, 7.3]
    return Rle(run_lengths, run_values)
def rle1():
    """Return an Rle with runs [3, 9, 8, 1, 2] and values [5, 4, 8, 1, 3]."""
    run_lengths = [3, 9, 8, 1, 2]
    run_values = [5, 4, 8, 1, 3]
    return Rle(run_lengths, run_values)
def expected_result_weird():
    """Return the expected Rle: one run of value 1 followed by 99 of value 0."""
    run_lengths = [1, 99]
    run_values = [1, 0]
    return Rle(run_lengths, run_values)