def test_project_into_range(self):
    # Eleven read extents as parallel start/end arrays; the last two lie
    # well to the right of the first window queried below.
    readStarts = array([1, 1, 1, 1, 1, 2, 2, 2, 2, 10, 20])
    readEnds = array([2, 3, 4, 5, 6, 3, 4, 5, 6, 15, 25])

    # Each case pairs a (winStart, winEnd) query with the per-position
    # values projectIntoRange is expected to return for it.
    cases = [
        ((1, 6), [5, 8, 6, 4, 2]),
        ((20, 26), [1, 1, 1, 1, 1, 0]),
    ]
    for (winStart, winEnd), expected in cases:
        observed = RQ.projectIntoRange(readStarts, readEnds,
                                       winStart, winEnd)
        assert_equal(True, all(observed == array(expected)))
def coverageInWindow(refWin, hits):
    """
    Compute the coverage contributed by `hits` across a reference window.

    `refWin` is a (referenceName, start, end) triple.  `hits` is an
    iterable of objects exposing `referenceName`, `referenceStart` and
    `referenceEnd`; only hits whose `referenceName` equals the window's
    name are counted.

    Returns the array produced by `projectIntoRange` for the matching
    hits.  When no hit matches, returns an all-zero unsigned array of
    length `end - start` instead.
    """
    winId, winStart, winEnd = refWin
    a = np.array([(hit.referenceStart, hit.referenceEnd)
                  for hit in hits
                  if hit.referenceName == winId])
    if len(a) == 0:
        # With no matching hits `a` is an empty 1-D array, so the column
        # slices below would raise IndexError; report zero coverage.
        return np.zeros(winEnd - winStart, dtype=np.uint)
    tStart = a[:, 0]
    tEnd = a[:, 1]
    return projectIntoRange(tStart, tEnd, winStart, winEnd)
def coverageInWindow(refWin, hits):
    """
    Compute the coverage contributed by `hits` across a reference window.

    `refWin` is a (referenceName, start, end) triple.  `hits` is an
    iterable of objects exposing `referenceName`, `referenceStart` and
    `referenceEnd`; only hits whose `referenceName` equals the window's
    name are counted.

    Returns the array produced by `projectIntoRange` for the matching
    hits.  When no hit matches, returns an all-zero unsigned array of
    length `end - start` instead.
    """
    winId, winStart, winEnd = refWin
    a = np.array([(hit.referenceStart, hit.referenceEnd)
                  for hit in hits
                  if hit.referenceName == winId])
    if len(a) == 0:
        # With no matching hits `a` is an empty 1-D array, so the column
        # slices below would raise IndexError; report zero coverage.
        return np.zeros(winEnd - winStart, dtype=np.uint)
    tStart = a[:, 0]
    tEnd = a[:, 1]
    return projectIntoRange(tStart, tEnd, winStart, winEnd)
def coverageInWindow(refWin, hits):
    """
    Coverage of a reference window by the hits that map to it.

    `refWin` unpacks to (referenceName, start, end).  Hits whose
    `referenceName` differs from the window's are ignored.  If nothing
    remains, an unsigned zero array of length `end - start` is returned;
    otherwise the hit extents are handed to `projectIntoRange`.
    """
    refName, lo, hi = refWin
    extents = np.array([(h.referenceStart, h.referenceEnd)
                        for h in hits
                        if h.referenceName == refName])

    # Nothing on this reference: zero coverage everywhere in the window.
    if len(extents) == 0:
        return np.zeros(hi - lo, dtype=np.uint)

    # Column 0 holds the hit starts, column 1 the hit ends.
    return projectIntoRange(extents[:, 0], extents[:, 1], lo, hi)
def coverageInWindow(refWin, hits):
    """
    Coverage of a reference window by the hits that map to it.

    `refWin` unpacks to (referenceName, start, end).  Hits whose
    `referenceName` differs from the window's are ignored.  If nothing
    remains, an unsigned zero array of length `end - start` is returned;
    otherwise the hit extents are handed to `projectIntoRange`.
    """
    refName, lo, hi = refWin
    extents = np.array([(h.referenceStart, h.referenceEnd)
                        for h in hits
                        if h.referenceName == refName])

    # Nothing on this reference: zero coverage everywhere in the window.
    if len(extents) == 0:
        return np.zeros(hi - lo, dtype=np.uint)

    # Column 0 holds the hit starts, column 1 the hit ends.
    return projectIntoRange(extents[:, 0], extents[:, 1], lo, hi)
def kSpannedIntervals(refWindow, k, start, end, minLength=0):
    """
    Locate the stretches of a window that are spanned by at least `k`
    reads.

    Arguments:
      `refWindow`: (id, start, end) triple describing the window
      `k`: how many reads must span an interval for it to be reported
      `start`, `end`: numpy arrays holding the read coordinates, each
          read occupying [start, end).  `start` must be sorted in
          ascending order.
      `minLength`: intervals shorter than this are dropped

    The result is a list of disjoint (start, end) tuples, in sorted
    order and expressed in the caller's coordinates.

    The search is greedy, so it is not guaranteed to be optimal in every
    case, though it is in the most common ones.
    """
    assert k >= 1
    _, offset, refEnd = refWindow

    # Clamp the reads to the window, then shift so the window begins at 0.
    relStart = np.clip(start, offset, refEnd) - offset
    relEnd = np.clip(end, offset, refEnd) - offset
    span = refEnd - offset

    positions = np.arange(span, dtype=int)
    coverage = projectIntoRange(relStart, relEnd, 0, span)

    found = []
    right = 0
    while right < span:
        # Left edge: the first position at or beyond `right` with
        # coverage of at least k.
        candidates = np.flatnonzero((positions >= right) & (coverage >= k))
        if len(candidates) == 0:
            break
        left = candidates[0]

        # Right edge: the k-th largest end among the reads that start at
        # or before `left`.
        ends = relEnd[relStart <= left]
        ends.sort()
        if len(ends) < k:
            break
        right = ends[-k]

        # Keep only long-enough intervals, translated back to the
        # caller's coordinate system.
        if right - left >= minLength:
            found.append((left + offset, right + offset))
    return found
def kSpannedIntervals(refWindow, k, start, end, minLength=0):
    """
    Locate the stretches of a window that are spanned by at least `k`
    reads.

    Arguments:
      `refWindow`: (id, start, end) triple describing the window
      `k`: how many reads must span an interval for it to be reported
      `start`, `end`: numpy arrays holding the read coordinates, each
          read occupying [start, end).  `start` must be sorted in
          ascending order.
      `minLength`: intervals shorter than this are dropped

    The result is a list of disjoint (start, end) tuples, in sorted
    order and expressed in the caller's coordinates.

    The search is greedy, so it is not guaranteed to be optimal in every
    case, though it is in the most common ones.
    """
    assert k >= 1
    _, offset, refEnd = refWindow

    # Clamp the reads to the window, then shift so the window begins at 0.
    relStart = np.clip(start, offset, refEnd) - offset
    relEnd = np.clip(end, offset, refEnd) - offset
    span = refEnd - offset

    positions = np.arange(span, dtype=int)
    coverage = projectIntoRange(relStart, relEnd, 0, span)

    found = []
    right = 0
    while right < span:
        # Left edge: the first position at or beyond `right` with
        # coverage of at least k.
        candidates = np.flatnonzero((positions >= right) & (coverage >= k))
        if len(candidates) == 0:
            break
        left = candidates[0]

        # Right edge: the k-th largest end among the reads that start at
        # or before `left`.
        ends = relEnd[relStart <= left]
        ends.sort()
        if len(ends) < k:
            break
        right = ends[-k]

        # Keep only long-enough intervals, translated back to the
        # caller's coordinate system.
        if right - left >= minLength:
            found.append((left + offset, right + offset))
    return found
def test_project_into_range(self):
    # Eleven read extents as parallel start/end arrays; the last two lie
    # well to the right of the first window queried below.
    readStarts = array([1, 1, 1, 1, 1, 2, 2, 2, 2, 10, 20])
    readEnds = array([2, 3, 4, 5, 6, 3, 4, 5, 6, 15, 25])

    # Each case pairs a (winStart, winEnd) query with the per-position
    # values projectIntoRange is expected to return for it.
    cases = [
        ((1, 6), [5, 8, 6, 4, 2]),
        ((20, 26), [1, 1, 1, 1, 1, 0]),
    ]
    for (winStart, winEnd), expected in cases:
        observed = RQ.projectIntoRange(readStarts, readEnds,
                                       winStart, winEnd)
        assert_equal(True, all(observed == array(expected)))