def computeFromSeq(seqs, subseqLength=1):
    """Label each position of the concatenated seqs with its source seq index.

    Given a collection of sequences, returns an array of which seq an
    element i came from in the concatenation of all the seqs. When
    ``subseqLength`` > 1, positions are sliding-window start positions, so
    each seq of length n contributes ``n - subseqLength + 1`` entries (and
    none at all if it is shorter than ``subseqLength``).

    Parameters
    ----------
    seqs: a sequence of sequences, or a single flat sequence of scalars
    subseqLength: int
        length of the subsequences being indexed; only applied when more
        than one seq is given (see the note in the body)

    Returns
    -------
    fromSeq: 1D integer array; fromSeq[i] is the index of the seq that
        position i of the concatenation belongs to

    >>> s = [[1,2],[3,4,5]]
    >>> computeFromSeq(s)
    array([0, 0, 1, 1, 1])
    >>> computeFromSeq(s, subseqLength=2)
    array([0, 1, 1])
    >>> computeFromSeq([1,2])
    array([0, 0])
    """
    # just one seq -> array of all 0s
    # NOTE(review): both single-seq shortcuts ignore subseqLength (they
    # return one entry per element, not per subsequence); kept as-is for
    # backward compatibility -- confirm whether callers rely on this.
    if isScalar(seqs[0]):
        return np.zeros(len(seqs), dtype=int)
    if len(seqs) == 1:
        return np.zeros(len(seqs[0]), dtype=int)

    # number of subsequence start positions contributed by each seq; a seq
    # shorter than subseqLength contributes none (clamped so it can't go
    # negative and corrupt the labelling of its neighbours)
    seqLens = [max(len(seq) - subseqLength + 1, 0) for seq in seqs]

    # repeat each seq's index once per position it contributes
    return np.repeat(np.arange(len(seqs)), seqLens)
def optimalAlignK(scores, m, k):
    """Pick the k indices with the largest total score that are >= m apart.

    Given an array of scores, return the indices I of the k best scores
    such that for all i, j in I, i != j -> |i - j| >= m; in other words,
    the indices must be at least m apart. "Best" means the selection whose
    summed score is maximal.

    Parameters
    ----------
    scores: 1D, array-like
        an ordered collection of scores
    m: int
        minimum spacing between reported indices
    k: int or array-like of int
        number of indices to return; every value must be >= 1

    Returns
    -------
    idxs: an array of indices for each k value specified (a single array
        or a list thereof, depending on whether one or several k values
        were given). Results are ordered by ascending k. Special cases:

        - if len(scores) <= m or the largest k is 1, a 0-d array holding
          argmax(scores) is returned, however k was specified
        - if all scores are <= 0, a warning is printed and a list with one
          empty list per k value is returned
        - a k value for which no selection has a positive total score
          yields an empty list in place of an index array

    Raises
    ------
    RuntimeError
        if scores is None or empty, or if k is None, empty, or contains a
        value < 1

    >>> s = [2,1,4,3]
    >>> optimalAlignK(s, 2, 1)
    array(2)
    >>> optimalAlignK(s, 2, 2)
    array([0, 2])
    >>> optimalAlignK(s, 3, 2)
    array([0, 3])
    >>> optimalAlignK(s, 4, 2)
    array(2)
    >>> optimalAlignK(s, 2, [1, 2])
    [array([2]), array([0, 2])]
    >>> s2 = [2,1,4,3,1,7,1]
    >>> optimalAlignK(s2, 2, [2, 3])
    [array([2, 5]), array([0, 2, 5])]
    >>> optimalAlignK(s2, 3, [1, 2, 3])
    [array([5]), array([2, 5]), array([0, 3, 6])]
    >>> s3 = [2,1,4,3,1,7,-99]
    >>> optimalAlignK(s3, 3, [3])
    []
    """
    # ------------------------ err handling and arg munging
    if scores is None or not len(scores):
        raise RuntimeError("No scores given!")
    if k is None:
        raise RuntimeError("Number of locations to return must be >= 1")

    # accept a bare int, a 0-d array, or any collection of ints
    k = np.sort(np.atleast_1d(k).ravel())
    # k is sorted, so checking the smallest entry validates all of them; a
    # value < 1 would otherwise wrap around to a negative column index
    if k.size == 0 or k[0] < 1:
        raise RuntimeError("Number of locations to return must be >= 1")
    kmax = int(k[-1])

    n = len(scores)
    if n <= m or kmax == 1:
        # only one location can fit (or is wanted): it's just the best score
        return np.array(np.argmax(scores), dtype=int)

    scores = np.asarray(scores)
    if np.all(scores <= 0.):
        print("Warning: optimalAlignK(): all scores <= 0")
        return [[] for _ in k]

    # ------------------------ find best score and parent for each k at each idx

    # c[i, j]: best total score of j+1 locations whose last location is i
    # parentIdxs[i, j]: the location preceding i in that selection (-1 = none)
    historyShape = (n, kmax)
    c = np.full(historyShape, -1.)  # cumulative score
    c[:m, 0] = scores[:m]  # first m points can only be the 1st location
    parentIdxs = np.full(historyShape, -1, dtype=int)  # previous best idx

    # best cumulative score (and where it ends) among all idxs that are at
    # least m behind the current one, for each selection size
    bestScores = np.zeros(kmax)
    bestIdxs = np.full(kmax, -1, dtype=int)
    for i in range(m, n):
        # idx i - m has just become far enough back to serve as a parent
        oldIdx = i - m
        betterForTheseK = c[oldIdx] > bestScores
        bestScores[betterForTheseK] = c[oldIdx, betterForTheseK]
        bestIdxs[betterForTheseK] = oldIdx

        # extend the best selection of size j by placing location j+1 at i
        parentIdxs[i, 1:] = bestIdxs[:-1]
        c[i, 1:] = bestScores[:-1] + scores[i]
        c[i, 1:] *= parentIdxs[i, 1:] >= 0  # only valid parents
        c[i, 0] = scores[i]

    # ------------------------ compute best set of idxs for each value of k
    allParents = []
    for kk in k:
        kIdx = int(kk) - 1
        parent = int(np.argmax(c[:, kIdx]))
        if c[parent, kIdx] <= 0.:
            # no selection of this size has a positive total score
            allParents.append([])
            continue
        # walk the parent pointers back from the final location; column 0
        # always has parent -1, which terminates the walk
        parents = []
        while parent >= 0:
            parents.append(parent)
            parent = parentIdxs[parent, kIdx]
            kIdx -= 1
        allParents.append(np.array(parents, dtype=int)[::-1])

    if len(k) == 1:
        return allParents[0]
    return allParents
def optimalAlignK(scores, m, k):
    """Pick the k indices with the largest total score that are >= m apart.

    Given an array of scores, return the indices I of the k best scores
    such that for all i, j in I, i != j -> |i - j| >= m; in other words,
    the indices must be at least m apart. "Best" means the selection whose
    summed score is maximal.

    Parameters
    ----------
    scores: 1D, array-like
        an ordered collection of scores
    m: int
        minimum spacing between reported indices
    k: int or array-like of int
        number of indices to return; every value must be >= 1

    Returns
    -------
    idxs: an array of indices for each k value specified (a single array
        or a list thereof, depending on whether one or several k values
        were given). Results are ordered by ascending k. Special cases:

        - if len(scores) <= m or the largest k is 1, a 0-d array holding
          argmax(scores) is returned, however k was specified
        - if all scores are <= 0, a warning is printed and a list with one
          empty list per k value is returned
        - a k value for which no selection has a positive total score
          yields an empty list in place of an index array

    Raises
    ------
    RuntimeError
        if scores is None or empty, or if k is None, empty, or contains a
        value < 1

    >>> s = [2,1,4,3]
    >>> optimalAlignK(s, 2, 1)
    array(2)
    >>> optimalAlignK(s, 2, 2)
    array([0, 2])
    >>> optimalAlignK(s, 3, 2)
    array([0, 3])
    >>> optimalAlignK(s, 4, 2)
    array(2)
    >>> optimalAlignK(s, 2, [1, 2])
    [array([2]), array([0, 2])]
    >>> s2 = [2,1,4,3,1,7,1]
    >>> optimalAlignK(s2, 2, [2, 3])
    [array([2, 5]), array([0, 2, 5])]
    >>> optimalAlignK(s2, 3, [1, 2, 3])
    [array([5]), array([2, 5]), array([0, 3, 6])]
    >>> s3 = [2,1,4,3,1,7,-99]
    >>> optimalAlignK(s3, 3, [3])
    []
    """
    # ------------------------ err handling and arg munging
    if scores is None or not len(scores):
        raise RuntimeError("No scores given!")
    if k is None:
        raise RuntimeError("Number of locations to return must be >= 1")

    # accept a bare int, a 0-d array, or any collection of ints
    k = np.sort(np.atleast_1d(k).ravel())
    # k is sorted, so checking the smallest entry validates all of them; a
    # value < 1 would otherwise wrap around to a negative column index
    if k.size == 0 or k[0] < 1:
        raise RuntimeError("Number of locations to return must be >= 1")
    kmax = int(k[-1])

    n = len(scores)
    if n <= m or kmax == 1:
        # only one location can fit (or is wanted): it's just the best score
        return np.array(np.argmax(scores), dtype=int)

    scores = np.asarray(scores)
    if np.all(scores <= 0.):
        print("Warning: optimalAlignK(): all scores <= 0")
        return [[] for _ in k]

    # ------------------------ find best score and parent for each k at each idx

    # c[i, j]: best total score of j+1 locations whose last location is i
    # parentIdxs[i, j]: the location preceding i in that selection (-1 = none)
    historyShape = (n, kmax)
    c = np.full(historyShape, -1.)  # cumulative score
    c[:m, 0] = scores[:m]  # first m points can only be the 1st location
    parentIdxs = np.full(historyShape, -1, dtype=int)  # previous best idx

    # best cumulative score (and where it ends) among all idxs that are at
    # least m behind the current one, for each selection size
    bestScores = np.zeros(kmax)
    bestIdxs = np.full(kmax, -1, dtype=int)
    for i in range(m, n):
        # idx i - m has just become far enough back to serve as a parent
        oldIdx = i - m
        betterForTheseK = c[oldIdx] > bestScores
        bestScores[betterForTheseK] = c[oldIdx, betterForTheseK]
        bestIdxs[betterForTheseK] = oldIdx

        # extend the best selection of size j by placing location j+1 at i
        parentIdxs[i, 1:] = bestIdxs[:-1]
        c[i, 1:] = bestScores[:-1] + scores[i]
        c[i, 1:] *= parentIdxs[i, 1:] >= 0  # only valid parents
        c[i, 0] = scores[i]

    # ------------------------ compute best set of idxs for each value of k
    allParents = []
    for kk in k:
        kIdx = int(kk) - 1
        parent = int(np.argmax(c[:, kIdx]))
        if c[parent, kIdx] <= 0.:
            # no selection of this size has a positive total score
            allParents.append([])
            continue
        # walk the parent pointers back from the final location; column 0
        # always has parent -1, which terminates the walk
        parents = []
        while parent >= 0:
            parents.append(parent)
            parent = parentIdxs[parent, kIdx]
            kIdx -= 1
        allParents.append(np.array(parents, dtype=int)[::-1])

    if len(k) == 1:
        return allParents[0]
    return allParents