def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
                  removeZeros=True, k=-1):
    """Compute squared Euclidean distances between all pairs of subsequences.

    Subsequences of `length` are extracted with
    `window.flattened_subseqs_of_length`, and for every dimension a square
    matrix of pairwise squared distances between that dimension's
    subsequences is written into one slice of the returned tensor.

    Parameters:
        seqs: a sequence (array) or collection of sequences; passed through
            `asListOrTuple`. Only `seqs[0].shape` is inspected to decide
            whether the data is multi-dimensional.
        length: subsequence length handed to the windowing helper.
        norm: normalization mode forwarded to the windowing helper. It is
            only used when the data is 1D or `tieDims` is true; the
            per-dimension path always uses 'each'.
        tieDims: if true, multi-column data is not split per column and a
            single distance matrix is produced.
        pad: if true, `length - 1` extra zero columns are appended to each
            distance matrix.
        removeZeros: if true, a row whose sum of squares is below 1e-6 gets
            `maxPossibleDist` as its distance to everything.
        k: if positive, only the k smallest entries of each column are kept;
            all other entries of that column are set to `maxPossibleDist`.

    Returns:
        (Dtensor, Xnorm) where Dtensor has shape
        (nDims, rowsPerDim, rowsPerDim + paddingLen) and Xnorm is the 2D
        array of subsequences returned by the windowing helper.
    """
    # NOTE(review): this function is defined a second time further down in
    # this file; the later definition replaces this one at import time.
    seqs = asListOrTuple(seqs)

    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(seqs, length,
                                                         norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # bypass flattening--each dim of each seq is treated as a separate
        # 1D seq; we end up with a long list whose elements are 1D vectors,
        # each of which was originally a column within some ND array in seqs
        #
        # note that this may do weird things if there's more than one seq
        # because the dims for each seq are sequential, rather than the seqs
        # for each dim
        #
        # list comprehensions (rather than map) so these stay real lists
        # on Python 3 as well
        separatedByDim = [colsAsList(X) for X in seqs]
        flatSeqs = flattenListOfLists(separatedByDim)
        flatSeqs = [v.flatten() for v in flatSeqs]  # col vects -> 1D arrays
        Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs, length,
                                                         norm='each')

    nSamples, m = Xnorm.shape
    # floor division: this value is used as an array dimension and as a
    # slice bound, so it must stay an int under true division too
    rowsPerDim = nSamples // nDims

    # diagnostics; written so the output is the same on Python 2 and 3
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    paddingLen = length - 1 if pad else 0

    # the padding columns are never written, so they stay 0
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # value used to mark "as far away as possible"
    # NOTE(review): presumably the bound for z-normalized rows of width m;
    # confirm against the normalization the windowing helper applies
    maxPossibleDist = 2**2 * m

    for dim in range(nDims):
        # extract subseqs associated with this dim
        minIdx = dim * rowsPerDim
        Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

        # compute dists from each subseq to all subseqs of this dim
        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue

            diffs = Xdim - row
            Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

        # only keep k lowest dists in each column
        if k > 0:
            for j in range(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm
def uniqueSubseqsInSignals(signal, length, maxDist, norm='each', tree=None):
    """Greedily group the subsequences of `signal` by nearest-neighbor distance.

    Subsequences are visited in order. Each one is compared against the
    subsequences already stored in a kd-tree: if its closest stored neighbor
    is nearer than `maxDist`, its start index is appended to that neighbor's
    occurrence list; otherwise it is stored in the tree as a new entry with
    its own occurrence list.

    Parameters:
        signal: input forwarded to `window.flattened_subseqs_of_length`.
        length: subsequence length forwarded to the windowing helper.
        maxDist: threshold compared (strictly) against the distance reported
            by the tree's `search_knn`.
        norm: normalization mode forwarded to the windowing helper.
        tree: optional existing kd-tree to search and extend; a new one is
            created via `kd.create` when None. The tree is modified in place.

    Returns:
        (signalOccurIdxs, Xnorm): a dict mapping the index of a stored
        subsequence to the list of start indices assigned to it, and the
        array of normalized subsequences. A key that was never registered as
        a new entry in this call (index 0, or a node that was already in a
        caller-supplied tree) lists only the later subsequences matched to it.

    Raises:
        AssertionError: if the tree returns no neighbor other than the query.
    """
    # NOTE(review): this function is defined a second time further down in
    # this file; the later definition replaces this one at import time.
    X, _, _ = window.flattened_subseqs_of_length(signal, length, norm=norm)
    Xnorm = zNormalizeRows(X, removeZeros=False)

    # init kd tree--we can't give it any data yet because we only want to
    # search through seqs that have been added to the dictionary
    if tree is None:
        width = Xnorm.shape[1]
        tree = kd.create(dimensions=width)

    signalOccurIdxs = {}
    tree.add(Xnorm[0], 0)
    # start counting at 1 since Xnorm[0] is already in the tree
    for startIdx, subseq in enumerate(Xnorm[1:], 1):
        if np.sum(subseq * subseq) < .001:  # ignore zero seqs
            continue

        neighbors = tree.search_knn(subseq, 2)

        # pick the closest neighbor that isn't the query itself; taking the
        # minimum explicitly makes this independent of the order in which
        # the tree reports its results
        neighborIdx = -1
        neighborDist = np.inf
        for node, dist in neighbors:
            idx = node.metadata
            if idx == startIdx:
                continue
            if neighborIdx < 0 or dist < neighborDist:
                neighborIdx = idx
                neighborDist = dist

        if neighborIdx < 0:
            print("ERROR: knn returned <2 neighbors...")
            print("Neighbors returned: %s" % (neighbors,))
            # explicit raise instead of assert(0) so it survives python -O
            raise AssertionError("no usable nearest neighbor found")

        if neighborDist < maxDist:
            # store that the subseq happened at this idx too; setdefault
            # keeps the list even when the neighbor has no entry yet, so
            # the occurrence is not silently dropped
            signalOccurIdxs.setdefault(neighborIdx, []).append(startIdx)
        else:
            # NOTE(review): this overwrites any existing entry at startIdx,
            # which can happen if a caller-supplied tree already contains a
            # node carrying this index
            signalOccurIdxs[startIdx] = [startIdx]
            tree.add(subseq, startIdx)

            # rebalance if startIdx is a power of 2, so we do so log(N) times
            # NOTE(review): the return value of rebalance() is ignored; if
            # the kd implementation returns a new root instead of rebalancing
            # in place, this call has no effect -- confirm
            if (startIdx & (startIdx - 1)) == 0:
                tree.rebalance()

    return signalOccurIdxs, Xnorm  # return Xnorm for convenience, although confusing...
def allZNormalizedSubseqs(seqs, length):
    """Return every subsequence of `length` in `seqs` after row normalization.

    The subsequences come from `window.flattened_subseqs_of_length` with
    norm='each' (only the first of its three return values is used) and are
    then passed through `zNormalizeRows` with removeZeros=False.
    """
    subseqs, _, _ = window.flattened_subseqs_of_length(
        seqs, length, norm='each')
    normalized = zNormalizeRows(subseqs, removeZeros=False)
    return normalized
def pairwiseDists(seqs, length, norm='each', tieDims=False, pad=True,
                  removeZeros=True, k=-1):
    """Compute squared Euclidean distances between all pairs of subsequences.

    Subsequences of `length` are extracted with
    `window.flattened_subseqs_of_length`, and for every dimension a square
    matrix of pairwise squared distances between that dimension's
    subsequences is written into one slice of the returned tensor.

    Parameters:
        seqs: a sequence (array) or collection of sequences; passed through
            `asListOrTuple`. Only `seqs[0].shape` is inspected to decide
            whether the data is multi-dimensional.
        length: subsequence length handed to the windowing helper.
        norm: normalization mode forwarded to the windowing helper. It is
            only used when the data is 1D or `tieDims` is true; the
            per-dimension path always uses 'each'.
        tieDims: if true, multi-column data is not split per column and a
            single distance matrix is produced.
        pad: if true, `length - 1` extra zero columns are appended to each
            distance matrix.
        removeZeros: if true, a row whose sum of squares is below 1e-6 gets
            `maxPossibleDist` as its distance to everything.
        k: if positive, only the k smallest entries of each column are kept;
            all other entries of that column are set to `maxPossibleDist`.

    Returns:
        (Dtensor, Xnorm) where Dtensor has shape
        (nDims, rowsPerDim, rowsPerDim + paddingLen) and Xnorm is the 2D
        array of subsequences returned by the windowing helper.
    """
    # NOTE(review): an earlier definition with the same name exists above in
    # this file; this later one is the definition in effect after import.
    seqs = asListOrTuple(seqs)

    nDims = 1
    if len(seqs[0].shape) < 2 or tieDims:
        Xnorm, _, _ = window.flattened_subseqs_of_length(seqs, length,
                                                         norm=norm)
    else:
        nDims = seqs[0].shape[1]
        # bypass flattening--each dim of each seq is treated as a separate
        # 1D seq; we end up with a long list whose elements are 1D vectors,
        # each of which was originally a column within some ND array in seqs
        #
        # note that this may do weird things if there's more than one seq
        # because the dims for each seq are sequential, rather than the seqs
        # for each dim
        #
        # list comprehensions (rather than map) so these stay real lists
        # on Python 3 as well
        separatedByDim = [colsAsList(X) for X in seqs]
        flatSeqs = flattenListOfLists(separatedByDim)
        flatSeqs = [v.flatten() for v in flatSeqs]  # col vects -> 1D arrays
        Xnorm, _, _ = window.flattened_subseqs_of_length(flatSeqs, length,
                                                         norm='each')

    nSamples, m = Xnorm.shape
    # floor division: this value is used as an array dimension and as a
    # slice bound, so it must stay an int under true division too
    rowsPerDim = nSamples // nDims

    # diagnostics; written so the output is the same on Python 2 and 3
    print("----- pairwiseDists")
    print("length %s" % (length,))
    print("origSeqs[0] shape %s" % (seqs[0].shape,))
    print("nsamples, m, rowsPerDim %s %s" % (Xnorm.shape, rowsPerDim))
    print("-----")

    paddingLen = length - 1 if pad else 0

    # the padding columns are never written, so they stay 0
    Dtensor = np.zeros((nDims, rowsPerDim, rowsPerDim + paddingLen))

    # value used to mark "as far away as possible"
    # NOTE(review): presumably the bound for z-normalized rows of width m;
    # confirm against the normalization the windowing helper applies
    maxPossibleDist = 2**2 * m

    for dim in range(nDims):
        # extract subseqs associated with this dim
        minIdx = dim * rowsPerDim
        Xdim = Xnorm[minIdx:minIdx + rowsPerDim]

        # compute dists from each subseq to all subseqs of this dim
        for i, row in enumerate(Xdim):
            if removeZeros and np.sum(row * row) < 1.e-6:
                Dtensor[dim, i, :rowsPerDim] = maxPossibleDist
                continue

            diffs = Xdim - row
            Dtensor[dim, i, :rowsPerDim] = np.sum(diffs * diffs, axis=1)

        # only keep k lowest dists in each column
        if k > 0:
            for j in range(rowsPerDim):
                col = Dtensor[dim, :, j]
                highestIdxs = np.argsort(col)[k:]
                Dtensor[dim, highestIdxs, j] = maxPossibleDist

    return Dtensor, Xnorm
def uniqueSubseqsInSignals(signal, length, maxDist, norm='each', tree=None):
    """Greedily group the subsequences of `signal` by nearest-neighbor distance.

    Subsequences are visited in order. Each one is compared against the
    subsequences already stored in a kd-tree: if its closest stored neighbor
    is nearer than `maxDist`, its start index is appended to that neighbor's
    occurrence list; otherwise it is stored in the tree as a new entry with
    its own occurrence list.

    Parameters:
        signal: input forwarded to `window.flattened_subseqs_of_length`.
        length: subsequence length forwarded to the windowing helper.
        maxDist: threshold compared (strictly) against the distance reported
            by the tree's `search_knn`.
        norm: normalization mode forwarded to the windowing helper.
        tree: optional existing kd-tree to search and extend; a new one is
            created via `kd.create` when None. The tree is modified in place.

    Returns:
        (signalOccurIdxs, Xnorm): a dict mapping the index of a stored
        subsequence to the list of start indices assigned to it, and the
        array of normalized subsequences. A key that was never registered as
        a new entry in this call (index 0, or a node that was already in a
        caller-supplied tree) lists only the later subsequences matched to it.

    Raises:
        AssertionError: if the tree returns no neighbor other than the query.
    """
    # NOTE(review): an earlier definition with the same name exists above in
    # this file; this later one is the definition in effect after import.
    X, _, _ = window.flattened_subseqs_of_length(signal, length, norm=norm)
    Xnorm = zNormalizeRows(X, removeZeros=False)

    # init kd tree--we can't give it any data yet because we only want to
    # search through seqs that have been added to the dictionary
    if tree is None:
        width = Xnorm.shape[1]
        tree = kd.create(dimensions=width)

    signalOccurIdxs = {}
    tree.add(Xnorm[0], 0)
    # start counting at 1 since Xnorm[0] is already in the tree
    for startIdx, subseq in enumerate(Xnorm[1:], 1):
        if np.sum(subseq * subseq) < .001:  # ignore zero seqs
            continue

        neighbors = tree.search_knn(subseq, 2)

        # pick the closest neighbor that isn't the query itself; taking the
        # minimum explicitly makes this independent of the order in which
        # the tree reports its results
        neighborIdx = -1
        neighborDist = np.inf
        for node, dist in neighbors:
            idx = node.metadata
            if idx == startIdx:
                continue
            if neighborIdx < 0 or dist < neighborDist:
                neighborIdx = idx
                neighborDist = dist

        if neighborIdx < 0:
            print("ERROR: knn returned <2 neighbors...")
            print("Neighbors returned: %s" % (neighbors,))
            # explicit raise instead of assert(0) so it survives python -O
            raise AssertionError("no usable nearest neighbor found")

        if neighborDist < maxDist:
            # store that the subseq happened at this idx too; setdefault
            # keeps the list even when the neighbor has no entry yet, so
            # the occurrence is not silently dropped
            signalOccurIdxs.setdefault(neighborIdx, []).append(startIdx)
        else:
            # NOTE(review): this overwrites any existing entry at startIdx,
            # which can happen if a caller-supplied tree already contains a
            # node carrying this index
            signalOccurIdxs[startIdx] = [startIdx]
            tree.add(subseq, startIdx)

            # rebalance if startIdx is a power of 2, so we do so log(N) times
            # NOTE(review): the return value of rebalance() is ignored; if
            # the kd implementation returns a new root instead of rebalancing
            # in place, this call has no effect -- confirm
            if (startIdx & (startIdx - 1)) == 0:
                tree.rebalance()

    return signalOccurIdxs, Xnorm  # return Xnorm for convenience, although confusing...