def main(): # np.random.seed(123) # ================================ consts for everything # consts for generating data # n = 1000 n = 500 # n = 300 # length = 8 # length = 16 length = 32 # length = 50 # nInstances = 3 exampleLengths = [55, 60, 65] # exampleLengths = [60, 60, 60] noiseStd = .5 # consts for algorithm Lmin = max(20, length) # only needed for optimalAlignK() spacing Lmax = 100 # loose upper bound on pattern length minSim = .5 # loose cutoff for what counts as similar # k0 = len(exampleLengths) # for version where we tell it k answerIdxs = None # ------------------------ synthetic data # seq = synth.randconst(n, std=noiseStd) # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4) seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16) seq = embedExamples(seq, exampleLengths) # ------------------------ msrc from ..datasets import read_msrc as msrc idxs = [2] # idxs = [0] downsampleBy = 2 recordings = msrc.getRecordings(idxs=idxs) r = list(recordings)[0] # seq = r.data # seq = r.data[:, :40] # seq = r.data[:, 20:23] seq = r.data[:, 24:27] # seq = r.data[:, 20:27] print "orig seq shape", seq.shape seq = ar.downsampleMat(seq, rowsBy=downsampleBy) print "downsampled seq shape", seq.shape length = max(8, Lmin / 2) Lmin = len(seq) / 20 Lmax = len(seq) / 10 # Lmax = len(seq) / 20 # k0 = 10 minSim = .5 answerIdxs = r.gestureIdxs / downsampleBy # print "seq shape", seq.shape prePadLen = Lmax - length postPadLen = length - 1 first = np.tile(seq[0], (prePadLen, 1)) last = np.tile(seq[-1], (postPadLen, 1)) seq = np.vstack( (first, seq, last)) # pad with fixed val to allow all window positions # ^ TODO pad simMat with zeros instead--this introduces fake subseqs answerIdxs += prePadLen # seq = np.vstack((seq, np.tile(flat, (length-1, 1)))) # lets it get the last rep # print "seq shape", seq.shape # r.plot() # plt.figure() # plt.plot(r.sampleTimes) # answerIdxs = r.gestureIdxs / downsampleBy # print r.gestureIdxs # 
print answerIdxs # plt.figure() # plt.plot(seq) # for idx in answerIdxs: # ax = plt.gca() # viz.plotVertLine(idx, ax=ax) # plt.show() # return # noise = synth.randconst(seq.shape) # add noise for debugging # seq = np.r_[noise, seq, noise] # ================================ simMat X = computeSimMat(seq, length) X[X < minSim] = 0. # Xorig = np.copy(X) X = ff2.localMaxFilterSimMat(X) Xblur = ff2.filterSimMat(X, length - 1, 'hamming', scaleFilterMethod='max1') # Xblur = ff2.filterSimMat(X, Lmin-1, 'hamming', scaleFilterMethod='max1') Xblur = np.minimum(Xblur, 1.) print "simMat dims:", X.shape Xnonzeros = np.count_nonzero(X) print "simMat nonzeros, total, frac = ", Xnonzeros, X.size, Xnonzeros / float( X.size) # ================================ plotting crap plt.figure() axSeq = plt.subplot2grid((4, 1), (0, 0)) axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3) for ax in (axSeq, axSim): ax.autoscale(tight=True) axSeq.plot(seq) if answerIdxs is not None: for idx in answerIdxs: viz.plotVertLine(idx, ax=axSeq) Xpad = synth.appendZeros(X, length - 1) axSim.imshow(Xpad, interpolation='nearest', aspect='auto') # im = axSim.imshow(Xpad, interpolation='nearest', aspect='auto') # plt.colorbar(im, cax=axSim) axSeq.set_title("Time Series") axSim.set_title("Similarities Matrix") # plt.figure() # plt.imshow(Xorig, interpolation='nearest', aspect='auto') # plt.colorbar() # plt.figure() # plt.imshow(X, interpolation='nearest', aspect='auto') # plt.colorbar() # # plt.figure() # # Xfilt = ff2.localMaxFilterSimMat(X, allowEq=True) # # plt.imshow(Xfilt, interpolation='nearest', aspect='auto') # # plt.colorbar() # # plt.figure() # # Xfilt = ff2.localMaxFilterSimMat(X, allowEq=False) # # plt.imshow(Xfilt, interpolation='nearest', aspect='auto') # # plt.colorbar() # plt.figure() # plt.imshow(Xblur, interpolation='nearest', aspect='auto') # plt.colorbar() # plt.show() # return # ================================ science # ------------------------ derived stats kMax = int(X.shape[1] / Lmin + 
.5) windowLen = Lmax - length + 1 # windowShape = (X.shape[0], Lmax) # windowSize = np.prod(windowShape) nLocs = X.shape[1] - windowLen + 1 p0 = np.mean(X) # fraction of entries that are 1 (roughly) # intersections = computeIntersections(X, windowLen) # windowSims = np.sum(intersections, axis=2) # colSims = np.dot(X.T, X) colSims = np.dot(X.T, Xblur) filt = np.zeros((windowLen, windowLen)) + np.diag( np.ones(windowLen)) # zeros except 1s on diag windowSims = sig.convolve2d(colSims, filt, mode='valid') windowVects = vectorizeWindowLocs(X, windowLen) windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen) plt.figure() plt.imshow(windowSims, interpolation='nearest', aspect='auto') # plt.show() # return # ------------------------ find stuff # # # # Version where we we tell it k # # # bsfScore = 0 # bsfLocs = None # bsfIntersection = None # # selfSims = np.diagonal(windowSims) # # candidateRowIdxs = np.where(selfSims * k0 <= bsfScore)[0] # # for i in candidateRowIdxs: # # row = windowSims[i] # for i, row in enumerate(windowSims): # if i % 20 == 0: # print("computing stuff for row {}".format(i)) # if windowSims[i, i] * k0 <= bsfScore: # continue # idxs = sub.optimalAlignK(row, Lmin, k0) # intersection = windowVects[i] # sz = 0 # for idx in idxs: # intersection = np.minimum(intersection, windowVectsBlur[idx]) # sz = np.sum(intersection) # if sz * k0 <= bsfScore: # break # score = sz * k0 # if score > bsfScore: # print("window {0}, k={1}, score={2} is the new best!".format(i, k0, score)) # bsfScore = score # bsfLocs = idxs # bsfIntersection = np.copy(intersection) # # # # Version where we look for similarities to orig seq # # # bsfScore = 0 # bsfLocs = None # bsfIntersection = None # for i, row in enumerate(windowSims): # if i % 20 == 0: # print("computing stuff for row {}".format(i)) # # early abandon if this location has so little stuff that no # # intersection with it can possibly beat the best score # if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax 
identical locs # # print("immediately abandoning window {}!".format(i)) # continue # # print("not abandoning window {}!".format(i)) # # best combination of idxs such that none are within Lmin of each other # idxs = sub.optimalAlignment(row, Lmin) # # print i, ": ", idxs # # order idxs by descending order of associated score # sizes = windowSims[i, idxs] # sortedSizesOrder = np.argsort(sizes)[::-1] # sortedIdxs = idxs[sortedSizesOrder] # # iteratively intersect with another near neighbor, compute the # # associated score, and check if it's better (or if we can early abandon) # intersection = windowVects[i] # numIdxs = len(sortedIdxs) # for j, idx in enumerate(sortedIdxs): # k = j + 1 # intersection = np.minimum(intersection, windowVectsBlur[idx]) # sz = np.sum(intersection) # use apodization window # # sz = np.count_nonzero(intersection) # just max-pool # score = sz * k # if k > 1 and score > bsfScore: # print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) # print("sortedIdxs = {}".format(str(sortedIdxs))) # print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) # print("------------------------") # bsfScore = score # bsfLocs = sortedIdxs[:k] # bsfIntersection = np.copy(intersection) # # early abandon if this can't possibly beat the best score, which # # is the case exactly when the intersection is so small that perfect # # matches at all future locations still wouldn't be good enough # elif sz * numIdxs <= bsfScore: # # print("early abandoning window {} at k={}".format(i, k)) # break # # # # Version where we look for similarities to orig seq and use nearest # # enemy dist as M0 # # # bsfScore = 0 # bsfLocs = None # bsfIntersection = None # for i, row in enumerate(windowSims): # if i % 20 == 0: # print("computing stuff for row {}".format(i)) # # early abandon if this location has so little stuff that no # # intersection with it can possibly beat the best score # if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax 
identical locs # # print("immediately abandoning window {}!".format(i)) # continue # # best combination of idxs such that none are within Lmin of each other # # validRow = row[:(-length + 1)] # can't go past end of ts # # idxs = sub.optimalAlignment(validRow, Lmin) # idxs = sub.optimalAlignment(row, Lmin) # goes past end of ts, but better # # order idxs by descending order of associated score # sizes = windowSims[i, idxs] # sortedSizesOrder = np.argsort(sizes)[::-1] # sortedIdxs = idxs[sortedSizesOrder] # # iteratively intersect with another near neighbor, compute the # # associated score, and check if it's better (or if we can early abandon) # intersection = windowVects[i] # numIdxs = len(sortedIdxs) # # allZeros = np.zeros(intersection.shape) # nextIntersection = np.minimum(intersection, windowVectsBlur[sortedIdxs[0]]) # nextSz = np.sum(nextIntersection) # for j, idx in enumerate(sortedIdxs): # k = j + 1 # intersection = np.copy(nextIntersection) # sz = nextSz # if k < numIdxs: # nextIdx = sortedIdxs[k] # since k = j+1 # nextIntersection = np.minimum(intersection, windowVectsBlur[nextIdx]) # nextSz = np.sum(nextIntersection) # sum -> use apodization window # else: # nextSz = sz * p0 # score = (sz - nextSz) * k # if k > 1 and score > bsfScore: # print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) # print("sortedIdxs = {}".format(str(sortedIdxs))) # print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) # print("------------------------") # bsfScore = score # bsfLocs = sortedIdxs[:k] # bsfIntersection = np.copy(intersection) # # early abandon if this can't possibly beat the best score, which # # is the case exactly when the intersection is so small that perfect # # matches at all future locations still wouldn't be good enough # elif sz * numIdxs <= bsfScore: # # print("early abandoning window {} at k={}".format(i, k)) # break # # Version where we look for similarities to orig seq and use nearest # enemy dist as M0, and use mean 
values instead of intersection # bsfScore = 0 bsfLocs = None bsfIntersection = None for i, row in enumerate(windowSims): if i % 20 == 0: print("computing stuff for row {}".format(i)) # early abandon if this location has so little stuff that no # intersection with it can possibly beat the best score if windowSims[ i, i] * kMax <= bsfScore: # highest score is kMax identical locs continue # best combination of idxs such that none are within Lmin of each other # validRow = row[:(-length + 1)] # can't go past end of ts # idxs = sub.optimalAlignment(validRow, Lmin) idxs = sub.optimalAlignment(row, Lmin) # goes past end of ts, but better # order idxs by descending order of associated score sizes = windowSims[i, idxs] sortedSizesOrder = np.argsort(sizes)[::-1] sortedIdxs = idxs[sortedSizesOrder] # iteratively intersect with another near neighbor, compute the # associated score, and check if it's better (or if we can early abandon) intersection = windowVects[i] numIdxs = len(sortedIdxs) nextSz = np.sum(intersection) nextFilt = np.array(intersection, dtype=np.float) nextFiltSum = np.array(nextFilt, dtype=np.float) for j, idx in enumerate(sortedIdxs): k = j + 1 filt = np.copy(nextFilt) sz = nextSz if k < numIdxs: nextIdx = sortedIdxs[k] # since k = j+1 nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx]) nextFiltSum += nextIntersection nextFilt = nextFiltSum / ( k + 1) # avg value of each feature in intersections # nextSz = np.sum(nextFilt) # big even if like no intersection... 
nextSz = np.sum(nextIntersection) bigEnoughIntersection = nextIntersection[ nextIntersection > minSim] nextSz = np.sum(bigEnoughIntersection) else: nextSz = sz * p0 score = (sz - nextSz) * k if k > 1 and score > bsfScore: print("window {0}, k={1}, score={2} is the new best!".format( i, k, score)) print("sortedIdxs = {}".format(str(sortedIdxs))) print("sortedIdxScores = {}".format( str(windowSims[i, sortedIdxs]))) print("------------------------") bsfScore = score bsfLocs = sortedIdxs[:k] bsfIntersection = np.copy(filt) # early abandon if this can't possibly beat the best score, which # is the case exactly when the intersection is so small that perfect # matches at all future locations still wouldn't be good enough elif sz * numIdxs <= bsfScore: # TODO can we actually early abandon here? next window loc # could increase filt, and thus score for a given loc isn't # necessarily non-increasing... # -can't abandon using this test, but pretty sure there's # a lower bound to be had here somewhere # print("early abandoning window {} at k={}".format(i, k)) break # # Version where we look for similarities to orig seq and use nearest # enemy dist as M0, and use mean values instead of intersection, # and don't sort the indices, but instead care about overlap # # bsfScore = 0 # bsfLocs = None # bsfIntersection = None # for i, row in enumerate(windowSims): # if i % 20 == 0: # print("computing stuff for row {}".format(i)) # # early abandon if this location has so little stuff that no # # intersection with it can possibly beat the best score # if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax identical locs # continue # # best combination of idxs such that none are within Lmin of each other # # validRow = row[:(-length + 1)] # can't go past end of ts # # idxs = sub.optimalAlignment(validRow, Lmin) # idxs = sub.optimalAlignment(row, Lmin) # goes past end of ts, but better # # order idxs by descending order of associated score # sizes = windowSims[i, idxs] # 
sortedSizesOrder = np.argsort(sizes)[::-1] # sortedIdxs = idxs[sortedSizesOrder] # # iteratively intersect with another near neighbor, compute the # # associated score, and check if it's better (or if we can early abandon) # intersection = windowVects[i] # numIdxs = len(sortedIdxs) # nextSz = np.sum(intersection) # nextFilt = np.array(intersection, dtype=np.float) # nextFiltSum = np.array(nextFilt, dtype=np.float) # for j, idx in enumerate(sortedIdxs): # k = j + 1 # filt = np.copy(nextFilt) # sz = nextSz # if k < numIdxs: # nextIdx = sortedIdxs[k] # since k = j+1 # nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx]) # nextFiltSum += nextIntersection # nextFilt = nextFiltSum / (k+1) # avg value of each feature in intersections # # nextSz = np.sum(nextFilt) # big even if like no intersection... # nextSz = np.sum(nextIntersection) # bigEnoughIntersection = nextIntersection[nextIntersection > minSim] # nextSz = np.sum(bigEnoughIntersection) # else: # nextSz = sz * p0 # score = (sz - nextSz) * k # if k > 1 and score > bsfScore: # print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) # print("sortedIdxs = {}".format(str(sortedIdxs))) # print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) # print("------------------------") # bsfScore = score # bsfLocs = sortedIdxs[:k] # bsfIntersection = np.copy(filt) # # early abandon if this can't possibly beat the best score, which # # is the case exactly when the intersection is so small that perfect # # matches at all future locations still wouldn't be good enough # elif sz * numIdxs <= bsfScore: # # TODO can we actually early abandon here? next window loc # # could increase filt, and thus score for a given loc isn't # # necessarily non-increasing... 
# # -can't abandon using this test, but pretty sure there's # # a lower bound to be had here somewhere # # print("early abandoning window {} at k={}".format(i, k)) # break # ------------------------ recover original ts bsfIntersection *= bsfIntersection >= minSim bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen)) sums = np.sum(bsfIntersectionWindow, axis=0) kBest = len(bsfLocs) p0 = np.power(p0, kBest) # expectedOnesPerCol = p0 * X.shape[1] expectedOnesPerCol = p0 * X.shape[1] * 2 sums -= expectedOnesPerCol plt.plot(sums) start, end, _ = maxSubarray(sums) patStart, patEnd = start, end + 1 + length # ================================ show output print "bestScore = {}".format(bsfScore) print "bestLocations = {}".format(str(bsfLocs)) for idx in bsfLocs: viz.plotRect(axSim, idx, idx + windowLen) # print bsfIntersectionWindow.shape # print sums.shape # plt.plot(sums) # viz.plotRect(plt.gca(), start, end + 1) for idx in bsfLocs: viz.plotRect(axSeq, idx + patStart, idx + patEnd) plt.figure() plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto') plt.tight_layout() plt.show()
def learnFF(X, Xblur, Lmin, Lmax, length): """main algorithm""" # ------------------------ derived stats kMax = int(X.shape[1] / Lmin + .5) # windowLen = Lmax - length + 1 windowLen = Lmax # try matching ff10 print "using window len ", windowLen p0 = np.mean(X) # fraction of entries that are 1 (roughly) # p0 = np.mean(X > 0.) # fraction of entries that are 1 # TODO try this # p0 = 2 * np.mean(X > 0.) # lambda for l0 reg based on features being bernoulli at 2 locs minSim = p0 expectedOnesPerWindow = p0 * X.shape[0] * windowLen noiseSz = p0 * expectedOnesPerWindow # num ones to begin with # noiseSz *= -np.log2(p0) # TODO this is right mathematically, but what will it do? # noiseSz = p0 * X.shape[0] * windowLen # way too hard to beat colSims = np.dot(X.T, Xblur) filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen)) # zeros except 1s on diag windowSims = sig.convolve2d(colSims, filt, mode='valid') windowVects = vectorizeWindowLocs(X, windowLen) windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen) print "p0, noiseSz = ", p0, noiseSz # plt.figure() # plt.imshow(windowSims, interpolation='nearest', aspect='auto') # ------------------------ find stuff # # Version where we look for similarities to orig seq and use nearest # enemy dist as M0, and use mean values instead of intersection # bsfScore = 0 bsfLocs = None bsfIntersection = None for i, row in enumerate(windowSims): if i % 10 == 0: print("computing stuff for row {}".format(i)) # early abandon if this location has so little stuff that no # intersection with it can possibly beat the best score if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax identical locs continue # best combination of idxs such that none are within Lmin of each other idxs = sub.optimalAlignment(row, Lmin) # order idxs by descending order of associated score sizes = windowSims[i, idxs] sortedSizesOrder = np.argsort(sizes)[::-1] sortedIdxs = idxs[sortedSizesOrder] # iteratively intersect with another near neighbor, 
compute the # associated score, and check if it's better (or if we can early abandon) intersection = windowVects[i] numIdxs = len(sortedIdxs) nextSz = np.sum(intersection) nextFilt = np.array(intersection, dtype=np.float) nextFiltSum = np.array(nextFilt, dtype=np.float) for j, idx in enumerate(sortedIdxs): k = j + 1 filt = np.copy(nextFilt) sz = nextSz if k < numIdxs: nextIdx = sortedIdxs[k] # since k = j+1 # so we're zeroing out all the places where the filt is 0, but # where it isn't zero, we're not just adding the non-zeroed places # to the sum, but instead adding either them or the filter value # there, whichever is smaller; this is sort of a weird thing to # do. Maybe it gets us submodularity? # -actually, yes, this ensures that the weight of a given # feature is nonincreasing as locations are added # -which enables admissible early abandoning nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx]) nextFiltSum += nextIntersection nextFilt = nextFiltSum / (k+1) # avg value of each feature in intersections # nextSz = np.sum(nextFilt) # big even if like no intersection... # nextSz = np.sum(nextIntersection) bigEnoughIntersection = nextIntersection[nextIntersection > minSim] nextSz = np.sum(bigEnoughIntersection) else: nextSz = sz * p0 enemySz = max(nextSz, noiseSz) score = (sz - enemySz) * k if k > 1 and score > bsfScore: print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) # print("sortedIdxs = {}".format(str(sortedIdxs))) # print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) print("------------------------") bsfScore = score bsfLocs = sortedIdxs[:k] bsfIntersection = np.copy(filt) # early abandon if this can't possibly beat the best score, which # is the case exactly when the intersection is so small that perfect # matches at all future locations still wouldn't be good enough elif sz * numIdxs <= bsfScore: break # TODO can we actually early abandon here? 
elif noiseSz > nextSz: break # ------------------------ recover original ts print "bestScore = {}".format(bsfScore) print "bestLocations = {}".format(str(bsfLocs)) bsfIntersection *= bsfIntersection >= minSim bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen)) sums = np.sum(bsfIntersectionWindow, axis=0) kBest = len(bsfLocs) expectedOnesFrac = np.power(p0, kBest) expectedOnesPerCol = expectedOnesFrac * X.shape[0] sums -= expectedOnesPerCol # plt.figure() # plt.plot(sums) start, end, _ = maxSubarray(sums) print "learnFF: startIdxs, endIdxs:" print np.array(bsfLocs) + start print np.array(bsfLocs) + end print "learnFF: filtLen, windowLen = {}, {}".format(end - start, windowLen) return bsfLocs, bsfIntersectionWindow, start, end
def main(): # np.random.seed(123) # ================================ consts for everything # consts for generating data # n = 1000 n = 500 # n = 300 # length = 8 # length = 16 # length = 32 # length = 50 # nInstances = 3 exampleLengths = [55, 60, 65] # exampleLengths = [60, 60, 60] noiseStd = .5 # consts for algorithm # Lmin = max(20, length) # only needed for optimalAlignK() spacing Lmin = 20 # only needed for optimalAlignK() spacing Lmax = 100 # loose upper bound on pattern length # minSim = .5 minSim = 0. length = Lmin // 2 # length = Lmin // 4 # length = 3 answerIdxs = None USE_MSRC = True # USE_MSRC = False # ================================ data # ------------------------ synthetic data # seq = synth.randconst(n, std=noiseStd) seq = synth.randwalk(n, std=noiseStd) # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4) # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16) seq = embedExamples(seq, exampleLengths) # seq = synth.appendZeros(seq, Lmax) # ------------------------ msrc if USE_MSRC: from ..datasets import read_msrc as msrc # idxs = [0] # idxs = [1] # idxs = [2] # idxs = [7] # length 1500, but instances of length like 20 # idxs = [8] # gets owned on this one cuz patterns of length like 100 # idxs = [9] # missing an annotation, it appears idxs = [10] # something crazy about feature rep here # TODO fix # idxs = [11] # crap cuz bad, low-variance signals # idxs = [12] # has garbagey sections like [10] # idxs = [13] # empty feature mat # TODO # idxs = [14] downsampleBy = 2 # downsampleBy = 1 recordings = msrc.getRecordings(idxs=idxs) r = list(recordings)[0] # seq = r.data # seq = r.data[:, :40] # seq = r.data[:, 20:23] seq = r.data[:, 24:27] # seq = r.data[:, 20:27] print "orig seq shape", seq.shape seq = ar.downsampleMat(seq, rowsBy=downsampleBy) print "downsampled seq shape", seq.shape # length = max(8, Lmin / 2) Lmin = len(seq) // 20 # Lmax = len(seq) // 8 Lmax = len(seq) // 10 length = Lmin // 2 # 
Lmax = len(seq) / 20 # k0 = 10 # minSim = .5 answerIdxs = r.gestureIdxs / downsampleBy # print "seq shape", seq.shape prePadLen = Lmax - length # postPadLen = length - 1 postPadLen = Lmax - length first = np.tile(seq[0], (prePadLen, 1)) last = np.tile(seq[-1], (postPadLen, 1)) seq = np.vstack((first, seq, last)) # pad with fixed val to allow all window positions # ^ TODO pad simMat with zeros instead--this introduces fake subseqs answerIdxs += prePadLen # seq = np.vstack((seq, np.tile(flat, (length-1, 1)))) # lets it get the last rep # print "seq shape", seq.shape # ================================ feature construction logMaxLength = int(np.floor(np.log2(Lmax))) # logMaxLength = int(np.ceil(np.log2(Lmax))) # logMinLength = 3 # -> length 8 # logMinLength = 4 # -> length 16 logMinLength = int(np.floor(np.log2(Lmin))) lengths = np.arange(logMinLength, logMaxLength + 1) lengths = 2 ** lengths # lengths = [16] cardinality = 8 breakpoints = rep.saxBreakpoints(cardinality) X = rep.multiNormalizeAndSparseQuantize(seq, lengths, breakpoints) # X = rep.multiSparseLineProject(seq, lengths, breakpoints, removeZeroRows=False) # lengths2 = np.arange(3, logMaxLength + 1) # lengths2 = 2 ** lengths2 lengths2 = lengths # TODO uncomment after debug # lengths2 = [8, 32] # breakpoints2 = rep.defaultSparseLineBreakpoints(seq, scaleHowMany=2) breakpoints2 = rep.defaultSparseLineBreakpoints(seq) X2 = rep.multiSparseLineProject(seq, lengths2, breakpoints2) # X2 = X2 > minSim X2 = X2 > 0. # ignore correlations # print "shapes:" # print X.shape # print X2.shape X = np.vstack((X, X2)) # plt.figure() # # viz.imshowBetter(X) # viz.imshowBetter(X2) # plt.figure() # viz.imshowBetter(X2 > 0.) 
# plt.show() # print seq.shape # plt.figure() # plt.plot(seq[:,0]) # bit of pattern, but only varies between -.4 and .2 # okay, so 1st dim is all zeros # variances = rep.slidingVariance(seq, 8) # for dim in range(len(variances)): # plt.figure() # plt.plot(variances[dim].flatten()) # print variances.shape # variances = rep.vstack3Tensor(variances.T) # print variances.shape # plt.plot(variances) # plt.show() # return X = localMaxFilterSimMat(X) # Xbool = np.copy(X) featureMeans = np.mean(X, axis=1).reshape((-1, 1)) # print featureMeans X *= -np.log2(featureMeans) # variable encoding costs for rows # X /= -np.log(featureMeans) # Xblur = localMaxFilterSimMat(X) # try only maxFiltering Xblur Xblur = filterSimMat(X, length-1, 'hamming', scaleFilterMethod='max1') # plt.figure() # viz.imshowBetter(X) # plt.figure() # viz.imshowBetter(Xblur) print "featureMat dims:", X.shape Xnonzeros = np.count_nonzero(X) print "featureMat nonzeros, total, frac = ", Xnonzeros, X.size, Xnonzeros / float(X.size) # plt.show() # return # ================================ plotting crap plt.figure() axSeq = plt.subplot2grid((4,1), (0,0)) axSim = plt.subplot2grid((4,1), (1,0), rowspan=3) for ax in (axSeq, axSim): ax.autoscale(tight=True) axSeq.plot(seq) # if answerIdxs is not None: # for idx in answerIdxs: # viz.plotVertLine(idx, ax=axSeq) padLen = len(seq) - X.shape[1] Xpad = synth.appendZeros(X, padLen) axSim.imshow(Xpad, interpolation='nearest', aspect='auto') # im = axSim.imshow(Xpad, interpolation='nearest', aspect='auto') # plt.colorbar(im, cax=axSim) axSeq.set_title("Time Series") axSim.set_title("Feature Matrix") # plt.show() # return # ================================ science # ------------------------ derived stats kMax = int(X.shape[1] / Lmin + .5) windowLen = Lmax - length + 1 p0 = np.mean(X) # fraction of entries that are 1 (roughly) # p0 = 2 * np.mean(X) # lambda for l0 reg based on features being bernoulli at 2 locs minSim = p0 # p0 = -np.log(np.mean(Xbool)) # fraction of entries 
that are 1 (roughly) # noiseSz = p0 * X.shape[0] * windowLen # way too hard to beat expectedOnesPerWindow = p0 * X.shape[0] * windowLen noiseSz = p0 * expectedOnesPerWindow # num ones to begin with # intersections = computeIntersections(X, windowLen) # windowSims = np.sum(intersections, axis=2) # colSims = np.dot(X.T, X) colSims = np.dot(X.T, Xblur) filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen)) # zeros except 1s on diag windowSims = sig.convolve2d(colSims, filt, mode='valid') windowVects = vectorizeWindowLocs(X, windowLen) windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen) # plt.figure() # plt.imshow(windowSims, interpolation='nearest', aspect='auto') # ------------------------ find stuff # # Version where we look for similarities to orig seq and use nearest # enemy dist as M0, and use mean values instead of intersection # bsfScore = 0 bsfLocs = None bsfIntersection = None for i, row in enumerate(windowSims): if i % 20 == 0: print("computing stuff for row {}".format(i)) # early abandon if this location has so little stuff that no # intersection with it can possibly beat the best score if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax identical locs continue # best combination of idxs such that none are within Lmin of each other # validRow = row[:(-length + 1)] # can't go past end of ts # idxs = sub.optimalAlignment(validRow, Lmin) idxs = sub.optimalAlignment(row, Lmin) # goes past end of ts, but better # order idxs by descending order of associated score sizes = windowSims[i, idxs] sortedSizesOrder = np.argsort(sizes)[::-1] sortedIdxs = idxs[sortedSizesOrder] # iteratively intersect with another near neighbor, compute the # associated score, and check if it's better (or if we can early abandon) intersection = windowVects[i] numIdxs = len(sortedIdxs) nextSz = np.sum(intersection) nextFilt = np.array(intersection, dtype=np.float) nextFiltSum = np.array(nextFilt, dtype=np.float) for j, idx in enumerate(sortedIdxs): k = j + 1 
filt = np.copy(nextFilt) sz = nextSz if k < numIdxs: nextIdx = sortedIdxs[k] # since k = j+1 nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx]) nextFiltSum += nextIntersection nextFilt = nextFiltSum / (k+1) # avg value of each feature in intersections # nextSz = np.sum(nextFilt) # big even if like no intersection... nextSz = np.sum(nextIntersection) bigEnoughIntersection = nextIntersection[nextIntersection > minSim] nextSz = np.sum(bigEnoughIntersection) else: nextSz = sz * p0 # nextSz = -1 enemySz = max(nextSz, noiseSz) score = (sz - enemySz) * k if k > 1 and score > bsfScore: print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) print("sortedIdxs = {}".format(str(sortedIdxs))) print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) print("------------------------") bsfScore = score bsfLocs = sortedIdxs[:k] bsfIntersection = np.copy(filt) # early abandon if this can't possibly beat the best score, which # is the case exactly when the intersection is so small that perfect # matches at all future locations still wouldn't be good enough elif sz * numIdxs <= bsfScore: # TODO can we actually early abandon here? next window loc # could increase filt, and thus score for a given loc isn't # necessarily non-increasing... 
# -can't abandon using this test, but pretty sure there's # a lower bound to be had here somewhere # print("early abandoning window {} at k={}".format(i, k)) break elif noiseSz > nextSz: break # # # # Version where we look for similarities to orig seq and use nearest # # enemy dist as M0, and use mean values instead of intersection, # # and don't sort the indices, but instead care about overlap # # # bsfScore = 0 # bsfLocs = None # bsfIntersection = None # for i, row in enumerate(windowSims): # if i % 20 == 0: # print("computing stuff for row {}".format(i)) # # early abandon if this location has so little stuff that no # # intersection with it can possibly beat the best score # if windowSims[i,i] * kMax <= bsfScore: # highest score is kMax identical locs # continue # # best combination of idxs such that none are within Lmin of each other # # validRow = row[:(-length + 1)] # can't go past end of ts # # idxs = sub.optimalAlignment(validRow, Lmin) # idxs = sub.optimalAlignment(row, Lmin) # goes past end of ts, but better # # order idxs by descending order of associated score # sizes = windowSims[i, idxs] # sortedSizesOrder = np.argsort(sizes)[::-1] # sortedIdxs = idxs[sortedSizesOrder] # # iteratively intersect with another near neighbor, compute the # # associated score, and check if it's better (or if we can early abandon) # intersection = windowVects[i] # numIdxs = len(sortedIdxs) # nextSz = np.sum(intersection) # nextFilt = np.array(intersection, dtype=np.float) # nextFiltSum = np.array(nextFilt, dtype=np.float) # for j, idx in enumerate(sortedIdxs): # k = j + 1 # filt = np.copy(nextFilt) # sz = nextSz # if k < numIdxs: # nextIdx = sortedIdxs[k] # since k = j+1 # nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx]) # nextFiltSum += nextIntersection # nextFilt = nextFiltSum / (k+1) # avg value of each feature in intersections # # nextSz = np.sum(nextFilt) # big even if like no intersection... 
# nextSz = np.sum(nextIntersection) # bigEnoughIntersection = nextIntersection[nextIntersection > minSim] # nextSz = np.sum(bigEnoughIntersection) # else: # nextSz = sz * p0 # score = (sz - nextSz) * k # if k > 1 and score > bsfScore: # print("window {0}, k={1}, score={2} is the new best!".format(i, k, score)) # print("sortedIdxs = {}".format(str(sortedIdxs))) # print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs]))) # print("------------------------") # bsfScore = score # bsfLocs = sortedIdxs[:k] # bsfIntersection = np.copy(filt) # # early abandon if this can't possibly beat the best score, which # # is the case exactly when the intersection is so small that perfect # # matches at all future locations still wouldn't be good enough # elif sz * numIdxs <= bsfScore: # # TODO can we actually early abandon here? next window loc # # could increase filt, and thus score for a given loc isn't # # necessarily non-increasing... # # -can't abandon using this test, but pretty sure there's # # a lower bound to be had here somewhere # # print("early abandoning window {} at k={}".format(i, k)) # break # ------------------------ recover original ts bsfIntersection *= bsfIntersection >= minSim bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen)) sums = np.sum(bsfIntersectionWindow, axis=0) kBest = len(bsfLocs) p0 = np.power(p0, kBest) # expectedOnesPerCol = p0 * X.shape[1] * 2 # expectedOnesPerCol = p0 * X.shape[1] expectedOnesPerCol = p0 * X.shape[0] sums -= expectedOnesPerCol # plt.figure() # plt.plot(sums) start, end, _ = maxSubarray(sums) # patStart, patEnd = start, end + 1 + length patStart, patEnd = start, end + 1 # patStart, patEnd = start + length // 2, end + 1 + length # ================================ show output print "bestScore = {}".format(bsfScore) print "bestLocations = {}".format(str(bsfLocs)) for idx in bsfLocs: viz.plotRect(axSim, idx, idx+windowLen) # print bsfIntersectionWindow.shape # print sums.shape # plt.plot(sums) # 
viz.plotRect(plt.gca(), start, end + 1) for idx in bsfLocs: viz.plotRect(axSeq, idx + patStart, idx + patEnd) if answerIdxs is not None: for idx in answerIdxs: viz.plotVertLine(idx, ax=axSeq) plt.figure() plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto') plt.tight_layout() plt.show()
def extractTrueLocs(X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax,
        extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO, **sink):
    """Convert best-so-far window locations into per-instance (start, end) pairs.

    The column sums of the best-so-far filter ``bsfFilt`` (minus the weight
    expected by chance) are scanned for the best contiguous run of columns via
    ``maxSubarray``; that run, widened/narrowed to [Lmin, Lmax] columns, gives
    the pattern extent inside each window, which is then offset by each
    location in ``bsfLocs``.

    NOTE(review): this function is defined twice in this file with identical
    bodies; the later definition shadows this one at import time.

    Args:
        X: similarity matrix (features x time).
        Xblur: blurred version of X (same shape -- assumed; TODO confirm).
        bsfLocs: best-so-far window start locations (array-like of ints).
        bsfFilt: 2D accumulated filter; axis 0 is summed, so columns are
            presumably window offsets -- TODO confirm against caller.
        windowLen: unused here (accepted for signature compatibility).
        Lmin, Lmax: minimum / maximum pattern length in columns.
        extractTrueLocsAlgo: 'none' -> return windows as-is; 'x' -> use
            mean of X as the chance level; anything else -> mean of Xblur.
        **sink: swallows extra keyword args from generic callers.

    Returns:
        (startIdxs, endIdxs): arrays parallel to sorted(bsfLocs).
    """
    if extractTrueLocsAlgo == 'none':
        return bsfLocs, bsfLocs + Lmax

    # determine expected value of an element of X (or, alternatively, Xblur)
    if extractTrueLocsAlgo == 'x':
        p0 = np.mean(X)
    else:
        p0 = np.mean(Xblur)

    if bsfFilt is None:
        print "WARNING: extractTrueLocs(): received None as filter"
        return np.array([0]), np.array([1])

    print "extractTrueLocs(): bsf locs", bsfLocs
    print "extractTrueLocs(): bsfFilt shape", bsfFilt.shape

    # compute the total filter weight in each column, ignoring low values
    bsfFiltWindow = np.copy(bsfFilt)
    # minSim = p0
    # bsfFiltWindow *= bsfFiltWindow >= minSim
    sums = np.sum(bsfFiltWindow, axis=0)

    # subtract off the amount of weight that we'd expect in each column by chance
    kBest = len(bsfLocs)
    expectedOnesFrac = np.power(p0, kBest - 1)  # this is like 0; basically no point
    expectedOnesPerCol = expectedOnesFrac * X.shape[0]
    sums -= expectedOnesPerCol

    # # at least for a couple msrc examples, these are basically flat--which makes sense
    # plt.figure()
    # plt.plot(sums)
    # plt.plot(np.zeros(len(sums)) + expectedOnesPerCol)
    # # from ..utils.misc import nowAsString
    # # plt.savefig('/Users/davis/Desktop/ts/figs/msrc/sums-{}.pdf'.format(nowAsString()))
    # plt.show()
    # plt.close()

    # pick the optimal set of indices to maximize the sum of sequential column sums
    start, end, _ = maxSubarray(sums)

    # ensure we picked at least Lmin points; grow toward whichever neighboring
    # column carries more weight (-inf marks an edge we cannot grow past)
    sumsLength = len(sums)
    while end - start < Lmin:
        nextStartVal = sums[start - 1] if start > 0 else -np.inf
        nextEndVal = sums[end] if end < sumsLength else -np.inf
        if nextStartVal > nextEndVal:
            start -= 1
        else:
            end += 1

    # ensure we picked at most Lmax points; drop the weaker endpoint each step
    while end - start > Lmax:
        if sums[start] > sums[end - 1]:
            end -= 1
        else:
            start += 1

    locs = np.sort(np.asarray(bsfLocs))
    startIdxs = locs + start
    endIdxs = locs + end

    # NOTE(review): a commented-out pass lived here that reconciled overlap
    # between adjacent instances by splitting the overlap at minimum cost;
    # recover from version control if needed.

    # don't let the first/last instances extend farther than the longest
    # interior instance (the ends are otherwise unconstrained by neighbors)
    if len(startIdxs) > 2:
        lengths = endIdxs - startIdxs
        maxInternalLength = np.max(lengths[1:-1])
        startIdxs[0] = max(startIdxs[0], endIdxs[0] - maxInternalLength)
        endIdxs[-1] = min(endIdxs[-1], startIdxs[-1] + maxInternalLength)

    print "extractTrueLocs(): startIdxs, endIdxs", startIdxs, endIdxs
    return startIdxs, endIdxs
def extractTrueLocs(X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax, extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO, **sink): if extractTrueLocsAlgo == 'none': return bsfLocs, bsfLocs + Lmax # determine expected value of an element of X (or, alternatively, Xblur) if extractTrueLocsAlgo == 'x': p0 = np.mean(X) else: p0 = np.mean(Xblur) if bsfFilt is None: print "WARNING: extractTrueLocs(): received None as filter" return np.array([0]), np.array([1]) print "extractTrueLocs(): bsf locs", bsfLocs print "extractTrueLocs(): bsfFilt shape", bsfFilt.shape # compute the total filter weight in each column, ignoring low values bsfFiltWindow = np.copy(bsfFilt) # minSim = p0 # bsfFiltWindow *= bsfFiltWindow >= minSim sums = np.sum(bsfFiltWindow, axis=0) # subtract off the amount of weight that we'd expect in each column by chance kBest = len(bsfLocs) expectedOnesFrac = np.power(p0, kBest-1) # this is like 0; basically no point expectedOnesPerCol = expectedOnesFrac * X.shape[0] sums -= expectedOnesPerCol # # at least for a couple msrc examples, these are basically flat--which makes sense # plt.figure() # plt.plot(sums) # plt.plot(np.zeros(len(sums)) + expectedOnesPerCol) # # from ..utils.misc import nowAsString # # plt.savefig('/Users/davis/Desktop/ts/figs/msrc/sums-{}.pdf'.format(nowAsString())) # plt.show() # plt.close() # pick the optimal set of indices to maximize the sum of sequential column sums start, end, _ = maxSubarray(sums) # ensure we picked at least Lmin points sumsLength = len(sums) while end - start < Lmin: nextStartVal = sums[start-1] if start > 0 else -np.inf nextEndVal = sums[end] if end < sumsLength else -np.inf if nextStartVal > nextEndVal: start -= 1 else: end += 1 # ensure we picked at most Lmax points while end - start > Lmax: if sums[start] > sums[end-1]: end -= 1 else: start += 1 locs = np.sort(np.asarray(bsfLocs)) startIdxs = locs + start endIdxs = locs + end # # reconcile overlap; we first figure out how much we like the start vs end # # for different 
amounts of overlap # startSums = np.cumsum(sums) # endSums = np.cumsum(sums[::-1]) # # gaps = startIdxs[1:] - startIdxs[:-1] # for i in range(len(startIdxs) - 1): # te1, ts2 = endIdxs[i], startIdxs[i+1] # gap = ts2 - te1 # if gap > 0: # continue # # figure out best amount by which to crop start and end indices # overlap = -gap + 1 # bestSplitCost = np.inf # bestMoveStart = -1 # for moveStartThisMuch in range(0, overlap): # moveEndThisMuch = overlap - moveStartThisMuch # startCost = startSums[moveStartThisMuch-1] if moveStartThisMuch else 0. # endCost = endSums[moveEndThisMuch-1] if moveEndThisMuch else 0. # cost = startCost + endCost # if cost < bestSplitCost: # bestSplitCost = cost # bestMoveStart = moveStartThisMuch # startIdxs[i+1] += bestMoveStart # endIdxs[i] -= (overlap - bestMoveStart) if len(startIdxs) > 2: lengths = endIdxs - startIdxs maxInternalLength = np.max(lengths[1:-1]) startIdxs[0] = max(startIdxs[0], endIdxs[0] - maxInternalLength) endIdxs[-1] = min(endIdxs[-1], startIdxs[-1] + maxInternalLength) print "extractTrueLocs(): startIdxs, endIdxs", startIdxs, endIdxs return startIdxs, endIdxs
def main():
    """Synthetic-data demo of the repeated-pattern search.

    Embeds three pattern instances (lengths 55/60/65) in a smoothed random
    walk, builds a hard-thresholded self-similarity matrix, scores pairs of
    windows by the size of their feature intersections, then greedily grows
    the best set of >= 2 mutually non-overlapping window locations
    (score = shared features x instance count, with early abandoning).
    Plots the series, the similarity matrix, and the winning intersection.
    Python 2 script (print statements).
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    # length = 8
    # length = 16
    length = 32
    # length = 50
    # nInstances = 3
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = 0.5

    # consts for algorithm
    Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = 0.8  # loose cutoff for what counts as similar
    k0 = len(exampleLengths)  # for version where we tell it k

    # ------------------------ synthetic data
    # seq = synth.randconst(n, std=noiseStd)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4)
    seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc (disabled in this variant)
    # from ..datasets import read_msrc as msrc
    # idxs = [2]
    # recordings = msrc.getRecordings(idxs=idxs)
    # r = list(recordings)[0]
    # seq = r.data[:, 20:23]
    # print "orig seq shape", seq.shape
    # seq = ar.downsampleMat(seq, rowsBy=10)
    # print "downsampled seq shape", seq.shape
    # length = 8
    # Lmin = len(seq) / 20
    # Lmax = len(seq) / 10
    # # Lmax = len(seq) / 20
    # k0 = 10
    # minSim = .5

    # noise = synth.randconst(seq.shape)  # add noise for debugging
    # seq = np.r_[noise, seq, noise]

    # ================================ simMat
    X = computeSimMat(seq, length)
    # X[X < minSim] = 0.
    # X = ff2.localMaxFilterSimMat(X)
    # maxPoolWidth = min(length-1, Lmin-1)
    # maxPoolWidth /= 2
    # X = filters.maximum_filter1d(X, maxPoolWidth, axis=1)
    # X = filters.maximum_filter1d(X, length-1, axis=1)
    # X = filters.maximum_filter1d(X, length/2, axis=1)
    # X = filters.maximum_filter1d(X, 3, axis=1)
    # X = np.array(X > minSim, dtype=np.float)
    # X = X > minSim
    X[X < minSim] = 0.0  # hard-threshold: similarities below minSim count as 0
    # X = ff2.filterSimMat(X, length-1, 'hamming', scaleFilterMethod='max1')
    # X = sub.removeCorrelatedRows(X, .9, accumulate=True)  # correlation > .9 -> kill it
    # X = sub.removeCorrelatedRows(X, .9, accumulate=False)  # correlation > .9 -> kill it

    print "simMat dims:", X.shape
    print "simMat nonzeros, total, frac = ", np.count_nonzero(X), X.size, np.count_nonzero(X) / float(X.size)

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    # zero-pad so the similarity image lines up with the series' time axis
    axSim.imshow(synth.appendZeros(X, length - 1), interpolation="nearest", aspect="auto")
    # im = axSim.imshow(synth.appendZeros(X, length-1), interpolation='nearest', aspect='auto')
    # plt.colorbar(im, cax=axSim)

    axSeq.set_title("Time Series")
    axSim.set_title("Similarities Matrix")

    # ================================ science

    # ------------------------ derived stats
    kMax = int(X.shape[1] / Lmin + 0.5)  # upper bound on number of instances
    windowWidth = Lmax - length + 1
    # windowShape = (X.shape[0], Lmax)
    # windowSize = np.prod(windowShape)
    nLocs = X.shape[1] - windowWidth + 1

    # ------------------------ pairwise sims
    # colSims = np.dot(X.T, X)
    # filt = np.zeros((Lmax, Lmax)) + np.diag(np.ones(Lmax))  # zeros except 1s on diag
    # windowSims = sig.convolve2d(colSims, filt, mode='valid')

    print "computing intersections..."
    windowVects = vectorizeWindowLocs(X, windowWidth)
    windowSz = windowVects.shape[1]
    intersections = computeIntersections(X, windowVects, windowWidth)
    windowSims = np.sum(intersections, axis=2)  # pair similarity = intersection size
    # windowSims /= windowSz
    # assert(np.array_equal(windowSims, windowSims2))  # works
    # plt.figure()
    # plt.imshow(windowSims2)
    plt.figure()
    plt.imshow(windowSims / windowSz)
    plt.colorbar()
    # plt.show()
    # return

    print "computing similarity lower bound..."
    # TODO maybe try introducing beta prior to weight different values of k
    # -or, alternatively, see what happens if we tell it the right k

    # initialize best-so-far with the closest pair at least Lmin apart
    # (NOTE(review): these seeds are immediately reset below, so this init
    # only matters for the commented-out variant that used to follow)
    nonTrivialWindowSims = np.triu(windowSims)  # zero lower half
    for i in range(nLocs):
        nonTrivialWindowSims[i, i : min(nLocs, i + Lmin)] = 0  # zero Lmin past diag
    highestSimIdx = np.argmax(nonTrivialWindowSims)
    bsfLocs = sorted([highestSimIdx // nLocs, highestSimIdx % nLocs])
    bsfScore = windowSims[tuple(bsfLocs)] * 2  # list will yield a list
    bsfIntersection = intersections[bsfLocs]

    print "finding best locations..."
    # NOTE(review): a commented-out variant lived here ("look for stuff
    # matching each intersection") that re-scored every candidate row against
    # the running intersection via np.dot before aligning; dropped for
    # readability -- recover from version control if needed.

    # Version where we look for similarities to orig seq
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:  # highest score is kMax identical locs
            # print("immediately abandoning window {}!".format(i))
            continue
        # print("not abandoning window {}!".format(i))

        # best combination of idxs such that none are within Lmin of each other
        idxs = sub.optimalAlignment(row, Lmin)

        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]

        # retrieve intersection and compute score for best 2 locs
        k = 2
        intersection = intersections[sortedIdxs[0], sortedIdxs[1]]
        score = windowSims[sortedIdxs[0], sortedIdxs[1]] * k

        # possibly update best-so-far score and window locations
        if score > bsfScore:
            bsfScore = score
            bsfLocs = sortedIdxs[:k]
            bsfIntersection = np.copy(intersection)

        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early abandon)
        numIdxs = len(sortedIdxs)
        for idx in sortedIdxs[2:]:
            k += 1
            intersection = np.logical_and(intersection, windowVects[idx])
            sz = np.count_nonzero(intersection)
            score = sz * k  # score = shared features x instance count
            if score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(intersection)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that perfect
            # matches at all future locations still wouldn't be good enough
            elif sz * numIdxs <= bsfScore:
                # print("early abandoning window {} at k={}".format(i, k))
                break

    # NOTE(review): a commented-out "tell it k" variant lived here that used
    # sub.optimalAlignK(row, Lmin, k0) with a fixed instance count k0;
    # dropped for readability -- recover from version control if needed.

    # ================================ show output
    print "bestScore = {}".format(bsfScore)
    print "bestLocations = {}".format(str(bsfLocs))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowWidth)

    # recover the pattern extent inside the window: column sums of the winning
    # intersection, minus the count expected by chance, then max subarray
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowWidth))
    sums = np.sum(bsfIntersectionWindow, axis=0)
    print bsfIntersectionWindow.shape
    print sums.shape
    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation="nearest", aspect="auto")
    plt.colorbar()
    plt.figure()
    plt.plot(sums)
    p0 = np.mean(X)
    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[1] * 2
    sums -= expectedOnesPerCol
    plt.plot(sums)
    start, end, _ = maxSubarray(sums)
    patStart, patEnd = start, end + 1 + length
    viz.plotRect(plt.gca(), start, end + 1)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    # NOTE(review): more commented-out exploration lived here (re-plotting
    # windowSims; an optimalAlignK sweep over k in [2, kMax) tracking the
    # best row and its locations); dropped for readability.

    plt.tight_layout()
    plt.show()
def main():
    """MSRC-gesture demo of the repeated-pattern search.

    Loads one MSRC recording (3 of its channels), downsamples it, pads it so
    every window position is legal, then builds a hard-thresholded +
    local-max-filtered similarity matrix X and a hamming-blurred copy Xblur.
    Pairs of windows are scored with dot(X.T, Xblur) convolved with a
    diagonal filter, and the best set of >= 2 window locations is grown
    greedily using running mean filters (np.minimum against blurred windows),
    scoring each k by (sz - nextSz) * k, i.e. the drop in shared weight the
    next location would cause. Plots everything and marks the labeled gesture
    boundaries. Python 2 script (print statements, integer division).
    """
    # np.random.seed(123)

    # ================================ consts for everything
    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    # length = 8
    # length = 16
    length = 32
    # length = 50
    # nInstances = 3
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = .5  # loose cutoff for what counts as similar
    # k0 = len(exampleLengths)  # for version where we tell it k

    answerIdxs = None

    # ------------------------ synthetic data (overwritten by msrc data below)
    # seq = synth.randconst(n, std=noiseStd)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4)
    seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc
    from ..datasets import read_msrc as msrc
    idxs = [2]
    # idxs = [0]
    downsampleBy = 2
    recordings = msrc.getRecordings(idxs=idxs)
    r = list(recordings)[0]
    # seq = r.data
    # seq = r.data[:, :40]
    # seq = r.data[:, 20:23]
    seq = r.data[:, 24:27]  # just 3 of the recording's channels
    # seq = r.data[:, 20:27]
    print "orig seq shape", seq.shape
    seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
    print "downsampled seq shape", seq.shape
    length = max(8, Lmin / 2)  # integer division (Python 2)
    Lmin = len(seq) / 20
    Lmax = len(seq) / 10
    # Lmax = len(seq) / 20
    # k0 = 10
    minSim = .5
    answerIdxs = r.gestureIdxs / downsampleBy  # labeled gesture locations

    # print "seq shape", seq.shape
    prePadLen = Lmax - length
    postPadLen = length - 1
    first = np.tile(seq[0], (prePadLen, 1))
    last = np.tile(seq[-1], (postPadLen, 1))
    seq = np.vstack((first, seq, last))  # pad with fixed val to allow all window positions
    # ^ TODO pad simMat with zeros instead--this introduces fake subseqs
    answerIdxs += prePadLen  # shift labels to account for the padding
    # seq = np.vstack((seq, np.tile(flat, (length-1, 1))))  # lets it get the last rep
    # print "seq shape", seq.shape

    # r.plot()
    # plt.figure()
    # plt.plot(r.sampleTimes)
    # answerIdxs = r.gestureIdxs / downsampleBy
    # print r.gestureIdxs
    #
    print answerIdxs
    # plt.figure()
    # plt.plot(seq)
    # for idx in answerIdxs:
    #     ax = plt.gca()
    #     viz.plotVertLine(idx, ax=ax)
    # plt.show()
    # return

    # noise = synth.randconst(seq.shape)  # add noise for debugging
    # seq = np.r_[noise, seq, noise]

    # ================================ simMat
    X = computeSimMat(seq, length)
    X[X < minSim] = 0.  # hard-threshold the similarities
    # Xorig = np.copy(X)
    X = ff2.localMaxFilterSimMat(X)  # keep only local maxima
    # blurred copy: hamming-smoothed along time, clamped to <= 1
    Xblur = ff2.filterSimMat(X, length-1, 'hamming', scaleFilterMethod='max1')
    # Xblur = ff2.filterSimMat(X, Lmin-1, 'hamming', scaleFilterMethod='max1')
    Xblur = np.minimum(Xblur, 1.)
    print "simMat dims:", X.shape
    Xnonzeros = np.count_nonzero(X)
    print "simMat nonzeros, total, frac = ", Xnonzeros, X.size, Xnonzeros / float(X.size)

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4,1), (0,0))
    axSim = plt.subplot2grid((4,1), (1,0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)
    Xpad = synth.appendZeros(X, length-1)  # pad so image lines up with series
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    # im = axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    # plt.colorbar(im, cax=axSim)

    axSeq.set_title("Time Series")
    axSim.set_title("Similarities Matrix")

    # NOTE(review): commented-out debug figures lived here (Xorig, X,
    # localMaxFilterSimMat variants with allowEq True/False, Xblur);
    # dropped for readability -- recover from version control if needed.

    # ================================ science

    # ------------------------ derived stats
    kMax = int(X.shape[1] / Lmin + .5)  # upper bound on number of instances
    windowLen = Lmax - length + 1
    # windowShape = (X.shape[0], Lmax)
    # windowSize = np.prod(windowShape)
    nLocs = X.shape[1] - windowLen + 1
    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)

    # intersections = computeIntersections(X, windowLen)
    # windowSims = np.sum(intersections, axis=2)
    # colSims = np.dot(X.T, X)
    colSims = np.dot(X.T, Xblur)  # column sims between sharp and blurred mats
    filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen))  # zeros except 1s on diag
    # diagonal filter -> windowSims[i, j] sums colSims along the (i, j) diagonal
    windowSims = sig.convolve2d(colSims, filt, mode='valid')
    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    plt.figure()
    plt.imshow(windowSims, interpolation='nearest', aspect='auto')
    # plt.show()
    # return

    # ------------------------ find stuff

    # NOTE(review): three commented-out search variants lived here:
    # 1) fixed-k via sub.optimalAlignK(row, Lmin, k0);
    # 2) greedy np.minimum() intersections against the original window,
    #    scored as sz * k;
    # 3) same, but scoring each k by (sz - nextSz) * k using the next
    #    location's intersection as the null level M0.
    # Dropped for readability -- recover from version control if needed.

    # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i,i] * kMax <= bsfScore:  # highest score is kMax identical locs
            continue
        # best combination of idxs such that none are within Lmin of each other
        # validRow = row[:(-length + 1)]  # can't go past end of ts
        # idxs = sub.optimalAlignment(validRow, Lmin)
        idxs = sub.optimalAlignment(row, Lmin)  # goes past end of ts, but better
        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]
        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)
        nextSz = np.sum(intersection)
        nextFilt = np.array(intersection, dtype=np.float)
        nextFiltSum = np.array(nextFilt, dtype=np.float)
        for j, idx in enumerate(sortedIdxs):
            k = j + 1
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                nextIdx = sortedIdxs[k]  # since k = j+1
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                nextFilt = nextFiltSum / (k+1)  # avg value of each feature in intersections
                # nextSz = np.sum(nextFilt)  # big even if like no intersection...
                nextSz = np.sum(nextIntersection)
                # only count features that survive above minSim in the next
                # intersection (the first nextSz assignment above is overwritten)
                bigEnoughIntersection = nextIntersection[nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0  # no next loc; assume chance-level agreement
            # score k locations by how much adding location k+1 would hurt
            score = (sz - nextSz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that perfect
            # matches at all future locations still wouldn't be good enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                # -can't abandon using this test, but pretty sure there's
                # a lower bound to be had here somewhere
                # print("early abandoning window {} at k={}".format(i, k))
                break

    # NOTE(review): one more commented-out variant lived here (same scoring,
    # but without sorting the indices, intending to account for overlap);
    # dropped for readability -- recover from version control if needed.

    # ------------------------ recover original ts
    bsfIntersection *= bsfIntersection >= minSim  # zero out weak entries
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)
    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[1] * 2
    sums -= expectedOnesPerCol  # discount the weight expected by chance
    plt.plot(sums)
    start, end, _ = maxSubarray(sums)
    patStart, patEnd = start, end + 1 + length

    # ================================ show output
    print "bestScore = {}".format(bsfScore)
    print "bestLocations = {}".format(str(bsfLocs))

    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx+windowLen)

    # print bsfIntersectionWindow.shape
    # print sums.shape
    # plt.plot(sums)
    #
    viz.plotRect(plt.gca(), start, end + 1)

    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()
def main():
    """Experiment driver (iteration 2 of 3 in this file).

    Embeds a few pattern instances into a synthetic not-so-random walk,
    computes a subsequence self-similarity matrix, then greedily searches
    window locations for a set of mutually similar, non-overlapping windows
    maximizing (intersection size * number of windows), and plots the
    recovered pattern locations.

    NOTE(review): `main` is defined multiple times in this file; this
    definition shadows the earlier one and is itself shadowed by the later
    one, so it is unreachable as written. Kept for reference. Python 2
    print statements throughout.
    """
    # np.random.seed(123)

    # ================================ consts for everything

    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    # length = 8
    # length = 16
    length = 32
    # length = 50
    # nInstances = 3
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    minSim = .8  # loose cutoff for what counts as similar
    k0 = len(exampleLengths)  # for version where we tell it k

    # ------------------------ synthetic data
    # seq = synth.randconst(n, std=noiseStd)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4)
    seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16)
    seq = embedExamples(seq, exampleLengths)

    # ------------------------ msrc
    # from ..datasets import read_msrc as msrc
    # idxs = [2]
    # recordings = msrc.getRecordings(idxs=idxs)
    # r = list(recordings)[0]
    # seq = r.data[:, 20:23]
    # print "orig seq shape", seq.shape
    # seq = ar.downsampleMat(seq, rowsBy=10)
    # print "downsampled seq shape", seq.shape
    # length = 8
    # Lmin = len(seq) / 20
    # Lmax = len(seq) / 10
    # # Lmax = len(seq) / 20
    # k0 = 10
    # minSim = .5

    # noise = synth.randconst(seq.shape)  # add noise for debugging
    # seq = np.r_[noise, seq, noise]

    # ================================ simMat
    # NOTE(review): entries below minSim are zeroed; rest of the pipeline
    # treats X as a sparse nonnegative similarity matrix.
    X = computeSimMat(seq, length)
    # X[X < minSim] = 0.
    # X = ff2.localMaxFilterSimMat(X)
    # maxPoolWidth = min(length-1, Lmin-1)
    # maxPoolWidth /= 2
    # X = filters.maximum_filter1d(X, maxPoolWidth, axis=1)
    # X = filters.maximum_filter1d(X, length-1, axis=1)
    # X = filters.maximum_filter1d(X, length/2, axis=1)
    # X = filters.maximum_filter1d(X, 3, axis=1)
    # X = np.array(X > minSim, dtype=np.float)
    # X = X > minSim
    X[X < minSim] = 0.
    # X = ff2.filterSimMat(X, length-1, 'hamming', scaleFilterMethod='max1')
    # X = sub.removeCorrelatedRows(X, .9, accumulate=True)  # correlation > .9 -> kill it
    # X = sub.removeCorrelatedRows(X, .9, accumulate=False)  # correlation > .9 -> kill it

    print "simMat dims:", X.shape
    print "simMat nonzeros, total, frac = ", np.count_nonzero(X), X.size, np.count_nonzero(X) / float(X.size)

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    axSim.imshow(synth.appendZeros(X, length - 1), interpolation='nearest', aspect='auto')
    # im = axSim.imshow(synth.appendZeros(X, length-1), interpolation='nearest', aspect='auto')
    # plt.colorbar(im, cax=axSim)
    axSeq.set_title("Time Series")
    axSim.set_title("Similarities Matrix")

    # ================================ science

    # ------------------------ derived stats
    kMax = int(X.shape[1] / Lmin + .5)  # most instances that could fit, given spacing Lmin
    windowWidth = Lmax - length + 1
    # windowShape = (X.shape[0], Lmax)
    # windowSize = np.prod(windowShape)
    nLocs = X.shape[1] - windowWidth + 1

    # ------------------------ pairwise sims
    # colSims = np.dot(X.T, X)
    # filt = np.zeros((Lmax, Lmax)) + np.diag(np.ones(Lmax))  # zeros except 1s on diag
    # windowSims = sig.convolve2d(colSims, filt, mode='valid')

    print "computing intersections..."
    windowVects = vectorizeWindowLocs(X, windowWidth)
    windowSz = windowVects.shape[1]
    intersections = computeIntersections(X, windowVects, windowWidth)
    # windowSims[i, j] = size of elementwise intersection of windows i and j
    windowSims = np.sum(intersections, axis=2)
    # windowSims /= windowSz

    # assert(np.array_equal(windowSims, windowSims2))  # works
    # plt.figure()
    # plt.imshow(windowSims2)
    plt.figure()
    plt.imshow(windowSims / windowSz)
    plt.colorbar()
    # plt.show()
    # return

    print "computing similarity lower bound..."
    # TODO maybe try introducing beta prior to weight different values of k
    # -or, alternatively, see what happens if we tell it the right k

    # # Version where we look for stuff matching each intersection
    #
    # # initialize with closest pair at least Lmin apart
    # NOTE(review): seed the best-so-far score with the best pair of
    # windows that are at least Lmin apart (upper triangle, near-diagonal
    # band zeroed to exclude trivial self/overlapping matches).
    nonTrivialWindowSims = np.triu(windowSims)  # zero lower half
    for i in range(nLocs):
        nonTrivialWindowSims[i, i:min(nLocs, i + Lmin)] = 0  # zero Lmin past diag
    highestSimIdx = np.argmax(nonTrivialWindowSims)
    # argmax on the flattened matrix; unravel into (row, col)
    bsfLocs = sorted([highestSimIdx // nLocs, highestSimIdx % nLocs])
    bsfScore = windowSims[tuple(bsfLocs)] * 2  # list will yield a list
    bsfIntersection = intersections[bsfLocs]

    print "finding best locations..."
    # rowIntersectionSims = np.zeros(nLocs)
    # for i in range(nLocs):
    #     if i % 20 == 0:
    #         print("computing stuff for row {}".format(i))
    #     bestPossibleScores = windowSims[i,min(nLocs,i+Lmin):] * kMax / 2.
    #     candidateIdxs = np.where(bestPossibleScores > bsfScore)[0]
    #     # print "candidateIdxs shape", candidateIdxs.shape
    #     for j in candidateIdxs:
    #         intersection = intersections[i, j]
    #         rowIntersectionSims *= 0
    #         rowIntersectionSims[candidateIdxs] = np.dot(intersections[i, candidateIdxs], intersection)
    #         idxs = sub.optimalAlignment(rowIntersectionSims, Lmin)
    #         # order idxs by descending order of associated score
    #         sizes = rowIntersectionSims[idxs]
    #         sortedSizesOrder = np.argsort(sizes)[::-1]
    #         sortedIdxs = idxs[sortedSizesOrder]
    #         # iteratively intersect with another near neighbor, compute the
    #         # associated score, and check if it's better (or if we can early abandon)
    #         numIdxs = len(sortedIdxs)
    #         k = 2
    #         for idx in sortedIdxs[2:]:  # first 2 are no better than orig intersection
    #             k += 1
    #             intersection = np.logical_and(intersection, intersections[i, idx])
    #             sz = np.sum(intersection)
    #             score = sz * k
    #             if score > bsfScore:
    #                 print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
    #                 bsfScore = score
    #                 bsfLocs = sortedIdxs[:k]
    #                 bsfIntersection = np.copy(intersection)
    #             elif sz * numIdxs <= bsfScore:
    #                 # print("early abandoning window {} at k={}".format(i, k))
    #                 break

    # # Version where we look for similarities to orig seq
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:  # highest score is kMax identical locs
            # print("immediately abandoning window {}!".format(i))
            continue
        # print("not abandoning window {}!".format(i))
        # best combination of idxs such that none are within Lmin of each other
        idxs = sub.optimalAlignment(row, Lmin)
        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]
        # retrieve intersection and compute score for best 2 locs
        k = 2
        intersection = intersections[sortedIdxs[0], sortedIdxs[1]]
        score = windowSims[sortedIdxs[0], sortedIdxs[1]] * k
        # possibly update best-so-far score and window locations
        if score > bsfScore:
            bsfScore = score
            bsfLocs = sortedIdxs[:k]
            bsfIntersection = np.copy(intersection)
        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early abandon)
        numIdxs = len(sortedIdxs)
        for idx in sortedIdxs[2:]:
            k += 1
            intersection = np.logical_and(intersection, windowVects[idx])
            sz = np.count_nonzero(intersection)
            score = sz * k
            if score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(intersection)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that perfect
            # matches at all future locations still wouldn't be good enough
            elif sz * numIdxs <= bsfScore:
                # print("early abandoning window {} at k={}".format(i, k))
                break

    # # Version where we tell it k
    # #
    # bsfScore = 0
    # bsfLocs = None
    # bsfIntersection = None
    # # selfSims = np.diagonal(windowSims)
    # # candidateRowIdxs = np.where(selfSims * k0 <= bsfScore)[0]
    # # for i in candidateRowIdxs:
    # #     row = windowSims[i]
    # for i, row in enumerate(windowSims):
    #     if i % 20 == 0:
    #         print("computing stuff for row {}".format(i))
    #     if windowSims[i,i] * k0 <= bsfScore:
    #         continue
    #     idxs = sub.optimalAlignK(row, Lmin, k0)
    #     intersection = intersections[idxs[0], idxs[1]]
    #     sz = 0
    #     for idx in idxs[2:]:
    #         intersection = np.logical_and(intersection, windowVects[idx])
    #         sz = np.count_nonzero(intersection)
    #         if sz * k0 <= bsfScore:
    #             break
    #     score = sz * k0
    #     if score > bsfScore:
    #         print("window {0}, k={1}, score={2} is the new best!".format(i, k0, score))
    #         bsfScore = score
    #         bsfLocs = idxs
    #         bsfIntersection = np.copy(intersection)

    # ================================ show output
    print "bestScore = {}".format(bsfScore)
    print "bestLocations = {}".format(str(bsfLocs))
    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowWidth)

    # collapse the winning intersection back into per-column counts and
    # find the contiguous span (max subarray) that best explains the pattern
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowWidth))
    sums = np.sum(bsfIntersectionWindow, axis=0)
    print bsfIntersectionWindow.shape
    print sums.shape
    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')
    plt.colorbar()
    plt.figure()
    plt.plot(sums)
    p0 = np.mean(X)  # base rate of nonzeros; noise floor for column sums
    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[1] * 2
    sums -= expectedOnesPerCol
    plt.plot(sums)
    start, end, _ = maxSubarray(sums)
    patStart, patEnd = start, end + 1 + length
    viz.plotRect(plt.gca(), start, end + 1)
    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)

    # plt.figure()
    # # windowSims[bestRowIdx, bestColIdxs] *= 10  # color these differently
    # # plt.imshow(windowSims, interpolation='none')
    # plt.imshow(windowSims)
    # plt.colorbar()

    # # for col in range(colSims.shape[1]):
    # bestRowIdx = -1
    # bestColIdxs = []
    # bestSum = -1
    # kVals = np.arange(2,kMax)
    # for i, row in enumerate(windowSims):
    #     optimalIdxs = sub.optimalAlignK(row, Lmin, kVals)
    #     if not len(optimalIdxs):
    #         continue
    #     # print "optimalIdxs", optimalIdxs
    #     sums = map(lambda idxs: np.sum(row[idxs]), optimalIdxs)
    #     # print "sums", sums
    #     sums = np.asarray(sums)
    #     bestSumIdx = np.argmax(sums)
    #     if sums[bestSumIdx] > bestSum:
    #         bestRowIdx = i
    #         bestColIdxs = optimalIdxs[bestSumIdx]
    #         bestSum = sums[bestSumIdx]
    # print "bestRow = ", bestRowIdx
    # print "best end locs = ", bestColIdxs

    plt.tight_layout()
    plt.show()
def main():
    """Experiment driver (iteration 3 of 3 in this file; the reachable one).

    Builds either a synthetic random walk with embedded pattern instances
    or an MSRC gesture recording (USE_MSRC), constructs a sparse multi-scale
    feature matrix (SAX-style quantization + sparse line projections via
    `rep`), then scores candidate window locations with a blurred
    soft-intersection search using nearest-enemy distance as the null
    model, and finally recovers/plots the pattern span via max-subarray.

    NOTE(review): this definition shadows the earlier `main` definitions in
    the file. Python 2 print statements and `np.float` (removed in
    NumPy >= 1.24) pin this to the old environment.
    """
    # np.random.seed(123)

    # ================================ consts for everything

    # consts for generating data
    # n = 1000
    n = 500
    # n = 300
    # length = 8
    # length = 16
    # length = 32
    # length = 50
    # nInstances = 3
    exampleLengths = [55, 60, 65]
    # exampleLengths = [60, 60, 60]
    noiseStd = .5

    # consts for algorithm
    # Lmin = max(20, length)  # only needed for optimalAlignK() spacing
    Lmin = 20  # only needed for optimalAlignK() spacing
    Lmax = 100  # loose upper bound on pattern length
    # minSim = .5
    minSim = 0.
    length = Lmin // 2
    # length = Lmin // 4
    # length = 3
    answerIdxs = None
    USE_MSRC = True
    # USE_MSRC = False

    # ================================ data

    # ------------------------ synthetic data
    # seq = synth.randconst(n, std=noiseStd)
    seq = synth.randwalk(n, std=noiseStd)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=4)
    # seq = synth.notSoRandomWalk(n, std=noiseStd, trendFilterLength=80, lpfLength=16)
    seq = embedExamples(seq, exampleLengths)
    # seq = synth.appendZeros(seq, Lmax)

    # ------------------------ msrc
    # NOTE(review): when USE_MSRC is True (the default), everything built in
    # the synthetic-data section above is overwritten here.
    if USE_MSRC:
        from ..datasets import read_msrc as msrc
        # idxs = [0]
        # idxs = [1]
        # idxs = [2]
        # idxs = [7]  # length 1500, but instances of length like 20
        # idxs = [8]  # gets owned on this one cuz patterns of length like 100
        # idxs = [9]  # missing an annotation, it appears
        idxs = [10]  # something crazy about feature rep here  # TODO fix
        # idxs = [11]  # crap cuz bad, low-variance signals
        # idxs = [12]  # has garbagey sections like [10]
        # idxs = [13]  # empty feature mat  # TODO
        # idxs = [14]
        downsampleBy = 2
        # downsampleBy = 1
        recordings = msrc.getRecordings(idxs=idxs)
        r = list(recordings)[0]
        # seq = r.data
        # seq = r.data[:, :40]
        # seq = r.data[:, 20:23]
        seq = r.data[:, 24:27]
        # seq = r.data[:, 20:27]
        print "orig seq shape", seq.shape
        seq = ar.downsampleMat(seq, rowsBy=downsampleBy)
        print "downsampled seq shape", seq.shape
        # derive scale parameters from the (downsampled) recording length
        # length = max(8, Lmin / 2)
        Lmin = len(seq) // 20
        # Lmax = len(seq) // 8
        Lmax = len(seq) // 10
        length = Lmin // 2
        # Lmax = len(seq) / 20
        # k0 = 10
        # minSim = .5
        answerIdxs = r.gestureIdxs / downsampleBy

        # print "seq shape", seq.shape
        prePadLen = Lmax - length
        # postPadLen = length - 1
        postPadLen = Lmax - length
        first = np.tile(seq[0], (prePadLen, 1))
        last = np.tile(seq[-1], (postPadLen, 1))
        seq = np.vstack((first, seq, last))  # pad with fixed val to allow all window positions
        # ^ TODO pad simMat with zeros instead--this introduces fake subseqs
        answerIdxs += prePadLen  # shift ground-truth marks to padded coords
        # seq = np.vstack((seq, np.tile(flat, (length-1, 1))))  # lets it get the last rep
        # print "seq shape", seq.shape

    # ================================ feature construction
    # dyadic subsequence lengths spanning [~Lmin, ~Lmax]
    logMaxLength = int(np.floor(np.log2(Lmax)))
    # logMaxLength = int(np.ceil(np.log2(Lmax)))
    # logMinLength = 3  # -> length 8
    # logMinLength = 4  # -> length 16
    logMinLength = int(np.floor(np.log2(Lmin)))
    lengths = np.arange(logMinLength, logMaxLength + 1)
    lengths = 2**lengths
    # lengths = [16]

    cardinality = 8
    breakpoints = rep.saxBreakpoints(cardinality)
    X = rep.multiNormalizeAndSparseQuantize(seq, lengths, breakpoints)
    # X = rep.multiSparseLineProject(seq, lengths, breakpoints, removeZeroRows=False)

    # lengths2 = np.arange(3, logMaxLength + 1)
    # lengths2 = 2 ** lengths2
    lengths2 = lengths  # TODO uncomment after debug
    # lengths2 = [8, 32]
    # breakpoints2 = rep.defaultSparseLineBreakpoints(seq, scaleHowMany=2)
    breakpoints2 = rep.defaultSparseLineBreakpoints(seq)
    X2 = rep.multiSparseLineProject(seq, lengths2, breakpoints2)
    # X2 = X2 > minSim
    X2 = X2 > 0.  # ignore correlations

    # print "shapes:"
    # print X.shape
    # print X2.shape
    X = np.vstack((X, X2))  # stack quantized + line-projected features

    # plt.figure()
    # # viz.imshowBetter(X)
    # viz.imshowBetter(X2)
    # plt.figure()
    # viz.imshowBetter(X2 > 0.)
    # plt.show()

    # print seq.shape
    # plt.figure()
    # plt.plot(seq[:,0])  # bit of pattern, but only varies between -.4 and .2
    # okay, so 1st dim is all zeros
    # variances = rep.slidingVariance(seq, 8)
    # for dim in range(len(variances)):
    #     plt.figure()
    #     plt.plot(variances[dim].flatten())
    # print variances.shape
    # variances = rep.vstack3Tensor(variances.T)
    # print variances.shape
    # plt.plot(variances)
    # plt.show()
    # return

    X = localMaxFilterSimMat(X)
    # Xbool = np.copy(X)
    featureMeans = np.mean(X, axis=1).reshape((-1, 1))
    # print featureMeans
    # NOTE(review): rarer features get higher weight (information content)
    X *= -np.log2(featureMeans)  # variable encoding costs for rows
    # X /= -np.log(featureMeans)
    # Xblur = localMaxFilterSimMat(X)  # try only maxFiltering Xblur
    Xblur = filterSimMat(X, length - 1, 'hamming', scaleFilterMethod='max1')

    # plt.figure()
    # viz.imshowBetter(X)
    # plt.figure()
    # viz.imshowBetter(Xblur)

    print "featureMat dims:", X.shape
    Xnonzeros = np.count_nonzero(X)
    print "featureMat nonzeros, total, frac = ", Xnonzeros, X.size, Xnonzeros / float(X.size)
    # plt.show()
    # return

    # ================================ plotting crap
    plt.figure()
    axSeq = plt.subplot2grid((4, 1), (0, 0))
    axSim = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    for ax in (axSeq, axSim):
        ax.autoscale(tight=True)
    axSeq.plot(seq)
    # if answerIdxs is not None:
    #     for idx in answerIdxs:
    #         viz.plotVertLine(idx, ax=axSeq)
    padLen = len(seq) - X.shape[1]
    Xpad = synth.appendZeros(X, padLen)
    axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    # im = axSim.imshow(Xpad, interpolation='nearest', aspect='auto')
    # plt.colorbar(im, cax=axSim)
    axSeq.set_title("Time Series")
    axSim.set_title("Feature Matrix")

    # plt.show()
    # return

    # ================================ science

    # ------------------------ derived stats
    kMax = int(X.shape[1] / Lmin + .5)  # most instances that could fit, given spacing Lmin
    windowLen = Lmax - length + 1
    p0 = np.mean(X)  # fraction of entries that are 1 (roughly)
    # p0 = 2 * np.mean(X)  # lambda for l0 reg based on features being bernoulli at 2 locs
    minSim = p0
    # p0 = -np.log(np.mean(Xbool))  # fraction of entries that are 1 (roughly)
    # noiseSz = p0 * X.shape[0] * windowLen  # way too hard to beat
    expectedOnesPerWindow = p0 * X.shape[0] * windowLen
    noiseSz = p0 * expectedOnesPerWindow  # num ones to begin with

    # intersections = computeIntersections(X, windowLen)
    # windowSims = np.sum(intersections, axis=2)
    # colSims = np.dot(X.T, X)
    # windowSims[i, j] = similarity of window i (sharp) vs window j (blurred);
    # the identity-filter convolution sums colSims along the diagonal.
    colSims = np.dot(X.T, Xblur)
    filt = np.zeros((windowLen, windowLen)) + np.diag(np.ones(windowLen))  # zeros except 1s on diag
    windowSims = sig.convolve2d(colSims, filt, mode='valid')
    windowVects = vectorizeWindowLocs(X, windowLen)
    windowVectsBlur = vectorizeWindowLocs(Xblur, windowLen)

    # plt.figure()
    # plt.imshow(windowSims, interpolation='nearest', aspect='auto')

    # ------------------------ find stuff

    # # Version where we look for similarities to orig seq and use nearest
    # enemy dist as M0, and use mean values instead of intersection
    #
    bsfScore = 0
    bsfLocs = None
    bsfIntersection = None
    for i, row in enumerate(windowSims):
        if i % 20 == 0:
            print("computing stuff for row {}".format(i))
        # early abandon if this location has so little stuff that no
        # intersection with it can possibly beat the best score
        if windowSims[i, i] * kMax <= bsfScore:  # highest score is kMax identical locs
            continue
        # best combination of idxs such that none are within Lmin of each other
        # validRow = row[:(-length + 1)]  # can't go past end of ts
        # idxs = sub.optimalAlignment(validRow, Lmin)
        idxs = sub.optimalAlignment(row, Lmin)  # goes past end of ts, but better
        # order idxs by descending order of associated score
        sizes = windowSims[i, idxs]
        sortedSizesOrder = np.argsort(sizes)[::-1]
        sortedIdxs = idxs[sortedSizesOrder]
        # iteratively intersect with another near neighbor, compute the
        # associated score, and check if it's better (or if we can early abandon)
        intersection = windowVects[i]
        numIdxs = len(sortedIdxs)
        nextSz = np.sum(intersection)
        # NOTE(review): np.float is an alias removed in NumPy >= 1.24;
        # fine under the old NumPy this Python 2 code targets.
        nextFilt = np.array(intersection, dtype=np.float)
        nextFiltSum = np.array(nextFilt, dtype=np.float)
        for j, idx in enumerate(sortedIdxs):
            k = j + 1
            filt = np.copy(nextFilt)
            sz = nextSz
            if k < numIdxs:
                # peek at the (k+1)-th neighbor to get the nearest-enemy size
                nextIdx = sortedIdxs[k]  # since k = j+1
                nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
                nextFiltSum += nextIntersection
                nextFilt = nextFiltSum / (k + 1)  # avg value of each feature in intersections
                # nextSz = np.sum(nextFilt)  # big even if like no intersection...
                nextSz = np.sum(nextIntersection)
                bigEnoughIntersection = nextIntersection[nextIntersection > minSim]
                nextSz = np.sum(bigEnoughIntersection)
            else:
                nextSz = sz * p0
                # nextSz = -1
            # score = gap to the nearest enemy (or noise floor), times count
            enemySz = max(nextSz, noiseSz)
            score = (sz - enemySz) * k
            if k > 1 and score > bsfScore:
                print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
                print("sortedIdxs = {}".format(str(sortedIdxs)))
                print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs])))
                print("------------------------")
                bsfScore = score
                bsfLocs = sortedIdxs[:k]
                bsfIntersection = np.copy(filt)
            # early abandon if this can't possibly beat the best score, which
            # is the case exactly when the intersection is so small that perfect
            # matches at all future locations still wouldn't be good enough
            elif sz * numIdxs <= bsfScore:
                # TODO can we actually early abandon here? next window loc
                # could increase filt, and thus score for a given loc isn't
                # necessarily non-increasing...
                # -can't abandon using this test, but pretty sure there's
                # a lower bound to be had here somewhere
                # print("early abandoning window {} at k={}".format(i, k))
                break
            elif noiseSz > nextSz:
                break

    # # # Version where we look for similarities to orig seq and use nearest
    # # enemy dist as M0, and use mean values instead of intersection,
    # # and don't sort the indices, but instead care about overlap
    # #
    # bsfScore = 0
    # bsfLocs = None
    # bsfIntersection = None
    # for i, row in enumerate(windowSims):
    #     if i % 20 == 0:
    #         print("computing stuff for row {}".format(i))
    #     # early abandon if this location has so little stuff that no
    #     # intersection with it can possibly beat the best score
    #     if windowSims[i,i] * kMax <= bsfScore:  # highest score is kMax identical locs
    #         continue
    #     # best combination of idxs such that none are within Lmin of each other
    #     # validRow = row[:(-length + 1)]  # can't go past end of ts
    #     # idxs = sub.optimalAlignment(validRow, Lmin)
    #     idxs = sub.optimalAlignment(row, Lmin)  # goes past end of ts, but better
    #     # order idxs by descending order of associated score
    #     sizes = windowSims[i, idxs]
    #     sortedSizesOrder = np.argsort(sizes)[::-1]
    #     sortedIdxs = idxs[sortedSizesOrder]
    #     # iteratively intersect with another near neighbor, compute the
    #     # associated score, and check if it's better (or if we can early abandon)
    #     intersection = windowVects[i]
    #     numIdxs = len(sortedIdxs)
    #     nextSz = np.sum(intersection)
    #     nextFilt = np.array(intersection, dtype=np.float)
    #     nextFiltSum = np.array(nextFilt, dtype=np.float)
    #     for j, idx in enumerate(sortedIdxs):
    #         k = j + 1
    #         filt = np.copy(nextFilt)
    #         sz = nextSz
    #         if k < numIdxs:
    #             nextIdx = sortedIdxs[k]  # since k = j+1
    #             nextIntersection = np.minimum(filt, windowVectsBlur[nextIdx])
    #             nextFiltSum += nextIntersection
    #             nextFilt = nextFiltSum / (k+1)  # avg value of each feature in intersections
    #             # nextSz = np.sum(nextFilt)  # big even if like no intersection...
    #             nextSz = np.sum(nextIntersection)
    #             bigEnoughIntersection = nextIntersection[nextIntersection > minSim]
    #             nextSz = np.sum(bigEnoughIntersection)
    #         else:
    #             nextSz = sz * p0
    #         score = (sz - nextSz) * k
    #         if k > 1 and score > bsfScore:
    #             print("window {0}, k={1}, score={2} is the new best!".format(i, k, score))
    #             print("sortedIdxs = {}".format(str(sortedIdxs)))
    #             print("sortedIdxScores = {}".format(str(windowSims[i, sortedIdxs])))
    #             print("------------------------")
    #             bsfScore = score
    #             bsfLocs = sortedIdxs[:k]
    #             bsfIntersection = np.copy(filt)
    #         # early abandon if this can't possibly beat the best score, which
    #         # is the case exactly when the intersection is so small that perfect
    #         # matches at all future locations still wouldn't be good enough
    #         elif sz * numIdxs <= bsfScore:
    #             # TODO can we actually early abandon here? next window loc
    #             # could increase filt, and thus score for a given loc isn't
    #             # necessarily non-increasing...
    #             # -can't abandon using this test, but pretty sure there's
    #             # a lower bound to be had here somewhere
    #             # print("early abandoning window {} at k={}".format(i, k))
    #             break

    # ------------------------ recover original ts
    # zero sub-threshold features, fold the winning filter back into
    # per-column sums, subtract the noise floor, and take the max subarray
    # as the recovered pattern span
    bsfIntersection *= bsfIntersection >= minSim
    bsfIntersectionWindow = bsfIntersection.reshape((-1, windowLen))
    sums = np.sum(bsfIntersectionWindow, axis=0)
    kBest = len(bsfLocs)
    p0 = np.power(p0, kBest)
    # expectedOnesPerCol = p0 * X.shape[1] * 2
    # expectedOnesPerCol = p0 * X.shape[1]
    expectedOnesPerCol = p0 * X.shape[0]
    sums -= expectedOnesPerCol
    # plt.figure()
    # plt.plot(sums)
    start, end, _ = maxSubarray(sums)
    # patStart, patEnd = start, end + 1 + length
    patStart, patEnd = start, end + 1
    # patStart, patEnd = start + length // 2, end + 1 + length

    # ================================ show output
    print "bestScore = {}".format(bsfScore)
    print "bestLocations = {}".format(str(bsfLocs))
    for idx in bsfLocs:
        viz.plotRect(axSim, idx, idx + windowLen)

    # print bsfIntersectionWindow.shape
    # print sums.shape
    # plt.plot(sums)
    #
    # viz.plotRect(plt.gca(), start, end + 1)
    for idx in bsfLocs:
        viz.plotRect(axSeq, idx + patStart, idx + patEnd)
    if answerIdxs is not None:
        for idx in answerIdxs:
            viz.plotVertLine(idx, ax=axSeq)

    plt.figure()
    plt.imshow(bsfIntersectionWindow, interpolation='nearest', aspect='auto')

    plt.tight_layout()
    plt.show()