def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1, padBothSides=True, **generalizeKwargs):
    """Pick seed windows in `seq`, generalize them, and extract instance bounds.

    The function (1) optionally zero-pads `X` and `Xblur` on both sides,
    (2) generates candidate seed start indices with `generateSeedsAlgo`,
    (3) hands the valid seeds to `findInstancesUsingSeedLocs`, and
    (4) converts the result into start/end indices via `extractTrueLocs`.
    Progress and timing information is printed to stdout.

    Args:
        seq: the time series. Only `len(seq)` is used, except by the 'walk'
            algo, which indexes it as `seq[:, dim]` (so it must be 2D there),
            and the 'pair' algo, which passes it to `findMotifOfLengthFast`.
        X, Xblur: 2D arrays whose second axis is indexed by seed position
            (presumably a feature matrix and its blurred version -- TODO
            confirm against callers).
        Lmin, Lmax: length bounds; `Lmax` determines the seed window length
            and `Lmin` the 'walk' scoring lengths and seed spacing.
        Lfilt: forwarded unchanged to `findInstancesUsingSeedLocs`.
        generateSeedsAlgo: one of 'pair', 'all', 'random', 'walk'.
        generalizeSeedsAlgo: forwarded to `findInstancesUsingSeedLocs`.
        extractTrueLocsAlgo: forwarded to `extractTrueLocs`.
        generateSeedsStep: fraction of `Lmax` used as the shift step between
            seeds; also determines the number of shifts.
        padBothSides: if True, pad `X` and `Xblur` with
            `(len(seq) - X.shape[1]) // 2` zero columns on each side.
        **generalizeKwargs: extra keyword arguments forwarded to
            `findInstancesUsingSeedLocs`; 'windowLen' is overwritten.

    Returns:
        Tuple `(startIdxs, endIdxs, bsfFilt)`: the two values returned by
        `extractTrueLocs` and the filter returned by
        `findInstancesUsingSeedLocs`.

    Raises:
        NotImplementedError: if `generateSeedsAlgo` is not recognized.
    """
    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    tStartSeed = time.clock()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # single-argument print calls emit the same text under the Python 2
    # print statement and the Python 3 print function
    print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print("seedIdxs from motif:  {}".format(seedIdxs))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug
    elif generateSeedsAlgo == 'random':
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random
        # walks when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(
                    dimData, Lmin // divideBy, std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        # Disqualify every index within Lmin of bestIdx, *including* bestIdx
        # itself, so the second seed is a different region. (Previously the
        # masked slice ended just before bestIdx, so the second argmax
        # usually returned bestIdx again.) -inf rather than -1 so the mask
        # also holds if the scores happen to be negative.
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), bestIdx + Lmin + 1)
        scores[start:end] = -np.inf
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Only algos 'pair', 'all', 'random', and 'walk' supported to "
            "generate seeds; got unrecognized algo {}".format(
                generateSeedsAlgo))

    # compute start and end indices of seeds to try, dropping any seed whose
    # window would start before the data or run past its end
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
        seedStartIdxs))
    print("learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq))))

    tEndSeed = time.clock()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo, **generalizeKwargs)

    startIdxs, endIdxs = extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = time.clock()
    print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

    return startIdxs, endIdxs, bsfFilt
def neighborSims1D(seq, length, numNeighbors=100, samplingAlgo='walk',
                   similarityAlgo='meanOnly', maxDist=.25,
                   localMaxFilter=False, spacedMaxFilter=False,
                   tryNumNeighbors=-1, **sink):
    """Compute similarities between sampled subsequences and all subsequences.

    `seq` is flattened and cut into sliding windows of `length` samples.
    A set of windows ("neighbors") is sampled without replacement, with
    probabilities given by `samplingAlgo`, and every window is compared to
    each neighbor after subtracting each window's own mean.

    Args:
        seq: array holding the 1D series (it is flattened first).
        length: window length in samples.
        numNeighbors: number of neighbors to return; values below 1 or above
            the number of windows mean "use every window".
        samplingAlgo: how sampling weights are computed: 'std' or 'var'
            (per-window standard deviation / variance), 'unif' (uniform), or
            'walk' (scores from `windowScoresRandWalk`).
        similarityAlgo: only 'meanOnly' is implemented: mean squared
            difference divided by the neighbor's variance; distances above
            `maxDist` become zero similarity, otherwise similarity is
            `1 - distance`.
        maxDist: distance cutoff described above.
        localMaxFilter: if True, keep only similarities at the indices
            returned by `ar.idxsOfRelativeExtrema(..., maxima=True)`.
        spacedMaxFilter: if True (and `localMaxFilter` is False), keep only
            similarities at the indices returned by
            `nonOverlappingMaxima(..., length // 2)`.
        tryNumNeighbors: number of neighbors to evaluate before keeping the
            best `numNeighbors`; clamped to
            `[numNeighbors, number of windows]`.
        **sink: ignored; absorbs unrelated keyword arguments.

    Returns:
        2D array of shape (number of windows, number of neighbors kept).
        Columns for neighbors whose variance is below 1e-4 stay all zero.

    Raises:
        ValueError: if `samplingAlgo` or `similarityAlgo` is unrecognized.
    """
    seq = seq.flatten()
    X = window.sliding_window_1D(seq, length)
    numSubseqs = X.shape[0]

    if numNeighbors < 1 or numNeighbors > numSubseqs:
        numNeighbors = numSubseqs

    if samplingAlgo == 'std':
        probs = np.std(X, axis=1)
    elif samplingAlgo == 'var':
        probs = np.var(X, axis=1)
    elif samplingAlgo == 'unif':
        probs = np.ones(numSubseqs)
    elif samplingAlgo == 'walk':
        probs = windowScoresRandWalk(seq, length)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    tryNumNeighbors = max(tryNumNeighbors, numNeighbors)
    tryNumNeighbors = min(tryNumNeighbors, numSubseqs)

    # Normalize the sampling weights. If they sum to zero (e.g., a constant
    # sequence under 'std'/'var') or to a non-finite value, dividing would
    # produce NaN/inf probabilities and the sampler would fail, so fall back
    # to uniform sampling in exactly those cases.
    total = np.sum(probs)
    if total == 0 or not np.isfinite(total):
        probs = np.ones(numSubseqs) / float(max(numSubseqs, 1))
    else:
        # out-of-place float division: does not mutate the helper's array
        # and does not depend on its dtype
        probs = probs / float(total)

    # select random subseqs
    # NOTE(review): if randChoice is numpy.random.choice, sampling without
    # replacement still fails when fewer than tryNumNeighbors weights are
    # nonzero -- confirm whether that case needs handling.
    allIdxs = np.arange(numSubseqs)
    startIdxs = randChoice(allIdxs, tryNumNeighbors, replace=False, p=probs)
    neighbors = X[startIdxs]

    # mean normalize all subseqs
    X = X - np.mean(X, axis=1, keepdims=True)
    neighbors = neighbors - np.mean(neighbors, axis=1, keepdims=True)

    # one row per evaluated neighbor; rows of skipped neighbors stay zero
    sims = np.zeros((tryNumNeighbors, numSubseqs))

    if similarityAlgo == 'meanOnly':
        for i, neighbor in enumerate(neighbors):
            variance = np.var(neighbor)
            if variance < .0001:
                # (nearly) flat neighbor: dividing by its variance would
                # blow up, so leave its row as zeros
                continue

            diffs = X - neighbor
            dists = np.sum(diffs * diffs, axis=1) / length
            dists /= variance  # would be within [0, 2] if znormed

            # anything farther than maxDist gets similarity exactly 0
            dists[dists > maxDist] = np.inf
            neighborSims = np.maximum(0, 1. - dists)

            if localMaxFilter:
                idxs = ar.idxsOfRelativeExtrema(
                    neighborSims.ravel(), maxima=True)
                sims[i, idxs] = neighborSims[idxs]
            elif spacedMaxFilter:
                idxs = nonOverlappingMaxima(neighborSims, length // 2)
                sims[i, idxs] = neighborSims[idxs]
            else:
                sims[i] = neighborSims
    else:
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    if tryNumNeighbors > numNeighbors:  # need to remove some neighbors
        # greedily take rows with most total similarity, but only counting
        # trivial matches once
        scores = np.zeros(len(sims))
        for i, row in enumerate(sims):
            maximaIdxs = nonOverlappingMaxima(row, length // 2)
            scores[i] = np.sum(row[maximaIdxs])
        sortIdxs = np.argsort(scores)[::-1]
        sims = sims[sortIdxs[:numNeighbors]]

    return sims.T
def learnFF(seq, X, Xblur, Lmin, Lmax, Lfilt,
            generateSeedsAlgo=DEFAULT_SEEDS_ALGO,
            generalizeSeedsAlgo=DEFAULT_GENERALIZE_ALGO,
            extractTrueLocsAlgo=DEFAULT_EXTRACT_LOCS_ALGO,
            generateSeedsStep=.1, padBothSides=True, **generalizeKwargs):
    """Pick seed windows in `seq`, generalize them, and extract instance bounds.

    The function (1) optionally zero-pads `X` and `Xblur` on both sides,
    (2) generates candidate seed start indices with `generateSeedsAlgo`,
    (3) hands the valid seeds to `findInstancesUsingSeedLocs`, and
    (4) converts the result into start/end indices via `extractTrueLocs`.
    Progress and timing information is printed to stdout.

    Args:
        seq: the time series. Only `len(seq)` is used, except by the 'walk'
            algo, which indexes it as `seq[:, dim]` (so it must be 2D there),
            and the 'pair' algo, which passes it to `findMotifOfLengthFast`.
        X, Xblur: 2D arrays whose second axis is indexed by seed position
            (presumably a feature matrix and its blurred version -- TODO
            confirm against callers).
        Lmin, Lmax: length bounds; `Lmax` determines the seed window length
            and `Lmin` the 'walk' scoring lengths and seed spacing.
        Lfilt: forwarded unchanged to `findInstancesUsingSeedLocs`.
        generateSeedsAlgo: one of 'pair', 'all', 'random', 'walk'.
        generalizeSeedsAlgo: forwarded to `findInstancesUsingSeedLocs`.
        extractTrueLocsAlgo: forwarded to `extractTrueLocs`.
        generateSeedsStep: fraction of `Lmax` used as the shift step between
            seeds; also determines the number of shifts.
        padBothSides: if True, pad `X` and `Xblur` with
            `(len(seq) - X.shape[1]) // 2` zero columns on each side.
        **generalizeKwargs: extra keyword arguments forwarded to
            `findInstancesUsingSeedLocs`; 'windowLen' is overwritten.

    Returns:
        Tuple `(startIdxs, endIdxs, bsfFilt)`: the two values returned by
        `extractTrueLocs` and the filter returned by
        `findInstancesUsingSeedLocs`.

    Raises:
        NotImplementedError: if `generateSeedsAlgo` is not recognized.
    """
    padLen = (len(seq) - X.shape[1]) // 2
    if padBothSides:
        X = ar.addZeroCols(X, padLen, prepend=True)
        X = ar.addZeroCols(X, padLen, prepend=False)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=True)
        Xblur = ar.addZeroCols(Xblur, padLen, prepend=False)

    tStartSeed = time.clock()

    # find seeds; i.e., candidate instance indices from which to generalize
    numShifts = int(1. / generateSeedsStep) + 1
    stepLen = int(Lmax * generateSeedsStep)
    windowLen = Lmax + stepLen

    # single-argument print calls emit the same text under the Python 2
    # print statement and the Python 3 print function
    print("learnFF(): stepLen, numShifts {} {}".format(stepLen, numShifts))

    if generateSeedsAlgo == 'pair':
        searchLen = (Lmin + Lmax) // 2
        motif = findMotifOfLengthFast([seq], searchLen)
        seedIdxs = [motif.idx1, motif.idx2]
        print("seedIdxs from motif:  {}".format(seedIdxs))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'all':
        seedIdxs = np.arange(X.shape[1] - windowLen)  # TODO remove after debug
    elif generateSeedsAlgo == 'random':
        seedIdxs = list(np.random.choice(np.arange(len(seq) - Lmax), 2))
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    elif generateSeedsAlgo == 'walk':
        # score all subseqs based on how much they don't look like random
        # walks when examined using different sliding window lengths
        scores = np.zeros(len(seq))
        for dim in range(seq.shape[1]):
            # compute these just once, not once per length
            dimData = seq[:, dim].ravel()
            diffs = dimData[1:] - dimData[:-1]
            std = np.std(diffs)
            for divideBy in [1, 2, 4, 8]:
                partialScores = windowScoresRandWalk(
                    dimData, Lmin // divideBy, std=std)
                scores[:len(partialScores)] += partialScores

        # figure out optimal seeds based on scores of all subseqs
        bestIdx = np.argmax(scores)
        # Disqualify every index within Lmin of bestIdx, *including* bestIdx
        # itself, so the second seed is a different region. (Previously the
        # masked slice ended just before bestIdx, so the second argmax
        # usually returned bestIdx again.) -inf rather than -1 so the mask
        # also holds if the scores happen to be negative.
        start = max(0, bestIdx - Lmin)
        end = min(len(scores), bestIdx + Lmin + 1)
        scores[start:end] = -np.inf
        secondBestIdx = np.argmax(scores)

        seedIdxs = [bestIdx, secondBestIdx]
        seedIdxs = computeAllSeedIdxsFromPair(seedIdxs, numShifts, stepLen)
    else:
        raise NotImplementedError(
            "Only algos 'pair', 'all', 'random', and 'walk' supported to "
            "generate seeds; got unrecognized algo {}".format(
                generateSeedsAlgo))

    # compute start and end indices of seeds to try, dropping any seed whose
    # window would start before the data or run past its end
    seedStartIdxs = np.sort(np.array(seedIdxs))
    seedStartIdxs = seedStartIdxs[seedStartIdxs >= 0]
    seedStartIdxs = seedStartIdxs[seedStartIdxs < X.shape[1] - windowLen]
    seedEndIdxs = seedStartIdxs + windowLen

    print("learnFF(): seedIdxs after removing invalid idxs:  {}".format(
        seedStartIdxs))
    print("learnFF(): fraction of idxs used as seeds: {}".format(
        len(seedStartIdxs) / float(len(seq))))

    tEndSeed = time.clock()

    generalizeKwargs['windowLen'] = windowLen  # TODO remove after prototype

    bsfScore, bsfLocs, bsfFilt = findInstancesUsingSeedLocs(
        X, Xblur, seedStartIdxs, seedEndIdxs, Lmin, Lmax, Lfilt,
        generalizeSeedsAlgo=generalizeSeedsAlgo, **generalizeKwargs)

    startIdxs, endIdxs = extractTrueLocs(
        X, Xblur, bsfLocs, bsfFilt, windowLen, Lmin, Lmax,
        extractTrueLocsAlgo=extractTrueLocsAlgo)

    tEndFF = time.clock()
    print("learnFF(): seconds to find seeds, locs, total =\n\t{:.3f}\t{:.3f}\t{:.3f}".format(
        tEndSeed - tStartSeed, tEndFF - tEndSeed, tEndFF - tStartSeed))

    return startIdxs, endIdxs, bsfFilt
def neighborSims1D(seq, length, numNeighbors=100, samplingAlgo='walk',
                   similarityAlgo='meanOnly', maxDist=.25,
                   localMaxFilter=False, spacedMaxFilter=False,
                   tryNumNeighbors=-1, **sink):
    """Compute similarities between sampled subsequences and all subsequences.

    `seq` is flattened and cut into sliding windows of `length` samples.
    A set of windows ("neighbors") is sampled without replacement, with
    probabilities given by `samplingAlgo`, and every window is compared to
    each neighbor after subtracting each window's own mean.

    Args:
        seq: array holding the 1D series (it is flattened first).
        length: window length in samples.
        numNeighbors: number of neighbors to return; values below 1 or above
            the number of windows mean "use every window".
        samplingAlgo: how sampling weights are computed: 'std' or 'var'
            (per-window standard deviation / variance), 'unif' (uniform), or
            'walk' (scores from `windowScoresRandWalk`).
        similarityAlgo: only 'meanOnly' is implemented: mean squared
            difference divided by the neighbor's variance; distances above
            `maxDist` become zero similarity, otherwise similarity is
            `1 - distance`.
        maxDist: distance cutoff described above.
        localMaxFilter: if True, keep only similarities at the indices
            returned by `ar.idxsOfRelativeExtrema(..., maxima=True)`.
        spacedMaxFilter: if True (and `localMaxFilter` is False), keep only
            similarities at the indices returned by
            `nonOverlappingMaxima(..., length // 2)`.
        tryNumNeighbors: number of neighbors to evaluate before keeping the
            best `numNeighbors`; clamped to
            `[numNeighbors, number of windows]`.
        **sink: ignored; absorbs unrelated keyword arguments.

    Returns:
        2D array of shape (number of windows, number of neighbors kept).
        Columns for neighbors whose variance is below 1e-4 stay all zero.

    Raises:
        ValueError: if `samplingAlgo` or `similarityAlgo` is unrecognized.
    """
    seq = seq.flatten()
    X = window.sliding_window_1D(seq, length)
    numSubseqs = X.shape[0]

    if numNeighbors < 1 or numNeighbors > numSubseqs:
        numNeighbors = numSubseqs

    if samplingAlgo == 'std':
        probs = np.std(X, axis=1)
    elif samplingAlgo == 'var':
        probs = np.var(X, axis=1)
    elif samplingAlgo == 'unif':
        probs = np.ones(numSubseqs)
    elif samplingAlgo == 'walk':
        probs = windowScoresRandWalk(seq, length)
    else:
        raise ValueError(
            "Unrecognized sampling algorithm {}".format(samplingAlgo))

    # must assess at least as many subseqs as we want to return, and no more
    # than the largest number possible
    tryNumNeighbors = max(tryNumNeighbors, numNeighbors)
    tryNumNeighbors = min(tryNumNeighbors, numSubseqs)

    # Normalize the sampling weights. If they sum to zero (e.g., a constant
    # sequence under 'std'/'var') or to a non-finite value, dividing would
    # produce NaN/inf probabilities and the sampler would fail, so fall back
    # to uniform sampling in exactly those cases.
    total = np.sum(probs)
    if total == 0 or not np.isfinite(total):
        probs = np.ones(numSubseqs) / float(max(numSubseqs, 1))
    else:
        # out-of-place float division: does not mutate the helper's array
        # and does not depend on its dtype
        probs = probs / float(total)

    # select random subseqs
    # NOTE(review): if randChoice is numpy.random.choice, sampling without
    # replacement still fails when fewer than tryNumNeighbors weights are
    # nonzero -- confirm whether that case needs handling.
    allIdxs = np.arange(numSubseqs)
    startIdxs = randChoice(allIdxs, tryNumNeighbors, replace=False, p=probs)
    neighbors = X[startIdxs]

    # mean normalize all subseqs
    X = X - np.mean(X, axis=1, keepdims=True)
    neighbors = neighbors - np.mean(neighbors, axis=1, keepdims=True)

    # one row per evaluated neighbor; rows of skipped neighbors stay zero
    sims = np.zeros((tryNumNeighbors, numSubseqs))

    if similarityAlgo == 'meanOnly':
        for i, neighbor in enumerate(neighbors):
            variance = np.var(neighbor)
            if variance < .0001:
                # (nearly) flat neighbor: dividing by its variance would
                # blow up, so leave its row as zeros
                continue

            diffs = X - neighbor
            dists = np.sum(diffs * diffs, axis=1) / length
            dists /= variance  # would be within [0, 2] if znormed

            # anything farther than maxDist gets similarity exactly 0
            dists[dists > maxDist] = np.inf
            neighborSims = np.maximum(0, 1. - dists)

            if localMaxFilter:
                idxs = ar.idxsOfRelativeExtrema(
                    neighborSims.ravel(), maxima=True)
                sims[i, idxs] = neighborSims[idxs]
            elif spacedMaxFilter:
                idxs = nonOverlappingMaxima(neighborSims, length // 2)
                sims[i, idxs] = neighborSims[idxs]
            else:
                sims[i] = neighborSims
    else:
        raise ValueError(
            "Unrecognized similarity algorithm {}".format(similarityAlgo))

    if tryNumNeighbors > numNeighbors:  # need to remove some neighbors
        # greedily take rows with most total similarity, but only counting
        # trivial matches once
        scores = np.zeros(len(sims))
        for i, row in enumerate(sims):
            maximaIdxs = nonOverlappingMaxima(row, length // 2)
            scores[i] = np.sum(row[maximaIdxs])
        sortIdxs = np.argsort(scores)[::-1]
        sims = sims[sortIdxs[:numNeighbors]]

    return sims.T