def test(self):
    # Assumed module-level imports (not shown in this excerpt): numpy as np, sys,
    # perf_counter from time, accuracy_score from sklearn.metrics, deepcopy from copy,
    # plus the project modules gu, SAX, BOP and the FStat_2 helper.
    self.testTimePerTs = 0
    self.preLabels = np.zeros(self.numTest, dtype='uint32')
    for tsId in range(self.numTest):
        # Lightweight progress reporting.
        if tsId % 10 == 0:
            print(tsId, end=', ')
            sys.stdout.flush()
        if tsId % 100 == 0:
            print()
            sys.stdout.flush()

        # ts = scale(self.testTss[tsId])
        ts = np.array(self.testTss[tsId])
        tsLen = self.testLens[tsId]

        tic = perf_counter()
        # Prefix sums of the series and of its squares; every selected
        # (window length, word size) configuration reuses them.
        cumSums = gu.getCumSums(ts)
        cumSums_2 = gu.getCumSums(ts * ts)

        votes = np.zeros(self.numCls, dtype='uint32')
        for methodId, (bop, selectedWords, selectedWordInfo, sigma2Centroids) in self.allInfo.items():
            # If the series is shorter than the window, extend the prefix sums by
            # repeating their last value (equivalent to zero-padding the series).
            if bop.discretizer.winLen > tsLen:
                curCumSums = np.concatenate(
                    (cumSums, cumSums[-1] * np.ones(bop.discretizer.winLen - tsLen)))
                curCumSums_2 = np.concatenate(
                    (cumSums_2, cumSums_2[-1] * np.ones(bop.discretizer.winLen - tsLen)))
            else:
                curCumSums = cumSums
                curCumSums_2 = cumSums_2

            transformedTs = bop.discretizer.transformTsFromCumSums(curCumSums, curCumSums_2)
            discretizedTs = bop.discretizer.discretizeTransformedTs(transformedTs)
            bagTs = bop.getBOP_DiscretizedTs(discretizedTs)

            # simId 0: Euclidean distance to per-class mean word counts;
            # simId 1: one minus squared cosine similarity to per-class tf-idf centroids.
            for simId, methodIds in enumerate(self.allMethodIds):
                if methodId not in methodIds:
                    continue
                curSelectedWords = selectedWords[simId]
                dists = np.zeros(self.numCls)
                if simId == 1:
                    sigma2Ts = 0
                    sigmaProd = np.zeros(self.numCls)
                for word in curSelectedWords:
                    infoByCls = selectedWordInfo[word][simId]
                    cnt = bagTs.get(word, 0)
                    if simId == 0:  # Euclidean distance
                        dists += (cnt - infoByCls) ** 2
                    else:           # cosine similarity on tf-weighted counts
                        tf = 0 if cnt == 0 else 1 + np.log10(cnt)
                        sigma2Ts += tf ** 2
                        sigmaProd += tf * infoByCls
                if simId == 1:
                    divide = sigma2Ts * sigma2Centroids
                    divide[np.where(divide == 0)] = -1
                    dists = 1 - sigmaProd ** 2 / divide
                # Each selected configuration votes for its nearest class.
                preLabel = np.argmin(dists)
                votes[preLabel] += 1

        self.preLabels[tsId] = np.argmax(votes)
        toc = perf_counter()
        self.testTimePerTs += toc - tic

    self.accuracy = accuracy_score(self.testLabels, self.preLabels)
    self.testTimePerTs /= self.numTest
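# Illustrative sketch (not part of the classifier): how a single configuration in
# test() scores one test bag of words. `bag` maps each word to its count in the
# test series; `meanCntsByCls` / `tfIdfsByCls` play the role of the per-class
# vectors stored in selectedWordInfo, and `sigma2Centroids` of the per-class
# squared tf-idf norms. All names and shapes here are hypothetical; the real
# layout depends on crossValidation().
def _sketch_config_distances(bag, selectedWords, meanCntsByCls, tfIdfsByCls,
                             sigma2Centroids, numCls):
    import numpy as np
    distsEd = np.zeros(numCls)          # simId == 0: Euclidean to class mean counts
    sigma2Ts = 0.0                      # simId == 1: cosine on tf-weighted counts
    sigmaProd = np.zeros(numCls)
    for w in selectedWords:
        cnt = bag.get(w, 0)
        distsEd += (cnt - meanCntsByCls[w]) ** 2
        tf = 0.0 if cnt == 0 else 1 + np.log10(cnt)
        sigma2Ts += tf ** 2
        sigmaProd += tf * tfIdfsByCls[w]
    divide = sigma2Ts * sigma2Centroids
    divide[divide == 0] = -1            # guard against division by zero
    distsCos = 1 - sigmaProd ** 2 / divide
    return distsEd.argmin(), distsCos.argmin()
# Toy usage:
# _sketch_config_distances({'abc': 2}, {'abc'}, {'abc': np.array([1.0, 3.0])},
#                          {'abc': np.array([0.5, 0.2])}, np.array([0.3, 0.1]), 2)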
def getAllCumSums(data):
    cumSums = gu.getCumSums(data)
    cumSums_2 = gu.getCumSums(data * data)
    weightedCumSums = gu.getCumSums(data * np.arange(data.shape[-1]))
    return (cumSums, cumSums_2, weightedCumSums)
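# Illustrative sketch (assumption, not the project's gu module): if getCumSums
# returns a prefix-sum array with a leading zero, then the mean and standard
# deviation of every length-w window follow in O(1) per window from cumSums and
# cumSums_2, which is what the transform*FromCumSums calls presumably exploit.
# The index-weighted prefix sum in getAllCumSums would analogously give per-window
# regression slopes in O(1).
def _sketch_window_stats(ts, w):
    import numpy as np
    cumSums = np.concatenate(([0.0], np.cumsum(ts)))        # assumed getCumSums behavior
    cumSums_2 = np.concatenate(([0.0], np.cumsum(ts * ts)))
    sums = cumSums[w:] - cumSums[:-w]                        # window sums
    sums_2 = cumSums_2[w:] - cumSums_2[:-w]                  # window sums of squares
    means = sums / w
    variances = np.maximum(sums_2 / w - means ** 2, 0.0)     # clamp tiny negatives
    return means, np.sqrt(variances)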
def train(self):
    # Zero-pad all training series to the maximum length so prefix sums can be
    # computed for the whole set in one vectorized call.
    trainTss_padded = []
    maxTsLen = max(self.trainLens)
    for i in range(self.numTrain):
        ts = np.array(self.trainTss[i])
        tsLen = self.trainLens[i]
        # zTs = scale(np.array(ts))
        # zTs = np.concatenate((zTs, np.zeros(maxTsLen - tsLen)))
        # trainTss_padded.append(zTs)
        ts = np.concatenate((ts, np.zeros(maxTsLen - tsLen)))
        trainTss_padded.append(ts)
    trainTss_padded = np.array(trainTss_padded)

    # Derive the window-length grid from the shortest training series.
    self.minWinLen = np.maximum(
        int(np.around(self.minTrainLen * self.minWinRatio)), self.minWinLen, dtype='int32')
    self.maxWinLen = np.minimum(
        int(np.around(self.minTrainLen * self.maxWinRatio)), self.minTrainLen, dtype='int32')
    self.winLenStep = np.maximum(
        int(np.around(self.minTrainLen * self.winRatioStep)), 1, dtype='int32')
    if self.minTrainLen < self.minWinLen:
        self.minWinLen = self.minTrainLen
    if self.minTrainLen < self.maxWinLen:
        self.maxWinLen = self.minTrainLen
    # numBitsWinLen = bu.numBits(np.ceil((self.maxWinLen - self.minWinLen) / self.winLenStep) + 1)
    # numBitsWordSize = bu.numBits(np.ceil((self.maxWordSize - self.minWordSize) / self.wordSizeStep) + 1)

    tic = perf_counter()
    allCumSums = gu.getCumSums(trainTss_padded)
    allCumSums_2 = gu.getCumSums(trainTss_padded * trainTss_padded)

    all_cv1_scores = [[], []]
    # allMethodIds = [[], []]
    allInfo = []
    for wordSize in range(self.minWordSize, self.maxWordSize + 1, self.wordSizeStep):
        for winLen in range(self.minWinLen, self.maxWinLen + 1, self.winLenStep):
            discretizer = SAX.SAX(winLen, wordSize, self.card, True, True, self.binSizeTh)
            transformedTss = discretizer.transfromTssFromCumSums(
                allCumSums, allCumSums_2, self.trainLens)
            discretizedTss = discretizer.discretizeTransformedDataset_(
                transformedTss, self.trainLens, None, 'GD', 'Default')
            bop = BOP(discretizer, False)
            bagWord = bop.getWordFirstBop_DiscretizedTss(discretizedTss)

            # Keep only words with a non-zero F-statistic across classes
            # (see the illustrative FStat sketch after this method).
            words = []
            fs = []
            for word, cntTs in bagWord.items():
                feats = np.zeros(self.numTrain)
                for tsId, cnt in cntTs.items():
                    feats[tsId] = cnt
                f = FStat_2(feats, self.trainLabels, self.numCls)
                if f:
                    words.append(word)
                    fs.append(f)
            numWords = len(words)
            if numWords == 0:
                continue
            wordRanks = np.argsort(-np.array(fs))

            # Cross-validate how many top-ranked words to keep for the
            # Euclidean (ed) and cosine (cos) similarity measures.
            bestAcc_ed, bestAcc_cos, numSelected_ed, numSelected_cos, \
                meanCntsByCls, tfIdfsByCls, sigmas2Centroids = \
                self.crossValidation(numWords, words, wordRanks, bagWord)
            bestAccs = [bestAcc_ed, bestAcc_cos]
            numsSelected = np.array([numSelected_ed, numSelected_cos])
            # Process the smaller word set first so the larger one can extend it.
            simIdRange = np.argsort(numsSelected)

            selectedWordInfo = {}
            selectedWords = [None, None]
            # methodId = self.createMethodId(winLenInd, numBitsWinLen, wordSizeInd, numBitsWordSize)
            prevNumSelected = 0
            curSelectedWords = set()
            for simId in simIdRange:
                cv1_score = bestAccs[simId]
                all_cv1_scores[simId].append(cv1_score)
                # allMethodIds[simId].append(methodId)
                numSelected = numsSelected[simId]
                for i in range(prevNumSelected, numSelected):
                    idx = wordRanks[i]
                    word = words[idx]
                    curSelectedWords.add(word)
                    selectedWordInfo[word] = (meanCntsByCls[idx][:], tfIdfsByCls[idx][:])
                selectedWords[simId] = deepcopy(curSelectedWords)
                prevNumSelected = numSelected
            allInfo.append((bop, selectedWords, selectedWordInfo, sigmas2Centroids))

    # Keep the topK best-scoring configurations per similarity measure.
    self.allMethodIds = []
    allAvgAcc = np.empty(2)
    for i in range(2):
        cur_cv1_scores = np.array(all_cv1_scores[i])
        numMet = len(cur_cv1_scores)
        if numMet > self.topK:
            methodIds = np.argpartition(-cur_cv1_scores, self.topK)[:self.topK]
        else:
            methodIds = np.arange(numMet)
        self.allMethodIds.append(set(methodIds))
        allAvgAcc[i] = np.mean(cur_cv1_scores[methodIds])

    # Drop a similarity measure whose average accuracy is at or below
    # accRatio times the better measure's average accuracy.
    maxAcc = np.amax(allAvgAcc)
    for i in range(2):
        if allAvgAcc[i] <= self.accRatio * maxAcc:
            self.allMethodIds[i] = set()

    self.allInfo = {}
    for methodIds in self.allMethodIds:
        for methodId in methodIds:
            if methodId not in self.allInfo:
                self.allInfo[methodId] = allInfo[methodId]

    toc = perf_counter()
    self.trainTime = toc - tic
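# Illustrative sketch (assumption): FStat_2 used in train() is taken here to be
# the one-way ANOVA F-statistic of a word's per-series counts grouped by class,
# a common choice for bag-of-patterns feature selection. The project's actual
# FStat_2 may differ (degrees of freedom, edge cases); `labels` is assumed to be
# a NumPy integer array.
def _sketch_fstat(feats, labels, numCls):
    import numpy as np
    overallMean = feats.mean()
    between = 0.0   # between-class sum of squares
    within = 0.0    # within-class sum of squares
    for c in range(numCls):
        grp = feats[labels == c]
        if grp.size == 0:
            continue
        between += grp.size * (grp.mean() - overallMean) ** 2
        within += ((grp - grp.mean()) ** 2).sum()
    dfBetween = numCls - 1
    dfWithin = feats.size - numCls
    if within == 0 or dfBetween == 0 or dfWithin <= 0:
        return 0.0
    return (between / dfBetween) / (within / dfWithin)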