def getTagsForListOfVideoIds(youtube, videoIds):
    """Fetch tag and topic metadata for a list of YouTube video ids.

    Queries the Data API in chunks of 50 ids (the API's per-request limit
    for videos.list) and collects, per video, its snippet tags and
    topicDetails when present.

    Parameters:
        youtube: an authorized googleapiclient YouTube service object.
        videoIds: iterable of video id strings.

    Returns:
        dict mapping video title -> {'tags': [...], 'topicDetails': {...}}
        (either key may be absent when the API omits it).
        NOTE(review): keyed by title, so two videos with the same title
        overwrite each other — confirm this is intended.
    """
    videoTagData = {}
    for chunk in GeneralUtil.chunkList(videoIds, 50):
        # join once instead of quadratic `+=` concatenation and a
        # trailing-comma trim; also safe for an empty chunk
        idString = ",".join(chunk)
        request = youtube.videos().list(
            part="snippet,topicDetails",
            id=idString
        )
        response = request.execute()
        for video in response['items']:
            tagData = {}
            # both fields are optional in the API response
            if 'tags' in video['snippet']:
                tagData['tags'] = video['snippet']['tags']
            if 'topicDetails' in video:
                tagData['topicDetails'] = video['topicDetails']
            videoTagData[video['snippet']['title']] = tagData
    return videoTagData
def __init__(self, winLen, wordSize, card, meanNorm = True, stdNorm = True, binSizeTh = 3, step = -1):
    """Configure a SAX discretizer.

    Parameters:
        winLen: sliding-window length.
        wordSize: number of segments per word.
        card: alphabet cardinality.
        meanNorm / stdNorm: whether windows are mean- / std-normalized.
        binSizeTh: bin-size threshold forwarded to the base class.
        step: window step forwarded to the base class (-1 = default).
    """
    super().__init__(winLen, wordSize, card, binSizeTh, step)
    self.type = 'SAX'
    self.meanNorm = meanNorm
    self.stdNorm = stdNorm
    # with full z-normalization the window statistics are fixed
    if self.meanNorm and self.stdNorm:
        self.avg = 0
        self.stdv = 1
    # segment boundaries within a window, and the width of each segment
    self.segStarts = gu.getSegStarts(self.winLen, self.wordSize)
    self.segSizes = np.diff(self.segStarts)
def transformSub(self, cumSums, cumSums_2, wCumSums, pos): transformedSub = np.zeros(self.wordSize) #window mean and std if not (self.meanNorm or self.stdNorm): meanSub = 0 sigmaSub = 1 elif self.stdNorm: meanSub = (cumSums[pos + self.winLen] - cumSums[pos]) / self.winLen meanSub_2 = (cumSums_2[pos + self.winLen] - cumSums_2[pos]) / self.winLen varSub = meanSub_2 - meanSub * meanSub sigmaSub = np.sqrt(varSub) if varSub > 0 else 1 if not self.meanNorm: meanSub = 0 else: meanSub = (cumSums[pos + self.winLen] - cumSums[pos]) / self.winLen sigmaSub = 1 #timestamp parameters startPts = self.segStarts[:len(self.segStarts) - 1] + pos finishPts = self.segStarts[1:] + pos sum_X = gu.getAriSeqSum(startPts, finishPts - 1) mean_X = sum_X / self.segSizes mean_X2 = gu.getSumOfSquares(startPts, finishPts - 1) / self.segSizes #segment parameters # sumSegs = cumSums[self.segStarts[1 :]] - cumSums[self.segStarts[: len(self.segStarts) - 1]] sumSegs = cumSums[finishPts] - cumSums[startPts] meanSegs = (sumSegs / self.segSizes - meanSub) / sigmaSub # wCumSegs = wCumSums[self.segStarts[1 :]] - wCumSums[self.segStarts[: len(self.segStarts) - 1]] wCumSegs = wCumSums[finishPts] - wCumSums[startPts] wMeanSegs = (wCumSegs - meanSub * sum_X) / self.segSizes / sigmaSub #the coefficients slopes = (wMeanSegs - mean_X * meanSegs) / (mean_X2 - mean_X * mean_X) intercepts = meanSegs - slopes * mean_X if self.posNorm: intercepts += startPts * slopes #shift to the same starting timestamp of 0 transformedSub[0:self.wordSize - 1:2] = slopes transformedSub[1:self.wordSize:2] = intercepts return transformedSub
def infoGain_singleSplit(vals, labels, retMajorClasses = False):
    """Find the single threshold on `vals` with the best information gain.

    Sorts the values, then scans every position where the value changes,
    evaluating the information gain of splitting the labels there while
    incrementally maintaining in/out class counts.

    Parameters:
        vals: np.array of feature values.
        labels: np.array of class labels, aligned with `vals`.
        retMajorClasses: when True, also return the majority class(es) of
            the "out" partition (vals >= split point).

    Returns:
        (bestGain, splitPt) or (bestGain, splitPt, majorClasses);
        (-1, -1[, -1]) when `vals` has no distinguishing power.
    """
    #takes in np.array
    if len(np.unique(vals)) == 1:
        #no distinguishing power at all
        if retMajorClasses:
            return (-1, -1, -1)
        return (-1, -1)
    total = len(vals)
    order = np.argsort(vals)
    sortedVals = vals[order]
    sortedLabels = labels[order]
    bestGain = -1
    bestPos = -1
    uniqLabels, cOut = np.unique(sortedLabels, return_counts = True)
    numCls = len(np.unique(sortedLabels))
    # map each label to its index in the count arrays
    labelMap = {}
    for i in range(numCls):
        labelMap[uniqLabels[i]] = i
    # entropy of the full label set — the baseline the gain is measured against
    entAll = entropy(cOut, total)
    lastCVal = sortedVals[0]
    nOut = total
    nIn = 0
    cIn = np.zeros(numCls)
    for split in range(total):
        cVal = sortedVals[split]
        # only positions where the value changes are valid split points
        if lastCVal != cVal:
            gain = infoGain(cIn, cOut, entAll, total, nIn, nOut)
            # >= keeps the right-most split among ties
            if gain >= bestGain:
                bestPos = split
                bestGain = gain
            lastCVal = cVal
        # move this sample from the "out" side to the "in" side
        labelIdx = labelMap[sortedLabels[split]]
        cOut[labelIdx] -= 1
        nOut -= 1
        cIn[labelIdx] += 1
        nIn += 1
    splitPt = sortedVals[bestPos]
    if retMajorClasses:
        # majority class(es) of the partition at or above the split point
        labelsOut = sortedLabels[sortedVals >= splitPt]
        uniqLabelsOut, cOut = np.unique(labelsOut, return_counts = True)
        maxC, maxCIdx = gu.maxWithTies(cOut)
        majorClasses = uniqLabelsOut[maxCIdx]
        return bestGain, splitPt, majorClasses
    return bestGain, splitPt
def getChannelData(youtube, channelIds, part):
    """Fetch the requested `part` of channel resources for a list of ids.

    Queries the Data API in chunks of 50 ids (the per-request limit for
    channels.list).

    Parameters:
        youtube: an authorized googleapiclient YouTube service object.
        channelIds: iterable of channel id strings.
        part: comma-separated resource parts to request (e.g. "snippet").

    Returns:
        dict mapping channel id -> full channel resource item.
    """
    channelMap = {}
    for chunk in GeneralUtil.chunkList(channelIds, 50):
        # join once instead of quadratic `+=` concatenation with a
        # trailing-comma trim; also avoids shadowing the builtin `id`
        idString = ",".join(chunk)
        request = youtube.channels().list(part=part, id=idString)
        response = request.execute()
        for item in response['items']:
            channelMap[item['id']] = item
    return channelMap
def __init__(self, winLen, wordSize, card, meanNorm=True, stdNorm=True, posNorm=True, binSizeTh=3, step=-1):
    """Configure an SLA discretizer (per-segment slope + intercept words).

    Parameters:
        winLen: sliding-window length.
        wordSize: word length; rounded UP to the next even number so each
            segment can contribute both a slope and an intercept.
        card: alphabet cardinality.
        meanNorm / stdNorm: whether windows are mean- / std-normalized.
        posNorm: whether intercepts are shifted to a common start timestamp.
        binSizeTh: bin-size threshold forwarded to the base class.
        step: window step forwarded to the base class (-1 = default).
    """
    super().__init__(winLen, wordSize, card, binSizeTh, step)
    self.type = 'SLA'
    if self.wordSize % 2:
        self.wordSize += 1  # make it even so that both the slopes and the intercepts can be kept
    self.meanNorm = meanNorm
    self.stdNorm = stdNorm
    self.posNorm = posNorm
    # BUG FIX: `/ 2` is float division in Python 3 and would pass a float
    # segment count to getSegStarts; `// 2` is exact here because
    # self.wordSize was just forced even.
    self.segStarts = gu.getSegStarts(self.winLen, self.wordSize // 2)
    self.segSizes = self.segStarts[1:] - self.segStarts[:len(self.segStarts) - 1]
def getChannelTopics(youtube, channelIds):
    """Fetch topic-category URLs for a list of YouTube channel ids.

    Queries the Data API in chunks of 50 ids (the per-request limit for
    channels.list).

    Parameters:
        youtube: an authorized googleapiclient YouTube service object.
        channelIds: iterable of channel id strings.

    Returns:
        dict mapping channel title -> list of topicCategories URLs.
        Channels whose response lacks topicDetails are skipped.
    """
    channelTopicMap = {}
    for chunk in GeneralUtil.chunkList(channelIds, 50):
        # join once instead of quadratic `+=` concatenation with a
        # trailing-comma trim; also avoids shadowing the builtin `id`
        idString = ",".join(chunk)
        request = youtube.channels().list(
            part="snippet,topicDetails",
            id=idString
        )
        response = request.execute()
        for item in response['items']:
            # topicDetails is optional in the API response (the video
            # helper already guards for this); skip instead of raising
            topicDetails = item.get('topicDetails', {})
            if 'topicCategories' in topicDetails:
                channelTopicMap[item['snippet']['title']] = topicDetails['topicCategories']
    return channelTopicMap
def getAllMeanX2(tsLen, segSize):
    """Mean of squared timestamp indices for every length-`segSize`
    window position in a series of length `tsLen`.

    Returns:
        np.ndarray with one entry per window start (tsLen - segSize + 1).
    """
    segStartPts = np.arange(tsLen - segSize + 1)
    segEndPts = segStartPts + segSize - 1
    return gu.getSumOfSquares(segStartPts, segEndPts) / segSize
def getAllSumX(tsLen, segSize):
    """Sum of timestamp indices for every length-`segSize` window position
    in a series of length `tsLen`.

    Shifting a window right by one adds `segSize` to its index sum, so all
    sums form an arithmetic progression starting at the first window's sum.

    Returns:
        np.ndarray with one entry per window start (tsLen - segSize + 1).
    """
    firstWindowSum = gu.getAriSeqSum(0, segSize - 1, 1)
    numWindows = tsLen - segSize + 1
    return firstWindowSum + segSize * np.arange(numWindows)
def getAllCumSums(data):
    """Compute the three cumulative-sum arrays used by the transforms.

    Returns:
        (plain cumulative sums,
         cumulative sums of the squared values,
         cumulative sums of values weighted by their timestamp index).
    """
    timestamps = np.arange(data.shape[-1])
    plainSums = gu.getCumSums(data)
    squaredSums = gu.getCumSums(data * data)
    weightedSums = gu.getCumSums(data * timestamps)
    return (plainSums, squaredSums, weightedSums)
def getCumSums_Ts(self, ts):
    """Cumulative sums for a single 1-D time series; delegates to gu."""
    cumSums = gu.getCumSums_1D(ts)
    return cumSums
def test(self):
    """Classify every test series with the trained ensemble and record
    accuracy and the average per-series prediction time.

    For each test series, every selected (discretizer, word-set) method
    predicts a label — Euclidean distance to class-mean count vectors
    (simId 0) or cosine similarity over tf-idf vectors (simId 1) — and the
    final label is chosen by majority vote.

    Sets: self.preLabels, self.accuracy, self.testTimePerTs.
    """
    self.testTimePerTs = 0
    self.preLabels = np.zeros(self.numTest, dtype='uint32')
    for tsId in range(self.numTest):
        # lightweight progress reporting
        if int(tsId) % 10 == 0:
            print(tsId, end=', ')
            sys.stdout.flush()
        if int(tsId) % 100 == 0:
            print()
            sys.stdout.flush()
        # ts = scale(self.testTss[tsId])
        ts = np.array(self.testTss[tsId])
        tsLen = self.testLens[tsId]
        tic = perf_counter()
        cumSums = gu.getCumSums(ts)
        cumSums_2 = gu.getCumSums(ts * ts)
        votes = np.zeros(self.numCls, dtype='uint32')
        for methodId, (bop, selectedWords, selectedWordInfo, sigma2Centroids) in self.allInfo.items():
            # pad shorter series so at least one full window fits:
            # repeating the last cumulative sum extends the series with its
            # final value held constant in sum-space
            if bop.discretizer.winLen > tsLen:
                curCumSums = np.concatenate(
                    (cumSums, cumSums[-1] * np.ones(bop.discretizer.winLen - tsLen)))
                curCumSums_2 = np.concatenate(
                    (cumSums_2, cumSums_2[-1] * np.ones(bop.discretizer.winLen - tsLen)))
            else:
                curCumSums = cumSums
                curCumSums_2 = cumSums_2
            transformedTs = bop.discretizer.transformTsFromCumSums(
                curCumSums, curCumSums_2)
            discretizedTs = bop.discretizer.discretizeTransformedTs(
                transformedTs)
            bagTs = bop.getBOP_DiscretizedTs(discretizedTs)
            # a method may participate in either or both similarity schemes
            for simId, methodIds in enumerate(self.allMethodIds):
                if methodId not in methodIds:
                    continue
                curSelectedWords = selectedWords[simId]
                dists = np.zeros(self.numCls)
                if simId == 1:
                    # cosine-similarity accumulators
                    sigma2Ts = 0
                    sigmaProd = np.zeros(self.numCls)
                for word in curSelectedWords:
                    infoByCls = selectedWordInfo[word][simId]
                    cnt = 0
                    if word in bagTs.keys():
                        cnt = bagTs[word]
                    if simId == 0:  #ed
                        dists += (cnt - infoByCls)**2
                    else:
                        # tf transform: 1 + log10(count), 0 when absent
                        tf = 0 if cnt == 0 else 1 + np.log10(cnt)
                        sigma2Ts += tf**2
                        sigmaProd += tf * infoByCls
                if simId == 1:
                    # convert cosine similarity to a distance; -1 marker
                    # avoids division by zero for empty vectors
                    divide = sigma2Ts * sigma2Centroids
                    divide[np.where(divide == 0)] = -1
                    dists = 1 - sigmaProd**2 / divide
                preLabel = np.argmin(dists)
                votes[preLabel] += 1
        self.preLabels[tsId] = np.argmax(votes)
        toc = perf_counter()
        self.testTimePerTs += toc - tic
    self.accuracy = accuracy_score(self.testLabels, self.preLabels)
    self.testTimePerTs /= self.numTest
def train(self):
    """Train the ensemble: sweep (wordSize, winLen) SAX configurations,
    select discriminative words per configuration via an F-statistic and
    cross-validation, then keep the top-K configurations per similarity
    scheme (Euclidean / cosine).

    Sets: self.minWinLen, self.maxWinLen, self.winLenStep,
    self.allMethodIds, self.allInfo, self.trainTime.
    """
    # pad all training series with zeros to a common length so the
    # cumulative sums can be computed as one 2-D array
    trainTss_padded = []
    maxTsLen = max(self.trainLens)
    for i in range(self.numTrain):
        ts = np.array(self.trainTss[i])
        tsLen = self.trainLens[i]
        # zTs = scale(np.array(ts))
        # zTs = np.concatenate((zTs, np.zeros(maxTsLen - tsLen)))
        # trainTss_padded.append(zTs)
        ts = np.concatenate((ts, np.zeros(maxTsLen - tsLen)))
        trainTss_padded.append(ts)
    trainTss_padded = np.array(trainTss_padded)
    # derive the window-length search grid from the shortest series
    self.minWinLen = np.maximum(int(
        np.around(self.minTrainLen * self.minWinRatio)), self.minWinLen, dtype='int32')
    self.maxWinLen = np.minimum(int(
        np.around(self.minTrainLen * self.maxWinRatio)), self.minTrainLen, dtype='int32')
    self.winLenStep = np.maximum(int(
        np.around(self.minTrainLen * self.winRatioStep)), 1, dtype='int32')
    # clamp the grid so no window exceeds the shortest training series
    if self.minTrainLen < self.minWinLen:
        self.minWinLen = self.minTrainLen
    if self.minTrainLen < self.maxWinLen:
        self.maxWinLen = self.minTrainLen
    # numBitsWinLen = bu.numBits(np.ceil((self.maxWinLen - self.minWinLen) / self.winLenStep) + 1)
    # numBitsWordSize = bu.numBits(np.ceil((self.maxWordSize - self.minWordSize) / self.wordSizeStep) + 1)
    tic = perf_counter()
    allCumSums = gu.getCumSums(trainTss_padded)
    allCumSums_2 = gu.getCumSums(trainTss_padded * trainTss_padded)
    # one cv-score list per similarity scheme (0 = Euclidean, 1 = cosine)
    all_cv1_scores = [[], []]
    # allMethodIds = [[], []]
    allInfo = []
    for wordSize in range(self.minWordSize, self.maxWordSize + 1, self.wordSizeStep):
        for winLen in range(self.minWinLen, self.maxWinLen + 1, self.winLenStep):
            discretizer = SAX.SAX(winLen, wordSize, self.card, True, True, self.binSizeTh)
            transformedTss = discretizer.transfromTssFromCumSums(
                allCumSums, allCumSums_2, self.trainLens)
            discretizedTss = discretizer.discretizeTransformedDataset_(
                transformedTss, self.trainLens, None, 'GD', 'Default')
            bop = BOP(discretizer, False)
            bagWord = bop.getWordFirstBop_DiscretizedTss(discretizedTss)
            # rank words by F-statistic; keep only words with a nonzero score
            words = []
            fs = []
            for word, cntTs in bagWord.items():
                feats = np.zeros(self.numTrain)
                for tsId, cnt in cntTs.items():
                    feats[tsId] = cnt
                f = FStat_2(feats, self.trainLabels, self.numCls)
                if f:
                    words.append(word)
                    fs.append(f)
            numWords = len(words)
            if numWords == 0:
                continue
            wordRanks = np.argsort(-np.array(fs))
            bestAcc_ed, bestAcc_cos, numSelected_ed, numSelected_cos, meanCntsByCls, tfIdfsByCls, sigmas2Centroids\
                = self.crossValidation(numWords, words, wordRanks, bagWord)
            bestAccs = [bestAcc_ed, bestAcc_cos]
            numsSelected = np.array([numSelected_ed, numSelected_cos])
            # visit the scheme with the fewer selected words first, so the
            # growing word set can be shared incrementally between schemes
            simIdRange = np.argsort(numsSelected)
            selectedWordInfo = {}
            selectedWords = [None, None]
            # methodId = self.createMethodId(winLenInd, numBitsWinLen, wordSizeInd, numBitsWordSize)
            prevNumSelected = 0
            curSelectedWords = set()
            for simId in simIdRange:
                cv1_score = bestAccs[simId]
                all_cv1_scores[simId].append(cv1_score)
                # allMethodIds[simId].append(methodId)
                numSelected = numsSelected[simId]
                # add only the words not already added for the smaller scheme
                for i in range(prevNumSelected, numSelected):
                    idx = wordRanks[i]
                    word = words[idx]
                    curSelectedWords.add(word)
                    selectedWordInfo[word] = (meanCntsByCls[idx][:], tfIdfsByCls[idx][:])
                # deepcopy: the set keeps growing for the next scheme
                selectedWords[simId] = deepcopy(curSelectedWords)
                prevNumSelected = numSelected
            allInfo.append(
                (bop, selectedWords, selectedWordInfo, sigmas2Centroids))
    # per scheme, keep the topK configurations by cv score
    self.allMethodIds = []
    allAvgAcc = np.empty(2)
    for i in range(2):
        cur_cv1_scores = np.array(all_cv1_scores[i])
        numMet = len(cur_cv1_scores)
        if numMet > self.topK:
            methodIds = np.argpartition(-cur_cv1_scores, self.topK)[:self.topK]
        else:
            methodIds = np.arange(numMet)
        self.allMethodIds.append(set(methodIds))
        allAvgAcc[i] = np.mean(cur_cv1_scores[methodIds])
    # drop a whole scheme when its average accuracy falls below
    # accRatio * the better scheme's average
    maxAcc = np.amax(allAvgAcc)
    for i in range(2):
        if allAvgAcc[i] <= self.accRatio * maxAcc:
            self.allMethodIds[i] = set()
    # NOTE(review): methodIds index into allInfo, which assumes both
    # schemes appended scores for the same configurations in the same
    # order (they do: one append per surviving configuration) — confirm.
    self.allInfo = {}
    for methodIds in self.allMethodIds:
        for methodId in methodIds:
            if methodId not in self.allInfo.keys():
                self.allInfo[methodId] = allInfo[methodId]
    toc = perf_counter()
    self.trainTime = toc - tic