def start(**kwargs):
    """AMANDA (Dynamic): semi-supervised stream classification with a
    dynamically estimated core-support-extraction (CSE) cutting percentage.

    Expected kwargs: dataValues, dataLabels, initialLabeledData, sizeOfBatch,
    classes, K_variation, batches, clfName, densityFunction, poolSize,
    isBatchMode.

    Returns:
        ("AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt,
         arrClf, arrPredicted) — the accuracy series, the last selected
        core supports, and per-step histories for decision-boundary plots.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: {} as classifier and {} and Hellinger distance as dynamic CSE"
          .format(clfName, densityFunction))

    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []

    # ***** Box 1 ***** initial labeled window
    initialDataLength = 0
    finalDataLength = initialLabeledData
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    reset = True

    if isBatchMode:
        for t in range(batches):
            # ***** Box 2 ***** next batch (yt kept only for evaluation/plots)
            initialDataLength = finalDataLength
            finalDataLength += sizeOfBatch
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** dynamic cutting percentage for this step
            excludingPercentage = cuttingPercentage(X, Ut, t)

            allInstances = []
            allLabels = []
            # ***** Box 5 ***** density estimation
            if reset == True:
                # time-series like: only the latest distribution
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              densityFunction)  # O(n^2 d)
            else:
                # concept-drift like: past + current data.
                # NOTE(review): this path pools true labels yt while Box 5's
                # reset path uses predictions — confirm intended (branch is
                # dead while reset stays True).
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  # O(n log(n) c)

            # ***** Box 6 ***** keep the core supports, retrain
            if reset == True:
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)  # O(n)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)
            clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    else:
        # Stream mode: classify one instance at a time and rebuild the model
        # every poolSize predictions.
        t = 0
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            # raw predictions are accumulated here; makeAccuracy() converts
            # them to per-batch accuracies after the loop
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            if len(inst) == poolSize:
                inst = np.array(inst)
                excludingPercentage = cuttingPercentage(X, inst, t)
                t += 1
                # (disabled) previous behavior: when excludingPercentage < 0,
                # fall back to 0.5 and set reset = True, else reset = False
                if reset == True:
                    # time-series like: only the latest pool
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    # concept-drift like: past + current pool
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)
                if reset == True:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)
                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
def start(**kwargs):
    """Sliding-window SSL baseline: the model is always retrained on the most
    recent batch, labeled with its own predictions (no core-support
    extraction).

    Expected kwargs: dataValues, dataLabels, initialLabeledData, sizeOfBatch,
    classes, K_variation, batches, excludingPercentage, clfName,
    densityFunction, poolSize, isBatchMode. (classes, excludingPercentage and
    densityFunction are read for interface parity with the sibling methods
    but are not used here.)

    Returns:
        ("Sliding SSL", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
         arrPredicted) — accuracy series, last window, and per-step histories
        for decision-boundary plots.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: Sliding {0} as classifier".format(clfName))

    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []

    # ***** Box 1 ***** initial labeled window
    initialDataLength = 0
    finalDataLength = initialLabeledData
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)

    if isBatchMode:
        for t in range(batches):
            # sliding: refit on the current window before classifying
            clf.fit(X, y)
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            arrAcc.append(metrics.evaluate(yt, predicted))

            # slide: the new window is the batch labeled by its predictions
            X, y = Ut, predicted
    else:
        # Stream mode: classify instance-by-instance, retrain on each full
        # pool of predictions.
        inst = []
        labels = []
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        for Ut, yt in zip(remainingX, remainingY):
            # FIX: take [0] so a scalar label is stored, matching the other
            # stream implementations in this file; previously the length-1
            # prediction array itself was appended to arrAcc/labels.
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)  # makeAccuracy() converts below
            inst.append(Ut)
            labels.append(predicted)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            if len(inst) == poolSize:
                inst = np.asarray(inst)
                clf = classifiers.classifier(inst, labels, K, clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    return "Sliding SSL", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
def fit(self, dataValues, dataLabels=None):
    """Run the AMANDA (fixed-percentage) loop over the data stream and store
    the per-batch accuracy series in ``self.threshold_``.

    Uses instance configuration: excludingPercentage, initialLabeledData,
    isBatchMode, batches, sizeOfBatch, K, clfName, densityFunction, poolSize,
    usePCA.

    Args:
        dataValues: feature matrix of the full stream.
        dataLabels: label vector (required despite the None default — it is
            used to derive ``classes`` and to evaluate each batch).

    Returns:
        self (scikit-learn style).
    """
    arrAcc = []
    classes = list(set(dataLabels))
    initialDataLength = 0
    # excludingPercentage is configured as the fraction to EXCLUDE; the
    # compacting helper expects the fraction to KEEP, hence the complement
    self.excludingPercentage = 1 - self.excludingPercentage
    finalDataLength = self.initialLabeledData
    reset = True

    # ***** Box 1 ***** initial labeled window
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, self.usePCA)

    if self.isBatchMode:
        for t in range(self.batches):
            # ***** Box 2 ***** next batch (yt used only for evaluation)
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + self.sizeOfBatch
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          self.usePCA)

            # ***** Box 3 ***** train and classify
            clf = classifiers.classifier(X, y, self.K, self.clfName)
            predicted = clf.predict(Ut)
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4/5 ***** class-conditional densities
            allInstances = []
            allLabels = []
            if reset == True:
                # time-series like: only the latest distribution
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              self.densityFunction)
            else:
                # concept-drift like: past + current data
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, predicted])
                pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                              classes, self.densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, self.excludingPercentage)

            # ***** Box 6 ***** keep the core supports
            if reset == True:
                X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)
    else:
        # Stream mode: classify instance-by-instance, recompute core supports
        # on every full pool.
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, self.K, self.clfName)
        remainingX, remainingY = util.loadLabeledData(
            dataValues, dataLabels, finalDataLength, len(dataValues),
            self.usePCA)
        for Ut, yt in zip(remainingX, remainingY):
            # FIX: take [0] so a scalar label is stored, matching the other
            # stream loops in this file; previously the length-1 prediction
            # array itself was appended to arrAcc/labels.
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)  # makeAccuracy() converts below
            inst.append(Ut)
            labels.append(predicted)

            if len(inst) == self.poolSize:
                inst = np.asarray(inst)
                if reset == True:
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  self.densityFunction)
                else:
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes,
                                                  self.densityFunction)
                # FIX: was the bare name `excludingPercentage`, which is not
                # defined in this method and raised NameError; the configured
                # attribute is self.excludingPercentage.
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, self.excludingPercentage)
                if reset == True:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)
                clf = classifiers.classifier(X, y, self.K, self.clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, self.batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)

    # returns accuracy array and last selected points
    self.threshold_ = arrAcc
    return self
def start(**kwargs):
    """AMANDA (Fixed): semi-supervised stream classification with a fixed
    core-support-extraction cutting percentage.

    Expected kwargs: dataValues, dataLabels, initialLabeledData, sizeOfBatch,
    classes, K_variation, batches, excludingPercentage, clfName,
    densityFunction, poolSize, isBatchMode.

    Returns:
        ("AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
         arrPredicted) — accuracy series, the last selected core supports,
        and per-step histories for decision-boundary plots.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: {} as classifier and {} as core support extraction with cutting data method".format(clfName, densityFunction))

    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []

    # configured as the fraction to exclude; helper expects fraction to keep
    excludingPercentage = 1 - excludingPercentage

    # ***** Box 1 ***** initial labeled window
    initialDataLength = 0
    finalDataLength = initialLabeledData
    reset = True
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)

    if isBatchMode:
        for t in range(batches):
            # next batch
            initialDataLength = finalDataLength
            finalDataLength += sizeOfBatch
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            # classify and evaluate
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** class-conditional densities on arrived points
            # NOTE(review): batch mode uses the TRUE labels yt here (and in
            # Box 6 below), unlike the dynamic variant which uses the
            # predictions — confirm this label usage is intended.
            allInstances = []
            allLabels = []
            if reset == True:
                # time-series like: only the latest distribution
                pdfsByClass = util.pdfByClass(Ut, yt, classes,
                                              densityFunction)  # O(nmd)
            else:
                # concept-drift like: past + current data
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                              classes, densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  # O(n log(n) c)

            # ***** Box 6 ***** keep the core supports, retrain
            if reset == True:
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)  # O(n)
            clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    else:
        # Stream mode: classify one instance at a time, recompute the core
        # supports on every full pool of predictions.
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            # raw predictions are accumulated; makeAccuracy() converts below
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # histories for decision-boundary plots
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            if len(inst) == poolSize:
                inst = np.asarray(inst)
                if reset == True:
                    # time-series like: only the latest pool
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    # concept-drift like: past + current pool
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)
                if reset == True:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)
                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted