def Basic(trainSet, testSet, file, fout, name, vecin, clfName, isCount=False):
    auc = 0
    l = GLOB(clfName).getClassifier()
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    actual = testSet[:, -1]
    if isCount:
        vals = DPLIB.getMeasuresCount(actual, vec)
        print(name + ":" + file + ": " + str(vals))
        fout.write("\n" + name + ":" + file + ": " + "Vals=" + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(actual, vec)
        vals = DPLIB.getMeasures(tvals)
        auc = DPLIB.getAUC(actual, vec)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
        fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals))
def NNFilterMulti(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets):
    startTime = Common.getCurrentTimeMil()
    trainSet = DPLIB.NNFilterMulti(trainSeti, testSet, count)
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        l = l.getTunedCLF(trainSet, vSets, fout, name, file)
        print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " + str(l.selectedParams))
        fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(l.selectedParams))
        fout.write("\n")
        sCheck = l.getCLFOptions()
        print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " + str(sCheck))
        fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(sCheck))
        fout.write("\n")
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    vecin = vec
    tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin)
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    auc = DPLIB.getAUC(testSet[:, -1], vec)
    vals = DPLIB.getMeasures(tvals)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    return vecin
def fit(self, trainSet, testSet, vSets, vSetType, clfName):
    if self.isCount:
        self.mad = GA.getBugSTDForMutation(trainSet)
    else:
        # Default MAD for the binary case so the mutation call below is defined.
        self.mad = 0.0
    if len(set(list(trainSet[:, -1]))) < 2:
        self.prnt('Error: Number of classes cannot be less than two.')
        print('Error: Number of classes cannot be less than two.')
        return
    trainSet, testSet = np.copy(trainSet), np.copy(testSet)
    tstSize = len(testSet)
    partSize = int(tstSize / self.numParts)
    isOK = True
    np.random.shuffle(testSet)
    self.FinalLearner = None
    self.FinalDataset = None
    diffs = []
    # NOTE: the original source referenced undefined names here (`testPart`,
    # `fixedTrainSize`, `mad`); the whole test set and the corresponding
    # attributes on `self` are used instead, which is an assumption about
    # the intended values.
    vSets = ValidationSetManager.getValidationSets(vSets, vSetType, trainSet, testSet)
    pop = GA.createInitialPopulation(trainSet, self.popSize, self.fixedTrainSize, self.chrmSize)
    pop = GA.assignFitness(pop, GLOB(clfName).getClassifier(), vSets, self.isCount)
    pop = DPLIB.SortPopulation(pop)
    for g in range(self.numGens):
        self.prnt(str(g) + " ")
        newPop = GA.generateNewPopulation(pop, self.sizeTopP, selectionType='TORNAMENT',
                                          isCount=self.isCount, mad=self.mad)
        newPop = GA.assignFitness(newPop, GLOB(clfName).getClassifier(), vSets, self.isCount)
        newPop = DPLIB.SortPopulation(newPop)
        newPop, rdel = DPLIB.CombinePops(pop, newPop)
        rdel = None
        diff, exit = GA.checkExit(pop, newPop, self.countComp)
        diffs.append(diff)
        pop.clear()
        pop = newPop
        if (pop[0].getFitness() > 0.0) and exit:
            break
    self.FinalDataset = pop[0].ds
def getKeySetKeys(self):
    if self.isCount:
        self.keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        self.keySet = list(DPLIB.MEASURES_BIN.keys())
    return self.keySet
def WMulti(files, file, testSet, fout, features, name, clfName, dp, convertToBinary=True):
    train = []
    for file2 in files:
        # Same project prefix, and only releases that sort before the test
        # release. (The original read `file2 < file < 0`, a leftover from a
        # Java compareTo(...) < 0 check; a plain string comparison is the
        # intended test.)
        if file2[0:3] == file[0:3] and file2 < file:
            train.append(file2)
    if len(train):
        trainSet = DPLIB.LoadCSV(train, dp, features, convertToBinary)
        if name.lower().find("infogain") >= 0:
            # Feature selection is currently disabled; the original (Java) code was:
            # int indi[] = DPLIB.fSelectInfoGain(trainSet);
            # if (DPLIB.useIterativeInfoGainSubsetting)
            #     indi = DPLIB.iterativeInfoGainSubsetting(trainSet, indi, clfName);
            # else
            #     indi = DPLIB.getTopX(indi);
            # trainSet = DPLIB.fSelectSet(trainSet, indi);
            # testSet = DPLIB.fSelectSet(testSet, indi);
            pass
        l = GLOB(clfName).getClassifier()
        l.buildClassifier(trainSet)
        vec = l.evaluateModel(testSet)
        tvals = DPLIB.getConfMatrix(testSet[:, -1], vec)
        auc = DPLIB.getAUC(testSet[:, -1], vec)
        vals = DPLIB.getMeasures(tvals)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
        fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals))
    else:
        print(name + ":" + file + ": " + "!!!" + " AUC = !!!")
        fout.write("\n" + name + ":" + file + ": !!!")
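# Example of the release-filtering rule above (file names are illustrative,
# not from the original code): with
#   files = ['ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv', 'ivy-2.0.csv']
# and file = 'ant-1.7.csv', the shared 3-character prefix 'ant' plus the
# lexicographic test file2 < file selects ['ant-1.5.csv', 'ant-1.6.csv'],
# i.e. training uses only earlier releases of the same project.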
def score(self, predictions, testLabels):
    if self.isCount:
        pass
    else:
        if testLabels is not None:
            confs, measures = DPLIB.getConfAndExtMeasures(testLabels, predictions)
            return confs, measures
    return None
def toDict(lds):
    measures = None
    expNames = ('GIS', 'LSH', 'FIXED', 'VAR', 'VR', 'FX',)
    out = {}
    for line in DPLIB.doReplaces('\n'.join(['\n'.join(l) for l in lds])).split('\n'):
        line = line.strip()
        if line.startswith(expNames):
            parts = line.replace("': ", "'=>").split(':')
            perf = parts[2].replace('{', '').replace('}', '').strip()
            method = parts[0]
            ds = parts[1]
            learner = method.split('-')[-1]
            apprName = method[:method.rfind('-')]
            featureSpace = apprName.split('-')[-1]
            vals = []
            measures = []
            for p in perf.split(','):
                p = p.strip().split('=>')
                p[0] = p[0].strip().replace("'", "")
                p[1] = p[1].strip()
                if p[0] not in measures:
                    measures.append(p[0])
                vals.append(float(p[1]))
            if method not in out.keys():
                out[method] = {}
            if ds not in out[method].keys():
                out[method][ds] = {}
                out[method][ds]['measures'] = {}
                out[method][ds]['confs'] = []
            for mindex, m in enumerate(measures):
                if m not in out[method][ds]['measures'].keys():
                    out[method][ds]['measures'][m] = []
                out[method][ds]['measures'][m].append(vals[mindex])
        elif line.startswith("#CONF-TEST:"):
            parts = line.replace("': ", "'=>").split(':')[1:]
            perf = parts[2].replace('{', '').replace('}', '').strip()
            method = parts[0]
            ds = parts[1]
            learner = method.split('-')[-1]
            apprName = method[:method.rfind('-')]
            featureSpace = apprName.split('-')[-1]
            if method not in out.keys():
                out[method] = {}
            if ds not in out[method].keys():
                out[method][ds] = {}
                out[method][ds]['measures'] = {}
                out[method][ds]['confs'] = []
            out[method][ds]['confs'].append(perf)
    return out
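# A minimal sketch of the log-line format toDict expects, inferred from the
# parsing above (the method suffix and dataset name are illustrative; 'F',
# 'GMean1', and 'auc' are measure names used elsewhere in this code):
#
#   GIS-IG-NB:ant-1.7: {'F': 0.52, 'GMean1': 0.61, 'auc': 0.70}
#   #CONF-TEST:GIS-IG-NB:ant-1.7: {'tp': 30, 'tn': 80, 'fp': 12, 'fn': 9}
#
# After parsing, values are grouped per method and dataset, e.g.:
#
#   out['GIS-IG-NB']['ant-1.7']['measures']['F']  ->  [0.52, ...]
#   out['GIS-IG-NB']['ant-1.7']['confs']          ->  ["'tp'=>30, ...", ...]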
def createInitialPopulation(pool, popSize, fixedTrainSize, maxChromosomeSize):
    pop = []
    for i in range(popSize):
        uinds = set()
        size = GA.getChromosomeSize(fixedTrainSize, maxChromosomeSize)
        while True:
            # Select a subset that contains both classes
            trSet = DPLIB.getRandomSubSet(size, pool)
            if len(set(list(trSet[:, -1]))) >= 2:
                break
        pop.append(trSet)
    return pop
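# A minimal usage sketch mirroring how fit() seeds and ranks a population
# (illustrative only; 'NB' is a placeholder learner name, and `trainSet` /
# `vSets` are assumed to be NumPy arrays whose last column holds the label):
#
#   pop = GA.createInitialPopulation(trainSet, popSize=30,
#                                    fixedTrainSize=True, maxChromosomeSize=100)
#   pop = GA.assignFitness(pop, GLOB('NB').getClassifier(), vSets, isCount=False)
#   pop = DPLIB.SortPopulation(pop)   # pop[0] is now the fittest chromosome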
def LOC50(testSeti, file, fout, name, locIndex):
    startTime = Common.getCurrentTimeMil()
    # The "instance selection" cost here is just the median split itself.
    # (The original logged the total elapsed time under #TIME-FOR-IS; the
    # separately measured selection time is reported instead.)
    tempTime = Common.getCurrentTimeMil()
    allloc = testSeti[:, locIndex]
    med = np.median(allloc)
    predicted = [1 if t >= med else 0 for t in allloc]
    spentISTime = Common.getCurrentTimeMil() - tempTime
    actual = testSeti[:, -1]
    tvals = DPLIB.getConfMatrix(actual, predicted)
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    vals = DPLIB.getMeasures(tvals)
    auc = DPLIB.getAUC(actual, predicted)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime))
    fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime) + "\n")
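# Quick illustrative check of the LOC50 baseline logic (toy numbers, not from
# the original code): modules at or above the median LOC are predicted defective.
#
#   >>> import numpy as np
#   >>> loc = np.array([10, 25, 40, 80])
#   >>> med = np.median(loc)          # 32.5
#   >>> [1 if t >= med else 0 for t in loc]
#   [0, 0, 1, 1]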
def assignFitness(pop, clf, vSets, isCount):
    for i in range(len(pop)):
        clf.buildClassifier(pop[i])
        # The original referenced an undefined `all_measures`; the conf-matrix/
        # measure pairs are split into two lists here (mirroring how score()
        # unpacks DPLIB.getConfAndExtMeasures), which is an assumption about
        # what CHRM_GIS expects.
        all_measures = []
        all_confs_measures = []
        all_predictions = clf.evaluateMultiModel(vSets)
        if isCount:
            pass
        else:
            for index, predictions in enumerate(all_predictions):
                confs, measures = DPLIB.getConfMatrixAndExtMeasures(
                    vSets[index][:, -1], predictions)
                all_confs_measures.append(confs)
                all_measures.append(measures)
        h = CHRM_GIS(pop[i], all_measures, all_confs_measures)
        pop[i] = h
    return pop
def Mutate(ds, mProb=0.1, mCount=1, isCount=False, mad=0.0):
    """
    Performs mutation with the specified parameters.

    Note that datasets may contain repeated instances of the same data row,
    so the operation must preserve consistency: after mutation, all instances
    with exactly the same data must carry the same label.
    """
    r2 = np.random.rand()
    if r2 <= mProb:
        rands = set()
        i = 0
        while i < mCount:
            r1 = np.random.randint(0, len(ds))
            if len(rands) == len(ds):
                return ds
            if r1 in rands:
                continue
            instLabel = ds[r1, -1]
            if isCount:
                # Non-binary class values: shift by a normally distributed
                # random value scaled by the MAD, clamped at zero.
                shift = int(np.random.randn() * mad)
                classVal = instLabel + shift
                if classVal < 0:
                    classVal = 0
            else:
                # Binary class values: flip the label.
                classVal = 1 - instLabel
            # Apply the same new label to every identical instance.
            st = DPLIB.FindAllSimilarInstancesIndexes(r1, ds)
            for r1 in st:
                rands.add(r1)
                ds[r1, -1] = classVal
            i += 1
    return ds
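# A minimal sketch of GA.Mutate on a toy dataset (the 4-row array is
# illustrative, not from the original code). With mProb=1.0 the mutation
# always fires; duplicated rows keep a consistent label afterwards:
#
#   import numpy as np
#   ds = np.array([[1.0, 2.0, 0],
#                  [1.0, 2.0, 0],   # duplicate of row 0
#                  [3.0, 4.0, 1],
#                  [5.0, 6.0, 0]], dtype=float)
#   mutated = GA.Mutate(ds, mProb=1.0, mCount=1)
#   assert mutated[0, -1] == mutated[1, -1]  # duplicates stay consistent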
def crossOver(ds1, ds2, fixedSize, isCount=False):
    """
    Crossover operator supporting both one-point and two-point crossover.
    It can either keep the datasets at the same size or recombine the data
    so as to generate chromosomes of varying size.

    Note that datasets may contain repeated instances of the same data row,
    so the operation must preserve consistency. This is especially important
    because data can come from multiple sources, and mutation in previous
    generations may have altered particular instances. After crossover, all
    instances with exactly the same data must share one label; this is
    enforced here by majority voting.
    """
    ss = len(ds1)
    if fixedSize:
        point1 = np.random.randint(ss)
        point2 = point1
    else:
        point1 = np.random.randint(ss)
        point2 = np.random.randint(len(ds2))
    if len(ds1) >= 4000:
        point1 = int(len(ds1) / 2)
    if len(ds2) >= 4000:
        point2 = int(len(ds2) / 2)
    np.random.shuffle(ds1)
    np.random.shuffle(ds2)
    ds1c = np.copy(ds1[:point1, :])
    ds2c = np.copy(ds2[:point2, :])
    ds1c = np.append(ds1c, ds2[point2:, :], axis=0)
    ds2c = np.append(ds2c, ds1[point1:, :], axis=0)
    pSet = set()
    for i in range(len(ds1c)):
        if i in pSet:
            continue
        t = list(DPLIB.FindAllSimilarInstancesIndexes(i, ds1c))
        lbl = 0
        for j in range(len(t)):
            index = t[j]
            lbl += ds1c[index, -1]
            pSet.add(index)
        lbl = lbl / len(t)
        if not isCount:
            # Majority vote over the duplicates
            lbl = 1 if lbl >= 0.5 else 0
        else:
            if lbl < 0:
                lbl = 0
        for j in range(len(t)):
            index = t[j]
            # Process extra (kept from the original Java version):
            # if ((int)ds1c.instance(index).classValue() != (int)lbl)
            #     ds1c.instance(index).SetExtra(ds1c.instance(index).GetExtra()
            #         + "-C=" + String.valueOf((int)(1-lbl)) + ">" + String.valueOf((int)lbl));
            ds1c[index, -1] = lbl
    pSet.clear()
    for i in range(len(ds2c)):
        if i in pSet:
            continue
        t = list(DPLIB.FindAllSimilarInstancesIndexes(i, ds2c))
        lbl = 0
        for j in range(len(t)):
            index = t[j]
            lbl += ds2c[index, -1]
            pSet.add(index)
        lbl = lbl / len(t)
        if not isCount:
            lbl = 1 if lbl >= 0.5 else 0
        else:
            if lbl < 0:
                lbl = 0
        for j in range(len(t)):
            index = t[j]
            # Process extra (kept from the original Java version):
            # if ((int)ds2c.instance(index).classValue() != (int)lbl)
            #     ds2c.instance(index).SetExtra(ds2c.instance(index).GetExtra()
            #         + "-C=" + String.valueOf((int)(1-lbl)) + ">" + String.valueOf((int)lbl));
            ds2c[index, -1] = lbl
    return ds1c, ds2c
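# A minimal sketch of GA.crossOver (toy arrays, illustrative only): with
# fixedSize=True and equal-size parents both children keep their parents'
# sizes, and label consistency across duplicated rows is restored by the
# majority vote inside the operator:
#
#   import numpy as np
#   a = np.array([[1., 0., 0], [2., 1., 1], [3., 1., 0], [4., 0., 1]])
#   b = np.array([[5., 2., 1], [6., 3., 0], [7., 3., 1], [8., 2., 0]])
#   c1, c2 = GA.crossOver(a, b, fixedSize=True)
#   assert len(c1) == len(a) and len(c2) == len(b)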
def CreateBuckets(self, trainSet, testSet, vSets, name, testCut, iternum, save,
                  superbit, stages, buckets, doprint, clfName, tunelrn):
    out = []
    if self.isCount:
        keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        keySet = list(DPLIB.getExtMeasures({
            "tp": 1, "tn": 2, "fp": 3, "fn": 4
        }).keys())
    out.append("#STARTED FOR-" + name + ":" + self.file + ": ")
    startTime = Common.getCurrentTimeMil()
    spentIsTime = 0
    tempTime = 0
    out.append("#Using also Label For train in LSH")
    if vSets is None:
        vSets = []
        vSets.append(trainSet)
    if save:
        DPLIB.SaveToCsv(trainSet, "MAIN-TRAIN-FILE-" + "ITER=" + str(iternum) + "--" +
                        "METHOD=" + name + "--FILE=" + self.file + "--")
        DPLIB.SaveToCsv(testSet, "MAIN-TEST-FILE-" + "ITER=" + str(iternum) + "--" +
                        "METHOD=" + name + "--FILE=" + self.file + "--")
        for i in range(len(vSets)):
            DPLIB.SaveToCsv(trainSet, "VSET-FILE-" + "INDEX=" + str(i) + "ITER=" + str(iternum) +
                            "--" + "METHOD=" + name + "--FILE=" + self.file + "--")
    np.random.shuffle(trainSet)
    np.random.shuffle(testSet)
    tempTime = Common.getCurrentTimeMil()
    count = len(trainSet)
    bins = {}
    # R^n
    n = trainSet.shape[1] - 1
    binid = 0
    # lshmin = LSHMinHash(stages, buckets, n)
    try:
        lshsuper = LSHSuperBit(stages=stages, buckets=buckets, dimensions=n)
    except Exception as ex:
        print('##SuperBit with specified parameters failed:' + str(ex))
        return None
    sp = 0.75
    # Compute a SuperBit signature and an LSH hash
    for i in range(count):
        vector = trainSet[i, 1:].tolist()
        hash = None
        if superbit:
            hash = lshsuper.hash(vector)
        else:
            # MinHash support:
            # hash = lshmin.hash(vecBool)
            pass
        binid = hash[0]
        if binid not in bins.keys():
            bins[binid] = []
        bins[binid].append(trainSet[i])
    spentIsTime += Common.getCurrentTimeMil() - tempTime
    numBins = len(bins.keys())
    for binid in bins.keys():
        bins[binid] = np.array(bins[binid])
    out.append("#Number of BINS:" + name + ":" + self.file + ": " + str(numBins))
    pop = []
    for i in bins.keys():
        trSet = bins[i]
        l = GLOB(clfName, tunelrn).getClassifier()
        # if tunelrn:
        #     l = l.getTunedCLF(trSet, vSets, fout, name, file)
        l.buildClassifier(trSet)
        cf = 0
        allvecs = []
        confs = []
        allcfs = []
        allaucs = []
        valsA = None
        confsA = None
        aucA = 0.0
        for vSet in vSets:
            vec = l.evaluateModel(vSet)
            actuals = vSet[:, -1]
            vals = None
            auc = 0
            if self.isCount:
                vals = DPLIB.getMeasuresCount(actuals, vec)
            else:
                auc = DPLIB.getAUC(actuals, vec)
                aucA += auc
                allaucs.append(auc)
                if testCut:
                    # Sweep the cutoff from 0.1 to 0.9 and keep the best F * GMean1.
                    vCF = 0.1
                    bestCF = 0
                    bestCFVal = -1
                    bestVals = None
                    while True:
                        tvals = DPLIB.getConfMatrix(actuals, vec, vCF)
                        measures = DPLIB.getMeasures(tvals)
                        fit = measures["F"] * measures["GMean1"]
                        if fit > bestCFVal or bestVals is None:
                            bestCFVal = fit
                            bestCF = vCF
                            bestVals = tvals
                        vCF += 0.1
                        if vCF >= 1:
                            break
                    if confsA is None:
                        confsA = {key: 0 for key in bestVals.keys()}
                    for j in confsA.keys():
                        confsA[j] += bestVals[j]
                    confs.append(bestVals)
                    vals = DPLIB.getMeasures(bestVals)
                    cf += bestCF
                    allcfs.append(bestCF)
                else:
                    tvals = DPLIB.getConfMatrix(actuals, vec)
                    if confsA is None:
                        confsA = {key: 0 for key in tvals.keys()}
                    for j in confsA.keys():
                        confsA[j] += tvals[j]
                    confs.append(tvals)
                    vals = DPLIB.getMeasures(tvals)
                    allcfs.append(DPLIB.DefaultCF)
            allvecs.append(vals)
            if valsA is None:
                valsA = {key: 0 for key in keySet}
            for j in keySet:
                valsA[j] += vals[j]
        for j in keySet:
            valsA[j] /= len(vSets)
        h = None
        if not self.isCount:
            for j in confsA.keys():
                confsA[j] /= len(vSets)
            if testCut:
                cf /= len(vSets)
            aucA /= len(vSets)
            h = CHRM_GIS(trSet, valsA, aucA)
            h.fitnesses = allvecs
            h.aucs = allaucs
            h.conf = confsA
            h.confs = confs
            h.allcfs = allcfs
            if testCut:
                h.bestCF = cf
            else:
                h.bestCF = DPLIB.DefaultCF
        else:
            h = CHRM_GIS_Count(trSet, valsA)
            h.fitnesses = allvecs
        pop.append(h)
        l = None
    tempTime = Common.getCurrentTimeMil()
    pop = DPLIB.MySort(pop)
    spentIsTime += Common.getCurrentTimeMil() - tempTime
    top = pop[0]
    out.append("#Instances in Top:" + str(len(top.ds)))
    out.append("#STAGES:" + name + ":" + self.file + ": " + str(stages))
    out.append("#BUCKETS:" + name + ":" + self.file + ": " + str(buckets))
    if not self.isCount:
        out.append("#BEST-CF-VALUE:" + name + ":" + self.file + ": " + str(top.bestCF))
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        # NOTE: the original passed undefined `fout`/`file` here; self.file is
        # used and no log handle is forwarded, which is an assumption.
        l = l.getTunedCLF(top.ds, vSets, None, name, self.file)
        out.append("#TUNE-LRN-PARAMS-" + name + ":" + self.file + ": " + str(l.selectedParams))
        sCheck = l.getCLFOptions()
        out.append("#SETSET-LRN-PARAMS-" + name + ":" + self.file + ": " + str(sCheck))
    l.buildClassifier(top.ds)
    vec = l.evaluateModel(testSet)
    out.append("#LSH-FOR-TOP-ONLY")
    if self.isCount:
        vals = DPLIB.getMeasuresCount(testSet[:, -1], vec)
        out.append(name + ":" + self.file + ": " + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(testSet[:, -1], vec, top.bestCF)
        out.append("#CONF-TEST-" + name + ":" + self.file + ": " + str(tvals))
        vals = DPLIB.getMeasures(tvals)
        auc = DPLIB.getAUC(testSet[:, -1], vec)
        vals['auc'] = auc
        out.append(name + ":" + self.file + ": " + str(vals))
    for i in range(len(pop)):
        pop[i] = None
    pop = None
    for i in bins.keys():
        bins[i] = None
    bins = None
    time = Common.getCurrentTimeMil() - startTime
    if name.find("LSHTune") < 0:
        out.append("#TIME-FOR:" + name + ":" + self.file + ": " + str(time))
        out.append("#TIME-FOR-IS:" + name + ":" + self.file + ": " + str(spentIsTime))
    self.output += out
    top.addToExtra("SPENT-TIME-IS", float(spentIsTime))
    return top, out
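# A minimal sketch of the bucketing step above (illustrative; it only uses the
# LSHSuperBit constructor and hash() call that already appear in CreateBuckets,
# and assumes hash() returns a sequence of bucket ids, one per stage):
#
#   lsh = LSHSuperBit(stages=4, buckets=10, dimensions=trainSet.shape[1] - 1)
#   bins = {}
#   for row in trainSet:
#       binid = lsh.hash(row[1:].tolist())[0]   # first-stage bucket id
#       bins.setdefault(binid, []).append(row)
#   # each bin then becomes a candidate training set, ranked via the validation sets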
def run(self, trainSeti, testSeti, name, fout, vSets, vSetType, fixedTrainSize,
        log, ignoreOK, threshold, thresholds, rejectedFits, rejectedPerfs,
        rejectedTestPerfs, clfName):
    mad = 0.0
    if self.isCount:
        keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
        mad = DPLIB.SetBugCountForMut(trainSeti)
    else:
        keySet = list(DPLIB.getExtMeasures({
            "tp": 1, "tn": 2, "fp": 3, "fn": 4
        }).keys())
    startTime = Common.getCurrentTimeMil()
    tempTime = 0
    spentISTime = 0
    # For binary prediction (isCount == False)
    auc = 0.0
    preds = []
    pop = []
    trainSet = np.copy(trainSeti)
    testSet = np.copy(testSeti)
    tstSize = len(testSet)
    partSize = int(tstSize / self.numParts)
    diffs = []
    # For count prediction (isCount == True)
    actuals = []
    prrs = []
    if log:
        self.prnt("#GIS-OPTIONS;;For=" + name + "@" + ":iters=" + str(self.iters)
                  + "-POPSIZE=" + str(self.popSize) + "-NumParts=" + str(self.numParts)
                  + "-NumGens=" + str(self.numGens) + "-sizeTop=" + str(self.sizeTopP)
                  + "-Learner=" + clfName + "\n")
    isOK = True
    np.random.shuffle(testSet)
    self.FinalLearners = []
    self.FinalDatasets = []
    for p in range(self.numParts):
        diffp = []
        self.prnt("\n" + str(p) + ": ")
        tempTime = Common.getCurrentTimeMil()
        pop.clear()
        start = p * partSize
        end = (p + 1) * partSize
        if end > tstSize:
            end = tstSize
        if p == self.numParts - 1:
            end = tstSize
        testPart = testSet[start:end, :]
        spentISTime += Common.getCurrentTimeMil() - tempTime
        uinds = set()
        if vSets is None or len(vSets) == 0:
            if vSets is None:
                vSets = []
            vSet = None
            retVal = ""
            if vSetType == 'Train Set':
                vSet = trainSeti
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'NN-Filter':
                tempTime = Common.getCurrentTimeMil()
                vSet = DPLIB.NNFilter(trainSet, testPart, 1)
                spentISTime += Common.getCurrentTimeMil() - tempTime
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'Multiple Random' or vSetType == 'Single Random':
                # If random but not supplied, generate one randomly with the size of testPart.
                # (The original appended the sampled rows to `vSets`, leaving `vSet` empty;
                # appending to `vSet` is the intended behaviour.)
                size = len(testPart)
                vSet = []
                j = 0
                while j < size:
                    index = np.random.randint(len(trainSet))
                    if index not in uinds:
                        uinds.add(index)
                    else:
                        continue
                    vSet.append(trainSet[index])
                    j += 1
                vSet = np.array(vSet)
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == '!!TEST!!':
                # Upper bound. Should not be used.
                self.prnt("Should not be used.")
                vSet = testSeti
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'KS2':
                vSet = None
            vSets.append(vSet)
        else:
            retVal = ""
            for vSet in vSets:
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
        for i in range(self.popSize):
            tempTime = Common.getCurrentTimeMil()
            uinds.clear()
            if fixedTrainSize:
                size = self.chrmSize
            else:
                size = np.random.randint(self.chrmSize) + 10
            while True:
                trSet = []
                j = 0
                while j < size:
                    index = np.random.randint(len(trainSet))
                    trSet.append(trainSet[index])
                    if index not in uinds:
                        uinds.add(index)
                    j += 1
                spentISTime += Common.getCurrentTimeMil() - tempTime
                trSet = np.array(trSet)
                if len(set(list(trSet[:, -1]))) >= 2:
                    break
            tempTime = Common.getCurrentTimeMil()
            pv, p_vals = DPLIB.checkSimilarity(trSet[:, :-1], testPart[:, :-1])
            if self.isCount:
                h = CHRM_GIS_Count(trSet, None, extraAsFitness='p-val')
            else:
                h = CHRM_GIS(trSet, None, None, extraAsFitness='p-val')
            h.addToExtra('p-val', sum(p_vals))
            pop.append(h)
            spentISTime += Common.getCurrentTimeMil() - tempTime
        tempTime = Common.getCurrentTimeMil()
        pop = DPLIB.MySort(pop)
        spentISTime += Common.getCurrentTimeMil() - tempTime
        cnt = 0
        g = 0
        for g in range(self.numGens):
            self.prnt(str(g) + " ")
            if log:
                pass
                # retVal = ""
                # for i in range(len(pop)):
                #     chrm = pop[i]
                #     retVal = DPLIB.getStats(chrm.ds, False, False, False)
                #     self.prnt("#POPITNFO;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                #     self.prnt("#POPITVALS;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":"
                #               + "rpaf=" + str(chrm.fitness).replace(", ", ",")
                #               + ";;conf=" + str(chrm.conf).replace(", ", ",") + ";;fit=" + str(chrm.getFitness())
                #               + ";;TConf2=" + str(chrm.testConf).replace(", ", ",")
                #               + ";;TRpaf2=" + str(chrm.testFitness).replace(", ", ",") + "\n")
                #     retVal = None
            tempTime = Common.getCurrentTimeMil()
            newPop = []
            for i in range(self.sizeTopP):
                newPop.append(pop[i])
            for i in range(0, len(pop) - self.sizeTopP, 2):
                idx1 = 0
                idx2 = 0
                while idx1 == idx2:
                    if cnt >= 3:
                        idx1 = np.random.randint(len(pop))
                        idx2 = np.random.randint(len(pop))
                    else:
                        idx1 = GA.tornament(pop)
                        idx2 = GA.tornament(pop)
                    cnt += 1
                cnt = 0
                ds1 = pop[idx1].ds
                ds2 = pop[idx2].ds
                while True:
                    ds1, ds2 = GA.crossOver(ds1, ds2, fixedTrainSize, isCount=self.isCount)
                    if len(set(list(ds1[:, -1]))) >= 2 and len(set(list(ds2[:, -1]))) >= 2:
                        break
                    self.prnt('repeat cross')
                while True:
                    ds1 = GA.Mutate(ds1, isCount=self.isCount, mad=mad)
                    if len(set(list(ds1[:, -1]))) >= 2:
                        break
                    self.prnt('repeat mut ds1, because all elements are of one class')
                while True:
                    ds2 = GA.Mutate(ds2, isCount=self.isCount, mad=mad)
                    if len(set(list(ds2[:, -1]))) >= 2:
                        break
                    self.prnt('repeat mut ds2, because all elements are of one class')
                if self.isCount:
                    newPop.append(CHRM_GIS_Count(ds1, None, extraAsFitness='p-val'))
                    newPop.append(CHRM_GIS_Count(ds2, None, extraAsFitness='p-val'))
                else:
                    newPop.append(CHRM_GIS(ds1, None, None, extraAsFitness='p-val'))
                    newPop.append(CHRM_GIS(ds2, None, None, extraAsFitness='p-val'))
            spentISTime += Common.getCurrentTimeMil() - tempTime
            for i in range(len(newPop)):
                tempTime = Common.getCurrentTimeMil()
                pv, p_vals = DPLIB.checkSimilarity(newPop[i].ds[:, :-1], testPart[:, :-1])
                newPop[i].addToExtra('p-val', sum(p_vals))
                spentISTime += Common.getCurrentTimeMil() - tempTime
            tempTime = Common.getCurrentTimeMil()
            newPop = DPLIB.MySort(newPop)
            exit = False
            countComp = 0
            newPop, rdel = DPLIB.CombinePops(pop, newPop)
            if log:
                pass
                # retVal = ""
                # for i in range(len(rdel)):
                #     chrm = rdel[i]
                #     retVal = DPLIB.getStats(chrm.ds, False, False, False)
                #     self.prnt("#POPDELITNFO;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal
                #               + ";;rpaf=" + str(chrm.fitness).replace(", ", ",")
                #               + ";;conf=" + str(chrm.conf).replace(", ", ",") + ";;fit=" + str(chrm.getFitness())
                #               + ";;TConf2=" + str(chrm.testConf).replace(", ", ",")
                #               + ";;TRpaf2=" + str(chrm.testFitness).replace(", ", ",") + "\n")
                #     retVal = None
            rdel = None
            diff = abs(GA.GetMeanFittness(pop, countComp) - GA.GetMeanFittness(newPop, countComp))
            if diff < 0.000001:
                exit = True
            diffp.append(diff)
            pop = newPop
            if (pop[0].getFitness() > 0.0) and exit:
                break
            exit = False
            spentISTime += Common.getCurrentTimeMil() - tempTime
        w = []
        if self.count == 0:
            self.count = len(pop)
        for i in range(self.count):
            l = GLOB(clfName).getClassifier()
            tds = pop[i].ds
            self.FinalLearners.append(l)
            self.FinalDatasets.append(tds)
            testPartI = testPart
            l.buildClassifier(tds)
            if self.isCount:
                actual = DPLIB.getActuals(testPartI)
                prr = l.evaluateModel(testPartI)
                # vals = DPLIB.getMeasuresCount(actual, prr)
                if len(actuals) == self.count:
                    actuals[i] = actuals[i] + actual
                    prrs[i] = prrs[i] + prr
                else:
                    actuals.append(actual)
                    prrs.append(prr)
            else:
                vec = l.evaluateModel(testPartI)
                if len(preds) == self.count:
                    preds[i] += list(vec)
                else:
                    preds.append(list(vec))
            if log:
                pass
                # retVal = DPLIB.getStats(tds, True, True, True)
                # self.prnt("#TRPRTNFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                # retVal = DPLIB.getStats(testPart, True, True, True)
                # self.prnt("#TSTPRTNFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                # vals = DPLIB.getConfMatrix(testPart[:, -1], vec)
                # self.prnt("#TSTPRTVALS;;prt=" + str(p) + ";;For=" + name + "@" + ":"
                #           + "rpaf=" + str(DPLIB.getMeasures(vals)).replace(", ", ",")
                #           + ";;conf=" + str(vals).replace(", ", ",") + "\n")
                # retVal = None
            w.append(pop[i].getFitness())
        isOK = True
    if not isOK:
        pass
    else:
        thresholds.append(pop[0].getFitness())
    self.prnt()
    self.prnt("Best Top Fitness:" + str(pop[0].fitness))
    self.prnt("Best Fitness (mean):", pop[0].getMeanFitness())
    if self.isCount:
        vals = DPLIB.getMeasuresCountSet(actuals, prrs)
    else:
        vals1 = DPLIB.getConfMatrixSet(testSet[:, -1], preds)
        vals = DPLIB.getMeasures(vals1)
    if isOK:
        if not self.isCount:
            if len(preds) == 1:
                auc = DPLIB.getAUC(testSet[:, -1], preds[0])
            else:
                auc = DPLIB.getAUCSet(testSet[:, -1], preds)
            vals['auc'] = auc
            self.prnt()
            self.prnt("#CONF-TEST:" + name + ":" + self.file + ": " + str(vals1))
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
        else:
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
    else:
        bestI = pop[0]
        rejectedFits.append(bestI.getFitness())
        rejVals = copy.deepcopy(bestI.fitness)
        rejectedPerfs.append(rejVals)
        testRejVals = copy.deepcopy(vals)
        rejectedTestPerfs.append(testRejVals)
        self.prnt("#NOTOKPREDS----" + name + ":" + self.file + ": " + str(vals))
        if not self.isCount:
            self.prnt()
            self.prnt("#NOTOKPREDS----" + "#CONF-TEST:" + name + ":" + self.file + ": " + str(vals1))
    time = Common.getCurrentTimeMil() - startTime
    self.prnt("#TIME-FOR:" + name + ":" + self.file + ": " + str(time))
    self.prnt("#TIME-FOR-IS:" + name + ":" + self.file + ": " + str(spentISTime))
    return isOK
def run(self):
    lrnrnames = self.lrnrs
    gis = None
    lsh = None
    # NOTE: the original wrote to an undefined `fout`; all fout writes are
    # commented out, so no log handle is opened here.
    fout = None
    try:
        rnd = random.Random(Common.getCurrentTimeMil())
        if self.expType == 'GIS':
            if self.isKS:
                gis = GISKS2(self.pars, self.file)
            else:
                gis = GIS(self.pars)
            self.gis = gis
        elif self.expType == 'LSH':
            lsh = CPDP_LSH_Binary(self.pars, self.file)
        trainSetAll = DPLIB.LoadCSV(self.train, self.dp, self.features,
                                    convertToBinary=not self.isCount)
        testSetAll = DPLIB.LoadCSV(self.test, self.dp, self.features,
                                   convertToBinary=not self.isCount)
        ft = 'A'
        indi = None
        if not self.isCount:
            if self.pars['features'] == 'Iterative InfoGain Subsetting':
                ft = 'IG'
                indi = DPLIB.fSelectInfoGain(trainSetAll)
            if self.pars['features'] == 'All':
                print('All')
            if self.pars['features'] == 'PCA':
                ft = 'PCA'
                print('PCA')
                trainSetAll, testSetAll = DPLIB.applyPCA(trainSetAll, testSetAll, 0.95)
        for lk in range(len(lrnrnames)):
            lrnr = "-" + lrnrnames[lk]
            clfName = lrnrnames[lk]
            vSets = None
            if not self.isCount:
                if self.pars['features'] == 'Iterative InfoGain Subsetting':
                    indis2 = DPLIB.iterativeInfoGainSubsetting(trainSetAll, indi, clfName)
                    trainSetAll = DPLIB.fSelectSet(trainSetAll, indis2)
                    testSetAll = DPLIB.fSelectSet(testSetAll, indis2)
            if self.pars['vSetType'] in ['Single Random', 'Multiple Random']:
                vSets = DatasetUtils.getRandomDatasets()
            c = 0
            while c < self.iters:
                print("Start:" + self.file + ": " + str(c))
                print("====================================================")
                # fout.write("#ITERINFO:For File=" + self.file + "-Iter:" + str(c) + "\n")
                stages = None
                buckets = None
                sbtx = ""
                if self.expType == 'GIS':
                    self.doGIS(trainSetAll, testSetAll, "FIXED-VMUL-GEN-" + ft, lrnr,
                               fout, vSets, False, clfName, gis=gis)
                    gis.prnt('---------------------------------------\n')
                elif self.expType == 'LSH':
                    lsh.CreateBucketsTune(trainSetAll, testSetAll, vSets,
                                          name="LSHTune-ALL-TOP-SUPER" + sbtx + lrnr,
                                          testCut=self.pars['tunecut'], iternum=c, save=False,
                                          superbit=self.pars['lshType'] == 'SuperBit',
                                          clfName=clfName, tunelrn=self.pars['tunelrnr'])
                    lsh.prnt('---------------------------------------\n')
                c += 1
            # fout.write("===================================================================\n")
        # fout.close()
        print("File Processing Ended:" + self.file)
    except Exception as e:
        try:
            print(str(e))
            print(traceback.format_exc())
        except Exception as ex2:
            print("X2", str(ex2))
            print(traceback.format_exc())
    if self.expType == 'GIS':
        return gis
    elif self.expType == 'LSH':
        return lsh
def WCFolds(testSet, folds, file, fout, name, clfName):
    auc = 0
    preds = []
    actuals = []
    vals = None
    # Copy before shuffling: the original used a slice view, so shuffling
    # also reordered the caller's array.
    tssCopy = np.copy(testSet)
    rnd = random.Random(Common.getCurrentTimeMil())
    np.random.shuffle(tssCopy)
    skf = StratifiedKFold(n_splits=folds)
    X = tssCopy[:, :-1]
    y = tssCopy[:, -1]
    for train_index, test_index in skf.split(X, y):
        cvtrain, cvtest = X[train_index], X[test_index]
        cvtrainY, cvtestY = y[train_index], y[test_index]
        cvtrain = np.append(cvtrain, cvtrainY.reshape((len(cvtrainY), 1)), axis=1)
        cvtest = np.append(cvtest, cvtestY.reshape((len(cvtestY), 1)), axis=1)
        if name.lower().find("infogain") >= 0:
            # Feature selection is currently disabled; the original (Java) code was:
            # int indi[] = DPLIB.fSelectInfoGain(cvtrain);
            # if (DPLIB.useIterativeInfoGainSubsetting)
            #     indi = DPLIB.iterativeInfoGainSubsetting(cvtrain, indi, clfName);
            # else
            #     indi = DPLIB.getTopX(indi);
            # cvtrain = DPLIB.fSelectSet(cvtrain, indi);
            # cvtest = DPLIB.fSelectSet(cvtest, indi);
            pass
        m = GLOB(clfName).getClassifier()
        m.buildClassifier(cvtrain)
        vec = m.evaluateModel(cvtest)
        preds.append(vec)
        actuals.append(cvtestY)
        if vals is None:
            vals = DPLIB.getConfMatrix(cvtestY, vec)
        else:
            v2 = DPLIB.getConfMatrix(cvtestY, vec)
            for key in vals.keys():
                vals[key] += v2[key]
    auc = DPLIB.getAUCCV(actuals, preds)
    vals1 = DPLIB.getMeasures(vals)
    print(name + ":" + file + ": " + str(vals1) + " AUC = " + str(auc))
    fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals1))
def NNFilter(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets, testCut):
    startTime = Common.getCurrentTimeMil()
    spentISTime = 0
    tempTime = 0
    bestFit = 0.0
    bestCount = 0
    btrainSet = None
    cfbf = DPLIB.DefaultCF
    if count == 0:
        # Search the neighbour count k in 1..10 for the best validation fitness.
        for i in range(1, 11):
            tempTime = Common.getCurrentTimeMil()
            trainSet = DPLIB.NNFilter(trainSeti, testSet, i)
            spentISTime += Common.getCurrentTimeMil() - tempTime
            l = GLOB(clfName, tunelrn).getClassifier()
            if tunelrn:
                l = l.getTunedCLF(trainSet, vSets, fout, name, file)
            l.buildClassifier(trainSet)
            avgFit = 0.0
            for j in range(len(vSets)):
                vec = l.evaluateModel(vSets[j])
                tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec)
                measures = DPLIB.getExtMeasures(tvals)
                fit = measures["F"] * measures["GMean1"]
                avgFit += fit
            avgFit /= len(vSets)
            if avgFit > bestFit:
                bestFit = avgFit
                bestCount = i
                btrainSet = trainSet[:, :]
        if testCut:
            cf = 0
            trainSet = btrainSet
            l = GLOB(clfName, tunelrn).getClassifier()
            if tunelrn:
                l = l.getTunedCLF(trainSet, vSets, fout, name, file)
            l.buildClassifier(trainSet)
            avgFit = 0.0
            for j in range(len(vSets)):
                vec = l.evaluateModel(vSets[j])
                vCF = 0.1
                bestCF = 0
                bestCFVal = -1
                bestVals = None
                while True:
                    tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec, vCF)
                    measures = DPLIB.getExtMeasures(tvals)
                    fit = measures["F"] * measures["GMean1"]
                    if fit > bestCFVal or bestVals is None:
                        bestCFVal = fit
                        bestCF = vCF
                        bestVals = tvals
                    vCF += 0.1
                    if vCF >= 1:
                        break
                cf += bestCF
            # (The original read `cf /= vSets.size()`, a Java remnant.)
            cf /= len(vSets)
            cfbf = cf
    trainSet = None
    if count == 0:
        trainSet = btrainSet
    else:
        tempTime = Common.getCurrentTimeMil()
        trainSet = DPLIB.NNFilter(trainSeti, testSet, count)
        spentISTime = Common.getCurrentTimeMil() - tempTime
        bestCount = count
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        l = l.getTunedCLF(trainSet, vSets, fout, name, file)
        print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " + str(l.selectedParams))
        fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(l.selectedParams))
        fout.write("\n")
        sCheck = l.getCLFOptions()
        print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " + str(sCheck))
        fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(sCheck))
        fout.write("\n")
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    vecin = vec
    tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin, cfbf)
    if count == 0:
        print("#BESTCOUNT-" + name + ":" + file + ": " + str(bestCount))
        fout.write("#BESTCOUNT-" + name + ":" + file + ": ")
        fout.write(str(bestCount))
        fout.write("\n")
        print("#BESTFIT-" + name + ":" + file + ": " + str(bestFit))
        fout.write("#BESTFIT-" + name + ":" + file + ": ")
        fout.write(str(bestFit))
        fout.write("\n")
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    if testCut:
        print("#NN-BEST-CF-VALUE:" + name + ":" + file + ": " + str(cfbf))
        fout.write("#NN-BEST-CF-VALUE:" + name + ":" + file + ": ")
        fout.write(str(cfbf))
        fout.write("\n")
    vals = DPLIB.getMeasures(tvals)
    auc = DPLIB.getAUC(testSet[:, -1], vecin)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime))
    fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime) + "\n")
    return vecin
for key in FutureBugs.keys():
    bds = sorted([bugDates[b] for b in FutureBugs[key]])
    if len(bds) > 1:
        timespans.append((datetime.datetime.fromtimestamp(bds[-1])
                          - datetime.datetime.fromtimestamp(bds[0])).total_seconds() / 86400)
        timespansNoz.append(timespans[-1])
        ids.append(key)
    else:
        timespans.append(0)

print('2 Timespan of Bugs: Mean, STD, Median ',
      np.mean(timespans), np.std(timespans), np.median(timespans))
print('2 Timespan of Bugs (for multibugs only): Mean, STD, Median ',
      np.mean(timespansNoz), np.std(timespansNoz), np.median(timespansNoz))
print('2 Count Multi Bug:', len(timespansNoz))
print('2 Count One or Multi Bug:', len(timespans))

mad = DPLIB.MAD(timespansNoz)
umad = mad + np.median(timespansNoz)
lmad = np.median(timespansNoz) - mad
if len(timespansNoz) > 0:
    print('2 MAD, Median, UMAD, Percent>UMAD, Median Filtered:',
          mad, np.median(timespansNoz), umad,
          '%.2f' % (100 * len([t for t in timespansNoz if t > umad]) / len(timespansNoz)),
          np.median([t for t in timespansNoz if t > umad]))
else:
    # With no multi-bug timespans, the percentage and the medians are undefined.
    print('2 MAD, Median, UMAD, Percent>UMAD, Median Filtered:',
          mad, 'UNDEF', 'UNDEF', 'PERCENT UNDEF', 'UNDEF')
# numfiles = [(key, len(logsd[int(key)][logcolnames.index('files')].split('---')[:-1]))
#             for index, key in enumerate(ids) if timespansNoz[index] > umad]
# print(numfiles)
if len(timespansNoz) > 0:
    try:
        pass
        # plt.figure(figsize=(10,10))
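# Worked example of the MAD-based upper threshold used above (toy numbers;
# DPLIB.MAD is assumed to compute the median absolute deviation):
#
#   spans  = [1, 2, 2, 3, 10]      # timespans in days
#   median = 2                     # np.median(spans)
#   mad    = 1                     # median of |1-2|,|2-2|,|2-2|,|3-2|,|10-2| = median(1,0,0,1,8)
#   umad   = mad + median = 3      # spans strictly above 3 count as outliers -> [10]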