# NOTE: the method excerpts below rely on module-level imports (copy, random,
# math, gc, numpy as np) and the project's cm scoring helper (cm.LogesticRegression).

def bfs(self, graph, data):
    # Breadth-first search over the feature graph: every node whose feature
    # subset scores above 97% accuracy is recorded along with its score.
    features = []
    score = []
    nvisited = []   # feature-column combinations already evaluated
    visited, queue = set(), []
    for i in graph.children:
        queue.append(i)
    while queue:
        vertex = queue.pop(0)
        if vertex not in visited:
            visited.add(vertex)
            queue.extend(vertex.children)
            # copy the node's feature list so the graph node is not mutated
            columns = sorted(vertex.data + ['diagnosis'])
            if columns not in nvisited:
                nvisited.append(columns)
                cancerData = data[columns]
                result = cm.LogesticRegression(cancerData) * 100
                if result > 97:
                    features.append(vertex.data)
                    score.append(result)
                    # print('Found score ' + str(result))
                    # print(vertex.data)
    return features, score
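# --- Illustrative sketch (assumption, not part of the repo) ---
# bfs() above expects graph nodes that expose .children (child nodes) and
# .data (the list of feature names for that subset), with 'diagnosis' as the
# label column.  A hypothetical stand-in for that node structure, showing the
# same queue-based visiting order:

from collections import deque

class FeatureNode(object):
    def __init__(self, data, children=None):
        self.data = data                # feature column names for this node
        self.children = children or []  # nodes representing larger subsets

root = FeatureNode([], [
    FeatureNode(['radius_mean'],
                [FeatureNode(['radius_mean', 'texture_mean'])]),
    FeatureNode(['texture_mean']),
])

queue = deque(root.children)
while queue:
    node = queue.popleft()
    print(sorted(node.data + ['diagnosis']))  # columns that would be evaluated
    queue.extend(node.children)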
def constructSolution(self, ant):
    # Build one candidate solution for an ant: each feature j is included
    # with a probability driven by its pheromone level self.fp[j].
    featureSetIndex = []
    for j in range(self.size):
        decision = random.random()
        if decision < self.fp[j] / 2.0:
            featureSetIndex.append(1)
        else:
            featureSetIndex.append(0)
    if sum(featureSetIndex) == 0:
        # no feature selected: score it as a random prediction
        score = 0.5
    else:
        # column 0 is always kept; selected feature i maps to column i + 1
        features = [0]
        for i, obj in enumerate(featureSetIndex):
            if obj:
                features.append(i + 1)
        newdata = self.data.iloc[:, features]
        score = float(cm.LogesticRegression(newdata))
    ant.val = score
    ant.subsets = copy.deepcopy(featureSetIndex)
    return ant
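# --- Illustrative sketch (assumption, not part of the repo) ---
# The construction step above includes feature j with probability fp[j] / 2,
# so a pheromone value of 1.0 gives a 50% inclusion chance and 2.0 gives
# certainty.  A minimal, self-contained version of that sampling rule
# (the pheromone values below are made up):

import random

def sample_subset(pheromone):
    # one bit per feature: 1 = include, 0 = exclude
    return [1 if random.random() < p / 2.0 else 0 for p in pheromone]

pheromone = [1.0, 0.2, 1.8, 0.6]      # hypothetical per-feature trail levels
print(sample_subset(pheromone))        # e.g. [1, 0, 1, 0]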
def parentSelectionRanked(self, population, data):
    popScore = []
    scoreList = []
    parentPool = []
    shortCancerData = data
    rankMap = self.rankMap
    popSize = self.popSize
    # get a fitness score for each parent
    for featureSetIndex in population:
        # if no feature is selected, score it as a random prediction
        score = 0.5
        if sum(featureSetIndex) != 0:
            features = [0]
            for i, obj in enumerate(featureSetIndex):
                if obj:
                    features.append(i + 1)
            newSCD = shortCancerData.iloc[:, features]
            score = float(cm.LogesticRegression(newSCD))
        popScore.append((score, featureSetIndex))
        scoreList.append(score)
    # rank the parents by score (ascending, so the fittest member is last)
    popScore.sort(key=lambda x: x[0])
    # total number of rank "tickets": 0 + 1 + ... + (popSize - 1)
    k = int((popSize - 1) * popSize / 2)
    # populate the parent pool: higher-ranked members hold more tickets
    for i in range(popSize):
        roll = self.randomFlip(k)
        ind = rankMap.get(roll)
        parentPool.append(copy.deepcopy(popScore[ind][1]))
    random.shuffle(parentPool)
    averageFitness = np.mean(scoreList)
    mostFitMember = popScore[-1]
    return parentPool, averageFitness, mostFitMember
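# --- Illustrative sketch (assumption, not part of the repo) ---
# self.rankMap and self.randomFlip are defined elsewhere; the usage above is
# consistent with linear rank selection, where rank r (0 = worst) owns r
# tickets out of k = popSize*(popSize-1)/2 in total.  A hypothetical way such
# a map could be built and drawn from:

import random

def build_rank_map(pop_size):
    rank_map, ticket = {}, 0
    for rank in range(pop_size):
        for _ in range(rank):          # rank r owns r tickets
            rank_map[ticket] = rank
            ticket += 1
    return rank_map                    # pop_size*(pop_size-1)//2 tickets in all

rank_map = build_rank_map(5)           # 10 tickets for ranks 0..4
roll = random.randrange(len(rank_map)) # stand-in for self.randomFlip(k)
print(rank_map[roll])                  # index into the score-sorted population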
def startSearch(self):
    data = self.data
    size = self.size
    currentTemp = self.t0
    endTemp = self.t1
    alpha = self.alpha
    shortCancerData = data
    currentScore = 0
    featureSetIndex = self.randomStart()
    bestSolScore = currentScore
    bestSolSet = featureSetIndex[:]
    featureSize = len(featureSetIndex)
    # set termination conditions
    maxCounter = math.pow(2, size)
    counter = 0
    iterSize = 100
    print("Started Simulated Annealing with data size: %d, t: %.2f and limit: %d ... "
          % (size, currentTemp, self.limit))
    while counter < maxCounter and currentTemp > endTemp:
        for ind in range(iterSize):
            # select the index of a random feature to include or exclude
            k = self.randomFlip(featureSize)
            featureSetIndex[k] = (featureSetIndex[k] != 1)
            # if the flip leaves zero selected features, undo it and pick again
            while sum(featureSetIndex) == 0:
                featureSetIndex[k] = (featureSetIndex[k] != 1)
                k = self.randomFlip(featureSize)
                featureSetIndex[k] = (featureSetIndex[k] != 1)
            # use the indices to construct the model and run the evaluation function
            features = [0]
            for i, obj in enumerate(featureSetIndex):
                if obj:
                    features.append(i + 1)
            newSCD = shortCancerData.iloc[:, features]
            score = float(cm.LogesticRegression(newSCD))
            # OPTIONAL: track the best set seen so far. This is not part of the SA algorithm.
            if score > bestSolScore:
                bestSolScore = score
                bestSolSet = copy.deepcopy(featureSetIndex)
            # accept or reject the move according to the current temperature
            if score > currentScore:
                currentScore = score
            else:
                # Metropolis criterion for maximisation: accept a worse move
                # with probability exp((score - currentScore) / T)
                x = random.random()
                acceptanceX = math.exp((score - currentScore) / currentTemp)
                if x < acceptanceX:
                    currentScore = score
                else:
                    featureSetIndex[k] = (featureSetIndex[k] != 1)
        currentTemp = currentTemp * alpha
        iterSize = int(math.ceil(iterSize / alpha))
        if not self.silent:
            print("Calculation round %.6f complete. CBA: %.6f; CA %.6f"
                  % (currentTemp, bestSolScore, currentScore))
            shortFeaturesName = list(shortCancerData.columns.values)
            selectedFeaturesName = []
            for ind, obj in enumerate(bestSolSet):
                if obj:
                    selectedFeaturesName.append(shortFeaturesName[ind + 1])
            print("Features Selected: ", selectedFeaturesName)
        gc.collect()
        counter += 1
    # this is the result of the SA algorithm (local optimum)
    shortFeaturesName = list(shortCancerData.columns.values)
    selectedFeaturesName = []
    for ind, obj in enumerate(featureSetIndex):
        if obj:
            selectedFeaturesName.append(shortFeaturesName[ind + 1])
    # this is the result of the tracked maximum (possible global optimum)
    bestFeatureName = []
    for ind, obj in enumerate(bestSolSet):
        if obj:
            bestFeatureName.append(shortFeaturesName[ind + 1])
    self.result = [("Current", selectedFeaturesName, currentScore),
                   ("Best", bestFeatureName, bestSolScore)]
    return self.result
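# --- Illustrative sketch (assumption, not part of the repo) ---
# The acceptance step above follows the Metropolis rule for maximisation:
# improving moves are always taken, worse moves are taken with probability
# exp((candidate - current) / T), which shrinks as the temperature T cools.
# The numbers below are made up.

import math
import random

def accept(current, candidate, temperature):
    if candidate > current:
        return True
    return random.random() < math.exp((candidate - current) / temperature)

print(accept(0.95, 0.93, 1.0))    # often True: hot system tolerates a small loss
print(accept(0.95, 0.93, 0.001))  # almost always False: cold system is greedy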
def startSearch(self):
    data = self.data
    size = self.size
    t = self.t
    limit = self.limit
    shortCancerData = data
    shortTermMemory = np.zeros(size)   # tabu tenure remaining for each feature flip
    longTermMemory = 0                 # best score found so far
    featureSetIndex = self.randomStart()
    bestSol = []
    # set termination conditions
    ret = False
    maxCounter = math.pow(2, size)
    counter = 0
    featureRes = None
    featureIndex = None
    print("Started Tabu Search with data size: %d, t: %d and limit: %d ... "
          % (size, t, limit if limit else -1))
    while not ret and counter < maxCounter:
        # evaluate every single-flip neighbour of the current feature set
        allResults = []
        for ind, obj in enumerate(featureSetIndex):
            nFeatureSetIndex = copy.deepcopy(featureSetIndex)
            nFeatureSetIndex[ind] = (obj != 1)
            if sum(nFeatureSetIndex) != 0 and (
                    sum(nFeatureSetIndex) < limit if limit else True):
                features = [0]
                for i, obj1 in enumerate(nFeatureSetIndex):
                    if obj1:
                        features.append(i + 1)
                newSCD = shortCancerData.iloc[:, features]
                result = cm.LogesticRegression(newSCD)
                allResults.append((result, ind))
        allResults.sort(reverse=True)
        # take the best admissible move (non-tabu, or tabu but aspirated)
        for index, var in enumerate(allResults):
            featureRes = var[0]
            featureIndex = var[1]
            if shortTermMemory[featureIndex] == 0:
                featureSetIndex[featureIndex] = (
                    featureSetIndex[featureIndex] != 1)
                shortTermMemory[:] = [
                    x - 1 if x != 0 else x for x in shortTermMemory
                ]
                shortTermMemory[featureIndex] = t
                if featureRes > longTermMemory:
                    longTermMemory = featureRes
                    bestSol = copy.deepcopy(featureSetIndex)
                ret = False
                break
            elif featureRes >= longTermMemory:
                # aspiration criterion: a tabu move is allowed if it matches
                # or beats the best score seen so far
                featureSetIndex[featureIndex] = (
                    featureSetIndex[featureIndex] != 1)
                shortTermMemory[:] = [
                    x - 1 if x != 0 else x for x in shortTermMemory
                ]
                shortTermMemory[featureIndex] = t
                longTermMemory = featureRes
                bestSol = copy.deepcopy(featureSetIndex)
                ret = False
                break
            elif index == (len(allResults) - 1):
                # no admissible move left: stop the search
                ret = True
        if not self.silent:
            print("Calculation round %d complete. CBA: %s; CA %s"
                  % (counter, longTermMemory,
                     featureRes if featureRes else "NaN"))
            shortFeaturesName = list(shortCancerData.columns.values)
            selectedFeaturesName = []
            for ind, obj in enumerate(featureSetIndex):
                if obj:
                    selectedFeaturesName.append(shortFeaturesName[ind + 1])
            print(selectedFeaturesName)
        gc.collect()
        counter += 1
    shortFeaturesName = list(shortCancerData.columns.values)
    selectedFeaturesName = []
    for ind, obj in enumerate(featureSetIndex):
        if obj:
            selectedFeaturesName.append(shortFeaturesName[ind + 1])
    self.result = (selectedFeaturesName, longTermMemory)
    return self.result
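# --- Illustrative sketch (assumption, not part of the repo) ---
# shortTermMemory above implements the tabu list as per-feature tenures:
# once a feature is flipped it becomes tabu for t iterations, and every
# accepted move ages all other active tenures by one.  A minimal stand-alone
# version (feature count and tenure below are made up):

import numpy as np

tenure = 3
short_term_memory = np.zeros(5)

def apply_move(memory, moved_index, t):
    # age every active tenure, then mark the moved feature as tabu for t rounds
    memory[:] = [x - 1 if x != 0 else x for x in memory]
    memory[moved_index] = t
    return memory

apply_move(short_term_memory, 2, tenure)
apply_move(short_term_memory, 4, tenure)
print(short_term_memory)   # [0. 0. 2. 0. 3.] -> feature 2 has aged, feature 4 is fresh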