Пример #1
0
    def bfs(self, graph, data):
        """Breadth-first search over the feature graph.

        Visits each vertex once, evaluates a logistic-regression model on the
        data columns named by the vertex (plus the 'diagnosis' column), and
        collects every feature set whose accuracy exceeds 97%.

        Returns a tuple ``(features, score)`` of parallel lists: the selected
        feature sets and their accuracies (in percent).
        """
        from collections import deque  # O(1) pops from the front of the queue

        features = []   # feature sets that beat the 97% threshold
        score = []      # accuracy for each entry in `features`
        nvisited = []   # normalized column lists already evaluated
        visited, queue = set(), deque(graph.children)

        while queue:
            vertex = queue.popleft()
            if vertex not in visited:
                visited.add(vertex)
                queue.extend(vertex.children)
                # BUG FIX: copy before mutating. The original aliased
                # vertex.data, so append('diagnosis') and sort() corrupted
                # the vertex in place (and the corrupted list was later
                # appended to `features`).
                columns = list(vertex.data)
                columns.append('diagnosis')
                columns.sort()
                if columns not in nvisited:
                    nvisited.append(columns)
                    cancerData = data[columns]

                    # presumably returns accuracy in [0, 1] — scaled to percent
                    result = cm.LogesticRegression(cancerData) * 100
                    if 97 < result:
                        features.append(vertex.data)
                        score.append(result)

        return features, score
Пример #2
0
 def constructSolution(self, ant):
     """Build and score a candidate feature subset for *ant*.

     Feature j is selected with probability fp[j] / 2. The selected
     columns (plus column 0) are evaluated with logistic regression;
     an empty selection scores 0.5 (chance level). Stores the score in
     ``ant.val`` and a copy of the selection mask in ``ant.subsets``.
     """
     chosen = [
         1 if random.random() < self.fp[j] / 2.0 else 0
         for j in range(self.size)
     ]
     # Column 0 is always included; selected feature i maps to column i + 1.
     columns = [0] + [i + 1 for i, flag in enumerate(chosen) if flag]
     subset = self.data.iloc[:, columns]
     if sum(chosen) == 0:
         score = 0.5
     else:
         score = float(cm.LogesticRegression(subset))
     ant.val = score
     ant.subsets = copy.deepcopy(chosen)
     return ant
Пример #3
0
    def parentSelectionRanked(self, population, data):
        """Rank-based parent selection for the genetic algorithm.

        Scores every member of *population* with logistic regression (an
        empty feature set scores 0.5, i.e. chance level), ranks members by
        score, then fills a shuffled parent pool by rank-weighted sampling
        through ``self.rankMap``.

        Returns ``(parentPool, averageFitness, mostFitMember)``.
        """
        def _fitness(member):
            # Chance-level score when no feature is selected.
            if sum(member) == 0:
                return 0.5
            cols = [0] + [i + 1 for i, flag in enumerate(member) if flag]
            return float(cm.LogesticRegression(data.iloc[:, cols]))

        popScore = [(_fitness(member), member) for member in population]
        scoreList = [pair[0] for pair in popScore]

        # Ascending by score, so the fittest member ends up last.
        popScore.sort(key=lambda pair: pair[0])

        # k is the sum of ranks 0..popSize-1; randomFlip(k) picks a point on
        # that triangle and rankMap translates it into a member index, so
        # higher-ranked members are drawn proportionally more often.
        popSize = self.popSize
        k = int((popSize - 1) * popSize / 2)
        parentPool = [
            copy.deepcopy(popScore[self.rankMap.get(self.randomFlip(k))][1])
            for _ in range(popSize)
        ]
        random.shuffle(parentPool)

        return parentPool, np.mean(scoreList), popScore[-1]
Пример #4
0
    def startSearch(self):
        data = self.data
        size = self.size
        currentTemp = self.t0
        endTemp = self.t1
        alpha = self.alpha

        shortCancerData = data
        currentScore = 0
        featureSetIndex = self.randomStart()
        bestSolScore = currentScore
        bestSolSet = featureSetIndex[:]

        featureSize = len(featureSetIndex)

        # set termination conditions
        maxCounter = math.pow(2, size)
        counter = 0
        iterSize = 100
        print "Started Simulated Annealing with data size: %d,  t: %.2f and limit: %d ... " % (
            size, currentTemp, self.limit)

        while counter < maxCounter and currentTemp > endTemp:
            for ind in range(iterSize):
                # select the index of a random feature to include or exclude
                k = self.randomFlip(featureSize)
                featureSetIndex[k] = (featureSetIndex[k] != 1)

                # if the result makes the set contain zero features, pick again
                while sum(featureSetIndex) == 0:
                    featureSetIndex[k] = (featureSetIndex[k] != 1)
                    k = self.randomFlip(featureSize)
                    featureSetIndex[k] = (featureSetIndex[k] != 1)

                # use the indice to construct the model and run evaluation function
                features = [0]
                for i, obj in enumerate(featureSetIndex):
                    if obj:
                        features.append(i + 1)
                newSCD = shortCancerData.iloc[:, features]
                score = float(cm.LogesticRegression(newSCD))

                # OPTIONAL: record down the best result set. This is not part of the SA algorithm.
                if score > bestSolScore:
                    bestSolScore = score
                    bestSolSet = copy.deepcopy(featureSetIndex)

                # Perform score evaluation according to the current T
                if score > currentScore:
                    currentScore = score
                else:
                    x = random.random()
                    acceptanceX = math.exp(
                        (currentScore - score) / currentTemp)
                    if x < acceptanceX:
                        currentScore = score
                    else:
                        featureSetIndex[k] = (featureSetIndex[k] != 1)

            currentTemp = currentTemp * alpha
            iterSize = int(math.ceil(iterSize / alpha))

            if not self.silent:
                print "Calculation round %.6f complete. CBA: %.6f; CA %.6f" % (
                    currentTemp, bestSolScore, currentScore)
                shortFeaturesName = list(shortCancerData.columns.values)
                selectedFeaturesName = []
                for ind, obj in enumerate(bestSolSet):
                    if obj:
                        selectedFeaturesName.append(shortFeaturesName[ind + 1])
                print "Features Selected: ",
                print selectedFeaturesName

            gc.collect()
            counter += 1

        # this is the result of the SA algorithm (local optimum)
        shortFeaturesName = list(shortCancerData.columns.values)
        selectedFeaturesName = []
        for ind, obj in enumerate(featureSetIndex):
            if obj:
                selectedFeaturesName.append(shortFeaturesName[ind + 1])

        # this is the result of the tracking maximum (possible global optimum)
        bestFeatureName = []
        for ind, obj in enumerate(bestSolSet):
            if obj:
                bestFeatureName.append(shortFeaturesName[ind + 1])

        self.result = [("Current", selectedFeaturesName, currentScore),
                       ("Best", bestFeatureName, bestSolScore)]
        return self.result
Пример #5
0
    def startSearch(self):
        """Run tabu search over feature subsets.

        Each round evaluates every single-feature flip of the current mask,
        then takes the best admissible move: the best non-tabu flip, or a
        tabu flip that matches/beats the best score seen so far (aspiration).
        Flipped features are tabu for ``t`` rounds via ``shortTermMemory``.
        Stops when no move is admissible or the iteration budget runs out.

        Returns ``(selectedFeaturesName, longTermMemory)``: the final
        selected feature names and the best score observed.
        """
        data = self.data
        size = self.size
        t = self.t
        limit = self.limit

        shortCancerData = data
        shortTermMemory = np.zeros(size)   # per-feature tabu countdown
        longTermMemory = 0                 # best score observed so far
        featureSetIndex = self.randomStart()
        bestSol = []

        # set termination conditions
        ret = False
        maxCounter = math.pow(2, size)
        counter = 0
        featureRes = None
        featureIndex = None

        print(
            "Started Tabu Search with data size: %d,  t: %d and limit: %d ... "
            % (size, t, limit if limit else -1))

        while not ret and counter < maxCounter:
            # score every legal one-feature flip of the current mask
            allResults = []
            for ind, obj in enumerate(featureSetIndex):
                nFeatureSetIndex = copy.deepcopy(featureSetIndex)
                nFeatureSetIndex[ind] = (obj != 1)
                if sum(nFeatureSetIndex) != 0 and (
                        sum(nFeatureSetIndex) < limit if limit else True):
                    features = [0]
                    for i, obj1 in enumerate(nFeatureSetIndex):
                        if obj1:
                            features.append(i + 1)

                    newSCD = shortCancerData.iloc[:, features]
                    result = cm.LogesticRegression(newSCD)
                    allResults.append((result, ind))

            allResults.sort(reverse=True)
            for index, var in enumerate(allResults):
                featureRes = var[0]
                featureIndex = var[1]
                if shortTermMemory[featureIndex] == 0:
                    # best non-tabu move: take it and mark the feature tabu
                    featureSetIndex[featureIndex] = (
                        featureSetIndex[featureIndex] != 1)
                    shortTermMemory[:] = [
                        x - 1 if x != 0 else x for x in shortTermMemory
                    ]
                    shortTermMemory[featureIndex] = t
                    longTermMemory = featureRes if featureRes > longTermMemory else longTermMemory
                    # The original conditional here was dead: longTermMemory
                    # was just updated and both branches were identical.
                    bestSol = featureSetIndex
                    ret = False
                    break
                elif featureRes >= longTermMemory:
                    # aspiration: a tabu move good enough to override the ban
                    featureSetIndex[featureIndex] = (
                        featureSetIndex[featureIndex] != 1)
                    shortTermMemory[:] = [
                        x - 1 if x != 0 else x for x in shortTermMemory
                    ]
                    shortTermMemory[featureIndex] = t
                    bestSol = featureSetIndex
                    ret = False
                elif index == (len(featureSetIndex) - 1):
                    # NOTE(review): compares against len(featureSetIndex),
                    # but the loop runs over allResults (which can be shorter
                    # when `limit` filters moves) — confirm this is intended.
                    ret = True
            if not self.silent:
                print("Calculation round %d complete. CBA: %s; CA %s" %
                      (counter, longTermMemory,
                       featureRes if featureRes else "NaN"))
                shortFeaturesName = list(shortCancerData.columns.values)
                selectedFeaturesName = []
                for ind, obj in enumerate(featureSetIndex):
                    if obj:
                        selectedFeaturesName.append(shortFeaturesName[ind + 1])
                print(selectedFeaturesName)

            gc.collect()
            counter += 1

        shortFeaturesName = list(shortCancerData.columns.values)
        selectedFeaturesName = []
        for ind, obj in enumerate(featureSetIndex):
            if obj:
                selectedFeaturesName.append(shortFeaturesName[ind + 1])

        self.result = (selectedFeaturesName, longTermMemory)
        # BUG FIX: the original `return result` raised NameError — the value
        # was stored on self.result, never in a local `result`.
        return self.result