def __init__(self,
                 inputs,
                 labels,
                 featureName,
                 featureNames,
                 featureTypes,
                 subset=None,
                 boundary=None,
                 operator=''):
        """Build a decision node and partition its data into two child nodes.

        The column named ``featureName`` is looked up in ``featureNames``; its
        entry in ``featureTypes`` selects the kind of decision: ``'string'``
        yields ``Decision.Categorical(featureName, subset)``, anything else
        yields ``Decision.Numerical(featureName, operator, boundary)``.

        Args:
            inputs: 2-D array indexed as ``inputs[:, column]``.
            labels: array indexable by the boolean mask derived from the
                decision. It is also stored on this node.
            featureName: name of the feature this node splits on.
            featureNames: feature names compared element-wise against
                ``featureName`` (assumes a numpy array aligned with the
                columns of ``inputs`` -- TODO confirm against callers).
            featureTypes: per-column type tags; ``'string'`` marks a
                categorical column.
            subset: categories forwarded to the categorical decision. A fresh
                empty list is used when omitted.
            boundary: threshold forwarded to the numerical decision.
            operator: comparison name forwarded to the numerical decision.

        Raises:
            IndexError: if ``featureName`` does not occur in ``featureNames``.
        """
        # A literal [] default would be shared by every instance that omits
        # ``subset`` (and by whatever the Decision object keeps a reference
        # to), so allocate a new list per call instead.
        if subset is None:
            subset = []

        #Find the actual feature array based on the feature name
        colIndex = np.where(featureName == featureNames)[0][0]
        features = inputs[:, colIndex]

        #The key feature of nonterminal nodes is that they have a decision
        if featureTypes[colIndex] == 'string':
            #categorical feature, so the decision must be categorical
            self.decision = Decision.Categorical(featureName, subset)
        else:
            #numerical feature, so the decision must be numerical
            self.decision = Decision.Numerical(featureName, operator, boundary)

        #create the true and false nodes, both unexplored, by filtering the
        #data to each node based on the decision
        trueIndices = np.vectorize(self.decision.function)(features)
        falseIndices = np.logical_not(trueIndices)
        self.trueNode = Node(inputs[trueIndices], labels[trueIndices], '')
        self.falseNode = Node(inputs[falseIndices], labels[falseIndices], '')

        #the node will never use its input field again, but the labels field
        #is needed for tree pruning
        Node.__init__(self, [], labels, str(self.decision), 'decision')
示例#2
0
def selectBestFeature(inputs,
                      labels,
                      impurityCat,
                      impurityNum,
                      featuresToConsider,
                      featureNames,
                      featureTypes,
                      numIntervals,
                      seed=0):
    """Return the candidate split with the lowest combined child impurity.

    ``featuresToConsider`` feature names are sampled without replacement from
    ``featureNames``. For every sampled feature a set of binary splits is
    scored; a split's score is the sum of ``impurityResubLabel`` evaluated on
    the labels of each side, where each call also receives ``impurityCat``,
    ``impurityNum`` and the fraction of samples falling on that side.

    * Categorical columns (``featureTypes[col] == 'string'``): one split per
      distinct value, "equals this value" versus the rest.
    * Other columns: the sorted values are cut into at most ``numIntervals``
      chunks and the largest value of each chunk is tried as a
      ``vals <= bound`` threshold.

    Args:
        inputs: 2-D array indexed as ``inputs[:, column]``.
        labels: array aligned with the rows of ``inputs``.
        impurityCat: forwarded unchanged to ``impurityResubLabel``.
        impurityNum: forwarded unchanged to ``impurityResubLabel``.
        featuresToConsider: number of features to sample; must lie in
            ``[0, inputs.shape[1]]``.
        featureNames: feature names compared element-wise against each sampled
            name (assumes a numpy array aligned with the columns of
            ``inputs`` -- TODO confirm against callers).
        featureTypes: per-column type tags; ``'string'`` marks categorical.
        numIntervals: upper limit on the number of thresholds tried for a
            numerical feature.
        seed: value handed to ``random.seed``.

    Returns:
        A tuple ``(featureName, isCategorical, arguments)``. ``arguments`` is
        a one-element list holding the chosen category for a categorical
        split, or ``('less', boundary)`` for a numerical one. ``('', '', '')``
        is returned when no candidate scored below infinity, e.g. when
        ``featuresToConsider`` is 0.

    Raises:
        ValueError: if ``featuresToConsider`` is outside the valid range.
    """
    random.seed(seed)
    # NOTE(review): the sampling below goes through np.random.choice, so the
    # seed only influences it if ``random`` in this module is numpy's random
    # module rather than the stdlib one -- confirm against the file's imports.

    numFeatures = inputs.shape[1]
    if featuresToConsider > numFeatures or featuresToConsider < 0:
        # The exception used to be constructed but never raised, and the
        # message reported the offending argument instead of the real limit.
        raise ValueError(
            'featuresToConsider must be between 0 and {maxx}'.format(
                maxx=numFeatures))

    possibleFeatures = np.random.choice(featureNames, featuresToConsider,
                                        False)
    numSamples = len(labels)
    bestImpurity = np.inf
    bestSplit = ('', '', ''
                 )  #in the form (featureName, isCategorical, arguments)

    for name in possibleFeatures:
        colIndex = np.where(name == featureNames)[0][0]
        vals = inputs[:, colIndex]

        if featureTypes[colIndex] == 'string':
            #categorical feature: score a one-vs-rest split for each distinct
            #value and keep the one with the lowest impurity.
            #Starting from bestImpurity means a category is only recorded if
            #it beats the best split found on any earlier feature.
            subset = []
            splitImpurity = bestImpurity
            for cat in np.unique(vals):
                dec = Decision.Categorical(name, [cat])
                trueIndices = np.vectorize(dec.function)(vals)
                falseIndices = np.logical_not(trueIndices)
                trueNodeImpurity = impurityResubLabel(
                    labels[trueIndices], impurityCat, impurityNum,
                    np.sum(trueIndices) / numSamples)
                falseNodeImpurity = impurityResubLabel(
                    labels[falseIndices], impurityCat, impurityNum,
                    np.sum(falseIndices) / numSamples)
                candidateImpurity = trueNodeImpurity + falseNodeImpurity
                if splitImpurity > candidateImpurity:
                    subset = [cat]
                    splitImpurity = candidateImpurity
            split = (name, True, subset)

        else:
            #numerical feature: discretize the continuous values into
            #"numIntervals" bins, or as many as there are rows, and try the
            #upper edge of every bin as a threshold.
            numIntervalsForContinuousFeat = min(numIntervals, inputs.shape[0])
            possibleBounds = np.array([
                max(chunk) for chunk in np.array_split(
                    np.sort(vals), numIntervalsForContinuousFeat)
            ])
            splitImpurity = np.inf
            bestBoundary = 0
            for bound in possibleBounds:
                lessBound = labels[vals <= bound]
                moreBound = labels[vals > bound]
                trueNodeReImpurity = impurityResubLabel(
                    lessBound, impurityCat, impurityNum,
                    len(lessBound) / numSamples)
                falseNodeReImpurity = impurityResubLabel(
                    moreBound, impurityCat, impurityNum,
                    len(moreBound) / numSamples)
                imp = trueNodeReImpurity + falseNodeReImpurity
                if splitImpurity > imp:
                    splitImpurity = imp
                    bestBoundary = bound
            #the operator is reported as 'less' although the scoring above
            #partitions on vals <= bound
            split = (name, False, ('less', bestBoundary))

        if bestImpurity > splitImpurity:
            bestImpurity = splitImpurity
            bestSplit = split
    return bestSplit