Пример #1
0
def min_entropy_split(dataset):
    """Return the (attribute, split_point) pair with the lowest split entropy.

    Every unique value of every attribute in ``dataset.inputs`` is tried as a
    split point.  A candidate is scored by summing, over the subsets that
    ``ds.split_continuous`` returns for it,
    ``(subset size / dataset size) * entropy of that subset``.

    Inputs are assumed to be continuous column vectors (carried over from the
    original author's note).

    Returns:
        The ``(attribute, split_point)`` tuple with the smallest score; on a
        tie the candidate generated first wins.

    Raises:
        ValueError: if the dataset yields no candidate split at all.
    """
    total_examples = float(dataset.num_examples())
    hypotheses = []
    entropies = []
    # For each attribute, score every possible split point.
    for attribute in range(dataset.num_attributes()):
        for split_point in ds.unique_values(dataset.inputs, attribute):
            subsets = ds.split_continuous(dataset, attribute, split_point)
            # Each subset contributes its OWN entropy.  Using the entropy of
            # the whole dataset here would give every candidate the same
            # score, because the size weights always add up to one.
            weighted_entropy = sum(
                (subset.num_examples() / total_examples)
                * ds.dataset_entropy_discrete(subset)
                for subset in subsets)
            hypotheses.append((attribute, split_point))
            entropies.append(weighted_entropy)
    if not hypotheses:
        raise ValueError('min_entropy_split: no candidate splits in dataset')
    # Select the attribute/split pair that has the lowest entropy.
    return hypotheses[int(np.argmin(entropies))]
Пример #2
0
 def train(self, dataset, splitting_func=min_entropy_split):
     """Split this node on ``dataset`` and build one child tree per subset.

     Nothing is done here when ``self.make_leaf(dataset)`` is truthy
     (presumably ``make_leaf`` configures the leaf itself -- confirm against
     its definition).  Otherwise ``splitting_func(dataset)`` supplies the
     ``(split_attribute, split_point)`` pair, the dataset is partitioned with
     ``ds.split_continuous`` and a ``DecisionTree`` is created for each part.

     Args:
         dataset: the examples this node is trained on.
         splitting_func: callable taking a dataset and returning an
             ``(attribute, split_point)`` pair; it is also handed on to
             every child tree.

     Side effects:
         Sets ``self.split_attribute`` and ``self.split_point``, then either
         ``self.prediction`` (when the split yields fewer than two subsets)
         or ``self.children``, a list of ``(test, DecisionTree)`` pairs.
     """
     if self.make_leaf(dataset):
         return

     self.split_attribute, self.split_point = splitting_func(dataset)
     subsets = ds.split_continuous(dataset, self.split_attribute, self.split_point)
     if len(subsets) < 2:
         # The chosen split did not separate the data; keep the outputs as
         # this node's prediction instead of creating children.
         self.prediction = dataset.outputs
         return

     # Create & train child decision nodes.
     tests = create_binary_tests(self.split_attribute, self.split_point)
     subtrees = [DecisionTree(subset, splitting_func=splitting_func)
                 for subset in subsets]
     # Store a real list: under Python 3 a bare zip() is a one-shot iterator,
     # so the children would vanish after the first traversal.
     self.children = list(zip(tests, subtrees))
Пример #3
0
    def train(self, dataset, splitting_func=min_entropy_split):
        """Split this node on ``dataset`` and build one child tree per subset.

        Nothing is done here when ``self.make_leaf(dataset)`` is truthy
        (presumably ``make_leaf`` configures the leaf itself -- confirm
        against its definition).  Otherwise ``splitting_func(dataset)``
        supplies the ``(split_attribute, split_point)`` pair, the dataset is
        partitioned with ``ds.split_continuous`` and a ``DecisionTree`` is
        created for each part.

        Args:
            dataset: the examples this node is trained on.
            splitting_func: callable taking a dataset and returning an
                ``(attribute, split_point)`` pair; it is also handed on to
                every child tree.

        Side effects:
            Sets ``self.split_attribute`` and ``self.split_point``, then
            either ``self.prediction`` (when the split yields fewer than two
            subsets) or ``self.children``, a list of
            ``(test, DecisionTree)`` pairs.
        """
        if self.make_leaf(dataset):
            return

        self.split_attribute, self.split_point = splitting_func(dataset)
        subsets = ds.split_continuous(dataset, self.split_attribute,
                                      self.split_point)
        if len(subsets) < 2:
            # The chosen split did not separate the data; keep the outputs
            # as this node's prediction instead of creating children.
            self.prediction = dataset.outputs
            return

        # Create & train child decision nodes.
        tests = create_binary_tests(self.split_attribute, self.split_point)
        subtrees = [DecisionTree(subset, splitting_func=splitting_func)
                    for subset in subsets]
        # Store a real list: under Python 3 a bare zip() is a one-shot
        # iterator, so the children would vanish after the first traversal.
        self.children = list(zip(tests, subtrees))
Пример #4
0
def min_entropy_split(dataset):
    """Return the (attribute, split_point) pair with the lowest split entropy.

    Every unique value of every attribute in ``dataset.inputs`` is tried as a
    split point.  A candidate is scored by summing, over the subsets that
    ``ds.split_continuous`` returns for it,
    ``(subset size / dataset size) * entropy of that subset``.

    Inputs are assumed to be continuous column vectors (carried over from the
    original author's note).

    Returns:
        The ``(attribute, split_point)`` tuple with the smallest score; on a
        tie the candidate generated first wins.

    Raises:
        ValueError: if the dataset yields no candidate split at all.
    """
    total_examples = float(dataset.num_examples())
    hypotheses = []
    entropies = []
    # For each attribute, score every possible split point.
    for attribute in range(dataset.num_attributes()):
        for split_point in ds.unique_values(dataset.inputs, attribute):
            subsets = ds.split_continuous(dataset, attribute, split_point)
            # Each subset contributes its OWN entropy.  Using the entropy of
            # the whole dataset here would give every candidate the same
            # score, because the size weights always add up to one.
            weighted_entropy = sum(
                (subset.num_examples() / total_examples)
                * ds.dataset_entropy_discrete(subset)
                for subset in subsets)
            hypotheses.append((attribute, split_point))
            entropies.append(weighted_entropy)
    if not hypotheses:
        raise ValueError('min_entropy_split: no candidate splits in dataset')
    # Select the attribute/split pair that has the lowest entropy.
    return hypotheses[int(np.argmin(entropies))]