import numpy as np
# `ds` refers to this project's dataset-utilities module (unique_values,
# split_continuous, dataset_entropy_discrete); it is assumed to be imported
# elsewhere in this file.


def min_entropy_split(dataset):
    # Assume inputs are continuous, and are column vectors.
    hypotheses = []
    entropies = []
    total = float(dataset.num_examples())
    # For each attribute, find the best split point.
    for attribute in range(dataset.num_attributes()):
        values = ds.unique_values(dataset.inputs, attribute)
        # Iterate over the possible split values and calculate the entropy
        # of each candidate split.
        for split_point in values:
            subsets = ds.split_continuous(dataset, attribute, split_point)
            # Entropy of each subset, weighted by its share of the examples.
            # (The entropy must be computed on the subset, not on the whole
            # dataset, or every candidate split scores identically.)
            split_entropy = sum(
                (subset.num_examples() / total)
                * ds.dataset_entropy_discrete(subset)
                for subset in subsets)
            hypotheses.append((attribute, split_point))
            entropies.append(split_entropy)
    # Select the (attribute, split_point) pair with the lowest entropy.
    min_idx = np.argmin(entropies)
    return hypotheses[min_idx]
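# Illustration (not part of the original module): the quantity that
# min_entropy_split() minimizes is the size-weighted entropy of the subsets
# a split produces. A minimal self-contained sketch with plain numpy,
# assuming discrete labels; the helper names `discrete_entropy` and
# `weighted_split_entropy` are hypothetical, not from this project.
def discrete_entropy(labels):
    # H(Y) = -sum_c p_c * log2(p_c) over the observed classes.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())


def weighted_split_entropy(values, labels, split_point):
    # Entropy of each side of the split, weighted by its share of examples.
    values, labels = np.asarray(values), np.asarray(labels)
    mask = values < split_point
    n = float(len(labels))
    total = 0.0
    for side in (labels[mask], labels[~mask]):
        if len(side) > 0:
            total += (len(side) / n) * discrete_entropy(side)
    return total

# e.g. weighted_split_entropy([1., 2., 3., 4.], [0, 0, 1, 1], 2.5) == 0.0:
# the split at 2.5 separates the classes perfectly, so both sides are pure.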
def train(self, dataset, splitting_func=min_entropy_split):
    if not self.make_leaf(dataset):
        self.split_attribute, self.split_point = splitting_func(dataset)
        data_sets = ds.split_continuous(dataset, self.split_attribute,
                                        self.split_point)
        if len(data_sets) < 2:
            # Degenerate split: every example landed on one side, so
            # predict from the outputs seen at this node instead.
            self.prediction = dataset.outputs
            return

        def tree_split(subset):
            return DecisionTree(subset, splitting_func=splitting_func)

        # Create and train the child decision nodes.
        tests = create_binary_tests(self.split_attribute, self.split_point)
        self.children = list(zip(tests, map(tree_split, data_sets)))
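# Hypothetical usage sketch (not from this file): tree_split() above suggests
# DecisionTree's constructor takes a dataset plus a splitting_func and trains
# itself, so building a tree might look like:
#
#   tree = DecisionTree(training_dataset, splitting_func=min_entropy_split)
#
# The name `training_dataset` and any predict()-style query method are
# assumptions, not part of the code shown here.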