from os.path import abspath
from math import ceil
import random
# parse_c45, ExampleSet, and DATA_PATH are assumed to come from the
# project's data-handling module.


def load_project_data(project_name, n_folds=5):
    """Returns a list of n_folds (train, test) ExampleSet pairs for
    n-fold cross-validation. The split is stratified: positive and
    negative examples are divided across folds separately, so every
    fold keeps roughly the original class balance."""
    root_dir = abspath(DATA_PATH)
    data = parse_c45(project_name, root_dir)
    pos_data = [ex for ex in data if ex[-1]]
    neg_data = [ex for ex in data if not ex[-1]]
    random.shuffle(pos_data)
    random.shuffle(neg_data)
    # examples of each class per fold, rounded up
    n_pos_fold = int(ceil(len(pos_data) / float(n_folds)))
    n_neg_fold = int(ceil(len(neg_data) / float(n_folds)))
    folds = []
    for i in range(n_folds):
        pos_fold = pos_data[n_pos_fold * i:n_pos_fold * (i + 1)]
        neg_fold = neg_data[n_neg_fold * i:n_neg_fold * (i + 1)]
        # Shuffle the two classes together within each fold; training on
        # all positive then all negative examples seems like a bad idea.
        # It should not matter for deterministic backprop, but it will
        # for stochastic backprop, and it can't hurt.
        pos_fold.extend(neg_fold)
        random.shuffle(pos_fold)
        folds.append(pos_fold)
    # create the different (training, test) set pairs
    fold_sets = []
    for i in range(n_folds):
        test = folds.pop(i)
        train = []
        for fold in folds:
            train.extend(fold)
        fold_sets.append((ExampleSet(train), ExampleSet(test)))
        folds.insert(i, test)
    return fold_sets
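# A minimal sketch of how the fold sets above might be consumed; the use
# of Node and the attribute indices [1, 3] are assumptions borrowed from
# the tests elsewhere in this module, not part of load_project_data itself.
def cross_validate(project_name, n_folds=5):
    """Train and score one tree per fold; return the mean accuracy."""
    accuracies = []
    for train_set, test_set in load_project_data(project_name, n_folds):
        tree = Node()
        tree.train(train_set, [1, 3])
        n_correct = sum(1 for ex in test_set if tree.predict(ex) == ex[-1])
        accuracies.append(n_correct / float(len(test_set)))
    return sum(accuracies) / len(accuracies)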
def test_part_discrete_data(self):
    train_data, test_data = load_project_data('example')
    examples = [ex for ex in train_data] + [ex for ex in test_data]
    data = ExampleSet(examples)
    n = Node()
    H_x, H_y_x, part_data = n.partition_data(data, 1)
    self.assertAlmostEqual(0.61219, H_y_x, 3)
    self.assertAlmostEqual(1.5849, H_x, 3)
    H_x, H_y_x, part_data = n.partition_data(data, 3)
    self.assertAlmostEqual(0.61219, H_y_x, 3)
    self.assertAlmostEqual(1.5849, H_x, 3)
    attr_index, part_data = n.max_GR(examples, [1, 3])
    self.assertEqual(attr_index, 1)
    part_data_test = {}
    for ex in examples:
        part_data_test.setdefault(ex[1], []).append(ex)
    self.assertEqual(part_data_test, part_data)
    attr_index, part_data = n.max_GR(examples, [3, 1])
    self.assertEqual(attr_index, 3)
    part_data_test = {}
    for ex in examples:
        part_data_test.setdefault(ex[3], []).append(ex)
    self.assertEqual(part_data_test, part_data)
def train(self, ex_set, attr_set, depth=0):
    """Trains the tree on the given data. `depth` tracks how deep this
    node is so that the MAX_DEPTH stopping condition can be enforced."""
    ex_set = ExampleSet(ex_set)
    mcc, partable = self.check_ex_set(ex_set, attr_set)
    attr, part_data = self.max_GR(ex_set, attr_set)
    if part_data and not (depth == MAX_DEPTH and MAX_DEPTH > 0):
        self.attr_index = attr
        if ex_set.schema[attr].type == "CONTINUOUS":
            new_attr_set = attr_set[:]
            new_attr_set.remove(attr)
        else:
            new_attr_set = attr_set
        for feature, sub_data in part_data.iteritems():
            self.children[feature] = Node()
            self.children[feature].train(sub_data, new_attr_set, depth + 1)
    else:
        self.is_leaf = True
        self.classifier = mcc  # most common class label
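# train() delegates attribute selection to max_GR, which is not shown in
# this file. A minimal sketch, assuming the usual gain-ratio criterion
# GR(X) = (H(Y) - H(Y|X)) / H(X) built on partition_data's entropies.
# label_entropy is a hypothetical helper for H(Y), and this method would
# live on Node alongside train:
def max_GR(self, ex_set, attr_set):
    """Return (attr_index, part_data) for the attribute with max gain ratio."""
    best_attr, best_gr, best_part = None, 0.0, None
    h_y = self.label_entropy(ex_set)  # hypothetical: entropy of the labels
    for attr in attr_set:
        h_x, h_y_x, part_data = self.partition_data(ex_set, attr)
        if not part_data or h_x == 0:  # attribute cannot split this set
            continue
        gr = (h_y - h_y_x) / h_x
        # strict > keeps the first of any tied attributes, which matches
        # the order-dependent tie-breaking the tests above rely on
        if gr > best_gr:
            best_attr, best_gr, best_part = attr, gr, part_data
    return best_attr, best_part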
def load_project_data(project_name):
    """Returns two ExampleSets from the specified project. The first
    contains 4/5 of the data for training; the second contains the
    remaining 1/5 for testing."""
    root_dir = abspath(DATA_PATH)
    data = parse_c45(project_name, root_dir)
    n_data = len(data)
    n_train = int(floor(4 / 5.0 * n_data))
    train_choices = set(random.sample(xrange(n_data), n_train))
    train_data, test_data = [], []
    for i, ex in enumerate(data):
        if i in train_choices:
            train_data.append(ex)
        else:
            test_data.append(ex)
    return ExampleSet(train_data), ExampleSet(test_data)
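# A quick demonstration of the 80/20 loader above, assuming the same
# 'example' project the tests in this module rely on:
if __name__ == '__main__':
    train_data, test_data = load_project_data('example')
    print "%d training / %d test examples" % (len(train_data), len(test_data))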
def test_discrete_predict(self):
    train_data, test_data = load_project_data('example')
    examples = [ex for ex in train_data] + [ex for ex in test_data]
    data = ExampleSet(examples)
    n = Node()
    n.train(data, [1, 3])  # only train on the discrete attributes
    self.assertTrue(all(ex[-1] == n.predict(ex) for ex in data))
    n.train(data, [3, 1])  # only train on the discrete attributes
    self.assertTrue(all(ex[-1] == n.predict(ex) for ex in data))
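# The test above calls Node.predict, which is not shown in this file. A
# minimal sketch, assuming each trained internal node stores attr_index,
# binner, and a children dict keyed by binned feature value (as set up in
# train above); the fallback for unseen feature values is an assumption:
def predict(self, ex):
    """Classify a single example by walking the tree down to a leaf."""
    if self.is_leaf:
        return self.classifier
    bin = self.binner(ex[self.attr_index])
    if bin not in self.children:
        return self.classifier  # assumed fallback for unseen values
    return self.children[bin].predict(ex)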
def partition_data(self, ex_set, attr_index):
    """Returns a 3-tuple of (H_x, H_y_x, partitioned_data)."""
    part_data = {}
    ex_set = ExampleSet(ex_set)
    if ex_set.schema[attr_index].type == 'CONTINUOUS':
        # Continuous attribute: sort by the attribute and consider a
        # threshold at every point where the class label changes, keeping
        # the split that minimizes the conditional entropy H(Y|X) and so
        # maximizes information gain.
        self.binner = ContBinner()
        ex_set = sorted(ex_set, key=lambda x: x[attr_index])
        best_split = (None, float('inf'), None)
        for ex1, ex2 in zip(ex_set[:-1], ex_set[1:]):
            if ex1[-1] == ex2[-1]:  # not a candidate threshold
                continue
            self.binner.threshold = ex1[attr_index]
            part_data = {}
            for ex in ex_set:
                bin = self.binner(ex[attr_index])
                part_data.setdefault(bin, []).append(ex)
            H_x, H_y_x = self.calc_entropies(part_data)
            if H_y_x < best_split[1]:
                best_split = (H_x, H_y_x, part_data)
        H_x, H_y_x, part_data = best_split
    else:
        for ex in ex_set:
            # bin the value; this is really only needed for continuous
            # attributes, where the binner maps values to threshold sides
            bin = self.binner(ex[attr_index])
            part_data.setdefault(bin, []).append(ex)
        H_x, H_y_x = self.calc_entropies(part_data)
    return H_x, H_y_x, part_data
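# partition_data depends on ContBinner and calc_entropies, neither of which
# is shown in this file. Minimal sketches, assuming binary labels in ex[-1];
# the names and shapes here are reconstructed from the calls above, and
# calc_entropies would live on Node:
from math import log

class ContBinner(object):
    """Callable that bins a continuous value against a stored threshold."""
    def __init__(self, threshold=None):
        self.threshold = threshold

    def __call__(self, value):
        return value <= self.threshold

def calc_entropies(self, part_data):
    """Return (H_x, H_y_x): split entropy and conditional label entropy."""
    n_total = float(sum(len(sub) for sub in part_data.itervalues()))
    h_x, h_y_x = 0.0, 0.0
    for sub in part_data.itervalues():
        p_x = len(sub) / n_total
        h_x -= p_x * log(p_x, 2)  # H(X) over the partition sizes
        n_pos = sum(1 for ex in sub if ex[-1])
        for count in (n_pos, len(sub) - n_pos):
            if count:
                p = count / float(len(sub))
                h_y_x -= p_x * p * log(p, 2)  # weighted H(Y|X=x)
    return h_x, h_y_x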
def test_is_partable_data(self):
    train_data, test_data = load_project_data('example')
    examples = [ex for ex in train_data] + [ex for ex in test_data]
    data = ExampleSet(examples)
    n = Node()
    self.assertTrue(n.check_ex_set(examples, [1, 3]))
    index, part_data = n.max_GR(examples, [1, 3])
    self.assertEqual(index, 1)
    test_part_data = [ex for ex in examples if ex[1] == 'red']
    self.assertEqual(set(test_part_data), set(part_data['red']))
    self.assertTrue(n.check_ex_set(part_data['red'], [3])[1])
    index, sub_data = n.max_GR(part_data['red'], [3])
    self.assertEqual(index, 3)
    test_part_data = [ex for ex in examples if ex[1] == 'green']
    self.assertEqual(set(test_part_data), set(part_data['green']))
    self.assertFalse(n.check_ex_set(part_data['green'], [3])[1])
    test_part_data = [ex for ex in examples if ex[1] == 'blue']
    self.assertEqual(set(test_part_data), set(part_data['blue']))
    self.assertTrue(n.check_ex_set(part_data['blue'], [3])[1])
    index, sub_data = n.max_GR(part_data['blue'], [3])
    self.assertEqual(index, 3)
    index, sub_data = n.max_GR(examples, [3, 1])
    self.assertEqual(index, 3)