def gen_tree(attribute, X, y, weights=None):
    """
    Generate trees with height = 1.
          a
         / \
    """
    trees = []
    for l1 in [-1, 1]:
        for l2 in [-1, 1]:
            # create a decision tree
            tree = DecisionTree()
            tree.labels = set(y)
            root = Branch()
            tree.tree = root
            # split attribute 1
            root.split_feature = attribute
            # left branch of root
            left = Branch()
            left.predict = l1
            root.children[0] = left
            # right branch of root
            right = Branch()
            right.predict = l2
            root.children[1] = right
            # append tree to the list
            trees.append(tree)
    return trees
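# A minimal usage sketch for gen_tree above, assuming the DecisionTree and
# Branch classes from that snippet are importable and labels are in {-1, +1}.
# The loop over all attributes is a hypothetical illustration (e.g. as a
# candidate-stump pool for boosting), not part of the original snippet.
import numpy as np

X = np.array([[0.1, 1.2], [0.7, 0.3], [0.5, 0.9]])
y = np.array([-1, 1, 1])

candidate_stumps = []
for attribute in range(X.shape[1]):
    # four stumps per attribute: one per (left label, right label) pair
    candidate_stumps.extend(gen_tree(attribute, X, y))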
def train(self, data, col_y, iteration=50, max_height=2, print_flag=False):
    s = np.zeros((data.shape[0]))  # initialize s vector
    self.max_height = max_height
    X = data.drop(col_y, axis=1)
    y = data[col_y]
    if print_flag:
        print('Start training')
    for i in range(iteration):
        if print_flag:
            if i % 10 == 9:
                print(' ... %i-th iteration' % (i + 1))
        data[col_y] = y - s
        Tree = DecisionTree()
        Tree.construct_tree(data, col_y, max_height=self.max_height)
        self.trees[i] = Tree
        # one-variable linear regression (fit residual)
        g_t = np.array([Tree.predict(x) for x in np.array(X)])  # prediction
        if np.sum(g_t**2) == 0:
            alpha = 0
        else:
            # compute regression coefficient
            alpha = np.sum(g_t * (y - s)) / np.sum(g_t**2)
        self.coeff[i] = alpha
        s += alpha * g_t  # update s
    if print_flag:
        print('----- END -----')
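# A hedged sketch of the predict counterpart to the boosting train loop above:
# the ensemble output is s(x) = sum_i alpha_i * g_i(x). The attribute names
# self.trees and self.coeff come from the snippet and are assumed to be dicts
# keyed by iteration index; everything else here is an assumption.
import numpy as np

def predict(self, X):
    s = np.zeros(len(X))
    for i, tree in self.trees.items():
        g_t = np.array([tree.predict(x) for x in np.array(X)])
        s += self.coeff[i] * g_t
    return s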
def __init__(self, max_depth=2, min_size=2, cost='mse'):
    DecisionTree.__init__(self, max_depth, min_size)
    self.cost_function = None
    if cost == 'mse':
        self.cost_function = cost
    else:
        raise NameError('Not a valid cost function')
def simulation(self, errorstate, stackstate, attributes):
    training_set = []
    print "Simulating"
    for state in stackstate:
        self.rewardmat[state.getLabel()] = -3.5
        print "stack state"
    self.rewardmat[errorstate.getLabel()] = 7.5
    for i in range(0, 10):
        currentstate = r.choice(self.statelist)
        while currentstate != self.statelist[errorstate.getLabel()]:
            # choose state action; must change: the probabilities will be
            # different because the q_values are different
            action_chosen = currentstate.chooseStateAction(self.probmat[currentstate.getLabel()])
            nextstate = self.statelist[action_chosen.getNextStateAddr()]
            # the bellmanFordFunction will be different
            self.bellmanFordFunction(currentstate, action_chosen)
            # updating the probabilities will need to change
            self.updateProbabilityMatrix(self.probmat[currentstate.getLabel()],
                                         self.qmat[currentstate.getLabel()])
            currentstate = nextstate
    # Update the examples using the Bellman-Ford functions.
    # Create the tree here.
    training_set = self.generateTrainingSet()
    ldt = DecisionTree(attributes, training_set)
    rules = ldt.getRules()
    # For every state, reset the visited flag on its actions.
    for state in self.statelist:
        for action in state.getActions():
            action.setVisited(False)
    """END"""
def __init__(self, *args, **kwargs):
    if kwargs and args:
        raise ValueError(
            '''BoostedDecisionTree constructor can only be called with keyword
               arguments for the following keywords: training_datafile,
               entropy_threshold, max_depth_desired, csv_class_column_index,
               symbolic_to_numeric_cardinality_threshold,
               number_of_histogram_bins, csv_columns_for_features,
               how_many_stages, debug1''')
    allowed_keys = ('training_datafile', 'entropy_threshold', 'max_depth_desired',
                    'csv_class_column_index', 'symbolic_to_numeric_cardinality_threshold',
                    'csv_columns_for_features', 'number_of_histogram_bins',
                    'how_many_stages', 'debug1', 'stagedebug')
    keywords_used = kwargs.keys()
    for keyword in keywords_used:
        if keyword not in allowed_keys:
            raise ValueError(keyword + ": Wrong keyword used --- check spelling")
    training_datafile = entropy_threshold = max_depth_desired = csv_class_column_index = number_of_histogram_bins = None
    symbolic_to_numeric_cardinality_threshold = csv_columns_for_features = how_many_stages = stagedebug = None
    if kwargs and not args:
        if 'how_many_stages' in kwargs:
            how_many_stages = kwargs.pop('how_many_stages')
        DecisionTree.__init__(self, **kwargs)
    if how_many_stages is not None:
        self._how_many_stages = how_many_stages
    else:
        self._how_many_stages = 4
    # use self._how_many_stages here: the local how_many_stages may be None
    self._all_trees = {i: DecisionTree(**kwargs) for i in range(self._how_many_stages)}
    self._training_samples = {i: [] for i in range(self._how_many_stages)}
    self._root_nodes = {i: None for i in range(self._how_many_stages)}
    self._sample_selection_probs = {i: {} for i in range(self._how_many_stages)}
    self._trust_factors = {i: None for i in range(self._how_many_stages)}
    self._misclassified_samples = {i: [] for i in range(self._how_many_stages)}
    self._classifications = None
    self._trust_weighted_decision_classes = None
    self._stagedebug = 0
def train(self, records, attributes):
    """
    This function will train the random forest; the basic idea of training a
    Random Forest is as follows:
    1. Draw n bootstrap samples using the bootstrap() function
    2. For each of the bootstrap samples, grow a tree with a subset of the
       original attributes, which is of size m (m << # of total attributes)
    """
    for _ in range(self.tree_num):
        # create a tree
        tree = DecisionTree()
        # randomly select 50% of the attributes for the tree
        tree_attributes = random.sample(attributes, int(len(attributes) * 0.5))
        # select bootstrap samples for the tree by calling the bootstrap method
        bootstrap_samples = self.bootstrap(records)
        # train the tree
        tree.train(bootstrap_samples, tree_attributes)
        # add the tree to the forest list
        self.forest.append(tree)
def train(self, records, attributes):
    """
    This function will train the random forest; the basic idea of training a
    Random Forest is as follows:
    1. Draw n bootstrap samples using the bootstrap() function
    2. For each of the bootstrap samples, grow a tree with a subset of the
       original attributes, which is of size m (m << # of total attributes)
    """
    for count in range(0, int(self.tree_num)):
        # Step 1: find the samples using bootstrap() for every tree
        sample_rec = self.bootstrap(records)
        # Step 2: for every tree, select 50% of the sample attributes at
        # random (without replacement) to be used for tree construction
        sample_attr = []
        while len(sample_attr) < ceil(0.5 * len(attributes)):
            rand = random.choice(attributes)
            if rand not in sample_attr:
                sample_attr.append(rand)
        # Create a new Tree instance, train it on the records and attributes
        # bootstrapped above, and add it to the forest; a majority-vote
        # predict counterpart is sketched below.
        Tree = DecisionTree()
        Tree.train(sample_rec, sample_attr)
        self.forest.append(Tree)
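# A hedged sketch of the matching predict step for the forest trained above:
# each tree votes and the majority label wins. The snippet does not show its
# DecisionTree.predict signature, so the single-record call is an assumption.
from collections import Counter

def predict(self, record):
    votes = [tree.predict(record) for tree in self.forest]
    return Counter(votes).most_common(1)[0][0]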
def spam():
    # load all the spam data
    spam_data = scipy.io.loadmat('spam-dataset/spam_data.mat')
    test_data = spam_data['test_data']
    training_labels = spam_data['training_labels']
    training_data = spam_data['training_data']
    print(training_data.shape[1], 'how many features used')
    training_data, training_labels = sklearn.utils.shuffle(
        training_data, training_labels)

    # split training data
    # learn_set, learn_labels = training_data[:4000], training_labels[:4000]
    learn_set, learn_labels = training_data, training_labels
    valid_set, valid_labels = training_data[4000:], training_labels[4000:]

    # train and predict on a single tree
    # spamTree = DecisionTree(learn_set, learn_labels)
    # spamTree.train(learn_set, learn_labels, spamTree.root)
    # pred_labels = spamTree.predict(test_data)
    # print(benchmark(pred_labels, valid_labels)[0])

    # make random forest
    NUM_TREES = 100
    forest = []
    # pred_labels = np.zeros((valid_set.shape[0], 1))
    # sumOfPred = np.zeros((valid_set.shape[0], 1))
    pred_labels = np.zeros((test_data.shape[0], 1))
    sumOfPred = np.zeros((test_data.shape[0], 1))
    for i in range(0, NUM_TREES):
        print('Now at tree #', i)
        nPrime = np.random.choice(learn_set.shape[0], learn_set.shape[0], True)
        x = learn_set[nPrime]
        y = learn_labels[nPrime]
        tree = DecisionTree(x, y)
        tree.train(x, y, tree.root, True)
        forest.append(tree)
    for tree in forest:
        # sumOfPred += tree.predict(valid_set)
        sumOfPred += tree.predict(test_data)
    for i in range(0, test_data.shape[0]):
        if sumOfPred[i] / NUM_TREES > .5:
            pred_labels[i] = 1
        elif sumOfPred[i] / NUM_TREES < .5:
            pred_labels[i] = 0
        else:
            pred_labels[i] = random.randint(0, 1)
    # print(benchmark(pred_labels, valid_labels)[0])

    # make csv
    csvList = [['Id', 'Category']]  # two columns, not one comma-joined cell
    for i in range(1, 5858):
        csvList.append([i, int(pred_labels[i - 1][0])])
    with open('spamForest.csv', 'w', newline='') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerows(csvList)
    return 0
def train(data, labels):
    # data is an array of attribute vectors,
    # e.g. [[0, 7, 5, 2, 3, 4, 0, 18], [1, 3, 0, 4, 2, 0, 1, 0], ...]
    # labels is an array of class labels (as integers),
    # e.g. [0, 1, ...]
    model = DecisionTree()
    model.train(data, labels)
    return model
def testSmoke(self):
    from DecisionTree import DecisionTree
    dt = DecisionTree()
    feature = np.array([[0, 1], [1, 0], [1, 2], [0, 0], [1, 1]])
    label = np.array([0, 1, 0, 0, 1])
    dt.fit(feature, label)
    y_pred = dt.predict(feature)
    assert (y_pred == label).all()
def test_decision_tree(self):
    tree = DecisionTree()
    X = np.asarray([[1, 1], [0, 2], [3, 2]])
    y = np.asarray([0, 1, 1])
    tree.fit(X_=X, y_=y)
    self.assertTrue(tree.predict(np.asarray([[1, 1]]))[0] == 0)
def test_DT(self):
    records, attributes = load_data("data/mushrooms_train.data")
    test_records = load_data("data/mushrooms_train.data")[0]
    # print(records, attributes)
    dt = DecisionTree()
    best_index, best_index_dict = dt.find_best_split(records, attributes,
                                                     class_index=0)
    dt.shuffle_dataset(best_index_dict)
def fit(self, X, y):
    self.trees = []
    for _ in range(self.estimators):
        tree = DecisionTree(min_samples_split=self.min_samples_split,
                            max_depth=self.max_depth,
                            n_features=self.n_features)
        X_sample, y_sample = bootstrap_sample(X, y)
        tree.fit(X_sample, y_sample)
        self.trees.append(tree)
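# bootstrap_sample is called but not defined in the snippet above; a minimal
# sketch under the usual definition (draw n rows with replacement). The name
# and signature match the call site; the body is an assumption.
import numpy as np

def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]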
def train_and_plot(X, Y, X_test, Y_test):
    decision_tree = DecisionTree().fit(X, Y)
    # decision_tree.print_tree()
    plot_decision_boundary(decision_tree, X, Y)
    Y_predict = decision_tree.predict(X_test.to_numpy())
    test_error = 1 - metrics.accuracy_score(Y_test, Y_predict)
    num_nodes = decision_tree.count_nodes()
    print('test error : ', test_error)
    return (num_nodes, test_error)
def fit(self, X, Y):
    self.trees = []
    for _ in range(self.n_trees):
        tree = DecisionTree(min_samples_split=self.min_samples_split,
                            max_depth=self.max_depth,
                            n_feats=self.n_feats)
        X_sample, Y_sample = BootstrapSample(X, Y)
        tree.fit(X_sample, Y_sample)
        self.trees.append(tree)
def fit(self, X, y):
    self._initial_approximation(X, y)
    for i in range(self._n_estimators):
        anti_grad = self.calculate_antigradient(X, y)
        estimator = DecisionTree(max_depth=self._max_depth,
                                 is_classification=False,
                                 impurity=self._impurity,
                                 min_samples_leaf=self._min_samples_leaf,
                                 max_features=self._max_features,
                                 min_features=self._min_features,
                                 max_steps=self._max_steps,
                                 rsm=self._rsm)
        estimator.fit(X, anti_grad)
        self._estimators.append(estimator)
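# The usual predict counterpart for the gradient-boosting fit above sums the
# initial approximation and each estimator's correction. A hedged sketch: the
# names self._init_value and self._learning_rate do not appear in the snippet
# and are hypothetical; only self._estimators is taken from it.
import numpy as np

def predict(self, X):
    y_pred = np.full(X.shape[0], self._init_value)  # hypothetical attribute
    for estimator in self._estimators:
        y_pred += self._learning_rate * estimator.predict(X)  # hypothetical attribute
    return y_pred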
class RandomForest:
    def __init__(self, n_subsets=1, n_replacement=5):
        self.innermodel = DecisionTree()
        self.n_subsets = n_subsets
        self.n_replacement = n_replacement

    def generate_subset(self, group_number, data):
        subset = []
        i = 0
        while i < len(data):
            if i % self.n_subsets == group_number:
                subset.append(data[i])
            i += 1
        return subset

    def fit(self, data):
        # save the original data
        self.dataset = data
        forest = []  # used to save the tree of each subset
        i = 0
        while i < self.n_replacement:
            random.shuffle(data)
            # split data into subsets and save them to a list
            subsets = []
            j = 0
            while j < self.n_subsets:
                subset = self.generate_subset(j, data)
                subsets.append(subset)
                j += 1
            for subset in subsets:
                tree = self.innermodel.fit(subset)
                forest.append(tree)
            i += 1
        return forest

    def classify(self, row, forest):
        results = []
        confidence = 0
        for node in forest:
            result = self.innermodel.classify(row, node)
            for key in result:
                if len(result) == 1:
                    results.append(key)
                else:
                    results.append(max(result, key=result.get))
                break
        classification = max(set(results), key=results.count)
        confidence = round(
            results.count(max(set(results), key=results.count)) / len(results) * 100, 2)
        return [classification, confidence]
def nbcTest(trainSet, trainLab, testSet, testLab):
    nbc = DecisionTree()
    nbc.fit(trainSet, trainLab)
    predLab = nbc.predict(testSet)
    print("nbc errors:")
    print(sum(testLab != predLab))
    print(1 - sum(testLab != predLab) / len(testLab))
    print("nbc confusion matrix:")
    print(confusion_matrix(testLab, predLab))
    print()
def train_decision_tree(self, train):
    sample_indices = [
        random.choice(range(len(train.data)))
        for _ in range(int(self.bagging_data_fraction * len(train.data)))
    ]
    sample = Dataset([train.data[i] for i in sample_indices],
                     [train.labels[i] for i in sample_indices])
    tree = DecisionTree(self.max_depth, self.num_features)
    tree.train(sample)
    self.trees.append(tree)
def train(self, dataset: pd.DataFrame, targetClass: str, n: int, m: int,
          verbose: bool):
    self.dataset = dataset
    self.targetClass = targetClass
    self.trees = []
    for i in range(n):
        treeData, _ = bootstrap(self.dataset, self.dataset.shape[0])
        tree = DecisionTree()
        tree.train(treeData, targetClass, m, verbose)
        self.trees.append(tree)
def build_tree(self, X, y):
    tree = DecisionTree(
        self.n_features, self.max_depth, self.min_samples_leaf
    )
    # print(1)
    tree.fit(X, y)
    # print(tree)
    self.trees.append(tree)
def fit(self, X, y):
    self.trees = []
    n_samples, n_features = X.shape[0], X.shape[1]
    for _ in range(self.n_trees):
        dt = DecisionTree(min_samples_split=self.min_samples_split,
                          max_depth=self.max_depth,
                          n_features=self.n_features)
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        dt.fit(X[idxs], y[idxs])
        self.trees.append(dt)
def decision_tree_classification(X, y, test_dat):
    classifier = DecisionTree(45)
    classifier.train(X, y)
    y_hat = classifier.predict(test_dat)
    f = open("census_predictions_decision_tree.csv", 'w')
    f.write("Id,Category\n")
    for i in range(np.size(test_dat, 0)):
        f.write(str(i + 1) + "," + str(int(y_hat[i, 0])) + "\n")
    f.close()
    print("DONE")
def generate_tree(tup):
    '''@parameters:
           tup: (trainX, trainY, testX)
       @return:
           (DecisionTree, array, array, array): tree, prediction, Ein,
           out_prediction'''
    trainX, trainY = tup[0], tup[1]
    bagX, bagY = bagging(trainX, trainY, 0.8)
    testX = tup[2]
    tree = DecisionTree().fit(bagX, bagY)
    prediction = tree.predict(trainX)
    return tree, prediction, np.mean(prediction != trainY), tree.predict(testX)
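# The snippet above relies on a bagging(trainX, trainY, ratio) helper that is
# not shown; a plausible sketch, assuming ratio is the fraction of rows drawn
# (with replacement) for each bag and the inputs are numpy arrays.
import numpy as np

def bagging(trainX, trainY, ratio):
    n = int(len(trainX) * ratio)
    idxs = np.random.choice(len(trainX), n, replace=True)
    return trainX[idxs], trainY[idxs]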
def fit(self, X, y):
    m, n = np.shape(X)
    self._tree_list = []
    for i in range(self._max_depth):
        tree_tmp = DecisionTree(type=self._type,
                                criterion=self._criterion,
                                splitter=self._splitter,
                                min_impurity_decrease=self._min_impurity_decrease,
                                min_impurity_split=self._min_impurity_split,
                                min_samples_split=self._min_samples_split,
                                max_depth=self._max_depth)
        X_train, y_train, row, column = self.random_sample(
            X, y, self._bagging_fraction * m, self._feature_fraction * n)
        tree_tmp.fit(X_train, y_train)
        self._tree_list.append([column, tree_tmp])
def plotDecisionTreewithPieceNum(dataset):
    dt = DecisionTree(dataset)
    size = len(dataset)
    x = [i for i in range(2, size // 2 + 1) if size % i == 0]
    y = []
    for t in x:
        y.append(dt.crossValidation(t))
    plt.scatter(x, y, edgecolors="blue")
    plt.xlabel("Cross validation piece")
    plt.ylabel("Error rate")
    plt.title("Error rate vs piece number")
    plt.show()
def PrintTable(train, test, rangeEnd, ChangeUnknown=False):
    print('\t\t\tEntropy\t\tME\t\tGini')
    for maxDepth in range(1, rangeEnd + 1):
        EntropyTree = DecisionTree(train, maxDepth, 0, ChangeUnknown)
        METree = DecisionTree(train, maxDepth, 1, ChangeUnknown)
        GiniTree = DecisionTree(train, maxDepth, 2, ChangeUnknown)
        print("%2d & %5.4f & %5.4f & %5.4f \\\\ \\hline" %
              (maxDepth,
               EntropyTree.GetAccuracyLevel(test),
               METree.GetAccuracyLevel(test),
               GiniTree.GetAccuracyLevel(test)))
def create_classifier(self, inputs, outputs, weights):
    new_tree = DecisionTree(3, use_weights=True)
    new_tree.fit(inputs, outputs, weights)
    terror = np.empty(len(outputs), dtype=float)
    for indx, (data, truth) in enumerate(zip(inputs, outputs)):
        predict = new_tree.predict(data)
        terror[indx] = 1.0 - float(predict == truth)
    self.classifiers.append(new_tree)
    error = np.sum(weights * terror) / np.sum(weights)
    stage = np.log((1.0 - error) / error)
    self.weights.append(stage)
    weights = weights * np.exp(stage * terror)
    return weights
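# A hedged decision-function sketch to pair with create_classifier above:
# each stored tree votes with its stage weight log((1-err)/err), and the sign
# of the weighted sum decides the class. Assumes labels in {-1, +1}; the
# snippet itself does not fix a label convention, so this is an assumption.
import numpy as np

def classify(self, sample):
    score = sum(w * clf.predict(sample)
                for clf, w in zip(self.classifiers, self.weights))
    return np.sign(score)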
def train(self, dataset, m, n):
    # m = number of attributes
    # n = number of trees
    self.trees = []
    for _ in range(n):
        bootstrap = dataset.bootstrap()
        t = DecisionTree()
        t.n_attr = m
        t.train(bootstrap)
        self.trees.append(t)
    self.trained = True
def best_params():
    acc_max = 0
    depth_max = 0
    depth_list = [i * 10 for i in range(1, 21)]
    for depth in depth_list:
        clf = DecisionTree(max_depth=depth)
        clf.fit(X_train, Y_train)
        predictions = clf.predict(X_test)
        acc = accuracy(Y_test, predictions)
        if acc > acc_max:
            acc_max = acc
            depth_max = depth
    return (depth_max, acc_max)
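# best_params above calls an accuracy() helper that the excerpt does not
# define; the conventional definition, offered as an assumption:
import numpy as np

def accuracy(y_true, y_pred):
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))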
def main():
    attributes_train, data_train = read_from_file("train.txt")
    # DTL
    dtl = DecisionTree()
    tree = dtl.build(data_train, attributes_train)
    with open("output_tree.txt", "w") as file:
        tree_string = dtl.write_tree_to_file(tree, attributes_train, 0)
        file.write(tree_string[:len(tree_string) - 1])
    # KNN
    knn = KNearestNeighbors(attributes_train, data_train)
    # NAIVE BAYES
    naive_bayes = NaiveBayes(attributes_train, data_train)
    attribute_text, data_test = read_from_file("test.txt")
    knn_result = []
    naive_bayes_result = []
    dtl_result = []
    real_classify = []
    for line in data_test:
        real_classify.append(line[-1])
        entry = line[:-1]
        knn_result.append(knn.predict(entry, 5))
        naive_bayes_result.append(naive_bayes.predict(entry))
        dtl_result.append(dtl.predict(tree, entry, attribute_text))
    acc_knn = 0
    acc_nb = 0
    acc_dtl = 0
    # get accuracy (renamed loop variables so they don't shadow the
    # dtl/knn classifier objects above)
    for (dtl_res, knn_res, nb_res, real) in zip(dtl_result, knn_result,
                                                naive_bayes_result,
                                                real_classify):
        if dtl_res == real:
            acc_dtl += 1
        if knn_res == real:
            acc_knn += 1
        if nb_res == real:
            acc_nb += 1
    acc_knn /= len(real_classify)
    acc_nb /= len(real_classify)
    acc_dtl /= len(real_classify)
    acc_knn = float(math.ceil(acc_knn * 100)) / float(100)
    acc_nb = float(math.ceil(acc_nb * 100)) / float(100)
    acc_dtl = float(math.ceil(acc_dtl * 100)) / float(100)
    with open('output.txt', 'w') as output:
        output.write("Num\tDT\tKNN\tnaiveBase\n")
        for i, (a, b, c) in enumerate(zip(dtl_result, knn_result,
                                          naive_bayes_result)):
            output.write(str(i + 1) + "\t" + a + "\t" + b + "\t" + c + "\n")
        output.write("\t" + str(acc_dtl) + "\t" + str(acc_knn) + "\t" +
                     str(acc_nb) + "\n")
class Decider(object):
    def __init__(self, trainingData='data/training.dat'):
        self.dt = DecisionTree(training_datafile=trainingData,
                               debug1=0,
                               debug2=0)
        self.dt.get_training_data()
        self.rootNode = self.dt.construct_decision_tree_classifier()

    def play(self, agentList, agent, map):
        from random import random
        troops = agent.aliveList()
        if random() > 0.5:
            troops.reverse()
        for troop in troops:
            bestValue = float('-inf')
            actions, teammateList, enemyList = map.legalActions(troop)
            if random() > 0.5:
                actions.reverse()
            for action in actions:
                self.makeDecision(agentList, agent, troop, action, map)

    def makeDecision(self, agentList, agent, troop, action, map):
        s1 = 'general=>' + generalSituation(agent, troop, action['target'])  # situation 1
        s2 = 'situation=>' + situation(troop, action['target'])
        s3 = 'injury=>' + maxInjury(troop, action['target'])
        s4 = 'attackGeneral=>' + str(maxAttackOnGeneral(troop, action['target']))
        testSample = [s1, s2, s3, s4]
        try:
            classification = self.dt.classify(self.rootNode, testSample)
        except:
            print 'something wrong with dt!', testSample
            classification = {'positive': 0, 'negative': 1}
        print classification
        if classification['positive'] > classification['negative']:
            troop.move(action['target'])
            attackList = map.legalAttacks(troop, troop.posX, troop.posY)
            bestValue = float('inf')
            for enemy in attackList:
                if enemy['targetTroopId'] == 1:
                    print 'enemy general!!!!'
                    target = enemy
                    break
                elif enemy['targetLife'] < bestValue:
                    bestValue = enemy['targetLife']
                    target = enemy
            try:
                troop.doAttack(agentList, target['targetTroopId'])
            except UnboundLocalError:
                pass
def test_is_categorical(self):
    tree = DecisionTree()
    y = np.asarray([1, 1, 1, 1, 0, 0, 0])
    self.assertTrue(tree._is_categorical(y))
    y = np.asarray([1, 1, 2, 4, 1, 2, 4, 4, 4, 4, 4])
    self.assertTrue(tree._is_categorical(y))
    y = np.asarray([1.1, 0.8, 2.1, 4, 1, 2.5, 4, 4, 4, 4.8, 4])
    self.assertFalse(tree._is_categorical(y))
    y = np.asarray([100000002131, 12, 12])
    self.assertTrue(tree._is_categorical(y))
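# The four assertions above imply _is_categorical returns True exactly when
# every label is integral (magnitude does not matter). A sketch consistent
# with those cases; the real implementation is not shown in the excerpt.
import numpy as np

def _is_categorical(self, y):
    y = np.asarray(y)
    return bool(np.all(np.mod(y, 1) == 0))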
def build_forest(self, X, y, num_trees, num_samples, num_features):
    '''
    Return a list of num_trees DecisionTrees.
    '''
    forest = []
    for i in xrange(num_trees):
        sample_indices = np.random.choice(X.shape[0], num_samples,
                                          replace=True)
        sample_X = np.array(X[sample_indices])
        sample_y = np.array(y[sample_indices])
        dt = DecisionTree(self.impurity_criterion)
        dt.fit(sample_X, sample_y)
        forest.append(dt)
    return forest
def build_forest(self, X, y, num_trees, num_samples, num_features):
    '''
    Return a list of num_trees DecisionTrees.
    '''
    size = len(y)
    index = range(size)
    trees = []
    for tree in range(num_trees):
        random_sample_index = np.random.choice(index, size, replace=True)
        X_random = X[random_sample_index]
        y_random = y[random_sample_index]
        dt = DecisionTree(num_features)
        dt.fit(X_random, y_random)
        trees.append(dt)
    return trees
def test_make_split():
    X, y, X1, y1, X2, y2 = fake_data()
    split_index, split_value = 1, 'bat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._make_split(X, y, split_index, split_value)
    try:
        X1_result, y1_result, X2_result, y2_result = result
    except ValueError:
        n.assert_true(False, 'result not in correct form: (X1, y1, X2, y2)')
    actual = (X1, y1, X2, y2)
    message = '_make_split got results\n%r\nShould be\n%r' % (result, actual)
    n.ok_(np.array_equal(X1, X1_result), message)
    n.ok_(np.array_equal(y1, y1_result), message)
    n.ok_(np.array_equal(X2, X2_result), message)
    n.ok_(np.array_equal(y2, y2_result), message)
def test_tree(filename):
    df = pd.read_csv(filename)
    y = df.pop("Result").values
    X = df.values
    print X
    tree = DecisionTree()
    tree.fit(X, y, df.columns)
    print tree
    print
    y_predict = tree.predict(X)
    print "%26s %10s %10s" % ("FEATURES", "ACTUAL", "PREDICTED")
    print "%26s %10s %10s" % ("----------", "----------", "----------")
    for features, true, predicted in izip(X, y, y_predict):
        print "%26s %10s %10s" % (str(features), str(true), str(predicted))
def test_choose_split_index():
    X, y, X1, y1, X2, y2 = fake_data()
    index, value = 1, 'cat'
    dt = DT()
    dt.categorical = np.array([False, True])
    result = dt._choose_split_index(X, y)
    try:
        split_index, split_value, splits = result
    except ValueError:
        message = 'result not in correct form. Should be:\n' \
                  '    split_index, split_value, splits'
        n.assert_true(False, message)
    message = 'choose split for data:\n%r\n%r\n' \
              'split index, split value should be: %r, %r\n' \
              'not: %r, %r' \
              % (X, y, index, value, split_index, split_value)
    n.eq_(split_index, index, message)
    n.eq_(split_value, value, message)
def train(self, data, labels):
    """Trains the random forest using a bunch of decision trees.

    * training_data: n x d numpy matrix of data, where row = sample point,
      column = feature
    * training_labels: flat nparray of labels, where item i is the label
      for point i
    """
    num_points = data.shape[0]
    for i in xrange(self.num_trees):
        sample_indices = np.random.choice(num_points,
                                          size=self.data_bagging_size,
                                          replace=True)
        sample_data = data[sample_indices]
        sample_labels = labels[sample_indices]
        tree = DecisionTree(feature_bagging_criteria=self.feature_bagging_criteria,
                            impurity_measure=self.impurity_measure,
                            min_impurity_decrease=self.min_impurity_decrease,
                            min_impurity=self.min_impurity,
                            max_percentage_in_class=self.max_percentage_in_class,
                            max_height=self.max_height,
                            min_points_per_node=self.min_points_per_node,
                            feature_name_map=self.feature_name_map)
        tree.train(sample_data, sample_labels)
        self.trees.append(tree)
def build_forest(self, X, y, num_trees, num_samples, num_features):
    # Return a list of num_trees DecisionTrees.
    forest = []
    # for each of the num_trees trees
    for i in xrange(num_trees):
        # draw a random subset of one third of the indices; note that
        # r.sample draws WITHOUT replacement (plain range() avoids the
        # Python 2 list-comprehension variable clobbering the loop index)
        indices = range(num_samples)
        indices_sample = r.sample(indices, len(indices) / 3)
        # use these sample indices to select a subset of X and y; with the
        # new X_sample and y_sample, build a new tree as a member of the
        # forest and add it to the list
        X_sample = X[indices_sample]
        y_sample = y[indices_sample]
        tree = DecisionTree()
        tree.fit(X_sample, y_sample, self.features)
        forest.append(tree)
    # return the list of num_trees DecisionTrees
    return forest
def test_predict(self):
    data, label = creatDataLabel()
    tree = DecisionTree(maxDeep=5)
    tree.buildTree(data, label)
    X = numpy.array([1, 1, 1, 1])
    self.assertTrue(tree.predict(X) == 1)
    X = numpy.array([1, 0, 1, 2])
    self.assertTrue(tree.predict(X) == 1)
    X = numpy.array([2, 0, 1, 1])
    self.assertTrue(tree.predict(X) == 1)
    X = numpy.array([2, 1, 0, 1])
    self.assertTrue(tree.predict(X) == 1)
def main():
    print '#############################################'
    print '##       C4.5 based Decision Tree          ##'
    print '##    To see usage, run main.py --help     ##'
    print '##           Author: mumuhr                ##'
    print '##            06.01.2015                   ##'
    print '#############################################'
    parser = OptionParser()
    parser.add_option("-i", dest="inc", default='dataset',
                      help="dataset directory")
    parser.add_option("-f", "--file", dest="fileName", default='adult2',
                      help="file name of training dataset")
    parser.add_option("-c", "--class-attr", dest="classAttr", default='class',
                      help="classification attribute")
    parser.add_option("-d", "--depth", dest="depth", default=6,
                      help="max recursion depth of decision trees")
    parser.add_option("-t", "--tree-num", dest="treeNum", default=5,
                      help="num of decision trees")
    parser.add_option("-e", "--epsilon", dest="epsilon", default=1,
                      help="total privacy budget")
    parser.add_option("-v", "--verbose", dest="verbose", default=True,
                      help="open verbose mode")
    parser.add_option("--mode", dest='mode', default='All',
                      help='choose build mode: DecisionTree/RandomForest/All')
    (options, args) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    else:
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    logging.info('init Decision Tree building data')
    trainFileName = options.inc + '/' + options.fileName + 'Training.csv'
    testFileName = options.inc + '/' + options.fileName + '.csv'
    dataRaw = DataLoader.getData(trainFileName)
    attributes = dataRaw[0]
    attributesType = dataRaw[1]
    # guard on the verbose flag; the bare constant logging.DEBUG is always truthy
    if options.verbose:
        for index in range(0, len(attributesType)):
            logging.debug('attr: %s; attrType: %s', attributes[index],
                          attributesType[index])
        logging.debug('class attr: %s', attributes[-1])
    dataRaw.remove(attributes)
    dataRaw.remove(attributesType)
    dataTrain = DataLoader.toFloat(dataRaw, attributesType)
    dataTestRaw = DataLoader.getData(testFileName)
    dataTest = DataLoader.toFloat(dataTestRaw, attributesType)
    classStd = []
    for row in dataTest:
        classStd.append(row[-1])

    target = options.classAttr
    depth = options.depth
    treeNum = options.treeNum
    epsilon = options.epsilon
    logging.debug('target: %s', target)
    logging.debug('depth: %s', str(depth))

    if config.config.MakeTree == 'DecisionTree' or config.config.MakeTree == 'All':
        # Run C4.5
        logging.info('Run C4.5 to generate Decision Tree')
        tree = DecisionTree.makeTree(dataTrain, attributes, attributesType,
                                     target, depth, depth, epsilon)
        # Classify testing data
        logging.debug('Classify testing data by generated Decision Tree')
        classResult = DecisionTree.classify(tree, attributes, attributesType,
                                            dataTest)
        # Output classification accuracy
        acc = ResultParser.classAccDecisionTree(classStd, classResult)
        logging.info('Classification Accuracy: ' + str(acc))

    if config.config.MakeTree == 'RandomForest' or config.config.MakeTree == 'All':
        # Run Random Forest
        logging.info('Run RandomForest to generate Decision Tree')
        trees = RandomForest.randomForest(dataTrain, attributes, attributesType,
                                          target, depth, depth, treeNum, epsilon)
        classResults = []
        for tree in trees:
            classResult = DecisionTree.classify(tree, attributes,
                                                attributesType, dataTest)
            classResults.append(classResult)
        acc = ResultParser.classAccRandomForest(classStd, classResults)
        logging.info('Classification Accuracy: ' + str(acc))
from DecisionTree import DecisionTree

dt = DecisionTree(training_datafile="data/dataset.csv",
                  csv_class_column_index=2,
                  csv_columns_for_features=[1, 2, 3, 4, 5],
                  entropy_threshold=0.01,
                  max_depth_desired=8,
                  symbolic_to_numeric_cardinality_threshold=10,
                  )
dt.get_training_data()
dt.calculate_first_order_probabilities()
dt.calculate_class_priors()
dt.show_training_data()
root_node = dt.construct_decision_tree_classifier()
root_node.display_decision_tree("   ")
test_sample = ['ColorDeCabello = Negro',
               'Altura = Alto',
               'Peso = Alto',
               'Proteccion = No',
               'Quemadura = Si']
classification = dt.classify(root_node, test_sample)
print "Classification: ", classification
["s", "China", "no", 18, "Premium"], ["t", "China", "no", 17, "None"], ] my_data2 = [ ["a", "USA", "yes", "18", "None"], ["b", "France", "yes", "23", "Premium"], ["c", "USA", "yes", "24", "Basic"], ["d", "France", "yes", "23", "Basic"], ] train_flowers = data.read_filedata("..//data//train_data.txt", "ALL", ",", [0, 1, 2, 3]) test_flowers = data.read_filedata("..//data//test_data.txt", "ALL", ",", [0, 1, 2, 3]) tree = DecisionTree(train_flowers) treepredict.buildtree(tree) tree.printTree() right = 0 wrong = 0 for flower in test_flowers: result = treepredict.predic(tree, flower) if flower[-1] in result: if right == 49: pass right += 1 else: wrong += 1 print "正确预测:" + str(right) + "个"
def test_is_stop_criterion(self):
    tree = DecisionTree()
    self.assertTrue(tree._is_stop_criterion(np.asarray([1])))
    self.assertTrue(tree._is_stop_criterion(np.asarray([1, 1, 1, 1, 1])))
    self.assertFalse(tree._is_stop_criterion(np.asarray([1, 1, 0, 0, 1])))
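# The three assertions above pin down one behavior: stop when the node is
# pure. A sketch consistent with them; any additional min-size rules the real
# method may apply are not visible in the excerpt and are left out.
import numpy as np

def _is_stop_criterion(self, y):
    return len(np.unique(y)) <= 1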
def construct_cascade_of_trees(self):
    self._training_samples[0] = self._all_sample_names
    self._misclassified_samples[0] = self.evaluate_one_stage_of_cascade(
        self._all_trees[0], self._root_nodes[0])
    if self._stagedebug:
        self.show_class_labels_for_misclassified_samples_in_stage(0)
        print("\nSamples misclassified by base classifier: %s" %
              str(self._misclassified_samples[0]))
        print("\nNumber of misclassified samples: %d" %
              len(self._misclassified_samples[0]))
    misclassification_error_rate = sum(
        [self._sample_selection_probs[0][x]
         for x in self._misclassified_samples[0]])
    if self._stagedebug:
        print("\nMisclassification_error_rate for base classifier: %g" %
              misclassification_error_rate)
    self._trust_factors[0] = 0.5 * math.log(
        (1 - misclassification_error_rate) / misclassification_error_rate)
    if self._stagedebug:
        print("\nBase class trust factor: %s" % str(self._trust_factors[0]))
    for stage_index in range(1, self._how_many_stages):
        if self._stagedebug:
            print("\n\n==========================Constructing stage indexed "
                  "%d=========================\n" % stage_index)
        self._sample_selection_probs[stage_index] = \
            {sample: self._sample_selection_probs[stage_index - 1][sample] *
                     math.exp(-1.0 * self._trust_factors[stage_index - 1] *
                              (-1.0 if sample in
                               self._misclassified_samples[stage_index - 1]
                               else 1.0))
             for sample in self._all_sample_names}
        normalizer = sum(self._sample_selection_probs[stage_index].values())
        if self._stagedebug:
            print("\nThe normalizer is: ", normalizer)
        self._sample_selection_probs[stage_index].update(
            (sample, prob / normalizer) for sample, prob in
            self._sample_selection_probs[stage_index].items())
        prob_distribution = sorted(
            self._sample_selection_probs[stage_index].items(),
            key=lambda x: x[1], reverse=True)
        if self._stagedebug:
            print("\nProbability distribution: %s" %
                  str([(sample_index(x), "%.3f" % y)
                       for x, y in prob_distribution]))
        training_samples_this_stage = []
        sum_of_probs = 0.0
        for sample in [x[0] for x in prob_distribution]:
            sum_of_probs += self._sample_selection_probs[stage_index][sample]
            if sum_of_probs > 0.5:
                break
            else:
                training_samples_this_stage.append(sample)
        self._training_samples[stage_index] = sorted(
            training_samples_this_stage, key=lambda x: sample_index(x))
        if self._stagedebug:
            print("\nTraining samples this stage: %s" %
                  str(self._training_samples[stage_index]))
            print("\nNumber of training samples this stage %d" %
                  len(self._training_samples[stage_index]))
        training_samples_selection_check = set(
            self._misclassified_samples[stage_index - 1]).intersection(
            set(self._training_samples[stage_index]))
        if self._stagedebug:
            print("\nTraining samples in the misclassified set: %s" %
                  str(sorted(training_samples_selection_check,
                             key=lambda x: sample_index(x))))
            print("\nNumber_of_misclassified_samples_in_training_set: %d" %
                  len(training_samples_selection_check))
        dt_this_stage = DecisionTree('boostingmode')
        training_data_this_stage = {x: self._all_training_data[x]
                                    for x in self._training_samples[stage_index]}
        dt_this_stage._training_data_dict = training_data_this_stage
        dt_this_stage._class_names = self._all_trees[0]._class_names
        dt_this_stage._feature_names = self._all_trees[0]._feature_names
        dt_this_stage._entropy_threshold = self._all_trees[0]._entropy_threshold
        dt_this_stage._max_depth_desired = self._all_trees[0]._max_depth_desired
        dt_this_stage._symbolic_to_numeric_cardinality_threshold = \
            self._all_trees[0]._symbolic_to_numeric_cardinality_threshold
        dt_this_stage._samples_class_label_dict = \
            {sample_name: self._all_trees[0]._samples_class_label_dict[sample_name]
             for sample_name in dt_this_stage._training_data_dict.keys()}
        dt_this_stage._features_and_values_dict = \
            {feature: [] for feature in self._all_trees[0]._features_and_values_dict}
        pattern = r'(\S+)\s*=\s*(\S+)'
        for item in sorted(dt_this_stage._training_data_dict.items(),
                           key=lambda x: sample_index(x[0])):
            for feature_and_value in item[1]:
                m = re.search(pattern, feature_and_value)
                feature, value = m.group(1), m.group(2)
                if value != 'NA':
                    dt_this_stage._features_and_values_dict[feature].append(convert(value))
        dt_this_stage._features_and_unique_values_dict = \
            {feature: sorted(list(set(dt_this_stage._features_and_values_dict[feature])))
             for feature in dt_this_stage._features_and_values_dict}
        dt_this_stage._numeric_features_valuerange_dict = \
            {feature: [] for feature in
             self._all_trees[0]._numeric_features_valuerange_dict}
        dt_this_stage._numeric_features_valuerange_dict = \
            {feature: [min(dt_this_stage._features_and_unique_values_dict[feature]),
                       max(dt_this_stage._features_and_unique_values_dict[feature])]
             for feature in self._all_trees[0]._numeric_features_valuerange_dict}
        if self._stagedebug:
            print("\n\nPrinting features and their values in the training set:\n")
            for item in sorted(dt_this_stage._features_and_values_dict.items()):
                print(item[0] + " => " + str(item[1]))
            print("\n\nPrinting unique values for features:\n")
            for item in sorted(dt_this_stage._features_and_unique_values_dict.items()):
                print(item[0] + " => " + str(item[1]))
            print("\n\nPrinting unique value ranges for features:\n")
            for item in sorted(dt_this_stage._numeric_features_valuerange_dict.items()):
                print(item[0] + " => " + str(item[1]))
        dt_this_stage._feature_values_how_many_uniques_dict = \
            {feature: [] for feature in
             self._all_trees[0]._features_and_unique_values_dict}
        dt_this_stage._feature_values_how_many_uniques_dict = \
            {feature: len(dt_this_stage._features_and_unique_values_dict[feature])
             for feature in self._all_trees[0]._features_and_unique_values_dict}
        # if stagedebug: dt_this_stage._debug2 = 1
        dt_this_stage.calculate_first_order_probabilities()
        dt_this_stage.calculate_class_priors()
        if self._stagedebug:
            print("\n\n>>>>>>>Done with the initialization of the tree "
                  "for this stage<<<<<<<<<<\n")
        root_node_this_stage = dt_this_stage.construct_decision_tree_classifier()
        if self._stagedebug:
            root_node_this_stage.display_decision_tree("     ")
        self._all_trees[stage_index] = dt_this_stage
        self._root_nodes[stage_index] = root_node_this_stage
        self._misclassified_samples[stage_index] = \
            self.evaluate_one_stage_of_cascade(self._all_trees[stage_index],
                                               self._root_nodes[stage_index])
        if self._stagedebug:
            print("\nSamples misclassified by this stage classifier: %s" %
                  str(self._misclassified_samples[stage_index]))
            print("\nNumber of misclassified samples: %d" %
                  len(self._misclassified_samples[stage_index]))
            self.show_class_labels_for_misclassified_samples_in_stage(stage_index)
        misclassification_error_rate = sum(
            [self._sample_selection_probs[stage_index][x]
             for x in self._misclassified_samples[stage_index]])
        if self._stagedebug:
            print("\nMisclassification_error_rate: %g" %
                  misclassification_error_rate)
        self._trust_factors[stage_index] = \
            0.5 * math.log((1 - misclassification_error_rate) /
                           misclassification_error_rate)
        if self._stagedebug:
            print("\nThis stage trust factor: %g" %
                  self._trust_factors[stage_index])
import pdb
# pdb.set_trace()
import matplotlib.pyplot as plt
import scipy.io
import csv
import sklearn
import sklearn.utils
from DecisionTree import DecisionTree

# keys: ['Xvalidate', '__globals__', '__header__', 'Ytrain', 'Xtrain',
#        '__version__', 'Yvalidate']
data = scipy.io.loadmat("../spam-dataset/spam_data.mat")
t_data = sklearn.utils.shuffle(data["training_data"], random_state=0)  # (5172, 32)
t_labels = sklearn.utils.shuffle(data["training_labels"].ravel(),
                                 random_state=0)  # (1, 5172)

training_data = t_data[0:4137]
training_labels = t_labels[0:4137]
validation_data = t_data[4137:5172]
validation_labels = t_labels[4137:5172]

classifier = DecisionTree()
classifier.train(training_data, training_labels)
error_rate = classifier.test(validation_data, validation_labels)
print error_rate

# TESTING CODE
# predictions = classifier.predict(test_data)
# test_data = data["test_data"]  # (5857, 32)
# last one was 0.46755
# coding: UTF-8
'''
Created on 2013-3-21

@author: peixinchen
'''
from DecisionTree import DecisionTree

data = DecisionTree(training_datafile="decision.dat")
data.get_training_data()
rootNode = data.construct_decision_tree_classifier()

test_case = [
    "outlook=>sunny",
    "temperature=>hot",
    "humidity=>high",
    "wind=>strong",
]

classification = data.classify(rootNode, test_case)
print classification

if __name__ == '__main__':
    pass
def test_build(self):
    tree = DecisionTree(0, maxDeep=5)
    data, label = creatDataLabel()
    tree.buildTree(data, label)
    self.assertTrue(tree._maxLabel() == 1)
def predict(self, X):
    '''
    Return a numpy array of the labels predicted for the given test data.
    '''
    answers = np.array([tree.predict(X) for tree in self.forest]).T
    return np.array([Counter(row).most_common(1)[0][0] for row in answers])

def score(self, X, y):
    '''
    Return the accuracy of the Random Forest for the given test data
    and labels.
    '''
    return sum(self.predict(X) == y) / float(len(y))

if __name__ == '__main__':
    from sklearn.cross_validation import train_test_split
    import pandas as pd

    df = pd.read_csv('data/congressional_voting.csv',
                     names=['Party'] + range(1, 17))
    y = df.pop('Party').values
    X = df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    rf = RandomForest(num_trees=10, num_features=5)
    rf.fit(X_train, y_train)
    print "Random Forest score:", rf.score(X_test, y_test)

    dt = DecisionTree()
    dt.fit(X_train, y_train)
    print "Decision Tree score:", dt.score(X_test, y_test)
def main(argv):
    # Handle user input
    trainFile = ''
    testFile = ''
    m = 4
    if len(sys.argv) == 4:
        trainFile = sys.argv[1]
        testFile = sys.argv[2]
        outname = sys.argv[3]
    else:
        sys.exit("Bad input: Please provide a test file, train file, and outfile name")

    # Ingest the datasets
    trainset = readFile(trainFile)
    testset = readFile(testFile)

    # test decision tree constructor
    # a = DecisionTree(trainset, m)

    # prep a file for graphing data
    f = open(outname, 'w+')
    f.write('samplePercentage,Accuracy,Min,Max\n')

    # train using various sample sizes
    samplePercs = [0.05, 0.1, 0.2, 0.5]
    for samplePerc in samplePercs:
        # get the number of instances I'll be using
        sampleSize = int(len(trainset.instances) * samplePerc)
        # populate the samples
        samples = []
        for i in range(10):
            samples.append(random.sample(trainset.instances, sampleSize))
        accuracies = []
        for sample in samples:
            # train using this sample
            tmpTrain = copy.deepcopy(trainset)
            tmpTrain.overrideInstances(sample)
            tmpTree = DecisionTree(tmpTrain, m)
            scores = []
            for instance in testset.instances:
                scores.append(1 if tmpTree.classify(instance, tmpTree.root) == instance[-1] else 0)
            accuracies.append(float(sum(scores)) / len(scores))
        # write the data to a file
        avg = str((float(sum(accuracies)) / len(accuracies)) * 100)
        mi = str((min(accuracies)) * 100)
        ma = str((max(accuracies)) * 100)
        # f.write(str(samplePerc*100) + ',' + avg + ',Average\n')
        # f.write(str(samplePerc*100) + ',' + mi + ',Minimum\n')
        # f.write(str(samplePerc*100) + ',' + ma + ',Maximum\n')
        f.write(str(samplePerc * 100) + ',' + avg + ',' + mi + ',' + ma + '\n')

    # do one more classification accuracy run using the whole training set
    scores = []
    a = DecisionTree(trainset, m)
    for instance in testset.instances:
        scores.append(1 if a.classify(instance, a.root) == instance[-1] else 0)
    avg = str((float(sum(scores)) / len(scores)) * 100)
    # f.write('100,' + avg + ',Average\n')
    # f.write('100,' + avg + ',Minimum\n')
    # f.write('100,' + avg + ',Maximum\n')
    f.write('100,' + avg + ',' + avg + ',' + avg + '\n')
# -*- coding: utf-8 -*-
from DecisionTree import DecisionTree

dt = DecisionTree(training_datafile="./jueceshu.data")
dt.get_training_data()
dt.show_training_data()
root_node = dt.construct_decision_tree_classifier()
root_node.display_decision_tree("   ")
test_sample = ['exercising=>never', 'smoking=>heavy',
               'fatIntake=>heavy', 'videoAddiction=>heavy']
classification = dt.classify(root_node, test_sample)
print classification
def __init__(self, impurity_criterion, num_features=None, prune=False):
    DecisionTree.__init__(self, impurity_criterion='entropy')
    self.k = num_features
    self.pruning = prune