def make_forest(data, n_bootstraps, scoref, min_gain=0.01):
    """Function to grow a random forest given some training data."""
    trees = []
    for _ in xrange(n_bootstraps):
        data_boot = make_boot(data, data.shape[0])
        trees.append(dt.buildtree(data_boot, scoref, min_gain))
    return Forest(trees)
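# A minimal sketch of the bootstrap helper referenced above (an assumption, not
# the repository's implementation): make_boot(data, n) is taken to draw n rows
# from a NumPy array with replacement.
import numpy as np

def make_boot(data, n_samples):
    """Return a bootstrap sample of n_samples rows drawn with replacement."""
    idx = np.random.randint(0, data.shape[0], size=n_samples)
    return data[idx]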
start = stepSize * i
training = dataset[start:start + stepSize]
trainingLabels = labelInt[start:start + stepSize]

############# Feature Extraction ##############
my_model = PCA(n_components=pca_comps, svd_solver='full')
newSet = my_model.fit_transform(training).tolist()
newTestSet = my_model.transform(test).tolist()
newTrainSet = my_model.transform(training).tolist()

############# Model Building ##############
for k in range(len(newSet)):
    newSet[k].append(trainingLabels[k])
passingData = newSet[:]
models.append(dt.buildtree(passingData))
# dt.prune(b, 0.1)

############# Classification of Test Records ##############
for j in range(len(newTestSet)):
    if j not in test_classify:
        test_classify[j] = []
    test_classify[j].append(dt.classify(newTestSet[j], models[i]))

############# Accuracy Calculations ##############
d = []
f = []
flat = []
for l in test_classify.values():
tot_count = 0
tot_correct = 0
train_data = parse_file(train_file_path)
test_data = parse_file(test_file_path)

# Calculating the accuracy at every level
correct = 0
total = 0
TP = 0
TN = 0
FP = 0
FN = 0
depth = 0
for i in range(1, 7):
    tree = dtree.buildtree(train_data, 0, i)
    for data in test_data:
        predicted = list(dtree.decision(tree, data).keys())[0]
        actual = data[-1]
        total = total + 1
        if predicted == 1.0 and actual == 1.0:
            correct = correct + 1
            TP = TP + 1
        if predicted == 0.0 and actual == 0.0:
            correct = correct + 1
            TN = TN + 1
        if predicted == 1.0 and actual == 0.0:
            FP = FP + 1
        if predicted == 0.0 and actual == 1.0:
            FN = FN + 1
    tot_correct += correct
def build_BNN(data, output_condition, cd=98, mss=1, md=10,
              relevant_neuron_dictionary={}, with_data=0, discretization=0,
              cluster_means=None):
    '''
    Starting from the target condition and working back to the conditions on the
    first hidden layer, extracts for each condition a DNF that explains it using
    conditions of the next shallower layer.

    param data: instance of DataSet
    param output_condition: condition of interest
    param cd: class dominance
    param mss: minimum dataset size
    param md: maximum tree depth
    param with_data: avoid == 0; if == 1, the regular simplification operations
        are performed; if == 2, post-pruning is also performed
    param discretization: method used to determine the thresholds that split the
        activation range of each neuron
    '''
    BNN = {}
    deep_layer = data.output_layer
    target_class = [output_condition]
    while deep_layer > 0:
        target_split_values = set((l, n, t) for (l, n, t, u) in target_class)
        if not target_split_values:
            warnings.warn(
                'Warning: no split points, returning current dictionary at layer: '
                + str(deep_layer))
        print('Target split values', target_split_values)
        used_shallow_conditions = set([])
        current_data = temp_data(data, deep_layer - 1, target_class)
        if discretization == 0:
            split_points = dis.all_features_trivial_mid_points(current_data)
        elif discretization == 1 or discretization == 3:
            split_points = dis.one_time_discretization(
                current_data, discretization,
                rnd=relevant_neuron_dictionary,
                tsv=list(target_split_values))
        elif discretization == 2 or discretization == 4:
            split_points = cluster_means[deep_layer - 1]
        elif discretization == 6:
            colum = [[d[c] for d in current_data]
                     for c in range(len(current_data[0]) - 1)]
            split_points = [[sum(vq.kmeans(v, 2)[0]) / 2] for v in colum]
        elif discretization == 5:
            if deep_layer == 1:
                split_points = [[0.5] for l in range(len(current_data[0]) - 1)]
            else:
                split_points = [[0] for l in range(len(current_data[0]) - 1)]
        print('Split points', [len(l) for l in split_points])
        print(split_points)
        print('')
        for i in target_split_values:
            print('')
            print('i: ', i)
            t = time.time()
            i_data = temp_data(data, deep_layer - 1, i)
            tree = None
            if relevant_neuron_dictionary and discretization == 0:
                pruned_split_points = [
                    _sp(j, i, split_points, relevant_neuron_dictionary)
                    for j in range(len(split_points))
                ]
                print(pruned_split_points)
                tree = dt.buildtree(i_data, pruned_split_points,
                                    class_dominance=cd, min_set_size=mss,
                                    max_depth=md, root=True)
            else:
                tree = dt.buildtree(i_data, split_points,
                                    class_dominance=cd, min_set_size=mss,
                                    max_depth=md, root=True)
            if not tree:
                cero_class = sum(1 for x in i_data if x[-1] == 0)
                one_class = sum(1 for x in i_data if x[-1] == 1)
                if cero_class > one_class:
                    BNN[(i[0], i[1], i[2], True)] = False
                    BNN[(i[0], i[1], i[2], False)] = True
                else:
                    BNN[(i[0], i[1], i[2], False)] = True
                    BNN[(i[0], i[1], i[2], True)] = False
                break
            print('Tree is formed')
            print('Time: ', time.time() - t)
            dnfs = dt.get_dnfs(deep_layer - 1, tree)
            if (i[0], i[1], i[2], False) in target_class:
                print('False case')
                pruned = None
                if isinstance(dnfs[0], list):
                    # print('Fidelity pre-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    # print('Precision pre-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    # print('Recall pre-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], False), dnfs[0], True, False, False, True))
                    data.update_dictionary([(l, n, t) for conj in dnfs[0]
                                            for (l, n, t, u) in conj])
                    if with_data == 0:
                        pruned = s.boolean_simplify_basic(dnfs[0])
                    elif with_data >= 1:
                        pruned = s.boolean_simplify_complex(dnfs[0])
                    if with_data == 2:
                        pruned = p.post_prune(pruned, (i[0], i[1], i[2], False),
                                              data.example_cond_dict,
                                              data.dict_indexes, data=None)
                    used_shallow_conditions.update(
                        set(c for conj in pruned for c in conj))
                else:
                    pruned = dnfs[0]
                # print('Fidelity post-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                # print('Precision post-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                # print('Recall post-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], False), pruned, True, False, False, True))
                BNN[(i[0], i[1], i[2], False)] = pruned
                print((i[0], i[1], i[2], False), pruned)
            if (i[0], i[1], i[2], True) in target_class:
                print('True case')
                pruned = None
                if isinstance(dnfs[1], list):
                    # print('Fidelity pre-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    # print('Precision pre-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    # print('Recall pre-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], True), dnfs[1], True, False, False, True))
                    data.update_dictionary([(l, n, t) for conj in dnfs[1]
                                            for (l, n, t, u) in conj])
                    if with_data == 0:
                        pruned = s.boolean_simplify_basic(dnfs[1])
                    elif with_data >= 1:
                        pruned = s.boolean_simplify_complex(dnfs[1])
                    if with_data == 2:
                        pruned = p.post_prune(pruned, (i[0], i[1], i[2], True),
                                              data.example_cond_dict,
                                              data.dict_indexes, data=None)
                    used_shallow_conditions.update(
                        set(c for conj in pruned for c in conj))
                else:
                    pruned = dnfs[1]
                # print('Fidelity post-pruning:', ef.accuracy_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                # print('Precision post-pruning:', ef.precision_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                # print('Recall post-pruning:', ef.recall_of_dnf(data, (i[0], i[1], i[2], True), pruned, True, False, False, True))
                BNN[(i[0], i[1], i[2], True)] = pruned
                print((i[0], i[1], i[2], True), pruned)
        deep_layer -= 1
        target_class = list(used_shallow_conditions)
    return BNN
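# Illustrative only (an assumption inferred from how build_BNN iterates over
# these structures, not taken verbatim from the source): conditions appear to be
# (layer, neuron, threshold, truth value) tuples, a DNF is a list of
# conjunctions, and BNN maps each condition to the DNF that explains it.
example_condition = (2, 5, 0.5, True)                # neuron 5 in layer 2 above 0.5
example_dnf = [
    [(1, 0, 0.5, True), (1, 3, 0.5, False)],         # one conjunction of shallower conditions
    [(1, 7, 0.5, True)],                             # OR a second conjunction
]
example_BNN = {example_condition: example_dnf}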
trainingLabels = [labelInt[i] for i in training_idx]
testLabels = [labelInt[i] for i in test_idx]

############# Feature Extraction ##############
my_model = PCA(n_components=pca_comps, svd_solver='full')
newSet = my_model.fit_transform(training).tolist()
newTestSet = my_model.transform(test).tolist()
newTrainSet = my_model.transform(training).tolist()

############# Model Building ##############
for i in range(len(newSet)):
    newSet[i].append(trainingLabels[i])
passingData = newSet[:]
b = dt.buildtree(passingData)
dt.prune(b, 0.1)

############# Classification of Train Records ##############
count = 0
for i in range(len(newTrainSet)):
    a = dt.classify(newTrainSet[i], b)
    for key in a.keys():
        if key == trainingLabels[i]:
            count = count + 1

############# Accuracy Calculations for Training DataSet ##############
accuracy = (count / len(newTrainSet)) * 100
final_train_acc += accuracy
print('Train accuracy:', accuracy)
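# Hypothetical helper (not part of the source): dt.classify appears to return a
# dict of class counts, so a single predicted label can be obtained by taking
# the class with the largest count.
def most_likely_label(class_counts):
    """Return the key with the highest count from a {label: count} dict."""
    return max(class_counts, key=class_counts.get)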
# remove index column
features_train = features_train[:, 1:]
labels_train = np.genfromtxt('../census-dataset/census-train-labels.csv',
                             delimiter=' ', skip_header=1)
# remove index column
labels_train = labels_train[:, 1:][:, 0]

# split to obtain train and test set
x_train, x_test, y_train, y_test = train_test_split(features_train, labels_train,
                                                    test_size=0.33)

# concatenate features and labels
data_train = np.column_stack((x_train, y_train))
data_test = np.column_stack((x_test, y_test))

# build decision tree using entropy
decision_tree = dt.buildtree(data_train, dt.entropy, 0.01)

min_gain_error = {}
# test minimal gain values for pruning
for min_gain_value in np.arange(0, 1, 0.01):
    dt_temp = copy.copy(decision_tree)
    dt.prune(dt_temp, min_gain_value)
    # classify test data
    y_hat = map(lambda obs: dt.classify(obs, dt_temp), x_test)
    y_hat = map(dt.convertToLabel, y_hat)
    # materialise the map so this also works under Python 3, where map is lazy
    y_hat = np.array(list(y_hat))
    error = (y_hat != y_test).sum() / float(y_test.shape[0])
    min_gain_error[min_gain_value] = error

# prune tree with optimal min_gain value
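# A minimal sketch of the step the comment above announces (assumptions: lower
# error is better and dt.prune mutates the tree in place); the original
# continuation is not shown here.
best_min_gain = min(min_gain_error, key=min_gain_error.get)
pruned_tree = copy.copy(decision_tree)
dt.prune(pruned_tree, best_min_gain)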
label, pixels = dataset[test_idx[i]]
record = (pixels.flatten()).tolist()
testing_labels.append(label)
rows_test_total.append(record)

############# Feature Extraction ##############
FinalTrain = []
my_model = PCA(n_components=pca_comps, svd_solver='full')
newSet = my_model.fit_transform(rows_total).tolist()
newtestSet = my_model.transform(rows_test_total).tolist()

############# Model Building ##############
for i in range(len(rows_total)):
    newSet[i].append(training_labels[i])
b = dt.buildtree(newSet)
dt.prune(b, 0.1)

############# Classification of Test Records ##############
number = 0
accuracy = 0
for i in range(testSize):
    a = dt.classify(newtestSet[i], b)
    for key in a.keys():
        if key == testing_labels[i]:
            number = number + 1

############# Accuracy Calculations ##############
accuracy = (number / testSize) * 100
final_test_acc += accuracy
def buildtree(depth, test_data, train_data, current_index):
    tot_count = 0
    tot_correct = 0
    # Calculating the accuracy at every level
    correct = 0
    total = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    # print("Depth Entered is :", depth)
    predicted_list = []
    predicted_list_1 = []
    for i in range(depth):
        tree = dtree.buildtree(train_data, 0, i)
        for data in train_data:
            predicted = list(dtree.decision(tree, data).keys())[-1]
            predicted_list.append(predicted)
        for data in test_data:
            predicted = list(dtree.decision(tree, data).keys())[0]
            predicted_list_1.append(predicted)
            one_count_testdata = predicted_list_1.count(1)
            zero_count_testdata = predicted_list_1.count(0)
            actual = data[-1]
            total = total + 1
            if predicted == 1.0 and actual == 1.0:
                correct = correct + 1
                TP = TP + 1
            if predicted == 0.0 and actual == 0.0:
                correct = correct + 1
                TN = TN + 1
            if predicted == 1.0 and actual == 0.0:
                FP = FP + 1
            if predicted == 0.0 and actual == 1.0:
                FN = FN + 1
        tot_correct += correct
        tot_count += total
        Accuracy = round(100 * correct / total, 2)
        Depth_list.append(depth)
        Accuracy_list.append(Accuracy)
        depth = depth + 1
    # print(Accuracy_list)
    # print(Depth_list)
    # printing the confusion matrix
    print("Accuracy::", str(Accuracy) + '%')
    print("False Negatives ", str(FN))
    print("False positives ", str(FP))
    print("True Negatives ", str(TN))
    print("True Positives ", str(TP))
    print("Confusion Matrix for bagging")
    print("------")
    print("| ", TP, "|", FN, "|")
    print("------")
    print("| ", FP, "|", TN, "|")
    print("------")
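# Hypothetical helper (not part of the source): derives the usual summary
# metrics from the TP/FP/TN/FN counts printed above.
def confusion_metrics(TP, FP, TN, FN):
    """Return (accuracy, precision, recall) computed from confusion-matrix counts."""
    accuracy = (TP + TN) / float(TP + TN + FP + FN)
    precision = TP / float(TP + FP) if (TP + FP) else 0.0
    recall = TP / float(TP + FN) if (TP + FN) else 0.0
    return accuracy, precision, recall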
def setUp(self):
    my_data = np.genfromtxt('decision_tree_example.txt', dtype=None)
    self.rows = my_data.tolist()
    self.tree = decision_tree.buildtree(self.rows)
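# A minimal follow-on test sketch (an assumption, not from the source): using
# the standard unittest.TestCase API, the simplest check is that setUp
# actually produced a tree.
def test_tree_is_built(self):
    self.assertIsNotNone(self.tree)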