def remove_features(removal_order, train_file, test_file, attr_file, max_features):
    """Retrain and score a decision tree while cumulatively dropping columns.

    Columns are taken from ``removal_order`` one at a time and added to a
    growing removal list. After each addition the train and test sets are
    re-read without those columns, a tree is learned on the training data
    and its accuracy on both sets is recorded.

    Args:
        removal_order: Iterable of column identifiers, in the order in which
            they should be dropped.
        train_file: Training data source, forwarded to ``read_data``.
        test_file: Test data source, forwarded to ``read_data``.
        attr_file: Attribute description source, forwarded to ``read_data``.
        max_features: The loop stops as soon as this many columns are queued
            for removal; that final round is not evaluated.

    Returns:
        A ``(train_accs, test_accs)`` tuple of lists holding one value from
        ``decision_tree.accuracy_score`` per evaluated round.
    """
    train_accs = []
    test_accs = []
    remove_columns = []

    # NOTE(review): the original formatting was lost; the statements below are
    # kept in their original order and treated as the loop body -- confirm.
    for col in removal_order:
        print(col)
        remove_columns.append(col)
        if len(remove_columns) == max_features:
            break
        print(remove_columns)

        # Fix: use the function's own parameters. The previous code referenced
        # the undefined names ``train``, ``test`` and ``attr``.
        train_data, train_attr = read_data(
            train_file, attr_file, remove_columns=remove_columns)
        test_data, _ = read_data(
            test_file, attr_file, remove_columns=remove_columns)

        tree = decision_tree.DecisionTreeLearning(
            train_data, train_attr, "normal", "class")
        decision_tree.print_tree(tree)

        y_pred, y_true = decision_tree.predict(train_data, tree)
        train_acc = decision_tree.accuracy_score(y_pred, y_true)
        print('Accuracy on Training Data: {0}'.format(train_acc * 100))

        y_pred, y_true = decision_tree.predict(test_data, tree)
        test_acc = decision_tree.accuracy_score(y_pred, y_true)
        # Fix: this line reports the test-set score, so label it accordingly.
        print('Accuracy on Test Data: {0}'.format(test_acc * 100))

        train_accs.append(train_acc)
        test_accs.append(test_acc)

    return train_accs, test_accs
def menu_1():
    """Ask for a data file and a label file, then build and print a tree.

    Both file names are read interactively with ``raw_input`` (Python 2),
    passed to ``prep_data``, and the result is fed to
    ``decision_tree.build_decision_tree``. The resulting tree is printed.
    """
    print("\nType the filename you would like to run the classifier on")
    data_path = raw_input(" >> ")

    print("\nPlease type the filename with the types of classifiers listed")
    labels_path = raw_input(" >> ")

    built = decision_tree.build_decision_tree(prep_data(data_path, labels_path))
    # The meaning of the second argument (100) is defined by print_tree.
    decision_tree.print_tree(built, 100)
def main():
    """Run the three classifiers and write the tree and scores to output.txt.

    Naive Bayes, k-NN (k=5) and the decision tree are each run on
    'train.txt' / 'test.txt'. The output file receives the printed tree
    followed by one tab-separated line with the decision-tree, k-NN and
    Bayes accuracies, each rounded to two decimals.
    """
    train_file = 'train.txt'
    test_file = 'test.txt'

    bayes_accuracy = naive_bayes(train_file, test_file)
    knn_accuracy = knn(train_file, test_file, k=5)
    dt_accuracy, tree = decision_tree(train_file, test_file)

    with open('output.txt', 'w') as f:
        print_tree(tree, f)
        # Column order in the summary line: decision tree, k-NN, naive Bayes.
        rounded = [round(score, 2)
                   for score in (dt_accuracy, knn_accuracy, bayes_accuracy)]
        f.write('\n{}\t{}\t{}\n'.format(*rounded))
def run_tests(df, df_training, labels):
    """Build and report a tree for every measure and for depths 1 to 4.

    For each member of ``dt.Measure`` and each ``max_depth`` in 1..4 a tree
    is built from ``df_training.values`` (with ``min_size`` fixed at 1),
    printed with ``labels``, and scored against all rows of ``df``.

    Args:
        df: Data frame whose ``.values`` are used as the test set.
        df_training: Data frame whose ``.values`` are used to build the tree.
        labels: Passed to ``dt.print_tree`` alongside the tree.
    """
    rule = "=" * 40
    for measure in dt.Measure:
        for tree_depth in range(1, 5):
            min_split = 1
            test_set = df.values
            tree = dt.build_tree(
                df_training.values,
                max_depth=tree_depth,
                min_size=min_split,
                measure=measure,
            )

            print(rule)
            dt.print_tree(tree, labels)
            print('Min split: {}'.format(min_split))
            print('Tree depth: {}'.format(tree_depth))
            print('Train Size: {}'.format(len(df_training)))
            print('Test Size: {}'.format(len(test_set)))
            print('Accuracy: {:.4f}'.format(dt.accuracy(test_set, tree)))
            print('Measure: {}'.format(measure))
            print(rule)
def main():
    """Train, report and prune one GINI tree on the wine-quality data set.

    Reads 'winequalityN.csv', cleans it with ``dt.remove_garbage``, rotates
    the first column to the end, drops 'total sulfur dioxide', and builds a
    depth-3 tree from a fixed 100-row sample. The tree and its accuracy on
    the full frame are printed, then the tree is pruned and printed again.
    """
    # Display options for printed data frames.
    pd.set_option('display.max_columns', 11)
    pd.set_option('display.width', 200)

    # Load the raw data and clean it.
    raw = pd.read_csv('winequalityN.csv')
    df = dt.remove_garbage(
        pd.DataFrame(data=raw, columns=list(raw.columns.values)))

    # Rotate the first column to the end (the original comment identifies it
    # as the wine colour column), then drop the unused feature.
    cols = df.columns.tolist()
    cols = cols[1:] + cols[:1]
    reordered = df[cols]
    df = reordered.drop(['total sulfur dioxide'], axis='columns')
    labels = df.columns.values

    # Per-'type' subsets; presumably 0.0 = white and 1.0 = red, going by the
    # names. Not used further in this function.
    df_white = df[(df['type'] == 0.0)]
    df_red = df[(df['type'] == 1.0)]

    # Mixed sample of 100 rows with a fixed seed.
    df_training = df.sample(n=100, random_state=1)

    tree_depth = 3
    min_split = 1
    measure = dt.Measure.GINI
    test_set = df.values
    tree = dt.build_tree(
        df_training.values,
        max_depth=tree_depth,
        min_size=min_split,
        measure=measure,
    )

    rule = "=" * 40
    print(rule)
    dt.print_tree(tree, labels)
    print('Min split: {}'.format(min_split))
    print('Tree depth: {}'.format(tree_depth))
    print('Train Size: {}'.format(len(df_training)))
    print('Test Size: {}'.format(len(test_set)))
    print('Accuracy: {:.4f}'.format(dt.accuracy(test_set, tree)))
    print('Measure: {}'.format(measure))
    print(rule)

    # Prune the tree, then show the result.
    dt.prune_tree(tree)
    dt.print_tree(tree, labels)
# Experiment settings.
test_fraction = 0.25
max_depth = 2
min_sample_per_node = 2
criterion = 'gini'
prediction_type = 'classification'

# Load features, targets and feature names.
# `filename` and `target` must already be defined before this point.
X, Y, X_feature_names = common_fns.get_data(filename=filename, target=target)

# Random train/test split.
X_train, Y_train, X_test, Y_test = common_fns.split_train_test(
    X, Y, test_fraction=test_fraction)

# Fit the tree on the training portion only.
tree = dtree.fit_decision_tree(
    X=X_train,
    Y=Y_train,
    max_depth=max_depth,
    min_sample_per_node=min_sample_per_node,
    criterion=criterion,
)

# Print the fitted tree for debugging/reference.
dtree.print_tree(tree=tree, X_feature_names=X_feature_names)

# Predict on the held-out rows.
Y_predict = dtree.predict_all(
    tree=tree, X=X_test, prediction_type=prediction_type)

# Compare the predictions with the reference targets.
dtree.calculate_scores(Y_predict=Y_predict, Y_ref=Y_test)
def __repr__(self):
    """Print the tree as a side effect and return an empty string.

    ``watdt.print_tree`` writes the tree itself (per the original comment),
    so this method returns ``''`` to keep ``print(obj)`` from emitting a
    trailing "None".

    The previous one-liner compared ``str(...)`` with ``None`` using ``==``.
    That test is always False and both branches yielded ``''``, so the
    conditional was dead code and has been removed.
    """
    watdt.print_tree(self.tree)
    return ''
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
import csv

# NOTE(review): build_tree, print_tree, classify and print_leaf are used below
# but not imported in this excerpt; presumably they are brought into scope
# elsewhere in the file -- confirm.

# Load the training rows: only the first CSV field of each record is used,
# and it is split on commas to obtain the individual values.
with open('data.csv', encoding="utf8") as csvfile:
    training_data = [
        record[0].split(',')
        for record in csv.reader(csvfile, delimiter=',')
    ]

my_tree = build_tree(training_data)
print_tree(my_tree)
print()

# Ask the user for one value per header entry except the last one, showing
# the values seen in the training data for that column.
testing_data = []
for i in range(len(get_header()) - 1):
    column_name = str(get_header()[i])
    known_values = str(get_unique_values(training_data, i))
    ask = ''.join(['Введіть ', column_name, known_values, ': '])
    testing_data.append(input(ask))

predicted = print_leaf(classify(testing_data, my_tree))
print("Передбачено: %s" % (predicted))

# Wait for a final key press before the script exits.
input()