def test_i_put_result_draw(set_trump):
    """With hearts as trump and me leading, both layouts below
    should be estimated as a draw (0.5)."""
    set_trump('hearts')
    draw_layouts = [
        ([Card('clubs', 10)], [Card('clubs', 11)]),
        ([Card('clubs', 14)], [Card('hearts', 6)]),
    ]
    for my_hand, rival_hand in draw_layouts:
        root = build_tree(my_hand, rival_hand, True)
        assert node_estimate(root) == 0.5
def test_rival_puts_rival_wins(set_trump):
    """With hearts as trump and the rival leading, both layouts below
    should be estimated as a rival win (0.0)."""
    set_trump('hearts')
    losing_layouts = [
        ([Card('spades', 10)],
         [Card('clubs', 9)]),
        ([Card('clubs', 6), Card('clubs', 8), Card('clubs', 9), Card('hearts', 8)],
         [Card('clubs', 7)]),
    ]
    for my_hand, rival_hand in losing_layouts:
        root = build_tree(my_hand, rival_hand, False)
        assert node_estimate(root) == 0.0
def test_i_put_i_win(set_trump, monkeypatch):
    """With the search depth patched to 4 and me leading, both layouts
    below should be estimated as a win for me (1.0)."""
    set_trump('hearts')
    monkeypatch.setattr(decision_tree, 'MAXDEPTH', 4)
    winning_layouts = [
        ([Card('clubs', 10)],
         [Card('clubs', 9)]),
        ([Card('clubs', 6), Card('clubs', 8), Card('clubs', 9), Card('hearts', 8)],
         [Card('clubs', 7)]),
    ]
    for my_hand, rival_hand in winning_layouts:
        root = build_tree(my_hand, rival_hand, True)
        assert node_estimate(root) == 1.0
def random_forest_training(data_train, trees_num):
    """Build a random forest.

    input:  data_train (list): training samples
            trees_num (int):   number of classification trees to grow
    output: trees_result (list): the best split (trained tree) for each tree
            trees_feature (list): the original features chosen for each tree

    (docstring translated from the original Chinese)
    """
    trees_result = []   # trained tree per forest member
    trees_feature = []  # feature subset used by each tree
    n = np.shape(data_train)[1]  # sample dimensionality
    # Number of features per tree: log2(n - 1) + 1, with a floor of 1
    if n > 2:
        k = int(log(n - 1, 2)) + 1
    else:
        k = 1
    # Grow each tree on its own random sample / feature subset.
    # NOTE: original used Python-2-only `xrange`; `range` behaves
    # identically here and also works on Python 3.
    for _ in range(trees_num):
        # 1. randomly choose m samples and k features
        data_samples, feature = choose_samples(data_train, k)
        # 2. build one classification tree
        tree = build_tree(data_samples)
        # 3. keep the trained tree
        trees_result.append(tree)
        # 4. keep the features that tree used
        trees_feature.append(feature)
    return trees_result, trees_feature
def random_forest(trainX, trainy, trees_num, gr=0):
    """Grow `trees_num` trees, each on a bootstrap sample of the training
    data, and return them as a list."""
    forest = []
    for tree_idx in range(trees_num):
        sample_idx = bootstrap(len(trainX))
        feature_num = trainX.shape[1]
        flags = [1] * feature_num
        # number of candidate features considered at each split
        K = round(np.log2(feature_num))
        subtree = Tree()
        temp_tree = build_tree(trainX[sample_idx], trainy[sample_idx], flags,
                               depth=trainX.shape[1], RF=1, K=K, gr=gr)
        # copy the trained tree's fields onto the fresh node
        subtree.data = temp_tree.data
        subtree.left = temp_tree.left
        subtree.right = temp_tree.right
        forest.append(subtree)
        print("第几棵树", tree_idx)
    return forest
def proj_demo(name):
    """Demo run on dataset `name`: train a tree on a small sample,
    display it, then report accuracy on the held-out test set."""
    # sample subset of dataset, for readable tree and reasonable runtime
    sample = u.get_data(name).sample(120)
    # simple train/test split
    split = u.split_to_train_test_sets(sample)
    training_set = split['Training_Set']
    test_set = split['Test_Set']
    tree = dt.build_tree(training_set)
    # show the data subsets and the learned tree
    print('Training Data Sample: \n ', training_set.head())
    print('Test Data Sample : \n', test_set.head())
    pprint(tree)
    sleep(5)
    total_tests = test_set.shape[0]
    correct_guesses = 0
    print('Sample Classifications:')
    for i in range(total_tests):
        test_item = test_set.iloc[i, :]
        prediction = dt.make_prediction(test_item, tree)
        actual_val = test_item[-1]
        correct = prediction == actual_val
        # echo only the first few classifications
        if i < 5:
            print(
                f'Correct: {correct} \t Actual Value: {actual_val} \t Predicted Value: {prediction}'
            )
        if correct:
            correct_guesses += 1
    # classification accuracy over the whole test set
    print(f'Accuracy: {round(correct_guesses/total_tests,2)} \n')
def __init__(self, data, targets, headers):
    """Store the raw data and train the decision tree.

    The features and targets are concatenated column-wise so the tree
    builder receives a single dataframe; the last header is assumed to
    be the target column and is excluded from the attribute list.
    """
    self.data = data
    self.targets = targets
    self.headers = headers
    combined = pd.concat([data, targets], axis=1)
    self.tree = watdt.build_tree(combined, headers[:-1])
def get_risks(records):
    """Train a decision tree on the bundled artificial data and predict a
    risk class for every area in `records`.

    records: mapping of area -> feature row understood by `classify`.
    Returns a dict mapping each area to its predicted class label.
    """
    # NOTE (translated from Polish): the tree is retrained on every call —
    # if this turns out to be slow, the trained tree could be saved to a file!
    path = str(Path(os.getcwd()).parent) + "/data/artif_data.txt"
    # `with` ensures the file is closed; the original leaked the handle.
    with open(path, 'r') as f:
        training_data = [line.rstrip().split(',') for line in f]
    header = training_data.pop(0)
    tree = build_tree(training_data)
    predicted_risks = {}
    for area in records:
        # classify() returns a dict; take its first key as the prediction
        predicted_risks[area] = list(classify(records[area], tree).keys())[0]
    return predicted_risks
def run_tests(df, df_training, labels):
    """Train and report a tree for every split measure at depths 1-4."""
    banner = "=" * 40
    test_set = df.values
    for measure in dt.Measure:
        for depth in range(1, 5):
            min_split = 1
            tree = dt.build_tree(df_training.values,
                                 max_depth=depth,
                                 min_size=min_split,
                                 measure=measure)
            print(banner)
            dt.print_tree(tree, labels)
            print('Min split: {}'.format(min_split))
            print('Tree depth: {}'.format(depth))
            print('Train Size: {}'.format(len(df_training)))
            print('Test Size: {}'.format(len(test_set)))
            print('Accuracy: {:.4f}'.format(dt.accuracy(test_set, tree)))
            print('Measure: {}'.format(measure))
            print(banner)
def main():
    """Load the wine-quality dataset, train a depth-3 GINI tree on a
    100-row mixed sample, report accuracy, then prune and re-print."""
    # Set display options for data frames
    pd.set_option('display.max_columns', 11)
    pd.set_option('display.width', 200)
    # Read data and remove garbage rows
    df = pd.read_csv('winequalityN.csv')
    df = dt.remove_garbage(
        pd.DataFrame(data=df, columns=list(df.columns.values)))
    # Move the wine color column to the last position, drop one feature
    cols = df.columns.tolist()
    cols = cols[1:] + cols[0:1]
    df = df[cols].drop(['total sulfur dioxide'], axis='columns')
    labels = df.columns.values
    # Per-color subsets (kept for reference) and a mixed training sample
    df_white = df[(df['type'] == 0.0)]
    df_red = df[(df['type'] == 1.0)]
    df_training = df.sample(n=100, random_state=1)
    depth = 3
    min_split = 1
    test_set = df.values
    measure = dt.Measure.GINI
    tree = dt.build_tree(df_training.values,
                         max_depth=depth,
                         min_size=min_split,
                         measure=measure)
    banner = "=" * 40
    print(banner)
    dt.print_tree(tree, labels)
    print('Min split: {}'.format(min_split))
    print('Tree depth: {}'.format(depth))
    print('Train Size: {}'.format(len(df_training)))
    print('Test Size: {}'.format(len(test_set)))
    print('Accuracy: {:.4f}'.format(dt.accuracy(test_set, tree)))
    print('Measure: {}'.format(measure))
    print(banner)
    # Prune and show the simplified tree
    dt.prune_tree(tree)
    dt.print_tree(tree, labels)
def pushTrainTreeButton(ui):
    """Handler for the "train tree" button: load the training file named
    in the GUI, build the decision tree, and render it as text."""
    # Tell the user the training data is loading
    ui.tipBrowser.append("start loading train data...")
    load_tree_file = str(ui.trainTreeLine.text())
    dt_gui.columns, data = dt.load_data(load_tree_file)
    # build_tree mutates its column list, so train on a copy
    train_columns = dt_gui.columns[:]
    dt_gui.tree = dt.build_tree(data, train_columns)
    print(dt_gui.tree)
    # Render the tree into the display widget
    tree_content_list = []
    dt.tree_content(dt_gui.tree, dt_gui.columns, tree_content_list)
    ui.treeShow.setText("")  # clear any previous content first
    tree_str = "".join(line + "\n" for line in tree_content_list)
    ui.treeShow.setText(tree_str)
    # Report success
    ui.tipBrowser.append("train tree success")
    print("完成决策树训练")
def alternative_classifier(train_set, train_labels, test_set, test_labels, **kwargs):
    """Decision-tree baseline on two selected feature columns (9 and 12):
    predicts the test set, prints accuracy, and shows a confusion matrix.

    Returns the list of predictions for the test set.
    """
    # Keep only the two chosen features, then append the labels as a
    # third column so the tree code sees [feat, feat, label] rows.
    train_set_red, test_set_red = reduce_data(train_set, test_set, [9, 12])
    train_data = np.insert(train_set_red, 2, train_labels, axis=1)
    test_data = np.insert(test_set_red, 2, test_labels, axis=1)
    tree = build_tree(train_data)
    pred_set = [classify(row, tree) for row in test_data]
    accuracy = calculate_accuracy(test_labels, pred_set)
    print(accuracy)
    confusionMatrix = calculate_confusion_matrix(test_labels, pred_set)
    plot_matrix(confusionMatrix)
    plt.show()
    return pred_set
def five_fold_validation(data_set):
    """5-fold cross validation: each fifth serves once as the test set
    while the rest train the tree. Returns the mean per-fold accuracy
    (fraction of correct predictions)."""
    folds = u.five_fold_split(data_set)
    fold_scores = []
    for held_out in folds:
        test_set = None
        training_set = pd.DataFrame(columns=data_set.columns.values)
        for key in folds:
            if key == held_out:
                # the held-out fold becomes the test set
                test_set = folds[key]
            else:
                # every other fold joins the training set
                training_set = training_set.append(folds[key], sort=False)
        tree = dt.build_tree(training_set)
        pprint(tree)
        print(test_set)
        total_tests = test_set.shape[0]
        correct_guesses = 0
        for i in range(total_tests):
            test_item = test_set.iloc[i, :]
            prediction = dt.make_prediction(test_item, tree)
            actual_val = test_item[-1]
            if prediction == actual_val:
                correct_guesses += 1
        fold_scores.append(correct_guesses / total_tests)
    # average accuracy across the folds
    return sum(fold_scores) / len(fold_scores)
def test_rival_puts_result_draw(set_trump):
    """Rival leads clubs-9 against my clubs-10: estimated as a draw (0.5)."""
    set_trump('hearts')
    root = build_tree([Card('clubs', 10)], [Card('clubs', 9)], False)
    assert node_estimate(root) == 0.5
def test_build_tree(self):
    """Build a tree over all feature indices of the shared fixtures and
    dump it for manual inspection."""
    node = build_tree(train_instances, train_labels,
                      range(len(train_instances[0])))
    # Parenthesized print is valid in both Python 2 and Python 3;
    # the original bare `print node` statement is Python-2-only.
    print(node)
# Build (or load) a grapheme-to-phoneme decision tree, then predict the
# pronunciation of words given on the command line.
# NOTE(review): `saved_filename`, `text`, `phones` and the helper functions
# are defined earlier in the file, outside this excerpt.
if not os.path.exists(saved_filename):
    # If we don't have a saved tree, save it
    all_feats = []
    for i in range(len(text)):
        feat = make_features(text[i], phones[i])
        all_feats.extend(feat)
    # Shuffle deterministically, then keep a manageable subsample
    idx = list(range(len(all_feats)))
    random_state = np.random.RandomState(1999)
    random_state.shuffle(idx)
    # Out of 900k samples... but scaling is poor
    num_samples = 10000
    idx = idx[:num_samples]
    all_feats = [all_feats[i] for i in idx]
    # Let max leaves be > number of phones (44)
    tree = build_tree(all_feats, max_depth=50)
    dump_tree_to_json(tree, saved_filename)
# Always reload from disk so the fresh-build and cached paths behave alike
tree = load_tree_from_json(saved_filename)
# Words to pronounce: command-line args (upper-cased) or a default
if len(sys.argv) > 1:
    pred_text = list(sys.argv[1:])
    pred_text = [t.upper() for t in pred_text]
else:
    pred_text = ["HEISENBERG"]
all_wav = []
for pt in pred_text:
    # quality of life hacks for simple words
    if pt == "I":
        print("Replacing I -> EYE")
        pt = "EYE"
        # NOTE(review): the loop body continues beyond this excerpt
def test_i_put_rival_wins(set_trump):
    """I lead two non-trumps against the rival's hearts-6 (trump):
    estimated as a rival win (0.0)."""
    set_trump('hearts')
    root = build_tree([Card('clubs', 10), Card('diamonds', 8)],
                      [Card('hearts', 6)], True)
    assert node_estimate(root) == 0.0
# K-fold evaluation of the decision-tree implementation: for every fold in
# KF.res, train a tree of fixed depth on the fold's train set and evaluate
# predictions on its test set.
import numpy as np
import decision_tree as DT
import perform_eval as PE
import kfold as KF
import time

start_time = time.time()
depth = 4
# Per-fold metric accumulators
accuracy = []
precision = []
recall = []
f_1 = []  # NOTE(review): never appended within this excerpt — presumably filled below
for i in range(len(KF.res)):
    my_tree = DT.build_tree(KF.res[i].train_set, depth)
    # Ground-truth labels are the last element of each test row
    test_label = []
    for row in KF.res[i].test_set:
        test_label.append(row[-1])
    print(
        '--------------------------------------------round: %i------------------------------------------------'
        % (i))
    # DT.print_tree(my_tree, "")
    # print("original labels:", test_label)
    predict = []
    count = 0
    for j in range(len(KF.res[i].test_set)):
        predict.append(DT.predict_val(KF.res[i].test_set[j], my_tree))
    # print("predicted labels:", predict)
    # Score this fold; F is presumably appended to f_1 past this excerpt
    a, p, r, F = PE.evaluate(predict, test_label)
    accuracy.append(a)
    precision.append(p)
    recall.append(r)
# Interactive CLI: train a decision tree from data.csv, then prompt the user
# for one value per attribute and print the predicted class.
# NOTE(review): build_tree/print_tree/classify/print_leaf are presumably
# imported earlier in the file, outside this excerpt.
from decision_tree import get_header
from decision_tree import set_header
from decision_tree import get_unique_values
import csv

training_data = []
with open('data.csv', encoding="utf8") as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        new_row = []
        # NOTE(review): row[0] is split on ',' again — looks like a workaround
        # for rows arriving as a single quoted field; verify against data.csv
        for item in row[0].split(','):
            new_row.append(item)
        training_data.append(new_row)
my_tree = build_tree(training_data)
print_tree(my_tree)
print()
# Prompt for every attribute except the class column
testing_data = []
for i in range(len(get_header()) - 1):
    ask = 'Введіть ' + str(get_header()[i]) + str(
        get_unique_values(training_data, i)) + ': '
    user_input = input(ask)
    testing_data.append(user_input)
print("Передбачено: %s" % (print_leaf(classify(testing_data, my_tree))))
# Keep the console window open until the user presses Enter
input()
def main():
    """Train a tree from the bundled CSV and classify one hard-coded sample."""
    headers, data_set = read_dataset("../csv_data/data_set.csv")
    my_tree = build_tree(data_set, headers)
    sample = [6.44, 21.0, 65.22, 1431.0, 19.0, 99.0]
    print(print_leaf(classify(sample, my_tree)))
# Train a chi-square-pruned decision tree on the UCI Adult (census income)
# dataset using only its categorical columns, then report size/depth and
# test-set accuracy.
from decision_tree import group_by_fn
from scipy import stats
import pandas as pd

income_train = pd.read_csv('../dataset/adult.data',
                           header=None).values.tolist()
income_test = pd.read_csv('../dataset/adult.test',
                          header=None).values.tolist()


# categorial data — one named accessor per categorical column
# (named functions rather than lambdas; presumably the tree display uses
# fn.__name__ — verify before replacing these)
def workclass(x):
    return x[1]


def edu(x):
    return x[3]


def marital_status(x):
    return x[4]


def occupation(x):
    return x[5]


def relationship(x):
    return x[6]


def race(x):
    return x[7]


def sex(x):
    return x[8]


def native_country(x):
    return x[12]


# the class function: column 14 holds the income bracket label
def income(x):
    return x[14]


attrfns = [workclass, edu, marital_status, occupation, relationship, race,
           sex, native_country]
# build_tree / chi_split / count_nodes / depth / accuracy come from
# imports outside this excerpt. Note the labels keep their leading space,
# matching the raw dataset values.
root = build_tree(income_train, attrfns, income, (' <=50K', ' >50K'),
                  chi_split)
print("Created decision tree with {0} nodes, depth {1}".format(
    count_nodes(root), depth(root)))
print(accuracy(root, income_test, income))
def test_rival_puts_i_win(set_trump):
    """Rival leads holding clubs-9 and clubs-6 against my clubs-10:
    estimated as a win for me (1.0)."""
    set_trump('hearts')
    root = build_tree([Card('clubs', 10)],
                      [Card('clubs', 9), Card('clubs', 6)], False)
    assert node_estimate(root) == 1.0
# Train a chi-square-pruned decision tree on the Statlog Australian credit
# approval dataset using its categorical columns, then report size/depth
# and test-set accuracy.
from decision_tree import depth
from decision_tree import gain
from decision_tree import group_by_fn
import pandas as pd

# https://archive.ics.uci.edu/ml/datasets/Statlog+(Australian+Credit+Approval)
aust_data = pd.read_csv('../dataset/australian.dat', header=None,
                        delimiter=' ')
# Shuffle the rows so the sequential train/test split below is random
shuffled_data = aust_data.sample(frac=1).reset_index(drop=True)


# categorial data — one named accessor per categorical column
# (named functions rather than lambdas; presumably the tree display uses
# fn.__name__ — verify before replacing these)
def zero(x):
    return x[0]


def three(x):
    return x[3]


def four(x):
    return x[4]


def five(x):
    return x[5]


def seven(x):
    return x[7]


def eight(x):
    return x[8]


def ten(x):
    return x[10]


def eleven(x):
    return x[11]


attrfns = [zero, three, four, five, seven, eight, ten, eleven]


# the class function: column 14 holds the 0/1 approval label
def classfn(x):
    return int(x[14])


# 414-row training set, remainder held out for testing
x = shuffled_data[:414].values.tolist()
test_data = shuffled_data[414:].values.tolist()
# build_tree / chi_split / count_nodes / accuracy come from imports
# outside this excerpt
root = build_tree(x, attrfns, classfn, (0, 1), chi_split)
print("Created decision tree with {0} nodes, depth {1}".format(
    count_nodes(root), depth(root)))
print(accuracy(root, test_data, classfn))
return x[6] def eight(x): return x[8] def ninth(x): return x[9] def eleventh(x): return x[11] def twelveth(x): return x[12] cc_att_fns = [ zeroth, third, fourth, fifth, sixth, eight, ninth, eleventh, twelveth ] x = shuffled_data[:414].values.tolist() test_data = shuffled_data[414:].values.tolist() root = build_tree(x, cc_att_fns, cc_class, ('+', '-'), chi_split) print("Created decision tree with {0} nodes, depth {1}".format( count_nodes(root), depth(root))) print(accuracy(root, test_data, cc_class))
feature_.pop('task_duration') feature_.pop('read_from_hdfs') feature_.pop('records_read') feature_.pop('input_bytes/result_bytes') feature_.pop('shuffle_read') feature_.pop('bytes_per_record') feature_.pop('remote_fetch') feature_.pop('shuffle_write') feature_.pop('write_bytes_per_record') feature_.pop('write_bytes/read_bytes') labels.append(label) row = [] for key in feature_: if flag_key: keys.append(key) row.append(feature_[key]) flag_key = False dataset.append(row) accuracy, precision, recall = decision_tree.build_tree( dataset, labels, keys) print('accuracy,precision,recall=', accuracy, precision, recall) exit() # clean dataset feature_values = {} for key in dataset[0][0]: feature_values[key] = [] for piece in dataset: piece = piece[0] feature_values.append(piece[key])