# Bagged-trees experiment: for each of `num_run` runs, draw 1000 training rows
# without replacement, fit `T` ID3 trees on bootstrap resamples of that draw,
# and accumulate every tree's +1 / -1 vote on the test set.
label = {'y': ['yes', 'no']}
num_run = 100
T = 1000

# test_py[r] is the per-test-row sum of the votes of all T trees of run r.
test_py = np.zeros((num_run, test_size), dtype=int)
# test_py_first sums, across runs, only the vote of each run's first tree.
test_py_first = np.zeros(test_size, dtype=int)

# Numeric encoding of the two class labels; any other predicted value raises
# KeyError instead of being converted silently.
vote_of = {'yes': 1, 'no': -1}

# The run index is deliberately not called `iter`, so the builtin iter()
# remains usable in the rest of the script.
for run in range(num_run):
    # A fresh 1000-row subset per run, seeded by the run index.
    train_subset = train_data.sample(n=1000, replace=False, random_state=run)

    for t in range(T):
        print('iter: ', run, 't: ', t)

        # Bootstrap resample (with replacement) of 1% of the subset,
        # seeded by the tree index.
        sampled = train_subset.sample(frac=0.01, replace=True, random_state=t)

        # ID3 learner and the tree it builds from the resample.
        dt_generator = dt.ID3(feature_selection=0, max_depth=17, subset=6)
        decision_tree = dt_generator.generate_decision_tree(
            sampled, features, label)

        # Predict on the test set and encode the labels as +1 / -1 votes.
        py = dt_generator.classify(decision_tree, test_data)
        py = np.array([vote_of[p] for p in py.tolist()], dtype=int)

        test_py[run] += py
        if t == 0:
            test_py_first = test_py_first + py

# Raw test-set labels taken from the 'y' column.
true_value = np.array(test_data['y'].tolist())
'maint': ['vhigh', 'high', 'med', 'low'], 'doors': ['2', '3', '4', '5more'], 'persons': ['2', '4', 'more'], 'lug_boot': ['small', 'med', 'big'], 'safety': ['low', 'med', 'high'] } label = {'label': ['unacc', 'acc', 'good', 'vgood']} train_acc = [[0 for x in range(6)] for y in range(3)] test_acc = [[0 for x in range(6)] for y in range(3)] for feature_selection in range(3): for max_depth in range(6): # ID3 dt_generator = dt.ID3(feature_selection=feature_selection, max_depth=max_depth + 1) # get decision tree decision_tree = dt_generator.generate_decision_tree( train_data, features, label) # train acc # predict train_data['plabel'] = dt_generator.classify(decision_tree, train_data) train_acc[feature_selection][max_depth] = train_data.apply( lambda row: 1 if row['label'] == row['plabel'] else 0, axis=1).sum() / train_size # test acc # predict test_data['plabel'] = dt_generator.classify(decision_tree, test_data) test_acc[feature_selection][max_depth] = test_data.apply( lambda row: 1 if row['label'] == row['plabel'] else 0, axis=1).sum() / test_size