def main():
    """Select a tree depth by cross validation, then train and evaluate.

    Steps, in order:
      1. Split the ratings file with ``prank_data_split`` (second argument
         0.2 -- presumably the test fraction; confirm against that helper).
      2. Choose ``depth`` with ``kfoldcv`` over candidate depths 1..11
         (third argument 5 -- presumably the fold count; confirm).
      3. Build the final matrices with ``generate_matrix``, fit
         ``decision_tree`` with the chosen depth, and predict on the test
         split.
      4. Print RMSE, MAE, specificity, sensitivity, precision and accuracy.
    """
    ratings_path = '../dataset/ratings.csv'
    movies_path = '../dataset/movies.csv'

    X_train, X_test, y_train, y_test = prank_data_split(ratings_path, 0.2)

    # Cross validation.
    depth_array = np.arange(1, 12)
    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` is the documented replacement and exists in every release.
    depth = kfoldcv(
        X_train.values, y_train.values, 5, depth_array, movies_path)

    # Prediction.
    print('predict using decision tree with max depth', depth, ':')
    X_train, X_test, y_train, y_test = generate_matrix(
        X_train, X_test, y_train, y_test, movies_path)
    regr = decision_tree(X_train, y_train, depth)
    y_predicted = predict(regr, X_test)

    rmse = get_RMSE(y_test, y_predicted)
    print('rmse:', rmse)
    mae = get_MAE(y_test, y_predicted)
    print('mae:', mae)

    # Specificity, sensitivity, precision, accuracy.
    spec, sens, prec, accu = get_spec_sens_prec_accu(y_test, y_predicted)
    print('spec:', spec)
    print('sens:', sens)
    print('prec:', prec)
    print('accu:', accu)
def performing_algorithm(X, y, X_test):
    """
    Run the algorithm named by ``args.algorithm`` and return its result.

    :param X: Matrix
    :param y: Matrix
    :param X_test: Matrix
    :return: Prediction of chosen algorithm; ``None`` when the name is not
        one of "linear_regression", "decision_tree" or "SVM"
    """
    chosen = args.algorithm

    if chosen == "linear_regression":
        return linear_regression(X, y, X_test)
    if chosen == "decision_tree":
        return decision_tree(X, y, X_test)
    if chosen == "SVM":
        return SVM(X, y, X_test)

    # Unrecognised names fall through without a prediction.
    return None
def get_best_attribute(attributes, examples):
    """Pick the attribute with the highest information gain.

    Each attribute is scored with ``get_attribute_info_gain(a, examples)``,
    whose first returned value is taken as the gain. On ties the attribute
    seen first wins.

    NOTE: an earlier variant fanned the scoring out over
    ``parallel.Workers``; it was disabled and the scoring is sequential.

    :param attributes: iterable of candidate attributes
    :param examples: examples forwarded to ``get_attribute_info_gain`` and
        ``dt_util.get_goal_counts``
    :return: tuple ``(best_attr, node)`` where ``node`` is
        ``decision_tree(best_attr, goal_counts, dt_util.GOAL_INDEX)``;
        ``best_attr`` is ``None`` when ``attributes`` is empty
    """
    best_gain = float("-inf")
    best_attr = None
    for a in attributes:
        gain, _ = get_attribute_info_gain(a, examples)
        # Must be ">": starting from -inf, a "<" test can never succeed and
        # would leave best_attr as None for every input.
        if gain > best_gain:
            best_gain = gain
            best_attr = a
    return best_attr, decision_tree(
        best_attr, dt_util.get_goal_counts(examples), dt_util.GOAL_INDEX)
def __init__(self, data, labels, num_trees, weights=None, randomized=False):
    """Train ``num_trees`` decision trees on resampled copies of the data.

    For every tree, a training set with as many rows as ``data`` is drawn
    row by row via ``self.sample_index(weights)``, and a ``decision_tree``
    is then fitted on it.

    :param data: 2-D array; ``data.shape`` gives (observations, features)
    :param labels: labels indexed by the same row index as ``data``
    :param num_trees: number of trees to train
    :param weights: sampling weights handed to ``self.sample_index``;
        defaults to a uniform ``(num_obs, 1)`` column of ``1 / num_obs``
        (can be overridden, e.g. for boosting)
    :param randomized: forwarded unchanged to ``decision_tree``
    """
    self.data = data
    self.labels = labels
    self.trees = []

    num_obs = data.shape[0]
    num_features = data.shape[1]

    # Uniformly weight data as default (can be modified for boosting).
    # Use an identity test: "weights == None" on a NumPy array compares
    # elementwise, and the resulting array has no unambiguous truth value.
    if weights is None:
        weights = 1.0 / num_obs * numpy.ones((num_obs, 1))

    # Split into num_trees training sets with data sampled with replacement
    # according to the weighting scheme; each set is the same size as the
    # original training set.
    data_sets = numpy.zeros((num_trees, num_obs, num_features))
    label_sets = numpy.zeros((num_trees, num_obs, 1))
    for i in range(num_trees):
        for j in range(num_obs):
            sampled_obs_index = self.sample_index(weights)
            data_sets[i, j] = data[sampled_obs_index, :]
            label_sets[i, j] = labels[sampled_obs_index]

    # Train num_trees decision trees, one per resampled set.
    for i in range(num_trees):
        self.trees.append(
            decision_tree(data_sets[i, :, :], label_sets[i, :], randomized))
[90, 1, 0, 3.5], # FIT [75, 1, 1, 3.1], # FIT [85, 2, 1, 3.1], # NOT_FIT [65, 0, 1, 2.1], # NOT_FIT [70, 1, 0, 3.0] ]) # NOT_FIT # класс для каждой кандидатки: Y = np.array( [FIT, FIT, FIT, FIT, NOT_FIT, FIT, FIT, NOT_FIT, NOT_FIT, NOT_FIT]) # типы переменных в столбцах обучающей выборки scale = np.array([NUMERICAL, CATEGORICAL, CATEGORICAL, NUMERICAL]) # рекурсивное построение дерева решений decision_tree(X, Y, scale) # классификация каждого примера с помощью # классификатора, созданного на основе дерева y = np.array([clf.classify(X[i, :]) for i in range(len(X))]) # классификация успешна, если все примеры правильно классифицированы if np.all(y == Y): print('\nclassification success!\n') else: print('\nclassification fail... :(\n') # проверка себя с помощью классификатора # TODO: после того, как вы построили дерево решений и реализовали на его # основе функцию classify, раскомментируйте код ниже и проверьте себя, # подходите ли вы на роль ассистентки профессора Буковски :)
from decision_tree import *
from numpy import array
import scipy.io as sio


def max_depth(node):
    """Return the largest ``depth`` found on any leaf at or below *node*."""
    if node.is_leaf():
        return node.depth
    return max(max_depth(node.left_child), max_depth(node.right_child))


def _count_correct(tree, features, labels, indices):
    """Count rows in *indices* where ``tree.classify`` equals ``labels[i][0]``."""
    correct = 0
    for i in indices:
        correct += tree.classify(features[i, :]) == labels[i][0]
    return correct


# Every print below passes one pre-joined string so the output is identical
# under Python 2 (where the original print statements ran) and Python 3
# (where a print statement is a SyntaxError). The doubled space after "="
# reproduces the separator the old "print a, b" form inserted.

all_data = sio.loadmat('spam.mat')

# Hold-out run: fit on rows 0..3199, score on rows 3200..3449 (250 rows).
dt = decision_tree(all_data['Xtrain'][0:3200, :], all_data['ytrain'][0:3200], True)
score = _count_correct(dt, all_data['Xtrain'], all_data['ytrain'], range(3200, 3450))
print('score = ' + str(score))
print('error =  ' + str(1 - float(score) / 250))
print('max depth =  ' + str(max_depth(dt.root)))

# Full run: fit on every row, then score on rows 0..3449 of the same data,
# so this "error" is measured on training rows.
# NOTE(review): assumes Xtrain has at least 3450 rows -- confirm.
dt = decision_tree(all_data['Xtrain'], all_data['ytrain'], True)
score = _count_correct(dt, all_data['Xtrain'], all_data['ytrain'], range(3450))
print('score = ' + str(score))
print('error =  ' + str(1 - (float(score) / 3450)))
print('max depth =  ' + str(max_depth(dt.root)))

#dt = decision_tree(all_data['Xtrain'], all_data['ytrain'])