def decision_spambase_set_no_libs():
    """Train and evaluate the hand-rolled decision tree on Spambase (HW1 prob 1).

    Loads the normalized spam data, splits it into a test and a training
    frame, grows a tree on the training frame with ``branch_node`` and prints
    the tree, its leaves and the training error.  The same node structure is
    then re-populated with the test frame through ``test_node`` and the
    resulting predictions are scored against the ``is_spam`` column
    (confusion counts, accuracy and MSE).

    Everything is reported on stdout; nothing is returned.
    """
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Spambase Dataset')

    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))

    # ---- training phase ----
    # The root node starts with a presence vector of ones, one per training row.
    node = mytree.Node(np.ones(len(train)))
    # NOTE(review): the literal 5 is handed straight to branch_node; presumably
    # a depth / stopping limit -- confirm against branch_node's signature.
    branch_node(node, train, 5, 'is_spam')
    node.show_children_tree(follow=False)

    model = mytree.Tree(node)
    model.print_leaves()
    print('Trained model error is : ' + str(model.error()))

    # ---- test phase ----
    # The trained nodes are reused: the root's presence vector is reset to
    # match the number of test rows before test_node is run over the tree.
    node.presence = np.ones(len(test))
    test_node(node, test, 'is_spam')
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    test_tree.print_leaves_test()
    print('predict sum: ' + str(sum(prediction)))
    print('MSE:' + str(test_tree.error_test()))

    # `.values` yields the same ndarray that `.as_matrix()` did, but unlike
    # `.as_matrix()` it still exists in pandas >= 1.0.
    actual = test['is_spam'].values
    [tp, tn, fp, fn] = mystats.get_performance_stats(actual, prediction)
    print('TP: {}\tFP: {}\nTN: {}\tFN: {}'.format(tp, fp, tn, fn))
    print('Accuracy: ' + str(mystats.compute_accuracy(tp, tn, fp, fn)))
    print('MSE: ' + str(mystats.compute_MSE_arrays(prediction, test['is_spam'])))
def regression_line_spam_no_libs():
    """Fit the hand-rolled linear regression on Spambase (HW1 prob 2).

    Loads the normalized spam data, splits it into a test and a training
    frame and calls ``mystats.linear_regression_points`` on every training
    column except the last one, with ``is_spam`` as the target.  For each
    feature column an MSE against ``is_spam`` is computed and stored in a
    dict keyed by column name; the dict and the average MSE / RMSE over all
    columns are printed.

    Only the training frame is used here.  Nothing is returned.
    """
    print('Homework 1 problem 2 - No Libraries - Regression Line')
    print('Spam Dataset')

    spam_data = utils.load_and_normalize_spam_data()
    test, train = utils.split_test_and_train(spam_data)

    # Every column but the last is treated as a feature.
    columns = train.columns[:-1]
    Y_fit = mystats.linear_regression_points(train[columns], train['is_spam'])

    col_MSE = {}
    for i, col in enumerate(columns):
        # NOTE(review): the i-th entry of Y_fit is combined with the last
        # entry; presumably per-column term + intercept term -- confirm
        # against mystats.linear_regression_points.
        col_fit = Y_fit[i] + Y_fit[-1]
        col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['is_spam'])
    # NOTE(review): the dict is printed once after the loop; the original
    # line layout was lost, so confirm it was not meant to print per column.
    print(col_MSE)

    # Materialise the values once: on Python 3 dict.values() is a view that
    # np.sqrt cannot convert to a numeric array, whereas a list works on
    # both Python 2 and Python 3.
    mse_values = list(col_MSE.values())
    RMSE = np.sqrt(mse_values)
    average_MSE = utils.average(mse_values)
    average_RMSE = utils.average(RMSE)
    print('Average MSE: ' + str(average_MSE))
    print('Average RMSE: ' + str(average_RMSE))
def test_regression_line_housing_no_libs():
    """Single-feature check of the hand-rolled linear regression (HW1 prob 2).

    Loads the normalized housing test / training frames, fits
    ``mystats.linear_regression_points`` on the first training column only
    with ``MEDV`` as the target, dumps the fit next to the target values,
    plots the fit through ``myplot.fit_v_point`` and prints the MSE of the
    fit against ``MEDV`` keyed by the name of the fitted column.

    Output goes to stdout and the plot; nothing is returned.
    """
    print('Testing linear regression with 2 columns')
    test, train = utils.load_and_normalize_housing_set()
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))

    columns = train.columns[:-1]
    # The one feature column used throughout this check.
    col = columns[0]
    Y_fit = mystats.linear_regression_points(train[col], train['MEDV'])

    print('Y_fit')
    print(Y_fit)
    # Debug dump: i-th entry of the fit next to the i-th MEDV value.
    # Indexing is kept (rather than iterating Y_fit) so that whatever
    # container linear_regression_points returns is accessed as before.
    for i in range(0, len(Y_fit)):
        print(str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i]))
    print(train[col])

    # NOTE(review): first entry of Y_fit combined with the last entry;
    # presumably feature term + intercept term -- confirm against
    # mystats.linear_regression_points.  Computed once and shared by the
    # plot and the MSE below (the original evaluated it twice).
    col_fit = Y_fit[0] + Y_fit[-1]
    myplot.fit_v_point([train[col], train['MEDV'], list(col_fit)])

    col_MSE = {}
    print(col)
    # Key the result by the column that was actually fitted instead of the
    # previously hard-coded 'CRIM' label, so the label cannot drift from
    # the data if the column order changes.
    col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['MEDV'])
    print(col_MSE)
def decision_housing_set_no_libs():
    """Train and evaluate the hand-rolled regression tree on the housing data (HW1 prob 1).

    Loads the normalized housing test / training frames, grows a regression
    tree on the training frame with ``branch_node`` (target ``MEDV``) and
    prints the tree, its leaves, the model error and the training MSE.  The
    same node structure is then re-populated with the test frame through
    ``test_node`` and the predictions are scored against ``MEDV`` (MSE and
    RMSE).

    Everything is reported on stdout; nothing is returned.
    """
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()

    # The following 2 lines are for debugging
    #train = utils.train_subset(train, ['ZN','CRIM', 'TAX', 'DIS', 'MEDV'], n=50)
    #test = utils.train_subset(test, ['ZN', 'CRIM', 'TAX', 'DIS', 'MEDV'], n=3)
    print(str(len(train)) + " # in training set <--> # in test " + str(len(test)))

    # ---- training phase ----
    # The root node starts with a presence vector of ones, one per training row.
    node = mytree.Node(np.ones(len(train)))
    # NOTE(review): the literal 2 is handed straight to branch_node; presumably
    # a depth / stopping limit -- confirm against branch_node's signature.
    branch_node(node, train, 2, 'MEDV', regression=True)
    node.show_children_tree(follow=False)

    model = mytree.Tree(node)
    model.print_leaves()
    model.print_tree(train)
    print('Trained model error is : ' + str(model.error()))
    train_prediction = model.predict_obj()
    print('Training MSE is: ' + str(mystats.compute_MSE_arrays(train_prediction, train['MEDV'])))

    # A leftover `sys.exit()` used to sit here.  It terminated the whole
    # interpreter and made the test-set evaluation below unreachable; it has
    # been removed so the function completes like its spambase counterpart.

    # ---- test phase ----
    # The trained nodes are reused: the root's presence vector is reset to
    # match the number of test rows before test_node is run over the tree.
    node.presence = np.ones(len(test))
    test_node(node, test, 'MEDV', regression=True)
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    print('predict sum: ' + str(sum(prediction)))
    test_tree.print_leaves_test()
    print('ERROR: ' + str(test_tree.error_test()))

    print(prediction)
    print('train')
    print(train['MEDV'])
    print('test')
    print(test['MEDV'])

    MSE = mystats.compute_MSE_arrays(prediction, test['MEDV'])
    print('MSE: ' + str(MSE))
    print('RMSE: ' + str(np.sqrt(MSE)))
    test_tree.print_tree(test, long=False)