def test_regression_line_housing_no_libs(): """ Testing 2 variable solution for HW1 prob 2 """ print('Testing linear regression with 2 columns') test, train = utils.load_and_normalize_housing_set() print str(len(train)) + " # in training set <--> # in test " + str(len(test)) columns = train.columns[:-1] Y_fit = mystats.linear_regression_points(train[columns[0]], train['MEDV']) #for i, col in enumerate(columns): print 'Y_fit' print Y_fit for i in range(0, len(Y_fit)): print str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i]) print train[columns[0]] #myplot.points([train[columns[0]], train['MEDV']]) #myplot.points([train[columns[0]], list(Y_fit[0])]) myplot.fit_v_point([train[columns[0]], train['MEDV'], list(Y_fit[0] + Y_fit[-1])]) col_MSE = {} print columns[0] i = 0 col = 'CRIM' col_fit = Y_fit[i] + Y_fit[-1] col_MSE[col] = mystats.compute_MSE_arrays(col_fit, train['MEDV']) print col_MSE
def testHW2(): # Success test, train = utils.load_and_normalize_housing_set() df_train = pd.DataFrame(train) df_test = pd.DataFrame(test) print df_train.head(10) #raw_input() print hw2.linear_gd(df_train, df_test, 'MEDV')
def regression_line_housing_no_libs(): """ Solution for HW1 prob 2 """ print('Homework 1 problem 2 - No Libraries - Regression Line') print('Housing Dataset') test, train = utils.load_and_normalize_housing_set() print str(len(train)) + " # in training set <--> # in test " + str(len(test)) columns = train.columns[:-1] Y_fit = mystats.linear_regression_points(train[columns], train['MEDV']) print 'Y_fit' print Y_fit #for i in range(0, len(Y_fit)): # print str(Y_fit[i]) + ' -- ' + str(train['MEDV'][i]) row_sums = np.zeros(len(Y_fit[0])) for col in Y_fit: for i in range(0, len(col)): row_sums[i] += col[i] print row_sums col_MSE = {} for i, col in enumerate(columns): col_fit = row_sums[i] # Y_fit[i] + Y_fit[-1] col_MSE[col] = mystats.compute_MSE(col_fit, train['MEDV']) print col_MSE RMSE = np.sqrt(col_MSE.values()) average_MSE = utils.average(col_MSE.values()) average_RMSE = utils.average(RMSE) print 'Average MSE: ' + str(average_MSE) print 'Average RMSE: ' + str(average_RMSE)
def testHW2_subset(): # Success test, train = utils.load_and_normalize_housing_set() df_full = pd.DataFrame(train) df_test = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=10) df_train = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=10) dfX_test = pd.DataFrame([df_test['CRIM'], df_test['TAX'], df_test['MEDV']]).transpose() dfX_train = pd.DataFrame([df_train['CRIM'], df_train['TAX'], df_train['MEDV']]).transpose() print hw2.linear_gd(dfX_train, dfX_test, 'MEDV')
def testHW2_allcols(): # Fail test, train = utils.load_and_normalize_housing_set() df_full = pd.DataFrame(train) cols = [col for col in df_full.columns if col != 'MEDV'] df_test = utils.train_subset(df_full, cols, n=10) df_train = utils.train_subset(df_full, cols, n=10) #dfX_test = pd.DataFrame([df_test['CRIM'], df_test['TAX'], df_test['MEDV']]).transpose() #dfX_train = pd.DataFrame([df_train['CRIM'], df_train['TAX'], df_train['MEDV']]).transpose() print hw2.linear_gd(df_train, df_test, 'MEDV')
def testScale():
    """Scale random weights into the TAX column's value range and plot them.

    Draws one random weight per TAX row, rescales the weights into
    [min(TAX), max(TAX)] via utils.scale, and plots weights vs. MEDV
    with the scaled values overlaid.
    """
    test, train = utils.load_and_normalize_housing_set()
    df_full = pd.DataFrame(train)
    df = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=10)
    # FIX: replaced the manual range()/append loop with a list comprehension;
    # same number of random.random() draws in the same order.
    w = [random.random() for _ in range(len(df['TAX']))]
    scaled = utils.scale(w, min(df['TAX']), max(df['TAX']))
    # NOTE(review): this calls plot.fit_v_point while sibling tests use
    # myplot.fit_v_point -- confirm `plot` is the intended module.
    plot.fit_v_point([w, df['MEDV'], scaled])
def regression_housing_set(): """ Solution for HW1 prob 1 """ print('Homework 1 problem 1 - Regression Decision tree') print('Housing Dataset') test, train = utils.load_and_normalize_housing_set() dt_reg = train_regression_tree(train) predicted = test_regression_tree(dt_reg, test) error = mystats.calculate_chisq_error(predicted, test['MEDV']) print 'Error: ' + str(error)
def testGradientByColumn(): test, train = utils.load_and_normalize_housing_set() blacklist = ['NOX', 'RM'] df_full = pd.DataFrame(train) for i in range(2, len(df_full.columns) - 1): cols = [] for j in range(1, i): if df_full.columns[j] not in blacklist: cols.append(df_full.columns[j]) cols.append('MEDV') print cols raw_input() testGradient_by_columns(df_full, cols)
def q7(): h_test, h_train = utils.load_and_normalize_housing_set() housingData_test = hw3.pandas_to_data(h_test) housingData_train = hw3.pandas_to_data(h_train) y, X = hw4.split_truth_from_data(housingData_train) y_test, X_test = hw4.split_truth_from_data(housingData_test) #gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=1, max_depth=1) gb = gradb.GradientBoostRegressor(learning_rate=.1, n_estimators=100, max_depth=1, learner=lambda: DecisionTreeRegressor(max_depth=1)) gb.fit(X, y) gb.print_stats() yhat = gb.predict(X_test) print y_test[:10] print yhat[:10] print 'MSE: {}'.format(hw4.compute_mse(y_test, yhat))
def do2A(): """ HW 2A Train linear regression using gradient descent on spambase and housing data """ print('HW2 A. Gradient descent with housing and spam data sets') num_iters = 50 learning_param = 0.25 housingData_test, housingData_train = utils.load_and_normalize_housing_set() theta, error_matrix = gradient_descent(housingData_test, 'MEDV', num_iters, learning_param) print('Errors for housing set') print error_matrix print('theta for housing set') print theta
def q_1():
    """Compare models on the spam set (housing runs currently disabled).

    Previously-recorded housing results, kept for reference:
      dec_or_reg_tree:    MSE 568 (train) / 448 (test)
      linear_reg_errors:  MSE 27 / 14 (ridge variant: 24176 / 68289)
      linear_gd:          .0022 / .0013 (works but MSE suspiciously low)
      logistic_gd:        1.46e+13 / 1.17e+13
    """
    h_test, h_train = utils.load_and_normalize_housing_set()
    h_results = []
    s_results = []
    s_test, s_train = utils.split_test_and_train(utils.load_and_normalize_spam_data())
    s_results.append(dec_or_reg_tree(s_train, s_test, "is_spam"))          # works .845 - .86
    s_results.append(linear_reg_errors(s_train, s_test, "is_spam"))        # works .8609 - .903
    s_results.append(linear_reg_errors(s_train, s_test, "is_spam", True))  # works .8416 - .8543
    s_results.append(k_folds_linear_gd(s_train, s_test, "is_spam"))        # does not work .6114 - .6114
    s_results.append(logistic_gd(s_train, s_test, "is_spam"))              # returns perfect... 1 - 1
    print_results_1(s_results, h_results)
def testGradient(): # Great success with subset test, train = utils.load_and_normalize_housing_set() df_full = pd.DataFrame(train) subset_size = 100 df = utils.train_subset(df_full, ['CRIM', 'TAX', 'B', 'MEDV'], n=subset_size) dfX = pd.DataFrame([df['CRIM'], df['TAX']]).transpose() print len(dfX) print dfX #raw_input() fit = gd.gradient(dfX, df['MEDV'].head(subset_size), .5, max_iterations=300) print 'read v fit' print len(dfX) print df['MEDV'].head(10) print fit data = gd.add_col(gd.pandas_to_data(dfX), 1) print np.dot(data, fit)
def decision_housing_set_no_libs():
    """ Solution for HW1 prob 1 """
    # Trains a regression decision tree on the housing data without external
    # ML libraries and prints the trained tree plus its training MSE.
    # NOTE(review): the sys.exit() below makes the entire test-set evaluation
    # that follows unreachable -- looks like a debugging leftover; confirm.
    print('Homework 1 problem 1 - No Libraries - Regression Decision tree')
    print('Housing Dataset')
    test, train = utils.load_and_normalize_housing_set()
    # The following 2 lines are for debugging
    #train = utils.train_subset(train, ['ZN','CRIM', 'TAX', 'DIS', 'MEDV'], n=50)
    #test = utils.train_subset(test, ['ZN', 'CRIM', 'TAX', 'DIS', 'MEDV'], n=3)
    print str(len(train)) + " # in training set <--> # in test " + str(len(test))
    # Root node starts with every training row marked present.
    node = mytree.Node(np.ones(len(train)))
    # Grow the tree (depth argument 2) predicting 'MEDV' in regression mode.
    branch_node(node, train, 2, 'MEDV', regression=True)
    #node.show_children_tree()
    node.show_children_tree(follow=False)
    model = mytree.Tree(node)
    model.print_leaves()
    model.print_tree(train)
    print 'Trained model error is : ' + str(model.error())
    train_prediction = model.predict_obj()
    print 'Training MSE is: ' + str(mystats.compute_MSE_arrays(train_prediction, train['MEDV']))
    sys.exit()
    # ---- unreachable below this point: test-set evaluation ----
    # Re-mark presence for the test rows and rerun the tree on the test split.
    node.presence = np.ones(len(test))
    test_node(node, test, 'MEDV', regression=True)
    test_tree = mytree.Tree(node)
    prediction = test_tree.predict_obj()
    #raw_input()
    print 'predict sum: ' + str(sum(prediction))
    test_tree.print_leaves_test()
    print 'ERROR: ' + str(test_tree.error_test())
    print prediction
    print 'train'
    print train['MEDV']
    print 'test'
    print test['MEDV']
    MSE = mystats.compute_MSE_arrays(prediction, test['MEDV'])
    print 'MSE: ' + str(MSE)
    print 'RMSE: ' + str(np.sqrt(MSE))
    test_tree.print_tree(test, long=False)
def testLinRidge(): h_test, h_train = utils.load_and_normalize_housing_set() #print hw2.linear_reg_errors(h_train, h_test, 'MEDV', True) print hw2.linear_reg(h_train, 'MEDV', False, False)
def do2B():
    """HW 2B: logistic regression on the housing data with 'MEDV' as target."""
    housing_test, housing_train = utils.load_and_normalize_housing_set()
    logistic_regression(housing_train, housing_test, 'MEDV')