def select_polynomial_degree(train_data, val_data, max_degree=15,
                             sales_col='sqft_living', target='price'):
    """Fit one linear model per polynomial degree and score it on validation data.

    For every degree from 1 to ``max_degree`` (inclusive) a frame is built
    with ``reg.polynomial_sframe`` from ``train_data[sales_col]``, a model is
    created with ``gp.create_linear_regression`` and its residual sum of
    squares (RSS) is computed with ``reg.get_model_residual_sum_of_squares``
    on the frame built the same way from ``val_data``.

    Args:
        train_data: Column-indexable table used to fit the models.
        val_data: Column-indexable table used to score the models.
        max_degree: Highest polynomial degree tried. Defaults to 15, the
            range that was previously hard-coded.
        sales_col: Name of the input column expanded into polynomial
            features. Defaults to ``'sqft_living'``.
        target: Name of the column to predict. Defaults to ``'price'``.

    Returns:
        A tuple ``(best_degree, model_by_degree)``. ``best_degree`` is the
        value returned by ``gp.find_key_min`` for the ``{degree: rss}``
        mapping (presumably the degree with the lowest validation RSS --
        confirm against that helper). ``model_by_degree`` maps every degree
        tried to its fitted model.

    Raises:
        ValueError: If ``max_degree`` is smaller than 1, because no model
            would be fitted and there would be nothing to select from.
    """
    if max_degree < 1:
        raise ValueError("max_degree must be >= 1, got %r" % (max_degree,))

    model_by_degree = {}
    rss_all = {}
    for degree in range(1, max_degree + 1):
        data_n_train = reg.polynomial_sframe(train_data[sales_col], degree)
        # Read the feature names before the target column is attached so the
        # target is never passed to the model as one of its own features.
        features_names = data_n_train.column_names()
        data_n_train[target] = train_data[target]
        model_n = gp.create_linear_regression(
            data_n_train, target=target, features=features_names)

        data_n_val = reg.polynomial_sframe(val_data[sales_col], degree)
        data_n_val[target] = val_data[target]
        rss_all[degree] = reg.get_model_residual_sum_of_squares(
            model_n, data_n_val, data_n_val[target])
        model_by_degree[degree] = model_n

    return gp.find_key_min(rss_all), model_by_degree
def create_regression_model_by_degree(sales, list_of_degrees, sales_col='sqft_living', target='price'):
    """Fit a linear regression for each requested polynomial degree.

    Args:
        sales: Column-indexable table holding ``sales_col`` and ``target``.
        list_of_degrees: Iterable of degrees; one model is fitted per entry.
        sales_col: Name of the column passed to ``reg.polynomial_sframe``.
        target: Name of the column the models predict.

    Returns:
        A dict keyed by ``'power_<degree>'``. Each value is a dict with the
        keys ``'model'`` (the object returned by
        ``gp.create_linear_regression``), ``'coefficients'`` (the result of
        ``model.get("coefficients")``) and ``'data'`` (the polynomial frame
        with the target column attached).
    """
    results = {}
    for degree in list_of_degrees:
        frame = reg.polynomial_sframe(sales[sales_col], degree)
        # Feature names are taken before the target column is added, so the
        # target is not listed among the features.
        feature_cols = frame.column_names()
        frame[target] = sales[target]
        fitted = gp.create_linear_regression(frame, target=target, features=feature_cols)

        key = 'power_%s' % degree
        results[key] = {
            'model': fitted,
            'coefficients': fitted.get("coefficients"),
            'data': frame,
        }
    return results
def quiz_1_selecting_l2_penalty(sales):
    """Print the answers to quiz questions Q6 and Q7 (L2 penalty selection).

    Splits ``sales`` 90/10 (seed 1) into a train+validation part and a test
    part, obtains a ``{l2_penalty: average RSS}`` mapping from
    ``selecting_l2_via_cross_validation``, picks the penalty with the
    smallest value, fits a degree-15 model with
    ``reg.polynomial_ridge_regression`` using that penalty and prints the
    RSS measured on the test part.

    Args:
        sales: Table offering ``random_split`` and column access by name.

    Returns:
        None. All results are written to standard output.
    """
    # Each print below receives exactly one parenthesised expression, which
    # the print statement treats the same as the unparenthesised form.
    for banner_line in ("\n**********************************",
                        "* k-Fold validation *",
                        "**********************************\n"):
        print(banner_line)

    train_valid, test = sales.random_split(.9, seed=1)

    rss_by_penalty = selecting_l2_via_cross_validation(train_valid)
    best_l2_penalty = min(rss_by_penalty, key=rss_by_penalty.get)
    print("\nQ6: Best L2 penalty via 10-fold validation L2 (%.2f): %s"
          % (best_l2_penalty, rss_by_penalty[best_l2_penalty]))

    degree = 15
    # The second element of the returned pair is not needed here.
    ridge_model, _ = reg.polynomial_ridge_regression(
        train_valid, degree, target='price', l2_penalty=float(best_l2_penalty))

    test_frame = reg.polynomial_sframe(test['sqft_living'], degree)
    test_frame['price'] = test['price']
    test_rss = reg.get_model_residual_sum_of_squares(
        ridge_model, test_frame, test_frame['price'])

    print("\nQ7: Predictions for degree=%s TEST error (RSS)=%s" % (degree, test_rss))
    print("\t- Between 8e13 and 4e14")
def selecting_l2_via_cross_validation(train_valid):
    """Evaluate a grid of L2 penalties with 10-fold cross validation.

    The input is shuffled (``random_seed=1``), expanded with
    ``reg.polynomial_sframe`` at degree 15 and handed to
    ``compute_k_fold_cross_validation`` together with 13 penalties produced
    by ``np_utils.np.logspace(1, 7, num=13)`` (presumably NumPy's
    ``logspace``, i.e. values from 1e1 to 1e7 -- confirm ``np_utils.np``).
    A plot of penalties against the returned averages is saved to
    ``../graphs/k_fold_vd_penalty_l2.png``.

    Args:
        train_valid: Table holding the ``'sqft_living'`` and ``'price'``
            columns.

    Returns:
        A dict mapping each L2 penalty to the matching entry of the
        sequence returned by ``compute_k_fold_cross_validation``.
    """
    shuffled = gp.graphlab.toolkits.cross_validation.shuffle(train_valid, random_seed=1)

    k = 10
    target = 'price'
    l2_penalties = np_utils.np.logspace(1, 7, num=13)

    poly_sframe = reg.polynomial_sframe(shuffled['sqft_living'], degree=15)
    # Column names are read before the target is attached, so the target is
    # not part of the feature list.
    feature_cols = poly_sframe.column_names()
    poly_sframe[target] = shuffled[target]

    avg_rss = compute_k_fold_cross_validation(k, poly_sframe, target, feature_cols, l2_penalties)

    # Save the penalty-vs-error plot to disk and release the figure.
    plt.figure(figsize=(10, 8))
    reg.plot_k_cross_vs_penalty(l2_penalties, avg_rss)
    plt.savefig('../graphs/k_fold_vd_penalty_l2.png')
    plt.close()

    return dict(zip(l2_penalties, avg_rss))
def main():
    """Run the polynomial-regression quiz and print answers Q1 to Q4.

    Loads the house-sales data, fits degree-15 models on four subsets and
    prints each subset's ``power_15`` coefficient (Q1), prints the fixed Q2
    answer, selects a degree with ``select_polynomial_degree`` (Q3) and
    prints the RSS of the selected model on the held-out test split (Q4).

    Any ``Exception`` raised along the way is caught, reported on standard
    output and followed by the traceback; nothing is re-raised.
    """
    try:
        # Each print below receives exactly one parenthesised expression,
        # which the print statement treats the same as the bare form.
        for banner_line in ("\n**********************************",
                            "* Polynomial Regression Model *",
                            "**********************************\n"):
            print(banner_line)

        sales = gp.load_data('../../data_sets/kc_house_data.gl/')

        # Two halves, each halved again, give the four subsets.
        first_half, second_half = sales.random_split(0.5, seed=0)
        set_1, set_2 = first_half.random_split(0.5, seed=0)
        set_3, set_4 = second_half.random_split(0.5, seed=0)

        list_of_degrees = [15]  # previously also tried: [1,3,5,15]
        list_of_sets = [set_1, set_2, set_3, set_4]
        polynomial_regressions = get_polynomial_regression_by_sets(list_of_degrees, list_of_sets)

        print("\nQ1: power_15 for all four models:")
        pw_degree = 'power_15'
        # Only the 1-based position is needed to build the 'set_<n>' key.
        for position, _ in enumerate(list_of_sets, 1):
            set_key = 'set_%s' % position
            coefficients = polynomial_regressions[set_key][pw_degree]['coefficients']
            coeff_by_name = gp.convert_sframe_to_simple_dict(coefficients, 'name', 'value')
            print("\t- %s: %s" % (set_key, coeff_by_name[pw_degree]))

        print("\nQ2: fitted lines all look the same plots: FALSE")

        training, test_data = sales.random_split(0.9, seed=1)
        train_data, val_data = training.random_split(0.5, seed=1)
        best_degree, model_by_degree = select_polynomial_degree(train_data, val_data)
        print("\nQ3: the lowest RSS on Validation data is degree:%s" % best_degree)

        test_frame = reg.polynomial_sframe(test_data['sqft_living'], best_degree)
        test_frame['price'] = test_data['price']
        test_rss = reg.get_model_residual_sum_of_squares(
            model_by_degree[best_degree], test_frame, test_frame['price'])
        print("\nQ4: RSS on TEST with the degree:%s from Validation data is:%s"
              % (best_degree, test_rss))
    except Exception as details:
        print("Error >> %s" % details)
        traceback.print_exc()