def gradient_descent_model(train_data, test_data):
    print "\n**********************************"
    print "*     Gradient Descent Model     *"
    print "**********************************\n"
    gradient = GrandientDescent()

    # Model 1:
    # ====================================================
    simple_features, my_output = ['sqft_living'], 'price'
    initial_weights = np_utils.np.array([-47000., 1.])
    step_size, tolerance = 7e-12, 2.5e7
    parameters = [initial_weights, step_size, tolerance]

    # TRAINING data
    simple_weights = calculate_weights(gradient, train_data, simple_features,
                                       my_output, parameters)

    # TEST data
    model1_predictions = get_predictions(test_data, simple_features, my_output,
                                         simple_weights)
    model1_pred_house1 = round(model1_predictions[0], 1)

    print "Quiz_2 (week_2):"
    print "\nQ1: weight for sqft_living (model 1) is: %s" % round(simple_weights[-1], 1)
    print "\nQ2: predicted price 1st house in TEST data (model 1) is: %s" % model1_pred_house1

    # Model 2:
    # ====================================================
    model_features, my_output_m2 = ['sqft_living', 'sqft_living15'], 'price'
    initial_weights_m2 = np_utils.np.array([-100000., 1., 1.])
    step_size_m2, tolerance_m2 = 4e-12, 1e9
    parameters_m2 = [initial_weights_m2, step_size_m2, tolerance_m2]

    # TRAINING data
    estimated_weights = calculate_weights(gradient, train_data, model_features,
                                          my_output_m2, parameters_m2)

    # TEST data
    model2_predictions = get_predictions(test_data, model_features, my_output_m2,
                                         estimated_weights)
    model2_pred_house1 = round(model2_predictions[0], 1)

    print "\nQ3: predicted price 1st house in TEST data (model 2) is: %s" % model2_pred_house1

    true_price_house1 = test_data['price'][0]
    print "\nQ4: True price for the 1st house in the TEST data is: %s" % true_price_house1
    print "\t->diff model1: %s" % abs(model1_pred_house1 - true_price_house1)
    print "\t->diff model2: %s" % abs(model2_pred_house1 - true_price_house1)

    RSS1 = reg.compute_RSS(model1_predictions, test_data['price'])
    RSS2 = reg.compute_RSS(model2_predictions, test_data['price'])
    print "\nQ5: Which model (1 or 2) has the lowest RSS on all of the TEST data?"
    print "\t->RSS model1: %s" % RSS1
    print "\t->RSS model2: %s" % RSS2
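
# The helpers used above (calculate_weights / get_predictions) are assumed to
# wrap something like the minimal sketch below: batch gradient descent on the
# RSS objective that stops once the gradient magnitude drops below `tolerance`,
# followed by a plain dot-product prediction. Names and signatures here are
# illustrative only, not the actual helper API.
def _sketch_regression_gradient_descent(feature_matrix, output,
                                        initial_weights, step_size, tolerance):
    weights = np_utils.np.array(initial_weights, dtype=float)
    while True:
        predictions = feature_matrix.dot(weights)        # current predictions
        errors = predictions - output                    # prediction errors
        gradient = 2 * feature_matrix.T.dot(errors)      # gradient of RSS
        weights -= step_size * gradient                  # take a gradient step
        if np_utils.np.sqrt((gradient ** 2).sum()) < tolerance:
            return weights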
def find_rss_on_validation_test(training, validation, all_features,
                                l1_penalties, max_nonzeros):
    RSS_best, L1_best, best_model = None, None, None
    for l1_penalty in l1_penalties:
        current_model = gp.graphlab.linear_regression.create(
            training, target='price', features=all_features,
            validation_set=None, verbose=False,
            l2_penalty=0., l1_penalty=l1_penalty)
        current_num_nnz = current_model.coefficients['value'].nnz()
        # print "\t\tNon-Zeros: %s" % current_num_nnz
        if current_num_nnz == max_nonzeros:
            predictions = current_model.predict(validation)
            RSS = reg.compute_RSS(predictions, validation['price'])
            print "L1 penalty (%.2f)\t\tRSS=%s" % (l1_penalty, RSS)
            if RSS_best is None or RSS < RSS_best:
                RSS_best = RSS
                L1_best = l1_penalty
                best_model = current_model
    lasso_info = {
        'RSS_best': RSS_best,
        'L1_best': L1_best,
        'Best model': best_model
    }
    return lasso_info
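
# A minimal usage sketch for find_rss_on_validation_test. The log-spaced
# penalty grid and max_nonzeros value below are assumptions for illustration;
# the assignment supplies its own narrowed penalty range and target sparsity.
def _sketch_l1_penalty_search(training, validation, all_features):
    l1_penalties = np_utils.np.logspace(8, 10, num=20)   # assumed search grid
    info = find_rss_on_validation_test(training, validation, all_features,
                                       l1_penalties, max_nonzeros=7)
    print "Best L1 penalty: %s\tlowest RSS: %s" % (info['L1_best'], info['RSS_best'])
    return info['Best model']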
def more_features_with_lasso_coordinate(lasso, sales):
    train_data, test_data = sales.random_split(.8, seed=0)
    all_features = [
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated'
    ]
    feature_matrix_norm, train_output, train_norms = np_utils.get_normalized_data(
        train_data, all_features, 'price')
    initial_weights = np_utils.np.zeros(len(all_features) + 1)

    weights_info, nnz_features = {}, {}
    penalty_tolerance = [[1e7, 1.0], [1e8, 1.0], [1e4, 5e5]]
    penalty_str = {1e7: '1e7', 1e8: '1e8', 1e4: '1e4'}

    print "\nFeatures assigned a non-zero weight (Q5, Q6, Q7):"
    for penalty, tolerance in penalty_tolerance:
        weights = lasso.lasso_cyclical_coordinate_descent(
            feature_matrix_norm, train_output, initial_weights, penalty, tolerance)
        # print weights
        # rescale the weights learned on normalized features back to the
        # original feature scale
        weights_normalized = weights / train_norms
        weights_info[penalty] = weights_normalized
        dict_weights = dict(zip(['constant'] + all_features, weights_normalized))
        # keep every feature whose weight is non-zero (positive or negative)
        nnz_features[penalty] = filter(lambda x: dict_weights[x] != 0, dict_weights)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty], nnz_features[penalty])

    print "\nQ8: RSS on the TEST data for each of the three models:"
    test_feature_matrix, test_output = np_utils.get_numpy_data(
        test_data, all_features, 'price')
    for penalty, tolerance in penalty_tolerance:
        current_predictions = np_utils.predict_output(test_feature_matrix,
                                                      weights_info[penalty])
        RSS = reg.compute_RSS(current_predictions, test_output)
        print "\n\tL1_penalty_%s: %s" % (penalty_str[penalty], RSS)
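
# np_utils.get_normalized_data above is assumed to behave like this sketch:
# build the feature matrix (constant column first), then scale every column by
# its 2-norm so each feature contributes on a comparable scale to coordinate
# descent; the norms are returned so learned weights can be rescaled back, as
# done with `weights / train_norms` above. The helper name and return layout
# are assumptions, not the real np_utils API.
def _sketch_normalize_features(feature_matrix):
    norms = np_utils.np.linalg.norm(feature_matrix, axis=0)  # column 2-norms
    return feature_matrix / norms, norms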
def evaluate_lasso_coordinate(lasso, sales):
    simple_features = ['sqft_living', 'bedrooms']
    my_output = 'price'
    initial_weights = np_utils.np.zeros(3)
    l1_penalty = 1e7
    tolerance = 1.0

    feature_matrix_norm, output, norms = np_utils.get_normalized_data(
        sales, simple_features, my_output)
    weights = lasso.lasso_cyclical_coordinate_descent(
        feature_matrix_norm, output, initial_weights, l1_penalty, tolerance)
    # print weights

    current_predictions = np_utils.predict_output(feature_matrix_norm, weights)
    RSS = reg.compute_RSS(current_predictions, output)
    print "\nQ3: RSS of lasso coordinate descent on the normalized dataset is: %s" % RSS
    print "\nQ4: Feature assigned a zero weight at convergence: %s" % simple_features[-1]
    print "\t->%s" % weights
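
# lasso.lasso_cyclical_coordinate_descent is assumed to cycle over features,
# updating one weight at a time with the usual soft-thresholding rule sketched
# below (feature columns assumed already normalized to unit 2-norm; column 0 is
# the intercept and is never shrunk). This is an illustrative restatement, not
# the lasso module's actual code.
def _sketch_lasso_coordinate_step(i, feature_matrix, output, weights, l1_penalty):
    prediction = feature_matrix.dot(weights)
    # correlation of feature i with the residual, excluding its own contribution
    ro_i = (feature_matrix[:, i] *
            (output - prediction + weights[i] * feature_matrix[:, i])).sum()
    if i == 0:                           # intercept gets no L1 shrinkage
        return ro_i
    elif ro_i < -l1_penalty / 2.:
        return ro_i + l1_penalty / 2.
    elif ro_i > l1_penalty / 2.:
        return ro_i - l1_penalty / 2.
    else:
        return 0.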
def compute_ridge_rss(weights_list, feature_matrix, test_data):
    for weights_vals in weights_list:
        current_predictions = np_utils.predict_output(feature_matrix, weights_vals)
        RSS = reg.compute_RSS(current_predictions, test_data['price'])
        # print 'RSS: %s' % RSS
        print "\n\tTEST error (RSS) is: %s" % RSS
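
# The two helpers used throughout (np_utils.predict_output and reg.compute_RSS)
# are assumed to reduce to the dot products below; the sketch names are
# illustrative, not the actual module API.
def _sketch_predict_output(feature_matrix, weights):
    return feature_matrix.dot(weights)                   # y_hat = H * w

def _sketch_compute_RSS(predictions, output):
    errors = predictions - np_utils.np.asarray(output)   # residuals
    return (errors ** 2).sum()                           # residual sum of squares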