def test_feature_derivative_ridge_002(self):
    """Derivative of the constant (intercept) column: with feature_is_constant=True
    the result should equal 2 * sum(errors) (no L2 term on the intercept)."""
    (example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')
    example_weights = np.array([1., 10.])
    predictions = predict_output(example_features, example_weights)
    residuals = predictions - example_output  # prediction errors
    derivative = feature_derivative_ridge(
        residuals, example_features[:, 0], example_weights[0], 1, True)
    expected = np.sum(residuals) * 2.
    self.assertEqual(expected, derivative)
def test_ridge_regression_gradient_descent_004(self):
    """Two-feature ridge descent with a high L2 penalty reproduces known weights."""
    # sqft_living15 is the average squarefeet for the nearest 15 neighbors.
    model_features = ['sqft_living', 'sqft_living15']
    my_output = 'price'
    train_data, test_data = sales.random_split(.8, seed=0)
    (feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
    (test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)
    initial_weights = np.array([0.0, 0.0, 0.0])
    step_size = 1e-12
    max_iterations = 1000
    l2_penalty = 1e11
    multiple_weights_high_penalty = ridge_regression_gradient_descent(
        feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations)
    expected = np.array([6.7429658, 91.48927361, 78.43658768])
    # Compare string renderings to sidestep exact float equality on arrays.
    self.assertEqual(str(expected), str(multiple_weights_high_penalty))
def test_ridge_regression_gradient_descent_003(self):
    """Two-feature ridge descent with zero penalty reproduces known (OLS-like) weights."""
    # sqft_living15 is the average squarefeet for the nearest 15 neighbors.
    model_features = ['sqft_living', 'sqft_living15']
    my_output = 'price'
    train_data, test_data = sales.random_split(.8, seed=0)
    (feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
    (test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)
    initial_weights = np.array([0.0, 0.0, 0.0])
    step_size = 1e-12
    max_iterations = 1000
    l2_penalty = 0.
    multiple_weights_0_penalty = ridge_regression_gradient_descent(
        feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations)
    expected = np.array([-0.35743482, 243.0541689, 22.41481594])
    # Compare string renderings to sidestep exact float equality on arrays.
    self.assertEqual(str(expected), str(multiple_weights_0_penalty))
def test_ridge_regression_gradient_descent_002(self):
    """Single-feature ridge descent with a high L2 penalty reproduces known weights."""
    simple_features = ['sqft_living']
    my_output = 'price'
    train_data, test_data = sales.random_split(.8, seed=0)
    (simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
    (simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
    initial_weights = np.array([0., 0.])
    step_size = 1e-12
    max_iterations = 1000
    l2_penalty = 1e11
    simple_weights_high_penalty = ridge_regression_gradient_descent(
        simple_feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations)
    expected = np.array([9.76730383, 124.57217565])
    # Compare string renderings to sidestep exact float equality on arrays.
    self.assertEqual(str(expected), str(simple_weights_high_penalty))
def test_ridge_regression_gradient_descent_001(self):
    """Single-feature ridge descent with zero penalty reproduces known weights."""
    simple_features = ['sqft_living']
    my_output = 'price'
    train_data, test_data = sales.random_split(.8, seed=0)
    (simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
    (simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
    initial_weights = np.array([0., 0.])
    step_size = 1e-12
    max_iterations = 1000
    l2_penalty = 0.
    simple_weights_0_penalty = ridge_regression_gradient_descent(
        simple_feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations)
    expected = np.array([-1.63113501e-01, 2.63024369e+02])
    # Compare string renderings to sidestep exact float equality on arrays.
    self.assertEqual(str(expected), str(simple_weights_0_penalty))
import graphlab as gl import numpy as np from regression import get_numpy_data from regression import predict_output from regression import regression_gradient_descent sales = gl.SFrame('kc_house_data.gl/') # train and test #Q1 & Q2 train_data,test_data = sales.random_split(.8,seed=0) simple_features = ['sqft_living'] my_output= 'price' (simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output) initial_weights = np.array([-47000., 1.]) step_size = 7e-12 tolerance = 2.5e7 weights1 = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance) (feature_matrix1, output) = get_numpy_data(test_data, simple_features, my_output) predict1 = predict_output(feature_matrix1,weights1) #Q1 print "Q1: What is the value of the weight for sqft_living", weights1[1] #Q2 print "Q2: What is the predicted price for the 1st house in the Test data set for model 1", predict1[0]
'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15'] features_train, output_train = get_numpy_data(train, feature_list, 'price') features_test, output_test = get_numpy_data(test, feature_list, 'price') features_valid, output_valid = get_numpy_data(validation, feature_list, 'price') features_train, norms = normalize_features(features_train) # normalize training set features (columns) features_test = features_test / norms # normalize test set by training set norms features_valid = features_valid / norms # normalize validation set by training set norms query = features_test[0] print "1st row of teat features: ", query comp = features_train[9] print "10th row of training features: ", comp distance = np.sqrt(np.sum((query-comp)**2)) print "*** QUIZ QUESTION ***" print "Euclidean distance between the query house and the 10th house of the training set: ", distance
# so we'll convert them to int, before using it below sales['floors'] = sales['floors'].astype(int) # --------------------------------------- # Normalize features # --------------------------------------- print("*** Normalize features") # --------------------------------------- # Implementing Coordinate Descent with normalized features # --------------------------------------- print("*** Implementing Coordinate Descent with normalized features") simple_features = ['sqft_living', 'bedrooms'] my_output = 'price' (simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output) simple_feature_matrix, norms = normalize_features(simple_feature_matrix) weights = np.array([1., 4., 1.]) weights prediction = predict_output(simple_feature_matrix, weights) prediction w = weights # need to normalize output here? ro = {} for i in range(0,len(w)): feature_i = simple_feature_matrix[:,i] tmp = feature_i * (output - prediction + w[i]*feature_i) print tmp
sys.path.append("..") import sys import graphlab sales = graphlab.SFrame('kc_house_data.gl/') import numpy as np # note this allows us to refer to numpy as np instead import unittest from regression import get_numpy_data from regression import predict_output from regression import feature_derivative from regression import regression_gradient_descent (example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') # the [] around 'sqft_living' makes it a list print "example_features" print len(example_features) print example_features[0:3, :] # this accesses the first row of the data the ':' indicates 'all columns' print "example_output" print len(example_output) print example_output[0] # and the corresponding output # Predicting output given regression weights print("*** Predicting output given regression weights") my_weights = np.array([1., 1.]) # the example weights
print features print "Should print: \n[[ 0.6 0.6 0.6]\n [ 0.8 0.8 0.8]]"# should print # [[ 0.6 0.6 0.6] # [ 0.8 0.8 0.8]] print norms print "Should print: \n[5. 10. 15.]" # should print # [5. 10. 15.] print "==== Implementing Coordinate Descent with normalized features ====" print "=== Effect of L1 penalty ===" simple_features = ['sqft_living', 'bedrooms'] my_output = 'price' (simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output) simple_feature_matrix, norms = normalize_features(simple_feature_matrix) weights = np.array([1., 4., 1.]) prediction = predict_output(simple_feature_matrix, weights) ro = {} for i in range(len(weights)): feature_i = simple_feature_matrix[:,i] tmp = feature_i * (output - prediction + weights[i]*feature_i) ro[i] = tmp.sum() print "ro[", i, "] is:", ro[i] print "***** Quiz question *****" print "Range 1 of L1 is [", 2*ro[2], ", ", 2*ro[1], ")." print "Range 2 of L1 is lambda <", 2*ro[2]
print features print "Should print: \n[[ 0.6 0.6 0.6]\n [ 0.8 0.8 0.8]]" # should print # [[ 0.6 0.6 0.6] # [ 0.8 0.8 0.8]] print norms print "Should print: \n[5. 10. 15.]" # should print # [5. 10. 15.] print "==== Implementing Coordinate Descent with normalized features ====" print "=== Effect of L1 penalty ===" simple_features = ['sqft_living', 'bedrooms'] my_output = 'price' (simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output) simple_feature_matrix, norms = normalize_features(simple_feature_matrix) weights = np.array([1., 4., 1.]) prediction = predict_output(simple_feature_matrix, weights) ro = {} for i in range(len(weights)): feature_i = simple_feature_matrix[:, i] tmp = feature_i * (output - prediction + weights[i] * feature_i) ro[i] = tmp.sum() print "ro[", i, "] is:", ro[i] print "***** Quiz question *****" print "Range 1 of L1 is [", 2 * ro[2], ", ", 2 * ro[1], ")." print "Range 2 of L1 is lambda <", 2 * ro[2]
import graphlab as gl import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt from regression import get_numpy_data from regression import predict_output from regression import feature_derivative_ridge from regression import ridge_regression_gradient_descent from regression import get_simple_residuals mpl.use('TkAgg') sales = gl.SFrame('kc_house_data.gl/') (example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') my_weights = np.array([1., 10.]) test_predictions = predict_output(example_features, my_weights) errors = test_predictions - example_output # prediction errors # next two lines should print the same values print feature_derivative_ridge(errors, example_features[:,1], my_weights[1], 1, False) print np.sum(errors*example_features[:,1])*2+20. print '' # -5.65541667824e+13 # -5.65541667824e+13 # next two lines should print the same values print feature_derivative_ridge(errors, example_features[:,0], my_weights[0], 1, True) print np.sum(errors)*2. # -22446749336.0 # -22446749336.0
from regression import multiple_predict_knn from regression import get_simple_residuals import matplotlib.pyplot as plt sales = gl.SFrame('kc_house_data_small.gl/') (train_and_validation, test) = sales.random_split(.8, seed=1) # initial train/test split (train, validation) = train_and_validation.random_split( .8, seed=1) # split training set into training and validation sets feature_list = [ 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15' ] features_train, output_train = get_numpy_data(train, feature_list, 'price') features_test, output_test = get_numpy_data(test, feature_list, 'price') features_valid, output_valid = get_numpy_data(validation, feature_list, 'price') features_train, norms = normalize_features( features_train) # normalize training set features (columns) features_test = features_test / norms # normalize test set by training set norms features_valid = features_valid / norms # normalize validation set by training set norms query = features_test[0] print "1st row of teat features: ", query comp = features_train[9] print "10th row of training features: ", comp distance = np.sqrt(np.sum((query - comp)**2))