class KFoldCrossValidation: # Usage: # Computes K Fold Cross Validation def __init__(self): # Usage: # Constructor for KFoldCrossValidation, used to setup ConvertNumpy class to convert pandas # data to numpy. # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() def k_fold_cross_validation(self, k, data, model, model_parameters, output, features): # Usage: # Takes in our data, and splits the data to smaller subsets, and these smaller subsets # are used as validation sets, and everything else not included in the validation set is used # as training sets. The model will be trained using the training set, and the performance assessment # such as RSS would be used on the validation set against the model. # Parameters: # k (int) : number of folds # data (pandas object) : data used for k folds cross validation # model (object) : model used for k folds cross validation # model_parameters (dict) : model parameters to train the specified model # features (list of string) : a list of feature names # output (string) : output name # Return: # validation_error (double) : average validation error # Get the length of the data length_data = len(data) # Sum of the validation error, will divide by k (fold) later validation_error_sum = 0 # Loop through each fold for i in range(k): # Compute the start section of the current fold start = int((length_data*i)/k) # Compute the end section of the current fold end = int((length_data*(i+1))/k-1) # Get our validation set from the start to the end+1 (+1 since we need to include the end) # <Start : end + 1> Validation Set validation_set = data[start:end+1] # The Training set the left and the right parts of the validation set # < 0 : Start > Train Set 1 # < Start : End + 1 > Validation Set # < End + 1 : n > Train Set 2 # Train Set 1 + Train Set 2 = All data excluding validation set training_set = data[0:start].append(data[end+1:length_data]) # Convert our pandas frame to numpy validation_feature_matrix, validation_output = self.convert_numpy.convert_to_numpy(validation_set, features, output, 1) # Convert our pandas frame to numpy training_feature_matrix, training_output = self.convert_numpy.convert_to_numpy(training_set, features, output, 1) # Create a model with Train Set 1 + Train Set 2 final_weights = model(**model_parameters, feature_matrix=training_feature_matrix, output=training_output) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(validation_feature_matrix, final_weights) # compute squared error (in other words, rss) validation_error_sum += self.residual_sum_squares.residual_sum_squares_linear_regression(validation_output, predicted_output) # Return the validation_error_sum divided by fold return validation_error_sum/k
class TestRidgeRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestRidgeRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.ridge_regression = RidgeRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create an instance of the K Fold Cross Validation Class self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = { 'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int } # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled self.kc_house_train_valid_shuffled = pd.read_csv( './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict) def test_01_gradient_descent_no_penalty(self): # Usage: # Tests the result on gradient descent with low penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(-0.16311351478746433, 5), round(final_weights[0], 5)) self.assertEquals(round(263.02436896538489, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275723632153607.72, -5), round(rss, -5)) def test_02_gradient_descent_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.048718475774044, 5), round(final_weights[0], 5)) self.assertEquals(round(124.57402057376679, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(694654309578537.25, -5), round(rss, -5)) def test_03_gradient_descent_multiple_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0.0, 0.0, 0.0]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.033601165521060711, 5), round(final_weights[0], 5)) self.assertEquals(round(91.490167574878328, 3), round(final_weights[1], 3)) self.assertEquals(round(78.437490333967176, 3), round(final_weights[2], 3)) # Assert that rss is correct self.assertEquals(round(500408530236718.31, 0), round(rss, 0)) # Look at the first predicted output self.assertEquals(round(270449.70602770313, 3), round(predicted_output[0], 3)) # The first output should be 310000 in the test set self.assertEquals(310000.0, test_output[0]) def test_04_gradient_descent_k_fold(self): # Usage: # Tests best l2_penalty for ridge regression using gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Tolerance tolerance = None # Max Iterations to Run max_iterations = 1000 # Number of Folds folds = 10 # Store Cross Validation results cross_validation_results = [] # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11] for l2_penalty in np.logspace(1, 11, num=11): # Create a dictionary of model_parameters model_parameters = { 'step_size': step_size, 'max_iteration': max_iterations, 'initial_weights': initial_weights, 'tolerance': tolerance, 'l2_penalty': l2_penalty } # Compute the cross validation results cross_validation = self.k_fold_cross_validation.k_fold_cross_validation( folds, self.kc_house_train_frame, self.ridge_regression.gradient_descent, model_parameters, output, features) # Append it into the results cross_validation_results.append((l2_penalty, cross_validation)) # Lowest Result lowest = sorted(cross_validation_results, key=lambda x: x[1])[0] # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error self.assertEquals(10000000.0, lowest[0]) # Assert True that is the lowest l2_penalty self.assertEquals(round(120916225812152.84, 0), round(lowest[1], 0)) def test_05_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.hill_climbing( feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # Assert that the weights is correct self.assertEquals(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68)) self.assertEquals(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))
class TestLinearRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestLinearRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.linear_regression = LinearRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = { 'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float, 'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int, 'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int } # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv( './unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) def test_01_gradient_descent(self): # Usage: # Tests the result on gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-47000., 1.]) # Step size step_size = 7e-12 # Tolerance tolerance = 2.5e7 # Compute our gradient descent value final_weights = self.linear_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance) # Assert that the weights is correct self.assertEquals(round(-46999.887165546708, 3), round(final_weights[0], 3)) self.assertEquals(round(281.91211917520917, 3), round(final_weights[1], 3)) def test_02_gradient_descent_multiple(self): # Usage: # Computes gradient descent on multiple input, and computes predicted model and RSS # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-100000., 1., 1.]) # Step size step_size = 4e-12 # Tolerance tolerance = 1e9 # Compute our gradient descent value final_weights = self.linear_regression.gradient_descent( feature_matrix, output, initial_weights, step_size, tolerance) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy( self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression( test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression( test_output, predicted_output) # Assert that rss is correct self.assertEquals(round(270263443629803.41, -3), round(rss, -3)) def test_03_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy( self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-47000., 1.]) # Step size step_size = 7e-12 # Tolerance tolerance = 2.5e7 # Compute our hill climbing value final_weights = self.linear_regression.hill_climbing( feature_matrix, output, initial_weights, step_size, tolerance) # Assert that the weights is correct self.assertEquals(round(-47000.142201335177, 3), round(final_weights[0], 3)) self.assertEquals(round(-352.86068692252599, 3), round(final_weights[1], 3))
class TestLinearRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestLinearRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.linear_regression = LinearRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int} # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) def test_01_gradient_descent(self): # Usage: # Tests the result on gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-47000., 1.]) # Step size step_size = 7e-12 # Tolerance tolerance = 2.5e7 # Compute our gradient descent value final_weights = self.linear_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance) # Assert that the weights is correct self.assertEquals(round(-46999.887165546708, 3), round(final_weights[0], 3)) self.assertEquals(round(281.91211917520917, 3), round(final_weights[1], 3)) def test_02_gradient_descent_multiple(self): # Usage: # Computes gradient descent on multiple input, and computes predicted model and RSS # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-100000., 1., 1.]) # Step size step_size = 4e-12 # Tolerance tolerance = 1e9 # Compute our gradient descent value final_weights = self.linear_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that rss is correct self.assertEquals(round(270263443629803.41, -3), round(rss, -3)) def test_03_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([-47000., 1.]) # Step size step_size = 7e-12 # Tolerance tolerance = 2.5e7 # Compute our hill climbing value final_weights = self.linear_regression.hill_climbing(feature_matrix, output, initial_weights, step_size, tolerance) # Assert that the weights is correct self.assertEquals(round(-47000.142201335177, 3), round(final_weights[0], 3)) self.assertEquals(round(-352.86068692252599, 3), round(final_weights[1], 3))
class TestRidgeRegression(unittest.TestCase): # Usage: # Tests for the Linear Regression Class. def setUp(self): # Usage: # Constructor for TestRidgeRegression # Arguments: # None # Create an instance of the Convert Numpy class self.convert_numpy = ConvertNumpy() # Create an instance of the Linear Regression class self.ridge_regression = RidgeRegression() # Create an instance of the Predict Output Class self.predict_output = PredictOutput() # Create an instance of the Residual Sum Squares Class self.residual_sum_squares = ResidualSumSquares() # Create an instance of the K Fold Cross Validation Class self.k_fold_cross_validation = KFoldCrossValidation() # Create a dictionary type to store relevant data types so that our pandas # will read the correct information dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int} # Create a kc_house_frame that encompasses all test and train data self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict) # Create a kc_house_test_frame that encompasses only train data self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv', dtype=dtype_dict) # Create a kc_house_frames that encompasses only test data self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv', dtype=dtype_dict) # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled self.kc_house_train_valid_shuffled = pd.read_csv('./unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict) def test_01_gradient_descent_no_penalty(self): # Usage: # Tests the result on gradient descent with low penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(-0.16311351478746433, 5), round(final_weights[0], 5)) self.assertEquals(round(263.02436896538489, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(275723632153607.72, -5), round(rss, -5)) def test_02_gradient_descent_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving test_features = ['sqft_living'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.048718475774044, 5), round(final_weights[0], 5)) self.assertEquals(round(124.57402057376679, 3), round(final_weights[1], 3)) # Assert that rss is correct self.assertEquals(round(694654309578537.25, -5), round(rss, -5)) def test_03_gradient_descent_multiple_high_penalty(self): # Usage: # Tests the result on gradient descent with high penalty # Arguments: # None # We will use sqft_iving, and sqft_living15 features = ['sqft_living', 'sqft_living15'] # Output will use price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0.0, 0.0, 0.0]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 1e11 # Compute our gradient descent value final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # We will use sqft_iving, and sqft_living15 test_features = ['sqft_living', 'sqft_living15'] # Output will be price test_output = ['price'] # Convert our test pandas frame to numpy test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, test_features, test_output, 1) # Predict the output of test features predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights) # Compute RSS rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output) # Assert that the weights is correct self.assertEquals(round(0.033601165521060711, 5), round(final_weights[0], 5)) self.assertEquals(round(91.490167574878328, 3), round(final_weights[1], 3)) self.assertEquals(round(78.437490333967176, 3), round(final_weights[2], 3)) # Assert that rss is correct self.assertEquals(round(500408530236718.31, 0), round(rss, 0)) # Look at the first predicted output self.assertEquals(round(270449.70602770313, 3), round(predicted_output[0], 3)) # The first output should be 310000 in the test set self.assertEquals(310000.0, test_output[0]) def test_04_gradient_descent_k_fold(self): # Usage: # Tests best l2_penalty for ridge regression using gradient descent # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will use price output = ['price'] # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Tolerance tolerance = None # Max Iterations to Run max_iterations = 1000 # Number of Folds folds = 10 # Store Cross Validation results cross_validation_results = [] # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11] for l2_penalty in np.logspace(1, 11, num=11): # Create a dictionary of model_parameters model_parameters = {'step_size': step_size, 'max_iteration': max_iterations, 'initial_weights': initial_weights, 'tolerance': tolerance, 'l2_penalty': l2_penalty} # Compute the cross validation results cross_validation = self.k_fold_cross_validation.k_fold_cross_validation(folds, self.kc_house_train_frame, self.ridge_regression.gradient_descent, model_parameters, output, features) # Append it into the results cross_validation_results.append((l2_penalty, cross_validation)) # Lowest Result lowest = sorted(cross_validation_results, key=lambda x: x[1])[0] # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error self.assertEquals(10000000.0, lowest[0]) # Assert True that is the lowest l2_penalty self.assertEquals(round(120916225812152.84, 0), round(lowest[1], 0)) def test_05_hill_climbing(self): # Usage: # Tests the result on hill climbing # Arguments: # None # We will use sqft_living for our features features = ['sqft_living'] # Output will be price output = ['price'] # Convert our pandas frame to numpy feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1) # Create our initial weights initial_weights = np.array([0., 0.]) # Step size step_size = 1e-12 # Max Iterations to Run max_iterations = 1000 # Tolerance tolerance = None # L2 Penalty l2_penalty = 0.0 # Compute our hill climbing value final_weights = self.ridge_regression.hill_climbing(feature_matrix, output, initial_weights, step_size, tolerance, l2_penalty, max_iterations) # Assert that the weights is correct self.assertEquals(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68)) self.assertEquals(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))