def setUp(self):
    """Prepare the fixtures used by the TestRidgeRegression tests.

    Instantiates the helper objects and reads the four kc_house CSV files
    into pandas frames stored on the test instance.
    """
    # Helper objects exercised by the tests
    self.convert_numpy = ConvertNumpy()
    self.ridge_regression = RidgeRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
    self.k_fold_cross_validation = KFoldCrossValidation()

    # Explicit column types so pandas parses each column as intended
    column_types = {
        'bathrooms': float,
        'waterfront': int,
        'sqft_above': int,
        'sqft_living15': float,
        'grade': int,
        'yr_renovated': int,
        'price': float,
        'bedrooms': float,
        'zipcode': str,
        'long': float,
        'sqft_lot15': float,
        'sqft_living': float,
        'floors': str,
        'condition': int,
        'lat': float,
        'date': str,
        'sqft_basement': int,
        'yr_built': int,
        'id': str,
        'sqft_lot': int,
        'view': int,
    }

    # (attribute name, CSV path) pairs, read in the listed order:
    # all rows, training rows, test rows, shuffled train+validation rows
    csv_sources = (
        ('kc_house_frame',
         './unit_tests/test_data/kc_house/kc_house_data.csv'),
        ('kc_house_train_frame',
         './unit_tests/test_data/kc_house/kc_house_train_data.csv'),
        ('kc_test_frames',
         './unit_tests/test_data/kc_house/kc_house_test_data.csv'),
        ('kc_house_train_valid_shuffled',
         './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv'),
    )
    for attribute, csv_path in csv_sources:
        setattr(self, attribute, pd.read_csv(csv_path, dtype=column_types))
def setUp(self):
    """Build the objects and data frames shared by the ridge regression tests.

    Creates one instance of each helper class and loads the kc_house CSV
    files (full, train, test and shuffled train+validation) with pandas.
    """
    self.convert_numpy = ConvertNumpy()
    self.ridge_regression = RidgeRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
    self.k_fold_cross_validation = KFoldCrossValidation()

    # Column name -> type mapping given to pandas for every CSV read below
    csv_dtypes = {
        'bathrooms': float, 'waterfront': int, 'sqft_above': int,
        'sqft_living15': float, 'grade': int, 'yr_renovated': int,
        'price': float, 'bedrooms': float, 'zipcode': str,
        'long': float, 'sqft_lot15': float, 'sqft_living': float,
        'floors': str, 'condition': int, 'lat': float,
        'date': str, 'sqft_basement': int, 'yr_built': int,
        'id': str, 'sqft_lot': int, 'view': int,
    }

    def read_frame(csv_path):
        # Every file is parsed with the same column types
        return pd.read_csv(csv_path, dtype=csv_dtypes)

    # Train and test rows together
    self.kc_house_frame = read_frame(
        './unit_tests/test_data/kc_house/kc_house_data.csv')

    # Training rows only
    self.kc_house_train_frame = read_frame(
        './unit_tests/test_data/kc_house/kc_house_train_data.csv')

    # Test rows only
    self.kc_test_frames = read_frame(
        './unit_tests/test_data/kc_house/kc_house_test_data.csv')

    # Shuffled train and validation rows
    self.kc_house_train_valid_shuffled = read_frame(
        './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv')
class TestRidgeRegression(unittest.TestCase):
    """Tests for the RidgeRegression class.

    Each test fits a model on the kc_house training frame with ``price`` as
    the output and compares the outcome with previously recorded values.
    """

    def setUp(self):
        """Create the helper objects and load the kc_house CSV files."""
        self.convert_numpy = ConvertNumpy()
        self.ridge_regression = RidgeRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Explicit column types so pandas parses each column as intended
        dtype_dict = {
            'bathrooms': float, 'waterfront': int, 'sqft_above': int,
            'sqft_living15': float, 'grade': int, 'yr_renovated': int,
            'price': float, 'bedrooms': float, 'zipcode': str,
            'long': float, 'sqft_lot15': float, 'sqft_living': float,
            'floors': str, 'condition': int, 'lat': float,
            'date': str, 'sqft_basement': int, 'yr_built': int,
            'id': str, 'sqft_lot': int, 'view': int,
        }

        # All rows (train and test together)
        self.kc_house_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_data.csv',
            dtype=dtype_dict)

        # Training rows only
        self.kc_house_train_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv',
            dtype=dtype_dict)

        # Test rows only
        self.kc_test_frames = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv',
            dtype=dtype_dict)

        # Shuffled train and validation rows
        self.kc_house_train_valid_shuffled = pd.read_csv(
            './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv',
            dtype=dtype_dict)

    def _convert(self, frame, features):
        """Convert ``frame`` to numpy with ``price`` as the output column.

        Args:
            frame: pandas frame to convert.
            features: names of the feature columns.

        Returns:
            Whatever ``convert_to_numpy`` returns; the tests unpack it as
            ``(feature_matrix, output)``.
        """
        # Fresh list objects are passed on every call so the converter never
        # receives a list that an earlier call may have modified.
        # NOTE(review): the trailing 1 is presumably the value of the
        # constant (intercept) column -- confirm against ConvertNumpy.
        return self.convert_numpy.convert_to_numpy(
            frame, list(features), ['price'], 1)

    def _fit(self, optimizer, features, l2_penalty):
        """Run ``optimizer`` on the training frame starting from zero weights.

        Args:
            optimizer: bound RidgeRegression method taking (feature_matrix,
                output, initial_weights, step_size, tolerance, l2_penalty,
                max_iterations).
            features: names of the feature columns.
            l2_penalty: L2 penalty handed to ``optimizer``.

        Returns:
            The value returned by ``optimizer`` (indexed as weights by the
            tests).
        """
        feature_matrix, output = self._convert(
            self.kc_house_train_frame, features)

        # One weight per feature plus one extra, all starting at 0.0
        initial_weights = np.zeros(len(features) + 1)
        step_size = 1e-12
        tolerance = None
        max_iterations = 1000

        return optimizer(feature_matrix, output, initial_weights, step_size,
                         tolerance, l2_penalty, max_iterations)

    def _evaluate(self, features, weights):
        """Predict on the test frame and compute the residual sum of squares.

        Args:
            features: names of the feature columns.
            weights: weights used for the prediction.

        Returns:
            Tuple ``(rss, predicted_output, test_output)``.
        """
        test_feature_matrix, test_output = self._convert(
            self.kc_test_frames, features)
        predicted_output = self.predict_output.predict_output_linear_regression(
            test_feature_matrix, weights)
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(
            test_output, predicted_output)
        return rss, predicted_output, test_output

    def test_01_gradient_descent_no_penalty(self):
        """Gradient descent on sqft_living with an L2 penalty of 0."""
        features = ['sqft_living']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 0.0)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(-0.16311351478746433, 5), round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        """Gradient descent on sqft_living with an L2 penalty of 1e11."""
        features = ['sqft_living']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 1e11)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(0.048718475774044, 5), round(final_weights[0], 5))
        self.assertEqual(round(124.57402057376679, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(694654309578537.25, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        """Gradient descent on two features with an L2 penalty of 1e11."""
        features = ['sqft_living', 'sqft_living15']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 1e11)
        rss, predicted_output, test_output = self._evaluate(
            features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(0.033601165521060711, 5), round(final_weights[0], 5))
        self.assertEqual(round(91.490167574878328, 3), round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3), round(final_weights[2], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(500408530236718.31, 0), round(rss, 0))

        # First prediction, and the first true price (310000) in the test set
        self.assertEqual(round(270449.70602770313, 3), round(predicted_output[0], 3))
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        """Pick the best L2 penalty with 10-fold cross validation."""
        features = ['sqft_living']
        output = ['price']

        # A single weights array is shared by every model_parameters dict
        initial_weights = np.array([0., 0.])
        step_size = 1e-12
        tolerance = None
        max_iterations = 1000
        folds = 10

        cross_validation_results = []

        # L2 penalties 10^1, 10^2, ..., 10^11
        for l2_penalty in np.logspace(1, 11, num=11):
            model_parameters = {
                'step_size': step_size,
                'max_iteration': max_iterations,
                'initial_weights': initial_weights,
                'tolerance': tolerance,
                'l2_penalty': l2_penalty,
            }

            cross_validation = self.k_fold_cross_validation.k_fold_cross_validation(
                folds, self.kc_house_train_frame,
                self.ridge_regression.gradient_descent,
                model_parameters, output, features)

            cross_validation_results.append((l2_penalty, cross_validation))

        # Entry with the smallest cross validation value
        lowest = min(cross_validation_results, key=lambda result: result[1])

        # 10000000 is the penalty with the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # The lowest cross validation error matches the recorded value
        self.assertEqual(round(120916225812152.84, 0), round(lowest[1], 0))

    def test_05_hill_climbing(self):
        """Hill climbing on sqft_living with an L2 penalty of 0."""
        final_weights = self._fit(
            self.ridge_regression.hill_climbing, ['sqft_living'], 0.0)

        # Weights match the recorded values
        self.assertEqual(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))
class TestRidgeRegression(unittest.TestCase):
    """Test for RidgeRegression.

    Uses housing data to test RidgeRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestRidgeRegression.

        Creates the helper objects and loads the housing data frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.ridge_regression = RidgeRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Explicit column types so pandas parses each column as intended
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int,
                      'sqft_living15': float, 'grade': int, 'yr_renovated': int,
                      'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float,
                      'floors': str, 'condition': int, 'lat': float,
                      'date': str, 'sqft_basement': int, 'yr_built': int,
                      'id': str, 'sqft_lot': int, 'view': int}

        # All rows (train and test together)
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                    dtype=dtype_dict)

        # Training rows only
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Test rows only
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # Shuffled train and validation rows
        self.kc_house_train_valid_shuffled = pd.read_csv('./unit_tests/test_data/regression/'
                                                         'kc_house_with_validation_k_fold/'
                                                         'wk3_kc_house_train_valid_shuffled.csv',
                                                         dtype=dtype_dict)

    def _convert(self, frame, features):
        """Convert a frame to numpy with ``price`` as the output column.

        Args:
            frame: pandas frame to convert.
            features (list of str): Names of the feature columns.

        Returns:
            Whatever ``convert_to_numpy`` returns; the tests unpack it as
            ``(feature_matrix, output)``.

        """
        # Fresh list objects are passed on every call so the converter never
        # receives a list that an earlier call may have modified.
        # NOTE(review): the trailing 1 is presumably the value of the
        # constant (intercept) column -- confirm against ConvertNumpy.
        return self.convert_numpy.convert_to_numpy(frame, list(features), ['price'], 1)

    def _fit(self, optimizer, features, l2_penalty, tolerance=None, max_iterations=1000):
        """Run an optimizer on the training frame starting from zero weights.

        Args:
            optimizer: Bound RidgeRegression method taking
                (feature_matrix, output, model_parameters).
            features (list of str): Names of the feature columns.
            l2_penalty (float): L2 penalty placed in the model parameters.
            tolerance: Tolerance placed in the model parameters.
            max_iterations (int): Iteration limit placed in the model parameters.

        Returns:
            The value returned by ``optimizer`` (indexed as weights by the tests).

        """
        feature_matrix, output = self._convert(self.kc_house_train, features)

        # One weight per feature plus one extra, all starting at 0.0
        initial_weights = np.zeros(len(features) + 1)

        return optimizer(feature_matrix, output,
                         {"initial_weights": initial_weights,
                          "step_size": 1e-12,
                          "tolerance": tolerance,
                          "l2_penalty": l2_penalty,
                          "max_iteration": max_iterations})

    def _evaluate(self, features, weights):
        """Predict on the test frame and compute the residual sum of squares.

        Args:
            features (list of str): Names of the feature columns.
            weights: Weights used for the prediction.

        Returns:
            tuple: ``(rss, predicted_output, test_output)``.

        """
        test_feature_matrix, test_output = self._convert(self.kc_house_test, features)
        predicted_output = self.predict_output.regression(test_feature_matrix, weights)
        rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output)
        return rss, predicted_output, test_output

    def test_01_gradient_descent_no_penalty(self):
        """Tests gradient descent algorithm.

        Tests the result on gradient descent with an L2 penalty of 0.

        """
        features = ['sqft_living']
        final_weights = self._fit(self.ridge_regression.gradient_descent, features, 0.0)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(-0.16311351478746433, 5), round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        """Tests gradient descent.

        Tests the result on gradient descent with an L2 penalty of 1e11.

        """
        features = ['sqft_living']
        final_weights = self._fit(self.ridge_regression.gradient_descent, features, 1e11)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(9.7673000000000005, 5), round(final_weights[0], 5))
        self.assertEqual(round(124.572, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(694642101500000.0, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        """Tests gradient descent.

        Tests gradient descent with multiple features, and an L2 penalty of 1e11.

        """
        features = ['sqft_living', 'sqft_living15']
        final_weights = self._fit(self.ridge_regression.gradient_descent, features, 1e11)
        rss, predicted_output, test_output = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(6.7429699999999997, 5), round(final_weights[0], 5))
        self.assertEqual(round(91.489000000000004, 3), round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3), round(final_weights[2], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(500404800500842.0, -5), round(rss, -5))

        # First prediction, and the first true price (310000) in the test set
        self.assertEqual(round(270453.53000000003, 3), round(predicted_output[0], 3))
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        """Tests gradient descent with K fold cross validation.

        Tests best l2_penalty for ridge regression using gradient descent.

        """
        features = ['sqft_living']
        output = ['price']

        # A single weights array is shared by every model_parameters dict
        initial_weights = np.array([0., 0.])
        step_size = 1e-12
        tolerance = None
        max_iterations = 1000
        folds = 10

        cross_validation_results = []

        # L2 penalties 10^1, 10^2, ..., 10^11
        for l2_penalty in np.logspace(1, 11, num=11):
            model_parameters = {'step_size': step_size,
                                'max_iteration': max_iterations,
                                'initial_weights': initial_weights,
                                'tolerance': tolerance,
                                'l2_penalty': l2_penalty}

            cv = self.k_fold_cross_validation.k_fold_cross_validation(folds,
                                                                      self.ridge_regression.gradient_descent,
                                                                      model_parameters,
                                                                      {"data": self.kc_house_train,
                                                                       "output": output,
                                                                       "features": features})

            cross_validation_results.append((l2_penalty, cv))

        # Entry with the smallest cross validation value
        lowest = min(cross_validation_results, key=lambda result: result[1])

        # 10000000 is the penalty with the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # The lowest cross validation error matches the recorded value
        self.assertEqual(round(120916225809145.0, 0), round(lowest[1], 0))

    def test_05_gradient_ascent(self):
        """Tests gradient ascent.

        Tests gradient ascent and compare it with known values.

        """
        final_weights = self._fit(self.ridge_regression.gradient_ascent, ['sqft_living'], 0.0)

        # Weights match the recorded values
        self.assertEqual(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))

    def test_07_gradient_ascent_high_tolerance(self):
        """Tests gradient ascent.

        Tests gradient ascent with a tolerance of 1 and compare it with known values.

        """
        final_weights = self._fit(self.ridge_regression.gradient_ascent, ['sqft_living'], 0.0,
                                  tolerance=1)

        # Both weights round to 0 at the scale used by test_05
        self.assertEqual(0, round(final_weights[0], -68))
        self.assertEqual(0, round(final_weights[1], -70))

    def test_08_gradient_descent_no_penalty_high_tolerance(self):
        """Tests gradient descent algorithm.

        Tests the result on gradient descent with an L2 penalty of 0, a
        tolerance of 10000000000 and up to 100000 iterations.

        """
        features = ['sqft_living']
        final_weights = self._fit(self.ridge_regression.gradient_descent, features, 0.0,
                                  tolerance=10000000000, max_iterations=100000)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(0.093859999999999999, 5), round(final_weights[0], 5))
        self.assertEqual(round(262.98200000000003, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(275724298300000.0, -5), round(rss, -5))
class TestRidgeRegression(unittest.TestCase):
    """Tests for the RidgeRegression class.

    Each test fits a model on the kc_house training frame with ``price`` as
    the output and compares the outcome with previously recorded values.
    """

    def setUp(self):
        """Create the helper objects and load the kc_house CSV files."""
        self.convert_numpy = ConvertNumpy()
        self.ridge_regression = RidgeRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Explicit column types so pandas parses each column as intended
        dtype_dict = {
            'bathrooms': float, 'waterfront': int, 'sqft_above': int,
            'sqft_living15': float, 'grade': int, 'yr_renovated': int,
            'price': float, 'bedrooms': float, 'zipcode': str,
            'long': float, 'sqft_lot15': float, 'sqft_living': float,
            'floors': str, 'condition': int, 'lat': float,
            'date': str, 'sqft_basement': int, 'yr_built': int,
            'id': str, 'sqft_lot': int, 'view': int,
        }

        # All rows (train and test together)
        self.kc_house_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_data.csv',
            dtype=dtype_dict)

        # Training rows only
        self.kc_house_train_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv',
            dtype=dtype_dict)

        # Test rows only
        self.kc_test_frames = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv',
            dtype=dtype_dict)

        # Shuffled train and validation rows
        self.kc_house_train_valid_shuffled = pd.read_csv(
            './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv',
            dtype=dtype_dict)

    def _convert(self, frame, features):
        """Convert ``frame`` to numpy with ``price`` as the output column.

        Args:
            frame: pandas frame to convert.
            features: names of the feature columns.

        Returns:
            Whatever ``convert_to_numpy`` returns; the tests unpack it as
            ``(feature_matrix, output)``.
        """
        # Fresh list objects are passed on every call so the converter never
        # receives a list that an earlier call may have modified.
        # NOTE(review): the trailing 1 is presumably the value of the
        # constant (intercept) column -- confirm against ConvertNumpy.
        return self.convert_numpy.convert_to_numpy(
            frame, list(features), ['price'], 1)

    def _fit(self, optimizer, features, l2_penalty):
        """Run ``optimizer`` on the training frame starting from zero weights.

        Args:
            optimizer: bound RidgeRegression method taking (feature_matrix,
                output, initial_weights, step_size, tolerance, l2_penalty,
                max_iterations).
            features: names of the feature columns.
            l2_penalty: L2 penalty handed to ``optimizer``.

        Returns:
            The value returned by ``optimizer`` (indexed as weights by the
            tests).
        """
        feature_matrix, output = self._convert(
            self.kc_house_train_frame, features)

        # One weight per feature plus one extra, all starting at 0.0
        initial_weights = np.zeros(len(features) + 1)
        step_size = 1e-12
        tolerance = None
        max_iterations = 1000

        return optimizer(feature_matrix, output, initial_weights, step_size,
                         tolerance, l2_penalty, max_iterations)

    def _evaluate(self, features, weights):
        """Predict on the test frame and compute the residual sum of squares.

        Args:
            features: names of the feature columns.
            weights: weights used for the prediction.

        Returns:
            Tuple ``(rss, predicted_output, test_output)``.
        """
        test_feature_matrix, test_output = self._convert(
            self.kc_test_frames, features)
        predicted_output = self.predict_output.predict_output_linear_regression(
            test_feature_matrix, weights)
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(
            test_output, predicted_output)
        return rss, predicted_output, test_output

    def test_01_gradient_descent_no_penalty(self):
        """Gradient descent on sqft_living with an L2 penalty of 0."""
        features = ['sqft_living']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 0.0)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(-0.16311351478746433, 5), round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        """Gradient descent on sqft_living with an L2 penalty of 1e11."""
        features = ['sqft_living']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 1e11)
        rss, _, _ = self._evaluate(features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(0.048718475774044, 5), round(final_weights[0], 5))
        self.assertEqual(round(124.57402057376679, 3), round(final_weights[1], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(694654309578537.25, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        """Gradient descent on two features with an L2 penalty of 1e11."""
        features = ['sqft_living', 'sqft_living15']
        final_weights = self._fit(
            self.ridge_regression.gradient_descent, features, 1e11)
        rss, predicted_output, test_output = self._evaluate(
            features, final_weights)

        # Weights match the recorded values
        self.assertEqual(round(0.033601165521060711, 5), round(final_weights[0], 5))
        self.assertEqual(round(91.490167574878328, 3), round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3), round(final_weights[2], 3))

        # RSS on the test frame matches the recorded value
        self.assertEqual(round(500408530236718.31, 0), round(rss, 0))

        # First prediction, and the first true price (310000) in the test set
        self.assertEqual(round(270449.70602770313, 3), round(predicted_output[0], 3))
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        """Pick the best L2 penalty with 10-fold cross validation."""
        features = ['sqft_living']
        output = ['price']

        # A single weights array is shared by every model_parameters dict
        initial_weights = np.array([0., 0.])
        step_size = 1e-12
        tolerance = None
        max_iterations = 1000
        folds = 10

        cross_validation_results = []

        # L2 penalties 10^1, 10^2, ..., 10^11
        for l2_penalty in np.logspace(1, 11, num=11):
            model_parameters = {
                'step_size': step_size,
                'max_iteration': max_iterations,
                'initial_weights': initial_weights,
                'tolerance': tolerance,
                'l2_penalty': l2_penalty,
            }

            cross_validation = self.k_fold_cross_validation.k_fold_cross_validation(
                folds, self.kc_house_train_frame,
                self.ridge_regression.gradient_descent,
                model_parameters, output, features)

            cross_validation_results.append((l2_penalty, cross_validation))

        # Entry with the smallest cross validation value
        lowest = min(cross_validation_results, key=lambda result: result[1])

        # 10000000 is the penalty with the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # The lowest cross validation error matches the recorded value
        self.assertEqual(round(120916225812152.84, 0), round(lowest[1], 0))

    def test_05_hill_climbing(self):
        """Hill climbing on sqft_living with an L2 penalty of 0."""
        final_weights = self._fit(
            self.ridge_regression.hill_climbing, ['sqft_living'], 0.0)

        # Weights match the recorded values
        self.assertEqual(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))