def setUp(self):
    """Prepare the fixtures used by the k-nearest-neighbor regression tests.

    Instantiates the helper objects and reads the full, train, test and validation
    housing CSV files into pandas frames.

    """
    # Helper objects exercised by the tests
    self.convert_numpy = ConvertNumpy()
    self.normalize_features = NormalizeFeatures()
    self.knn = KNearestNeighborRegression()
    self.euclidean_distance = EuclideanDistance()
    self.determine_k_knn = DetermineKKnn()

    # Column types handed to pandas; 'floors' is read as str and converted further below
    dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                  'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
                  'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
                  'floors': str, 'condition': int, 'lat': float, 'date': str,
                  'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int}

    # All four CSV files live in the same directory
    data_dir = './unit_tests/test_data/regression/kc_house_knn/'

    # Full data set
    self.kc_house = pd.read_csv(data_dir + 'kc_house_data_small.csv', dtype=dtype_dict)

    # Training data only
    self.kc_house_train = pd.read_csv(data_dir + 'kc_house_data_small_train.csv', dtype=dtype_dict)

    # Test data only
    self.kc_house_test = pd.read_csv(data_dir + 'kc_house_data_small_test.csv', dtype=dtype_dict)

    # Validation data only
    self.kc_house_valid = pd.read_csv(data_dir + 'kc_house_data_validation.csv', dtype=dtype_dict)

    # 'floors' was read as str: cast each frame's own column to float first, then to int
    for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
        frame['floors'] = frame['floors'].astype(float).astype(int)
def setUp(self):
    """Prepare the fixtures used by the lasso regression tests.

    Instantiates the helper objects and reads the full, train and test housing CSV files
    into pandas frames.

    """
    self.convert_numpy = ConvertNumpy()
    self.normalize_features = NormalizeFeatures()
    self.lasso = LassoRegression()
    self.predict_output = PredictOutput()
    self.residual_sum_squares = ResidualSumSquares()
    self.k_fold_cross_validation = KFoldCrossValidation()

    # Create a dictionary type to store relevant data types so that our pandas
    # will read the correct information ('floors' is read as str and converted below)
    dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                  'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
                  'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
                  'floors': str, 'condition': int, 'lat': float, 'date': str,
                  'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int}

    # Create a kc_house that encompasses all test and train data
    self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                dtype=dtype_dict)

    # Create a kc_house_train that encompasses only train data
    self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                      dtype=dtype_dict)

    # Create a kc_house_test that encompasses only test data
    self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                     dtype=dtype_dict)

    # Convert 'floors' to float and then to int. Every frame must convert its OWN column:
    # assigning the full frame's column into the train/test frames would give them
    # floors values that belong to rows of a different data set.
    for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
        frame['floors'] = frame['floors'].astype(float).astype(int)
class TestLassoRegression(unittest.TestCase):
    """Tests for LassoRegression.

    Uses housing data to test LassoRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Prepare the fixtures used by the lasso regression tests.

        Instantiates the helper objects and reads the full, train and test housing CSV files
        into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.lasso = LassoRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information ('floors' is read as str and converted below)
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
                      'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
                      'floors': str, 'condition': int, 'lat': float, 'date': str,
                      'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                    dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # Convert 'floors' to float and then to int. Every frame must convert its OWN column:
        # assigning the full frame's column into the train/test frames would give them
        # floors values that belong to rows of a different data set.
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def test_01_normalize_features(self):
        """Tests normalizing features.

        Test normalization features, and compare it with known values.

        """
        # Normalize the features, and also return the norms
        features, norms = self.normalize_features.l2_norm(np.array([[3., 6., 9.], [4., 8., 12.]]))

        # Assert that the normalized features match the known values
        self.assertTrue(np.array_equal(np.array([[0.6, 0.6, 0.6], [0.8, 0.8, 0.8]]), features))

        # Assert that the norms match the known values
        self.assertTrue(np.array_equal(np.array([5., 10., 15.]), norms))

    def test_02_compute_ro(self):
        """Test compute ro.

        Test compute one round of ro.

        """
        # We will use sqft_living and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        # NOTE(review): the trailing 1 is presumably the constant column value -- confirm in ConvertNumpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        weights = np.array([1., 4., 1.])

        # Compute ro_j
        ro_j = self.lasso.compute_ro_j(normalized_feature_matrix, output, weights)

        # Assert the output of ro_j
        self.assertTrue(np.allclose(ro_j, np.array([79400300.03492916, 87939470.77299108, 80966698.67596565])))

    def test_03_compute_coordinate_descent_step(self):
        """Test one coordinate descent step.

        Test one coordinate descent step and compare it with known values.

        """
        # Run a single coordinate descent step on a tiny hand-built problem
        step = self.lasso.lasso_coordinate_descent_step(
            {"i": 1, "weights": np.array([1., 4.])},
            np.array([[3./math.sqrt(13), 1./math.sqrt(10)],
                      [2./math.sqrt(13), 3./math.sqrt(10)]]),
            np.array([1., 1.]),
            {"l1_penalty": 0.1})

        # Assert that both are equal
        self.assertEqual(round(step, 8), round(0.425558846691, 8))

    def test_04_coordinate_descent(self):
        """Test coordinate descent.

        Test coordinate descent and compare with known values.

        """
        # We will use sqft_living and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        initial_weights = np.zeros(3)

        # Set l1 penalty
        l1_penalty = 1e7

        # Set tolerance
        tolerance = 1.0

        # Compute the weights using coordinate descent
        weights = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,
                                                               {"initial_weights": initial_weights,
                                                                "l1_penalty": l1_penalty,
                                                                "tolerance": tolerance})

        # Assert that these two numpy arrays are close. np.allclose must use its default
        # tolerances: a third positional argument is rtol, and rtol=True (1.0) would accept
        # almost any weights.
        self.assertTrue(np.allclose(weights, np.array([21624998.3663629, 63157246.78545423, 0.])))

        # Predict the output
        predicted_output = self.predict_output.regression(normalized_feature_matrix, weights)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(output, predicted_output),
                               -10),
                         round(1.63049248148e+15, -10))

    def test_05_coordinate_descent_with_normalization(self):
        """Test coordinate descent with normalization.

        Test coordinate descent and then normalize the result, so that we can use the weights on a test set.

        """
        # We will use multiple features
        features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view',
                    'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']

        # Output will use price
        output = ['price']

        # Convert our train pandas frame to numpy; fresh list copies are passed so the
        # same names can be reused for the test frame below
        feature_matrix, train_output = self.convert_numpy.convert_to_numpy(self.kc_house_train, list(features),
                                                                           list(output), 1)

        # Normalize the training features and keep the norms to rescale the weights later
        normalized_feature_matrix, norms = self.normalize_features.l2_norm(feature_matrix)

        # Compute Multiple Weights (one weight per feature plus one extra entry)
        weights1e7 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e7,
                                                                   "tolerance": 1})
        weights1e8 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e8,
                                                                   "tolerance": 1})
        weights1e4 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e4,
                                                                   "tolerance": 5e5})

        # Divide the weights by the norms so they can be applied to un-normalized features
        normalized_weights1e4 = weights1e4 / norms
        normalized_weights1e7 = weights1e7 / norms
        normalized_weights1e8 = weights1e8 / norms

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, list(features),
                                                                               list(output), 1)

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e4)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.2778100476e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e7)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.75962079909e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e8)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(5.37049248148e+14, -12))
class TestKNearestNeighborRegression(unittest.TestCase):
    """Tests for KNearestNeighborRegression.

    Uses housing data to test KNearestNeighborRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.
        _FEATURE_LIST (tuple of str): Feature columns converted to numpy in every test.
        _OUTPUT (tuple of str): Output column converted to numpy in every test.

    """

    _multiprocess_can_split_ = True

    _FEATURE_LIST = ('bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view',
                     'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
                     'lat', 'long', 'sqft_living15', 'sqft_lot15')

    _OUTPUT = ('price',)

    def setUp(self):
        """Prepare the fixtures used by the k-nearest-neighbor regression tests.

        Instantiates the helper objects and reads the full, train, test and validation
        housing CSV files into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.knn = KNearestNeighborRegression()
        self.euclidean_distance = EuclideanDistance()
        self.determine_k_knn = DetermineKKnn()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information ('floors' is read as str and converted below)
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float,
                      'zipcode': str, 'long': float, 'sqft_lot15': float, 'sqft_living': float,
                      'floors': str, 'condition': int, 'lat': float, 'date': str,
                      'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small.csv',
                                    dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/'
                                          'kc_house_data_small_train.csv', dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_small_test.csv',
                                         dtype=dtype_dict)

        # Create a kc_house_valid that encompasses only validation data
        self.kc_house_valid = pd.read_csv('./unit_tests/test_data/regression/kc_house_knn/kc_house_data_validation.csv',
                                          dtype=dtype_dict)

        # 'floors' was read as str: cast each frame's own column to float first, then to int
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test, self.kc_house_valid):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def _normalized_train_and(self, other_frame):
        """Convert the training frame and another frame to numpy and normalize both.

        The training features are normalized with l2_norm, and the other frame's features are
        divided by the norms computed from the training features.

        Args:
            other_frame (pandas.DataFrame): Frame (test or validation) to convert alongside the training frame.

        Returns:
            tuple: (features_train, output_train, features_other, output_other), where both feature
                matrices are normalized with the training norms.

        """
        # Fresh list copies are passed on every call so the class-level tuples are never shared
        features_train, output_train = self.convert_numpy.convert_to_numpy(self.kc_house_train,
                                                                           list(self._FEATURE_LIST),
                                                                           list(self._OUTPUT), 1)
        features_other, output_other = self.convert_numpy.convert_to_numpy(other_frame,
                                                                           list(self._FEATURE_LIST),
                                                                           list(self._OUTPUT), 1)

        # Normalize our training features, and then scale the other set with the same norms
        features_train, norms = self.normalize_features.l2_norm(features_train)
        features_other = features_other / norms

        return features_train, output_train, features_other, output_other

    def test_01_compute_euclidean_distance(self):
        """Tests Euclidean distance.

        Tests Euclidean distance and compare it with known values.

        """
        # Extract and normalize features for the train and test set
        features_train, _, features_test, _ = self._normalized_train_and(self.kc_house_test)

        # Compute the euclidean distance
        distance = self.euclidean_distance.euclidean_distance(features_test[0], features_train[9])

        # Assert that both are equal
        self.assertEqual(round(distance, 3), round(0.059723593716661257, 3))

    def test_02_compute_euclidean_distance_query_point(self):
        """Tests Euclidean distance with a set of query points.

        Test to compute euclidean distance from a query point to multiple points in the training set

        """
        # Extract and normalize features and output for the train and test set
        features_train, output_train, features_test, _ = self._normalized_train_and(self.kc_house_test)

        # Determine the smallest euclidean distance and the index of the training point it belongs to;
        # min returns the first minimum, i.e. the lowest index on ties
        distances = self.euclidean_distance.euclidean_distance_cmp_one_value(features_train, features_test[2])
        smallest_index, smallest = min(enumerate(distances), key=lambda pair: pair[1])

        # Assert that we are getting the right prediction (for 1-NN neighbor)
        self.assertEqual(round(smallest, 8), round(0.00286049526751, 8))
        self.assertEqual(output_train[smallest_index], 249000)
        self.assertEqual(smallest_index, 382)

    def test_03_compute_knn(self):
        """Tests knn regression algorithm.

        Tests the knn algorithm and compare it with known values.

        """
        # Extract and normalize features and output for the train and test set
        features_train, output_train, features_test, _ = self._normalized_train_and(self.kc_house_test)

        # Assert the indices of the 4 nearest neighbors of the 3rd house in features_test
        self.assertTrue(np.array_equal(self.knn.k_nearest_neighbor_regression(4, features_train, features_test[2]),
                                       np.array([382, 1149, 4087, 3142])))

        # Assert that the 413987.5 is the correct prediction
        self.assertEqual(self.knn.predict_k_nearest_neighbor_regression(4, features_train, output_train,
                                                                        features_test[2]),
                         413987.5)

        # Compute the lowest predicted value among the first 10 test houses, and its index
        predictions = self.knn.predict_k_nearest_neighbor_all_regression(10, features_train, output_train,
                                                                         features_test[0:10])
        lowest_predicted_index, lowest_predicted = min(enumerate(predictions), key=lambda pair: pair[1])

        # Assert that the few values such as lowest predicted values and index are the one we expect
        self.assertEqual(lowest_predicted, 350032.0)
        self.assertEqual(lowest_predicted_index, 6)

    def test_03_compute_best_k(self):
        """Compute best K for KNN Regression.

        Compute best K using the validation set.

        """
        # Extract and normalize features and output for the train and validation set
        features_train, output_train, features_valid, output_valid = \
            self._normalized_train_and(self.kc_house_valid)

        # Compute the lowest K and lowest K's RSS
        low_rss, low_idx = self.determine_k_knn.determine_k_knn(self.knn.predict_k_nearest_neighbor_all_regression,
                                                                1, 16,
                                                                {"features_train": features_train,
                                                                 "features_valid": features_valid,
                                                                 "output_train": output_train,
                                                                 "output_valid": output_valid})

        # Assert that the lowest k and rss is correct
        self.assertEqual(round(low_rss, -13), round(6.73616787355e+13, -13))
        self.assertEqual(low_idx, 8)