示例#1
0
    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression; builds the helper objects and
        #       reads the kc_house CSV files into pandas frames.
        # Arguments:
        #       None

        # Helper objects used by the tests
        self.convert_numpy = ConvertNumpy()
        self.linear_regression = LinearRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

        # Column names grouped by the type pandas should read them as
        float_columns = ('bathrooms', 'sqft_living15', 'price', 'bedrooms',
                         'long', 'sqft_lot15', 'sqft_living', 'lat')
        int_columns = ('waterfront', 'sqft_above', 'grade', 'yr_renovated',
                       'condition', 'sqft_basement', 'yr_built', 'sqft_lot',
                       'view')
        str_columns = ('zipcode', 'floors', 'date', 'id')

        # Flatten the groups into the {column: type} mapping read_csv expects
        dtype_dict = {}
        for column_type, columns in ((float, float_columns),
                                     (int, int_columns),
                                     (str, str_columns)):
            for column in columns:
                dtype_dict[column] = column_type

        def load_frame(csv_path):
            # Every file is read with the same column types
            return pd.read_csv(csv_path, dtype=dtype_dict)

        # All data (train and test together)
        self.kc_house_frame = load_frame(
            './unit_tests/test_data/kc_house/kc_house_data.csv')

        # Train data only
        self.kc_house_train_frame = load_frame(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv')

        # Test data only
        self.kc_test_frames = load_frame(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv')
示例#2
0
    def __init__(self):
        # Usage:
        #       Constructor for KFoldCrossValidation. Stores a ConvertNumpy instance (used to
        #       convert pandas data to numpy), a PredictOutput instance and a
        #       ResidualSumSquares instance on the object.
        # Arguments:
        #       None

        # The helpers are constructed left to right and bound in one statement
        self.convert_numpy, self.predict_output, self.residual_sum_squares = (
            ConvertNumpy(), PredictOutput(), ResidualSumSquares())
    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression; creates the helper objects and loads
        #       the kc_house CSV files.
        # Arguments:
        #       None

        # Helper objects: numpy conversion, the model, prediction and RSS
        self.convert_numpy = ConvertNumpy()
        self.linear_regression = LinearRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

        # Build the column -> type mapping one type at a time so that pandas
        # reads every column with the intended type
        dtype_dict = dict.fromkeys(
            ['bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long', 'sqft_lot15', 'sqft_living', 'lat'],
            float)
        dtype_dict.update(dict.fromkeys(
            ['waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition', 'sqft_basement', 'yr_built',
             'sqft_lot', 'view'],
            int))
        dtype_dict.update(dict.fromkeys(['zipcode', 'floors', 'date', 'id'], str))

        # Attribute name -> CSV file: all data, train data only, test data only
        csv_sources = (
            ('kc_house_frame', './unit_tests/test_data/kc_house/kc_house_data.csv'),
            ('kc_house_train_frame', './unit_tests/test_data/kc_house/kc_house_train_data.csv'),
            ('kc_test_frames', './unit_tests/test_data/kc_house/kc_house_test_data.csv'),
        )

        # Read each file with the same dtypes and store it on the test case
        for attribute_name, csv_path in csv_sources:
            setattr(self, attribute_name, pd.read_csv(csv_path, dtype=dtype_dict))
示例#4
0
    def __init__(self):
        """Constructor for DetermineKKnn.

        Creates a ResidualSumSquares instance and stores it on ``self.residual_sum_squares``.

        """
        rss_helper = ResidualSumSquares()
        self.residual_sum_squares = rss_helper
    def __init__(self):
        """Constructor for KFoldCrossValidation.

        Creates the three helper objects kept on the instance: ``convert_numpy`` (ConvertNumpy),
        ``predict_output`` (PredictOutput) and ``residual_sum_squares`` (ResidualSumSquares).

        """
        # Constructed left to right, then bound in a single statement
        self.convert_numpy, self.predict_output, self.residual_sum_squares = (
            ConvertNumpy(), PredictOutput(), ResidualSumSquares())
    def setUp(self):
        """Constructor for TestLassoRegression.

        Creates the helper objects, loads the full, training and testing housing data from CSV, and
        converts the 'floors' column of every frame from str to int.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.lasso = LassoRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # All data (train and test together)
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Train data only
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Test data only
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # 'floors' is read as str, so it goes through float before being cast to int.
        # Each frame converts its OWN column: assigning another frame's Series would be
        # aligned on the row index and would pull in values from unrelated rows.
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
            frame['floors'] = frame['floors'].astype(float).astype(int)
    def setUp(self):
        """Constructor for TestRidgeRegression.

        Creates the helper objects and reads the housing CSV files: the full data set, the training
        split, the testing split, and the shuffled train/validation set.

        """
        self.convert_numpy = ConvertNumpy()
        self.ridge_regression = RidgeRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Column names grouped by the type pandas should read them as
        columns_by_type = (
            (float, ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long', 'sqft_lot15', 'sqft_living',
                     'lat')),
            (int, ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition', 'sqft_basement',
                   'yr_built', 'sqft_lot', 'view')),
            (str, ('zipcode', 'floors', 'date', 'id')),
        )
        dtype_dict = {}
        for column_type, column_names in columns_by_type:
            for column_name in column_names:
                dtype_dict[column_name] = column_type

        # Every file is read with the same keyword arguments
        read_options = {'dtype': dtype_dict}

        # All data (train and test together)
        self.kc_house = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_data.csv', **read_options)

        # Train data only
        self.kc_house_train = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_train_data.csv', **read_options)

        # Test data only
        self.kc_house_test = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_test_data.csv', **read_options)

        # Train and validation data together, shuffled
        self.kc_house_train_valid_shuffled = pd.read_csv(
            './unit_tests/test_data/regression/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv',
            **read_options)
class TestRidgeRegression(unittest.TestCase):
    #   Usage:
    #       Tests for the Ridge Regression Class.

    # Optimiser settings shared by every gradient descent / hill climbing run below
    _STEP_SIZE = 1e-12
    _MAX_ITERATIONS = 1000
    _TOLERANCE = None

    def setUp(self):
        # Usage:
        #       Constructor for TestRidgeRegression; creates the helper objects and loads
        #       the kc_house CSV files used by the tests.
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Ridge Regression class
        self.ridge_regression = RidgeRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create an instance of the K Fold Cross Validation Class
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {
            'bathrooms': float,
            'waterfront': int,
            'sqft_above': int,
            'sqft_living15': float,
            'grade': int,
            'yr_renovated': int,
            'price': float,
            'bedrooms': float,
            'zipcode': str,
            'long': float,
            'sqft_lot15': float,
            'sqft_living': float,
            'floors': str,
            'condition': int,
            'lat': float,
            'date': str,
            'sqft_basement': int,
            'yr_built': int,
            'id': str,
            'sqft_lot': int,
            'view': int
        }

        # All data (train and test together)
        self.kc_house_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_data.csv',
            dtype=dtype_dict)

        # Train data only
        self.kc_house_train_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv',
            dtype=dtype_dict)

        # Test data only
        self.kc_test_frames = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv',
            dtype=dtype_dict)

        # Train and validation data together, shuffled
        self.kc_house_train_valid_shuffled = pd.read_csv(
            './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv',
            dtype=dtype_dict)

    def _to_numpy(self, frame, features):
        # Usage:
        #       Converts a pandas frame to numpy with 'price' as the output column
        # Arguments:
        #       frame    (pandas frame) : data to convert
        #       features (list)         : names of the feature columns
        # Return:
        #       whatever convert_to_numpy returns; the tests unpack it as
        #       (feature_matrix, output)

        # NOTE(review): the trailing 1 is passed through exactly as the tests always
        # did; presumably it is the constant column value - confirm in ConvertNumpy
        return self.convert_numpy.convert_to_numpy(frame, features, ['price'], 1)

    def _fit_on_train(self, optimizer, features, initial_weights, l2_penalty):
        # Usage:
        #       Runs a ridge regression optimiser on the train frame with the shared settings
        # Arguments:
        #       optimizer       (callable)    : self.ridge_regression.gradient_descent or
        #                                       self.ridge_regression.hill_climbing
        #       features        (list)        : names of the feature columns
        #       initial_weights (numpy array) : starting weights
        #       l2_penalty      (float)       : L2 penalty
        # Return:
        #       the final weights returned by the optimiser

        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, features)
        return optimizer(feature_matrix, output, initial_weights,
                         self._STEP_SIZE, self._TOLERANCE, l2_penalty,
                         self._MAX_ITERATIONS)

    def _evaluate_on_test(self, features, final_weights):
        # Usage:
        #       Predicts the test frame with the given weights and computes the RSS
        # Arguments:
        #       features      (list)        : names of the feature columns
        #       final_weights (numpy array) : weights to predict with
        # Return:
        #       (test_output, predicted_output, rss)

        test_feature_matrix, test_output = self._to_numpy(self.kc_test_frames, features)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(
            test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(
            test_output, predicted_output)

        return test_output, predicted_output, rss

    def test_01_gradient_descent_no_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with no penalty (l2_penalty = 0.0)
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        final_weights = self._fit_on_train(
            self.ridge_regression.gradient_descent, features,
            np.array([0., 0.]), 0.0)

        _, _, rss = self._evaluate_on_test(features, final_weights)

        # Assert that the weights is correct
        self.assertEqual(round(-0.16311351478746433, 5),
                         round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3),
                         round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with high penalty
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        final_weights = self._fit_on_train(
            self.ridge_regression.gradient_descent, features,
            np.array([0., 0.]), 1e11)

        _, _, rss = self._evaluate_on_test(features, final_weights)

        # Assert that the weights is correct
        self.assertEqual(round(0.048718475774044, 5),
                         round(final_weights[0], 5))
        self.assertEqual(round(124.57402057376679, 3),
                         round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(694654309578537.25, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with high penalty and two features
        # Arguments:
        #       None

        # We will use sqft_living, and sqft_living15
        features = ['sqft_living', 'sqft_living15']

        final_weights = self._fit_on_train(
            self.ridge_regression.gradient_descent, features,
            np.array([0.0, 0.0, 0.0]), 1e11)

        test_output, predicted_output, rss = self._evaluate_on_test(
            features, final_weights)

        # Assert that the weights is correct
        self.assertEqual(round(0.033601165521060711, 5),
                         round(final_weights[0], 5))
        self.assertEqual(round(91.490167574878328, 3),
                         round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3),
                         round(final_weights[2], 3))

        # Assert that rss is correct
        self.assertEqual(round(500408530236718.31, 0), round(rss, 0))

        # Look at the first predicted output
        self.assertEqual(round(270449.70602770313, 3),
                         round(predicted_output[0], 3))

        # The first output should be 310000 in the test set
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        # Usage:
        #       Tests best l2_penalty for ridge regression using gradient descent
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Create our initial weights
        initial_weights = np.array([0., 0.])

        # Number of Folds
        folds = 10

        # Store Cross Validation results as (l2_penalty, cross validation error)
        cross_validation_results = []

        # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11]
        for l2_penalty in np.logspace(1, 11, num=11):

            # Create a dictionary of model_parameters
            model_parameters = {
                'step_size': self._STEP_SIZE,
                'max_iteration': self._MAX_ITERATIONS,
                'initial_weights': initial_weights,
                'tolerance': self._TOLERANCE,
                'l2_penalty': l2_penalty
            }

            # Compute the cross validation results
            cross_validation = self.k_fold_cross_validation.k_fold_cross_validation(
                folds, self.kc_house_train_frame,
                self.ridge_regression.gradient_descent, model_parameters,
                output, features)

            # Append it into the results
            cross_validation_results.append((l2_penalty, cross_validation))

        # Lowest Result
        lowest = sorted(cross_validation_results, key=lambda x: x[1])[0]

        # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # Assert True that is the lowest cross validation error
        self.assertEqual(round(120916225812152.84, 0), round(lowest[1], 0))

    def test_05_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        final_weights = self._fit_on_train(
            self.ridge_regression.hill_climbing, features,
            np.array([0., 0.]), 0.0)

        # Assert that the weights is correct
        self.assertEqual(round(-7.7535764461428101e+70, -68),
                         round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70),
                         round(final_weights[1], -70))
class TestLinearRegression(unittest.TestCase):
    #   Usage:
    #       Tests for the Linear Regression Class.

    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression; creates the helper objects and loads
        #       the kc_house CSV files used by the tests.
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Linear Regression class
        self.linear_regression = LinearRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # All data (train and test together)
        self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Train data only
        self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv',
                                                dtype=dtype_dict)

        # Test data only
        self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv',
                                          dtype=dtype_dict)

    def test_01_gradient_descent(self):
        # Usage:
        #       Tests the result on gradient descent
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                initial_weights, step_size,
                                                                tolerance)

        # Assert that the weights is correct
        self.assertEqual(round(-46999.887165546708, 3), round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3), round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        # Usage:
        #       Computes gradient descent on multiple input, and computes predicted model and RSS
        # Arguments:
        #       None

        # We will use sqft_living, and sqft_living15 for both training and testing
        features = ['sqft_living', 'sqft_living15']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                initial_weights, step_size,
                                                                tolerance)

        # Convert our test pandas frame to numpy, with the same features and price as output
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_test_frames, features,
                                                                               ['price'], 1)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output)

        # Assert that rss is correct
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will be price
        output = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train_frame, features, output, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.hill_climbing(feature_matrix, output,
                                                             initial_weights, step_size,
                                                             tolerance)

        # Assert that the weights is correct
        self.assertEqual(round(-47000.142201335177, 3), round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3), round(final_weights[1], 3))
示例#10
0
class DetermineKKnn:

    """Computes the best K for KNN algorithms.

    Computes the best K for KNN algorithms by finding the K that has the lowest RSS.

    Attributes:
        residual_sum_squares (ResidualSumSquares): Class to compute residual sum of squares.

    """

    def __init__(self):
        """Constructor for DetermineKKnn.

        Creates the ResidualSumSquares instance used to score each candidate K.

        """
        self.residual_sum_squares = ResidualSumSquares()

    def determine_k_knn(self, knn_model, start_k, end_k, train_valid_data):
        """Determines the best K value for knn algorithms.

        Every K in ``range(start_k, end_k)`` is evaluated (``end_k`` itself is NOT evaluated) and the K whose
        predictions give the lowest RSS against the validation outputs is returned. On a tie the smallest K
        wins, because only a strictly lower RSS replaces the current best.

        Args:
            knn_model (function): A function that is called as
                knn_model(k, features_train, output_train, features_valid) and returns the predictions for
                features_valid.
            start_k (int): First k value to evaluate (inclusive).
            end_k (int): Ending k value (exclusive).
            train_valid_data (dict): A dictionary that stores training and validation data,
                {
                    features_train (numpy.matrix): A matrix of training points,
                    features_valid (numpy.matrix): A matrix of validation points,
                    output_train  (numpy.array): Outputs for training data,
                    output_valid  (numpy.array): Outputs for validation data.
                }
        Returns:
            A tuple of lowest_k and lowest_k_index:
                (
                    lowest_k (float): Best k value's RSS.
                    lowest_k_index (int): Best k value.
                )
            If no k is evaluated (empty range), or no RSS compares lower than infinity,
            (float("inf"), 0) is returned.

        """
        # Start from +infinity so that any finite RSS, however large, can become the best one
        lowest_k = float("inf")

        # This stores the k that produced the lowest RSS
        lowest_k_index = 0

        # Loop through k from start_k up to, but not including, end_k
        for k in range(start_k, end_k):

            # Use the knn model to compute the predictions for the validation points
            model = knn_model(k, train_valid_data["features_train"], train_valid_data["output_train"],
                              train_valid_data["features_valid"])

            # Compute RSS between the validation outputs and the model's predictions
            rss = self.residual_sum_squares.residual_sum_squares_regression(train_valid_data["output_valid"], model)

            # Strictly lower RSS: remember this k and its RSS
            if rss < lowest_k:
                lowest_k = rss
                lowest_k_index = k

        # Return the best k's RSS and the best k
        return lowest_k, lowest_k_index
class TestRidgeRegression(unittest.TestCase):
    #   Usage:
    #       Tests for the Ridge Regression Class.

    def setUp(self):
        # Usage:
        #       Constructor for TestRidgeRegression. Runs before every test, creates the
        #       helper objects and loads the house data sets from CSV.
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Ridge Regression class
        self.ridge_regression = RidgeRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create an instance of the K Fold Cross Validation Class
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house_frame that encompasses all test and train data
        self.kc_house_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Create a kc_house_train_frame that encompasses only train data
        self.kc_house_train_frame = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_train_data.csv',
                                                dtype=dtype_dict)

        # Create a kc_test_frames that encompasses only test data
        self.kc_test_frames = pd.read_csv('./unit_tests/test_data/kc_house/kc_house_test_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled
        self.kc_house_train_valid_shuffled = pd.read_csv(
            './unit_tests/test_data/kc_house_with_validation_k_fold/wk3_kc_house_train_valid_shuffled.csv',
            dtype=dtype_dict)

    def _to_numpy(self, frame, features):
        # Usage:
        #       Converts a pandas frame to the (feature_matrix, output) pair used by the tests,
        #       always with 'price' as the output column.
        # Arguments:
        #       frame    (pandas.DataFrame) : frame to convert
        #       features (list of str)      : names of the feature columns
        # Return:
        #       Whatever ConvertNumpy.convert_to_numpy returns, unpacked by callers as
        #       (feature_matrix, output)

        # Fresh lists are handed over on every call so the converter never sees a list that
        # an earlier call may have touched. The trailing 1 is passed exactly as the tests
        # always did (presumably it asks for a constant/intercept column -- confirm against
        # ConvertNumpy).
        return self.convert_numpy.convert_to_numpy(frame, list(features), ['price'], 1)

    def _train_and_score(self, features, initial_weights, l2_penalty,
                         step_size=1e-12, tolerance=None, max_iterations=1000):
        # Usage:
        #       Fits ridge regression with gradient descent on the train frame, then predicts
        #       and computes RSS on the test frame.
        # Arguments:
        #       features        (list of str)  : names of the feature columns
        #       initial_weights (numpy.array)  : starting weights
        #       l2_penalty      (float)        : L2 penalty
        #       step_size       (float)        : step size
        #       tolerance       (float or None): tolerance
        #       max_iterations  (int)          : max iterations to run
        # Return:
        #       A tuple of (final_weights, test_output, predicted_output, rss)

        # Convert our train pandas frame to numpy
        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, features)

        # Compute our gradient descent value
        final_weights = self.ridge_regression.gradient_descent(feature_matrix, output,
                                                               initial_weights, step_size,
                                                               tolerance, l2_penalty, max_iterations)

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self._to_numpy(self.kc_test_frames, features)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(test_feature_matrix,
                                                                                final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(test_output, predicted_output)

        return final_weights, test_output, predicted_output, rss

    def test_01_gradient_descent_no_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with no penalty
        # Arguments:
        #       None

        # Train on sqft_living with an L2 penalty of 0, score on the test set
        final_weights, _, _, rss = self._train_and_score(features=['sqft_living'],
                                                         initial_weights=np.array([0., 0.]),
                                                         l2_penalty=0.0)

        # Assert that the weights are correct
        self.assertEqual(round(-0.16311351478746433, 5), round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3), round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with high penalty
        # Arguments:
        #       None

        # Train on sqft_living with an L2 penalty of 1e11, score on the test set
        final_weights, _, _, rss = self._train_and_score(features=['sqft_living'],
                                                         initial_weights=np.array([0., 0.]),
                                                         l2_penalty=1e11)

        # Assert that the weights are correct
        self.assertEqual(round(0.048718475774044, 5), round(final_weights[0], 5))
        self.assertEqual(round(124.57402057376679, 3), round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(694654309578537.25, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        # Usage:
        #       Tests the result on gradient descent with multiple features and high penalty
        # Arguments:
        #       None

        # Train on sqft_living and sqft_living15 with an L2 penalty of 1e11, score on the test set
        final_weights, test_output, predicted_output, rss = self._train_and_score(
            features=['sqft_living', 'sqft_living15'],
            initial_weights=np.array([0.0, 0.0, 0.0]),
            l2_penalty=1e11)

        # Assert that the weights are correct
        self.assertEqual(round(0.033601165521060711, 5), round(final_weights[0], 5))
        self.assertEqual(round(91.490167574878328, 3), round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3), round(final_weights[2], 3))

        # Assert that rss is correct
        self.assertEqual(round(500408530236718.31, 0), round(rss, 0))

        # Look at the first predicted output
        self.assertEqual(round(270449.70602770313, 3), round(predicted_output[0], 3))

        # The first output should be 310000 in the test set
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        # Usage:
        #       Tests best l2_penalty for ridge regression using gradient descent
        # Arguments:
        #       None

        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Create our initial weights
        initial_weights = np.array([0., 0.])

        # Step size
        step_size = 1e-12

        # Tolerance
        tolerance = None

        # Max Iterations to Run
        max_iterations = 1000

        # Number of Folds
        folds = 10

        # Store Cross Validation results as (l2_penalty, cross validation error) pairs
        cross_validation_results = []

        # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11]
        for l2_penalty in np.logspace(1, 11, num=11):

            # Create a dictionary of model_parameters
            model_parameters = {'step_size': step_size,
                                'max_iteration': max_iterations,
                                'initial_weights': initial_weights,
                                'tolerance': tolerance,
                                'l2_penalty': l2_penalty}

            # Compute the cross validation results
            cross_validation = self.k_fold_cross_validation.k_fold_cross_validation(
                folds, self.kc_house_train_frame, self.ridge_regression.gradient_descent,
                model_parameters, output, features)

            # Append it into the results
            cross_validation_results.append((l2_penalty, cross_validation))

        # Lowest Result (min returns the first of equal entries, same as a stable sort's head)
        lowest = min(cross_validation_results, key=lambda x: x[1])

        # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # Assert True that this is the lowest cross validation error
        self.assertEqual(round(120916225812152.84, 0), round(lowest[1], 0))

    def test_05_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # Convert our pandas frame to numpy, using sqft_living for our features
        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, ['sqft_living'])

        # Create our initial weights
        initial_weights = np.array([0., 0.])

        # Step size
        step_size = 1e-12

        # Max Iterations to Run
        max_iterations = 1000

        # Tolerance
        tolerance = None

        # L2 Penalty
        l2_penalty = 0.0

        # Compute our hill climbing value
        final_weights = self.ridge_regression.hill_climbing(feature_matrix, output,
                                                            initial_weights, step_size,
                                                            tolerance, l2_penalty, max_iterations)

        # Assert that the weights are correct
        self.assertEqual(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))
示例#12
0
class TestLinearRegression(unittest.TestCase):
    #   Usage:
    #       Tests for the Linear Regression Class.

    def setUp(self):
        # Usage:
        #       Constructor for TestLinearRegression. Runs before every test, creates the
        #       helper objects and loads the house data sets from CSV.
        # Arguments:
        #       None

        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Linear Regression class
        self.linear_regression = LinearRegression()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {
            'bathrooms': float,
            'waterfront': int,
            'sqft_above': int,
            'sqft_living15': float,
            'grade': int,
            'yr_renovated': int,
            'price': float,
            'bedrooms': float,
            'zipcode': str,
            'long': float,
            'sqft_lot15': float,
            'sqft_living': float,
            'floors': str,
            'condition': int,
            'lat': float,
            'date': str,
            'sqft_basement': int,
            'yr_built': int,
            'id': str,
            'sqft_lot': int,
            'view': int
        }

        # Create a kc_house_frame that encompasses all test and train data
        self.kc_house_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_data.csv',
            dtype=dtype_dict)

        # Create a kc_house_train_frame that encompasses only train data
        self.kc_house_train_frame = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_train_data.csv',
            dtype=dtype_dict)

        # Create a kc_test_frames that encompasses only test data
        self.kc_test_frames = pd.read_csv(
            './unit_tests/test_data/kc_house/kc_house_test_data.csv',
            dtype=dtype_dict)

    def _to_numpy(self, frame, features):
        # Usage:
        #       Converts a pandas frame to the (feature_matrix, output) pair used by the tests,
        #       always with 'price' as the output column.
        # Arguments:
        #       frame    (pandas.DataFrame) : frame to convert
        #       features (list of str)      : names of the feature columns
        # Return:
        #       Whatever ConvertNumpy.convert_to_numpy returns, unpacked by callers as
        #       (feature_matrix, output)

        # Fresh lists are handed over on every call so the converter never sees a list that
        # an earlier call may have touched. The trailing 1 is passed exactly as the tests
        # always did (presumably it asks for a constant/intercept column -- confirm against
        # ConvertNumpy).
        return self.convert_numpy.convert_to_numpy(frame, list(features), ['price'], 1)

    def test_01_gradient_descent(self):
        # Usage:
        #       Tests the result on gradient descent
        # Arguments:
        #       None

        # Convert our pandas frame to numpy, using sqft_living for our features
        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, ['sqft_living'])

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # Assert that the weights are correct
        self.assertEqual(round(-46999.887165546708, 3),
                         round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3),
                         round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        # Usage:
        #   Computes gradient descent on multiple input, and computes predicted model and RSS
        # Arguments:
        #   None

        # We will use sqft_living, and sqft_living15, for both training and testing
        features = ['sqft_living', 'sqft_living15']

        # Convert our train pandas frame to numpy
        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, features)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self._to_numpy(self.kc_test_frames, features)

        # Predict the output of test features
        predicted_output = self.predict_output.predict_output_linear_regression(
            test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_linear_regression(
            test_output, predicted_output)

        # Assert that rss is correct
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_hill_climbing(self):
        # Usage:
        #       Tests the result on hill climbing
        # Arguments:
        #       None

        # Convert our pandas frame to numpy, using sqft_living for our features
        feature_matrix, output = self._to_numpy(self.kc_house_train_frame, ['sqft_living'])

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.hill_climbing(
            feature_matrix, output, initial_weights, step_size, tolerance)

        # Assert that the weights are correct
        self.assertEqual(round(-47000.142201335177, 3),
                         round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3),
                         round(final_weights[1], 3))
示例#13
0
class TestRidgeRegression(unittest.TestCase):

    """Test for RidgeRegression.

    Uses housing data to test RidgeRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestRidgeRegression.

        Runs before every test: creates the helper objects and loads the full, training, testing, and shuffled
        train/validation housing data from CSV.

        """
        self.convert_numpy = ConvertNumpy()
        self.ridge_regression = RidgeRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv',
                                    dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # Create a kc_house_train_valid_shuffled that encompasses both train and valid data and shuffled
        self.kc_house_train_valid_shuffled = pd.read_csv('./unit_tests/test_data/regression/'
                                                         'kc_house_with_validation_k_fold/'
                                                         'wk3_kc_house_train_valid_shuffled.csv',
                                                         dtype=dtype_dict)

    @staticmethod
    def _model_parameters(initial_weights, l2_penalty, step_size=1e-12, tolerance=None, max_iterations=1000):
        """Builds the parameter dictionary handed to the ridge regression solvers.

        Args:
            initial_weights (numpy.array): Starting weights.
            l2_penalty (float): L2 penalty.
            step_size (float): Step size.
            tolerance (float or None): Tolerance.
            max_iterations (int): Max iterations to run.

        Returns:
            dict: A new dictionary with keys initial_weights, step_size, tolerance, l2_penalty, and max_iteration.

        """
        return {"initial_weights": initial_weights,
                "step_size": step_size,
                "tolerance": tolerance,
                "l2_penalty": l2_penalty,
                "max_iteration": max_iterations}

    def _to_numpy(self, frame, features):
        """Converts a pandas frame to numpy with 'price' as the output column.

        Fresh lists are handed to the converter on every call so it never sees a list that an earlier call may
        have touched. The trailing 1 is passed exactly as the tests always did (presumably it asks for a
        constant/intercept column -- confirm against ConvertNumpy).

        Args:
            frame (pandas.DataFrame): Frame to convert.
            features (list of str): Names of the feature columns.

        Returns:
            Whatever ConvertNumpy.convert_to_numpy returns, unpacked by callers as (feature_matrix, output).

        """
        return self.convert_numpy.convert_to_numpy(frame, list(features), ['price'], 1)

    def _train_and_score(self, features, model_parameters):
        """Fits gradient descent on the training data and scores it on the testing data.

        Args:
            features (list of str): Names of the feature columns.
            model_parameters (dict): Parameters passed to RidgeRegression.gradient_descent.

        Returns:
            A tuple of final weights, test outputs, predicted outputs and RSS:
                (
                    final_weights: Weights returned by gradient descent,
                    test_output: Outputs of the test data,
                    predicted_output: Predictions for the test data,
                    rss: Residual sum of squares between test_output and predicted_output.
                )

        """
        # Convert our train pandas frame to numpy
        feature_matrix, output = self._to_numpy(self.kc_house_train, features)

        # Compute our gradient descent value
        final_weights = self.ridge_regression.gradient_descent(feature_matrix, output, model_parameters)

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self._to_numpy(self.kc_house_test, features)

        # Predict the output of test features
        predicted_output = self.predict_output.regression(test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output)

        return final_weights, test_output, predicted_output, rss

    def _gradient_ascent_weights(self, tolerance):
        """Runs gradient ascent on sqft_living with zero initial weights and no L2 penalty.

        Args:
            tolerance (float or None): Tolerance passed to gradient ascent.

        Returns:
            The weights returned by RidgeRegression.gradient_ascent.

        """
        # Convert our pandas frame to numpy
        feature_matrix, output = self._to_numpy(self.kc_house_train, ['sqft_living'])

        # Compute our gradient ascent value
        return self.ridge_regression.gradient_ascent(
            feature_matrix, output,
            self._model_parameters(np.array([0., 0.]), l2_penalty=0.0, tolerance=tolerance))

    def test_01_gradient_descent_no_penalty(self):
        """Tests gradient descent algorithm.

        Tests the result on gradient descent with no penalty.

        """
        # Train on sqft_living with an L2 penalty of 0, score on the test set
        final_weights, _, _, rss = self._train_and_score(
            ['sqft_living'], self._model_parameters(np.array([0., 0.]), l2_penalty=0.0))

        # Assert that the weights are correct
        self.assertEqual(round(-0.16311351478746433, 5), round(final_weights[0], 5))
        self.assertEqual(round(263.02436896538489, 3), round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(275723632153607.72, -5), round(rss, -5))

    def test_02_gradient_descent_high_penalty(self):
        """Tests gradient descent.

        Tests the result on gradient descent with high penalty.

        """
        # Train on sqft_living with an L2 penalty of 1e11, score on the test set
        final_weights, _, _, rss = self._train_and_score(
            ['sqft_living'], self._model_parameters(np.array([0., 0.]), l2_penalty=1e11))

        # Assert that the weights are correct
        self.assertEqual(round(9.7673000000000005, 5), round(final_weights[0], 5))
        self.assertEqual(round(124.572, 3), round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(694642101500000.0, -5), round(rss, -5))

    def test_03_gradient_descent_multiple_high_penalty(self):
        """Tests gradient descent.

        Tests gradient descent with multiple features, and high penalty.

        """
        # Train on sqft_living and sqft_living15 with an L2 penalty of 1e11, score on the test set
        final_weights, test_output, predicted_output, rss = self._train_and_score(
            ['sqft_living', 'sqft_living15'],
            self._model_parameters(np.array([0.0, 0.0, 0.0]), l2_penalty=1e11))

        # Assert that the weights are correct
        self.assertEqual(round(6.7429699999999997, 5), round(final_weights[0], 5))
        self.assertEqual(round(91.489000000000004, 3), round(final_weights[1], 3))
        self.assertEqual(round(78.437490333967176, 3), round(final_weights[2], 3))

        # Assert that rss is correct
        self.assertEqual(round(500404800500842.0, -5), round(rss, -5))

        # Look at the first predicted output
        self.assertEqual(round(270453.53000000003, 3), round(predicted_output[0], 3))

        # The first output should be 310000 in the test set
        self.assertEqual(310000.0, test_output[0])

    def test_04_gradient_descent_k_fold(self):
        """Tests gradient descent with K fold cross validation.

        Tests best l2_penalty for ridge regression using gradient descent.

        """
        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output = ['price']

        # Create our initial weights
        initial_weights = np.array([0., 0.])

        # Step size
        step_size = 1e-12

        # Tolerance
        tolerance = None

        # Max Iterations to Run
        max_iterations = 1000

        # Number of Folds
        folds = 10

        # Store Cross Validation results as (l2_penalty, cross validation error) pairs
        cross_validation_results = []

        # We want to test l2 penalty values in [10^1, 10^2, 10^3, 10^4, ..., 10^11]
        for l2_penalty in np.logspace(1, 11, num=11):

            # Create a dictionary of model_parameters
            model_parameters = {'step_size': step_size,
                                'max_iteration': max_iterations,
                                'initial_weights': initial_weights,
                                'tolerance': tolerance,
                                'l2_penalty': l2_penalty}

            # Compute the cross validation results
            cv = self.k_fold_cross_validation.k_fold_cross_validation(folds, self.ridge_regression.gradient_descent,
                                                                      model_parameters, {"data": self.kc_house_train,
                                                                                         "output": output,
                                                                                         "features": features})

            # Append it into the results
            cross_validation_results.append((l2_penalty, cv))

        # Lowest Result (min returns the first of equal entries, same as a stable sort's head)
        lowest = min(cross_validation_results, key=lambda x: x[1])

        # Assert True that 10000000 is the l2_penalty that gives the lowest cross validation error
        self.assertEqual(10000000.0, lowest[0])

        # Assert True that this is the lowest cross validation error
        self.assertEqual(round(120916225809145.0, 0), round(lowest[1], 0))

    def test_05_gradient_ascent(self):
        """Tests gradient ascent.

        Tests gradient ascent without a tolerance and compare it with known values.

        """
        # Compute our gradient ascent value
        final_weights = self._gradient_ascent_weights(tolerance=None)

        # Assert that the weights are correct
        self.assertEqual(round(-7.7535764461428101e+70, -68), round(final_weights[0], -68))
        self.assertEqual(round(-1.9293745396177612e+74, -70), round(final_weights[1], -70))

    def test_07_gradient_ascent_high_tolerance(self):
        """Tests gradient ascent.

        Tests gradient ascent with a tolerance of 1 and compare it with known values.

        """
        # Compute our gradient ascent value
        final_weights = self._gradient_ascent_weights(tolerance=1)

        # Assert that the weights are correct
        self.assertEqual(0, round(final_weights[0], -68))
        self.assertEqual(0, round(final_weights[1], -70))

    def test_08_gradient_descent_no_penalty_high_tolerance(self):
        """Tests gradient descent algorithm.

        Tests the result on gradient descent with no penalty, a high tolerance, and a larger iteration limit.

        """
        # Train on sqft_living with an L2 penalty of 0, score on the test set
        final_weights, _, _, rss = self._train_and_score(
            ['sqft_living'],
            self._model_parameters(np.array([0., 0.]), l2_penalty=0.0,
                                   tolerance=10000000000, max_iterations=100000))

        # Assert that the weights are correct
        self.assertEqual(round(0.093859999999999999, 5), round(final_weights[0], 5))
        self.assertEqual(round(262.98200000000003, 3), round(final_weights[1], 3))

        # Assert that rss is correct
        self.assertEqual(round(275724298300000.0, -5), round(rss, -5))
class KFoldCrossValidation:

    """Class for K Fold Cross Validation.

    Splits a data set into k contiguous folds. For every fold, a model is trained on all rows outside the fold
    and scored on the fold itself with the residual sum of squares (RSS).

    Attributes:
        convert_numpy (ConvertNumpy): Pandas to Numpy conversion class.
        predict_output (PredictOutput): Output prediction.
        residual_sum_squares (ResidualSumSquares): Computes residual sum of squares.

    """

    def __init__(self):
        """Constructor for KFoldCrossValidation.

        Creates the numpy conversion, output prediction, and residual sum of squares helpers.

        """
        self.convert_numpy = ConvertNumpy()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

    def k_fold_cross_validation(self, k, model, model_parameters, data_parameters):
        """Performs K Fold Cross Validation.

        Takes in our data, and splits the data to smaller subsets, and these smaller subsets are used as validation
        sets, and everything else not included in the validation set is used as training sets. The model will be
        trained using the training set, and the RSS of its predictions on the validation set is accumulated.

        Args:
            k (int): Number of folds, must be at least 1.
            model (callable): Model used for k folds cross validation. It is called as
                model(model_parameters=..., feature_matrix=..., output=...) and must return the weights.
            model_parameters (dict): Model parameters to train the specified model.
            data_parameters (dict): A dictionary of data information:
                {
                    data (pandas.DataFrame): Data used for k folds cross validation,
                    output (str): Output name,
                    features (list of str): A list of feature names.
                }

        Returns:
            float: Average validation error (sum of the per-fold RSS divided by k).

        Raises:
            ValueError: If k is smaller than 1.

        """
        # Fail early with a clear message instead of a ZeroDivisionError (k == 0) or a silent -0.0 (k < 0)
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))

        # Sum of the validation error, will divide by k (fold) later
        validation_error_sum = 0

        # Loop through each fold
        for i in range(k):
            # Computes validation, and training set
            validation_set, training_set = self.create_validation_training_set(data_parameters["data"], k, i)

            # Convert our pandas frame to numpy to create validation set
            # NOTE(review): the trailing 1 is presumably the constant (intercept) column value -- confirm
            # against ConvertNumpy.convert_to_numpy
            validation_set_matrix, validation_output = self.convert_numpy.convert_to_numpy(validation_set,
                                                                                           data_parameters["features"],
                                                                                           data_parameters["output"], 1)

            # Train the model on everything outside the current fold
            final_weights = self.create_weights(model, model_parameters, training_set, data_parameters)

            # Predict the output of the validation features
            predicted_output = self.predict_output.regression(validation_set_matrix,
                                                              final_weights)

            # compute squared error (in other words, rss)
            validation_error_sum += self.residual_sum_squares.residual_sum_squares_regression(validation_output,
                                                                                              predicted_output)

        # Return the validation_error_sum divided by fold
        return validation_error_sum / k

    @staticmethod
    def create_validation_training_set(data, k, iteration):
        """Slice data according to k, iteration, and size of data.

        Computes the validation, and training set according to the k number of folds, and the current iteration.
        Fold boundaries are computed with integer arithmetic so that consecutive folds never overlap, even when
        there are fewer rows than folds (some folds are then empty).

        Args:
            data (pandas.DataFrame): Data used for k folds cross validation.
            k (int): Number of folds.
            iteration (int): Current K fold validation iteration (0-based).

        Returns:
            A tuple that contains validation set, and training set (in that order):
                (
                    validation_set (pandas.DataFrame): Validation set.
                    training_set (pandas.DataFrame): Training set.
                )
        """
        # Local import: DataFrame.append no longer exists in pandas 2.0+, pandas.concat is its replacement
        import pandas as pd

        length_data = len(data)

        # First row of the current fold
        start = (length_data * iteration) // k

        # One past the last row of the current fold (this is also the start of the next fold)
        stop = (length_data * (iteration + 1)) // k

        # < Start : Stop > Validation Set
        validation_set = data[start:stop]

        # The Training set the left and the right parts of the validation set
        # < 0     : Start > Train Set 1
        # < Start : Stop >  Validation Set
        # < Stop  : n >     Train Set 2
        # Train Set 1 + Train Set 2 = All data excluding validation set
        training_set = pd.concat([data[0:start], data[stop:length_data]])

        return validation_set, training_set

    def create_weights(self, model, model_parameters, training_set, data_parameters):
        """Use model to create weights.

        Converts the training set to numpy and runs the model on it to obtain a set of coefficients.

        Args:
            model (callable): Model that can be run; called with the keyword arguments model_parameters,
                feature_matrix and output.
            model_parameters (dict): A dictionary of model parameters.
            training_set (pandas.DataFrame): Train set used for k folds cross validation.
            data_parameters (dict): A dictionary of data information:
                {
                    data (pandas.DataFrame): Data used for k folds cross validation,
                    output (str): Output name,
                    features (list of str): A list of feature names.
                }

        Returns:
            numpy.array: numpy array of weights created by running model.

        """
        # Convert our pandas frame to numpy to create training set
        training_feature_matrix, training_output = self.convert_numpy.convert_to_numpy(training_set,
                                                                                       data_parameters["features"],
                                                                                       data_parameters["output"], 1)

        # Create a model with Train Set 1 + Train Set 2
        return model(model_parameters=model_parameters, feature_matrix=training_feature_matrix, output=training_output)
示例#15
0
class KFoldCrossValidation:

    """Computes K Fold Cross Validation.

    Attributes:
        convert_numpy (ConvertNumpy): Pandas to Numpy conversion class.
        predict_output (PredictOutput): Output prediction.
        residual_sum_squares (ResidualSumSquares): Computes residual sum of squares.

    """

    def __init__(self):
        """Constructor for KFoldCrossValidation.

        Creates the ConvertNumpy, PredictOutput, and ResidualSumSquares helpers used during cross validation.

        """
        # Create an instance of the Convert Numpy class
        self.convert_numpy = ConvertNumpy()

        # Create an instance of the Predict Output Class
        self.predict_output = PredictOutput()

        # Create an instance of the Residual Sum Squares Class
        self.residual_sum_squares = ResidualSumSquares()

    def k_fold_cross_validation(self, k, data, model, model_parameters, output, features):
        """Performs K Fold Cross Validation.

        Takes in our data, and splits the data to smaller subsets, and these smaller subsets are used as validation
        sets, and everything else not included in the validation set is used as training sets. The model will be
        trained using the training set, and the RSS of its predictions on the validation set is accumulated.

        Args:
            k (int): Number of folds, must be at least 1.
            data (pandas.DataFrame): Data used for k folds cross validation.
            model (callable): Model used for k folds cross validation. It is called as
                model(**model_parameters, feature_matrix=..., output=...) and must return the weights.
            model_parameters (dict): Model parameters, expanded into keyword arguments of the model.
            output (str): Output name.
            features (list of str): A list of feature names.

        Returns:
            float: Average validation error (sum of the per-fold RSS divided by k).

        Raises:
            ValueError: If k is smaller than 1.

        """
        # Local import: DataFrame.append no longer exists in pandas 2.0+, pandas.concat is its replacement
        import pandas as pd

        # Fail early with a clear message instead of a ZeroDivisionError (k == 0) or a silent -0.0 (k < 0)
        if k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(k))

        # Get the length of the data
        length_data = len(data)

        # Sum of the validation error, will divide by k (fold) later
        validation_error_sum = 0

        # Loop through each fold
        for i in range(k):

            # First row of the current fold (integer arithmetic keeps consecutive folds from overlapping)
            start = (length_data * i) // k

            # One past the last row of the current fold (this is also the start of the next fold)
            stop = (length_data * (i + 1)) // k

            # < Start : Stop > Validation Set
            validation_set = data[start:stop]

            # The Training set the left and the right parts of the validation set
            # < 0     : Start > Train Set 1
            # < Start : Stop >  Validation Set
            # < Stop  : n >     Train Set 2
            # Train Set 1 + Train Set 2 = All data excluding validation set
            training_set = pd.concat([data[0:start], data[stop:length_data]])

            # Convert our pandas frames to numpy
            # NOTE(review): the trailing 1 is presumably the constant (intercept) column value -- confirm
            # against ConvertNumpy.convert_to_numpy
            validation_feature_matrix, validation_output = self.convert_numpy.convert_to_numpy(validation_set, features,
                                                                                               output, 1)
            training_feature_matrix, training_output = self.convert_numpy.convert_to_numpy(training_set, features,
                                                                                           output, 1)

            # Create a model with Train Set 1 + Train Set 2
            final_weights = model(**model_parameters, feature_matrix=training_feature_matrix, output=training_output)

            # Predict the output of the validation features
            predicted_output = self.predict_output.predict_output_linear_regression(validation_feature_matrix,
                                                                                    final_weights)

            # compute squared error (in other words, rss)
            validation_error_sum += self.residual_sum_squares.residual_sum_squares_linear_regression(validation_output,
                                                                                                     predicted_output)

        # Return the validation_error_sum divided by fold
        return validation_error_sum / k
示例#16
0
class TestLassoRegression(unittest.TestCase):

    """Tests for LassoRegression.

    Uses housing data to test LassoRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Constructor for TestLassoRegression.

        Creates the helper objects, loads the full, training and testing housing data, and converts the
        'floors' column of every frame to int.

        """
        self.convert_numpy = ConvertNumpy()
        self.normalize_features = NormalizeFeatures()
        self.lasso = LassoRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()
        self.k_fold_cross_validation = KFoldCrossValidation()

        # Create a dictionary type to store relevant data types so that our pandas
        # will read the correct information
        dtype_dict = {'bathrooms': float, 'waterfront': int, 'sqft_above': int, 'sqft_living15': float,
                      'grade': int, 'yr_renovated': int, 'price': float, 'bedrooms': float, 'zipcode': str,
                      'long': float, 'sqft_lot15': float, 'sqft_living': float, 'floors': str, 'condition': int,
                      'lat': float, 'date': str, 'sqft_basement': int, 'yr_built': int, 'id': str, 'sqft_lot': int,
                      'view': int}

        # Create a kc_house that encompasses all test and train data
        self.kc_house = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_data.csv', dtype=dtype_dict)

        # Create a kc_house_train that encompasses only train data
        self.kc_house_train = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
                                          dtype=dtype_dict)

        # Create a kc_house_test that encompasses only test data
        self.kc_house_test = pd.read_csv('./unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
                                         dtype=dtype_dict)

        # 'floors' is read as str above: convert it to float first, and then to int.
        # Every frame must convert its OWN column; assigning the full data set's column to the train/test
        # frames would replace their floors with index-aligned values from unrelated rows.
        for frame in (self.kc_house, self.kc_house_train, self.kc_house_test):
            frame['floors'] = frame['floors'].astype(float).astype(int)

    def test_01_normalize_features(self):
        """Tests normalizing features.

        Test normalization features, and compare it with known values.

        """
        # Normalize the features, and also return the norms
        features, norms = self.normalize_features.l2_norm(np.array([[3., 6., 9.], [4., 8., 12.]]))

        # Assert that the np array is equal to features
        self.assertTrue(np.array_equal(np.array([[0.6, 0.6, 0.6], [0.8, 0.8, 0.8]]), features))

        # Assert that the np array is equal to norms
        self.assertTrue(np.array_equal(np.array([5., 10., 15.]), norms))

    def test_02_compute_ro(self):
        """Test compute ro

        Test compute one round of ro.

        """
        # We will use sqft_living, and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output_name = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output_name, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        weights = np.array([1., 4., 1.])

        # Compute ro_j
        ro_j = self.lasso.compute_ro_j(normalized_feature_matrix, output, weights)

        # Assert the output of ro_j
        self.assertTrue(np.allclose(ro_j, np.array([79400300.03492916, 87939470.77299108, 80966698.67596565])))

    def test_03_compute_coordinate_descent_step(self):
        """Test one coordinate descent step.

        Test one coordinate descent step and compare it with known values.

        """
        # Small hand-made problem: two observations, two already scaled feature columns
        feature_matrix = np.array([[3. / math.sqrt(13), 1. / math.sqrt(10)],
                                   [2. / math.sqrt(13), 3. / math.sqrt(10)]])
        output = np.array([1., 1.])

        # Run a single coordinate descent step for coordinate i = 1
        new_weight = self.lasso.lasso_coordinate_descent_step({"i": 1,
                                                               "weights": np.array([1., 4.])},
                                                              feature_matrix,
                                                              output,
                                                              {"l1_penalty": 0.1})

        # Assert that both are equal
        self.assertEqual(round(new_weight, 8), round(0.425558846691, 8))

    def test_04_coordinate_descent(self):
        """Test coordinate descent.

        Test coordinate descent and compare with known values.

        """
        # We will use sqft_living, and bedrooms
        features = ['sqft_living', 'bedrooms']

        # Output will use price
        output_name = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house, features, output_name, 1)

        # Normalize the feature matrix (the norms are not needed here)
        normalized_feature_matrix, _ = self.normalize_features.l2_norm(feature_matrix)

        # Set initial weights
        initial_weights = np.zeros(3)

        # Set l1 penalty
        l1_penalty = 1e7

        # Set tolerance
        tolerance = 1.0

        # Compute the weights using coordinate descent
        weights = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, output,
                                                               {"initial_weights": initial_weights,
                                                                "l1_penalty": l1_penalty,
                                                                "tolerance": tolerance})

        # Assert that these two numpy arrays are the same.
        # The third positional argument of np.allclose is rtol, so passing True there would allow a 100%
        # relative error; the default tolerances are used instead.
        self.assertTrue(np.allclose(weights, np.array([21624998.3663629, 63157246.78545423, 0.])))

        # Predict the output
        predicted_output = self.predict_output.regression(normalized_feature_matrix, weights)

        # Assert that the RSS is what we wanted
        rss = self.residual_sum_squares.residual_sum_squares_regression(output, predicted_output)
        self.assertEqual(round(rss, -10), round(1.63049248148e+15, -10))

    def test_05_coordinate_descent_with_normalization(self):
        """Test coordinate descent with normalization.

        Runs coordinate descent on the normalized training data with several l1 penalties, divides the learned
        weights by the norms so that they can be used on the (unnormalized) test set, and checks the test RSS.

        """
        # We will use multiple features
        features = ['bedrooms',
                    'bathrooms',
                    'sqft_living',
                    'sqft_lot',
                    'floors',
                    'waterfront',
                    'view',
                    'condition',
                    'grade',
                    'sqft_above',
                    'sqft_basement',
                    'yr_built',
                    'yr_renovated']

        # Output will use price
        output_name = ['price']

        # Convert our training pandas frame to numpy
        feature_matrix, train_output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features,
                                                                           output_name, 1)

        # Normalize the training features, and keep the norms to rescale the weights later
        normalized_feature_matrix, norms = self.normalize_features.l2_norm(feature_matrix)

        # Compute Multiple Weights (one more weight than features)
        weights1e7 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e7,
                                                                   "tolerance": 1})
        weights1e8 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e8,
                                                                   "tolerance": 1})
        weights1e4 = self.lasso.lasso_cyclical_coordinate_descent(normalized_feature_matrix, train_output,
                                                                  {"initial_weights": np.zeros(len(features)+1),
                                                                   "l1_penalty": 1e4,
                                                                   "tolerance": 5e5})

        # Compute multiple normalized
        normalized_weights1e4 = weights1e4 / norms
        normalized_weights1e7 = weights1e7 / norms
        normalized_weights1e8 = weights1e8 / norms

        # Convert our test pandas frame to numpy (same features, and same output as the training data)
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, features,
                                                                               output_name, 1)

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e4)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.2778100476e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e7)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(2.75962079909e+14, -12))

        # Predict the output
        predicted_output = self.predict_output.regression(test_feature_matrix, normalized_weights1e8)

        # Assert that the RSS is what we wanted
        self.assertEqual(round(self.residual_sum_squares.residual_sum_squares_regression(test_output,
                                                                                         predicted_output), -12),
                         round(5.37049248148e+14, -12))
示例#17
0
class TestLinearRegression(unittest.TestCase):

    """Test for LinearRegression.

    Uses housing data to test LinearRegression.

    Statics:
        _multiprocess_can_split_ (bool): Flag for nose tests to run tests in parallel.

    """

    _multiprocess_can_split_ = True

    def setUp(self):
        """Prepare the helpers and the housing data for every test.

        Instantiates the conversion, regression, prediction and RSS helpers, and reads the full, training and
        testing housing CSV files into pandas frames.

        """
        self.convert_numpy = ConvertNumpy()
        self.linear_regression = LinearRegression()
        self.predict_output = PredictOutput()
        self.residual_sum_squares = ResidualSumSquares()

        # Column names grouped by the type pandas should use when reading them
        float_columns = ('bathrooms', 'sqft_living15', 'price', 'bedrooms', 'long', 'sqft_lot15', 'sqft_living',
                         'lat')
        int_columns = ('waterfront', 'sqft_above', 'grade', 'yr_renovated', 'condition', 'sqft_basement',
                       'yr_built', 'sqft_lot', 'view')
        str_columns = ('zipcode', 'floors', 'date', 'id')

        # Flatten the groups into the column -> type mapping expected by read_csv
        column_types = {}
        for column_names, column_type in ((float_columns, float), (int_columns, int), (str_columns, str)):
            for column_name in column_names:
                column_types[column_name] = column_type

        # All houses (train and test data together)
        self.kc_house = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_data.csv',
            dtype=column_types)

        # Houses used for training only
        self.kc_house_train = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_train_data.csv',
            dtype=column_types)

        # Houses used for testing only
        self.kc_house_test = pd.read_csv(
            './unit_tests/test_data/regression/kc_house/kc_house_test_data.csv',
            dtype=column_types)

    def test_01_gradient_descent(self):
        """Test gradient descent.

        Runs gradient descent with a single feature (sqft_living) on the training data and compares the final
        weights to known values, rounded to 3 decimals.

        """
        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will use price
        output_name = ['price']

        # Convert our pandas frame to numpy
        # NOTE(review): the trailing 1 is presumably the constant (intercept) column value -- confirm against
        # ConvertNumpy.convert_to_numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output_name, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                {"initial_weights": initial_weights,
                                                                 "step_size": step_size,
                                                                 "tolerance": tolerance})

        # Assert that the weights is correct (assertEqual: the assertEquals alias is gone in Python 3.12)
        self.assertEqual(round(-46999.887165546708, 3), round(final_weights[0], 3))
        self.assertEqual(round(281.91211917520917, 3), round(final_weights[1], 3))

    def test_02_gradient_descent_multiple(self):
        """Tests gradient descent on multiple features.

        Runs gradient descent with two features on the training data, predicts the prices of the test data with
        the resulting weights, and compares the RSS of that prediction to a known value.

        """
        # We will use sqft_living, and sqft_living15 (for both the training, and the test data)
        features = ['sqft_living', 'sqft_living15']

        # Output will be price
        output_name = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output_name, 1)

        # Create our initial weights
        initial_weights = np.array([-100000., 1., 1.])

        # Step size
        step_size = 4e-12

        # Tolerance
        tolerance = 1e9

        # Compute our gradient descent value
        final_weights = self.linear_regression.gradient_descent(feature_matrix, output,
                                                                {"initial_weights": initial_weights,
                                                                 "step_size": step_size,
                                                                 "tolerance": tolerance})

        # Convert our test pandas frame to numpy
        test_feature_matrix, test_output = self.convert_numpy.convert_to_numpy(self.kc_house_test, features,
                                                                               output_name, 1)

        # Predict the output of test features
        predicted_output = self.predict_output.regression(test_feature_matrix, final_weights)

        # Compute RSS
        rss = self.residual_sum_squares.residual_sum_squares_regression(test_output, predicted_output)

        # Assert that rss is correct (assertEqual: the assertEquals alias is gone in Python 3.12)
        self.assertEqual(round(270263443629803.41, -3), round(rss, -3))

    def test_03_gradient_ascent(self):
        """Test gradient ascent.

        Runs gradient ascent with a single feature (sqft_living) on the training data and compares the final
        weights to known values, rounded to 3 decimals.

        """
        # We will use sqft_living for our features
        features = ['sqft_living']

        # Output will be price
        output_name = ['price']

        # Convert our pandas frame to numpy
        feature_matrix, output = self.convert_numpy.convert_to_numpy(self.kc_house_train, features, output_name, 1)

        # Create our initial weights
        initial_weights = np.array([-47000., 1.])

        # Step size
        step_size = 7e-12

        # Tolerance
        tolerance = 2.5e7

        # Compute our hill climbing value
        final_weights = self.linear_regression.gradient_ascent(feature_matrix, output,
                                                               {"initial_weights": initial_weights,
                                                                "step_size": step_size,
                                                                "tolerance": tolerance})

        # Assert that the weights is correct (assertEqual: the assertEquals alias is gone in Python 3.12)
        self.assertEqual(round(-47000.142201335177, 3), round(final_weights[0], 3))
        self.assertEqual(round(-352.86068692252599, 3), round(final_weights[1], 3))