Example #1
def estimateRegression(X, y, start, stop, n_values, k=5, metric=mse):
    scoresList = GridSearchCV(
        RidgeRegression(), {
            "alpha": np.linspace(start, stop, n_values),
            "fit_intercept": [False, True]
        }, X, y, k, metric)
    nest_est = []
    print("\n", "*" * 100, "\n")
    print("The best value of alpha found in range {0}-{1} and score values:".
          format(start, stop))

    for key, value in scoresList[0].items():
        print("{0} : {1}".format(key, value))
    print("\n", "*" * 100)

    inter = {
        True: "Fit intercept",
        False: "Don't fit intercept",
    }
    #### First with fit_intercept=True then with fit_intercept=False
    for t in inter.keys():

        fig, axs = plt.subplots(2)
        axs[0].set_title("Predictor coefficients - {0}".format(inter[t]))
        axs[1].set_title("Cross-validated estimate - {0}".format(inter[t]))
        fig.tight_layout(pad=0.8)

        #### Plotting alpha vs coefs/riskEst for any transformation on data
        a = [r for r in scoresList if r["estimator"].getFitIntercept() == t]
        plotCoef(axs[0], a)
        plotGridSearch(axs[1], a)

        #### Perform a nested cross-validated estimate for this fit_intercept value,
        #### with a grid centered on the best alpha discovered by the GridSearchCV
        a.sort(key=lambda e: e["meanScore"])
        ncv_alpha = np.linspace(
            a[0]["estimator"].getAlpha() - ((stop - start) / n_values) / 2,
            a[0]["estimator"].getAlpha() + ((stop - start) / n_values) / 2,
            n_values)

        res = NestedCVEstimate(RidgeRegression(), {
            "alpha": ncv_alpha,
            "fit_intercept": [t]
        }, X, y, k, metric)
        print("\n", "*" * 100, "\n")
        print("Nested cross-validation estimate for a grid centered around alpha={0} fit_intercept={1}".\
                    format(a[0]["estimator"].getAlpha(),a[0]["estimator"].getFitIntercept()))
        print(res)
        print("\n", "*" * 100)
        nest_est.append((t, res))

    nest_est.sort(key=lambda el: el[1])
    print("The best nested CV estimate is obtained with fit_intercept={0}".
          format(nest_est[0][0]))
    print("The difference between the two is {0}".format(
        abs(nest_est[0][1] - nest_est[1][1])))

    return scoresList
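
A minimal invocation sketch on synthetic data (hypothetical; `metric` defaults to the `mse` used above, and the data here exists only to exercise the pipeline):

import numpy as np

# Hypothetical synthetic regression problem.
rng = np.random.default_rng(0)
X = rng.standard_normal((200, 8))
y = X @ rng.standard_normal(8) + 0.1 * rng.standard_normal(200)
scores = estimateRegression(X, y, start=0.01, stop=10.0, n_values=50, k=5)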
Example #2
def FeatureImportance(X, y):
    # Plot the importance of each feature.
    R = RidgeRegression()
    importance = np.zeros(len(X[0]))
    for feature in range(len(X[0])):
        importance[feature] = R.ImportanceOfFeature(X[:, feature], y[:, 0])

    plt.plot(range(len(X[0])), importance, 'db')
    #plt.plot(labelTitles, importance, 'db')
    plt.savefig('images/FeatureImportance.png')
    plt.cla()  # Clear axis
    plt.clf()  # Clear figure
def start_ridge_regression(training_records, output):
    """
    In this method, we compare the weights calculated by our gradient descent approach with sklearn's output.

    `Our method`
    >>> regressor = RidgeRegression(iterations=NUM_OF_ITERATIONS, learning_rate=LEARNING_RATE, ridge_learning_rate=RIDGE_LEARNING_RATE)
    >>> weights_table, mse_costs, predicted_outputs = regressor.calculate_weights(training_records, output)

    As shown above, our approach returns three tables.

    1. weights_table - This is where we store the history of the weights from iteration 0 to the last iteration.
       To access the set of weights in the last iteration simply use `weights_table[-1]`

    2. mse_costs - Table which stores the mean square error for each iteration.

    3. predicted_outputs - The output predicted by our model (i.e., using the learned weights).

    The following code fragment shows how to invoke sklearn's Ridge regression.
    `sklearn's method`
    >>> clf = linear_model.Ridge(fit_intercept=False)
    >>> clf.fit(training_records, output)

    Lastly, we just print the weights and it is left to the user to visually compare them.

    :parameter training_records - N X P matrix of training samples.
    :parameter output - N X 1 vector of output.

    :return:
    """
    regressor = RidgeRegression(iterations=NUM_OF_ITERATIONS,
                                learning_rate=LEARNING_RATE,
                                ridge_learning_rate=RIDGE_LEARNING_RATE)
    weights_table, mse_costs, predicted_outputs = regressor.calculate_weights(
        training_records, output)
    clf = linear_model.Ridge(fit_intercept=False)
    clf.fit(training_records, output)
    print "Starting gradient descent with {0} iterations, learning rate of {1} and a regularization " \
          "strength of {2}".format(NUM_OF_ITERATIONS, LEARNING_RATE, RIDGE_LEARNING_RATE)

    print "Running..."

    final_weights = [
        weights_table[-1][i] for i in range(0, NUM_OF_FEATURES + 1)
    ]
    print "After %s iterations of Gradient Descent (our implementation), the final weights are : %s" % (
        NUM_OF_ITERATIONS, final_weights)

    print "Using Sklearn's Ridge Regression, the weights are : %s" % clf.coef_
    return weights_table, mse_costs
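
Since the docstring leaves the comparison to visual inspection, a hedged programmatic check could replace it (a sketch only; the loose tolerance reflects that gradient descent merely approximates the closed-form solution):

import numpy as np

weights_table, mse_costs = start_ridge_regression(training_records, output)
sk_weights = linear_model.Ridge(fit_intercept=False).fit(training_records, output).coef_
# Loose tolerance: the descent iterate is only an approximation.
np.testing.assert_allclose(weights_table[-1], sk_weights, rtol=1e-1)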
Example #4
def fit_and_test(x_train, y_train, z_train, x_test, y_test, z_test):

    num_cols = 100
    num_rows = 100
    
    #fit data with Ordinary Least Squares
    print('Ordinary Least Squares')
    beta_OLS = ols(x_train, y_train, z_train)
    MSE_OLS = MeanSquaredError(x_test, y_test, z_test, beta_OLS)
    R2_OLS = R2(z_test, predict(x_test, y_test, beta_OLS))
    plot_terrain(num_rows, num_cols, beta_OLS)
    print('Mean squared error {} '.format(MSE_OLS))
    print('R2 score {} '.format(R2_OLS))
    
    #fit data with Ridge Regression, test model with testset and calculate MSE
    print('Ridge Regression')
    beta_Ridge = RidgeRegression(x_train, y_train, z_train)
    MSE_Ridge = MeanSquaredError(x_test, y_test, z_test, beta_Ridge)
    R2_Ridge = R2(z_test, predict(x_test, y_test, beta_Ridge))
    plot_terrain(num_rows, num_cols, beta_Ridge)
    print('Mean squared error {} '.format(MSE_Ridge))
    print('R2 score {} '.format(R2_Ridge))
    
    #fit data with Lasso Regression, test model with testset and calculate MSE
    print('Lasso Regression')
    beta_Lasso = Lasso(np.array(x_train), np.array(y_train), np.array(z_train), 5).reshape((21, 1))
    MSE_Lasso = MeanSquaredError(x_test, y_test, z_test, beta_Lasso)
    R2_Lasso = R2(z_test, predict(x_test, y_test, beta_Lasso))
    plot_terrain(num_rows, num_cols, beta_Lasso)
    print('Mean squared error {} '.format(MSE_Lasso))
    print('R2 score {} '.format(R2_Lasso))
Example #5
def run_ridge():
    accs = []
    for number in range(1, 11):
        train_X, train_y, test_X, test_y = load_data(number,
                                                     feature=True,
                                                     balance=True)

        ridge = RidgeRegression(alpha=1)

        ridge.fit(train_X, train_y)

        predict_y = ridge.predict(test_X)
        acc = binary_acc(predict_y, test_y)

        accs.append(acc)
        print(acc)

    print(sum(accs) / 10)
def main(path='./', show_plot=True):

    housing_X_test, housing_X_train, housing_y_test, housing_y_train = load_data(
        path)

    clf = RidgeRegression(max_pass=200000,
                          _lambda=10,
                          lr=0.00000001,
                          tol=1e-9,
                          closed_form=True,
                          normalize=True)

    w, b = clf.fit(housing_X_train, housing_y_train, housing_X_test,
                   housing_y_test)
    print(b, linalg.norm(b))
    if show_plot:

        train_loss = clf.train_loss
        test_error = clf.test_error
        train_error = clf.train_error

        plot(train_loss, test_error, train_error)
Example #7
def bootstrap(x, y, z, p_degree, method, n_bootstrap=100):
    # Randomly shuffle data
    data_set = np.c_[x, y, z]
    np.random.shuffle(data_set)
    set_size = round(len(x) / 5)

    # Extract test-set, never used in training. About 1/5 of total data
    x_test = data_set[0:set_size, 0]
    y_test = data_set[0:set_size, 1]
    z_test = data_set[0:set_size, 2]
    test_indices = np.arange(set_size)

    # And define the training set as the rest of the data
    x_train = np.delete(data_set[:, 0], test_indices)
    y_train = np.delete(data_set[:, 1], test_indices)
    z_train = np.delete(data_set[:, 2], test_indices)

    Z_predict = []
    MSE = []
    R2s = []
    for i in range(n_bootstrap):
        x_, y_, z_ = resample(x_train, y_train, z_train)

        if method == 'Ridge':
            # Ridge regression, save beta values
            beta = RidgeRegression(x_, y_, z_, degree=p_degree)
        elif method == 'Lasso':
            beta = Lasso(x_, y_, z_, degree=p_degree)
        elif method == 'OLS':
            beta = ols(x_, y_, z_, degree=p_degree)
        else:
            raise ValueError("Cannot recognize method '{0}'".format(method))

        M_ = np.c_[x_test, y_test]
        poly = PolynomialFeatures(p_degree)
        M = poly.fit_transform(M_)
        z_hat = M.dot(beta)

        Z_predict.append(z_hat)

        # Calculate MSE
        MSE.append(np.mean((z_test - z_hat)**2))
        R2s.append(R2(z_test, z_hat))

    # Calculate MSE, Bias and Variance
    MSE_M = np.mean(MSE)
    R2_M = np.mean(R2s)
    bias = np.mean((z_test - np.mean(Z_predict, axis=0, keepdims=True))**2)
    variance = np.mean(np.var(Z_predict, axis=0, keepdims=True))
    return MSE_M, R2_M, bias, variance
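
A hypothetical usage sketch comparing the bootstrap estimates across the three supported methods (assumes x, y, z arrays like those built in Example #17):

for method in ('OLS', 'Ridge', 'Lasso'):
    mse, r2, bias, variance = bootstrap(x, y, z, p_degree=5, method=method)
    # For a noise-free target, MSE should roughly equal bias + variance.
    print('{0}: MSE={1:.4f}, R2={2:.4f}, bias={3:.4f}, variance={4:.4f}'.format(
        method, mse, r2, bias, variance))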
Example #8
def train_rg(degree, K, lambdas):
    data = Data(FILENAME)
    model = RidgeRegression(data, degree=degree, method="CF", lambdas=lambdas, K=K)
    model.train()
    return {
        "tre": model.training_MSE(),
        "tse": model.test_MSE(),
        "la": model.la
    }
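
A hypothetical call, assuming FILENAME is set and using a log-spaced grid of candidate regularization strengths:

import numpy as np

# Hypothetical grid; adjust to the dataset at hand.
result = train_rg(degree=3, K=5, lambdas=np.logspace(-3, 1, 20))
print("train MSE:", result["tre"], "test MSE:", result["tse"], "best lambda:", result["la"])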
Example #9
    def test_pipeline(self):

        print("*" * 20, "PIPELINE TEST", "*" * 20)
        alpha = rnd.random() * 100
        randomPoint = np.array(
            [[rnd.random() * 100 for i in range(self.X.shape[1])]])

        pipesk = make_pipeline(StandardScaler(), PCA(4), Ridge(alpha=alpha))
        pipe = Pipe([StdScaler(), PCA(4)], RidgeRegression(alpha=alpha))

        pipesk = pipesk.fit(self.X, self.y)
        pipe = pipe.fit(self.X, self.y)

        assert_array_almost_equal(pipe.predict(randomPoint),
                                  pipesk.predict(randomPoint))
Example #10
def print_ridges(degrees, lambdas):
    data = Data(FILENAME)
    for degree in degrees:
        print("-" * 40)
        print("degree = %d" % degree)
        print("-" * 40)
        for K in [2, 5, 10, np.shape(data.X_trn)[0]]:
            print("K = ", K)
            data = Data(FILENAME)
            model = RidgeRegression(data, degree=degree, method="CF", lambdas=lambdas, K=K)
            model.train()
            print("lambda: ", model.la)
            print("w: \n", model.W.T)
            print("train error: ", model.training_MSE())
            print("test error: ", model.test_MSE())
            print("-" * 40)
Example #11
def calculate_weights(training_records, output):
    mse_costs = []
    weights = np.random.rand(training_records.shape[1])
    weights_table = [weights.copy()]
    predicted_outputs = []
    itr = 0
    prevErr = 0
    for i in range(NUM_OF_ITERATIONS):
        predicted_output = np.dot(training_records, weights)
        predicted_outputs.append(predicted_output)
        mse_cost, error = RidgeRegression.mse_cost_function(
            predicted_output, output)
        mse_costs.append(mse_cost)
        slope = training_records.T.dot(error) / (len(output))
        weights -= (LEARNING_RATE *
                    (slope + (RIDGE_LEARNING_RATE / len(output)) * weights))
        weights_table.append(weights.copy())
        if abs(prevErr - mse_cost) < 0.0001:
            itr = i
            return itr, mse_costs
        prevErr = mse_cost
    return itr, mse_costs
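
The update rule is gradient descent on the ridge cost J(w) = (1/2n)||Xw - y||^2 + (lambda/2n)||w||^2, whose exact minimizer is w* = (X^T X + lambda*I)^{-1} X^T y. A hedged sanity check against that closed form, on hypothetical synthetic data (lam stands in for RIDGE_LEARNING_RATE):

import numpy as np

# Hypothetical data for the check only.
rng = np.random.default_rng(0)
X = rng.standard_normal((200, 5))
y = X @ rng.standard_normal(5)
lam = 0.1
# A sufficiently long gradient descent run should approach this solution.
w_closed = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)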
Example #12
    def test_standardScaler(self):

        print("*" * 20, " STANDARD SCALER TEST ", "*" * 20)
        alpha = rnd.random() * 100
        randomPoint = np.array(
            [[rnd.random() * 100 for i in range(self.X.shape[1])]])

        ssc_sk = StandardScaler().fit(self.X)
        ssc = StdScaler().fit(self.X)
        assert_array_almost_equal(ssc.transform(self.X),
                                  ssc_sk.transform(self.X),
                                  decimal=4)

        #### sklearn results
        pipe = make_pipeline(StandardScaler(), Ridge(alpha=alpha))
        pipe = pipe.fit(self.X, self.y)
        skRes = pipe.predict(randomPoint)

        pipe = Pipe([StdScaler()],
                    RidgeRegression(alpha=alpha)).fit(self.X, self.y)
        res = pipe.predict(randomPoint)

        assert_almost_equal(res, skRes, decimal=4)
Example #13
    ax.plot(y)
    ax.set_ylabel("Target labels")
    """
        Shuffle dataset to find the reliability of the dataset collected
    """
    fig, ax = plt.subplots(1)
    ax.set_title("Shuffled data")
    estimates = shuffledCVEstimate(best, X, y)

    logShuffledCVEstimates(estimates, "Shuffle dataset")
    """
        Standardize data before computing estimates
    """
    estimates_std = shuffledCVEstimate(
        Pipe([StdScaler()],
             RidgeRegression(alpha=best.getAlpha(),
                             fit_intercept=best.getFitIntercept())), X, y)

    logShuffledCVEstimates(estimates_std,
                           "Shuffle dataset and standardize features")
    ax.plot(estimates)
    ax.plot(estimates_std)
    ax.legend(["Non standardized", "Standardized"])
    """
        Display correlation matrix to identify correlated features
    """

    fig, ax = plt.subplots(1)
    corr = data.drop('median_house_value', axis=1).corr().to_numpy()
    var = data.drop('median_house_value', axis=1).columns

    ax.matshow(corr)
Example #14
    print("2 Housing Gradient Descent")
    #data, learningRate, tolerance
    GradientDescent(loadData('./data/housing.csv').fillna(0), 0.0004, 0.005).validate()

    print("2 Yacht Gradient Descent")
    GradientDescent(loadData('./data/yachtData.csv').fillna(0), 0.001, 0.001).validate()

    print("2 Concrete Gradient Descent")
    GradientDescent(loadData('./data/concreteData.csv').fillna(0), 0.0007, 0.0001).validate()

    print("3 Housing Normal Equation")
    NormalEquation(loadData('./data/housing.csv').fillna(0)).validate()

    print("3 Yacht Normal Equation")
    NormalEquation(loadData('./data/yachtData.csv').fillna(0)).validate()

    print("5 Sinusoid Polynomial Regression")
    #trainData, testData, power
    PolynomialRegression(loadData('./data/sinData_Train.csv').fillna(0), 
        loadData('./data/sinData_Validation.csv').fillna(0), np.arange(1, 16)).validate()

    print("5 Yacht Polynomial Regression")
    PolynomialRegression(loadData('./data/yachtData.csv').fillna(0), None, np.arange(1, 8)).validate()

    print('7 Sinusoid Ridge Regression - 1')
    #data, power, lambda
    RidgeRegression(loadData('./data/sinData_Train.csv').fillna(0), np.arange(1, 6), np.arange(0.0, 10.2, 0.2)).validate()
    
    print('7 Sinusoid Ridge Regression - 2')
    RidgeRegression(loadData('./data/sinData_Train.csv').fillna(0), np.arange(1, 10), np.arange(0.0, 10.2, 0.2)).validate()
Example #15
def FeatureSelection(X_train, y_train, X_test, y_test, labelTitles):
    # Incrementally removing features
    index = len(X_train[0])

    BefDelXtrain = AftXtrain = X_train
    BefDelXtest = AftXtest = X_test

    R = RidgeRegression()
    R.fit(X_train, y_train, 0.01)
    RMSE = []
    indexs = []
    while index > 1:
        LeastIndex = R.ImportantFeatureLeast()
        index -= 1

        print(labelTitles[LeastIndex])
        labelTitles.remove(labelTitles[LeastIndex])
        #np.delete(labelTitles, LeastIndex, axis=1)

        AftXtrain = np.delete(BefDelXtrain, np.s_[LeastIndex], axis=1)
        AftXtest = np.delete(BefDelXtest, np.s_[LeastIndex], axis=1)

        R = RidgeRegression()
        R.fit(AftXtrain, y_train, 0.01)
        h = R.predict(AftXtest)
        RMSE.append(R.rmse(y_test, h))
        indexs.append(index)

        BefDelXtrain = AftXtrain
        BefDelXtest = AftXtest

    plt.plot(indexs, RMSE, 'b', label='RMSE')
    plt.legend()
    plt.savefig('images/FeatureSelection.png')
    plt.cla()  # Clear axis
    plt.clf()  # Clear figure
Example #16
        temp1 = temp[1: len(temp) - 1]
        
        temp2 = temp[len(temp) - 1]
        #X[i] = X.append(np.array(temp1))
        #X[i].append(temp1)
        #X.extend(temp1)
        tempF1 = np.array(temp1)
        X.extend(tempF1.astype(float))
        #Y = Y.append(temp2)
        #Y[i].append(temp2)
        #tempF2 = np.array(temp2)
        Y.append(float(temp2))
    file.close()

    X = np.reshape(X, (num_of_rows, 15))
    Y = np.reshape(Y, (num_of_rows, 1))
    return X, Y

if __name__ == '__main__':
    print("Main model")
    
    X, Y = get_data(relative_path = '/datasets/death_rates_data.txt')
    X = normalize_and_add_one(X)
    
    X_train, Y_train = X[:50], Y[:50]
    X_test, Y_test = X[50:], Y[50:]

    ridgeRegression = RidgeRegression()
    best_LAMBDA = 0
    #best_LAMBDA = ridgeRegression.get_the_best_LAMBDA(X_train, Y_train)
    #print('BEST LAMBDA: ', best_LAMBDA)
Example #17
X = np.load('data_for_part_1.npy')
x = X[:, 0]
y = X[:, 1]

# Calculate Franke's function without noise
z = FrankeFunction(x, y, noise=0)

########################################################################################################################
# Study dependence on lambdas
lambdas = [10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 1]
lambdas_log = [-7, -6, -5, -4, -3, -2, -1, 0]

print('\nINVESTIGATE LAMBDAS')
Bs = []
for la in lambdas:
    Bs.append(RidgeRegression(x, y, z, l=la))

# Generate test data
x_test = np.random.rand(200)
y_test = np.random.rand(200)
z_test = FrankeFunction(x_test, y_test, noise=0)

# Calculate MSE, R2scores
M_ = np.c_[x_test, y_test]
poly5 = PolynomialFeatures(5)
M = poly5.fit_transform(M_)

MSEs = []
R2s = []
for i in range(len(lambdas)):
    z_predict = M.dot(Bs[i])
    MSEs.append(np.mean((z_test - z_predict)**2))
    R2s.append(R2(z_test, z_predict))
Example #18
import matplotlib.pyplot as plt

from metrics import MSE
from DatasetCSV import SplitData
from Gradients import gradient_descent, stochastic_gradient_descent, mini_batch_gradient_descent
from LinealRegression import LinealRegression
from RidgeRegression import RidgeRegression

# Data extraction and preprocessing
data_example = SplitData('../Clase 4/income.csv')
x_train, y_train = data_example.get_train_data()
x_test, y_test = data_example.get_test_data()

# Prediction
# w_grad = mini_batch_gradient_descent(x_train, y_train, 0.01, 100)
# print(w_grad)
# y_predicted = x_test*w_grad
ridge = RidgeRegression()
ridge.fit(x_train, y_train)
y_predicted = ridge.predict(x_test)

error = MSE()
lineal_regression = LinealRegression()
lineal_regression.fit(x_train, y_train)
print(lineal_regression.model)
y_predicted_regression = x_test * lineal_regression.model
print(error(y_test, y_predicted))
print(error(y_test, y_predicted_regression))

plt.figure(1)
plt.subplot(311)
plt.grid(True)
plt.title("Validation Data")
Example #19
    print('Sinusoid Dataset - Polynomial Regression')
    sinPolynomialRegression = PolynomialRegression(
        sinTrainDataSet, sinValidationDataSet, None,
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], False)
    sinPolynomialRegression.validate()

    print('Yacht Dataset - Polynomial Regression')
    yacthPolynomialRegression = PolynomialRegression(yacthDataSet, None, 10,
                                                     [1, 2, 3, 4, 5, 6, 7])
    yacthPolynomialRegression.validate()

    print('Sinusoid Dataset - Ridge Regression')
    sinRidgeRegression = RidgeRegression(
        sinTrainDataSet, 10, [1, 2, 3, 4, 5], [
            0., 0.2, 0.4, 0.6, 0.8, 1., 1.2, 1.4, 1.6, 1.8, 2., 2.2, 2.4, 2.6,
            2.8, 3., 3.2, 3.4, 3.6, 3.8, 4., 4.2, 4.4, 4.6, 4.8, 5., 5.2, 5.4,
            5.6, 5.8, 6., 6.2, 6.4, 6.6, 6.8, 7., 7.2, 7.4, 7.6, 7.8, 8., 8.2,
            8.4, 8.6, 8.8, 9., 9.2, 9.4, 9.6, 9.8, 10.
        ])
    sinRidgeRegression.validate()

    newSinRidgeRegression = RidgeRegression(
        sinTrainDataSet, 10, [1, 2, 3, 4, 5, 6, 7, 8, 9], [
            0., 0.2, 0.4, 0.6, 0.8, 1., 1.2, 1.4, 1.6, 1.8, 2., 2.2, 2.4, 2.6,
            2.8, 3., 3.2, 3.4, 3.6, 3.8, 4., 4.2, 4.4, 4.6, 4.8, 5., 5.2, 5.4,
            5.6, 5.8, 6., 6.2, 6.4, 6.6, 6.8, 7., 7.2, 7.4, 7.6, 7.8, 8., 8.2,
            8.4, 8.6, 8.8, 9., 9.2, 9.4, 9.6, 9.8, 10.
        ])
    newSinRidgeRegression.validate()

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

data = load_boston()
X = data.data
Y = data.target
x_train, x_test, y_train_temp, y_test_temp = train_test_split(X,
                                                              Y,
                                                              test_size=0.40,
                                                              random_state=42)
y_train = y_train_temp[:, np.newaxis]
y_test = y_test_temp[:, np.newaxis]

trainData = HomogenNumericTable(x_train)
trainDependentVariables = HomogenNumericTable(y_train)
testData = HomogenNumericTable(x_test)
testGroundTruth = HomogenNumericTable(y_test)
# Instantiate a Ridge Regression object
ridge = RidgeRegression(ridgeParameters=0.0005)
# Training
trainingResult = ridge.training(trainData, trainDependentVariables)
# Prediction
pred_nT = ridge.predict(trainingResult, trainData)
# Serialize
ridge.serialize(trainingResult, fileName='RR.npy')
# Deserialize
de_trainingResult = ridge.deserialize(fileName="RR.npy")
# Print predicted responses and the actual responses
printNumericTable(pred_nT,
                  "Ridge Regression prediction results: (first 10 rows):", 10)
printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10)
Example #21
def getModels(X, y, alpha, fit_intercept=True):
    return (RidgeRegression(alpha, fit_intercept=fit_intercept).fit(X, y),
            Ridge(alpha, fit_intercept=fit_intercept).fit(X, y))
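
A hedged agreement check for the returned pair (a sketch only; assumes the custom RidgeRegression exposes predict like sklearn's estimators, as in the pipeline examples above):

import numpy as np

ours, sk = getModels(X, y, alpha=1.0)
# Assumes the custom estimator exposes predict with sklearn's conventions.
point = np.random.rand(1, X.shape[1])
np.testing.assert_array_almost_equal(ours.predict(point), sk.predict(point), decimal=4)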