Example #1
def make_ridge():
	#X=StandardScaler().fit_transform(all_training_data)
	X=all_training_data
	y=train_labels
	n_alphas = 200
	alphas = np.logspace(-6, 6, n_alphas)
	ridge = Ridge(fit_intercept=False)
	"""
	coefs = []
	for a in alphas:
		clf.set_params(alpha=a)
		clf.fit(X, y)
		coefs.append(clf.coef_)
		print(clf.coef_)
		make_prediction(clf, all_testing_data, test_labels)"""

	scores = list()
	scores_std = list()

	n_folds = 3

	for i, alpha in enumerate(alphas):
		print(i)
		ridge.alpha = alpha
		this_scores = cross_val_score(ridge, X, y, cv=n_folds, n_jobs=1)
		scores.append(np.mean(this_scores))
		scores_std.append(np.std(this_scores))
		clf = Ridge(fit_intercept=False)
		clf.alpha = alpha
		clf.fit(X,y)
		print(clf.coef_)
		make_prediction(clf, all_testing_data, test_labels)

	scores, scores_std = np.array(scores), np.array(scores_std)

	plt.figure().set_size_inches(8, 6)
	plt.semilogx(alphas, scores)

	# plot error lines showing +/- std. errors of the scores
	std_error = scores_std / np.sqrt(n_folds)

	plt.semilogx(alphas, scores + std_error, 'b--')
	plt.semilogx(alphas, scores - std_error, 'b--')

	# alpha=0.2 controls the translucency of the fill color
	plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)

	plt.ylabel('CV score +/- std error')
	plt.xlabel('alpha')
	plt.axhline(np.max(scores), linestyle='--', color='.5')
	print(scores.argmax())  # index of the best-scoring alpha
	plt.xlim([alphas[0], alphas[-1]])

	plt.show()
Example #2
def main(data_dir='./data/',
         N=10,
         cv_test_size=0.2,
         files_to_use='all',
         submit_name='submission.csv'):
    if files_to_use == 'all':
        files_to_use = [
            'dswrf_sfc', 'dlwrf_sfc', 'uswrf_sfc', 'ulwrf_sfc', 'ulwrf_tatm',
            'pwat_eatm', 'tcdc_eatm', 'apcp_sfc', 'pres_msl', 'spfh_2m',
            'tcolc_eatm', 'tmax_2m', 'tmin_2m', 'tmp_2m', 'tmp_sfc'
        ]
    train_sub_str = '_latlon_subset_19940101_20071231.nc'
    test_sub_str = '_latlon_subset_20080101_20121130.nc'

    print('Loading training data...')
    trainX = load_GEFS_data(data_dir, files_to_use, train_sub_str)
    times, trainY = load_csv_data(os.path.join(data_dir, 'train.csv'))
    print('Training data shape', trainX.shape, trainY.shape)

    # Gotta pick a scikit-learn model
    # (note: Ridge's normalize argument was removed in scikit-learn 1.2)
    model = Ridge(normalize=True)  # Normalizing is usually a good idea

    print('Finding best regularization value for alpha...')
    alphas = np.logspace(-3, 1, 8, base=10)  # List of alphas to check
    alphas = np.array((0.1, 0.2, 0.3, 0.4, 0.5, 0.6))  # overrides the logspace grid above
    maes = []
    for alpha in alphas:
        model.alpha = alpha
        mae = cv_loop(trainX, trainY, model, N)
        maes.append(mae)
        print('alpha %.4f mae %.4f' % (alpha, mae))
    best_alpha = alphas[np.argmin(maes)]
    print('Best alpha of %s with mean average error of %s' % (best_alpha,
                                                              np.min(maes)))

    print('Fitting model with best alpha...')
    model.alpha = best_alpha
    model.fit(trainX, trainY)

    print('Loading test data...')
    testX = load_GEFS_data(data_dir, files_to_use, test_sub_str)
    print('Test data shape', testX.shape)

    print('Predicting...')
    preds = model.predict(testX)

    print('Saving to csv...')
    save_submission(preds, submit_name, data_dir)
Example #3
def compare(X, y, ridge_alpha, lasso_alpha, k, plot):
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
    knn_errors = []
    ridge_errors = []
    lasso_errors = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_y = knn.predict(X_test)
        knn_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

        lasso = Lasso(normalize=True)
        lasso.alpha = lasso_alpha
        lasso.fit(X_train, y_train)
        pred_y = lasso.predict(X_test)
        lasso_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

        ridge = Ridge(normalize=True)
        ridge.alpha = ridge_alpha
        ridge.fit(X_train, y_train)
        pred_y = ridge.predict(X_test)
        ridge_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

    if plot:
        plt.plot([0, 1, 2], [np.mean(knn_errors), np.mean(ridge_errors), np.mean(lasso_errors)], 'ro')
        plt.title("Comparison")
        plt.xlabel('models (knn - 0, ridge - 1, lasso - 2)')
        plt.ylabel('MSE')
        # plt.xscale('log')
        plt.show()
    return np.mean(knn_errors), np.mean(ridge_errors), np.mean(lasso_errors)
Example #4
def process_optimized_ridge(data):
    c_alpha = 0.001
    step = 0.01
    max_alpha = 20
    min_mean_sqr_error = 10000000
    max_r2_score = 0
    global optimized_bridge_alpha
    while c_alpha <= max_alpha:
        model = Ridge()
        model.alpha = c_alpha
        model.fit(data["X_train"], data["y_train"])
        predicted_values = model.predict(data["X_test"])
        mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
        r2_score_calc = r2_score(data["y_test"], predicted_values)
        if max_r2_score < abs(r2_score_calc):
            min_mean_sqr_error = mean_sqr_error
            max_r2_score = r2_score_calc
            optimized_bridge_alpha = c_alpha
        c_alpha = c_alpha + step
    # Build the result once, after the sweep finishes
    dict_result = {
        "name": "RR",
        'data': {
            "alpha": optimized_bridge_alpha
        },
        'mean_sqr_err': min_mean_sqr_error,
        'r2_score': max_r2_score
    }
    return dict_result
Example #5
def regularization_ridge(X, y):
    # Setup the array of alphas and lists to store scores
    alpha_space = np.logspace(-4, 0, 50)
    ridge_scores = []
    ridge_scores_std = []

    # Create a ridge regressor: ridge
    ridge = Ridge(normalize=True)

    # Compute scores over range of alphas
    for alpha in alpha_space:

        # Specify the alpha value to use: ridge.alpha
        ridge.alpha = alpha

        # Perform 10-fold CV: ridge_cv_scores
        ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)

        # Append the mean of ridge_cv_scores to ridge_scores
        ridge_scores.append(np.mean(ridge_cv_scores))

        # Append the std of ridge_cv_scores to ridge_scores_std
        ridge_scores_std.append(np.std(ridge_cv_scores))

    # Display the plot
    display_plot(ridge_scores, ridge_scores_std)
Example #7
def process_optimized_ridge_step2(data):
    model = Ridge()
    model.alpha = optimized_bridge_alpha
    model.fit(data["X_train"], data["y_train"])
    predicted_values = model.predict(data["X_test"])
    mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
    r2_score_calc = r2_score(data["y_test"], predicted_values)
    dict_result = {
        "name": "RR",
        'data': {
            "alpha": optimized_bridge_alpha
        },
        'mean_sqr_err': mean_sqr_error,
        'r2_score': r2_score_calc
    }
    return dict_result
Example #8
def main():
    '''Linear regression minimizes a loss function,
       choosing a coefficient for each feature variable.
       Large coefficients can lead to overfitting, and
       regularization penalizes large coefficients.
       This function uses RIDGE REGRESSION.'''

    # Create a dataframe from the .csv file
    df = pd.read_csv('gapminderstats.csv')

    # Create an array for the target variable
    y = np.array(df['life'])

    # Drop the target variable column from the data frame
    df_X = df.drop('life', axis=1)

    # Get the column names
    # df_columns = df_X.dtypes.index

    # Create an array for the features
    X = np.array(df_X)

    # Setup the array of alphas and lists to store scores
    alpha_space = np.logspace(-4, 0, 50)
    ridge_scores = []
    ridge_scores_std = []

    # Create a ridge regressor: ridge
    ridge = Ridge(normalize=True)

    # Compute scores over range of alphas
    for alpha in alpha_space:

        # Specify the alpha value to use: ridge.alpha
        ridge.alpha = alpha

        # Perform 10-fold CV: ridge_cv_scores
        ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)

        # Append the mean of ridge_cv_scores to ridge_scores
        ridge_scores.append(np.mean(ridge_cv_scores))

        # Append the std of ridge_cv_scores to ridge_scores_std
        ridge_scores_std.append(np.std(ridge_cv_scores))

    # Display the plot
    display_plot(ridge_scores, ridge_scores_std, alpha_space)
Example #9
def calc_ridge(X, y, alphas, plot):
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
    mses = []
    for alpha in alphas:
        errors = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            ridge = Ridge(normalize=True)
            ridge.alpha = alpha
            ridge.fit(X_train, y_train)
            pred_y = ridge.predict(X_test)
            errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))
        mses.append(np.mean(errors))
    if plot:
        plt.plot(alphas, mses, 'ro')
        plt.title("MSE for different alpha levels for Ridge Regression")
        plt.xlabel('alpha')
        plt.ylabel('MSE')
        plt.xscale('log')
        plt.show()
    return mses
Example #10
_ = plt.margins(0.02)
plt.show()

# Ridge (-> first choice for regression models!)
# adds the sum of squared coefficients to the loss function
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge = Ridge(normalize=True)  # Model
# which alpha??
alpha_space = np.logspace(-4, 0, 50)  # array of alphas
ridge_scores = []  # lists to store scores
ridge_scores_std = []
for alpha in alpha_space:  # compute scores over range of alphas
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha  # equivalent to constructing Ridge(alpha=alpha)
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))
display_plot(ridge_scores, ridge_scores_std)

# ElasticNet() (-> see Tuning section)

#endregion (REGULARIZED REGRESSION)

#region CROSS VALIDATION
# problem 1: performance depends on how the data is split
# problem 2: overfitting to a single sample
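
# A minimal sketch of how k-fold CV addresses both problems: each observation
# is scored out-of-fold exactly once, and averaging over folds removes the
# dependence on any single split. (X and y are assumed to be the same feature
# matrix and target used above.)
cv_scores = cross_val_score(Ridge(alpha=0.1), X, y, cv=5)
print("per-fold R^2:", cv_scores)
print("mean +/- std: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))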
Example #11
# (the start of this snippet was truncated; a residual scatter for the test
# set, something like the following, is assumed)
plt.scatter(y_test_pred, y_test_pred - y_test, c='limegreen', marker='s',
            label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2)
plt.xlim([-10, 50])
plt.show()

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
MSE2_test_scores = []
MSE2_train_scores = []
R22_test_scores = []
R22_train_scores = []
for alpha in alpha_space:
    sr.alpha = alpha  # sr is the regressor defined earlier (truncated above)
    sr.fit(X_train, y_train)  # fit on the training split only, not on all of X
    y_train_pred2 = sr.predict(X_train)
    y_test_pred2 = sr.predict(X_test)
    MSE2_test_scores.append(mean_squared_error(y_test, y_test_pred2))
    MSE2_train_scores.append(mean_squared_error(y_train, y_train_pred2))
    R22_test_scores.append(r2_score(y_test, y_test_pred2))
    R22_train_scores.append(r2_score(y_train, y_train_pred2))
plt.plot(alpha_space, MSE2_test_scores)
plt.xlabel('alpha_space')
plt.ylabel('MSE2_test_scores')
plt.show()

plt.plot(alpha_space, MSE2_train_scores)
plt.xlabel('alpha_space')
plt.ylabel('MSE2_train_scores')
Example #12
# Lasso is great for feature selection, but when building regression models,
# Ridge regression should be your first choice.
# Recall that lasso performs regularization by adding to the loss function a
# penalty term: the absolute value of each coefficient multiplied by some alpha.
# This is known as L1 regularization because the penalty is the L1 norm of the
# coefficients. If instead you take the sum of the squared values of the
# coefficients multiplied by some alpha - as in Ridge regression - you are
# computing the L2 norm.
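
# A small numeric sketch of the two penalty terms described above
# (w is a hypothetical coefficient vector; alpha is arbitrary):
#   lasso loss = MSE + alpha * sum(|w_i|)     -> L1 norm penalty
#   ridge loss = MSE + alpha * sum(w_i ** 2)  -> squared L2 norm penalty
import numpy as np
w = np.array([0.5, -2.0, 0.0, 1.5])      # assumed coefficients
alpha = 0.1
l1_penalty = alpha * np.sum(np.abs(w))   # what Lasso adds to the loss
l2_penalty = alpha * np.sum(w ** 2)      # what Ridge adds to the loss
print(l1_penalty, l2_penalty)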
## Regularization RIDGE

# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []
# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)
# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores  
    ridge_cv_scores = cross_val_score(ridge, X,y,cv=10)       
    # Append the mean of ridge_cv_scores to ridge_scores  
    ridge_scores.append(np.mean(ridge_cv_scores))     
    # Append the std of ridge_cv_scores to ridge_scores_std    
    ridge_scores_std.append(np.std(ridge_cv_scores))
# Display the plot
display_plot(ridge_scores, ridge_scores_std)
Example #13
I'll start off by using Ridge to regularize the regression model.
"""

#import modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

ridge = Ridge(normalize=True)

for alpha in alpha_space:
    ridge.alpha = alpha
    ridge_cv_scores = cross_val_score(ridge, X_train, y_train, cv=10)
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))
"""Then build a decision tree for my model using XGBRegressor, which comes with a built in tree paramter."""

from sklearn.metrics import mean_squared_error
# Instantiating the XGBRegressor
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10)  # 'reg:linear' was renamed in newer XGBoost
# fitting the regressor to the training set
xg_reg.fit(X_train, y_train)
# making predictions
preds1 = xg_reg.predict(X_test)
"""Visualizing the trees and feature importance"""

xgb.plot_tree(xg_reg, num_trees=0, rankdir="LR")
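
# "Feature importance" is mentioned above but never plotted; a minimal sketch
# using XGBoost's built-in helper (assumes the fitted xg_reg from above and
# matplotlib.pyplot imported as plt):
xgb.plot_importance(xg_reg)
plt.show()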
Example #14
def oppgave_6(o=15, seed=4, test=True):

    # Load the terrain
    terrain = imread("{}SRTM_data_Norway_1.tif".format(image_path))
    # Show the terrain
    plt.figure()
    plt.title('Terrain Norway 1, Original')
    plt.imshow(terrain, cmap='gray')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

    #Pick out a small square to analyze if test is set to True
    if test:
        #Pick out a small square to analyze
        square_size = 100
        x_shift = np.random.randint(0, 1801 - square_size)
        y_shift = np.random.randint(0, 3601 - square_size)
        terrain = terrain[y_shift:y_shift + square_size,
                          x_shift:x_shift + square_size]
        plt.figure()
        plt.title('Terrain part 1, Original {} pt box'.format(square_size))
        plt.imshow(terrain, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()
    else:
        #Use settings determined by analysing small squares for analysis
        #on the entire dataset. Attempting to rebuild the image from
        #a model based on evenly spaced datapoints

        #Set model parameters
        order = 15
        #Ridge parameter
        lmd = 0.0001
        #Lasso parameter
        alph = 0.0001
        #Set the coarseness of the sample grid
        coarseness = 5
        x_dimension_original = len(terrain[0, :])
        y_dimension_original = len(terrain[:, 0])
        x_dimension = x_dimension_original // coarseness
        y_dimension = y_dimension_original // coarseness
        terrain_points = np.zeros((y_dimension, x_dimension))
        for x_axis in range(x_dimension):
            for y_axis in range(y_dimension):
                terrain_points[y_axis, x_axis] = terrain[y_axis * coarseness,
                                                         x_axis * coarseness]
        #Create mesh grid for training data, selected points
        x = np.linspace(0, 1, x_dimension)
        y = np.linspace(0, 1, y_dimension)
        x_grid, y_grid = np.meshgrid(x, y)
        #Create meshgrid for original data
        x_original = np.linspace(0, 1, x_dimension_original)
        y_original = np.linspace(0, 1, y_dimension_original)
        x_grid_original, y_grid_original = np.meshgrid(x_original, y_original)
        #Flatten grids
        data = np.ravel(terrain_points)
        data_original = np.ravel(terrain)
        x = np.ravel(x_grid)
        y = np.ravel(y_grid)
        x_original = np.ravel(x_grid_original)
        y_original = np.ravel(y_grid_original)
        #Creates a scaler to normalize data
        scaler = MinMaxScaler()
        print("Running time: {} seconds".format(time() - t0))
        #Normalizing data
        scaler.fit(data.reshape(-1, 1))
        #Normalizing training data
        normalized_data = scaler.transform(data.reshape(-1, 1))
        normalized_data = normalized_data[:, 0]
        #Normalizing original data --------not used?
        normalized_data_original = scaler.transform(
            data_original.reshape(-1, 1))
        normalized_data_original = normalized_data_original[:, 0]

        #Initiate instances of the regressors
        linear_regression = LinearRegression()
        ridge_regression = Ridge(solver="svd", alpha=lmd)
        lasso_regression = Lasso(alpha=alph)
        print("Running time: {} seconds".format(time() - t0))
        #Create training matrix
        A = design_matrix(order, x, y)
        #Remove intercept
        A = A[:, 1:]
        print("Running time: {} seconds".format(time() - t0))
        #Create prediction matrix
        X_test = design_matrix(order, x_original, y_original)
        X_test = X_test[:, 1:]
        print("Running time: {} seconds".format(time() - t0))
        #Make prediction using OLS model
        linear_regression.fit(A, normalized_data)
        rebuilt = linear_regression.predict(X_test)
        print("OLS MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        fig_rebuild = plt.figure(figsize=(9, 5))
        ax1 = fig_rebuild.add_subplot(131)
        ax2 = fig_rebuild.add_subplot(132)
        ax3 = fig_rebuild.add_subplot(133)
        plt.title('Terrain Norway 1, rebuilt')
        ax1.imshow(rebuilt, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')
        #Make prediction using Ridge model
        ridge_regression.fit(A, normalized_data)
        rebuilt = ridge_regression.predict(X_test)
        print("Ridge MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        print("Running time: {} seconds".format(time() - t0))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        ax2.imshow(rebuilt, cmap='gray')

        #Make prediction using LASSO model
        lasso_regression.fit(A, normalized_data)
        rebuilt = lasso_regression.predict(X_test)
        print("LASSO MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        print("Running time: {} seconds".format(time() - t0))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        ax3.imshow(rebuilt, cmap='gray')

        fig_rebuild.savefig("{}TerrainRebuilOrder{}P4.png".format(
            plots_path, order))

        return

    #Get dimensions of data set and make a grid to base the model on
    y_dimension = len(terrain[:, 0])
    x_dimension = len(terrain[0, :])

    x = np.linspace(0, 1, x_dimension)
    y = np.linspace(0, 1, y_dimension)
    x_grid, y_grid = np.meshgrid(x, y)
    #Flatten grid
    data = np.ravel(terrain)
    x = np.ravel(x_grid)
    y = np.ravel(y_grid)
    #set random seed
    np.random.seed(seed)

    #Creates a scaler to normalize data
    scaler = MinMaxScaler()

    #Normalizing data
    scaler.fit(data.reshape(-1, 1))
    normalized_data = scaler.transform(data.reshape(-1, 1))
    normalized_data = normalized_data[:, 0]

    #Create an instance of sklearn's KFold class to split the data for k-fold CV
    splits = 5
    kfold = KFold(n_splits=splits, shuffle=True)
    #Sets a range of polynomial orders to fit to the data
    polynomial_order = np.arange(o) + 1

    #---------OLS------------------------------
    #------------------------------------------
    #Solve using OLS

    linear_regression = LinearRegression()

    dta = list()
    for order in polynomial_order:
        print("Using polynomial order {}".format(order))
        #Creating designmatrix
        A = design_matrix(order, x, y)
        mse_test = np.zeros(splits)
        mse_train = np.zeros(splits)
        counter = 0
        #Initiating kfold cv
        for train_index, test_index in kfold.split(normalized_data):
            print("Calculating fold {} of {}".format(counter + 1, splits))
            X_train, X_test = A[train_index], A[test_index]
            y_train, y_test = normalized_data[train_index], normalized_data[
                test_index]
            #Using current  polynomial order and fold to solve using OLS
            linear_regression.fit(X_train, y_train)
            ytilde = linear_regression.predict(X_train)
            ypredict = linear_regression.predict(X_test)
            #Get MSE metric for training and testing data
            mse_test[counter] = MSE(y_test, ypredict)
            mse_train[counter] = MSE(y_train, ytilde)
            counter = counter + 1
            print(counter)
            print("Running time: {} seconds".format(time() - t0))

        dta.append(["{}".format(order), mse_test.mean(), mse_train.mean()])
        '''     
        rebuilt =  linear_regression.predict(A)
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1,1))
        rebuilt = np.reshape(rebuilt,y_grid.shape)
        plt.figure()
        plt.title('Terrain Norway 1, rebuild')
        plt.imshow(rebuilt, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()
        '''
    df = pd.DataFrame(
        dta, columns=["Polynomial", "MSE test set", "MSE training set"])

    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    ax1.plot(df["Polynomial"], df["MSE training set"], label="Training OLS")
    ax2.plot(df["Polynomial"], df["MSE test set"], label="Test OLS")
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainOLStrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainOLStestSeed{}.png".format(plots_path, seed))

    #---------RIDGE----------------------------
    #------------------------------------------
    #Creates a dictionary to store dataframes for each Ridge parameter
    dataframe_dic = dict()
    ridge_regression = Ridge(solver="svd")
    #Set a range of shrinkage factors for the Ridge regression
    lambdas = np.logspace(-5, -1, 10)
    for lmd in lambdas:
        print("Calculating Ridge, lambda: {}".format(lmd))
        #Creates a list to store the results of each iteration in
        dta = list()
        for order in polynomial_order:
            print("Using polynomial order {}".format(order))
            #Creating designmatrix
            A = design_matrix(order, x, y)
            #Removing intercept
            A = A[:, 1:]
            lambda_mse_test = np.zeros(splits)
            lambda_mse_train = np.zeros(splits)
            counter = 0
            #Initiating kfold cv
            for train_index, test_index in kfold.split(normalized_data):
                X_train, X_test = A[train_index], A[test_index]
                y_train, y_test = normalized_data[
                    train_index], normalized_data[test_index]
                #Using current lambda and polynomial order solve using Ridge
                ridge_regression.alpha = lmd
                ridge_regression.fit(X_train, y_train)
                #Estimate testing and training data
                ypredict = ridge_regression.predict(X_test)
                ytilde = ridge_regression.predict(X_train)
                #Get MSE metric for training and testing data
                lambda_mse_test[counter] = MSE(y_test, ypredict)
                lambda_mse_train[counter] = MSE(y_train, ytilde)
                print("Calculating fold {} of {}".format(counter + 1, splits))
                counter = counter + 1
                print("Running time: {} seconds".format(time() - t0))
            dta.append([
                "{}".format(order),
                lambda_mse_test.mean(),
                lambda_mse_train.mean()
            ])
            '''
            rebuilt =  ridge_regression.predict(A)
            rebuilt = scaler.inverse_transform(rebuilt.reshape(-1,1))
            rebuilt = np.reshape(rebuilt,y_grid.shape)
            plt.figure()
            plt.title('Terrain Norway 1, rebuild')
            plt.imshow(rebuilt, cmap='gray')
            plt.xlabel('X')
            plt.ylabel('Y')
            plt.show()
            '''
        df = pd.DataFrame(
            dta, columns=["Polynomial", "MSE test set", "MSE training set"])
        dataframe_dic[lmd] = df

    cmap = plt.get_cmap('jet_r')
    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    n = 0
    for df in dataframe_dic:
        ax1.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE training set"],
                 color=cmap(float(n) / len(lambdas)),
                 label="Alpha=%10.2E" % (df))
        ax2.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE test set"],
                 color=cmap(float(n) / len(lambdas)),
                 label="Alpha=%10.2E" % (df))
        n = n + 1
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainRidgetrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainRidgetestSeed{}.png".format(plots_path, seed))

    #---------LASSO----------------------------
    #------------------------------------------

    #Create an instance of the Lasso class from sklearn
    lasso_regression = Lasso()
    #Set a range of shrinkage factors for the LASSO regression
    alphas = np.logspace(-5, -2, 10)
    dataframe_dic = dict()
    for alph in alphas:
        print("Calculating LASSO, alpha: {}".format(alph))
        #Creates a list to store the results of each iteration in
        dta = list()
        for order in polynomial_order:
            print("Using polynomial order {}".format(order))
            #Creating designmatrix
            A = design_matrix(order, x, y)
            #Removing intercept
            A = A[:, 1:]
            alpha_mse_test = np.zeros(splits)
            alpha_mse_train = np.zeros(splits)
            counter = 0
            #Initiating kfold cv
            for train_index, test_index in kfold.split(normalized_data):
                X_train, X_test = A[train_index], A[test_index]
                y_train, y_test = normalized_data[
                    train_index], normalized_data[test_index]
                #Using current alpha and polynomial order solve using Lasso
                lasso_regression.alpha = alph
                lasso_regression.fit(X_train, y_train)
                #Estimate testing and training data
                ypredict = lasso_regression.predict(X_test)
                ytilde = lasso_regression.predict(X_train)
                #Get MSE metric for training and testing data
                alpha_mse_test[counter] = MSE(y_test, ypredict)
                alpha_mse_train[counter] = MSE(y_train, ytilde)
                print("Calculating fold {} of {}".format(counter + 1, splits))
                counter = counter + 1
                print("Running time: {} seconds".format(time() - t0))
            dta.append([
                "{}".format(order),
                alpha_mse_test.mean(),
                alpha_mse_train.mean()
            ])
        df = pd.DataFrame(
            dta, columns=["Polynomial", "MSE test set", "MSE training set"])
        dataframe_dic[alph] = df

    cmap = plt.get_cmap('jet_r')
    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    n = 0
    for df in dataframe_dic:
        ax1.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE training set"],
                 color=cmap(float(n) / len(alphas)),
                 label="Alpha=%10.2E" % (df))
        ax2.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE test set"],
                 color=cmap(float(n) / len(alphas)),
                 label="Alpha=%10.2E" % (df))
        n = n + 1
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainLASSOtrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainLASSOtestSeed{}.png".format(plots_path, seed))
Example #15
def main(lat, lon, station_index): 

    files = ['dswrf_sfc','dlwrf_sfc','uswrf_sfc','ulwrf_sfc','ulwrf_tatm','pwat_eatm','tcdc_eatm','apcp_sfc','pres_msl','spfh_2m','tcolc_eatm','tmax_2m','tmin_2m','tmp_2m','tmp_sfc']
    train_sub_strings = '_latlon_subset_19940101_20071231.nc'
    #test_sub_str = '_latlon_subset_20080101_20121130.nc'  

    #Load csv Solar Energy
    print('Importing solar energy training data')
    energy = np.genfromtxt('train.csv', delimiter=',', dtype="float")
    energy = np.squeeze(energy[:,station_index])
    energy = np.delete(energy, 0, 0)

    #Split in train and test data
    print('Splitting solar energy data into train and test sets')
    energy_split = np.split(energy,[4018,5113])
    train_energy = energy_split[0]
    test_energy = energy_split[1]

    #Loading netCDF4 data for a specific point(lat,lon)
    train_matrix = loadNetCDF4(files, train_sub_strings, 5113, lat, lon)

    #Deleting zero column
    train_matrix = np.delete(train_matrix, 0, 1)

    #Split in train and test data
    print('Splitting weather data into train and test sets')
    train_split = np.split(train_matrix,[4018,5113])
    train_matrix = train_split[0]
    test_matrix = train_split[1]

    #Build csv train
    np.savetxt(str(lat) + '_' + str(lon) + '_' + str(station_index) + '_train.csv', train_matrix, delimiter = ",", fmt = "%.06f" )

    print('Setting up Regressor')
    ridge = Ridge()

    #Prepare a range of alpha values to test
    alphas = np.array([1,0.1,0.01,0.001,0.0001,0])

    #Printing alphas, taken from scikit-learn
    #print_alpha(ridge, train_matrix, train_energy, alphas)

    # create and fit a ridge regression model, testing each alpha, taken from scikit
    grid = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas))
    grid.fit(train_matrix, train_energy)
    print('Best estimated alpha:')
    print(grid.best_estimator_.alpha)
    ridge.alpha = grid.best_estimator_.alpha

    print('Training the Regressor')
    ridge.fit(train_matrix,train_energy)

    print('Predicting Energy')
    prediction_matrix = ridge.predict(test_matrix)

    #Save csv prediction
    np.savetxt( str(lat) + '_' + str(lon) + '_' +str(station_index) + '_prediction.csv', prediction_matrix, delimiter = ",", fmt = "%d" )

    #Plotting

    #Setting up date x-axis
    time = pd.date_range('2005-01-01', periods=1095)#1095

    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    xfmt = mdates.DateFormatter('%d-%m-%y')
    ax.xaxis.set_major_formatter(xfmt)

    #Plot prediction and actual values
    ax = plt.gca()
    ax.plot(time, prediction_matrix, linewidth=0.5)
    ax.plot(time, test_energy, linewidth=0.5)

    #Labels and Legend
    plt.xlabel('Time')
    plt.ylabel('Joules per square meter')
    plt.title('Solar Energy of Tahlequah(Oklahoma)' + ' (lat: ' + str(lat) + ' lon: ' + str(lon-360) + ')' )

    plt.axis('tight')

    prediction_patch = mpatches.Patch(color='blue', label='Prediction')
    measured_patch = mpatches.Patch(color='orange', label='Measured')
    plt.legend(handles=[prediction_patch, measured_patch])

    plt.show()

    #Plot difference graph
    difference = np.subtract(prediction_matrix, test_energy)

    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    xfmt = mdates.DateFormatter('%d-%m-%y')
    ax.xaxis.set_major_formatter(xfmt)

    ax = plt.gca()
    ax.plot(time, difference, linewidth=0.5)
    #ax.plot(time, np.full((1095,),4000000), color='orange')
    #ax.plot(time, np.full((1095,),-4000000), color='orange')
    plt.xlabel('Time')
    plt.ylabel('Joules per square meter')
    plt.title('Solar Energy of Tahlequah(Oklahoma)' + ' (lat: ' + str(lat) + ' lon: ' + str(lon-360) + ')' )
    plt.axis('tight')
   
    difference_patch = mpatches.Patch(color='blue', label='Difference')
    plt.legend(handles=[difference_patch])

    plt.show()
Example #16
# fitting ridge regression models over a range of different alphas, and plot cross-validated R**2 scores for each

# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha

    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)

    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))

    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)
Example #17
# reg1 (an ordinary LinearRegression) is assumed; its definition was
# truncated from this snippet
reg1 = LinearRegression()
reg2 = Ridge(alpha=1)
reg3 = Lasso(alpha=1)

reg1.fit(trainX, trainy)
reg1.coef_
reg2.fit(trainX, trainy)
reg2.coef_
reg3.fit(trainX, trainy)
reg3.coef_

alphas = np.logspace(-3, 3, 30)  # generate 30 values

linear_r2 = reg1.score(validX, validy)
result = pd.DataFrame(index=alphas, columns=['Ridge', 'Lasso'])
for alpha in alphas:
    reg2.alpha = alpha
    reg3.alpha = alpha
    reg2.fit(trainX, trainy)
    result.loc[alpha, 'Ridge'] = reg2.score(validX, validy)
    reg3.fit(trainX, trainy)
    result.loc[alpha, 'Lasso'] = reg3.score(validX, validy)

plt.plot(np.log(alphas), result['Ridge'], label="Ridge")
plt.plot(np.log(alphas), result['Lasso'], label="Lasso")
plt.hlines(linear_r2,
           np.log(alphas[0]),
           np.log(alphas[-1]),
           ls=':',
           color="k",
           label='Ordinary')
plt.legend()
Example #18
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import numpy as np


filename = 'task1a_lm1d1z/train.csv'
data = pd.read_csv(filename)

y = data['y']
X = data.drop(['Id','y'],axis=1)

lam = [0.1, 1, 10, 100, 1000]
ridge = Ridge(normalize=False)
rms = []
for parameter in lam:
    ridge.alpha = parameter
    predicted = cross_val_predict(ridge, X, y, cv=10)
    rms.append(sqrt(mean_squared_error(y, predicted)))  # RMSE, using the imported sqrt

print(rms)
Example #19
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, y_train_pred),
                                       mean_squared_error(y_test, y_test_pred)))

print('R^2 train: %.3f, test: %.3f' %(r2_score(y_train, y_train_pred),
                                      r2_score(y_test, y_test_pred)))


###########################Ridge
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []
ridge = Ridge(normalize=True)

for alpha in alpha_space:
    ridge.alpha = alpha
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))

def display_plot(cv_scores, cv_scores_std):
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)
    ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
Example #20
# reg1 and reg2 (a LinearRegression and a Ridge(alpha=1), as in Example #17)
# are assumed; their definitions were truncated from this snippet
reg1 = LinearRegression()
reg2 = Ridge(alpha=1)
reg3 = Lasso(alpha=1)

reg1.fit(train_X, train_y)
reg1.score(test_X, test_y)

reg2.fit(train_X, train_y)
reg2.score(test_X, test_y)

reg3.fit(train_X, train_y)
reg3.score(test_X, test_y)

# find the alpha parameter using logspace
alphas = np.logspace(-3, 3, 30)
result = pd.DataFrame(index=alphas, columns=['Ridge', 'Lasso'])
for alpha in alphas:
    reg2.alpha = alpha
    reg3.alpha = alpha
    reg2.fit(train_X, train_y)
    result.loc[alpha, 'Ridge'] = reg2.score(test_X, test_y)
    reg3.fit(train_X, train_y)
    result.loc[alpha, 'Lasso'] = reg3.score(test_X, test_y)

param_Ridge = 0.78804
param_Lasso = 0.001

##test 5-fold cross validation
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=1)
for train, test in kf.split(train_data):
    print(train, test)
Example #21
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
ridge3_scores = []
ridge6_scores = []
ridge9_scores = []

alpha_space = np.logspace(-4,0,50)

ridge3 = Ridge()
ridge6 = Ridge()
ridge9 = Ridge()

#finding the best alpha
for alpha in alpha_space:
    ridge3.alpha = alpha
    ridge3_cv_scores = cross_val_score(ridge3, X, y3, cv=10)
    ridge3_scores.append(np.mean(ridge3_cv_scores))
    
    ridge6.alpha = alpha
    ridge6_cv_scores = cross_val_score(ridge6, X, y6, cv=10)
    ridge6_scores.append(np.mean(ridge6_cv_scores))
    
    ridge9.alpha = alpha
    ridge9_cv_scores = cross_val_score(ridge9, X, y9, cv=10)
    ridge9_scores.append(np.mean(ridge9_cv_scores))

print("The best alpha value is: ", alpha_space[np.argmax(ridge3_scores)])
print("The best alpha value is: ", alpha_space[np.argmax(ridge6_scores)])
print("The best alpha value is: ", alpha_space[np.argmax(ridge9_scores)])
ridge3 = Ridge(alpha = alpha_space[np.argmax(ridge3_scores)])
Example #22
def main(data_dir='./data/',
         N=10,
         cv_test_size=0.3,
         files_to_use='all',
         submit_name='submission.csv'):
    if files_to_use == 'all':
        files_to_use = [
            'dswrf_sfc', 'dlwrf_sfc', 'uswrf_sfc', 'ulwrf_sfc', 'ulwrf_tatm',
            'pwat_eatm', 'tcdc_eatm', 'apcp_sfc', 'pres_msl', 'spfh_2m',
            'tcolc_eatm', 'tmax_2m', 'tmin_2m', 'tmp_2m', 'tmp_sfc'
        ]
    train_sub_str = '_latlon_subset_19940101_20071231.nc'
    test_sub_str = '_latlon_subset_20080101_20121130.nc'

    print('Loading training data...')
    trainX = load_GEFS_data(data_dir, files_to_use, train_sub_str)  # training samples
    times, trainY = load_csv_data(os.path.join(data_dir,
                                               'train.csv'))  # training targets
    print('Training data shape', trainX.shape, trainY.shape)

    # Gotta pick a scikit-learn model
    model = Ridge(normalize=True)  # Normalizing is usually a good idea

    print('Finding best regularization value for alpha...')
    alphas = np.logspace(-3, 1, 8, base=10)  # List of alphas to check
    alphas = np.array((0.1, 0.2, 0.3, 0.4, 0.5, 0.6))  # overrides the logspace grid above
    maes = []
    for alpha in alphas:
        model.alpha = alpha
        mae = cv_loop(trainX, trainY, model, N)
        maes.append(mae)
        print('alpha %.4f mae %.4f' % (alpha, mae))
    best_alpha = alphas[np.argmin(maes)]
    print('Best alpha of %s with mean average error of %s' %
          (best_alpha, np.min(maes)))

    print('Fitting model with best alpha...')
    model.alpha = best_alpha
    model.fit(trainX, trainY)

    print('Loading test data...')
    testX = load_GEFS_data(data_dir, files_to_use, test_sub_str)
    print('Raw test data shape', testX.shape)

    #
    # predictions_rf = run_random_forest(trainX, trainY, testX)
    #
    # predictions_svr = run_svr(trainX, trainY, testX)
    #
    # predictions_ridge = run_ridge(trainX, trainY, testX)
    #
    # predictions_gbr = run_gbr(trainX, trainY, testX)
    #
    # parameters = {
    #     "loss": 'ls',
    #     "n_estimators": 3000,
    #     "learning_rate": 0.035,
    #     "max_features": 80,
    #     "max_depth": 7,
    #     "subsample": 0.5
    # }
    #
    # model = GradientBoostingRegressor(**parameters)
    #
    # print("CV loop ", cv_loop(trainX, trainY[:, ], model, 10))

    print('Predicting...')
    preds = model.predict(testX)

    print('Saving to csv...')
    save_submission(preds, submit_name, data_dir)
#################################

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=508)


# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score



# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

ridge.alpha = 0.75

ridge.fit(X_train, y_train)

# Calling the score method, which compares the predicted values to the actual values

y_score = ridge.score(X_test, y_test)

# The score is directly comparable to R-Square
print(y_score)
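
# A quick check of the claim above: Ridge.score returns the coefficient of
# determination R^2, so it matches sklearn.metrics.r2_score on the same data.
from sklearn.metrics import r2_score
assert np.isclose(y_score, r2_score(y_test, ridge.predict(X_test)))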

# Predict on the test data: y_pred
y_pred = ridge.predict(X_test)
# Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: {}".format(rmse))
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize = True)

# Compute scores over range of alphas
for alpha in alpha_space:

    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge,X,y, cv = 10)
    
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)