Example #1
def test_boston_housing_regression():
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    # 'rng' is a module-level numpy RandomState defined elsewhere in the original test file
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True,
                                   ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False,
                                   ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 350
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 350
Example #2
File: 12-9-als.py  Project: wiznut/Examples
def compute_ALS(R, n_iter, lambda_, k):
    '''Create a random user-factor matrix X and a random movie-factor matrix Y,
    then approximate the utility matrix R by alternating least squares.
    R (ndarray) : utility matrix
    lambda_ (float) : regularization parameter
    n_iter (int) : number of updates of X and Y
    k (int) : number of latent factors
    '''
    m, n = R.shape
    X = np.random.rand(m, k)
    Y = np.random.rand(k, n)

    # store the error computed after each update
    errors = []
    for i in range(0, n_iter):
        # implements [Equation 6-4]
        # numpy's eye(a) builds an a x a identity matrix
        X = np.linalg.solve(np.dot(Y, Y.T) + lambda_ * np.eye(k), np.dot(Y, R.T)).T
        Y = np.linalg.solve(np.dot(X.T, X) + lambda_ * np.eye(k), np.dot(X.T, R))
        
        errors.append(mean_squared_error(R, np.dot(X, Y)))
        
        if i % 10 == 0:
            print('iteration %d is completed'%(i))
            #print(mean_squared_error(R, np.dot(X, Y)))
        
    R_hat = np.dot(X, Y)
    print('Error of rated movies: %.5f'%(mean_squared_error(R, np.dot(X, Y))))
    return R_hat, errors
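A minimal usage sketch for the function above; it assumes numpy and mean_squared_error are imported at module level as the function requires, and the matrix size, rank, and hyperparameters are arbitrary illustration values, not from the original project:

# approximate a random 20x15 utility matrix with k=5 latent factors (illustrative values)
import numpy as np
from sklearn.metrics import mean_squared_error

R = np.random.rand(20, 15)
R_hat, errors = compute_ALS(R, n_iter=50, lambda_=0.1, k=5)
print('final reconstruction MSE: %.5f' % errors[-1])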
Example #3
def exercise_2b():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    kf = ShuffleSplit(n_splits=10, train_size=0.9, test_size=0.1, random_state=0)
    # kf = KFold(1000, n_folds=10, shuffle=False, random_state=None)
    accuracy_lst = np.zeros([49, 2], dtype=float)
    accuracy_current = np.zeros(10, dtype=float)
    for k in range(1,50):
        iterator = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = KNeighborsClassifier(n_neighbors=k)
            clf.fit(X_train, y_train)
            accuracy_current[iterator] = (1. - clf.score(X_test, y_test))  # misclassification rate
            iterator += 1
            print(mean_squared_error(y_test, clf.predict(X_test)))
        accuracy_lst[k-1, 0] = accuracy_current.mean()
        accuracy_lst[k-1, 1] = accuracy_current.var()#*2 #confidence interval 95%
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, accuracy_lst[:, 1], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K')
    plt.ylabel('Variance')
    plt.show()
Example #4
File: dataMining.py  Project: wangwf/Codes
def test_regression():
    from numpy.random import rand
    x = rand(40,1) # explanatory variable
    y = x*x*x+rand(40,1)/5 # dependent variable

    from sklearn.linear_model import LinearRegression
    linreg = LinearRegression()
    linreg.fit(x,y)

    from numpy import linspace, matrix
    from pylab import plot, show
    xx = linspace(0,1,40)
    plot(x,y,'o',xx,linreg.predict(matrix(xx).T),'--r')
    show()
        
    from sklearn.metrics import mean_squared_error
    print(mean_squared_error(linreg.predict(x), y))

    from numpy import corrcoef
    # 'data' is assumed to be the feature matrix (e.g. the iris data) loaded earlier in the original script
    corr = corrcoef(data.T) # .T gives the transpose
    print(corr)


    from pylab import pcolor, colorbar, xticks, yticks
    from numpy import arange
    pcolor(corr)
    colorbar() # add
    # arranging the names of the variables on the axis
    xticks(arange(0.5,4.5),['sepal length',  'sepal width', 'petal length', 'petal width'],rotation=-20)
    yticks(arange(0.5,4.5),['sepal length',  'sepal width', 'petal length', 'petal width'],rotation=-20)
    show()
Example #5
def simple_cv(valence_regressors, arousal_regressors, valence_movie_matrices, arousal_movie_matrices, 
	valence_labels_movies, arousal_labels_movies, threshold, valence_movie_t, arousal_movie_t):
	n_train_matrices = 21
	n_valid_matrices = 6
	n_test_matrices = 3
	valence_labels = join_vectors(valence_labels_movies)
	arousal_labels = join_vectors(arousal_labels_movies)
	print(len(valence_labels), len(arousal_labels))
	processes = []
	n_valence_features, n_arousal_features = threshold_n_features(threshold, valence_movie_t, arousal_movie_t)
	valence_predictions, arousal_predictions = np.array([], dtype = 'float'), np.array([], dtype = 'float')
	for i in range(0, 10):
		valence_test_predictions, arousal_test_predictions = fold_training(valence_predictions, arousal_predictions, i, 
			valence_regressors, arousal_regressors, 
			valence_movie_matrices, arousal_movie_matrices, 
			valence_labels_movies, arousal_labels_movies, 
			n_test_matrices, n_train_matrices, n_valid_matrices, 
			n_valence_features, n_arousal_features)
		valence_predictions = np.append(valence_predictions, valence_test_predictions)
		arousal_predictions = np.append(arousal_predictions, arousal_test_predictions)

	print(math.sqrt(mean_squared_error(valence_labels, valence_predictions)),
		np.corrcoef(valence_labels, valence_predictions)[0][1])
	print(math.sqrt(mean_squared_error(arousal_labels, arousal_predictions)),
		np.corrcoef(arousal_labels, arousal_predictions)[0][1])
Example #6
def test_als_warm_start():
    X, y, coef = make_user_item_regression(label_stdev=0)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    X_train = sp.csc_matrix(X_train)
    X_test = sp.csc_matrix(X_test)

    fm = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    y_pred = fm.predict(X_test)
    error_10_iter = mean_squared_error(y_pred, y_test)

    fm = als.FMRegression(n_iter=5, l2_reg_w=0, l2_reg_V=0, rank=2)
    fm.fit(X_train, y_train)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter = mean_squared_error(y_pred, y_test)

    fm.fit(sp.csc_matrix(X_train), y_train, n_more_iter=5)
    print(fm.iter_count)
    y_pred = fm.predict(X_test)
    error_5_iter_plus_5 = mean_squared_error(y_pred, y_test)

    print(error_5_iter, error_5_iter_plus_5, error_10_iter)

    assert error_10_iter == error_5_iter_plus_5
Example #7
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    `Bagging Predictors?. Machine Learning 24(2): 123-140 (1996). """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 1, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
Example #8
def model_metrics(model, X, y, data_split):
    
    print('-----------------------------------------')
    print('Metrics:')
    print('-----------------------------------------')
    y_test = data_split['y_test']
    y_pred = data_split['y_pred']
    
    X_train = data_split['X_train']
    X_test = data_split['X_test']
    y_train = data_split['y_train']
    
    print('MSE\t', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE\t', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    
    score_train = model.score(X_train, y_train)
    score_test = model.score(X_test, y_test)
    # 'cols' is assumed to be the global list of feature columns used elsewhere in the original script
    score_general = model.score(X[cols].fillna(0), y)
    
    print('\n')
    print('-----------------------------------------')
    print('Scores:')
    print('-----------------------------------------')
    print('Train\t', score_train)
    print('Test\t', score_test)
    print('General\t', score_general)
    print('-----------------------------------------\n')

    return score_test, score_general
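model_metrics expects a prepared data_split dict and relies on the global cols list of feature columns; a hypothetical sketch of how it might be wired up (the estimator choice and the names X, y, cols stand in for the caller's real data, not anything from the original project):

# hypothetical setup for model_metrics; 'cols', 'X', and 'y' are assumed to exist
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X[cols].fillna(0), y, test_size=0.3)
model = LinearRegression().fit(X_train, y_train)
data_split = {'X_train': X_train, 'X_test': X_test,
              'y_train': y_train, 'y_test': y_test,
              'y_pred': model.predict(X_test)}
score_test, score_general = model_metrics(model, X, y, data_split)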
Example #9
def gradient_boosting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
	#kernel: linear, poly, rbf, sigmoid, precomputed

	# the original decrement loops just copied the integers
	rows = rows_temp
	columns = columns_temp

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]



	rotated = convert_list_to_matrix(features_values, rows, columns)
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
	X_train, X_test = X[:200], X[200:]
	y_train, y_test = y[:200], y[200:]
	est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
	mean_squared_error(y_test, est.predict(X_test))
Example #10
def train_test(features,labels,features_test,labels_test):
    verbose ("Features size",features.shape)
    verbose ("Labels size",labels.shape)
    verbose ("Features size",features_test.shape)
    verbose ("Labels size",labels_test.shape)
    #T_train_xgb = xgb.DMatrix(features, labels)
    verbose ("Training...")
    #params = {"objective":"reg:linear",
    #                               "booster" : "gbtree",
    #                               "eta":0.1,
    #                               "max_depth":10,
    #                               "subsample":0.85,
    #                               "colsample_bytree":0.7}
    #gbm = xgb.train(dtrain=T_train_xgb,params=params)
    regressor = skflow.TensorFlowLinearRegressor()  # TODO: convert uint32 to a TensorFlow DType
    regressor.fit(features, labels)
    verbose("Predict...")
    #preds=gbm.predict(xgb.DMatrix(features_test))
    preds=regressor.predict(features_test)
    preds[preds<0]=0
    verbose(preds)
    verbose(len(preds))
    verbose("MSE: ")
    score = metrics.mean_squared_error(preds,labels_test)
    verbose("Original",score)
    score = metrics.mean_squared_error(np.round(preds), labels_test)
    verbose("round",score)
    verbose ("RMSLE:")
    score = rmsle(preds, labels_test)
    verbose ("Original",score)
    score = rmsle(np.round(preds), labels_test)
    verbose ("round",score)
Example #11
def get_grid_search_values(model, grid_params, x_train, y_train, x_test, y_test, scoring_criteria = 'mean_squared_error'):  
    # Run a grid search on a model, and return the train / test score and MSE on the best result
    
    # Input
    # model: scikit-learn model
    # grid_params: dict of parameter space
    # x_train: independent variables training set
    # y_train: dependent variable training set
    # x_test: independent variables test set
    # y_test: dependent variable test set
    # scoring_criteria: model scoring criteria
    
    # Output
    # best_model: model that produced the best results
    # para_search.best_params_: best grid parameters
    # train_score: training score
    # test_score: test score
    # train_mse: training mse
    # test_mse: test mse
    
    # note: on scikit-learn >= 0.18 this is model_selection.GridSearchCV with scoring='neg_mean_squared_error'
    para_search = grid_search.GridSearchCV(model, grid_params, scoring = scoring_criteria, cv = 5).fit(x_train, y_train)
    best_model = para_search.best_estimator_
    train_score = best_model.score(x_train, y_train)
    test_score = best_model.score(x_test, y_test)
    train_mse = metrics.mean_squared_error(best_model.predict(x_train), y_train)
    test_mse = metrics.mean_squared_error(best_model.predict(x_test), y_test)
    
    return best_model, para_search.best_params_, train_score, test_score, train_mse, test_mse 
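A hypothetical call, tuning a Ridge regression over a small alpha grid; the estimator, grid, and data names (x_train, y_train, x_test, y_test) are illustrative, and note the function itself uses the pre-0.18 grid_search module and 'mean_squared_error' scoring string:

from sklearn.linear_model import Ridge

grid_params = {'alpha': [0.01, 0.1, 1.0, 10.0]}
best_model, best_params, train_score, test_score, train_mse, test_mse = \
    get_grid_search_values(Ridge(), grid_params, x_train, y_train, x_test, y_test)
print(best_params, train_mse, test_mse)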
Example #12
File: Training.py  Project: mronian/dermacv
def classify():
    mela=np.loadtxt('TrainingData_Melanoma')
    notmela=np.loadtxt('TrainingData_NotMelanoma')
    
    mela_labels=[1]*32
    notmela_labels=[-1]*26
    
    data=np.append(mela, notmela, axis=0)
    
    means=np.mean(data, axis=0)
    varis=np.var(data, axis=0)
    
    for i in range(len(data[0])-1):
        data[:,i]=(((data[:,i]-means[i])/3*varis[i])+1)/2
    
    labels_d=np.append(mela_labels, notmela_labels)
    X_train, X_test, y_train, y_test = train_test_split(data, labels_d, test_size=0.25)
    clf=svm.LinearSVC()
    clf=clf.fit(X_train, y_train)
    p_test=clf.predict(X_test)

    p_train=clf.predict(X_train)
    
    print(clf.score(X_train, y_train))
    print(clf.score(X_test, y_test))
    rmse_train = mean_squared_error(y_train, p_train)**0.5
    rmse_test = mean_squared_error(y_test, p_test)**0.5

    print(rmse_train, rmse_test)
Example #13
def execute(model, data, savepath, *args, **kwargs):

    fluence_divisions = [3.3E18, 3.3E19, 3.3E20]
    flux_divisions = [5e11,2e11,1e11]

    fig, ax = plt.subplots(1,3, figsize = (30,10))
    for x in range(len(fluence_divisions)):
        data.remove_all_filters()
        data.add_inclusive_filter("fluence n/cm2", '<', fluence_divisions[x])
        l_train = len(data.get_y_data())
        model.fit(data.get_x_data(), np.array(data.get_y_data()).ravel())

        data.remove_all_filters()
        data.add_inclusive_filter("fluence n/cm2", '>=', fluence_divisions[x])
        l_test = len(data.get_y_data())
        Ypredict = model.predict(data.get_x_data())
        RMSE = np.sqrt(mean_squared_error(Ypredict, np.array(data.get_y_data()).ravel()))

        matplotlib.rcParams.update({'font.size': 26})
        ax[x].scatter(data.get_y_data(), Ypredict, color='black', s=10)
        ax[x].plot(ax[x].get_ylim(), ax[x].get_ylim(), ls="--", c=".3")
        ax[x].set_xlabel('Measured ∆sigma (MPa)')
        ax[x].set_ylabel('Predicted ∆sigma (MPa)')
        ax[x].set_title('Testing Fluence > {}'.format(fluence_divisions[x]))
        ax[x].text(.1, .88, 'RMSE: {:.3f}'.format(RMSE),fontsize = 30, transform=ax[x].transAxes)
        ax[x].text(.1, .83, 'Train: {}, Test: {}'.format(l_train, l_test), transform=ax[x].transAxes)

    fig.tight_layout()
    plt.subplots_adjust(bottom = .2)
    fig.savefig(savepath.format("fluence_extrapolation"), dpi=150, bbox_inches='tight')
    plt.close()

    fig, ax = plt.subplots(1, 3, figsize=(30, 10))
    for x in range(len(flux_divisions)):
        data.remove_all_filters()
        data.add_inclusive_filter("flux n/cm2/s", '>', flux_divisions[x])
        l_train = len(data.get_y_data())
        model.fit(data.get_x_data(), np.array(data.get_y_data()).ravel())

        data.remove_all_filters()
        data.add_inclusive_filter("flux n/cm2/s", '<=', flux_divisions[x])
        l_test = len(data.get_y_data())
        Ypredict = model.predict(data.get_x_data())
        RMSE = np.sqrt(mean_squared_error(Ypredict, np.array(data.get_y_data()).ravel()))

        matplotlib.rcParams.update({'font.size': 26})
        ax[x].scatter(data.get_y_data(), Ypredict, color='black', s=10)
        ax[x].plot(ax[x].get_ylim(), ax[x].get_ylim(), ls="--", c=".3")
        ax[x].set_xlabel('Measured ∆sigma (MPa)')
        ax[x].set_ylabel('Predicted ∆sigma (MPa)')
        ax[x].set_title('Testing Flux < {:.0e}'.format(flux_divisions[x]))
        ax[x].text(.1, .88, 'RMSE: {:.3f}'.format(RMSE), fontsize=30, transform=ax[x].transAxes)
        ax[x].text(.1, .83, 'Train: {}, Test: {}'.format(l_train, l_test), transform=ax[x].transAxes)

    fig.tight_layout()
    plt.subplots_adjust(bottom=.2)
    fig.savefig(savepath.format("flux_extrapolation"), dpi=150, bbox_inches='tight')
    plt.close()
Example #14
def stats_by_latlev(x_ppi, y_ppi, x_pp, y_pp, r_mlp, lat, lev, datafile):
    # Initialize
    Tmean = np.zeros((len(lat), len(lev)))
    qmean = np.zeros((len(lat), len(lev)))
    Tbias = np.zeros((len(lat), len(lev)))
    qbias = np.zeros((len(lat), len(lev)))
    rmseT = np.zeros((len(lat), len(lev)))
    rmseq = np.zeros((len(lat), len(lev)))
    rT = np.zeros((len(lat), len(lev)))
    rq = np.zeros((len(lat), len(lev)))
    for i in range(len(lat)):
        print('Loading data for latitude {:d} of {:d}'.format(i, len(lat)))
        T_true, q_true, T_pred, q_pred = \
            load_one_lat(x_ppi, y_ppi, x_pp, y_pp, r_mlp, i, datafile,
                         minlev=np.min(lev))
        # Get means of true output
        Tmean[i, :] = np.mean(T_true, axis=0)
        qmean[i, :] = np.mean(q_true, axis=0)
        # Get bias from means
        Tbias[i, :] = np.mean(T_pred, axis=0) - Tmean[i, :]
        qbias[i, :] = np.mean(q_pred, axis=0) - qmean[i, :]
        # Get rmse
        rmseT[i, :] = np.sqrt(
            metrics.mean_squared_error(T_true, T_pred,
                                       multioutput='raw_values'))
        rmseq[i, :] = np.sqrt(
            metrics.mean_squared_error(q_true, q_pred,
                                       multioutput='raw_values'))
        # Get correlation coefficients
        for j in range(len(lev)):
            rT[i, j], _ = scipy.stats.pearsonr(T_true[:, j], T_pred[:, j])
            rq[i, j], _ = scipy.stats.pearsonr(q_true[:, j], q_pred[:, j])
    return Tmean.T, qmean.T, Tbias.T, qbias.T, rmseT.T, rmseq.T, rT.T, rq.T
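For reference, multioutput='raw_values' is what lets the code above fill a whole level profile at once: mean_squared_error then returns one MSE per output column instead of averaging them. A minimal sketch with made-up numbers:

import numpy as np
from sklearn import metrics

y_true = np.array([[1.0, 2.0], [3.0, 4.0]])
y_pred = np.array([[1.5, 2.0], [3.0, 3.0]])
print(metrics.mean_squared_error(y_true, y_pred, multioutput='raw_values'))  # [0.125 0.5]
print(metrics.mean_squared_error(y_true, y_pred))                            # 0.3125, the average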
Example #15
def plot_stages(reg, X_train, y_train, X_test, y_test, ax, title=""):
    test_score = np.zeros(reg.n_estimators, dtype=np.float64)
    train_score = np.zeros(reg.n_estimators, dtype=np.float64)
    
    for i, y_pred in enumerate(reg.staged_predict(X_test)):
        test_score[i] = np.sqrt(mean_squared_error(y_test, y_pred))
        
    for i, y_pred_train in enumerate(reg.staged_predict(X_train)):    
        train_score[i] = np.sqrt(mean_squared_error(y_train, y_pred_train))
        
    min_test_score = min(test_score)
    min_test_score_stage = np.argmin(test_score)    
    learning_rate=reg.learning_rate
    max_depth = reg.max_depth
    
    ax.hold("on")
    ax.set_title('RMSE per stage for :'+str(title),fontsize=9)
    ax.plot(np.arange(reg.n_estimators), train_score, 'b-', label='Training Set RMSE')
    ax.plot(np.arange(reg.n_estimators), test_score, 'r-', label='Test Set RMSE')
    ax.set_xlim((0,reg.n_estimators))
    ymin , ymax = ax.get_ylim()
    xmin , xmax = ax.get_xlim()
    ax.annotate('Learning rate : '+str(learning_rate), xy=(0.8*xmax, 0.85*ymax), xytext=(0.8*xmax, 0.85*ymax))
    ax.annotate('Max depth : '+str(max_depth), xy=(0.8*xmax, 0.8*ymax), xytext=(0.8*xmax, 0.8*ymax))
    ax.annotate('Min RMSE : '+str(round(min_test_score,3)), xy=(min_test_score_stage+10,min_test_score+0.1), xytext=(min_test_score_stage+10,min_test_score+0.1),color = "red")
    ax.legend(loc='upper right')
    ax.grid(True)
    ax.hlines(y=min_test_score,xmin=0,xmax=reg.n_estimators,linestyles="dashed",color="grey")
    ax.vlines(x=min_test_score_stage,ymin=0,ymax=1,linestyles="dashed",color="grey")
    ax.set_xlabel('Boosting Iterations')
    ax.set_ylabel('RMSE')
    ax.hold("off")
Example #16
File: mcmc.py  Project: ibayer/fastFM
def find_init_stdev(fm, X_train, y_train, X_vali=None, y_vali=None,
                    stdev_range=None):
    if not stdev_range:
        stdev_range = [0.1, 0.1, 0.2, 0.5, 1.0]

    if not isinstance(fm, FMRegression):
        raise Exception("only implemented for FMRegression")

    # just using a dummy here
    if X_vali is None:
        X_test = X_train[:2, :]
    else:
        X_test = X_vali

    best_init_stdev = 0
    best_mse = np.finfo(np.float64).max
    for init_stdev in stdev_range:
        fm.init_stdev = init_stdev
        y_pred_vali = fm.fit_predict(X_train, y_train, X_test)
        if X_vali is None:
            y_pred = fm.predict(X_train)
            mse = mean_squared_error(y_pred, y_train)
        else:
            mse = mean_squared_error(y_pred_vali, y_vali)
        if mse < best_mse:
            best_mse = mse
            best_init_stdev = init_stdev
    return best_init_stdev, best_mse
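A hypothetical usage sketch with fastFM's MCMC regressor; the rank, iteration count, and the data names X_train, y_train, X_vali, y_vali are illustrative assumptions, not from the original project:

from fastFM.mcmc import FMRegression

fm = FMRegression(n_iter=100, rank=4)
best_init_stdev, best_mse = find_init_stdev(fm, X_train, y_train,
                                            X_vali=X_vali, y_vali=y_vali)
fm.init_stdev = best_init_stdev  # then refit with the selected value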
Example #17
def train_learning_model_decision_tree_ada_boost(df):
    #code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators = 500, learning_rate = 0.01, random_state = 1)

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    
    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)

    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
Example #18
def execute(model, data, savepath, *args, **kwargs):
    # Train the model using the training sets
    model.fit(data.get_x_data(), np.asarray(data.get_y_data()).ravel())
    overall_rms = np.sqrt(mean_squared_error(model.predict(data.get_x_data()), np.asarray(data.get_y_data()).ravel()))
    datasets = ['IVAR', 'ATR-1', 'ATR-2']
    colors = ['#BCBDBD', '#009AFF', '#FF0A09']
    fig, ax = plt.subplots()
    #calculate rms for each dataset
    for dataset in range(max(np.asarray(data.get_data("Data Set code")).ravel()) + 1):
        data.remove_all_filters()
        data.add_inclusive_filter("Data Set code", '=', dataset)
        Ypredict = model.predict(data.get_x_data())
        Ydata = np.asarray(data.get_y_data()).ravel()
        # calculate rms
        rms = np.sqrt(mean_squared_error(Ypredict, Ydata))
        # graph outputs
        ax.scatter(Ydata, Ypredict, s=7, color=colors[dataset], label= datasets[dataset], lw = 0)
        ax.text(.05, .83 - .05*dataset, '{} RMS: {:.3f}'.format(datasets[dataset],rms), fontsize=14, transform=ax.transAxes)

    ax.legend()
    ax.plot(ax.get_ylim(), ax.get_ylim(), ls="--", c=".3")
    ax.set_xlabel('Measured (MPa)')
    ax.set_ylabel('Predicted (MPa)')
    ax.set_title('Full Fit')
    ax.text(.05, .88, 'Overall RMS: %.4f' % (overall_rms), fontsize=14, transform=ax.transAxes)
    fig.savefig(savepath.format(ax.get_title()), dpi=300, bbox_inches='tight')
    plt.clf()
    plt.close()
Example #19
    def test_rrf_vs_sklearn_reg(self):
        """Test R vs. sklearn on boston housing dataset. """
        from sklearn.datasets import load_boston
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error
        from sklearn.ensemble import RandomForestRegressor

        boston = load_boston()
        X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                            test_size=0.2, random_state=13)

        n_samples, n_features = X_train.shape
        mtry = int(np.floor(0.3 * n_features))
        # do 100 trees
        r_rf = RRFEstimatorR(**{'ntree': 100, 'nodesize': 1, 'replace': 0,
                                'mtry': mtry, 'corr.bias': False,
                                'sampsize': n_samples, 'random_state': 1234})
        r_rf.fit(X_train, y_train)
        y_pred = r_rf.predict(X_test)
        r_mse = mean_squared_error(y_test, y_pred)

        p_rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=1, bootstrap=False,
                                     max_features=mtry, random_state=1)
        p_rf.fit(X_train, y_train)
        y_pred = p_rf.predict(X_test)
        p_mse = mean_squared_error(y_test, y_pred)
        print('%.4f vs %.4f' % (r_mse, p_mse))
        # should be roughly the same (7.6 vs. 7.2)
        np.testing.assert_almost_equal(r_mse, p_mse, decimal=0)
Example #20
def test_boston_housing_regression_with_sample_weights():
    tm._skip_if_no_sklearn()
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    from sklearn.model_selection import KFold

    boston = load_boston()
    y = boston['target']
    X = boston['data']
    sample_weight = np.ones_like(y, 'float')
    # 'rng' is a module-level numpy RandomState defined elsewhere in the original test file
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)

    for train_index, test_index in kf.split(X):
        xgb_model = xgb.XGBRegressor().fit(
            X[train_index], y[train_index],
            sample_weight=sample_weight[train_index]
        )

        preds = xgb_model.predict(X[test_index])
        # test other params in XGBRegressor().fit
        preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3)
        preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0)
        preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3)
        labels = y[test_index]

        assert mean_squared_error(preds, labels) < 25
        assert mean_squared_error(preds2, labels) < 370
        assert mean_squared_error(preds3, labels) < 25
        assert mean_squared_error(preds4, labels) < 370
Example #21
def main():
    """ Test SVM from scikit learn on mnist data set.""" 

    (X_train, Y_train), (X_test, Y_test) =  data.preprocess_mnist() 
  

    model = SVC(kernel='poly', degree=2) 
    params = { "C" : np.logspace(0, 3, 4), 
               "gamma" : np.logspace(-7, 2, 4), 
               "coef0" : np.logspace(-4,4,4)} 

    grid = GridSearchCV(model, param_grid = params, 
                        cv=5, n_jobs = 5, pre_dispatch = "n_jobs")
    grid.fit(X_train, Y_train) 
    print(grid.best_params_)

    train_yy = grid.predict(X_train)
    test_yy = grid.predict(X_test) 

    train_err = 100*mean_squared_error(train_yy, Y_train) 
    test_err = 100*mean_squared_error(test_yy, Y_test) 
    
    print("Train. err:", train_err) 
    print("Test err:", test_err) 

    train_acc = accuracy_score(Y_train, train_yy)  
    test_acc = accuracy_score(Y_test, test_yy) 

    print("Train. acc:", train_acc) 
    print("Test acc:", test_acc) 
Example #22
    def train(self):
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        self.predictors = [x for x in self.df.columns if x not in [self.target_column, self.id_column]]
        xgb_param = self.clf.get_xgb_params()

        xgtrain  = xgb.DMatrix(self.df[self.predictors], label=self.df[self.target_column], missing=np.nan)
        # xgb.cv's progress-logging keyword was renamed across xgboost versions,
        # so fall back from show_progress to verbose_eval to neither.
        try:
            cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, show_progress=self.verbose)
        except TypeError:
            try:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose)
            except TypeError:
                cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=self.clf.get_params()['n_estimators'], nfold=5,
                    metrics=[self.scoring], early_stopping_rounds=self.early_stopping_rounds)
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(self.df[self.predictors], self.df[self.target_column],eval_metric=self.scoring)

        #Predict training set:
        train_df_predictions = self.clf.predict(self.df[self.predictors])

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:,1]
            print("Accuracy : %.4g" % metrics.accuracy_score(self.df[self.target_column].values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(self.df[self.target_column], train_df_predprob))
        elif self.target_type == 'linear':
            print("Mean squared error: %f" % metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions))
            print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(self.df[self.target_column].values, train_df_predictions)))
def rf_test(X,y):

    RF_model = RandomForestRegressor(100,n_jobs=-1)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)
    RF_model.fit(X_train,y_train)
    y_pred = RF_model.predict(X_test)
    print(mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred))
Example #24
def test():
    (X_train, Y_train), (X_test, Y_test) =  mnist.load_data() 
  
    # preprocess data
    X_train = X_train.reshape(60000, 784)
    X_test = X_test.reshape(10000, 784)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    model = pickle.load(open("svm_rbf.pickle","rb"))
    
    train_yy = model.predict(X_train)
    test_yy = model.predict(X_test) 

    train_err = 100*mean_squared_error(train_yy, Y_train) 
    test_err = 100*mean_squared_error(test_yy, Y_test) 
    
    print("Train. err:", train_err) 
    print("Test err:", test_err) 

    train_acc = accuracy_score(Y_train, train_yy)  
    test_acc = accuracy_score(Y_test, test_yy) 
    
    print("Train acc:", train_acc)
    print("Test acc:", test_acc)
Example #25
def main():
    """ Test SVM from scikit learn on mnist data set.""" 

    (X_train, Y_train), (X_test, Y_test) =  mnist.load_data() 
  
    # preprocess data
    X_train = X_train.reshape(60000, 784)
    X_test = X_test.reshape(10000, 784)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')


    model = SVC(kernel='rbf', gamma=0.02, C=10) 
    model.fit(X_train, Y_train)
    
    train_yy = model.predict(X_train)
    test_yy = model.predict(X_test) 

    train_err = 100*mean_squared_error(train_yy, Y_train) 
    test_err = 100*mean_squared_error(test_yy, Y_test) 
    
    print("Train. err:", train_err) 
    print("Test err:", test_err) 

    train_acc = accuracy_score(Y_train, train_yy)  
    test_acc = accuracy_score(Y_test, test_yy) 

    pickle.dump(model, open("svm_rbf", "wb"))
Example #26
File: mvar.py  Project: UK-7/Regression
def polyRegressionKFold(inputFiles, deg=2):
      print "***************************"
      print "Degree: %s" % deg
      start_time = time.time()
      errors = []
      for File in inputFiles:
            print "___________________________"
            print "Data Set: %s" % File
            data = tools.readData(File)
            data = data[np.argsort(data[:,0])]
            X = data[:, :-1]
            Y = data[:, len(data[1,:]) - 1]
            kf = KFold(n_splits=10, shuffle=True)
            TrainError = 0
            TestError = 0
            for train, test in kf.split(X):
                  pol = PolynomialFeatures(deg)
                  Z = pol.fit_transform(X[train]) 
                  Z_test = pol.fit_transform(X[test])     
                  theta = regress(Z, Y[train])
                  Y_hat = np.dot(Z, theta)
                  Y_hat_test = np.dot(Z_test, theta)
                  TrainError += mean_squared_error(Y[train], Y_hat)
                  TestError += mean_squared_error(Y[test], Y_hat_test)
            TestError /= kf.get_n_splits()
            TrainError /= kf.get_n_splits()
            errors.append([TestError, deg])
            print("---------------------------")
            print("Test Error: %s" % TestError)
            print("Train Error: %s" % TrainError)
      time_taken = time.time() - start_time
      print("Time Taken for primal: %s" % str(time_taken))
      return np.asarray(errors)
Example #27
def FindPolyregDegree():
    loadDB()
    points = getAllPoints2(30)
    print(len(points))
    X = []
    Y = []
    for point in points:
        Y.append(point['vehicleSpeed']/point['enginespeed'])
        X.append([point['fuelrate']])

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.8)
    train_error = np.empty(10)
    test_error = np.empty(10)
    for degree in range(10):
        est = make_pipeline(PolynomialFeatures(degree), Ridge())
        est.fit(X_train, y_train)
        train_error[degree] = mean_squared_error(y_train, est.predict(X_train))
        test_error[degree] = mean_squared_error(y_test, est.predict(X_test))

    plt.plot(np.arange(10), train_error, color='green', label='train')
    plt.plot(np.arange(10), test_error, color='red', label='test')
    plt.title("Degree vs Error - Finding optimal model for regression")
    plt.ylabel('mean squared error')
    plt.xlabel('degree')
    plt.legend(loc='lower left')
    plt.show()
Example #28
def multi_regression():
    '''
    Multiple linear regression
    :return:
    '''
    from sklearn.model_selection import train_test_split
    X = df.iloc[:, :-1].values
    y = df['MEDV'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    slr = LinearRegression()
    slr.fit(X_train, y_train)
    y_train_pred = slr.predict(X_train)
    y_test_pred = slr.predict(X_test)
    # compute the Mean Squared Error (MSE)
    print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
    # MSE train: 19.958, test: 27.196 => overfitting

    # compute R^2
    # If R^2 = 1, the model fits the data perfectly, with a corresponding MSE = 0.
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # plot
    plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.show()
Example #29
File: ca_models.py  Project: pkravik/kaggle
def testingGBM(X_train, Y_train, X_test, Y_test):
    params = {'verbose':2, 'n_estimators':100, 'max_depth':50, 'min_samples_leaf':20, 'learning_rate':0.1, 'loss':'ls', 'max_features':None}
    test_init = Ridge(alpha = 0.1, normalize = True, fit_intercept=True)
    gbm2 = GradientBoostingRegressor(**params)
    gbm2.fit(X_train, Y_train["Ca"])
    yhat_gbm = gbm2.predict(X_test)
    mse_gbm = mean_squared_error(Y_test["Ca"], yhat_gbm)
    rmse_gbm = math.sqrt(mse_gbm)  # keep the scores instead of discarding them
    
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    
    for i, y_pred in enumerate(gbm2.staged_decision_function(X_test)):
        test_score[i]=mean_squared_error(Y_test["Ca"], y_pred)
    
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, gbm2.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
             label='Test Set Deviance')
    
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')
    plt.show()
Example #30
def demo(X = None, y = None, test_size = 0.1):
    
    if X is None:
        boston = load_boston()
        X = pd.DataFrame(boston.data)
        y = pd.DataFrame(boston.target)



    base_estimator = DecisionTreeRegressor(max_depth = 5)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape)

    # If you want to compare with BaggingRegressor.
    # bench =  BaggingRegressor(base_estimator = base_estimator, n_estimators = 10, max_samples = 1, oob_score = True).fit(X_train, y_train)
    # print bench.score(X_test, y_test)
    # print mean_squared_error(bench.predict(X_test), y_test)

    clf = BasicSegmenterEG_FEMPO(ngen=30,init_sample_percentage = 1, n_votes=10, n = 10, base_estimator = base_estimator,
        unseen_x = X_test, unseen_y = y_test)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y = clf.predict(X_test)
    print(mean_squared_error(y, y_test))
    print(y.shape)

    return clf, X_test, y_test
Example #31
# regression() and neighbor() are assumed to be helpers, defined earlier in the
# original script, that return base models already fitted on the training matrix
regression1 = regression()
regression2 = regression(normalize=True)
neighbor1 = neighbor()
neighbor2 = neighbor(normalize=True)
X_pred1 = regression1.predict(X_train)
X_pred2 = neighbor1.predict(X_train)
X_pred3 = regression2.predict(X_train)
X_pred4 = neighbor2.predict(X_train)

stack_train = np.array([
    X_pred1[X_test > 0], X_pred2[X_test > 0], X_pred3[X_test > 0],
    X_pred4[X_test > 0]
]).T

clf = LinearRegression()
clf.fit(stack_train, X_test[X_test > 0])

stack_test = np.array(
    [X_pred1.ravel(),
     X_pred2.ravel(),
     X_pred3.ravel(),
     X_pred4.ravel()]).T
predicted = clf.predict(stack_test).reshape(X.shape)

r2 = r2_score(y[y > 0], predicted[y > 0])
print(r2)  # 0.345168402139

rmse = np.sqrt(mean_squared_error(y[y > 0], predicted[y > 0]))
print(rmse)  # 0.905436328833
Example #32
#%%
rf = RandomForestRegressor()


#%%
rf.fit(X_train,y_train)


#%%
pred2 = rf.predict(X_test)


#%%
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, pred2))
print('MSE:', metrics.mean_squared_error(y_test, pred2))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred2)))


#%%
from sklearn.metrics import r2_score
r2_score(y_test, pred2)


#%%
sns.distplot((y_test-pred2),bins=50)


#%%

Example #33
                       sep="\t",
                       index_col="frame",
                       usecols=["frame", "/actuator/inflate"])


# read the time series from the signal file (GetData is a helper defined elsewhere)
series = GetData('intensity_2P_breathing signal.txt')

#view top 10 records
print(series.head(10))
print(series.dtypes)

from math import sqrt
from matplotlib import pyplot

X = series.values
size = int(len(X) * 0.75)
train, test = X[0:size], X[size:len(X)]
# walk-forward validation
history = [x for x in train]
predictions = list()
for i in range(len(test)):
    # make prediction
    predictions.append(history[-1])
    # observation
    history.append(test[i])
# report performance
rmse = sqrt(mean_squared_error(test, predictions))
print('RMSE: %.3f' % rmse)
# line plot of observed vs predicted
pyplot.plot(test)
pyplot.plot(predictions)
pyplot.show()
Example #34
# the snippet is truncated; the sibling error lists follow the same pattern
train_err4 = [0] * len(max_n_estimators)
test_err4 = [0] * len(max_n_estimators)
train_err6 = [0] * len(max_n_estimators)
test_err6 = [0] * len(max_n_estimators)
train_err8 = [0] * len(max_n_estimators)
test_err8 = [0] * len(max_n_estimators)

max_depths = [4, 6, 8]

for i, o in enumerate(max_n_estimators):
    print('AdaBoostClassifier: learning a decision tree with n_estimators=' + str(o))
    dt4 = DecisionTreeClassifier(max_depth=4)
    dt6 = DecisionTreeClassifier(max_depth=6)
    dt8 = DecisionTreeClassifier(max_depth=8)
    bdt4 = AdaBoostClassifier(base_estimator=dt4, n_estimators=o)
    bdt6 = AdaBoostClassifier(base_estimator=dt6, n_estimators=o)
    bdt8 = AdaBoostClassifier(base_estimator=dt8, n_estimators=o)
    bdt4.fit(X_train, y_train)
    bdt6.fit(X_train, y_train)
    bdt8.fit(X_train, y_train)
    train_err4[i] = mean_squared_error(y_train,
                                     bdt4.predict(X_train))
    test_err4[i] = mean_squared_error(y_test,
                                    bdt4.predict(X_test))
    train_err6[i] = mean_squared_error(y_train,
                                     bdt6.predict(X_train))
    test_err6[i] = mean_squared_error(y_test,
                                    bdt6.predict(X_test))
    train_err8[i] = mean_squared_error(y_train,
                                     bdt8.predict(X_train))
    test_err8[i] = mean_squared_error(y_test,
                                    bdt8.predict(X_test))
    print('---')

# Plot results
print('plotting results')
plt.figure()
Example #35
selector = preprocessing.Selector(datax, datay)
selector.load(opt.partition)
trainx, trainy = selector.training_set()
ymax = np.max(np.abs(trainy))
trainy = trainy.flatten() / ymax

scaler = StandardScaler()
scaler.fit(trainx)
trainx = scaler.transform(trainx)

n_feature = len(trainx[0])

svr = SVR(kernel='linear')
svr.fit(trainx, trainy)

mse = mean_squared_error(trainy, svr.predict(trainx))
print(mse)

pool = []

output = open(opt.output, 'w')
output.write('STEP\tRMFEA\tMSE\tFEATURES\n')
output.write('0\tfull\t%.4e\t[FULL]\n' % mse)

step = 1
while n_feature > opt.end:

    svr = SVR(kernel='linear')
    rfe = RFE(svr, n_features_to_select=n_feature - 1, step=1)

    rfe.fit(trainx, trainy)
Example #36
#(kernel(x_train[0,:],x_test[0,:]))

##a = np.matrix([[0],[0.2],[1],[3]])
##print(a.shape)
##b = np.matrix([[0],[0.2],[1]])
##print(kernel_matrix(a,b))

#print(y_test)
g = gaussian_process(x_train,y_train,x_test,train_samples,test_samples,sigma)
#print(g)

##This is for fitting the linear model in python
regr = linear_model.LinearRegression()
regr.fit(x_train,y_train)
y_pred = regr.predict(test[:,1])
print("Mean squared error for the linear model is: %.10f"
      % mean_squared_error(y_test,y_pred))

## This is for printing the accuracy of GP regression!

error = g-y_test
#print(error)
sq_error = np.square(error)
#print(sq_error)
mean_sq_error = sum(sq_error)/sq_error.shape[0]
print("Mean squared errorfor GPR is: %.10f"
      % mean_sq_error)



Example #37
import pandas as pd
import pickle
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

df = pd.read_csv('./data/train.csv')

df = df.iloc[:,1:]
df.drop(columns=['cut','clarity'], inplace=True)
df = pd.get_dummies(df)

X = df.drop(columns='price')
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y)
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

pickle.dump(model, open('./models/gb_model.sav', 'wb'))
Example #38
    Pred_YList.append(round(ansY[index]))
print(Pred_YList)
accuracy = accuracy_score(Test_YList, Pred_YList)
print("Y accuracy: %.2f%%" % (accuracy * 100.0))

# draw real and predict points
plt.scatter(Pred_XList, Pred_YList, linewidths=0)
plt.scatter(Test_XList, Test_YList, linewidths=0)
plt.ylabel('real and predict')
plt.show()

# create predict XY list
Pred_XYList = []
for index in range(0, test_num):
    tmp_list = []
    tmp_list.append(Pred_XList[index])
    tmp_list.append(Pred_YList[index])
    Pred_XYList.append(tmp_list)

# create real XY list
Real_XYList = []
for index in range(0, test_num):
    tmp_list = []
    tmp_list.append(Test_XList[index])
    tmp_list.append(Test_YList[index])
    Real_XYList.append(tmp_list)

# calculate MSE
MSE_XY = mean_squared_error(Real_XYList, Pred_XYList)
print("XY MSE: %.2f" % MSE_XY)
              epochs=1,
              batch_size=batch_size,
              verbose=2,
              shuffle=False)
    model.reset_states()
# make predictions
trainPredict = model.predict(trainX, batch_size=batch_size)
model.reset_states()
testPredict = model.predict(testX, batch_size=batch_size)
# invert predictions
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Test Score: %.2f RMSE' % (testScore))
# shift train predictions for plotting
trainPredictPlot = numpy.empty_like(dataset)
trainPredictPlot[:, :] = numpy.nan
trainPredictPlot[look_back:len(trainPredict) + look_back, :] = trainPredict
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(dataset)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) -
                1, :] = testPredict
# plot baseline and predictions
plt.plot(scaler.inverse_transform(dataset))
plt.plot(trainPredictPlot)
Example #40
def RMSE(ytest, y_predict):
    # use the function's own argument rather than a global y_test
    return np.sqrt(mean_squared_error(ytest, y_predict))
Example #41
'''

# Create an input function for predictions.
# Note: Since we're making just one prediction for each example, we don't
# need to repeat or shuffle the data here.
prediction_input_fn = lambda: my_input_fn(
    my_feature, targets, num_epochs=1, shuffle=False)

# Call predict() on the linear_regressor to make predictions.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)

# Format predictions as a NumPy array, so we can calculate error metrics.
predictions = np.array([item['predictions'][0] for item in predictions])

# Print Mean Squared Error and Root Mean Squared Error.
mean_squared_error = metrics.mean_squared_error(predictions, targets)
root_mean_squared_error = math.sqrt(mean_squared_error)
print("Mean Squared Error (on training data): %0.3f" % mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f" %
      root_mean_squared_error)
#Mean Squared Error (on training data): 56367.025
#Root Mean Squared Error (on training data): 237.417
'''
Is this a good model? How would you judge how large the error is?
Because the mean squared error (MSE) is hard to interpret, we often look at the root mean squared error (RMSE) instead. A nice property of RMSE is that it can be read on the same scale as the original targets.
Let's compare the RMSE to the difference between the maximum and minimum of the targets:
'''

min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value
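A sketch of the comparison the text describes, assuming root_mean_squared_error from the earlier snippet is still in scope:

print("Min. Median House Value: %0.3f" % min_house_value)
print("Max. Median House Value: %0.3f" % max_house_value)
print("Difference between Min. and Max.: %0.3f" % min_max_difference)
print("Root Mean Squared Error: %0.3f" % root_mean_squared_error)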
Example #42
y_test = ss_y.transform(y_test)

# SVR with a linear kernel
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train)
linear_svr_y_predict = linear_svr.predict(x_test)

# SVR with a polynomial kernel
poly_svr = SVR(kernel='poly')
poly_svr.fit(x_train, y_train)
poly_svr_y_predict = poly_svr.predict(x_test)

# SVR with an RBF kernel
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test)


print('The R2 ', r2_score(y_test, linear_svr_y_predict))
print('The MSE ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict)))
print('The MAE ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict)))


print('The R2 ', r2_score(y_test, poly_svr_y_predict))
print('The MSE ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict)))
print('The MAE ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict)))


print('The R2 ', r2_score(y_test, rbf_svr_y_predict))
print('The MSE ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict)))
print('The MAE ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict)))
Example #43
                if "L" in direction:
                    label = row.split(",")[3]
                else:
                    label = row.split(",")[6]
                break
        if "V" in file:
            label = "3"
        if "8" not in label and "9" not in label and "X" not in label and '.' not in label:
            #if "." in label:
            #label='4'
            labelList.append(int(label))
            nameList.append(naming)
    return np.array(labelList), np.array(nameList)


Y_true, N_true = load_valY()
Y_pre, N_pre = load_va()
true = []
pre = []
for i in range(len(Y_true) - 1):
    for j in range(len(Y_pre) - 1):
        if N_true[i] == N_pre[j]:
            true.append(Y_true[i])
            pre.append(Y_pre[j])
            break

print(sklm.accuracy_score(true, pre))
print(sklm.classification_report(true, pre))
print(sklm.confusion_matrix(true, pre))
print(sklm.mean_squared_error(true, pre))
Example #44
def train_model(learning_rate, steps, batch_size, input_feature="total_rooms"):
    """Trains a linear regression model of one feature.
  
  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    input_feature: A `string` specifying a column from `california_housing_dataframe`
      to use as input feature.
  """

    periods = 10
    steps_per_period = steps / periods

    my_feature = input_feature
    my_feature_data = california_housing_dataframe[[my_feature]]
    my_label = "median_house_value"
    targets = california_housing_dataframe[my_label]

    # Create feature columns
    feature_columns = [tf.feature_column.numeric_column(my_feature)]

    # Create input functions
    training_input_fn = lambda: my_input_fn(
        my_feature_data, targets, batch_size=batch_size)
    prediction_input_fn = lambda: my_input_fn(
        my_feature_data, targets, num_epochs=1, shuffle=False)

    # Create a linear regressor object.
    my_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
        my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns, optimizer=my_optimizer)

    # Set up to plot the state of our model's line each period.
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title("Learned Line by Period")
    plt.ylabel(my_label)
    plt.xlabel(my_feature)
    sample = california_housing_dataframe.sample(n=300)
    plt.scatter(sample[my_feature], sample[my_label])
    colors = [cm.coolwarm(x) for x in np.linspace(-1, 1, periods)]

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    root_mean_squared_errors = []
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        linear_regressor.train(input_fn=training_input_fn,
                               steps=steps_per_period)
        # Take a break and compute predictions.
        predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        predictions = np.array(
            [item['predictions'][0] for item in predictions])

        # Compute loss.
        root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(predictions, targets))
        # Occasionally print the current loss.
        print("  period %02d : %0.2f" % (period, root_mean_squared_error))
        # Add the loss metrics from this period to our list.
        root_mean_squared_errors.append(root_mean_squared_error)
        # Finally, track the weights and biases over time.
        # Apply some math to ensure that the data and line are plotted neatly.
        y_extents = np.array([0, sample[my_label].max()])

        weight = linear_regressor.get_variable_value(
            'linear/linear_model/%s/weights' % input_feature)[0]
        bias = linear_regressor.get_variable_value(
            'linear/linear_model/bias_weights')

        x_extents = (y_extents - bias) / weight
        x_extents = np.maximum(np.minimum(x_extents, sample[my_feature].max()),
                               sample[my_feature].min())
        y_extents = weight * x_extents + bias
        plt.plot(x_extents, y_extents, color=colors[period])
    print("Model training finished.")

    # Output a graph of loss metrics over periods.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Output a table with calibration data.
    calibration_data = pd.DataFrame()
    calibration_data["predictions"] = pd.Series(predictions)
    calibration_data["targets"] = pd.Series(targets)
    display.display(calibration_data.describe())

    print("Final RMSE (on training data): %0.2f" % root_mean_squared_error)
Example #45
from sklearn import metrics

# create the data
rdm = np.random.RandomState(2)
xtrain = 10 * rdm.rand(30)
ytrain = 8 + 4 * xtrain + rdm.rand(30) * 3

# fit a linear regression model
model = LinearRegression()
model.fit(xtrain[:, np.newaxis], ytrain)

# compute the predictions
ytest = model.predict(xtrain[:, np.newaxis])

# compute the mean squared error
mse = metrics.mean_squared_error(ytrain, ytest)

# compute the root mean squared error
rmse = np.sqrt(mse)

# sum of squared differences between the predictions and the mean of the original data (SSR)
ssr = ((ytest - ytrain.mean()) ** 2).sum()

# sum of squared differences between the original data and its mean (SST)
sst = ((ytrain - ytrain.mean()) ** 2).sum()

# coefficient of determination
r2 = ssr / sst

# the same R^2, via model.score
r2 = model.score(xtrain[:, np.newaxis], ytrain)
Example #46
                       label=y_val[:, i],
                       weight=items["perishable"] * 0.25 + 1)

    watchlist = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(plst,
                      dtrain,
                      num_rounds,
                      watchlist,
                      early_stopping_rounds=50,
                      verbose_eval=50)

    val_pred.append(model.predict(dval))
    test_pred.append(model.predict(dtest))

print("Validation mse:",
      mean_squared_error(y_val,
                         np.array(val_pred).transpose())**0.5)

p_val = np.array(val_pred).transpose()
df_val = pd.DataFrame(
    p_val,
    index=df_2017.index,
    columns=pd.date_range("2017-07-26",
                          periods=16)).stack().to_frame("unit_sales")
df_val.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

df_val.to_csv('out/xgb_0115_pred.csv', float_format='%.4f', index=None)
df_true.to_csv('out/xgb_0115_true.csv', float_format='%.4f', index=None)

print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
Example #47
# To see the predicted values and the actual values for comparison

# In[17]:


df1.plot(kind='bar',figsize=(10,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()


# In[18]:


print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


# # Model 2: Random Forest Regressor

# In[19]:


# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train)
Example #48
# # ## mean_squared_error(Y_true, Y_predict)
# print("Simple Linear Regression MSE:  " + str(mean_squared_error(df['price'], Yhat)))
#
# ###5 Model 2: Multiple Linear Regression
# # Price = -15678.742628061467 + 52.65851272 x horsepower + 4.69878948 x curb-weight + 81.95906216 x engine-size + 33.58258185 x highway-mpg
# # calculate the R^2
# # fit the model
Z = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
lm.fit(Z, df['price'])
# Find the R^2
print("Multiple Linear Regression R^2: " +
      str(lm.score(Z, df['price'])))  #that ~ 80.896 %
# calculate the MSE
Y_predict_multifit = lm.predict(Z)
print("Multiple Linear Regression MSE: " +
      str(mean_squared_error(df['price'], Y_predict_multifit)))
#
# ###6 Model 3: Polynomial Fit
# poly = PolynomialFeatures(degree = 3)
# X_poly = poly.fit_transform(X)
#
# poly.fit(X_poly, Y)
# lin2 = LinearRegression()
# lin2.fit(X_poly, Y)
# Ypred = lin2.predict(X_poly)
# r2 = r2_score(Y,Ypred) #0.651793603702672
# print("Polynomial Fit R^2:             " + str(r2))
# print("Polynomial Fit MSE:             " + str(mean_squared_error(df['price'], Ypred)))

##5 Multiple Linear Regression
# lm = LinearRegression()
Example #49
t0 = time.time()
regr.fit(x_train, y_train.ravel())
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

# append the same diagnostics to a log file
with open("output.txt", "a") as outF:
    print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit, file=outF)
    print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict), file=outF)
    print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_regr), file=outF)
    print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_regr), file=outF)
    print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_regr)), file=outF)

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_regr))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_regr))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_regr)))

x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)

plt.scatter(x_test_dim, y_test_dim, s=5, c='k', marker='o', label='Matlab')
plt.scatter(x_test_dim, y_regr_dim, s=5, c='r', marker='+', label='Multi-layer Perceptron')
#plt.title('Relaxation term $R_{ci}$ regression')
plt.ylabel('$R_{ci}$ $[J/m^3/s]$')
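
regr, sc_x, and sc_y are created earlier in the source script; given the 'Multi-layer Perceptron' label, the setup is presumably along these lines (an assumption, not the original code):

# Hypothetical setup matching the names used above
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

sc_x = StandardScaler()   # feature scaler, inverted above via inverse_transform
sc_y = StandardScaler()   # target scaler
regr = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500)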
예제 #50
0
X = iris.data[:, :2]  # use only the first two features
Y = iris.target
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=3)
x = X_train
y = y_train

# Inputs must be 2-D: one row per sample, one column per feature
# x = x.reshape(-1, 1)  # would turn a 1-D input into n rows x 1 column
model = lm.Ridge(150, fit_intercept=True, max_iter=1000)  # alpha=150: strong L2 penalty
model.fit(x, y)
pred_y = model.predict(x)  # feed the samples x through the model to get predictions

# 输出模型的评估指标
print('平均绝对值误差:', sm.mean_absolute_error(y, pred_y))
print('平均平方误差:', sm.mean_squared_error(y, pred_y))
print('中位绝对值误差:', sm.median_absolute_error(y, pred_y))
print('R2得分:', sm.r2_score(y, pred_y))

# Plot the results
mp.figure("Linear Regression", facecolor='lightgray')
mp.title('Linear Regression', fontsize=16)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.xlabel('x')
mp.ylabel('y')

# x has two feature columns, so plot against the first feature only
mp.scatter(x[:, 0], y, s=60, marker='o', c='dodgerblue', label='Points')
mp.scatter(x[:, 0], pred_y, s=30, marker='x', c='orangered', label='Predictions')
mp.tight_layout()
mp.legend()
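
The mp, lm, and sm aliases (and iris) are defined earlier in the source; presumably imports along these lines (an assumption):

# Presumed imports for the aliases used above (not shown in the excerpt)
import matplotlib.pyplot as mp
import sklearn.linear_model as lm
import sklearn.metrics as sm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()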
예제 #51
0
def rmse(prediction, ground_truth):
    # Keep only the rated entries (nonzero cells of the ground-truth matrix)
    prediction = np.mat(prediction)
    ground_truth = np.mat(ground_truth)
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))
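
A quick usage sketch (values are made up; zeros mark unrated cells, which the nonzero mask excludes):

import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error

pred = [[4.1, 0.0], [2.9, 3.8]]    # hypothetical predicted ratings
truth = [[4.0, 0.0], [3.0, 4.0]]   # hypothetical ground truth (0 = unrated)
print(rmse(pred, truth))           # RMSE over the three rated cells only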
예제 #52
0
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 8]
regressor = LinearRegression(normalize=True)
current_features = []
# Spliting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15)
best_features = []
size = len(X_train)
i = 0
best_score = 0   # avoid shadowing the builtin max
prev_mse = 0.0
m_mse = 0.0
# `features` is assumed to be a list of candidate column names defined earlier
while len(features) > 0 and i < len(features):
    # Fetch the next candidate feature
    current_features.append(features[i])
    X_in_test = X_train[current_features]
    y_in_test = y_train.values
    regressor.fit(X_in_test, y_in_test)
    scr = regressor.score(X_in_test, y_in_test)
    mse = met.mean_squared_error(y_in_test, regressor.predict(X_in_test))
    print('\n ADDED FEATURE ' + str(features[i]) + ' RMSE ', math.sqrt(mse))
    print('\n R2 SCORE IS ', scr)
    if scr > best_score or m_mse < prev_mse:
        best_score = scr
        best_features.append(features[i])
        m_mse = mse
    # Note: rejected features are never popped from current_features in the original
    features.remove(features[i])
    prev_mse = mse
print(best_features)
print(' MAX ', best_score)
예제 #53
0
def DT_main_seq(start, stop, testGroup, segmentName):
    print('\n----------Start-----------\n')
    #    (n_estimators,
    #     max_depth,
    #     min_samples_split,
    #     learning_rate,
    #     loss,
    #     start,
    #     stop,
    #     testGroup,
    #     segmentName) = parsingInit()
    n_estimators = 1000
    max_depth = 2
    min_samples_split = 2
    learning_rate = 0.01
    loss = 'ls'

    flowRates_Train = np.array([i for i in range(start, stop + 10, 10)])

    flowRates_Test = np.array(
        [i for i in range(testGroup, testGroup + 10, 10)])

    flowRates_reTrain = np.append(flowRates_Train, flowRates_Test)

    #The 160 flow rate data is corrupted!!
    #TODO: recollect the data
    flowRates_Train = np.delete(flowRates_Train,
                                np.where(flowRates_Train == 160))
    flowRates_Test = np.delete(flowRates_Test, np.where(flowRates_Test == 160))
    flowRates_reTrain = np.delete(flowRates_reTrain,
                                  np.where(flowRates_reTrain == 160))

    print('Train: ', flowRates_Train)
    print('Test: ', flowRates_Test)
    print('reTrain: ', flowRates_reTrain)

    print('1. Extracting Data... ')
    #Train Data
    X_Train, y_thic_Train, y_flow_Train = getXData(KPI_fileName, objectName,
                                                   segment_Numbers,
                                                   flowRates_Train,
                                                   segmentName, features)
    featureNames = X_Train.columns

    #Test Data
    X_Test, y_thic_Test, y_flow_Test = getXData(KPI_fileName, objectName,
                                                segment_Numbers,
                                                flowRates_Test, segmentName,
                                                features)

    #ReTrain Data
    X_reTrain, y_thic_reTrain, y_flow_reTrain = getXData(
        KPI_fileName, objectName, segment_Numbers, flowRates_reTrain,
        segmentName, features)

    #%% Preprocessing Data converting to float32 and removing NaN
    print('2. Preprocessing Data...')
    imp1 = Imputer(missing_values='NaN', strategy='mean', axis=0)
    #    imp2 = Imputer(missing_values=0, strategy='mean', axis=0)

    X_Train, y_thic_Train = preProcess(X_Train, y_thic_Train)
    X_Train = imp1.fit_transform(X_Train)

    X_Test, y_thic_Test = preProcess(X_Test, y_thic_Test)
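    # Note: fit_transform below re-fits the imputer on the test set; reusing the
    # training statistics via imp1.transform(X_Test) would avoid test-set leakage
    # (the same applies to X_reTrain further down)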
    X_Test = imp1.fit_transform(X_Test)

    X_reTrain, y_thic_reTrain = preProcess(X_reTrain, y_thic_reTrain)
    X_reTrain = imp1.fit_transform(X_reTrain)

    #%%
    if not os.path.exists(destinationFolder):
        os.makedirs(destinationFolder)

    paramsGBR = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': loss
    }

    model = ensemble.GradientBoostingRegressor(**paramsGBR)

    clf_Tr = clone(model)

    #%%
    print('3. Building Model with all the Samples...')
    X_Train, y_thic_Train = shuffle(X_Train, y_thic_Train)

    print('\t Shape Train: ', X_Train.shape)
    print('\t DataType Train: ', X_Train.dtype)

    print('\t Shape Train: ', y_thic_Train.shape)
    print('\t DataType Train: ', y_thic_Train.dtype)

    min_max_scaler_Train_X = preprocessing.MinMaxScaler().fit(X_Train)
    scaler_Train_X = preprocessing.StandardScaler().fit(X_Train)

    X_Tr = scaler_Train_X.transform(X_Train)
    X_Tr = min_max_scaler_Train_X.transform(X_Tr)

    clf_Tr = model.fit(X_Tr, y_thic_Train)

    #%%
    print('4. Results for Training:')
    y_pred1 = clf_Tr.predict(X_Tr)
    featureImportance(clf_Tr, featureNames,
                      str(testGroup) + '_initialRankings_' + segmentName)

    # Training-set metrics
    mse_Train = mean_squared_error(y_thic_Train, y_pred1)
    mae_Train = mean_absolute_error(y_thic_Train, y_pred1)
    medae_Train = median_absolute_error(y_thic_Train, y_pred1)
    r2_Train = r2_score(y_thic_Train, y_pred1)
    exvs_Train = explained_variance_score(y_thic_Train, y_pred1)

    print('\t Mean Squared Error      :', mse_Train)
    print('\t Mean Absolute Error     :', mae_Train)
    print('\t Median Absolute Error   :', medae_Train)
    print('\t R2 Score                :', r2_Train)
    print('\t Explained Variance Score:', exvs_Train)

    #%%
    print('\n5. Processing emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Test, y_thic_Test = shuffle(X_Test, y_thic_Test)

    print('\t Shape Test: ', X_Test.shape)
    print('\t DataType Test: ', X_Test.dtype)

    print('\t Shape y Test: ', y_thic_Test.shape)
    print('\t DataType y Test: ', y_thic_Test.dtype)

    print('6. Transforming emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)

    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_te: ', X_Te.dtype)

    print('7. Predicting KPI for Signals for Group ', flowRates_Test, ' ...')
    y_pred_Te = clf_Tr.predict(X_Te)

    print('8. Results for Predicting KPI for Signals for Group ',
          flowRates_Test, ' ...')
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)

    print('\t Mean Squared Error      :', mse_Test)
    print('\t Mean Absolute Error     :', mae_Test)
    print('\t Median Absolute Error   :', medae_Test)
    print('\t R2 Score                :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    fileNamecsv = destinationFolder + '/FeatureRanking_' + str(
        testGroup) + '_' + segmentName + '.csv'
    print('9. Saving Results', fileNamecsv, ' ...')
    np.savetxt(
        fileNamecsv, [[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]],
        delimiter=',',
        header=
        'Mean Squared Error, Mean Absolute Error, Median Absolute Error,R2 Score, Explained Variance Score',
        comments='')

    print('10. Retraining the Model with new emission Signal...')
    X_reTrain, y_thic_reTrain = shuffle(X_reTrain, y_thic_reTrain)

    print('\t Shape reTrain: ', X_reTrain.shape)
    print('\t DataType reTrain: ', X_reTrain.dtype)

    print('\t Shape y reTrain: ', y_thic_reTrain.shape)
    print('\t DataType y reTrain: ', y_thic_reTrain.dtype)

    min_max_scaler_Train_X2 = preprocessing.MinMaxScaler().fit(X_reTrain)
    scaler_Train_X2 = preprocessing.StandardScaler().fit(X_reTrain)

    X_reTr = scaler_Train_X2.transform(X_reTrain)
    X_reTr = min_max_scaler_Train_X2.transform(X_reTr)

    print('\t Shape X_reTr: ', X_reTr.shape)
    print('\t DataType X_reTr: ', X_reTr.dtype)

    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)
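    # Note: X_Te is scaled with the original training scalers, while the retrained
    # model below is fit on data scaled with scaler_Train_X2 / min_max_scaler_Train_X2;
    # transforming X_Test with the new scalers would keep the two pipelines consistent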

    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_Te: ', X_Te.dtype)

    clf_reTr = model.fit(X_reTr, y_thic_reTrain)
    print('11. New Results with emission signals Incorporated:')
    y_pred_Te = clf_reTr.predict(X_Te)
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)

    print('\t Mean Squared Error      :', mse_Test)
    print('\t Mean Absolute Error     :', mae_Test)
    print('\t Median Absolute Error   :', medae_Test)
    print('\t R2 Score                :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    print('12. Saving the new Results', fileNamecsv, ' ...')
    with open(fileNamecsv, 'a') as f:
        df = pd.DataFrame([[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]])
        df.to_csv(f, index=False, header=False)
    featureImportance(clf_reTr, featureNames,
                      str(testGroup) + '_reTrainedRankings_' + segmentName)

    print('-----------:Finished!:--------------- \n')
예제 #54
0
def randomforest_predict():
    warnings.filterwarnings('ignore')

    df_data = pd.read_csv("data/housing.data", delim_whitespace=True)
    X = df_data.drop(["MEDV"], axis=1)
    y = df_data["MEDV"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=128)

    param_grid = {
        'n_estimators': [5, 10, 20, 50, 100, 200],  # tree number
        'max_depth': [3, 5, 7],  # max depth
        'max_features': [0.6, 0.7, 0.8, 1]  # max features
    }

    rf = RandomForestRegressor()
    grid = GridSearchCV(rf, param_grid=param_grid, cv=3)
    grid.fit(X_train, y_train)
    print("best_params", grid.best_params_)

    rf_reg = grid.best_estimator_
    print(rf_reg)

    estimator = rf_reg.estimators_[3]
    dot_data = tree.export_graphviz(estimator, out_file=None, filled=True, rounded=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png("result/rf_reg.png")

    feature_names = X.columns
    feature_importances = rf_reg.feature_importances_
    indices = np.argsort(feature_importances)[::-1]
    for index in indices:
        print("feature %s (%f)" % (feature_names[index], feature_importances[index]))

    plt.figure(figsize=(16, 8))
    plt.title("feature importance of random forest")
    plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
    plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
    plt.show()

    rst = {"label": y_test, "prediction": rf_reg.predict(X_test)}
    rst = pd.DataFrame(rst)
    print(rst.head())

    rst['label'].plot(style='k.', figsize=(15, 5))
    rst['prediction'].plot(style='r.')
    plt.legend(fontsize=15, markerscale=3)
    plt.tick_params(labelsize=25)
    plt.grid()
    plt.show()

    # Note: this RMSE is computed on all of X (train + test), so it overstates
    # out-of-sample performance; scoring on X_test alone would be the stricter check
    MSE = metrics.mean_squared_error(y, rf_reg.predict(X))
    print("RMSE (full data):", np.sqrt(MSE))

    submission = {"prediction": rf_reg.predict(X_test)}
    submission = pd.DataFrame(submission)
    submission.to_csv("result/price_predict_randomforest.csv")

    y_predict = rf_reg.predict(X_test)
    # Use plain arrays for plotting (a pandas Series does not support [:, np.newaxis])
    x_data = np.arange(len(y_test))
    y_test_data = y_test.values
    y_predict_data = y_predict
    plt.plot(x_data, y_test_data, label='Price')
    plt.plot(x_data, y_predict_data, label='Predict price')
    plt.xlabel('Entity')
    plt.ylabel('Price')
    plt.title('Price prediction (random forest)')
    plt.legend()
    plt.savefig('result/price_predict_random_forest.png')
    plt.show()
예제 #55
0
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
print(model)
score_train = model.score(X_train, y_train)
score_test = model.score(X_test, y_test)

parameters = {}  # empty grid: this GridSearchCV just cross-validates the default model
model = GridSearchCV(LinearRegression(), parameters, cv=5)
model.fit(X_train, y_train)

output = model.predict(X_test)
score_r2_pred = r2_score(y_test, output)

rmse = np.sqrt(mean_squared_error(y_test, output))
Obsv_tbl = [['Linear Regressor', score_train, score_test, score_r2_pred, rmse]]

#XGBOOST REGRESSION MODEL
import xgboost as xgb
from sklearn.metrics import mean_squared_error as ms
from math import sqrt

model = xgb.XGBRegressor(colsample_bytree=0.4603,
                         gamma=0.0468,
                         learning_rate=0.05,
                         max_depth=3,
                         n_estimators=2500,
                         reg_alpha=0.4640,
                         reg_lambda=0.8571,
                         random_state=7)
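
The excerpt ends right after constructing the XGBRegressor; presumably it is then fitted and scored like the linear model above. A sketch under that assumption, reusing the ms and sqrt imports:

# Hedged continuation: fit and evaluate the boosted model
model.fit(X_train, y_train)
output = model.predict(X_test)
print('XGBoost RMSE:', sqrt(ms(y_test, output)))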
예제 #56
0
kfcv = KFold(n_splits=10)

#RidgeCV with 10-fold cross-validation(similar to ISLR)#
#rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=True)
rcv = RidgeCV(alphas=alphas, scoring='neg_mean_squared_error', normalize=True, cv=kfcv)
rcv.fit(X_train, Y_train)

print('\nBest RidgeCV alpha value:')
print(rcv.alpha_)

#Ridge regression using best alpha#
rbest = Ridge(alpha=rcv.alpha_, normalize=True)
rbest.fit(X_train, Y_train)

print('\nBest Ridge MSE:')
print(mean_squared_error(Y_test, rbest.predict(X_test)))

print('\nRidge Coeficients:')
print(pd.Series(rbest.coef_, index=xcols))

#Full Lasso regression#
lasso = Lasso(max_iter=10000, normalize=True)
coefs2 = []

for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(scale(X), Y)
    coefs2.append(lasso.coef_)

ax2 = plt.gca()
ax2.plot(alphas*2, coefs2)  # coefs2 holds the lasso coefficient paths collected above
        print("setp = {}, loss = {:.5f}".format(step+1, loss_val))
        
    
    # model 최적화 
    a_up, b_up = sess.run([a, b])
    print("수정된 기울기 : {}, 절편 : {}".format(a_up, b_up)) 
    
    # 테스트용 공급 data 
    feed_data_test = {X : x_test, Y : y_test}
    
    # Y(정답) vs model(예측치)
    y_true = sess.run(Y, feed_dict = feed_data_test)
    y_pred = sess.run(model, feed_dict = feed_data_test)
    
    # model 평가 
    mse = mean_squared_error(y_true, y_pred)
    print("MSE = ", mse)
        
'''
1차 : 학습율 = 0.5, 반복학습 100회
MSE =  0.72902936
2차 : 학습율 = 0.4, 반복학습 100회
MSE =  0.5829428
3차 : 학습율 = 0.4, 반복학습 200회
MSE =  0.7733004
'''        

def test(self):
    self.results = self.model.predict(self.testX)
    self.finalError = mean_squared_error(self.results, self.testY)
    # (fragment) tail of the plot_model helper used by the ridge example below;
    # the full definition presumably computes X_plot/y_plot from the fitted model
    plt.plot(X_plot[:, 0], y_plot, color='r')
    plt.axis([-3, 3, 0, 6])
    plt.show()

print("==============岭回归解决多项式回归问题=======================")
def RidgeRegression(degree=2, alpha=1):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),    #多项式的增加特征
        ("std_scaler", StandardScaler()),               #归一化
        ("ridge_reg", Ridge(alpha=alpha))               #岭回归替代了线性回归
    ])

# Tiny alpha: barely any regularization
ridge1_reg = RidgeRegression(20, 0.0001)
ridge1_reg.fit(X_train, y_train)
y1_predict = ridge1_reg.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y1_predict))
plot_model(ridge1_reg)

# Moderate alpha
ridge2_reg = RidgeRegression(20, 3)
ridge2_reg.fit(X_train, y_train)
y2_predict = ridge2_reg.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y2_predict))
plot_model(ridge2_reg)

# Huge alpha: the model collapses toward a constant
ridge3_reg = RidgeRegression(20, 10000)
ridge3_reg.fit(X_train, y_train)
y3_predict = ridge3_reg.predict(X_test)
print("MSE: ", mean_squared_error(y_test, y3_predict))
plot_model(ridge3_reg)

예제 #60
0
def fmean_squared_error(ground_truth, predictions):
    # Despite the name, this returns the *root* mean squared error
    fmean_squared_error_ = mean_squared_error(ground_truth, predictions)**0.5
    return fmean_squared_error_
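
Helpers like this are typically wrapped into a scorer for grid search; a hedged usage sketch (make_scorer is standard scikit-learn API, the rest is assumed):

from sklearn.metrics import make_scorer

# Lower RMSE is better, so flip the sign per scikit-learn's scorer convention
RMSE = make_scorer(fmean_squared_error, greater_is_better=False)
# e.g. GridSearchCV(estimator, param_grid, scoring=RMSE)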