Example #1
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


def process_optimized_knr(data, max_number_of_neighbors=50):
    # Sweep over neighbor counts to find the KNN regressor with the best R^2.
    min_mean_sqr_error = 0
    max_r2_score = 0
    opt_neighbor = 0
    for neighbor in range(1, max_number_of_neighbors):
        model = KNeighborsRegressor(n_neighbors=neighbor)
        model.fit(data["X_train"], data["y_train"])
        predicted_values = model.predict(data["X_test"])
        mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
        r2_score_calc = r2_score(data["y_test"], predicted_values)
        if max_r2_score < r2_score_calc:
            # Keep the metrics for the best neighbor count seen so far
            min_mean_sqr_error = mean_sqr_error
            max_r2_score = r2_score_calc
            opt_neighbor = neighbor
    return {
        "name": "KNR",
        "data": {
            "neighbors": opt_neighbor
        },
        "mean_sqr_err": min_mean_sqr_error,
        "r2_score": max_r2_score
    }
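
# Hypothetical usage sketch (an assumption, not part of the original example): a synthetic
# regression dataset stands in for the caller's real data dictionary.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=200, n_features=3, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
result = process_optimized_knr(
    {"X_train": X_tr, "y_train": y_tr, "X_test": X_te, "y_test": y_te},
    max_number_of_neighbors=20,
)
print(result)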
Example #2
plot_residual(y3_train, ridge3_y_train_pred, y3_test, ridge3_y_test_pred)
plot_residual(y6_train, ridge6_y_train_pred, y6_test, ridge6_y_test_pred)
plot_residual(y9_train, ridge9_y_train_pred, y9_test, ridge9_y_test_pred)


import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
knn3 = KNeighborsRegressor()
knn6 = KNeighborsRegressor()
knn9 = KNeighborsRegressor()
knn3_scores = []
knn6_scores = []
knn9_scores = []

n_neighbors_space = np.arange(1, 11)
for n in n_neighbors_space:
    knn3.n_neighbors = n
    knn3_cv_scores = cross_val_score(knn3, X, y3, cv=10)
    knn3_scores.append(np.mean(knn3_cv_scores))
    
    knn6.n_neighbors = n
    knn6_cv_scores = cross_val_score(knn6, X, y6, cv=10)
    knn6_scores.append(np.mean(knn6_cv_scores))
    
    knn9.n_neighbors = n
    knn9_cv_scores = cross_val_score(knn9, X, y9, cv=10)
    knn9_scores.append(np.mean(knn9_cv_scores))

knn3 = KNeighborsRegressor(n_neighbors=n_neighbors_space[np.argmax(knn3_scores)])
print("The best value for n_neighbors is: ", n_neighbors_space[np.argmax(knn3_scores)])
knn3.fit(X_train_std, y3_train)
# Evaluate on the held-out split (X_test_std / y3_test are assumed to come from the same
# standardized split as X_train_std / y3_train).
test_prediction = knn3.predict(X_test_std)
mse = mean_squared_error(y3_test, test_prediction)
print(mse)
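
# An equivalent, more compact alternative (a sketch, not part of the original notebook):
# GridSearchCV runs the same n_neighbors sweep with 10-fold CV and exposes the best value.
from sklearn.model_selection import GridSearchCV

knn3_grid = GridSearchCV(KNeighborsRegressor(), {"n_neighbors": n_neighbors_space}, cv=10)
knn3_grid.fit(X, y3)
print("Best n_neighbors via GridSearchCV:", knn3_grid.best_params_["n_neighbors"])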

# + [markdown]
# ## Overfitting vs. Underfitting
# * Overfitting: the model scores well on the training data but poorly on the test data
# * Underfitting: the test score is higher than the training score, or both scores are low (a minimal sketch of the two cases follows below)
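
# +
# A minimal illustrative sketch of the two cases above (this cell is an assumption, not
# part of the original notebook; it uses synthetic data): with noisy data, k=1 scores
# nearly 1.0 on the training set but much lower on the test set (overfitting), while a
# very large k scores low on both (underfitting).
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.RandomState(42)
X_demo = np.sort(rng.uniform(0, 10, 100)).reshape(-1, 1)
y_demo = np.sin(X_demo).ravel() + rng.normal(scale=0.3, size=100)
X_tr, X_te, y_tr, y_te = X_demo[::2], X_demo[1::2], y_demo[::2], y_demo[1::2]

for k in (1, 5, 50):
    demo_knr = KNeighborsRegressor(n_neighbors=k).fit(X_tr, y_tr)
    print(f"k={k}: train R^2={demo_knr.score(X_tr, y_tr):.3f}, "
          f"test R^2={demo_knr.score(X_te, y_te):.3f}")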

# +
print('Training set `R^2` = ', knr.score(train_input, train_target_0))
print('Test set `R^2` = ', knr.score(test_input, test_target_0))

# +
# Set the number of neighbors to 3
knr.n_neighbors = 3

# Retrain the model
knr.fit(train_input, train_target_0)

print('Training set `R^2` = ', knr.score(train_input, train_target_0))
print('Test set `R^2` = ', knr.score(test_input, test_target_0))

# +
r2_train = np.zeros(20)
r2_test = np.zeros(20)
neighbors_n = np.zeros(20)
for n in range(1, 21):
    knr.n_neighbors = n
    knr.fit(train_input, train_target_0)
    r2_train[n - 1] = knr.score(train_input, train_target_0)
    r2_test[n - 1] = knr.score(test_input, test_target_0)
    neighbors_n[n - 1] = n

# Perch weight data (used for the train/test split below)
perch_weight = np.array([
    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110.0, 115.0,
    125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 130.0, 150.0, 145.0,
    150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 197.0, 218.0, 300.0, 260.0,
    265.0, 250.0, 250.0, 300.0, 320.0, 514.0, 556.0, 840.0, 685.0, 700.0,
    700.0, 690.0, 900.0, 650.0, 820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0,
    1000.0, 1100.0, 1000.0, 1000.0
])

# Split train set and test set
train_input, test_input, train_target, test_target = train_test_split(
    perch_length, perch_weight, random_state=42)

## Reshape: sklearn estimators expect a 2-D feature array, so convert the 1-D arrays to shape (n_samples, 1)
train_input = train_input.reshape(-1, 1)
test_input = test_input.reshape(-1, 1)

## KNN regression
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
print(knr.score(test_input, test_target))
print(knr.score(train_input, train_target))

# The first output (test R^2) is higher than the second (train R^2),
# so this model is underfitted.
# This problem could be solved by making the model more complex.
# Generally, a KNN model becomes more complex as the number of neighbors decreases.

knr.n_neighbors = 3  # Decrease number of neighbors
knr.fit(train_input, train_target)
print(knr.score(test_input, test_target))
print(knr.score(train_input, train_target))
Example #5
# This means the predictions differ from the targets by about 19 g on average
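# The figure quoted above comes from the test-set mean absolute error; a sketch of that
# computation (the exact call is an assumption, not shown in this snippet):
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(test_target, knr.predict(test_input))
print(mae)  # roughly 19 g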

sc = knr.score(train_input, train_target)
print(sc) # 0.9698823289099254
'''
Training set R2 : 0.9698823289099254
Test set R2 : 0.992809406101064
==> The test-set score (R2) is higher than the training-set R2 ==> underfitting
[1] Overfitting  : training-set score > test-set score
[2] Underfitting : training-set score < test-set score, or both scores are too low
Underfitting ==> to fix it, make the model more complex
       ==> with the KNR algorithm, the model becomes more complex by reducing K (the number of neighbors)
              reduce K from its default of 5 to 3 and retrain
'''

knr.n_neighbors = 3
knr.fit(train_input, train_target)
sc1 = knr.score(train_input, train_target)
sc2 = knr.score(test_input, test_target)
print('Training set r2:', sc1) # 0.9804899950518966
print('Test set r2:', sc2) # 0.9746459963987609
'''
The underfitting problem is resolved:
Training set r2: 0.9804899950518966
Test set r2: 0.9746459963987609
'''
'''
Raising a problem with the KNR model:
let's predict the weight of a perch that is 50 cm long and weighs 1.5 kg
'''
prd = knr.predict([[50]])
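
# Why the prediction above can be far off for a 50 cm perch: KNeighborsRegressor predicts
# the average target of the nearest training samples, so it cannot extrapolate beyond the
# training range. A sketch of inspecting those neighbors (kneighbors is standard sklearn
# API; interpreting the result this way is an assumption about this dataset):
distances, indexes = knr.kneighbors([[50]])
print(train_target[indexes])          # targets of the 3 nearest training samples
print(train_target[indexes].mean())   # identical to knr.predict([[50]])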
Example #6
# Performance Info
from sklearn import metrics
print(f"Printing MAE error(avg abs residual): {metrics.mean_absolute_error(y_test, predicted_values)}")
print(f"Printing MSE error: {metrics.mean_squared_error(y_test, predicted_values)}")
print(f"Printing RMSE error: {np.sqrt(metrics.mean_squared_error(y_test, predicted_values))}")
print('Variance score ( close to 1.0 the better ): %.2f' % r2_score(y_test, predicted_values))
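
# How these metrics relate, shown on tiny synthetic arrays (an illustrative sketch, not
# part of the original analysis): MAE is the mean absolute residual, MSE the mean squared
# residual, RMSE its square root, and R^2 is 1 - MSE / variance of the true targets.
import numpy as np
y_true_demo = np.array([3.0, 5.0, 7.0, 9.0])
y_pred_demo = np.array([2.5, 5.5, 6.0, 9.5])
residuals = y_true_demo - y_pred_demo
print(np.mean(np.abs(residuals)))                          # MAE
print(np.mean(residuals ** 2))                             # MSE
print(np.sqrt(np.mean(residuals ** 2)))                    # RMSE
print(1 - np.mean(residuals ** 2) / np.var(y_true_demo))   # R^2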


# Using KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Sweep over neighbor counts to find the best KNN performance up to max_number_of_neighbors
max_number_of_neighbors = 50
min_mean_sqr_error = 0
max_r2_score = 0
optimized_neighbor = 0
for neighbor in range(1, max_number_of_neighbors):
    model = KNeighborsRegressor(n_neighbors=neighbor)
    model.fit(X_train, y_train)
    predicted_values = model.predict(X_test)
    mean_sqr_error = mean_squared_error(y_test, predicted_values)
    r2_score_calc = r2_score(y_test, predicted_values)
    print(f"MAE (average absolute residual): {metrics.mean_absolute_error(y_test, predicted_values)}")
    print(f"MSE: {mean_sqr_error}")
    print(f"RMSE: {np.sqrt(mean_sqr_error)}")
    print('Variance score (the closer to 1.0, the better): %.2f' % r2_score_calc)
    if max_r2_score < r2_score_calc:
        # Keep the metrics for the best neighbor count seen so far
        min_mean_sqr_error = mean_sqr_error
        max_r2_score = r2_score_calc
        optimized_neighbor = neighbor

print("Best n_neighbors:", optimized_neighbor, "| MSE:", min_mean_sqr_error, "| R^2:", max_r2_score)