Example #1
import numpy as np
from sklearn.model_selection import ShuffleSplit
import learning_curve  # local helper module used throughout these examples

def gen_learning_curve(alg, X, y, title, n_iter=10, test_size=0.25):
    print("\nCalculating Learning Curve...")
    cv = ShuffleSplit(n_splits=n_iter, test_size=test_size,  # n_splits replaced n_iter in sklearn >= 0.18
                      random_state=np.random.randint(0, 123456789))

    midpoint, diff = learning_curve.plot_learning_curve(alg, title, X, y, (0.6, 1.01), cv=cv, n_jobs=-1)
    return midpoint, diff
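A hypothetical call, assuming a classifier and arrays X and y are already in scope (the estimator and title below are illustrative, not from the original):

# Illustrative usage only; KNeighborsClassifier, X and y are assumed here
from sklearn.neighbors import KNeighborsClassifier
midpoint, diff = gen_learning_curve(KNeighborsClassifier(n_neighbors=5), X, y,
                                    'Learning Curve (KNN)')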
Example #2

'''
# Use the train/test scores for a first read: underfitting, good fit, or overfitting
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train, y_train)
train_score = knn.score(x_train, y_train)
test_score = knn.score(x_test, y_test)
print('Scores for the chosen model -- train score: {}, test score: {}'.format(train_score, test_score))

from sklearn.model_selection import ShuffleSplit
from learning_curve import plot_learning_curve

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=78)
plt.figure()
plot_learning_curve(knn,
                    'Learning Curve for KNN Diabetes',
                    x,
                    y,
                    ylim=(0, 1),
                    cv=cv)
plt.show()
# The model underfits somewhat
'''
4. Feature selection and data visualization
'''
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(k=2)  # keep the two highest-scoring features
x_new = selector.fit_transform(x, y)
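SelectKBest records which columns it kept; the retained indices and the per-feature scores can be inspected directly:

# Indices of the two retained features and the univariate scores behind them
print('selected feature indices:', selector.get_support(indices=True))
print('feature scores:', selector.scores_)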

from sklearn.model_selection import KFold

result = []
for name, model in models:
    cv = KFold(n_splits=10)
Example #3
scores = cross_val_score(logreg, X_std, y, cv=3, scoring='accuracy')
print('Scores over the 3 folds: %s' % scores)



# Check how performance evolves as the training sample grows

X_train = X_std[500:]
y_train = y[500:]
X_valid = X_std[:500]
y_valid = y[:500]

print(X_train.shape)
print(X_valid.shape)


n_train_samples = range(100, len(X_train), 100)
scores = np.empty(len(n_train_samples))
for k, n_train in enumerate(n_train_samples):
    logreg.fit(X_train[:n_train], y_train[:n_train])
    scores[k] = logreg.score(X_valid, y_valid)

plt.plot(n_train_samples, scores)


from learning_curve import plot_learning_curve
# The original called plot_learning_curve() with no arguments, which cannot run;
# assuming the (estimator, title, X, y) signature used elsewhere on this page:
plot_learning_curve(logreg, 'Learning Curve (Logistic Regression)', X_std, y)
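The helper module aside, the same curve can be computed with scikit-learn's built-in learning_curve; a minimal sketch, assuming sklearn >= 0.18:

from sklearn.model_selection import learning_curve

# Returns the training sizes actually used plus per-split train/validation scores
train_sizes, train_scores, valid_scores = learning_curve(
    logreg, X_std, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 5))
plt.plot(train_sizes, train_scores.mean(axis=1), label='train')
plt.plot(train_sizes, valid_scores.mean(axis=1), label='validation')
plt.legend()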


Example #4
best = 0
for i in range(0, 100):
    current_score = blender(X_train, y_train, X_test, y_test)
    if current_score > best:
        best = current_score
"""


# RDF prediction model
clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, n_jobs=-1)

# plot learning curves
# n_splits replaced the legacy (n, n_iter=...) signature in sklearn >= 0.18
cv = ShuffleSplit(n_splits=40, test_size=0.2, random_state=0)
title = "Learning Curves (Random Forests)"
plot_learning_curve(clf, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1)

# train the model
clf.fit(X_train, y_train)

# predict the test classes
y_score = clf.predict_proba(X_test)

# fscore plot
fscore(y_test, y_score)

# saving the classifier
with open('classifier.pickle', 'wb') as handle:
    pickle.dump(clf, handle)

# saving the standard scaler
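The snippet is cut off above; for completeness, restoring the pickled classifier in a later session is the mirror operation (standard pickle API):

# Load the saved classifier back
with open('classifier.pickle', 'rb') as handle:
    clf = pickle.load(handle)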
Example #5
# DataFrame([data, index, columns, dtype, copy])
# print(df[:1])

# Inspect the correlation between any pair of features
# _ = sb.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5)
# plt.show()

# Compute the correlations between all features
# plt.figure(figsize=(12, 10))
# _ = sb.heatmap(df, annot=False)
# plt.show()

# Reduce the feature set
plot_learning_curve(LinearSVC(C=10.0),
                    "LinearSVC(C=10.0) Features: 11&14",
                    X[:, [11, 14]],
                    y,
                    ylim=(0.8, 1.0),
                    train_sizes=np.linspace(.05, 0.1, 5))

# Enlarge the training set
# plot_learning_curve(LinearSVC(C=10.0), 'Learning Curve (C=10.0)', X, y, ylim=(0.8, 1.01), train_sizes=np.linspace(.1, .992, 5))
# plot_learning_curve(LinearSVC(C=1.0), 'Learning Curve (C=1.0)', X, y, ylim=(0.8, 1.01), train_sizes=np.linspace(0.5, 0.2, 5))

# Try out several feature combinations
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
plot_learning_curve(
    Pipeline([
        ("fs", SelectKBest(f_classif, k=2)),  # select two features
        ("svc", LinearSVC(C=10.0))
    ]),
    # The call is truncated in the source; the arguments below are filled in
    # to match the pattern of the call above.
    "SelectKBest(f_classif, k=2) + LinearSVC(C=10.0)",
    X,
    y,
    ylim=(0.8, 1.0))
Example #6
import numpy as np
import pandas as pd
import learning_curve as lc
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the replacement
train = pd.read_csv('data/randomforest/data_train.csv').to_numpy()
test = pd.read_csv('data/randomforest/data_test.csv').to_numpy()

rf = RandomForestClassifier(n_estimators=500,
                            max_depth=5,
                            min_samples_leaf=2,
                            max_features='sqrt',
                            verbose=0)
lc.plot_learning_curve(rf, train[:, 1:], train[:, 0])
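The snippet stops at the plot. A plausible follow-up, assuming the label sits in column 0 of the training matrix and data_test.csv holds features only (both are assumptions, not stated in the original):

# Assumes train column 0 is the label and test contains features only
rf.fit(train[:, 1:], train[:, 0])
predictions = rf.predict(test)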
Example #7
    ax4 = fig.add_subplot(224)

    country = 'all'
    ls_model = models[0][country]
    ls_data = data[0][country]

    en_model = models[1][country]
    en_data = data[1][country]

    rf_model = models[2][country]
    rf_data = data[2][country]

    svr_model = models[3][country]
    svr_data = data[3][country]

    plot_learning_curve(ls_model, ls_data['X'], ls_data['y'], ax=ax1)
    plot_learning_curve(en_model, en_data['X'], en_data['y'], ax=ax2)
    plot_learning_curve(rf_model, rf_data['X'], rf_data['y'], ax=ax3)
    plot_learning_curve(svr_model, svr_data['X'], svr_data['y'], ax=ax4)

    ax1.set_title("Lasso Regression")
    ax2.set_title("ElasticNet Regression")
    ax3.set_title("Random Forest Regression")
    ax4.set_title("Support Vector Regression")

    for ax in [ax1, ax2, ax3, ax4]:
        ax.set_ylim((0.1, 1.2))

    plt.show()

    ## train the model

Example #8
import matplotlib.pyplot as plt

from sklearn.model_selection import ShuffleSplit  # sklearn.cross_validation was removed in 0.20
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from learning_curve import plot_learning_curve


digits = load_digits()
X, y = digits.data, digits.target


title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv)

title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv)

plt.show()
Example #10
                         ("linear_regression", linear_regression)])
    return pipeline


model = polynomial_model(degree=2)
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('train_score: {0:.6f}; cv_score: {1:.6f}'.format(train_score, cv_score))
'''
4. Plot the learning curves
'''
from learning_curve import plot_learning_curve
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0})'
degrees = [1, 2, 3]

plt.figure(figsize=(18, 4), dpi=200)
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(plt,
                        polynomial_model(degrees[i]),
                        title.format(degrees[i]),
                        X,
                        y,
                        ylim=(0.01, 1.01),
                        cv=cv)
Example #11
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rfc = RandomForestClassifier()
xgbc = XGBClassifier()

from sklearn.model_selection import cross_val_score, ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=78)
rfc_scores = cross_val_score(rfc, x_train, y_train, cv=cv)
xgbc_scores = cross_val_score(xgbc, x_train, y_train, cv=cv)
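To put numbers next to the curves plotted below, the CV runs can be summarized (a small addition, not in the original):

# Mean and spread of the cross-validated accuracy for each model
print('rfc:  %.3f +/- %.3f' % (rfc_scores.mean(), rfc_scores.std()))
print('xgbc: %.3f +/- %.3f' % (xgbc_scores.mean(), xgbc_scores.std()))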

# Plot learning curves to inspect how well each model fits
from learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 6))
plt.subplot(121)
plot_learning_curve(xgbc, 'xgbc', x_train, y_train, cv=cv)
plt.subplot(122)
plot_learning_curve(rfc, 'rfc', x_train, y_train, cv=cv)
plt.show()

rfc.fit(x_train, y_train)
rfc_y_test = rfc.predict(x_test)
rfc_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rfc_y_test
})
rfc_submission.to_csv('./rfc_submission.csv', index=False)

xgbc.fit(x_train, y_train)
xgbc_y_test = xgbc.predict(x_test)
xgbc_submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': xgbc_y_test
})
# Completed to mirror the rfc block above; the filename is assumed by symmetry
xgbc_submission.to_csv('./xgbc_submission.csv', index=False)