def gen_learning_curve(alg, X, y, title, n_iter=10, test_size=0.25):
    """Plot a learning curve for ``alg`` and return the two values it yields.

    A ``ShuffleSplit`` splitter is built from ``n_iter`` and ``test_size``
    with a freshly drawn random seed, so repeated calls use different
    splits.  The plotting itself is delegated to
    ``learning_curve.plot_learning_curve``.

    Args:
        alg: Estimator forwarded as the first argument of the plot helper.
        X: Feature data forwarded to the plot helper.
        y: Target data forwarded to the plot helper.
        title: Title forwarded to the plot helper.
        n_iter: Passed to ``ShuffleSplit`` as ``n_iter``.
        test_size: Passed to ``ShuffleSplit`` as ``test_size``.

    Returns:
        A 2-tuple holding the two values unpacked from the plot helper's
        result (named ``midpoint`` and ``diff`` by the original author).
    """
    print("\nCalculating Learning Curve...")

    # Draw a new seed on every call so each run shuffles differently.
    seed = np.random.randint(0, 123456789)
    splitter = ShuffleSplit(n_iter=n_iter, test_size=test_size,
                            random_state=seed)

    # The fifth positional argument is presumably the y-axis range of the
    # plot -- TODO confirm against learning_curve.plot_learning_curve.
    curve_midpoint, curve_diff = learning_curve.plot_learning_curve(
        alg, title, X, y, (0.6, 1.01), cv=splitter, n_jobs=-1)
    return curve_midpoint, curve_diff
''' #根据分数表现初步判断欠拟合、正常、过拟合 knn = KNeighborsClassifier(n_neighbors=2) knn.fit(x_train, y_train) train_score = knn.score(x_train, y_train) test_score = knn.score(x_test, y_test) print('选择较好的模型测试,训练分数:{},测试分数:{}'.format(train_score, test_score)) from sklearn.model_selection import ShuffleSplit from learning_curve import plot_learning_curve cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=78) plt.figure() plot_learning_curve(knn, 'Learn Curve for KNN Diabetes', x, y, ylim=(0, 1), cv=cv) plt.show() #该模型有些欠拟合 ''' 4、特征选择及数据可视化 ''' from sklearn.feature_selection import SelectKBest selector = SelectKBest(k=2) x_new = selector.fit_transform(x, y) result = [] for name, model in models: cv = KFold(n_splits=10)
# 3-fold cross-validated accuracy of the logistic regression.
scores = cross_val_score(logreg, X_std, y, cv=3, scoring='accuracy')
# Single-argument parenthesized print: same output on Python 2 and 3.
print('Les performances des 3 scores %s' % (scores))

# Check how performance evolves as the training sample grows.
# The first 500 rows are held out for validation; the rest is for training.
X_train = X_std[500:]
y_train = y[500:]
X_valid = X_std[:500]
y_valid = y[:500]
print(X_train.shape)
print(X_valid.shape)

# Refit on growing prefixes of the training set (steps of 100 rows) and
# record the validation score reached at each size.
n_train_samples = range(100, len(X_train), 100)
scores = np.empty(len(n_train_samples))
for k, n_train in enumerate(n_train_samples):
    logreg.fit(X_train[:n_train], y_train[:n_train])
    scores[k] = logreg.score(X_valid, y_valid)
plt.plot(n_train_samples, scores)

from learning_curve import plot_learning_curve
# NOTE(review): called with no arguments -- confirm that the helper's
# signature really allows this; otherwise it raises TypeError.
plot_learning_curve()
best = 0 for i in range(0, 100): current_score = blender(X_train, y_train, X_test, y_test) if current_score > best: best = current_score """ # RDF prediction model clf = RandomForestClassifier(class_weight='balanced', n_estimators=500, n_jobs=-1) # plot learning curves cv = ShuffleSplit(X.shape[0], n_iter=40, test_size=0.2, random_state=0) title = "Learning Curves (Random Forests)" plot_learning_curve(clf, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=1) # train the model clf.fit(X_train, y_train) # predict the test classes y_score = clf.predict_proba(X_test) # fscore plot fscore(y_test, y_score) # saving the classifier with open('classifier.pickle', 'wb') as handle: pickle.dump(clf, handle) # saving the standard scaler
# DataFrame([data, index, columns, dtype, copy]) # print(df[:1]) # 看任意两点的相关度 # _ = sb.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5) # plt.show() # 计算各特征之间的相关度 # plt.figure(figsize=(12, 10)) # _ = sb.heatmap(df, annot=False) # plt.show() # 减少特征 plot_learning_curve(LinearSVC(C=10.0), "LinearSVC(C=10.0) Features: 11&14", X[:, [11, 14]], y, ylim=(0.8, 1.0), train_sizes=np.linspace(.05, 0.1, 5)) # 增大训练集 # plot_learning_curve(LinearSVC(C=10),'learning rate plot(C=10.0)',X,y,ylim=(0.8,1.01), train_sizes=np.linspace(.1,.992,5)) # plot_learning_curve(LinearSVC(C=1.0),'learning rate plot(C=10.0)',X,y,ylim=(0.8,1.01), train_sizes=np.linspace(0.5,0.2,5)) # 遍历多种特征组合 from sklearn.pipeline import Pipeline from sklearn.feature_selection import SelectKBest, f_classif plot_learning_curve( Pipeline([ ("fs", SelectKBest(f_classif, k=2)), # select two features ("svc", LinearSVC(C=10.0)) ]),
import numpy as np
import pandas as pd
import learning_curve as lc
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Load both CSV files as plain NumPy arrays.  ``.values`` replaces
# ``DataFrame.as_matrix()``, which was removed in pandas 1.0; with no
# column selection the two return the same array.
train = pd.read_csv('data/randomforest/data_train.csv').values
test = pd.read_csv('data/randomforest/data_test.csv').values

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    max_features='sqrt',
    verbose=0,
)

# Every column but the first is passed as the second argument and column 0
# as the third -- presumably features and labels; confirm against
# learning_curve.plot_learning_curve.
lc.plot_learning_curve(rf, train[:, 1:], train[:, 0])
ax4 = fig.add_subplot(224)

country = 'all'

# Fetch the fitted model and its data set for the selected country.
# Index 0..3 pairs with axes ax1..ax4 (see the titles set below).
ls_model, ls_data = models[0][country], data[0][country]
en_model, en_data = models[1][country], data[1][country]
rf_model, rf_data = models[2][country], data[2][country]
svr_model, svr_data = models[3][country], data[3][country]

# Draw one learning curve per subplot.
panels = (
    (ls_model, ls_data, ax1),
    (en_model, en_data, ax2),
    (rf_model, rf_data, ax3),
    (svr_model, svr_data, ax4),
)
for fitted_model, sample, panel_ax in panels:
    plot_learning_curve(fitted_model, sample['X'], sample['y'], ax=panel_ax)

# Label each subplot with the regressor it shows.
panel_titles = (
    "Lasso Regression",
    "ElasticNet Regression",
    "Random Forest Tree Regression",
    "Support Vector Regression",
)
for panel_ax, panel_title in zip((ax1, ax2, ax3, ax4), panel_titles):
    panel_ax.set_title(panel_title)

# Use the same y-range on every subplot so the curves are comparable.
for ax in (ax1, ax2, ax3, ax4):
    ax.set_ylim((0.1, 1.2))

plt.show()

## train the model
import matplotlib.pyplot as plt
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20.
# Moving to sklearn.model_selection changes the kind of object passed as
# ``cv``, so confirm what learning_curve.plot_learning_curve accepts before
# migrating.
from sklearn import cross_validation
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from learning_curve import plot_learning_curve

digits = load_digits()
X, y = digits.data, digits.target

title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=100,
                                   test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv)

# Raw string: "\g" is not a valid escape sequence, so the non-raw form
# triggers an invalid-escape warning on Python 3.  The resulting text is
# unchanged.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = cross_validation.ShuffleSplit(digits.data.shape[0], n_iter=10,
                                   test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
# The range is passed positionally here, exactly as in the original call.
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv)

plt.show()
("linear_regression", linear_regression)]) return pipeline model = polynomial_model(degree=2) model.fit(X_train, y_train) train_score = model.score(X_train, y_train) cv_score = model.score(X_test, y_test) print('train_score: {1:0.6f}; cv_score: {2:.6f}'.format(train_score, cv_score)) ''' 4、绘制学习曲线 ''' from learning_curve import plot_learning_curve from sklearn.model_selection import ShuffleSplit cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) plt.figure(figsize=(18, 4)) title = 'Learning Curves (degree={0})' degrees = [1, 2, 3] plt.figure(figsize=(18, 4), dpi=200) for i in range(len(degrees)): plt.subplot(1, 3, i + 1) plot_learning_curve(plt, polynomial_model(degrees[i]), title.format(degrees[i]), X, y, ylim=(0.01, 1.01), cv=cv)
rfc = RandomForestClassifier() xgbc = XGBClassifier() from sklearn.model_selection import cross_val_score, ShuffleSplit cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=78) rfc_scores = cross_val_score(rfc, x_train, y_train, cv=cv) xgbc_scores = cross_val_score(xgbc, x_train, y_train, cv=cv) #绘制学习曲线查看拟合情况 from learning_curve import plot_learning_curve import matplotlib.pyplot as plt plt.figure(figsize=(18, 6)) plt.subplot(121) plot_learning_curve(xgbc, 'xgbc', x_train, y_train, cv=cv) plt.subplot(122) plot_learning_curve(rfc, 'rfc', x_train, y_train, cv=cv) plt.show() rfc.fit(x_train, y_train) rfc_y_test = rfc.predict(x_test) rfc_submission = pd.DataFrame({ 'PassengerId': test['PassengerId'], 'Survived': rfc_y_test }) rfc_submission.to_csv('./rfc_submission.csv', index=False) xgbc.fit(x_train, y_train) xgbc_y_test = xgbc.predict(x_test) xgbc_submission = pd.DataFrame({