import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25)

# Train a full ensemble, then measure the validation error at every stage.
gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)
gbrt.fit(X_train, Y_train)
errors = [mean_squared_error(Y_val, Y_pred)
          for Y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1  # argmin gives an index; stage i uses i+1 trees
gbrt_best = GradientBoostingClassifier(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, Y_train)

# True early stopping: grow the ensemble incrementally with warm_start and
# stop once the validation error has not improved for 5 consecutive stages.
gbrt = GradientBoostingClassifier(max_depth=2, warm_start=True)
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, Y_train)
    Y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(Y_val, Y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

# --------- 5. Stacking ---------
from sklearn.datasets import load_iris
iris = load_iris()
X, Y = iris.data[:, 1:3], iris.target
from sklearn.model_selection import cross_val_score
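# The snippet stops right after the stacking setup. A minimal stacking sketch on
# the same iris X, Y, assuming scikit-learn >= 0.22 (StackingClassifier); the
# base learners and blender chosen here are illustrative, not from the original.
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=50)),
                ('svc', SVC())],
    final_estimator=LogisticRegression())  # blender trained on base-model predictions
print(cross_val_score(stack, X, Y, cv=5).mean())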
'''
Use an early-stopping strategy to find the number of estimators required,
reducing both computation and overfitting.
'''
c_gbrt_est = GradientBoostingClassifier(max_depth=8, min_samples_split=9,
                                        min_samples_leaf=8, learning_rate=0.09,
                                        warm_start=True)
min_val_error = float("inf")
error_going_up = 0
n_estimators = 0
for n_estimators in range(1, 1200):
    c_gbrt_est.n_estimators = n_estimators
    c_gbrt_est.fit(cX_real_drop, cy_real_drop)
    y_pred = c_gbrt_est.predict(cX_test_drop)
    val_error = mean_squared_error(cy_test_drop, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0  # reset the patience counter on every improvement
    else:
        error_going_up += 1
        if error_going_up == 15:  # stop after 15 stages without improvement
            break

# Enforce a minimum of 100 estimators for the classifier.
if n_estimators < 100:
    n_estimators = 100
clf_estimator = n_estimators
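# The manual loop above refits the model at every stage. Since scikit-learn 0.20,
# GradientBoostingClassifier can early-stop internally via n_iter_no_change. A
# minimal sketch with the same training data; note it splits its own validation
# set (validation_fraction) instead of using the external cX_test_drop set, and
# tol=0.0 mimics the strict "any improvement counts" rule of the loop above.
auto_gbrt = GradientBoostingClassifier(max_depth=8, learning_rate=0.09,
                                       n_estimators=1200,
                                       validation_fraction=0.2,  # internal hold-out split
                                       n_iter_no_change=15,      # patience, as in the loop
                                       tol=0.0)
auto_gbrt.fit(cX_real_drop, cy_real_drop)
print(auto_gbrt.n_estimators_)  # number of stages actually fitted before stopping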
errors = [mean_squared_error(y_test, y_pred)
          for y_pred in grd_clf.staged_predict(X_test)]
n_estimators_opt = np.argmin(errors) + 1  # argmin returns an index; stage i uses i+1 trees
print('Optimal value:', n_estimators_opt)

# Create a new model with the optimal value of n_estimators.
grd_clf_opt_1 = GradientBoostingClassifier(max_depth=2,
                                           n_estimators=n_estimators_opt,
                                           learning_rate=1.0)
print('Training Model..')
grd_clf_opt_1.fit(X_train, y_train)
print('Done.')
y_pred = grd_clf_opt_1.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Implementing actual early stopping.
grd_clf = GradientBoostingClassifier(max_depth=2, warm_start=True)  # warm_start keeps already-fitted trees
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 100):
    grd_clf.n_estimators = n_estimators
    grd_clf.fit(X_train, y_train)
    y_pred = grd_clf.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    print('n_estimators:', n_estimators, 'Error:', val_error)
    if val_error <= min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping
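# It helps to see where the staged validation error bottoms out. A minimal
# plotting sketch, assuming matplotlib is available; 'errors' and
# 'n_estimators_opt' are the values computed above.
import matplotlib.pyplot as plt

plt.plot(range(1, len(errors) + 1), errors)                  # error per number of trees
plt.axvline(n_estimators_opt, linestyle='--', color='gray')  # chosen ensemble size
plt.xlabel('n_estimators')
plt.ylabel('Validation MSE')
plt.show()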
# Binarize the labels.
for i, d in enumerate(y_train):
    if d:
        y_trainx[i] = 1
y_train = y_trainx
y_train[0]

# Decision Tree / Gradient Boosting classifiers..
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

model = GradientBoostingClassifier(learning_rate=0.09, warm_start=True)
acc = []
for i in range(1, 10):
    model.n_estimators = i
    model.fit(train_vector[:900], y_train[:900])
    acc.append(
        accuracy_score(model.predict(train_vector[1500:]), y_train[1500:]))
# model = DecisionTreeClassifier()
model.fit(train_vector[1000:1900], y_train[1000:1900])

from sklearn.metrics import confusion_matrix

# Build binary bag-of-words vectors for the first 100 test documents.
test_vector = np.zeros(shape=(100, len(corpus)))
for j, i in enumerate(x_test[0:100]):
    for some in tokenizer.tokenize(i[0]):
        if some not in stopwords:
            test_vector[j][mp[some]] = 1
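# The manual loops above build binary bag-of-words vectors by hand. A minimal
# equivalent sketch using scikit-learn's CountVectorizer; passing the existing
# term-to-column map 'mp' as the vocabulary keeps the columns aligned with
# train_vector. The 'docs' list is an assumption standing in for the raw text
# pulled out of x_test above.
from sklearn.feature_extraction.text import CountVectorizer

docs = [i[0] for i in x_test[0:100]]               # raw text of the first 100 test rows
vec = CountVectorizer(binary=True, vocabulary=mp)  # reuse the training-time vocabulary
test_matrix = vec.transform(docs)                  # sparse 0/1 document-term matrix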
# Sweep n_estimators for a random forest, scored by 4-fold cross-validated ROC AUC.
msl_s = [500, 750, 1000]
scores = list()
scores_std = list()
rf = RandomForestClassifier()
for msl in msl_s:
    rf.n_estimators = msl
    this_scores = cross_val_score(rf, undersampled_train, y_train,
                                  cv=4, scoring='roc_auc')
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))
rf_results = pd.DataFrame({'score': scores, 'n_estimators': msl_s})
rf_results

# Alternative classifiers tried earlier:
#clf = GradientBoostingClassifier(learning_rate=0.04, n_estimators=500, subsample=0.93, max_depth=10, max_features=None)
#clf = AdaBoostClassifier(learning_rate=0.5, n_estimators=500)
#clf = MLPClassifier(activation='logistic', solver='lbfgs', alpha=1e-5, max_iter=500, hidden_layer_sizes=(44, 2), learning_rate='invscaling')
#clf = KNeighborsClassifier(weights='distance', algorithm='kd_tree', leaf_size=1000, n_neighbors=2)
clf = GradientBoostingClassifier(n_estimators=100)
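# The manual sweep above can be expressed with GridSearchCV, which runs the same
# cross-validation loop and keeps the bookkeeping itself. A minimal sketch,
# assuming the same undersampled_train / y_train data:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

grid = GridSearchCV(RandomForestClassifier(),
                    param_grid={'n_estimators': [500, 750, 1000]},
                    cv=4, scoring='roc_auc')
grid.fit(undersampled_train, y_train)
print(grid.best_params_, grid.best_score_)  # best setting and its mean ROC AUC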