'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6} model = RandomForestClassifier(**parameters) model.fit(train, targets) output = model.predict(test).astype(int) df_output = pd.DataFrame() aux = pd.read_csv('test.csv') df_output['PassengerId'] = aux['PassengerId'] df_output['Survived'] = output df_output[['PassengerId', 'Survived']].to_csv('gridsearch_rf.csv', index=False) trained_models = [] for model in models: model.fit(train, targets) trained_models.append(model) predictions = [] for model in trained_models: predictions.append(model.predict_proba(test)[:, 1]) predictions_df = pd.DataFrame(predictions).T predictions_df['out'] = predictions_df.mean(axis=1) predictions_df['PassengerId'] = aux['PassengerId'] predictions_df['out'] = predictions_df['out'].map(lambda s: 1 if s >= 0.5 else 0) predictions_df = predictions_df[['PassengerId', 'out']] predictions_df.columns = ['PassengerId', 'Survived'] predictions_df.to_csv('blending_base_models.csv', index=False)
# Level-0 learners whose predictions are stacked into meta-features.
modeler = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

# NOTE(review): `stacking` is defined/imported outside this section; the
# keyword arguments below are passed through unchanged.
S_X_train, S_X_test = stacking(
    modeler,
    X_train,
    y_train,
    X_test,
    regression=False,
    metric=metrics.log_loss,
    needs_proba=True,
    stratified=True,
    shuffle=True,
    random_state=42,
    verbose=2,
)

# %%
# Level-1 (meta) learner trained on the stacked features.
# penalty='l1' is rejected by scikit-learn's default 'lbfgs' solver, so a
# solver that supports L1 regularisation is selected explicitly.
model = LogisticRegression(
    penalty='l1', C=1, solver='liblinear', random_state=42
)
model = model.fit(S_X_train, y_train)

# Predict once and reuse the labels for every metric below.
test_labels = model.predict(S_X_test)
y_pred = pd.Series(test_labels)
y_pred_proba = model.predict_proba(S_X_test)[:, 1]

# The first figure is classification accuracy (it was previously printed
# under the misleading label "R Square").
print("Accuracy:", metrics.accuracy_score(y_test, test_labels))
print("kappa:", metrics.cohen_kappa_score(y_test, test_labels))

# %%
# In[ ]:

# Ensemble submission (Trial 4): refit every previously built model on the
# reduced training set and average their survival probabilities.
models = [
    logreg_model,
    logreg_cv_model,
    rf_model,
    gboost_model,
    dt_model,
    ab_model,
]

trained_models = []
for model in models:
    model.fit(train_reduced, final_train_set_y)
    trained_models.append(model)

# Positive-class probability for each test row, one array per model.
predictions = [
    fitted.predict_proba(test_reduced)[:, 1] for fitted in trained_models
]

# After the transpose: one column per model, one row per test sample.
kaggle_df = pd.DataFrame(predictions).T
# Average across models before any non-probability column is added.
mean_proba = kaggle_df.mean(axis=1)
kaggle_df['PassengerId'] = titanic_test_org['PassengerId']
# A mean probability of at least 0.5 is labelled 1, anything lower 0.
kaggle_df['out'] = (mean_proba >= 0.5).astype(int)

# Keep only the two submission columns, renamed to the expected headers.
kaggle_df = kaggle_df[['PassengerId', 'out']]
kaggle_df.columns = ['PassengerId', 'Survived']

# Write the submission file.
kaggle_df.to_csv('RFTunedsubmission.csv', index=False)

# In[ ]:
# Report accuracy and the per-class metrics for the test predictions.
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, predictions))

# time.clock() was removed in Python 3.8; use it when it still exists and
# fall back to time.perf_counter() otherwise.
# NOTE(review): `start` is set outside this section and must come from the
# same clock for the difference to be meaningful -- confirm.
end = getattr(time, 'clock', time.perf_counter)()
# The message below reads roughly "Predicted 163 results, time taken: ".
print('预测163个结果,所需时间为' + str(end - start))

# In[ ]:

print(model)

# In[ ]:

# Render one tree of the fitted model (only displayed when this is the last
# expression of a notebook cell).
xgb.to_graphviz(model, num_trees=10)

# In[ ]:

# Positive-class probability for every test row.
y_score = model.predict_proba(DataFrame(X_test, dtype='float'))[:, 1]

# Compute the false positive rate and true positive rate.
fpr, tpr, threshold = roc_curve(y_test, y_score)
# Compute the area under the ROC curve.
roc_auc = auc(fpr, tpr)

lw = 2
# A single sized figure; the extra bare plt.figure() call that preceded it
# only produced an empty figure and has been dropped.
plt.figure(figsize=(10, 10))
# ROC curve: false positive rate on the x-axis, true positive rate on the
# y-axis.
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=lw,
    label='ROC curve (area = %0.2f)' % roc_auc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Restrict the test set to the columns named in `rf_selected_features`
# (columns missing from X_test are silently skipped by the intersection).
X_test_rf_selected = X_test[X_test.columns.intersection(rf_selected_features)]
y_rf_predictions = random_forest.predict(X_test_rf_selected)

# --- Confusion matrix heatmap ---------------------------------------------
conf_matrix = metrics.confusion_matrix(y_test, y_rf_predictions)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, fmt='g', cmap='coolwarm_r')
plt.title('Random Forests')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#plt.savefig('RF_CM.png', quality=95)
plt.show()

print(f"Accuracy: {metrics.accuracy_score(y_test, y_rf_predictions)}")
print(classification_report(y_test, y_rf_predictions))

# --- ROC curve --------------------------------------------------------------
# Positive-class probability for every test row.
y_pred_prob_rf = random_forest.predict_proba(X_test_rf_selected)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob_rf)
# NOTE(review): this rebinds the module-level name `auc`; if the file also
# imports sklearn's `auc` function under that name it is shadowed from here
# on -- confirm.
auc = metrics.roc_auc_score(y_test, y_pred_prob_rf)

plt.plot(fpr, tpr)
# Round to two decimals with a format spec; slicing str(auc) truncated the
# value instead of rounding it and garbled scientific-notation output.
plt.title(f'Random Forests - Area Under Curve : {auc:.2f}')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
#plt.savefig('RF_AUC.png', quality = 95)
plt.show()
print(f'Area Under Curve : {auc}')

# Boosted Trees
print('\n Gradient Boosted Trees model')
# Wrap the training data in xgboost's DMatrix container.
data_dmatrix = xgb.DMatrix(data=X_train, label=y_train)