def model_cross_validation(self, model, best_params):
    """Cross-validate a stacking ensemble and return its mean ROC AUC.

    Builds a StackingClassifier with KNN/RFC/GNB base learners and a
    meta-classifier chosen by `model` (via self.model_init), applies
    `best_params`, and scores it with 5-fold CV on the training data.

    Args:
        model: key understood by self.model_init selecting the meta-classifier.
        best_params: parameter dict passed to StackingClassifier.set_params.

    Returns:
        float: mean cross-validated ROC AUC over the 5 folds.
    """
    # NOTE: print() with a single argument behaves identically under
    # Python 2 and 3; this keeps the block consistent with the rest of
    # the file, which already uses the call form.
    print('Model Cross Validation')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Meta-classifier plus three fixed base learners.
    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    sclf.set_params(**best_params)

    # Copy underlying arrays so CV cannot mutate the stored DataFrames.
    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    train_label = train_label.reshape(train_label.shape[0])  # flatten to 1-D

    scores = cross_val_score(sclf, train_data, train_label, cv=5,
                             scoring='roc_auc', n_jobs=3)

    # Hoisted: original recomputed np.mean(scores) three times.
    mean_score = np.mean(scores)
    print(sclf)
    print(scores)
    print(mean_score)
    print('Model: {0} ; Train: {1}'.format(model, mean_score))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return mean_score
def model_test(self, model, best_params):
    """Fit the stacking ensemble on the full training set and score on test.

    Builds the same KNN/RFC/GNB + meta-classifier stack as
    model_cross_validation, fits it, prints the 30 most influential
    features for interpretable meta-models, and returns the test ROC AUC.

    Args:
        model: key understood by self.model_init selecting the meta-classifier.
        best_params: parameter dict passed to StackingClassifier.set_params.

    Returns:
        float: ROC AUC on the held-out test set.
    """
    print('Model Test')
    print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    lr = self.model_init(model)
    clf1 = self.model_init('KNN')
    clf2 = self.model_init('RFC')
    clf3 = self.model_init('GNB')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)
    sclf.set_params(**best_params)

    train_data = self.train.values.copy()
    train_label = self.train_label['label'].values.copy()
    sclf.fit(train_data, train_label)

    # Report the 30 highest-ranked features when the meta-model exposes
    # coefficients / importances.
    # NOTE(review): reading coef_/feature_importances_ directly off the
    # StackingClassifier assumes the fitted object forwards these
    # attributes — confirm against the mlxtend version in use.
    model_name = model.upper()
    if model_name == 'LR':
        # BUG FIX: original read `clf.coef_.shape[1]` but `clf` was never
        # defined (NameError); the fitted estimator is `sclf`.
        coef = sclf.coef_.reshape(sclf.coef_.shape[1])
        ind = coef.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)
    elif model_name in ('RFC', 'XGB'):
        # Original had two byte-identical branches for RFC and XGB; merged.
        imp = sclf.feature_importances_
        print(imp)
        ind = imp.argsort()
        att = self.train.columns[ind[-30:]].tolist()
        print(att)

    test_data = self.test.values.copy()
    test_label = self.test_label['label'].values.copy()
    test_label = test_label.reshape(test_label.shape[0])  # flatten to 1-D

    # Score with the probability of the positive class (column 1).
    res_proba = sclf.predict_proba(test_data)
    res_auc = roc_auc_score(test_label, res_proba[:, 1])
    print('Model: {0} ; Test: {1}'.format(model, res_auc))
    print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return res_auc
# train mean of score: 0.6473931873941305 # train std of score: 0.001041262887225388 # test mean of score: 0.6063831053298916 # test std of score: 0.003201456042199307 #%% print('===============XGboost regression with decision tree===============') #It is very easy for Stacking to get overfitted, so we reduce the model complexity here sclf = StackingClassifier(classifiers=[xgbr], meta_classifier=DecisionTreeClassifier( min_samples_leaf=500, random_state=model_random_state)) sclf_updated_dict = {'xgbregressor__' + k: v for k, v in updated_dict.items()} sclf_updated_dict['xgbregressor__subsample'] = .4 sclf_updated_dict['xgbregressor__min_child_weight'] = 100 sclf.set_params(**sclf_updated_dict) sclf_scores = evaluation.cv_scores(sclf, X_train, y_train, cv=cv, scoring=quadratic_weighted_kappa_round, return_estimator=True) # train mean of score: 0.6434094278182996 # train std of score: 0.0021768938733617974 # test mean of score: 0.6069677840155301 # test std of score: 0.007928572700424638 #%% print('===============XGboost classifiction with rounding===============') xgbc = XGBClassifier(random_state=model_random_state,