def job_function(params):
    """Score one hyper-parameter combination, averaged over the prepared splits.

    ``params`` is read positionally as: learning_rate, max_depth, ss_cs
    (used for both ``subsample`` and ``colsample_bytree``), gamma,
    min_child_weight, reg_lambda, reg_alpha.

    Relies on the module-level ``Xy``, ``le``, ``iterations_per_job``,
    ``calculate_score`` and ``calculate_score_2``.  Prints and returns the
    mean of the per-split scores.
    """
    (learning_rate, max_depth, ss_cs, gamma,
     min_child_weight, reg_lambda, reg_alpha) = (params[k] for k in range(7))

    # Slow learners get more patience before early stopping, fast ones less.
    if learning_rate <= 0.03:
        early_stopping_rounds = 50
    elif learning_rate >= 0.3:
        early_stopping_rounds = 5
    else:
        early_stopping_rounds = 25

    scores = []
    for i in range(iterations_per_job):
        X_train, X_test, y_train, y_test = (Xy[i][k] for k in range(4))
        # Encoded training labels are computed but the raw labels are what
        # gets fitted below; kept exactly as in the original.
        y_train2 = le.transform(y_train)
        y_test2 = le.transform(y_test)

        clf = XGBClassifier(
            objective='multi:softprob',
            n_estimators=5000,
            learning_rate=learning_rate,
            max_depth=max_depth,
            subsample=ss_cs,
            colsample_bytree=ss_cs,
            gamma=gamma,
            min_child_weight=min_child_weight,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            seed=0,
            silent=True,
        )
        clf.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test2)],
            eval_metric=calculate_score_2,
            early_stopping_rounds=early_stopping_rounds,
            verbose=False,
        )
        best_limit = clf.booster().best_ntree_limit
        y_predicted = clf.predict_proba(X_test, ntree_limit=best_limit)
        scores.append(calculate_score(y_predicted, y_test2))

    avg_score = np.array(scores).mean()
    print(avg_score, params)
    return avg_score
def _distributor(self, label, cv, param, eval_metric, early_stopping_rounds=50):
    """Fit one XGBoost model with early stopping and persist the results.

    Args:
        label: Ignored; the result key is always the estimator class name.
        cv: Unused; kept for interface compatibility.
        param: Estimator settings.  A dict is expanded into keyword
            arguments; anything else is passed through positionally as
            before.
        eval_metric: Metric handed to ``fit`` for the evaluation sets.
        early_stopping_rounds: Rounds without improvement before stopping.

    Side effects:
        Stores ``{'clf': model, 'time': seconds}`` in ``self.result`` under
        the class name, pretty-prints it, and pickles ``self.result`` to
        ``self.result_address``.
    """
    start = time()
    if self.is_classifier:
        label = 'XGBClassifier'
        model_cls = XGBClassifier
    else:
        label = 'XGBRegressor'
        model_cls = XGBRegressor
    # A dict passed positionally would land in the first constructor
    # parameter instead of configuring the model.
    rs = model_cls(**param) if isinstance(param, dict) else model_cls(param)

    X_visible, X_blind, y_visible, y_blind = train_test_split(
        self.X_train,
        self.y_train,
        random_state=1301,
        stratify=self.y_train,
        test_size=0.4,
    )
    # Train on the split made above (the original read non-existent
    # self.X_visible / self.y_visible) and pass eval_metric by keyword so it
    # is not taken for sample_weight.
    rs.fit(
        X_visible,
        y_visible,
        eval_metric=eval_metric,
        early_stopping_rounds=early_stopping_rounds,
        eval_set=[(X_visible, y_visible), (X_blind, y_blind)],
    )

    self.result[label] = {}
    self.result[label]['clf'] = rs
    self.result[label]['time'] = time() - start
    pprint.pprint(self.result[label])

    # The context manager closes the file even if pickling fails.
    with open(self.result_address, 'wb') as out_result:
        pickle.dump(self.result, out_result)
def extract_leaf_feature(features, targets, train_indexes, params):
    """Return one-hot encoded leaf indices of a boosted model for every row.

    An ``XGBClassifier(**params)`` is fitted on the rows selected by
    ``train_indexes``; the leaf each tree assigns to every row of
    ``features`` is then one-hot encoded.

    Returns:
        The matrix produced by ``OneHotEncoder.fit_transform`` on the leaf
        indices.
    """
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])

    # Newer xgboost exposes the trained booster via get_booster(); there
    # ``booster`` is a plain string attribute and calling it fails.  Fall
    # back to the legacy ``booster()`` method only when the new accessor
    # is missing.
    get_booster = getattr(model, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else model.booster()

    leaf = booster.predict(xgb.DMatrix(features), pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    return encoder.fit_transform(leaf)
def main(training_data, test_data):
    """Train an XGBoost model and return the five likeliest countries per user.

    Training and test frames are cleaned together (so both receive the same
    transformations from ``data_cleaner``) and split again via a temporary
    ``source`` marker column.  The inputs themselves are not modified.

    Args:
        training_data: Frame containing a ``country_destination`` column.
        test_data: Frame with the same feature columns, indexed by id.

    Returns:
        DataFrame with columns ``['id', 'country']``; for each test row the
        (up to) five most probable classes in decreasing probability.  As
        before, the index repeats ``0..k-1`` for every id.
    """
    import numpy as np

    # Work on marked copies rather than adding a column to the caller's data.
    merged_data = pd.concat([
        training_data.assign(source='training'),
        test_data.assign(source='test'),
    ])

    cleaned_data = data_cleaner(merged_data)

    # Separate again and remove the marker.
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].drop('source', axis=1)
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].drop('source', axis=1)

    # Encode the target as ints, keeping the encoder to map predictions back.
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']),
                          index=training_data.index)

    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='multi:softprob', subsample=0.5,
                              colsample_bytree=0.5, seed=0)
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    xgb_model.fit(pred_df.values, target_df.tolist())

    preds = np.asarray(xgb_model.predict_proba(test_pred.values))

    # Vectorised top-k selection: one argsort instead of a per-row sort plus
    # a concat inside a Python loop (which was quadratic in the row count).
    n_top = min(5, preds.shape[1])
    order = np.argsort(-preds, axis=1, kind='mergesort')[:, :n_top]
    output = pd.DataFrame(
        {
            'id': np.repeat(test_pred.index.values, n_top),
            'country': np.asarray(labels.classes_)[order].ravel(),
        },
        columns=['id', 'country'],
    )
    # Keep the index layout of the previous implementation (0..k-1 per id).
    output.index = np.tile(np.arange(n_top), len(test_pred))
    return output
def eval_fn(params):
    """Cross-validated log-loss for one parameter set (lower is better).

    Uses the enclosing/module-level ``skf``, ``X_train``, ``y_train``,
    ``n_folds``, ``n_estimators_max``, ``learning_rate``, ``seed``,
    ``X_valid``, ``y_valid``, ``verbose``, ``n_estimators_lst`` and
    ``score_valid``.

    For every fold the model is fitted with early stopping; the best score
    and best iteration are averaged over the folds.  The averaged iteration
    count is appended to ``n_estimators_lst``.  When ``X_valid`` is given,
    the model is refitted on all training data with that many trees and its
    hold-out log-loss is appended to ``score_valid``.

    Returns:
        The mean early-stopping score over the folds.
    """
    model = XGBClassifier(n_estimators=n_estimators_max,
                          learning_rate=learning_rate, seed=seed)
    model.set_params(**params)

    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                  early_stopping_rounds=50, verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration

    score /= n_folds
    # Floor division keeps the tree count an int on Python 3 as well; true
    # division would hand a float to ``model.n_estimators`` below.
    n_estimators //= n_folds
    n_estimators_lst.append(n_estimators)

    result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
    if X_valid is not None:
        model.n_estimators = n_estimators
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid

    if verbose:
        # Single-argument call form prints the same text on Python 2 and 3.
        print(result_str)
    return score
def objective(space):
    """Hyperopt objective: validation log-loss of an XGBClassifier.

    ``space`` supplies ``n_estimators`` (cast to int), ``learning_rate``,
    ``max_depth``, ``min_child_weight``, ``colsample_bytree`` and
    ``subsample``.  The model is fitted on the module-level
    ``xTrain``/``yTrain`` and scored on ``xValid``/``yValid``.

    Returns:
        ``{'loss': <log-loss>, 'status': STATUS_OK}``.
    """
    passthrough = ('learning_rate', 'max_depth', 'min_child_weight',
                   'colsample_bytree', 'subsample')
    settings = {name: space[name] for name in passthrough}

    clf = XGBClassifier(
        n_estimators=int(space['n_estimators']),
        objective='binary:logistic',
        seed=37,
        **settings
    )
    clf.fit(xTrain, yTrain, eval_metric="logloss")

    positive_prob = clf.predict_proba(xValid)[:, 1]
    return {'loss': log_loss(yValid, positive_prob), 'status': STATUS_OK}
def GBDT(self, report=False):
    """Fit a gradient-boosted tree classifier on the prepared training data.

    Args:
        report: when true, build a ``Report`` for the fitted model and run
            its ``ALL()`` analysis.

    Returns:
        The fitted ``XGBClassifier``, also stored on ``self.gbdt``.
    """
    from xgboost.sklearn import XGBClassifier

    settings = dict(
        objective='binary:logistic',
        booster='gbtree',
        learning_rate=0.01,
        n_estimators=5000,
        max_depth=3,
        subsample=0.75,
        colsample_bytree=0.75,
        n_jobs=4,
        random_state=2018,
    )
    self.gbdt = XGBClassifier(**settings)

    prepared = self.train_prep
    self.gbdt.fit(prepared[self.features], prepared[self.target])

    if not report:
        return self.gbdt

    from Report import Report
    Report(self.gbdt, self.train, self.valid, self.target, self.features).ALL()
    return self.gbdt
def fit(self, json_train, n_estimators = 10, is_xgb = True):
    """Pre-process the training JSON, add bag-of-words features and fit a model.

    Only the bag-of-words features of ``text_pos_sentences`` are active; the
    author (word2vec), title and tag-count features are instantiated or
    sketched in the original but not used.

    Args:
        json_train: raw training data handed to ``self.pre_process``.
        n_estimators: number of trees for either model.
        is_xgb: ``False`` selects a random forest, anything else XGBoost.

    Side effects:
        Sets ``self.text_model`` (the fitted vectorizer) and ``self.model``
        (the fitted classifier); prints the column index and frame shape.
    """
    frame = self.pre_process(json_train, istrain = True)

    bow_vectorizer = BagOfWordsVectorizer()
    # Constructed as in the original although their features are disabled.
    word2vec_model = Word2VecModel()
    tag_counter_model = TagCounterModel()

    sentences = frame["text_pos_sentences"]
    bow_vectorizer.fit(sentences, 1000)
    text_features = bow_vectorizer.transform(sentences, "text")
    self.text_model = bow_vectorizer.get_vectorizer()

    frame = pd.concat([frame, text_features], axis = 1)

    label = frame['istroll']
    # Remove the target and the raw text columns from the feature matrix.
    frame = frame.drop(['istroll', 'text', 'text_pos', 'text_pos_sentences'], axis=1)
    print(frame.columns)
    # Replace the column names with their positions as strings.
    frame.columns = list(map(str, range(len(frame.columns))))

    if is_xgb == False:
        self.model = RandomForestClassifier(n_estimators, n_jobs=-1)
    else:
        self.model = XGBClassifier(n_estimators = n_estimators, max_depth = 10)

    print(frame.shape)
    self.model.fit(frame, label)
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """Ensemble stored predictions with an XGBoost classifier.

    The matrices returned by ``get_X_X_Test(valid_folder, test_folder)`` are
    used as features: the classifier is fitted on the validation matrix
    against ``y_valid`` and applied to the test matrix.

    Returns:
        Class probabilities for the test matrix.
    """
    # The two trailing values returned by the loader are not needed here.
    X, X_test, _n_preds, _n_class = get_X_X_Test(valid_folder, test_folder)

    ensembler = XGBClassifier(
        objective='multi:softprob',
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        gamma=0.,
        max_delta_step=0.,
        subsample=0.9,
        colsample_bytree=0.9,
        seed=0,
    )
    ensembler.fit(X, y_valid)
    return ensembler.predict_proba(X_test)
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """Sum class probabilities from several XGBoost and random-forest fits.

    Args:
        training: training feature matrix.
        labels: training labels; the number of distinct values sets the
            width of the result.
        testing: feature matrix to predict.
        xgb_votes: number of XGBoost models to fit.
        rf_votes: number of random forests to fit.

    Returns:
        Array of shape ``(len(testing), n_classes)`` holding the *sum* (not
        the mean) of the predicted probabilities of all fitted models.
    """
    predictions = np.zeros((len(testing), len(set(labels))))

    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        # Without an explicit seed every vote is trained with the same
        # default seed and therefore yields the same model; seed each vote
        # differently so the votes are actually distinct.
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB,
            learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB,
            objective='multi:softprob',
            subsample=SUBSAMPLE_XGB,
            colsample_bytree=COLSAMPLE_XGB,
            seed=i)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)

    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)

    return predictions
def xgboostinitial_predictor(train_path, test_path, eval_path):
    """Train a binary XGBoost model from CSV files and score two data sets.

    Args:
        train_path: CSV with features plus a ``target`` column.
        test_path: CSV with the same features (first column is the index).
        eval_path: CSV with the same features (first column is the index).

    Returns:
        ``(test_output, eval_output)``: two Series named ``PredictedProb``
        holding the predicted probability of class 1, indexed like the
        respective input file.
    """
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    print('Model training begins...')
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='binary:logistic', subsample=0.5,
                              colsample_bytree=0.5, seed=0)
    # .values replaces DataFrame.as_matrix(), which newer pandas removed.
    xgb_model.fit(train.values, target.tolist())

    print('Making predictions....')
    test_preds = xgb_model.predict_proba(test.values)
    eval_preds = xgb_model.predict_proba(eval_df.values)

    print('Cleaning predictions to match expected format....')
    test_frame = pd.DataFrame(test_preds, index=test.index)
    print(test_frame.columns)
    # Column 1 is the positive class.  Assigning ``.columns`` on the selected
    # Series (as the original did) does not rename it, so set the name
    # explicitly.
    test_output = pd.Series(test_frame[1], name='PredictedProb')

    eval_frame = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = pd.Series(eval_frame[1], name='PredictedProb')

    return test_output, eval_output
def train_classifier(X, y, clf_name='xgb'):
    """Fit and return a classifier configured from the module constants.

    ``clf_name == 'xgb'`` builds an ``XGBClassifier`` from the ``*_XG``
    constants; any other value builds a ``RandomForestClassifier`` from the
    ``*_RF`` constants.
    """
    if clf_name != 'xgb':
        forest_settings = dict(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            min_samples_split=MIN_SPLIT_RF,
            max_features=MAX_FEATURES_RF,
            bootstrap=True,
        )
        clf = RandomForestClassifier(**forest_settings)
    else:
        boost_settings = dict(
            n_estimators=ESTIMATORS_XG,
            objective=OBJECTIVE_XG,
            max_depth=DEPTH_XG,
            learning_rate=LEARNING_RATE_XG,
            subsample=SUBSAMPLE_XG,
            colsample_bytree=COLSAMPLE_BYTREE_XG,
            seed=0,
        )
        clf = XGBClassifier(**boost_settings)

    clf.fit(X, y)
    return clf
def get_xgboost_classifier(X_train, y_train, X_val, y_val,params=None, tag=""):
    """Return a fitted XGBoost classifier, grid-searching when no params given.

    Args:
        X_train, y_train: training data.
        X_val, y_val: validation data, used as a second evaluation set when
            ``params`` is supplied.
        params: keyword arguments for ``XGBClassifier``.  ``None`` triggers a
            ``GridSearchCV`` over max_depth / min_child_weight with ROC-AUC
            scoring on ten shuffled 80/20 splits.
        tag: label forwarded to the plotting helpers.

    Returns:
        The fitted ``GridSearchCV`` object (``params is None``) or the fitted
        ``XGBClassifier``.
    """
    param_grid = {'max_depth': [3, 5, 7],
                  'min_child_weight': [1, 3, 5],
                  'n_estimators': [50]}

    if params is None:
        xgb = XGBClassifier(learning_rate=0.2, objective='binary:logistic', seed=27)
        t = start("training xgboost ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,
                                           test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(xgb, param_grid, cv=cv, n_jobs=1,
                                       scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
        report(t, nitems=10 * len(param_grid))
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        # Single-argument call form so the prints work on Python 2 and 3.
        print("With parameters:")
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric='auc', verbose=False)
        # NOTE(review): the source line lost its indentation.  The plotting
        # steps are placed in this branch because evals_result() only exists
        # on the directly fitted classifier -- confirm against the original.
        if plot_cv_curves:
            train = clf.evals_result()['validation_0']['auc']
            val = clf.evals_result()['validation_1']['auc']
            plot_cv_curve(train, val, tag)
        # NOTE(review): ``plot_feature_importance`` is used both as a flag and
        # as a callable; if it is the function this test is always true.
        if plot_feature_importance:
            plot_feature_importance(clf, tag)

    return clf
def main():
    """Train on ``args.train_dataset`` and predict ``args.test_dataset``.

    The training CSV must contain ``Id`` and ``Class`` columns, the test CSV
    an ``Id`` column.  The classifier is configured from ``best_dicts``.

    Returns:
        ``(clf, results)`` where ``results`` holds the test ``Id`` column
        followed by one ``Prediction<k>`` probability column per class.
    """
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']

    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']

    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)

    prediction = clf.predict_proba(X_test)
    # One column per predicted class instead of assuming exactly nine; for
    # nine classes the names are unchanged (Prediction1 .. Prediction9).
    columns = ['Prediction' + str(i) for i in range(1, prediction.shape[1] + 1)]
    prediction = pd.DataFrame(prediction, columns=columns)

    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
def myThreadFunc(ThreadID):
    """Fit and score the split number ``ThreadID``; store the score.

    Reads the split from the module-level ``Xy`` and the hyper-parameters
    (``max_depth``, ``learning_rate``, ``ss_cs``, ``gamma``,
    ``min_child_weight``, ``reg_lambda``, ``reg_alpha``,
    ``early_stopping_rounds``) from the enclosing scope.  The resulting
    score is printed together with the best tree limit and written to
    ``train_and_test_scores[ThreadID]``.
    """
    X_train, X_test, y_train, y_test = (Xy[ThreadID][k] for k in range(4))
    # Encoded training labels are computed but the raw labels are what gets
    # fitted below; kept exactly as in the original.
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)

    clf = XGBClassifier(
        objective='multi:softprob',
        n_estimators=5000,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        seed=0,
        silent=True,
    )
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test2)],
        eval_metric=calculate_score_2,
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )

    y_predicted = clf.predict_proba(
        X_test, ntree_limit=clf.booster().best_ntree_limit)
    score = calculate_score(y_predicted, y_test2)
    print(score, clf.booster().best_ntree_limit)
    train_and_test_scores[ThreadID] = score
# In[91]:

# Random forest baseline: fit on the full training data, keep the
# training-set accuracy and the predicted labels for X_test.
random_forest = RandomForestClassifier(
    n_estimators=45,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=1,
)
random_forest.fit(X, y)
score = random_forest.score(X, y)
Y_pred = random_forest.predict(X_test)


# In[14]:

# XGBoost classifier: same procedure, but class probabilities are kept for
# X_test.  ``score`` is overwritten with the XGBoost training accuracy.
xgb = XGBClassifier(
    objective='multi:softprob',
    n_estimators=25,
    learning_rate=0.3,
    max_depth=6,
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb.fit(X, y)
score = xgb.score(X, y)
y_pred = xgb.predict_proba(X_test)


# In[15]:

print (score)


# In[21]:

# for Random forest
# Rows up to label/position 11016 are the labelled part, the rest is to be
# predicted.
# NOTE(review): ``.ix`` is removed in newer pandas; whether ``.loc`` or
# ``.iloc`` is the right replacement depends on the index of data1_new.
data1_new_train = data1_new.ix[0:11016, :]
data1_new_test = data1_new.ix[11017:, :]
X_train, X_test, Y_train, Y_test = train_test_split(
    data1_new_train, Y, test_size=0.3, random_state=42)

# Train the xgboost model with the initial parameter set.
xgb_train = XGBClassifier(booster="gbtree", learning_rate=0.02, n_estimators=1000,
                          max_depth=6, min_child_weight=5, gamma=0,
                          reg_alpha=65, reg_lambda=10,
                          subsample=0.81, colsample_bytree=0.81,
                          objective='binary:logistic', nthread=8,
                          scale_pos_weight=3.9, seed=27)
xgb_train.fit(X_train, Y_train, eval_metric="auc")

pred = xgb_train.predict(X_test)
# ROC/AUC need a continuous score; hard 0/1 predictions collapse the curve
# to a single operating point.
pred_proba = xgb_train.predict_proba(X_test)[:, 1]

from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pred_proba)
# Keep the metrics in variables instead of discarding the results.
auc_score = metrics.auc(fpr, tpr)
mean_f1 = np.mean(f1_score(Y_test, pred, average=None))

y_prediction = xgb_train.predict(data1_new_test)
predict_result = DataFrame({"user_id": user_id, "y_prediction": y_prediction})
def model1(df_train, df_test):
    """Build features for train+test and return class probabilities for test.

    Steps: drop training rows without session data, split off the
    ``country_destination`` labels, concatenate train and test, derive date
    features, clean ``age``, remove selected session columns, one-hot encode
    the categorical columns, then fit an ``XGBClassifier`` on the training
    part and predict the test part.  Progress is printed along the way.

    Returns:
        ``predict_proba`` output for the rows that came from ``df_test``.
    """
    print('model1')
    print('rows', df_train.shape[0])

    # Keep only training rows that have session data.
    no_sessions = df_train.index[df_train['HasSessions'] == 0]
    df_train = df_train.drop(no_sessions)
    print('rows', df_train.shape[0])

    labels = df_train['country_destination'].values
    df_train = df_train.drop(['country_destination'], axis=1)
    piv_train = df_train.shape[0]  # number of training rows inside df_all

    # Train and test are stacked so both receive identical feature
    # engineering; unused columns go, missing values become -1.
    unused = ['id', 'date_first_booking', 'sessions_count', 'HasSessions']
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    df_all = df_all.drop(unused, axis=1).fillna(-1)

    ##### Feature engineering #####
    print('features in the csv', df_all.shape[1])

    # date_account_created -> year / month / day
    print('dac', datetime.now())

    def split_date(text):
        return [int(part) for part in text.split('-')]

    dac = np.vstack(df_all.date_account_created.astype(str).apply(split_date).values)
    for position, column in enumerate(('dac_year', 'dac_month', 'dac_day')):
        df_all[column] = dac[:, position]

    # date_account_created -> weekday / season
    print('dac2', datetime.now())
    dac2 = df_all.date_account_created.apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d'))
    df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
    df_all['dac_season'] = dac2.apply(calculate_season)
    df_all = df_all.drop(['date_account_created'], axis=1)

    # timestamp_first_active (YYYYMMDDhhmmss) -> year / month / day
    print('tfa', datetime.now())

    def split_timestamp(text):
        pieces = [text[:4], text[4:6], text[6:8], text[8:10], text[10:12], text[12:14]]
        return list(map(int, pieces))

    tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(split_timestamp).values)
    for position, column in enumerate(('tfa_year', 'tfa_month', 'tfa_day')):
        df_all[column] = tfa[:, position]
    df_all = df_all.drop(['timestamp_first_active'], axis=1)

    # Ages outside 14..100 are treated as missing (-1).
    print('age', datetime.now())
    av = df_all.age.values
    df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

    # Remove the session columns that are not used by the model.
    print('remove features', datetime.now())
    session_columns = ['Sessions0'] + ['SessionsD' + str(i) for i in range(456)]
    df_all = df_all.drop(session_columns, axis=1)
    print('features in the model', df_all.shape[1])

    # One-hot encode the categorical columns, one at a time so the resulting
    # column order matches the original.
    print('one-hot', datetime.now())
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language',
                 'affiliate_channel', 'affiliate_provider',
                 'first_affiliate_tracked', 'signup_app', 'first_device_type',
                 'first_browser', 'dac_season', 'sessions_preferred_device']
    for f in ohe_feats:
        df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
        df_all = pd.concat((df_all.drop([f], axis=1), df_all_dummy), axis=1)

    # Split back into train and test.
    vals = df_all.values
    X, X_predict = vals[:piv_train], vals[piv_train:]
    y = labels

    learning_rate, max_depth, ss_cs = 0.03, 8, 0.5
    gamma, min_child_weight, reg_lambda, reg_alpha = 2, 1, 2, 0
    print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

    n_estimators = 350
    print(n_estimators)

    print('fit start', datetime.now())
    clf2 = XGBClassifier(
        objective='multi:softprob',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        seed=0,
        silent=True,
        nthread=-1,
    )
    clf2.fit(X, y)
    return clf2.predict_proba(X_predict)
# Report for the previously fitted grid search ``clf_grid``.
y_pred = clf_grid.predict(X_test)
print('best parameter:\n', clf_grid.best_params_)
print('Best score is {}'.format(clf_grid.best_score_))
# The 'f-score' entry now reports f1_score; it used to repeat the accuracy.
print('accuracy:', metrics.accuracy_score(y_test, y_pred),
      'precision:', metrics.precision_score(y_test, y_pred),
      'recall:', metrics.recall_score(y_test, y_pred),
      'f-score:', metrics.f1_score(y_test, y_pred),
      'cm:', metrics.confusion_matrix(y_test, y_pred))

# XGBoost
# Grid search with step-by-step parameter tuning:
#   1. fix learning rate and number of estimators for tuning tree parameters;
#   2. tune max_depth and min_child_weight;
#   3. tune gamma;
#   4. tune subsample and colsample_bytree;
#   5. tune regularization parameters;
#   6. tune learning rate and number of estimators.
# Reference guide:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# Step 6 (learning rate and number of estimators) is shown as an example.
from xgboost.sklearn import XGBClassifier

param_test5 = {'n_estimators': range(50, 500, 50),
               'learning_rate': [i / 100.0 for i in range(1, 10)]}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(max_depth=5, min_child_weight=1, gamma=0.0,
                            subsample=0.9, colsample_bytree=0.7, reg_alpha=0.1,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=2),
    param_grid=param_test5, scoring='accuracy', n_jobs=-1, iid=False, cv=5)
gsearch5.fit(X_train, y_train)

y_pred = gsearch5.predict(X_test)
print('best parameter:\n', gsearch5.best_params_)
print('Best score is {}'.format(gsearch5.best_score_))
print('accuracy:', metrics.accuracy_score(y_test, y_pred),
      'precision:', metrics.precision_score(y_test, y_pred),
      'recall:', metrics.recall_score(y_test, y_pred),
      'f-score:', metrics.f1_score(y_test, y_pred),
      'cm:', metrics.confusion_matrix(y_test, y_pred))

# XGBoost feature importance score with the best parameters
clf = XGBClassifier(max_depth=5, min_child_weight=1, gamma=0.0, subsample=0.9,
                    colsample_bytree=0.7, reg_alpha=0.1, n_estimators=350,
                    learning_rate=0.05)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
class XGBoostModel(BaseModel):
    """XGBoost classifier exposed through the ``BaseModel`` interface."""

    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                 objective="binary:logistic", booster='gbtree', silent=True,
                 n_jobs=1, gamma=0, min_child_weight=1, max_delta_step=0,
                 subsample=1, colsample_bytree=1, colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                 random_state=0, missing=None):
        """Create the wrapped ``XGBClassifier``; all arguments are forwarded."""
        # ``super(XGBoostModel).__init__()`` builds an unbound super object
        # and never runs the base-class initialiser.
        super().__init__()
        self.model = XGBClassifier(
            max_depth=max_depth, learning_rate=learning_rate,
            n_estimators=n_estimators, silent=silent, objective=objective,
            booster=booster, n_jobs=n_jobs, gamma=gamma,
            min_child_weight=min_child_weight, max_delta_step=max_delta_step,
            subsample=subsample, colsample_bytree=colsample_bytree,
            colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha,
            reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
            base_score=base_score, random_state=random_state, missing=missing)

    def predict(self, features):
        """Return the predicted class labels for ``features``."""
        super().predict(features)
        labels = self.model.predict(features)
        return labels

    def predict_prob(self, features):
        """Return the predicted class probabilities for ``features``."""
        super().predict_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def predict_log_prob(self, features):
        """Return the natural log of the predicted class probabilities."""
        super().predict_log_prob(features)
        probs = self.model.predict_proba(features)
        # Previously the plain probabilities were returned here.
        return np.log(probs)

    def train(self, features, targets):
        """Fit the classifier and print the elapsed wall-clock time."""
        super().train(features, targets)
        start = time.time()
        self.model.fit(X=features, y=targets)
        print('Finished, time %s' % (time.time() - start))

    def accuracy_score(self, features, targets):
        """Return the mean accuracy of the model on ``features``/``targets``."""
        super().accuracy_score(features, targets)
        # The third positional argument of ``score`` is ``sample_weight``
        # (one weight per sample); the scalar ``scale_pos_weight`` that used
        # to be passed there is not a valid value for it.
        score = self.model.score(features, targets)
        return score

    def abs_errors(self, features, targets):
        """Return ``abs(prediction - target)`` element-wise."""
        targets_pred = self.predict(features)
        result = abs(targets_pred - targets)
        return result

    def rmse_score(self, y_pred, y_true):
        """Weighted-RMSE based score in (0, 1]; higher is better.

        To reflect the different importance of the labels 0, 1 and 2, wrong
        predictions for samples whose true label is 1 or 2 are penalised
        more: the squared errors are averaged with weights 1, 2 and 2.5 for
        true labels 0, 1 and 2 respectively, i.e.
        ``np.average((y_true - y_pred) ** 2, axis=0, weights=weights)``.

        :param y_pred: predicted labels
        :param y_true: true labels (values must be 0, 1 or 2)
        :return: ``1 / (1 + sqrt(weighted MSE))``
        :raises KeyError: if ``y_true`` contains another label
        """
        # Accept plain sequences as well as arrays.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        weight_dict = {0: 1, 1: 2, 2: 2.5}  # misclassification penalty per true class
        weights = [weight_dict[label] for label in y_true]
        mse = np.average((y_true - y_pred)**2, axis=0, weights=weights)
        score = 1 / (1 + np.sqrt(mse))
        return score
#
#feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#feat_imp.plot(kind='bar', title='Feature Importances')
#plt.ylabel('Feature Importance Score')

### Build the model
print('### 建立XGBClassifier')
xgbc = XGBClassifier(
    # task definition
    objective='binary:logistic',   # loss function
    eval_metric='logloss',         # evaluation metric
    # boosting schedule
    n_estimators=150,              # number of trees
    learning_rate=0.1,             # learning rate
    # tree shape
    max_depth=5,                   # tree depth
    min_child_weight=1,            # minimum sum of instance weight in a leaf
    gamma=0,                       # minimum loss reduction to split
    # row / column sampling per tree
    subsample=0.8,                 # fraction of rows used per tree
    colsample_bytree=0.8,          # fraction of features used per tree
    # regularisation and class balance
    reg_alpha=0.03,                # L1 regularisation coefficient
    scale_pos_weight=1,            # class-imbalance weight
    # runtime
    nthread=4,                     # number of threads
    silent=1,                      # 1 = suppress intermediate output
    seed=27)                       # random seed

### Grid search
## step1: number of trees (n_estimators)
print("### 调参:决策树个数")
#modelfit(xgbc, train, test, predictors)

## step2: tree parameters max_depth/min_child_weight/gamma/subsample/colsample_bytree
print("### 调参:决策树参数")
# NOTE(review): this append presumably belongs to the loop that builds the
# splits; its header is outside this fragment -- confirm the nesting.
Xy.append([X_train, X_test, y_train, y_test])

# Fit one classifier per prepared split, evaluating on the held-out part.
for iter in range(iterations):
    # if iter < 5:
    #     continue
    X_train, X_test, y_train, y_test = (Xy[iter][k] for k in range(4))
    # Encoded training labels are computed but the raw labels are fitted.
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)

    print('fit start', datetime.now())
    clf = XGBClassifier(
        objective='multi:softprob',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        seed=0,
        silent=True,
        nthread=nthread,
    )
    clf.fit(X_train, y_train,
            eval_set=[(X_test, y_test2)],
            eval_metric=calculate_score_2)

# NOTE(review): assumed to follow the loop rather than sit inside it.
# Manual switch: set to 1 to train on all data and predict the submission set.
submit = 0
if submit == 1:
    # n_estimators = 395
    n_estimators = 349
    #n_estimators = clf.booster().best_ntree_limit
    print(n_estimators)

    print('fit start', datetime.now())
    clf2 = XGBClassifier(
        objective='multi:softprob',
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=ss_cs,
        colsample_bytree=ss_cs,
        gamma=gamma,
        min_child_weight=min_child_weight,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        seed=0,
        silent=True,
        nthread=nthread,
    )
    clf2.fit(X, y)
    #clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)
    y_predicted = clf2.predict_proba(X_predict)
# NOTE(review): these three calls look like the tail of a plotting helper
# whose header is outside this fragment -- confirm the nesting.
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Define X and y: every column except 'state' is a feature.
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Under-sample the training data with ClusterCentroids.
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

# XGBoost on the resampled data, tracking AUC on train and test sets.
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1, max_depth=6, gamma=0,
                       subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000,
                       scale_pos_weight=1000)
clf_XG.fit(os_X, os_y, eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, recall, precision and specificity.  The prints use the
# single-argument call form so the output is the same on Python 2 and 3.
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Recall : %.4g" % recall)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Confusion matrix; specificity = TN / (TN + FP) from its first row.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print("Specifity:  %s" % (float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])))
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC
import numpy as np

# Three base models prepared up front; the xgb and rf settings were already
# selected through cross-validation.
clf1 = XGBClassifier(
    objective='binary:logistic',
    n_estimators=140,
    learning_rate=0.1,
    max_depth=1,
    min_child_weight=2,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.6,
    scale_pos_weight=1,
    nthread=4,
)
clf2 = RandomForestClassifier(
    n_estimators=50,
    max_depth=1,
    min_samples_split=4,
    min_samples_leaf=54,
    oob_score=True,
)
clf3 = SVC(C=0.1, probability=True)

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Majority ("hard") vote over the three base models.
eclf = VotingClassifier(
    estimators=[('xgb', clf1), ('rf', clf2), ('svc', clf3)],
    voting='hard',
)

# 5-fold accuracy of every single model and of the ensemble.
candidates = zip([clf1, clf2, clf3, eclf],
                 ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble'])
for clf, label in candidates:
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
data = pd.read_csv("../data/data.csv")
data.lon.unique().shape

# One-hot encode every listed column (all are treated as categorical).
cols = ["combined_shot_type", "game_event_id", "period", "playoffs",
        "shot_type", "shot_zone_area", "shot_zone_basic", "shot_zone_range",
        "matchup", "opponent", "game_date", "shot_distance",
        "minutes_remaining", "seconds_remaining", "loc_x", "loc_y"]
# Build all dummy blocks first and concatenate once instead of re-copying
# the growing frame on every iteration.
data_x = pd.concat(
    [pd.get_dummies(data.action_type, prefix="action_type")]
    + [pd.get_dummies(data[col], prefix=col) for col in cols],
    axis=1)

# Rows with a known shot_made_flag are the training set, the rest is test.
# ``~`` is the boolean negation; unary minus on a boolean mask is rejected
# by current numpy/pandas.
has_label = ~pd.isnull(data.shot_made_flag)
train_x = data_x[has_label]
test_x = data_x[~has_label]
train_y = data.shot_made_flag[has_label]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                    subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (train_x.shape[0], (train_y != y_pred).sum()))


def logloss(act, pred):
    """Print and return the mean binary log-loss.

    Args:
        act: actual labels (0 or 1).
        pred: predicted probability of the positive class; clipped to
            ``[1e-15, 1 - 1e-15]`` so the logarithm stays finite.

    Returns:
        ``-mean(act * log(pred) + (1 - act) * log(1 - pred))``.
    """
    # Local import: numpy replaces the deprecated numpy aliases that used to
    # be reachable through the top-level scipy namespace (sp.log, ...).
    import numpy as np

    epsilon = 1e-15
    act = np.asarray(act, dtype=float)
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    print(ll)
    return ll


logloss(train_y, clf.predict_proba(train_x)[:, 1])
# Import from the public ``sklearn.ensemble`` namespace; the private
# ``sklearn.ensemble.forest`` / ``gradient_boosting`` modules were removed
# from newer scikit-learn releases.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif


# In[29]:

# Shared preprocessing: keep the 10 best features by ANOVA F-score, then
# expand them with degree-2 polynomial terms.
# NOTE(review): the same ``preprocessor`` object is nested in every pipeline
# below, so the pipelines share its fitted state.
preprocessor = make_pipeline(SelectKBest(f_classif, k=10), PolynomialFeatures(2))

AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))
GBoost = make_pipeline(preprocessor, StandardScaler(), GradientBoostingClassifier())
RandomForest = make_pipeline(preprocessor, RandomForestClassifier())
XGB = make_pipeline(preprocessor, XGBClassifier())
Extree = make_pipeline(preprocessor, ExtraTreesClassifier())

# Display name -> pipeline, for iterating over all candidate models.
dict_of_models = {
    'AdaBoost': AdaBoost,
    'SVM': SVM,
    'GBoost': GBoost,
    'RandomForest': RandomForest,
    'XGB': XGB,
    'Extree': Extree,
}


# In[30]:

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
from xgboost.sklearn import XGBClassifier

# Load the data: the first eight columns are features, the ninth the label.
dataset = loadtxt("pima-indians-diabetes.csv", delimiter=",")
X, Y = dataset[:, 0:8], dataset[:, 8]

seed = 7
test_size = 0.33
X_train, x_test, Y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)

model = XGBClassifier()
eval_set = [(x_test, y_test)]

# Arguments passed to fit():
#   early_stopping_rounds - stop when the metric has not improved for N
#                           consecutive rounds
#   eval_metric           - metric computed on eval_set
#   eval_set              - list of (X, y) pairs used as validation sets for
#                           early stopping
#   verbose               - print the training progress
fit_options = dict(
    early_stopping_rounds=10,
    eval_metric="logloss",
    eval_set=eval_set,
    verbose=True,
)
model.fit(X_train, Y_train, **fit_options)

# Make predictions for the test data and round them to class labels.
y_pred = model.predict(x_test)
predictions = list(map(round, y_pred))
# Train and score one model per data file and per look-ahead window,
# logging AUC and log-loss for each combination.
for data_file in data_files:
    if exists_in_log_file(data_file):
        # Skip if already trained and tested
        print('Skipping {}'.format(data_file))
        continue
    print('Training on {}'.format(data_file))

    # Load the data; the context manager closes the file handle, which the
    # bare ``open()`` call left open.
    # NOTE: dill.load executes arbitrary code from the file -- only use it on
    # trusted data files.
    with open(os.path.join(DATA_DIR, data_file), 'rb') as data_handle:
        data = dill.load(data_handle)

    for months_before in data.keys():
        splits = data[months_before]
        train_x = splits["TRAIN"]["X"]
        train_y = splits["TRAIN"]["y"]
        test_x = splits["TEST"]["X"]
        test_y = splits["TEST"]["y"]

        # Creating and training model
        clf = XGBClassifier(n_estimators=N_ESTIMATORS, random_state=1,
                            verbose=1, n_jobs=N_JOBS)
        clf.fit(train_x, train_y, verbose=True)

        # Scoring: AUC on the positive-class probability, log-loss on the
        # full probability matrix.
        pred_y = clf.predict_proba(test_x)
        auc_score = roc_auc_score(test_y, pred_y[:, 1])
        log_score = log_loss(test_y, pred_y)
        logging.info('{}, {}, {}, {}'.format(data_file, months_before,
                                             auc_score, log_score))
def train(self, train_set, dev_set):
    """Extract network features for both sets and grid-search an XGBClassifier.

    Features are loaded from ``train_features_file`` / ``dev_features_file``
    when those files exist; otherwise they are computed batch by batch with
    ``self.sess`` and saved there.  Every ``max_depth`` / ``n_estimators``
    combination is then fitted on the training features and scored on the
    dev features; the best-scoring classifier is stored in ``self.clf``.

    Args:
        train_set: examples accepted by ``self.get_minibatch``.
        dev_set: examples accepted by ``self.get_minibatch``.
    """

    def collect_features(dataset, features_file):
        """Return ``(features, labels)`` for *dataset*, using the cache file."""
        if os.path.exists(features_file):
            features = np.load(features_file)
            # Only the labels are needed; fetch them in a single "batch".
            _, _, labels, _, _ = self.get_minibatch(dataset, 0, len(dataset))
            return features, labels

        feature_chunks = []
        labels = []
        # Floor division keeps the batch count an int on Python 3 as well;
        # true division produced a float that range() rejects.
        total_batch = (len(dataset) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            (premise_vectors, hypothesis_vectors, batch_labels,
             prem_dep, hypo_dep) = self.get_minibatch(
                 dataset, i * self.batch_size, (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: premise_vectors,
                self.model.hypothesis_x: hypothesis_vectors,
                self.model.y: batch_labels,
                self.model.keep_rate_ph: 1.0,
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = prem_dep
                feed_dict[self.model.hypo_dep] = hypo_dep
            batch_features = self.sess.run([self.model.features], feed_dict)
            feature_chunks.append(batch_features[0])
            labels += batch_labels

        # Concatenate once instead of re-copying the growing array per batch.
        features = np.concatenate(feature_chunks) if feature_chunks else None
        np.save(features_file, features)
        return features, labels

    logger.log('Get features from training set')
    train_features, train_labels = collect_features(
        train_set, train_features_file)

    logger.log('Get features from dev set')
    dev_features, dev_labels = collect_features(dev_set, dev_features_file)

    tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}
    best_score = 0.
    best_params = []
    for g in ParameterGrid(tuned_parameters):
        clf = XGBClassifier(nthread=24)
        clf.set_params(**g)
        clf.fit(train_features, train_labels)
        score = clf.score(dev_features, dev_labels)
        logger.log('%s: %f' % (str(g), score))
        # Keep only a strictly better model.
        if best_score < score:
            best_score = score
            best_params = g
            self.clf = clf
    logger.log('Best score: %s %f' % (str(best_params), best_score))
# Hyper-parameters for the gradient-boosted tree classifier.
params = dict(
    learning_rate=0.1,
    n_estimators=100,
    seed=0,
    subsample=1,
    colsample_bytree=1,
    objective='binary:logistic',
    max_depth=3,
)

# Record every hyper-parameter with mlflow.
for key, value in params.items():
    mlflow.log_param(key, value)

# Train the XGBoost model and show its per-feature split counts.
gbtree = XGBClassifier(**params)
gbtree.fit(train_features, train_labels)
importances = gbtree.get_booster().get_fscore()
print(importances)

# Evaluate on the test split.
y_pred = gbtree.predict(test_features)
accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy: %.1f%%" % (accuracy * 100.0))

# Record the accuracy metric with mlflow.
mlflow.log_metric("accuracy", accuracy)

sns.set(font_scale=1.5)
# Stage 1: train a booster on all features to rank them by f-score.
params = {
    'max_depth': 6,
    'eta': 0.05,
    'objective': 'multi:softprob',
    'subsample': 0.8,
    'colsample_bytree': 1,
    'min_child_weight': 1,
    'num_class': 3,
}
num_rounds = 206
z = []
dtrain = xgb.DMatrix(train[features], label=y)
clf = xgb.train(params, dtrain, num_rounds)

# Sort (feature, fscore) pairs by ascending score and normalise the scores
# so that they sum to one.
importance = sorted(clf.get_fscore(fmap='xgb.fmap').items(),
                    key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Keep the features whose normalised score exceeds 0.001.
bst = df.loc[df.fscore > 0.001, 'feature'].tolist()
#df.to_csv('select.csv',index=False)

# Stage 2: retrain on the selected features with early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(
    train[bst], y, test_size=0.6, random_state=10)
print('start xgboost learning...')
alg = XGBClassifier(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=1210,
    objective='multi:softprob',
    subsample=0.8,
    colsample_bytree=1,
    min_child_weight=1,
)
alg.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='mlogloss',
    early_stopping_rounds=10,
    verbose=True,
)
#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')

# Class probabilities for the test rows, one column per class, plus the id.
y_pred = alg.predict_proba(test[bst])
result = pd.DataFrame(y_pred,
                      columns=['predict_0', 'predict_1', 'predict_2'])
result['id'] = test.id.values.copy()
#result.to_csv('xgb10.csv',index=False)
# *Selection of ML algorithm*: A first approach to deal with imbalanced data
# is to balance it by discarding the majority class before applying an ML
# algorithm. The disadvantage of undersampling is that a model trained in this
# way will not perform well on real-world skewed test data since almost all
# the information was discarded. A better approach might be to oversample the
# minority class, say by the synthetic minority oversampling technique (SMOTE)
# contained in the 'imblearn' library. Motivated by this, I tried a variety of
# anomaly-detection and supervised learning approaches. I find, however, that
# the best result is obtained on the original dataset by using a ML algorithm
# based on ensembles of decision trees that intrinsically performs well on
# imbalanced data. Such algorithms not only allow for constructing a model
# that can cope with the missing values in our data, but they naturally allow
# for speedup via parallel-processing. Among these algorithms, the extreme
# gradient-boosted (XGBoost) algorithm used below slightly outperforms
# random-forest. Finally, XGBoost, like several other ML algorithms, allows
# for weighting the positive class more compared to the negative class --- a
# setting that also allows to account for the skew in the data.

# Split the data into training and test sets in a 80:20 ratio

# In[ ]:

trainX, testX, trainY, testY = train_test_split(
    X, Y, test_size=0.2, random_state=randomState)

# In[ ]:

# Long computation in this cell (~1.8 minutes)
# Positive-class weight: ratio of negative to positive rows in Y.
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(max_depth=3, scale_pos_weight=weights, n_jobs=4)
clf.fit(trainX, trainY)
probabilities = clf.predict_proba(testX)
print('AUPRC = {}'.format(
    average_precision_score(testY, probabilities[:, 1])))

# <a href='#top'>back to top</a>

# <a id='importance'></a>
# ##### 6.1. What are the important features for the ML model?
# The figure below shows that the new feature *errorBalanceOrig* that we
# created is the most relevant feature for the model. The features are ordered
# based on the number of samples affected by splits on those features.

# In[ ]:

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111)
#     gamma=0,
#     subsample=0.6,
#     colsample_bytree=0.7,
#     objective= 'binary:logistic',
#     scale_pos_weight=1,
#     reg_alpha=0.1,
#     seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)

# Classifier configuration that is actually trained.
xgb1 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=700,
    max_depth=5,
    min_child_weight=8,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=27,
)
xgb1.fit(df_train[predictors], df_train[targetname])

# Predict the test rows, then recode the prediction: 1 -> 'Y', else 'N'.
df_test['target'] = xgb1.predict(df_test[predictors])
df_test['target'] = df_test['target'].map(
    lambda flag: 'Y' if flag == 1 else 'N')
y_train, dtrain_predprob) feat_imp = pd.Series( alg.get_booster().get_fscore()).sort_values(ascending=False) print feat_imp # feat_imp.plot(kind='bar', title='Feature Importances') # plt.ylabel('Feature Importance Score') # predictors = [x for x in train.columns if x not in [target, IDcol]] xgb1 = XGBClassifier(learning_rate=0.2, n_estimators=100, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='reg:linear', n_jobs=4, scale_pos_weight=1, random_state=27) # modelfit(xgb1, x_train, y_train) ##### Step 2: Tune max_depth and min_child_weight ### description # We tune these first as they will have the highest impact on model outcome. To start with, let's set wider ranges # and then we will perform another iteration for smaller ranges. ### note # GridSearchCV documentation -> http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html # scoring parameters -> http://scikit-learn.org/stable/modules/model_evaluation.html
# The test rows are the last 545421 rows of ``large_set`` joined column-wise
# with ``tag_test_set``.
data_test = large_set.tail(545421)
data_test = pd.DataFrame(pd.concat([data_test, tag_test_set], axis=1))

# Every message below is printed as a single value wrapped in parentheses,
# which yields the same text on Python 2 and Python 3 (the former print
# statements were Python-2-only syntax).
print('Finished Reconstructing Train/Test Sets')
print(data_train.shape)
print(data_test.shape)

print('Started Computing train set labels')
# Binarise the 'Click' column: take the sign, then map -1 to 0.
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print('Finished computing train set labels')

# fit estimator
print("start XGBClassifier")
n_samples = data_train.shape[0]
est = XGBClassifier(n_estimators=200, learning_rate=0.1, silent=False)

print("start fitting")
est.fit(data_train, label_set)

# predict class probabilities for the test rows
probs = est.predict_proba(data_test)

print("cross validation start")
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)

# Summary statistics of the predicted probability in column 1.
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print("Test predicted Mean: %s" % (mean,))
print("Test predicted STD: %s" % (std,))

df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
def dviz_classification_visualization(data_train, target_train, classifierName):
    """Fit a depth-5 decision tree and save its dtreeviz rendering as SVG.

    The file is written to ``tree_evaluations_out`` under the name
    ``<classifierName>fancy_decision_tree_example.svg``.

    Args:
        data_train: training features; ``.columns`` supplies feature names.
        target_train: training labels.
        classifierName: target name, also used for the class names.
    """
    clf = tree.DecisionTreeClassifier(max_depth=5, random_state=666)
    clf.fit(data_train, target_train)
    svg_tree = dtreeviz(
        clf,
        data_train,
        target_train,
        target_name=classifierName,
        feature_names=data_train.columns,
        orientation="TD",
        class_names=[classifierName, 'not_' + classifierName],
        fancy=True,
        histtype='strip',
        X=None,
        label_fontsize=12,
        ticks_fontsize=8,
        fontname="Arial")
    fname = os.path.join(
        tree_evaluations_out,
        str(classifierName) + 'fancy_decision_tree_example.svg')
    svg_tree.save(fname)


print(rounds)

# READ IN DATA FOLDER AND REMOVE ALL NON-FEATURE VARIABLES (POP DLC COORDINATE DATA AND TARGET DATA)
print('Reading in ' + str(len(os.listdir(data_folder))) +
      ' annotated files...')
for i in os.listdir(data_folder):
    if ".csv" in i:
        currentFn = os.path.join(data_folder, i)
        df = pd.read_csv(currentFn, index_col=0)
        # pd.concat replaces DataFrame.append, which newer pandas removed.
        features = pd.concat([features, df], ignore_index=True)
        print(features)
features = features.loc[:, ~features.columns.str.contains('^Unnamed')]
features = features.drop(["scorer"], axis=1, errors='ignore')
try:
    # Both lookups sit inside the try so that a missing target column is
    # reported with the message below; previously the first lookup raised
    # before the handler could run.
    totalTargetframes = features[classifierName].sum()
    targetFrame = features.pop(classifierName).values
except KeyError:
    print(
        'Error: the dataframe does not contain any target annotations. Please check the csv files in the project_folder/csv/target_inserted folder'
    )
    # Nothing below can work without the target column.
    raise
features = features.fillna(0)
features = drop_bp_cords(features, inifile)

# Collect the other configured targets so their columns can be dropped.
target_names = []
for model_index in range(1, model_nos + 1):
    currentModelNames = config.get('SML settings',
                                   'target_name_' + str(model_index))
    if currentModelNames != classifierName:
        target_names.append(currentModelNames)
print('# of models to be created: 1')
for currentModelName in target_names:
    features.pop(currentModelName)
class_names = ['Not_' + classifierName, classifierName]
feature_list = list(features)
print('# of features in dataset: ' + str(len(feature_list)))

# IF SET BY USER - PERFORM UNDERSAMPLING AND OVERSAMPLING IF SET BY USER
data_train, data_test, target_train, target_test = train_test_split(
    features, targetFrame, test_size=train_test_size)
under_sample_setting = config.get('create ensemble settings',
                                  'under_sample_setting')
over_sample_setting = config.get('create ensemble settings',
                                 'over_sample_setting')
# Work on a copy with the label attached, so every row operation below moves
# features and label together and ``data_train`` itself is never given a
# target column.
trainDf = data_train.copy()
trainDf[classifierName] = target_train
targetFrameRows = trainDf.loc[trainDf[classifierName] == 1]
print('# of ' + str(classifierName) + ' frames in dataset: ' +
      str(totalTargetframes))
trainDf = trainDf.sample(frac=1).reset_index(drop=True)
if under_sample_setting == 'Random undersample':
    print('Performing undersampling...')
    under_sample_ratio = config.getfloat('create ensemble settings',
                                         'under_sample_ratio')
    nonTargetFrameRows = trainDf.loc[trainDf[classifierName] == 0]
    nontargetFrameRowsSize = int(len(targetFrameRows) * under_sample_ratio)
    nonTargetFrameRows = nonTargetFrameRows.sample(nontargetFrameRowsSize,
                                                   replace=False)
    trainDf = pd.concat([targetFrameRows, nonTargetFrameRows])
else:
    under_sample_ratio = 'NaN'
# Split label and features from the SAME frame in both cases.  Previously,
# without undersampling, ``data_train`` kept the target column and its
# original row order while ``target_train`` came from the shuffled copy.
target_train = trainDf.pop(classifierName).values
data_train = trainDf

if over_sample_setting == 'SMOTEENN':
    print('Performing SMOTEEN oversampling...')
    over_sample_ratio = config.getfloat('create ensemble settings',
                                        'over_sample_ratio')
    smt = SMOTEENN(sampling_strategy=over_sample_ratio)
    data_train, target_train = smt.fit_sample(data_train, target_train)
if over_sample_setting == 'SMOTE':
    print('Performing SMOTE oversampling...')
    over_sample_ratio = config.getfloat('create ensemble settings',
                                        'over_sample_ratio')
    smt = SMOTE(sampling_strategy=over_sample_ratio)
    data_train, target_train = smt.fit_sample(data_train, target_train)
# "Neither SMOTEENN nor SMOTE"; the former ``!= ... or != ...`` test was
# always true and overwrote the configured ratio.
if over_sample_setting not in ('SMOTEENN', 'SMOTE'):
    over_sample_ratio = 'NaN'

# Shuffle features and labels with ONE shared permutation; shuffling only
# the features detached every row from its label.
shuffled_order = np.random.permutation(len(target_train))
if isinstance(data_train, pd.DataFrame):
    data_train = data_train.iloc[shuffled_order].reset_index(drop=True)
else:
    data_train = np.asarray(data_train)[shuffled_order]
target_train = np.asarray(target_train)[shuffled_order]

# RUN THE DECISION ENSEMBLE SET BY THE USER
# run random forest
if model_to_run == 'RF':
    print('Training model ' + str(classifierName) + '...')
    RF_n_estimators = config.getint('create ensemble settings',
                                    'RF_n_estimators')
    RF_max_features = config.get('create ensemble settings',
                                 'RF_max_features')
    RF_criterion = config.get('create ensemble settings', 'RF_criterion')
    RF_min_sample_leaf = config.getint('create ensemble settings',
                                       'RF_min_sample_leaf')
    clf = RandomForestClassifier(n_estimators=RF_n_estimators,
                                 max_features=RF_max_features,
                                 n_jobs=-1,
                                 criterion=RF_criterion,
                                 min_samples_leaf=RF_min_sample_leaf,
                                 bootstrap=True,
                                 verbose=1)
    try:
        clf.fit(data_train, target_train)
    except ValueError:
        print(
            'ERROR: The model contains a faulty array. This may happen when trying to train a model with 0 examples of the behavior of interest'
        )
    # predictions = clf.predict_proba(data_test)
    # data_test['probability'] = predictions[:, 1]
    # data_test['prediction'] = np.where(data_test['probability'] > 0.499999, 1, 0)
    # print(data_test['prediction'].sum())

    scoring = ['precision', 'recall', 'f1']
    newDataTargets = np.concatenate((target_train, target_test), axis=0)
    # #newDataTargets = np.where((newDataTargets == 0) | (newDataTargets == 1), newDataTargets ** 1, newDataTargets)
    # newDataFeatures = np.concatenate((data_train, data_test), axis=0)
    # #newDataFeatures = np.where((newDataFeatures == 0) | (newDataFeatures == 1), newDataFeatures ** 1, newDataFeatures)
    # cv = ShuffleSplit(n_splits=5, test_size=train_test_size)
    # results = cross_validate(clf, newDataFeatures, newDataTargets, cv=cv, scoring=scoring)
    # results = pd.DataFrame.from_dict(results)
    # crossValresultsFname = os.path.join(tree_evaluations_out, str(classifierName) + '_cross_val_100.csv')
    # results.to_csv(crossValresultsFname)

    # #RUN RANDOM FOREST EVALUATIONS
    # compute_permutation_importance = config.get('create ensemble settings', 'compute_permutation_importance')
    # if compute_permutation_importance == 'yes':
    #     print('Calculating permutation importances...')
    #     computePermutationImportance(data_test, target_test, clf)
    #
    # generate_learning_curve = config.get('create ensemble settings', 'generate_learning_curve')
    # if generate_learning_curve == 'yes':
    #     shuffle_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_k_splits')
    #     dataset_splits = config.getint('create ensemble settings', 'LearningCurve_shuffle_data_splits')
    #     print('Calculating learning curves...')
    #     LearningCurve(features, targetFrame, shuffle_splits, dataset_splits)
    # if generate_learning_curve != 'yes':
    #     shuffle_splits = 'NaN'
    #     dataset_splits = 'NaN'

    # generate_precision_recall_curve = config.get('create ensemble settings', 'generate_precision_recall_curve')
    # if generate_precision_recall_curve == 'yes':
    #     print('Calculating precision recall curve...')
    #     precisionRecallDf = pd.DataFrame()
    #     probabilities = clf.predict_proba(data_test)[:, 1]
    #     precision, recall, thresholds = precision_recall_curve(target_test, probabilities, pos_label=1)
    #     precisionRecallDf['precision'] = precision
    #     precisionRecallDf['recall'] = recall
    #     thresholds = list(thresholds)
    #     thresholds.insert(0, 0.00)
    #     precisionRecallDf['thresholds'] = thresholds
    #     PRCpath = os.path.join(tree_evaluations_out, str(classifierName) + '_precision_recall.csv')
    #     precisionRecallDf.to_csv(PRCpath)
    #
    # generate_example_decision_tree = config.get('create ensemble settings', 'generate_example_decision_tree')
    # if generate_example_decision_tree == 'yes':
    #     print('Generating example decision tree using graphviz...')
    #     estimator = clf.estimators_[3]
    #     generateExampleDecisionTree(estimator)

    generate_classification_report = config.get(
        'create ensemble settings', 'generate_classification_report')
    if generate_classification_report == 'yes':
        print('Generating yellowbrick classification report...')
        generateClassificationReport(clf, class_names, rounds)

    # generate_features_importance_log = config.get('create ensemble settings', 'generate_features_importance_log')
    # if generate_features_importance_log == 'yes':
    #     print('Generating feature importance log...')
    #     importances = list(clf.feature_importances_)
    #     log_df = generateFeatureImportanceLog(importances)
    #
    # generate_features_importance_bar_graph = config.get('create ensemble settings', 'generate_features_importance_bar_graph')
    # if generate_features_importance_bar_graph == 'yes':
    #     N_feature_importance_bars = config.getint('create ensemble settings', 'N_feature_importance_bars')
    #     print('Generating feature importance bar graph...')
    #     generateFeatureImportanceBarGraph(log_df, N_feature_importance_bars)
    # if generate_features_importance_bar_graph != 'yes':
    #     N_feature_importance_bars = 'NaN'

    # generate_example_decision_tree_fancy = config.get('create ensemble settings','generate_example_decision_tree_fancy')
    # if generate_example_decision_tree_fancy == 'yes':
    #     print('Generating fancy decision tree example...')
    #     dviz_classification_visualization(data_train, target_train, classifierName)

    # SAVE MODEL META DATA
    RF_meta_data = config.get('create ensemble settings', 'RF_meta_data')
    if RF_meta_data == 'yes':
        # NOTE(review): several names in this list are assigned only in the
        # commented-out sections above (e.g. compute_permutation_importance,
        # generate_learning_curve, dataset_splits); unless they are defined
        # elsewhere this raises NameError.  ``under_sample_ratio`` also
        # appears twice -- the second entry may have been meant to be
        # ``under_sample_setting``.  Confirm before relying on the metadata.
        metaDataList = [
            classifierName, RF_criterion, RF_max_features,
            RF_min_sample_leaf, RF_n_estimators,
            compute_permutation_importance, generate_classification_report,
            generate_example_decision_tree,
            generate_features_importance_bar_graph,
            generate_features_importance_log,
            generate_precision_recall_curve, RF_meta_data,
            generate_learning_curve, dataset_splits, shuffle_splits,
            N_feature_importance_bars, over_sample_ratio,
            over_sample_setting, train_test_size, under_sample_ratio,
            under_sample_ratio
        ]
        generateMetaData(metaDataList)

# run gradient boost model
if model_to_run == 'GBC':
    GBC_n_estimators = config.getint('create ensemble settings',
                                     'GBC_n_estimators')
    GBC_max_features = config.get('create ensemble settings',
                                  'GBC_max_features')
    GBC_max_depth = config.getint('create ensemble settings',
                                  'GBC_max_depth')
    GBC_learning_rate = config.getfloat('create ensemble settings',
                                        'GBC_learning_rate')
    GBC_min_sample_split = config.getint('create ensemble settings',
                                         'GBC_min_sample_split')
    clf = GradientBoostingClassifier(
        max_depth=GBC_max_depth,
        n_estimators=GBC_n_estimators,
        learning_rate=GBC_learning_rate,
        max_features=GBC_max_features,
        min_samples_split=GBC_min_sample_split,
        verbose=1)
    clf.fit(data_train, target_train)
    clf_pred = clf.predict(data_test)
    print(
        str(classifierName) + str(" Accuracy train: ") +
        str(clf.score(data_train, target_train)))

    generate_example_decision_tree = config.get(
        'create ensemble settings', 'generate_example_decision_tree')
    if generate_example_decision_tree == 'yes':
        estimator = clf.estimators_[3, 0]
        generateExampleDecisionTree(estimator)

    generate_classification_report = config.get(
        'create ensemble settings', 'generate_classification_report')
    if generate_classification_report == 'yes':
        generateClassificationReport(clf, class_names)

    generate_features_importance_log = config.get(
        'create ensemble settings', 'generate_features_importance_log')
    if generate_features_importance_log == 'yes':
        importances = list(clf.feature_importances_)
        log_df = generateFeatureImportanceLog(importances)

    generate_features_importance_bar_graph = config.get(
        'create ensemble settings', 'generate_features_importance_bar_graph')
    N_feature_importance_bars = config.getint(
        'create ensemble settings', 'N_feature_importance_bars')
    if generate_features_importance_bar_graph == 'yes':
        generateFeatureImportanceBarGraph(log_df, N_feature_importance_bars)

# run XGboost
if model_to_run == 'XGB':
    XGB_n_estimators = config.getint('create ensemble settings',
                                     'XGB_n_estimators')
    # NOTE(review): the depth is read from the 'GBC_max_depth' option while
    # the other XGB settings use 'XGB_*' keys -- confirm this is intended.
    XGB_max_depth = config.getint('create ensemble settings',
                                  'GBC_max_depth')
    XGB_learning_rate = config.getfloat('create ensemble settings',
                                        'XGB_learning_rate')
    clf = XGBClassifier(max_depth=XGB_max_depth,
                        min_child_weight=1,
                        learning_rate=XGB_learning_rate,
                        n_estimators=XGB_n_estimators,
                        silent=0,
                        objective='binary:logistic',
                        max_delta_step=0,
                        subsample=1,
                        colsample_bytree=1,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=0,
                        scale_pos_weight=1,
                        seed=1,
                        missing=None,
                        verbosity=3)
    clf.fit(data_train, target_train, verbose=True)

# SAVE MODEL
modelfn = str(classifierName) + '.sav'
modelPath = os.path.join(modelDir_out, modelfn)
# The context manager closes (and flushes) the file once the dump is done.
with open(modelPath, 'wb') as model_file:
    pickle.dump(clf, model_file)
print('Classifier ' + str(classifierName) + ' saved @ ' +
      str('models/generated_models ') + 'folder')
print(
    'Evaluation files are in models/generated_models/model_evaluations folders'
)
def main(training_data, test_data):
    """Train an XGBoost model and return the five likeliest countries per test row.

    Training and test frames are cleaned together by ``data_cleaner`` so both
    receive the same treatment, then separated again through a temporary
    ``source`` marker column.

    Args:
        training_data: frame that includes a ``country_destination`` column.
        test_data: frame to predict for.

    Returns:
        DataFrame with columns ``id`` (test row index value) and ``country``
        (class label).  Each test row contributes up to five consecutive
        rows, most probable class first.
    """
    import numpy as np

    # ``assign`` tags copies, so the caller's frames are left untouched.
    merged_data = pd.concat([
        training_data.assign(source='training'),
        test_data.assign(source='test'),
    ])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].drop(
        'source', axis=1)
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].drop(
        'source', axis=1)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(
        labels.transform(training_data['country_destination']),
        index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6,
                              learning_rate=0.3,
                              n_estimators=25,
                              objective='multi:softprob',
                              subsample=0.5,
                              colsample_bytree=0.5,
                              seed=0)
    # ``.values`` replaces ``.as_matrix()``, which newer pandas no longer has.
    xgb_model.fit(pred_df.values, target_df.tolist())

    # Running the model
    preds = np.asarray(xgb_model.predict_proba(test_pred.values))

    # Rank all classes per row in one vectorised pass instead of growing a
    # frame with pd.concat once per test row (quadratic in the row count).
    top_n = min(5, preds.shape[1])
    ranked = np.argsort(-preds, axis=1, kind='mergesort')[:, :top_n]
    output = pd.DataFrame(
        {
            'id': np.repeat(test_pred.index.values, top_n),
            'country': np.asarray(labels.classes_)[ranked].ravel(),
        },
        columns=['id', 'country'],
        # Rank position within each test row: 0..top_n-1, repeated.
        index=np.tile(np.arange(top_n), len(test_pred)))
    return output
'Property_Area_Urban', 'Loan_tot_income_ratio', 'coapplicant_True' ] X = dfr_train[col] y = dfr_train['Loan_Status'] ''' # RandomForest from sklearn.ensemble import RandomForestClassifier RF = RandomForestClassifier(n_estimators = 300, max_features=None,criterion = 'entropy',random_state = 0) RF.fit(X, y) ''' #Xgboost import xgboost as xgb from xgboost.sklearn import XGBClassifier classifier = XGBClassifier(learning_rate=0.1, n_estimators=10) classifier.fit(X, y) #validation from sklearn.model_selection import cross_val_score accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=10) print(accuracies.mean()) print(accuracies.std()) ''' #ensemble from sklearn import model_selection seed = 7 kfold = model_selection.KFold(n_splits=10, random_state=seed)
def do_cell(task):
    """Train a classifier for one grid cell and predict the top places.

    The training rows are first reduced to frequently visited places, using
    the module-level ``n_places``, ``n_places_th`` or ``n_places_percentage``
    setting (checked in that order).  Depending on the module-level ``xgb``
    flag an XGBClassifier or a RandomForestClassifier is fitted on the
    label-encoded ``place_id``.

    Args:
        task: sequence ``(df_train, df_test, x_start, y_start)``.

    Returns:
        ``[score, row_ids, labels_predict]`` where ``row_ids`` is the test
        index and ``labels_predict`` holds, per test row, the ``n_topx``
        most probable place ids (most probable first).
    """
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    def threshold_stats(counts, threshold):
        """Return (places kept, per-row mask, kept-row fraction) for a threshold."""
        kept_places = counts[counts >= threshold].count()
        row_mask = counts[df_train.place_id.values] >= threshold
        kept_fraction = row_mask.value_counts()[True] / df_train.shape[0]
        return kept_places, row_mask, kept_fraction

    def mean_row_score(probabilities, truth):
        """Average ``calculate_score_per_row`` over all rows."""
        row_scores = [
            calculate_score_per_row(probabilities[i], truth[i])
            for i in range(probabilities.shape[0])
        ]
        return np.array(row_scores).mean()

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places
    if n_places != 0:
        # Keep only the n_places most frequent places.
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train,
                            pd.DataFrame(value_counts),
                            left_on='place_id',
                            right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0] / tmp
    elif n_places_th != 0:
        # Keep places with at least n_places_th rows.
        value_counts = df_train.place_id.value_counts()
        n_places_local, mask, percentage = threshold_stats(
            value_counts, n_places_th_local)
        df_train = df_train.loc[mask.values]
    else:
        # Raise the threshold while the kept fraction stays above
        # n_places_percentage, then step back by one.
        n_places_th_local = 2
        value_counts = df_train.place_id.value_counts()
        n_places_local, mask, percentage = threshold_stats(
            value_counts, n_places_th_local)
        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local, mask, percentage = threshold_stats(
                value_counts, n_places_th_local)
        n_places_th_local -= 1
        n_places_local, mask, percentage = threshold_stats(
            value_counts, n_places_th_local)
        df_train = df_train.loc[mask.values]
    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)

    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:
        if xgb_calculate_n_estimators == True:
            # Find the tree count by early stopping from a large budget.
            clf = XGBClassifier(max_depth=max_depth,
                                learning_rate=learning_rate,
                                n_estimators=5000,
                                objective='multi:softprob',
                                subsample=ss,
                                colsample_bytree=cs,
                                gamma=gamma,
                                min_child_weight=min_child_weight,
                                reg_lambda=reg_lambda,
                                reg_alpha=reg_alpha)
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2)
                clf.fit(X_train,
                        y_train,
                        eval_set=[(X_test, y_test)],
                        eval_metric=calculate_score,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                # NOTE(review): ``abc`` is never assigned in this function,
                # so this statement raises UnboundLocalError and the
                # cross-validation code below never runs.  It looks like a
                # deliberate "disabled path" marker -- confirm.
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options,
                                train_dmatrix,
                                clf.n_estimators,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=10 if one_cell == 1 else False,
                                show_stdv=False,
                                folds=folds,
                                feval=calculate_score)
                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed
        # Final model uses the tree count chosen above.
        clf = XGBClassifier(max_depth=max_depth,
                            learning_rate=learning_rate,
                            n_estimators=n_estimators,
                            objective='multi:softprob',
                            subsample=ss,
                            colsample_bytree=cs,
                            gamma=gamma,
                            min_child_weight=min_child_weight,
                            reg_lambda=reg_lambda,
                            reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
        if rf_calculate_score == True:
            # ``y`` is already label-encoded by ``le.fit_transform`` above,
            # so the splits are used as they are; passing them through
            # ``le.transform`` again would look encoded integers up among
            # the raw place ids.
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.2)
                clf.fit(X_train, y_train)
                y_predict = clf.predict_proba(X_test)
                score = mean_row_score(y_predict, y_test)
            else:
                #some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train_idx, test_idx in folds:
                    X_train, X_test = X[train_idx], X[test_idx]
                    y_train, y_test = y[train_idx], y[test_idx]
                    clf.fit(X_train, y_train)
                    y_predict = clf.predict_proba(X_test)
                    score = mean_row_score(y_predict, y_test)
                    print(' ', x_start, y_start, score)
                    scores_cv.append(score)
                score = np.array(scores_cv).mean()

    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    # Refit on all training rows and rank the classes for every test row.
    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(
        np.argsort(y_predict, axis=1)[:, ::-1][:, :n_topx])

    print(x_start, y_start, score, n_estimators, n_places_local,
          n_places_th_local, percentage)
    return [score, row_ids, labels_predict]
# Seed both random generators for reproducibility.
seed = 100
np.random.seed(seed)
random.seed(seed)

X, y = utils.importar_datos()

# ### Final metrics

xgb_model = XGBClassifier(
    use_label_encoder=False,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=-1,
    eval_metric="logloss",
    min_child_weight=6,
    max_depth=6,
    reg_alpha=0.05,
)
pipeline = Pipeline([
    ("preprocessor", pp.PreprocessingOHE()),
    ("model", xgb_model),
])

pipeline = utils.entrenar_y_realizar_prediccion_final_con_metricas(
    X, y, pipeline)

# The target metric AUC-ROC gives a result similar to the one obtained when
# using LE. However, the False Negative rate increased with respect to the
# other model, so its Recall (and therefore its F1 Score) decreased (by 0.09).
# In turn, the True Negative rate improved slightly.

# ### HoldOut prediction

utils.predecir_holdout_y_generar_csv(pipeline,
                                     'Predicciones/4-XGBoost-OHE.csv')
from sklearn.model_selection import train_test_split, StratifiedKFold, permutation_test_score
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

data_all = pd.read_csv("G:/GDM/DATA/GDM.csv")

# Full feature set for XGBoost: every column except the target.
y = data_all.OGTTgroup1
X = data_all.drop(['OGTTgroup1'], axis=1)

# Reduced feature set for the logistic regression.
y_log = data_all.OGTTgroup1
X_log = data_all.drop(
    ['OGTTgroup1', 'weight_gain', 'income', 'education', 'DBP', 'parity',
     'multi_pregnancy'],
    axis=1)

# Stratified 70/30 splits with the same seed for both feature sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3, stratify=y)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X_log, y_log, random_state=1, test_size=0.3, stratify=y_log)

clf = XGBClassifier(
    random_state=5361,
    scale_pos_weight=12.026280323450134,
    n_estimators=200,
    max_depth=2,
    min_child_weight=29,
    colsample_bytree=0.7,
    subsample=1,
    gamma=0,
    reg_alpha=5,
    reg_lambda=5,
    learning_rate=0.1,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Permutation test of the cross-validated ROC-AUC (5 stratified folds,
# 1000 label permutations).
cv = StratifiedKFold(5)
score, permutation_scores, pvalue = permutation_test_score(
    clf, X_train, y_train, scoring="roc_auc", cv=cv,
    n_permutations=1000, n_jobs=-1)
#print(pvalue)

# Same test for the logistic regression on the reduced feature set.
clf_log = LogisticRegression(random_state=0, fit_intercept=True, C=1e9,
                             solver='newton-cg')
clf_log.fit(X_train_log, y_train_log)
score_log, permutation_scores_log, pvalue_log = permutation_test_score(
    clf_log, X_train_log, y_train_log, scoring="roc_auc", cv=cv,
    n_permutations=1000, n_jobs=-1)
#print(pvalue_log)
def model_making_main(file):
    """Build the text feature column and train/evaluate a set of classifiers.

    Reads ``config.preprocessed_csv``, joins the selected columns into one
    comma-separated text column, cleans/tokenizes it, writes the result to
    ``config.nlp_processed_csv`` and reads it back. The ``Lemmitize`` column
    is then used as the feature text and split 80/20 together with the
    target, and each candidate model is handed to ``model_making``.

    Parameters
    ----------
    file : str
        Path of a UTF-8 CSV file that contains the target column
        ``Offer_noise_free``.

    Notes
    -----
    NOTE(review): features and target come from two different files; this
    assumes their rows are aligned one-to-one -- confirm against the caller.
    """
    logger.info(">> Start - Model making")
    df = pd.read_csv(config.preprocessed_csv, encoding='UTF-8')

    # If getting an error remove .astype(str)
    select_columns = [
        'recepientemail', 'Gender', 'Age(years)', 'Product Type', 'Weight',
        'Height', 'Habit', 'Face Amount', 'Medication', 'Property',
        'Medical Data', 'Family'
    ]
    # One comma-joined string per row; missing cells are skipped.
    df['ColumnA'] = df[select_columns].apply(
        lambda x: ','.join(x.dropna().astype(str)), axis=1)

    # Use the module logger here as well (the original mixed in the root
    # ``logging.info``), so all messages of this step share one logger.
    logger.info("Remove puncutation, tokenize")
    df['Lemmitize'] = df['ColumnA'].apply(rem_punt).apply(tokenize)
    df['Lemmitize'] = df['Lemmitize'].apply(conversion)

    # Persist the processed frame, then reload it from disk; the reloaded
    # version is what gets vectorized.
    df.to_csv(config.nlp_processed_csv, index=False, encoding="utf-8")
    df = pd.read_csv(config.nlp_processed_csv)
    X = df['Lemmitize']

    of = pd.read_csv(file, encoding='UTF-8')
    y = of['Offer_noise_free']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=4)

    # Unigram + bigram TF-IDF vectorizer shared by every model below.
    vect = TfidfVectorizer(max_df=0.5,
                           max_features=10000,
                           min_df=1,
                           use_idf=True,
                           ngram_range=(1, 2),
                           lowercase=True)

    # Unigram-only representation of the full corpus; only consumed by the
    # disabled visualisation call below.
    represent = TfidfVectorizer(max_df=0.5,
                                max_features=10000,
                                min_df=1,
                                use_idf=True,
                                ngram_range=(1, 1),
                                lowercase=True)
    matrix = represent.fit_transform(X.values)
    # visualize(represent,matrix,X,y)

    # (display name, estimator) pairs, evaluated in this order.
    candidates = [
        ("XGBOOST", XGBClassifier(nthread=4, n_estimators=1000)),
        ("Random Forest",
         RandomForestClassifier(n_estimators=60,
                                n_jobs=3,
                                max_features="auto",
                                min_samples_leaf=50)),
        ("SVM", SVC(kernel='rbf', C=1, gamma=10)),
        ("Logistic Regression", LogisticRegression()),
        ("SGDClassifier", SGDClassifier(alpha=.0001)),
    ]
    for name, model in candidates:
        model_making(name, vect, model, X_train, y_train, X_test, y_test)

    # model_with_SVD(vect,X_train,X_test,y_train,y_test)
    logger.info("<< End - Model making")
model = GaussianNB(**vars) elif alg.name == 'LogisticRegression': from sklearn.linear_model import LogisticRegression model = LogisticRegression(**vars) elif alg.name == 'AdaBoost' and alg.type == 'classification': from sklearn.ensemble import AdaBoostClassifier model = AdaBoostClassifier(**vars) elif alg.name == 'GradientBoosting' and alg.type == 'classification': from sklearn.ensemble import GradientBoostingClassifier model = GradientBoostingClassifier(**vars) elif alg.name == 'RandomForest' and alg.type == 'classification': from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(**vars) elif alg.name == 'XGBoost' and alg.type == 'classification': from xgboost.sklearn import XGBClassifier model = XGBClassifier(**vars) elif alg.name == 'CatBoost' and alg.type == 'classification': from catboost import CatBoostClassifier model = CatBoostClassifier(**vars) #------------------------------------------------------------- # Regression algorithms elif alg.name == 'TPOT_Regressor': from tpot import TPOTRegressor model = TPOTRegressor( generations=alg.generations, cv=alg.cv, scoring=alg.scoring, verbosity=alg.verbosity ) elif alg.name == 'AutoSklearn_Regressor':
)), ('scale', MaxAbsScaler()), ('clf', OneVsRestClassifier(LogisticRegression())) ]) TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' # Import the hashing vectorizer p2 = Pipeline([ ('union', FeatureUnion( transformer_list = [ ('numeric_features', Pipeline([ ('selector', get_numeric_data), ('imputer', Imputer()) ])), ('text_features', Pipeline([ ('selector', get_text_data), ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC, non_negative=True, norm=None, binary=False, ngram_range=(1,2))), ('dim_red', SelectKBest(chi2, chi_k)) ])) ] )), ('int', SparseInteractions(degree=2)), ('scale', MaxAbsScaler()), ('clf', XGBClassifier()))) ])
def xgboost_algorithm(XTrain,YTrain,XTest):
    """Fit a multi-class XGBoost classifier and score the test features.

    Parameters
    ----------
    XTrain : training feature matrix passed to ``fit``.
    YTrain : training labels passed to ``fit``.
    XTest : feature matrix to predict on.

    Returns
    -------
    The output of ``predict_proba(XTest)``.
    """
    hyper_params = {
        'objective': 'multi:softprob',
        'n_estimators': 25,
        'learning_rate': 0.3,
        'max_depth': 6,
        'subsample': 0.5,
        'colsample_bytree': 0.5,
        'seed': 0,
    }
    model = XGBClassifier(**hyper_params)
    model.fit(XTrain, YTrain)
    return model.predict_proba(XTest)
feat_imp = pd.Series( alg.booster().get_fscore()).sort_values(ascending=False) feat_imp.plot(kind='bar', title='Feature Impxortances') plt.ylabel('Feature Importance Score') #Choose all predictors except target & IDcols #%% Step 1: Fix learning rate and number of estimators for tuning tree-based parameters predictors = [x for x in train.columns if x not in [target, IDcol]] xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27) modelfit(xgb1, train, predictors) param_test1 = { 'max_depth': list(range(3, 13, 2)), 'min_child_weight': list(range(1, 7, 2)) } gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
df_all = pd.merge(df_all, df_sess_features, how='left', left_on='id', right_on='id') df_all = df_all.drop(['id'], axis=1) #release memory del df_sessions del device_freq del action_freq #Splitting train and test vals = df_all.values X = vals[:piv_train] le = LabelEncoder() y = le.fit_transform(labels) X_test = vals[piv_train:] #Classifier xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43, objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0) print('scores:', NDCG.cross_validation_score(X, labels,xgb,5)) ''' xgb.fit(X, y) y_pred = xgb.predict_proba(X_test) #Taking the 5 classes with highest probabilities ids = [] #list of ids cts = [] #list of countries for i in range(len(id_test)): idx = id_test[i] ids += [idx] * 5 cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist() #Generate submission
# df_All_stat_9 = pd.read_csv("mchnt_ana.csv", sep=',')
# df_All = pd.merge(left=df_All, right=df_All_stat_9, how='left', left_on='certid', right_on='certid')
#########################
# Attach the labels to the feature table; rows without a label get -1 through
# the fillna below.
label_df = pd.read_csv("train_label_encrypt.csv", sep=",", low_memory=False, error_bad_lines=False)
df_All = pd.merge(left=df_All, right=label_df, how='left', left_on='certid', right_on='certid')
df_All = df_All.fillna(-1)

# Rows labelled 0/1 are the training set, everything else is to be predicted.
df_All_train = df_All[(df_All["label"] == 0) | (df_All["label"] == 1)]
df_All_test = df_All[(df_All["label"] != 0) & (df_All["label"] != 1)]

# The prediction inputs do not change between runs, so build them once.
X_test = df_All_test.drop(["certid", "label"], axis=1)
cerid_arr = np.array(df_All_test["certid"]).T

# Two runs, each on a freshly shuffled training set, each written to its own
# result file.
for i in range(2):
    savename = "xgboost_results_1120_" + str(i) + ".csv"
    # Function-call form prints the same text on Python 2 and Python 3.
    print(savename)

    df_All_train = shuffle(df_All_train)
    X_train = df_All_train.drop(["certid", "label"], axis=1)
    y_train = df_All_train["label"]

    clf = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                        gamma=0.01, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', reg_alpha=0.1,
                        reg_lambda=0.1, seed=27)
    clf = clf.fit(X_train, y_train)

    pred = clf.predict(X_test).T
    # Stack ids and predictions as two rows, then write them as two columns.
    result = np.vstack((cerid_arr, pred))
    np.savetxt(savename, result.T, delimiter=',', fmt="%s")
def xgbost(x,y,targetx):
    """Train an XGBoost classifier on (x, y) and score ``targetx``.

    Returns column 1 of ``predict_proba(targetx)``.
    """
    model = XGBClassifier(
        n_estimators=1000,
        learning_rate=0.0075,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=0.7,
        seed=4,
    )
    model.fit(x, y)
    probabilities = model.predict_proba(targetx)
    return probabilities[:, 1]
# Hold out 30% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#ClusterCentroids
# Resample the training split only; the test split is left untouched.
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1,
                       max_depth=6, gamma=0, subsample=1, max_delta_step=0,
                       colsample_bytree=1, reg_lambda=1, n_estimators=100,
                       seed=1000, scale_pos_weight=1000)
# AUC is tracked on both the resampled training data and the held-out split.
clf_XG.fit(os_X, os_y, eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()

y_true, y_pred = y_test, clf_XG.predict(X_test)

#F1_score, precision, recall, specifity, G score
# Function-call form prints the same text on Python 2 and Python 3.
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
def build_model(X, y):
    """Fit a multi-class XGBoost classifier on (X, y) and return it.

    Prints a progress message before training starts.
    """
    print("Fitting classifier")
    settings = {
        'objective': 'multi:softprob',
        'n_estimators': 25,
        'learning_rate': 0.25,
        'max_depth': 4,
        'subsample': 0.6,
        'colsample_bytree': 0.6,
    }
    classifier = XGBClassifier(**settings)
    classifier.fit(X, y)
    return classifier
#建模 alg.fit(dtrain[predictors], dtrain['AKI'], eval_metric='auc') #对训练集预测 dtrain_predictions = alg.predict(dtrain[predictors]) dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1] #输出模型的一些结果 print("Stopped at iteration: {0}".format(cvresult.shape[0])) print("\n关于现在这个模型") print("准确率 : %.4g" % metrics.accuracy_score(dtrain['AKI'].values, dtrain_predictions)) print("AUC 得分 (训练集): %f" % metrics.roc_auc_score(dtrain['AKI'], dtrain_predprob)) #获得最佳决策树数目 predictors = [x for x in re.columns if x not in [target]] xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27) modelfit(xgb1, re, data, predictors)
def run():
    """Train a blending ensemble and write its predictions to CSV files.

    Level 0: each base classifier is trained on every stratified fold. Its
    out-of-fold probabilities fill one column of ``dataset_blend_train``, and
    the mean of its per-fold predictions on the submission set fills one
    column of ``dataset_blend_test``.

    Level 1: a logistic regression is fitted on ``dataset_blend_train`` and
    applied to ``dataset_blend_test``. The result is min-max scaled to [0, 1]
    and written, together with intermediate artefacts, to the output folder.

    NOTE(review): ``X_user_id`` is not defined in this function; presumably a
    module-level object with ``.to_csv`` -- confirm.
    NOTE(review): the ``shuffle`` branch is disabled and left as written;
    ``X[idx]`` there would need positional row indexing if ever enabled.
    """
    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 4
    # verbose = True
    shuffle = False

    X, y = get_train_data()
    X_submission = mf.get_test_data()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    # Change the parameters here to generate more base models.
    clfs = [RandomForestClassifier(n_estimators=500, max_features=0.8, bootstrap=True, min_samples_leaf=50, oob_score=True, criterion='gini', n_jobs=-1),
            RandomForestClassifier(n_estimators=500, max_features=0.5, bootstrap=True, min_samples_leaf=50, oob_score=True, criterion='entropy', n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500, min_samples_leaf=50, criterion='gini', n_jobs=-1),
            ExtraTreesClassifier(n_estimators=500, min_samples_leaf=50, criterion='entropy', n_jobs=-1),
            GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=3, max_features=0.65, subsample=0.7, random_state=10, min_samples_split=350, min_samples_leaf=70),
            GradientBoostingClassifier(learning_rate=0.01, n_estimators=1000, max_depth=4, max_features=0.7, subsample=0.8, random_state=10, min_samples_split=350, min_samples_leaf=70),
            XGBClassifier(learning_rate=0.05, n_estimators=350, gamma=0, min_child_weight=5, max_depth=5, subsample=0.8, scale_pos_weight=1, colsample_bytree=0.8, objective='binary:logistic', nthread=8, eval_metric='auc', seed=10),
            XGBClassifier(learning_rate=0.02, n_estimators=500, gamma=0, min_child_weight=5, max_depth=5, subsample=0.7, scale_pos_weight=1, colsample_bytree=0.7, objective='binary:logistic', nthread=8, eval_metric='auc', seed=10)
            ]

    print ("Creating train and test sets for blending.")

    # One column per base classifier.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print (j, clf)
        # One column per fold; averaged after the fold loop.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print ("Fold", i)
            # Fold indices are positional, so use .iloc (the .ix indexer was
            # removed from pandas).
            X_train = X.iloc[train, :]
            y_train = y[train]
            X_test = X.iloc[test, :]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            print ("train ks_score: ", ks.ks_score(y_submission, y_test))
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    # Blank separator line; a bare ``print`` does nothing on Python 3.
    print ("")
    print ("Blending.")
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    out_dir = 'H:\\ML\\DC\\user_loan_risk_predict\\predict/'
    # dataset_blend_train and y_submission are NumPy arrays, which have no
    # .to_csv; wrap them in pandas objects before writing.
    pd.DataFrame(dataset_blend_train).to_csv(out_dir + 'dataset_blend_train.csv', index=False)
    y.to_csv(out_dir + 'y.csv', index=False)
    pd.DataFrame({'probability': y_submission}).to_csv(out_dir + 'y_submission.csv', index=False)
    X_user_id.to_csv(out_dir + 'X_user_id.csv', index=False)

    print ("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    test_pre = pd.DataFrame({u'userid': X_user_id, u'probability': y_submission})
    test_pre = test_pre[['userid', 'probability']]
    print (test_pre.head())
    test_pre.to_csv(out_dir + 'pre_blending.csv', index=False)
nclass = 3
stime = time.time()

# Feature tables and target; the first CSV column is used as the index.
trainc = pd.read_csv('./data/train_lon_lat_predicted.csv', index_col=0)
testc = pd.read_csv('./data/test_lon_lat_predicted.csv', index_col=0)
target = pd.read_csv('./data/target.csv', index_col=0)

nf = 10
outcome = target['status_group']

# Two multi-class XGBoost configurations; they differ in depth, learning
# rate, number of trees, gamma and min_child_weight.
cclf1 = XGBClassifier(
    objective='multi:softprob',
    n_estimators=250,
    learning_rate=0.0588,
    max_depth=14,
    gamma=0.6890,
    min_child_weight=7.6550,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=8,
)
cclf2 = XGBClassifier(
    objective='multi:softprob',
    n_estimators=385,
    learning_rate=0.03599,
    max_depth=15,
    gamma=0.6836,
    min_child_weight=4.3704,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=8,
)
"first_affiliate_tracked", "signup_app", "first_device_type", "first_browser", ] X = split_categorical_variables(train, categorical_variables) y = X.pop("country_destination") label_table = LabelEncoder() y = label_table.fit_transform(y.values) # # Let's try a gradiant boost classifier # In[56]: xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1) xgb_model.fit(X, y) # ## How did we do? # # * To start, let's look at how well we did just predicting the final outcome pred = xgb_model.predict_proba(X) # Find the most probable country best_country = [] # Not used for now bestId = [] for i in range(len(pred)): bestId.append(np.argsort(pred[i])[::-1])
#Normalize X = StandardScaler().fit_transform(X) # evaluate the model by splitting into train and test sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017) kfold = cross_validation.StratifiedKFold(y=y_train, n_folds=5, random_state=2017) num_rounds = 100 clf_XGB = XGBClassifier(n_estimators=num_rounds, objective='binary:logistic', seed=2017) # use early_stopping_rounds to stop the cv when there is no score imporovement clf_XGB.fit(X_train, y_train, early_stopping_rounds=20, eval_set=[(X_test, y_test)], verbose=False) results = cross_validation.cross_val_score(clf_XGB, X_train, y_train, cv=kfold) print("\nxgBoost - CV Train : %.2f" % results.mean()) print("xgBoost - Train : %.2f" % metrics.accuracy_score(clf_XGB.predict(X_train), y_train)) print("xgBoost - Test : %.2f" %
# In[ ]:

# Split the combined feature matrix back into train rows and test rows,
# and encode the labels as integers.
vals = df_all.values
X, X_test = vals[:piv_train], vals[piv_train:]
le = LabelEncoder()
y = le.fit_transform(labels)

# In[ ]:

# Classifier
xgb = XGBClassifier(
    objective='multi:softprob',
    n_estimators=25,
    learning_rate=0.3,
    max_depth=6,
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# In[ ]:

ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    # Each id is repeated five times, once per predicted country.
    ids.extend([idx] * 5)
    # Classes ordered by decreasing probability, decoded back to labels;
    # keep the first five.
    cts.extend(
        le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
verbose_eval=True) alg.set_params(n_estimators=cvresult.shape[0]) alg.fit(dtrain[predictors], dtrain['segment'], eval_metric='auc') dtrain_predictions = alg.predict(dtrain[predictors]) dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1] print '\nModel Report:' print 'AUC (Train): ', metrics.roc_auc_score(dtrain['segment'], dtrain_predprob) return alg print 'Training model_1...' xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=10000, max_depth=4, gamma=0, objective='binary:logistic', seed=27) model_1 = modelfit(xgb1, train, predictors) print 'Predictions in progress...' submit = pd.DataFrame() submit['ID'] = test['ID'] pred_1 = model_1.predict_proba(test[predictors])[:, 1] submit['segment'] = pred_1 submit.to_csv('submit.csv', index=False)
def pipe_main(pipe=None):
    '''Assemble a pipeline from short step keys, or list the available keys.

    Every step key maps to a pre-configured transformer, sampler or
    classifier instance. The intended data flow is:

        raw data --> clean --> encoding --> scaling --> feature construction
        --> feature selection --> resampling --> final estimator

    parameter
    ----
    pipe - str or None
        step keys joined by '_' (for example 'xx_xx'), applied in the order
        given; default None

    return
    ----
    1) a Pipeline built from the chosen steps, when pipe is a string
    2) when pipe is None, a dict mapping each step category to its
       available keys

    raise
    ----
    KeyError - a key in pipe is not a known step
    ValueError - pipe is neither None nor a string
    '''

    def l1_logit_cv():
        # L1-penalised cross-validated logistic regression scored by ROC-AUC.
        return LogisticRegressionCV(penalty='l1',
                                    solver='saga',
                                    scoring='roc_auc')

    def shallow_trees():
        # Extra-trees ensemble used for model-based feature selection.
        return ExtraTreesClassifier(n_estimators=100, max_depth=5)

    def inlier_sampler(method):
        # Sampler that applies outlier_rejection with the given method.
        return FunctionSampler(outlier_rejection, kw_args={'method': method})

    clean = dict(
        clean=Split_cls(dtype_filter='not_datetime', na1='null', na2=-999),
        cleanNA=Split_cls(dtype_filter='not_datetime', na1=None, na2=None),
        cleanMean=Split_cls(dtype_filter='not_datetime',
                            na1='most_frequent',
                            na2='mean'),
    )

    encode = dict(
        woe=Woe_encoder(max_leaf_nodes=5),
        oht=Oht_encoder(),
        ordi=Ordi_encoder(),
    )

    resample = dict(
        # over_sampling
        rover=RandomOverSampler(),
        smote=SMOTE(),
        bsmote=BorderlineSMOTE(),
        adasyn=ADASYN(),
        # under sampling controlled methods
        runder=RandomUnderSampler(),
        nearmiss=NearMiss(version=3),
        pcart=InstanceHardnessThreshold(),
        # under sampling cleaning methods
        tlinks=TomekLinks(n_jobs=-1),
        oside=OneSidedSelection(n_jobs=-1),
        cleanNN=NeighbourhoodCleaningRule(n_jobs=-1),
        enn=EditedNearestNeighbours(n_jobs=-1),
        ann=AllKNN(n_jobs=-1),
        cnn=CondensedNearestNeighbour(n_jobs=-1),
        # clean outliers
        inlierForest=inlier_sampler('IsolationForest'),
        inlierLocal=inlier_sampler('LocalOutlierFactor'),
        inlierEllip=inlier_sampler('EllipticEnvelope'),
        inlierOsvm=inlier_sampler('OneClassSVM'),
        # combine
        smoteenn=SMOTEENN(),
        smotelink=SMOTETomek(),
    )

    scale = dict(
        stdscale=StandardScaler(),
        maxscale=MinMaxScaler(),
        rscale=RobustScaler(quantile_range=(10, 90)),
        qauntile=QuantileTransformer(),  # uniform distribution
        power=PowerTransformer(),  # Gaussian distribution
        norm=Normalizer(),  # default L2 norm
        # scale sparse data
        maxabs=MaxAbsScaler(),
        stdscalesp=StandardScaler(with_mean=False),
    )

    # feature construction
    feature_c = dict(
        pca=PCA(whiten=True),
        spca=SparsePCA(normalize_components=True, n_jobs=-1),
        ipca=IncrementalPCA(whiten=True),
        kpca=KernelPCA(kernel='rbf', n_jobs=-1),
        poly=PolynomialFeatures(degree=2),
        rtembedding=RandomTreesEmbedding(n_estimators=10),
        LDA=LinearDiscriminantAnalysis(),
        QDA=QuadraticDiscriminantAnalysis(),
    )

    # select from model
    feature_m = dict(
        fwoe=SelectFromModel(Woe_encoder(max_leaf_nodes=5)),
        flog=SelectFromModel(l1_logit_cv()),
        fsgd=SelectFromModel(SGDClassifier(penalty="l1")),
        fsvm=SelectFromModel(LinearSVC('l1', dual=False, C=1e-2)),
        fxgb=SelectFromModel(XGBClassifier(n_jobs=-1)),
        frf=SelectFromModel(shallow_trees()),
        fRFExgb=RFE(XGBClassifier(n_jobs=-1),
                    step=0.1,
                    n_features_to_select=20),
        fRFErf=RFE(shallow_trees(), step=0.3, n_features_to_select=20),
        fRFElog=RFE(l1_logit_cv(), step=0.3, n_features_to_select=20),
    )

    # Univariate feature selection
    feature_u = dict(
        fchi2=GenericUnivariateSelect(chi2, 'percentile', 25),
        fMutualclf=GenericUnivariateSelect(mutual_info_classif, 'percentile',
                                           25),
        fFclf=GenericUnivariateSelect(f_classif, 'percentile', 25),
    )

    # sklearn estimator: every classifier that can be built without
    # arguments; the ones that cannot are skipped.
    estimator = {}
    for est_name, est_class in all_estimators(type_filter=['classifier']):
        try:
            estimator[est_name] = est_class()
        except Exception:
            continue
    # Extra / re-configured classifiers; same-named entries are replaced.
    estimator.update(
        dummy=DummyClassifier(),
        XGBClassifier=XGBClassifier(n_jobs=-1),
        LogisticRegressionCV=LogisticRegressionCV(scoring='roc_auc'),
        EasyEnsembleClassifier=EasyEnsembleClassifier(),
        BalancedRandomForestClassifier=BalancedRandomForestClassifier(),
        RUSBoostClassifier=RUSBoostClassifier(),
        SVC=SVC(C=0.01, gamma='auto'))

    if pipe is None:
        # Catalogue mode: report the keys available in each category.
        feature_s = dict(**feature_m, **feature_u)
        return {
            'clean': clean.keys(),
            'encoding': encode.keys(),
            'resample': resample.keys(),
            'scale': scale.keys(),
            'feature_c': feature_c.keys(),
            'feature_s': feature_s.keys(),
            'classifier': estimator.keys()
        }

    if not isinstance(pipe, str):
        raise ValueError("input pipe must be a string in format 'xx[_xx]'")

    # Single lookup table across all categories.
    registry = dict(**clean, **encode, **scale, **feature_c, **feature_m,
                    **feature_u, **estimator, **resample)
    steps = []
    for key in pipe.split('_'):
        component = registry.get(key)
        if component is None:
            raise KeyError(
                "'{}' invalid key for sklearn estimators".format(key))
        steps.append((key, component))
    return Pipeline(steps)
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

# Hold out 30% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Baseline: decision tree with default settings.
clf = DecisionTreeClassifier()  #we have to define max_depth to prevent overfitting
clf.fit(X_train, y_train)
print("Train Accuracy of clf:", clf.score(X_train, y_train))
print("Test Accuracy of clf", clf.score(X_test, y_test))

# XGBoost with default settings, for comparison.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("Train Accuracy of xgb:", xgb.score(X_train, y_train))
print("Test Accuracy of xgb:", xgb.score(X_test, y_test))

#%%
from sklearn.model_selection import GridSearchCV

#GridSearch on Xgboost Classifier
# The original used range(2, 3, 4) and range(1, 2, 6). Those are
# (start, stop, step) calls that yield only [2] and [1], so the grid never
# varied these two parameters. Explicit candidate lists are used instead.
param_dict = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 2, 6],
    'learning_rate': [0.00001, 0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100]
}