def kfold_cv(X_train, y_train, idx, k):
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])
        # Disabled bagging loop (m = 0, so it never runs); kept for reference.
        m = 0
        for j in range(m):
            clf = xgb_classifier(eta=0.05, min_child_weight=20, col=0.5, subsample=0.7,
                                 depth=7, num_round=400, seed=j * 77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, y_train_cv, X_test_cv, y_test=y_test_cv)
            yqq = y_pred * (1.0 / (j + 1))
            print(j, llfun(y_test_cv, yqq))
        clf = XGBClassifier(max_depth=10, colsample_bytree=0.8, learning_rate=0.02,
                            n_estimators=500, nthread=-1)
        # clf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=100)
        clf.fit(X_train_cv, y_train_cv, eval_metric="logloss",
                eval_set=[(X_test_cv, y_test_cv)])
        y_pred = clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv, y_pred))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print(xx[-1])
        break  # only the first fold is evaluated
    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred, yreal, idx  # np.mean(xx)
def test_predict_sklearn_pickle(self):
    X, y = makeXy()
    Xtest = makeXtest()

    from xgboost import XGBClassifier
    kwargs = {}
    kwargs['tree_method'] = 'gpu_hist'
    kwargs['predictor'] = 'gpu_predictor'
    kwargs['silent'] = 0
    kwargs['objective'] = 'binary:logistic'

    model = XGBClassifier(**kwargs)
    model.fit(X, y)
    print(model)

    # pickle model
    save_obj(model, "model.pkl")
    # delete model
    del model
    # load model
    model = load_obj("model.pkl")
    os.remove("model.pkl")

    # continue as before
    print("Before model.predict")
    sys.stdout.flush()
    tmp = time.time()
    gpu_pred = model.predict(Xtest, output_margin=True)
    print(gpu_pred)
    print("E non-zeroes: %d:" % (np.count_nonzero(gpu_pred)))
    print("E GPU Time to predict = %g" % (time.time() - tmp))
def test_predict_sklearn_pickle(self):
    x, y = build_dataset()
    kwargs = {'tree_method': 'gpu_hist',
              'predictor': 'gpu_predictor',
              'verbosity': 2,
              'objective': 'binary:logistic',
              'n_estimators': 10}

    model = XGBClassifier(**kwargs)
    model.fit(x, y)

    save_pickle(model, "model.pkl")
    del model

    # load model
    model: xgb.XGBClassifier = load_pickle("model.pkl")
    os.remove("model.pkl")

    gpu_pred = model.predict(x, output_margin=True)

    # Switch to CPU predictor
    bst = model.get_booster()
    bst.set_param({'predictor': 'cpu_predictor'})
    cpu_pred = model.predict(x, output_margin=True)
    np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
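# The save_obj/load_obj and save_pickle/load_pickle helpers used by the two GPU
# pickling tests above are not shown here; they are assumed to be thin pickle
# wrappers. A minimal sketch under that assumption (names are hypothetical):
import pickle

def save_pickle(obj, path):
    # Serialize any Python object (including a fitted XGBClassifier) to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    # Restore the object exactly as it was pickled.
    with open(path, 'rb') as f:
        return pickle.load(f)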
def xgboost_classifier(self):
    cls = XGBClassifier()
    print('xgboost cross validation score', cross_val_score(cls, self.x_data, self.y_data))
    start_time = time.time()
    cls.fit(self.x_train, self.y_train)
    print('score', cls.score(self.x_test, self.y_test))
    print('time cost', time.time() - start_time)
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    # Use features with > 0 importance
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]
    roc_scores = {}
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        # train model
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)
        # eval model
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        roc = roc_auc_score(y_test, y_pred)
        roc_scores[selection.threshold] = roc

    best_thresh = max(roc_scores, key=roc_scores.get)
    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))
    return X_train_trans_, X_test_trans_
def xgboostcv(max_depth, learning_rate, n_estimators, subsample, colsample_bytree,
              gamma, min_child_weight, silent=True, nthread=-1, seed=1234):
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=seed,
                        objective="binary:logistic")
    clf.fit(x0, y0, eval_metric="logloss",
            eval_set=[(x1, y1)],
            early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
def get_xgb_feature_importance_plot(best_param_, experiment_, png_folder, png_fname, score_threshold=0.8): # 1. train_X, train_y = experiment_.get_train_data() clf = XGBClassifier() try: del best_param_['model_type'] except: pass clf.set_params(**best_param_) clf.fit(train_X, train_y) index2feature = clf.booster().get_fscore() fis = pd.DataFrame({'name':index2feature.keys(), 'score':index2feature.values()}) fis = fis.sort('score', ascending=False) if len(fis.index) > 20: score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold) #where_str = 'score > %f & score > %f' % (score_threshold, 0.0) where_str = 'score >= %f' % (score_threshold) fis = fis.query(where_str) # 2. plot #gs = GridSpec(2,2) #ax1 = plt.subplot(gs[:,0]) #ax2 = plt.subplot(gs[0,1]) #ax3 = plt.subplot(gs[1,1]) # 3.1 feature importance sns.barplot(x = 'score', y = 'name', data = fis, #ax=ax1, color="blue") #plt.title("Feature_Importance", fontsize=10) plt.ylabel("Feature", fontsize=10) plt.xlabel("Feature_Importance : f-Score", fontsize=10) """ # 3.2 PDF confidence_score = clf.oob_decision_function_[:,1] sns.distplot(confidence_score, kde=False, rug=False, ax=ax2) ax2.set_title("PDF") # 3.3 CDF num_bins = min(best_param_.get('n_estimators',1), 100) counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True) cdf = np.cumsum(counts) ax3.plot(bin_edges[1:], cdf / cdf.max()) ax3.set_title("CDF") ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10) """ png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname) plt.tight_layout() plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1) plt.close() return True
def cv(X_train, y_train, features_inner):
    kfold = StratifiedKFold(n_splits=5, shuffle=True)
    scores_f = []
    scores_p = []
    scores_r = []
    for train, test in kfold.split(X_train, y_train):
        model = XGBClassifier()
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["tred_cutoff"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["tred_cutoff"])
        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", s_f)
        print("\tscores p", s_p)
        print("\tscores r", s_r)
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
def XGB_model(train, y):
    model = XGBClassifier(n_estimators=150, learning_rate=0.01)
    from sklearn import cross_validation
    cv = cross_validation.KFold(len(train), n_folds=5, random_state=7)
    for traincv, testcv in cv:
        model.fit(train.iloc[traincv], y.iloc[traincv])
    y_XGB = model.predict(test)
    return y_XGB
def main(): # Set seed for reproducibility np.random.seed(0) print("Loading data...") # Load the data from the CSV files training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0) prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0) training_data['countrycode']=training_data['countrycode'].apply(lambda x:ord(x)) training_data['browserid']=training_data['browserid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") ) training_data['devid']=training_data['devid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("none")) #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8') #exit(0) prediction_data['countrycode']=prediction_data['countrycode'].apply(lambda x:ord(x)) prediction_data['browserid']=prediction_data['browserid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") ) prediction_data['devid']=prediction_data['devid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("none") ) features=['siteid','offerid','category','merchant','countrycode','browserid','devid'] target="click" X = training_data[features] x_prediction = prediction_data[features] Y= training_data[target] ids = prediction_data["ID"] model = XGBClassifier() #linear_model.LogisticRegression(n_jobs=-1) print("Training...") # Your model is trained on the training_data model.fit(X, Y) print("Predicting...") seed =7 test_size=0.33 X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed) y_prediction = model.predict_proba(x_prediction) results = y_prediction[:, 1] results_df = pd.DataFrame(data={'probability':results}) joined = pd.DataFrame(ids).join(results_df) y_pred=model.predict(X_test) accuracy=accuracy_score(y_test,y_pred) print("Accuracy: %.2f%%" % (accuracy * 100.0)) print("Writing predictions to predictions.csv") # Save the predictions out to a CSV file joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
def test_xgboost(): """Ensure that the TPOT xgboost method outputs the same as the xgboost classfier method""" tpot_obj = TPOT() result = tpot_obj._xgradient_boosting(training_testing_data, n_estimators=100, learning_rate=0, max_depth=3) result = result[result['group'] == 'testing'] xgb = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42) xgb.fit(training_features, training_classes) assert np.array_equal(result['guess'].values, xgb.predict(testing_features))
def update_model(current_year):
    print('Creating model...\nDate: {}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
    managers = tuple(unique_managers(current_year))
    sql = "select * from (select week, year, manager1_name, manager2_name, team1_points, team1_projected, team2_points, team2_projected, type \
           from scoreboard_all WHERE team1_points > 0 and week<=13 \
           UNION select week, year, manager2_name AS manager1_name, manager1_name as manager2_name, team2_points AS team1_points, \
           team2_projected AS team1_projected, team1_points as team2_points, team1_projected AS team2_projected, type FROM scoreboard_all \
           where team1_points>0 and week<=13) order by year, week, type;"
    ff1 = download_data(os.path.join(os.getcwd(), 'data/fantasy_football.db'), sql)
    data_features = custom_features(ff1)
    data_features = data_features[(data_features.manager1_name.isin(managers)) &
                                  (data_features.manager2_name.isin(managers))]
    X, y, managers, league_type = dummy_and_interaction(data_features)
    # feats = X.columns.tolist()

    sc = StandardScaler()
    X_std = sc.fit_transform(X)
    pickle_model(sc, 'standard.scaler')

    # Select best features
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.25, random_state=None)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    # imports = model.feature_importances_.tolist()
    # g = zip(feats, imports)
    # feat_importance = sorted(g, key=lambda x: x[1], reverse=True)
    # print(feat_importance)
    X_train_trans, X_test_trans = feature_selection(model, X_train, X_test,
                                                    y_train, y_test, eval_metric='auc')

    # Select best params
    model = XGBClassifier()
    learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
    n_estimators = [50, 100, 150, 200, 250, 300]
    param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)
    grid_search = GridSearchCV(model, param_grid, scoring="log_loss", cv=10, verbose=1)
    result = grid_search.fit(X_train_trans, y_train)
    print("Best: {0} using {1}".format(result.best_score_, result.best_params_))
    print('Best params: ', result.best_params_)

    best_est = result.best_estimator_
    validation = best_est.predict_proba(X_train_trans)
    print("Roc AUC Train: ", roc_auc_score(y_train, validation[:, 1], average='macro'))
    probs = best_est.predict_proba(X_test_trans)
    print("Roc AUC Validation: ", roc_auc_score(y_test, probs[:, 1], average='macro'))
    pickle_model(best_est, 'fantasy.predict')
def xgboostcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight,
              max_delta_step, subsample, colsample_bytree,
              silent=True, nthread=-1, seed=1234):
    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        max_delta_step=max_delta_step,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        seed=seed,
                        objective="binary:logistic")
    # Hold out 10% of the data for early stopping to limit over-fitting
    X_train, X_valid, y_train, y_valid = train_test_split(train, train_labels,
                                                          test_size=0.1,
                                                          random_state=seed)
    xgb_model = clf.fit(X_train, y_train, eval_metric="auc",
                        eval_set=[(X_valid, y_valid)],
                        early_stopping_rounds=20)
    y_pred = xgb_model.predict_proba(X_valid)[:, 1]
    return auc(y_valid, y_pred)
def train_model_xgb_meta(train_x, train_y, xgb_features): train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.2) for train_index, test_index in train_ind: x_train = train_x.ix[train_index, :] y_train = train_y.ix[train_index] x_eval = train_x.ix[test_index, :] y_eval = train_y.ix[test_index] #Classifier xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic', subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight']) # gives 0.458 # bag_clf = BaggingClassifier(xgb, max_samples=10, warm_start=True, verbose=10) # x_train = pd.DataFrame(x_train, dtype=float) # bag_clf.fit(x_train, y_train) xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss', eval_set=[(x_eval, y_eval)], early_stopping_rounds=10) # cv_score = cross_val_score(xgb, x_train, y_train, cv=4, n_jobs=1, pre_dispatch=1, verbose=10, scoring='log_loss') # print(cv_score) # print(np.mean(cv_score)) # predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb') return xgb # , predictions
def runner():
    m = Model()
    X = m.df.drop("tred_cutoff", axis=1)
    Y = m.df["tred_cutoff"]
    features_inner = m.features + m.features_2
    cv(X, Y, features_inner)

    model = XGBClassifier()
    model.fit(X, Y)
    y_pred = model.predict(m.X_test)

    s_f = f1_score(m.y_test, y_pred)
    s_p = precision_score(m.y_test, y_pred)
    s_r = recall_score(m.y_test, y_pred)
    print("test f1", s_f)
    print("test precision", s_p)
    print("test recall", s_r)
def main():
    titanic = pandas.read_csv('dataset/titanic.csv')
    x_set = titanic[['pclass', 'age', 'sex']]
    y_set = titanic['survived']
    x_set.fillna(x_set['age'].mean(), inplace=True)
    x_train, x_test, y_train, y_test = utils.prepare_train_and_test_sets(x_set, y_set)

    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='record'))
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='record'))

    decision_tree_classifier = DecisionTreeClassifier()
    utils.get_trained_result(decision_tree_classifier, x_test, x_train, y_test, y_train)

    xgb_classifier = XGBClassifier()
    xgb_classifier.fit(x_train, y_train)
    utils.get_trained_result(xgb_classifier, x_test, x_train, y_test, y_train)
def trainXGB(data_subset): f.write('\nTraining XGB:'+'\n') X_train = data[data_subset]['X_train'] X_test = data[data_subset]['X_test'] y_train = data[data_subset]['y_train'] y_test = data[data_subset]['y_test'] for p in params['xgboost']: if data_subset != 'binary' and p['objective'] == 'binary:logistic': print("Skip using non-binary data with XGB binary:logistic objective") continue if data_subset == 'binary' and p['objective'] != 'binary:logistic': print("Skip using binary data with XGB multi:* objective") continue header = "@ subset: {0}, params: {1}".format(data_subset, p) f.write('\n'+header+'\n') objective = p['objective'] max_depth = p['max_depth'] try: n_estimators= p['n_estimators'] except KeyError as e: n_estimators= 100 model = XGBClassifier(objective=objective, max_depth=max_depth, n_estimators=n_estimators) start = time.time() model.fit(X_train, y_train) elapsed_train = time.time() - start y_pred = model.predict(X_test).astype(int) elapsed_predict = time.time() - start accuracy = accuracy_score(y_test, y_pred) precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label=2, average='weighted') print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators)) f.write(str(elapsed_train) + ', ' + str(elapsed_predict) + str(accuracy)+ ', ' + str(precision)+ ', ' + str(recall )+ ', ' + str(fscore )+ ', ' + str(support))
def get_thresh(model, train, test, label_test, label_train):
    if (len(test) > len(train)) or (len(label_test) > len(label_train)):
        raise TypeError('Invalid train and test size')
    model1 = XGBClassifier()
    if type(model) != type(XGBClassifier()):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1] > 1) or (pd.DataFrame(label_test).shape[1] > 1):
        raise TypeError('Multiple columns in label, Invalid shape.')

    max_score = 0
    thrsh = 0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)
        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr = metrics.roc_auc_score(label_test, y_pred)
        if scr > max_score:
            max_score = scr
            thrsh = thresh
    return thrsh
def test_on_data(X, y):
    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2333)
    print("train set: {}, test set: {}".format(len(x_train), len(x_test)))

    cls = XGBClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print("xgb accuracy score test", accuracy_score(y_test, pred))
    # on all
    pred = cls.predict(X)
    print("xgb accuracy score all", accuracy_score(y, pred))

    # compare to gbrt in sklearn
    cls = GradientBoostingClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print("sklearn accuracy score test", accuracy_score(y_test, pred))
    # on all
    pred = cls.predict(X)
    print("sklearn accuracy score all", accuracy_score(y, pred))
def train(imgfile='img/segmentation', modelfile='segmentation.pkl'):
    filelabel = getFiles(imgfile)
    row = 120
    col = 40
    data = filter(lambda z: z is not None,
                  map(lambda x: Img(x[1], row, col, x[0]).imgmap, filelabel))
    data = filter(lambda x: x[0] is not None, sum(data, []))
    label = np.array(map(lambda x: CHARACTER.get(x[0]), data))
    feature = np.array(map(lambda x: np.array(x[1]), data))

    from xgboost import XGBClassifier
    xgb = XGBClassifier(objective='multi:softmax', reg_alpha=1.0, reg_lambda=0.0,
                        subsample=0.7, n_estimators=100, learning_rate=0.3)
    model = xgb.fit(feature, label, eval_set=[(feature, label)], eval_metric='mlogloss')

    import pickle
    fn = modelfile
    with open(fn, 'wb') as f:  # open file in binary write-mode for pickling
        pickle.dump(model, f)
def train_model_xgb(train_x, train_y, xgb_features): train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.1) for train_index, test_index in train_ind: x_train = train_x.ix[train_index, :] y_train = train_y.ix[train_index] x_eval = train_x.ix[test_index, :] y_eval = train_y.ix[test_index] #Classifier xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic', subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight']) # gives 0.458 xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss', eval_set=[(x_eval, y_eval)], early_stopping_rounds=10) predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb') return xgb, predictions
new_test_data_handle = test_datas.drop(to_drop, axis=1) return new_test_data_handle train_data = pd.read_csv('../../data/train.csv') test_data = pd.read_csv('../../data/test.csv') pro_datas, target = pro_train_data(train_data) pre_datas = pro_test_data(test_data) X_train, X_test, y_train, y_test = train_test_split(pro_datas, target, test_size=0.20, random_state=RANDOM_STATE) xgcmodel = XGBClassifier(n_estimators=48, max_depth=4, n_jobs=-1, random_state=RANDOM_STATE) xgcmodel.fit(X_train, y_train) train_predictions = xgcmodel.predict(X_test) print(classification_report(y_test, train_predictions, digits=4)) predictions = pd.DataFrame(xgcmodel.predict(pre_datas), columns=['Survived']) predictions = pd.concat([test_data['PassengerId'], predictions], axis=1, join='inner') predictions.to_csv('predictions.csv', index=False)
from sklearn.metrics import accuracy_score

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model on training data
model = XGBClassifier()
eval_set = [(X_test, y_test)]
# Specify a window of the number of epochs over which no improvement is observed;
# this is the early_stopping_rounds parameter.
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss",
          eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
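# When early_stopping_rounds is used as above, the fitted sklearn wrapper keeps
# track of the best boosting round on the evaluation set. A minimal sketch of
# inspecting that information afterwards (attribute names as in recent xgboost
# releases; treat them as an assumption for older versions):
print("best round:", model.best_iteration)   # index of the best boosting round
print("best logloss:", model.best_score)     # eval metric value at that round
history = model.evals_result()               # full per-round evaluation history
print(len(history['validation_0']['logloss']), "rounds were actually trained")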
## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                     shuffle=True, random_state=66)

## modeling
model = XGBClassifier(n_estimators=1000,   # number of boosting rounds (comparable to epochs)
                      learning_rate=0.1)

model.fit(x_train, y_train, verbose=True, eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])
# available eval_metric values: rmse, mae, logloss, error (error of 0.2 means accuracy of 0.8), auc

results = model.evals_result()
# print("eval's result : ", results)

y_pred = model.predict(x_test)

acc = accuracy_score(y_pred, y_test)
print("acc: ", acc)

# r2 = r2_score(y_pred, y_test)
# print("r2: %.2f" %(r2 * 100.0))

import pickle
pickle.dump(model, open("./model/xgb_save/cancer.pickle.dat", "wb"))
training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values, n_iter=1, train_size=0.75, test_size=0.25, random_state=dataset_repeat))) training_features = input_data.loc[training_indices].drop('class', axis=1).values training_classes = input_data.loc[training_indices, 'class'].values testing_features = input_data.loc[testing_indices].drop('class', axis=1).values testing_classes = input_data.loc[testing_indices, 'class'].values # Create and fit the model on the training data try: clf = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth) clf.fit(training_features, training_classes) testing_score = clf.score(testing_features, testing_classes) except: continue param_string = '' param_string += 'learning_rate={},'.format(learning_rate) param_string += 'n_estimators={},'.format(n_estimators) param_string += 'max_depth={}'.format(max_depth) out_text = '\t'.join([dataset.split('/')[-1][:-7], 'XGBClassifier', param_string, str(testing_score)]) print(out_text)
def getvalues_and_recommend(): userid = 2552 shop1 = request.form['shop1'] rate1 = float(request.form['rate1']) shop2 = request.form['shop2'] rate2 = float(request.form['rate2']) shop3 = request.form['shop3'] rate3 = float(request.form['rate3']) shop4 = request.form['shop4'] rate4 = float(request.form['rate4']) shop5 = request.form['shop5'] rate5 = float(request.form['rate5']) shop6 = request.form['shop6'] rate6 = float(request.form['rate6']) shop7 = request.form['shop7'] rate7 = float(request.form['rate7']) shop8 = request.form['shop8'] rate8 = float(request.form['rate8']) shop9 = request.form['shop9'] rate9 = float(request.form['rate9']) shop10 = request.form['shop10'] rate10 = float(request.form['rate10']) #creating a new spark session newspark = SparkSession.builder.appName('hybrid_rec').getOrCreate() #reading in prepped dataset for model-based collaborative filtering recommendation mbcf = newspark.read.csv('mbcf.csv', header=True, inferSchema=True) #making a copy for each new user input mbcf_try = mbcf vals = [(shop1,rate1,userid),(shop2,rate2,userid),(shop3,rate3,userid),(shop4,rate4,userid),(shop5,rate5,userid),(shop6,rate6,userid),(shop7,rate7,userid),(shop8,rate8,userid),(shop9,rate9,userid),(shop10,rate10,userid)] #pyspark's convention to adding new rows to the end of an existing spark dataframe-1 newRows = newspark.createDataFrame(vals,mbcf_try.columns) #pyspark's convention to adding new rows to the end of an existing spark dataframe-2 mbcf_try = mbcf_try.union(newRows) #converting df to pandas df for easier manipulation later on... mbcf_try_pd = mbcf_try.toPandas() #getting a look again at the outlets and ratings provided by userid2552 so we know which outlets to exclude in recommending outlets to userid2552 later on... user_item_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552] #as part of ALS requirements for the feature columns to be in numerical format, am converting both shops and userids to the double precision format just in case (even though userids is already in a float format) indexer_try = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(mbcf_try.columns)-set(['ratings']))] pipeline_try = PL(stages=indexer_try) transformed_try = pipeline_try.fit(mbcf_try).transform(mbcf_try) #rank=300 and regParam=0.1 was a pair of tuned best params while retuning als with train test split stratified for userids... als = ALS(rank=300, regParam=0.1, maxIter=20, seed=42, userCol='userids_index',itemCol='shops_index', ratingCol='ratings',coldStartStrategy='drop') #training the dataset containing the new user's ratings... 
als_model_rec = als.fit(transformed_try) #making recommendations for model-based collaborative filtering alone first, passing in all 981 outlets so as to ensure as much overlap between collaborative filtering and content-based filtering in the outlets that they generate rating predictions for recs=als_model_rec.recommendForAllUsers(981).toPandas() nrecs=recs.recommendations.apply(pd.Series) \ .merge(recs, right_index = True, left_index = True) \ .drop(["recommendations"], axis = 1) \ .melt(id_vars = ['userids_index'], value_name = "recommendation") \ .drop("variable", axis = 1) \ .dropna() nrecs=nrecs.sort_values('userids_index') nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['userids_index']], axis = 1) nrecs.columns = [ 'Shop_index', 'Rating', 'UserID_index' ] md=transformed_try.select(transformed_try['userids'],transformed_try['userids_index'],transformed_try['shops'],transformed_try['shops_index']) md=md.toPandas() dict1=dict(zip(md['userids_index'],md['userids'])) dict2=dict(zip(md['shops_index'],md['shops'])) nrecs['UserID']=nrecs['UserID_index'].map(dict1) nrecs['shops']=nrecs['Shop_index'].map(dict2) nrecs=nrecs.sort_values('UserID') nrecs.reset_index(drop=True, inplace=True) new=nrecs[['UserID','shops','Rating']] new['recommendations'] = list(zip(new.shops, new.Rating)) res=new[['UserID','recommendations']] res_new=res['recommendations'].groupby([res.UserID]).apply(list).reset_index() #creating a new df for userid2552's collaborative filtering-derived recommendations collab_rec_2552 = pd.DataFrame(dict(res_new[res_new["UserID"]==2552]['recommendations'].tolist()[0]),index=[0]).T.sort_values(0,ascending=False) #creating a list of outlets userid2552 has rated earlier on rated_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552]['shops'].tolist() #filtering out those 10 outlets userid2552 has rated initially from the collaborative filtering recommendation list... collab_rankedrecs_2552 = collab_rec_2552.loc[[shop for shop in collab_rec_2552.index if shop not in rated_2552],0] #organizing the above series column into a df of recommendations and collaborative filtering rating predictions collab_2552_df = pd.DataFrame({'recommendations':collab_rankedrecs_2552.index,'collab_filter_predicted_ratings':collab_rankedrecs_2552}) #reading in the previously prepped df meant for content-based filtering here for content-based filtering recommendations.. content_f = pd.read_csv('content_based_df_nouser.csv') #merging userid2552's info with the df meant for content-based filtering so that rcontent-based filtering can make recommendations via rating predictions for userid 2552 later on... content_2552 = pd.merge(content_f,user_item_2552,how='left',on='shops') #getting dummies for categorical columns... content_2552_wdummies = pd.get_dummies(content_2552, columns=['shops','category_alias'], drop_first=False) #setting feature and target X = content_2552_wdummies.drop(['ratings'], axis=1) y = content_2552_wdummies['ratings'] #collating dummified columns shops_cats_list = [col for col in content_2552_wdummies.columns if (col.startswith('shops')) or (col.startswith('category'))] #extending with review_count and rating shops_cats_list.extend(['review_count','rating','userids']) #as tfidf can only work on one column of texts at a time, am separating features as below... X1 = X['reviews'] X2 = X[shops_cats_list] #Assigning a new variable name to X1 for processing. 
rev = X1 #creating customized stop words' list cust_stop_words = [word for word in stop_words.ENGLISH_STOP_WORDS] #adding on to the above list based on preliminary word cloud EDA cust_stop_words.extend(["wa","ha","just","ve","did","got","quite"]) #preprocessing text in reviews by defining a function to do so lemm = WordNetLemmatizer() def text_processer(raw_text): # Function to convert a raw string of text to a string of words # The input is a single string (a raw unprocessed text), and # the output is a single string (a preprocessed text) # 1. Remove http urls. review_text = re.sub("\(http.+\)", " ", raw_text) # 2. Remove non-letters. letters_only = re.sub("[^a-zA-Z]", " ", review_text) # 3. Convert to lower case, split into individual words. words = letters_only.lower().split() # 4. Lemmatize words. lemmed_words = [lemm.lemmatize(i) for i in words] # 5. Remove stop words. meaningful_words = [w for w in lemmed_words if not w in cust_stop_words] # 6. Join the words back into one string separated by space, # and return the result. return(" ".join(meaningful_words)) #showing how the processed reviews look like rev_processed = pd.Series([text_processer(text) for text in rev]) #using tfidf vectorizer to convert the reviews into term frequency columns... tvec_naive = TfidfVectorizer(stop_words = cust_stop_words) #instantiating TfidfVectorizer with customized stop words X1_tvec_naive = tvec_naive.fit_transform(rev_processed).todense() #fitting tvec and transforming the processed reviews X1_tvec_naive_df = pd.DataFrame(X1_tvec_naive, columns = tvec_naive.get_feature_names()) #converting it into a dataframe for easy lookup. #combining tvec-df with the rest of the features for rating prediction for userid 2552 later on... X_legit = pd.concat([X1_tvec_naive_df,X2], axis=1) #adding back the column of ratings so that it can be dropped below-sorry sometimes my train of thought may sound illogical X_legit['ratings'] = y #creating X_train manually for userid 2552 X_train_2552 = X_legit[X_legit['userids']==2552].drop(['ratings','userids'],axis=1) #creating y_train manually for userid 2552 y_train_2552 = X_legit[X_legit['userids']==2552]['ratings'] #creating X_test manually for userid 2552 which contains all outlets that have not been rated by userid 2552 X_test_2552 = X_legit[X_legit['userids']!=2552].drop(['ratings','userids'],axis=1) #instantiate scaler since not all of the features are of the same scale, eg. review_count and rating ss= StandardScaler() #fitting the train and transforming both the train and test sets X_train_2552_sc = ss.fit_transform(X_train_2552) X_test_2552_sc = ss.transform(X_test_2552) #learning rate, max depth, and n_estimators were retrieved from a tuned xgb model (notebook on future plan for xgb) saved in the folder but in order to use random_state which was not used during tuning, I am just instantiating a new xgb instance with the 3 tuned hyperparams set accordingly... xgb = XGBClassifier(learning_rate=0.5, max_depth=9, n_estimators=200, random_state=42) #training the loaded model on the dataset containing the new user, userid 2552's ratings. 
xgb.fit(X_train_2552_sc, y_train_2552) #stacking X_test_2552 as first step in regenerating the shops column for predictions trial = X_test_2552.stack() #creating loop to re-generate original X_test_2552 order of shops index_lst = [] outlets_lst = [] for n in range(len(trial.index)): if trial.index[n][1].startswith('shops_') and trial[n]!=0: index_lst.append(str(trial.index[n][0])) outlets_lst.append(trial.index[n][1]) index_lst = [int(x) for x in index_lst] reconstructed_X_test_2552 = pd.DataFrame({'shops':outlets_lst}, index=index_lst) #generating content-based filtering rating predictions for userid 2552 rating_predictions = xgb.predict(X_test_2552_sc) #adding new column of rating predictions into the reconstructed X_test_2552 reconstructed_X_test_2552['predicted_ratings']=rating_predictions #giving the reconstructed df a more easily understood name for distinction from the collaborative filtering df dealt with above content_2552_df = reconstructed_X_test_2552 #trimming off the shops' prefixes so that they can eventually be merged with the collaborative filtering df content_2552_df['shops'] = content_2552_df['shops'].apply(lambda x: x[6:]) #renaming the column of rating predictions to distinguish from collaborative filtering's prediction column later on when both dfs are merged. content_2552_df.rename(columns={'predicted_ratings':'content_filter_predicted_ratings'},inplace=True) #renaming collaborative filtering df's recommendations' column so that it can be merged with the content-based filtering df. collab_2552_df.rename(columns={'recommendations':'shops'},inplace=True) #reseting the index in the collaborative filtering df so that the index is numerical again collab_2552_df.reset_index(drop=True,inplace=True) #merging both content-based filtering and collaborating filtering df to prepare to make hybrid recommendations for userid 2552 content_collab_2552_df = pd.merge(content_2552_df,collab_2552_df,how='inner',on='shops') #as mentioned in the previous sub-notebook on this hybrid recommender's evaluation, the following are the content-based and collaborative filtering's ratings' weights con_wt = 0.97 / (0.97 + 1.0) collab_wt = 1.0 / (0.97 + 1.0) #feature engineering to add hybrid recommender's rating predictions into the combined df by multiplying the respective rating predictions by weights based on both models' f1 scores derived from prior evaluation and summing them up to yield hybrid predictions content_collab_2552_df['final_weighted_rating_predictions'] = (content_collab_2552_df['content_filter_predicted_ratings']*con_wt) + (content_collab_2552_df['collab_filter_predicted_ratings']*collab_wt) #top 5 coffee-drinking outlet recommendations for userid 2552 (me!) based on my ratings given rather randomly to 10 of the outlets earlier on... 
#recommendations_top_5 = content_collab_2552_df.sort_values('final_weighted_rating_predictions',ascending=False).head() top_5_recs = content_collab_2552_df[['shops','final_weighted_rating_predictions']].sort_values('final_weighted_rating_predictions',ascending=False).head() top_5_recs.reset_index(drop=True,inplace=True) first = top_5_recs.loc[0,'shops'] second = top_5_recs.loc[1,'shops'] third = top_5_recs.loc[2,'shops'] fourth = top_5_recs.loc[3,'shops'] fifth = top_5_recs.loc[4,'shops'] return render_template('outcome.html', first=first, second=second, third=third, fourth=fourth, fifth=fifth, shop1=shop1, rate1=rate1, shop2=shop2, rate2=rate2, shop3=shop3, rate3=rate3, shop4=shop4, rate4=rate4, shop5=shop5, rate5=rate5, shop6=shop6, rate6=rate6, shop7=shop7, rate7=rate7, shop8=shop8, rate8=rate8, shop9=shop9, rate9=rate9, shop10=shop10, rate10=rate10, url_alias=url_alias)
def test_run(): #read data data = pd.read_csv('/Desktop/creditcard.csv') #get some correlations corr_matrix = data.corr() print('Correlations') print(corr_matrix["Class"].sort_values(ascending=False)) #select predictor variables and drop missing data df = data.loc[:, data.columns != 'Class'] df.dropna() #assign target variable (Class in this case) target = pd.DataFrame(data, columns=["Class"]) X = df y = target #solve dataset imbalances on dependent variable using SMOTEENN algorithm #sme = SMOTEENN(random_state=42) #X, y = sme.fit_sample(X, y) # Standardize features scaler = StandardScaler() X_std = scaler.fit_transform(X) X_train, X_test, Y_train, Y_test = model_selection.train_test_split( X_std, y, test_size=0.3, random_state=42) #Create model clf = XGBClassifier(max_depth=6, min_child_weight=1, eta=0.1, silent=1, objective='multi:softmax', num_class=2) # Train model model = clf.fit(X_train, Y_train.values.ravel()) # make predictions for test data y_pred = model.predict(X_test) predictions = [round(value) for value in y_pred] #print(predictions) print('Classification Report') print(classification_report(Y_test, predictions)) #confusion matrix print('Confusion Matrix') print(confusion_matrix(Y_test, predictions)) #k fold validation kfold = StratifiedKFold(n_splits=10, random_state=7) results = cross_val_score(clf, X_std, y, cv=kfold) print("Stratified K-Fold Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100)) # plot feature importance plot_importance(model) plt.show() # save model filename = '/Desktop/Credit_model.pkl' pickle.dump(model, open(filename, 'wb')) # predict values in original data, to see how our model's predictions compare with real values b = model.predict(X_std) # send predictions to csv, after merging them with original data df2 = pd.DataFrame(data={"predicted": b}) pd.set_option('display.max_colwidth', -1) data['Predicted'] = df2 data.to_csv(r"/Desktop/predicted.csv")
def fit_xgboost(params, X, y):
    clf = XGBClassifier(**params)
    clf.fit(X, y)
    return clf
X = dataset.iloc[:, :7] #Converting words to integer values # def convert_to_int(word): # word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8, # 'nine':9, 'ten':10, 'eleven':11, 'twelve':12, 'zero':0, 0: 0} # return word_dict[word] # X['experience'] = X['experience'].apply(lambda x : convert_to_int(x)) y = dataset.iloc[:, -1] #Splitting Training and Test Set #Since we have a very small dataset, we will train our model with all availabe data. # from sklearn.linear_model import LinearRegression # regressor = LinearRegression() from xgboost import XGBClassifier regressor = XGBClassifier() #Fitting model with trainig data regressor.fit(X, y) # Saving model to disk pickle.dump(regressor, open('model.pkl','wb')) # Loading model to compare the results # model = pickle.load(open('model.pkl','rb')) # print(model.predict([[2, 9, 6]]))
                    n_jobs=4,
                    eta=0.02,
                    gamma=0,
                    max_depth=8,
                    subsample=0.8715623,
                    colsample_bytree=0.9497036,
                    colsample_bylevel=0.8,
                    min_child_weight=39.3259775,
                    reg_alpha=0.041545473,
                    reg_lambda=0.0735294,
                    random_state=42,
                    n_estimators=10000)

#%%
clf.fit(train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='mlogloss', verbose=10, early_stopping_rounds=30)
pickle.dump(clf, open("xgb3.pickle", "wb"))

#%%
pred_valid_label = list(clf.predict(valid_x))
print('Accuracy_score %.6f' % accuracy_score(valid_y, pred_valid_label))

feats = [f for f in train.columns if f not in ['acc_id', 'label']]
importance_data = pd.DataFrame()
importance_data["feature"] = feats
importance_data["importance"] = clf.feature_importances_

#%%
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# class order: 2month, month, retained, week
cm = confusion_matrix(valid_y, pred_valid_label)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training XGBoost on the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))
# Divide each dataset into Indep Vars and Dep var
Train_X = Train.drop('is_click', axis=1).copy()
Train_Y = Train['is_click'].copy()
Test_X = Test.drop('is_click', axis=1).copy()
Test_Y = Test['is_click'].copy()

##########################################
# Fitting the XGBoost to the training set
##########################################
from xgboost import XGBClassifier
classifier = XGBClassifier()
XGB_Model = classifier.fit(Train_X, Train_Y)

########################
# Predict on testset
########################
y_pred = XGB_Model.predict(Test_X)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Test_Y, y_pred)
cm
#############################
def XGBoosting(trainData, trainLable):
    clf = XGBClassifier()
    clf.fit(trainData, trainLable)
    return clf
def build_model():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
    model = XGBClassifier()
    result = model.fit(X_train, y_train)
    return result
end_time2 = timeit.default_timer()  # record timestamp

g_best_model = model.best_estimator_.named_steps["anyway"]
g_feature_importance = model.best_estimator_.named_steps["anyway"].feature_importances_
print("g_feature_importance:\r\n", g_feature_importance)

# Sort the feature importances obtained with the best params,
# then find the threshold that gives the highest score.
thresholds = np.sort(g_feature_importance)  # ascending by default
temp_array = []
for thresh in thresholds:
    selection = SelectFromModel(g_best_model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)

    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    # print('Thresh=%.6f, n=%d, score:%.6f'
    #       % (thresh, select_x_train.shape[1], score))
    temp_array.append([thresh, score])

# Sort temp_array ascending by score and use the thresh of the last
# (highest-scoring) entry.
# print("temp_array:\r\n", temp_array)
temp_array.sort(key=lambda x: x[1])
# print("temp_array:\r\n", temp_array)
feature_thresh = temp_array[-1][0]
print("feature_thresh:", feature_thresh)
y_pred_proba_LR = classifier_LR.predict_proba(X_test)[::,1] fpr2, tpr2, _ = metrics.roc_curve(ytest, y_pred_proba_RF) auc2 = metrics.roc_auc_score(ytest, y_pred_proba_RF) print(auc2) # ROC plt.figure(figsize=(10,7)) plt.plot([0, 1], [0, 1], 'k--') plt.plot(fpr2,tpr2,label="Logistic Regression, auc="+str(round(auc2,2))) ## XGboost #Running the Model from xgboost import XGBClassifier classifier_XGB=XGBClassifier() classifier_XGB.fit(X_train,ytrain) # Predicting the results y_pred=classifier_XGB.predict(X_test) #Making the Confusion Matrix confusion_matrix = pd.crosstab(ytest, y_pred, rownames=['Actual'], colnames=['Predicted'], margins = True) print(confusion_matrix) sns.heatmap(confusion_matrix,annot=True,fmt='d',linewidths=.9) #Model Performance # Model Accuracy from sklearn import metrics print("Accuracy:",metrics.accuracy_score(ytest, y_pred))
'subsample': 1, 'colsample_bytree': 1, 'colsample_bylevel': 1, 'learning_rate': 0.0536444221653737, 'gamma': 8.491520978228445, 'max_depth': 3, 'min_child_weight': 1, 'max_delta_weight': 12, 'rate_drop': 0.9445947559908133} # In[15]: xgb_model = XGBClassifier(**xgboost_params) xgb_model.fit(x_pci_train, y_pci_train) y_pci_pred = xgb_model.predict(x_pci_test) predictions = [round(value) for value in y_pci_pred] accuracy = accuracy_score(y_pci_test, predictions) print(1-accuracy) # # LGBM # In[16]: import lightgbm import lightgbm as lgb from lightgbm import LGBMClassifier
max_iter=2000, momentum=0.87) clf7 = LogisticRegression(solver='saga') clf8 = KNeighborsClassifier(n_neighbors=3) clf9 = KNeighborsClassifier(n_neighbors=5) clfA = KNeighborsClassifier(n_neighbors=7) clfB = KNeighborsClassifier(n_neighbors=9) clfC = GaussianNB() clfD = LinearDiscriminantAnalysis() clfE = AdaBoostClassifier(n_estimators=500) clfF = XGBClassifier(n_estimators=500, objective='binary:logistic', gamma=7) ''' new_X_train, new_X_test = construct_metafeatures( [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clfA, clfB, clfC, clfD, clfE], X_train, X_test, y_train) ''' n_fold = 10 train1, test1 = Stacking(clf1, X_train, y_train, X_test, n_fold) #train2, test2 = Stacking(clf2, X_train, y_train, X_test, n_fold) print(train1.shape, test1.shape) exit() new_X_train = np.concatenate((train1, train2), axis=1) new_X_test = np.concatenate((test1, test2), axis=1) clfF.fit(new_X_train, y_train) pred = clfF.predict(new_X_test) scr = accuracy_score(y_test, pred) print('Meta classifier score: {:.4f}'.format(scr))
"name": i, "id": i } for i in ['index', value.lower()]], describe_df[['index', value.lower() ]].to_dict('records') iris = datasets.load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) bst = XGBClassifier(max_depth=1, silent=True, objective='multi:softprob') bst.fit(X_train, y_train) preds = bst.predict(X_test) with open("model.txt", "rb") as f: model = pickle.loads(f.read()) confusion_matrix = confusion_matrix(y_test, preds) feature_importance_graphs = list() for importance_type in [ 'weight', 'gain', 'cover', 'total_gain', 'total_cover' ]: curr_importances = bst.get_booster().get_score( importance_type=importance_type) curr_importances = { k: v for k, v in sorted(curr_importances.items(), key=lambda item: item[1]) }
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)

model = XGBClassifier(n_estimators=1000, learning_rate=0.1, n_jobs=-1,
                      objective='multi:softmax')
# model = MultiOutputClassifier(xgb)

# model.fit(x_train, y_train, verbose=True, eval_metric="error",
#           eval_set=[(x_train, y_train), (x_test, y_test)])
model.fit(x_train, y_train, verbose=True,
          eval_metric=["mlogloss", "merror"],   # rmse, mae, logloss, error, auc
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

result = model.evals_result()
print(result)

y_pred = model.predict(x_test)
r2 = r2_score(y_pred, y_test)
print(f"r2: {r2}")

epochs = len(result['validation_0']['mlogloss'])
x_axis = range(0, epochs)
                    colsample_bylevel=1, gamma=0, colsample_bytree=1, max_delta_step=0,
                    min_child_weight=1, missing=None, reg_alpha=0, reg_lambda=1,
                    scale_pos_weight=1, seed=0, subsample=1)

model.feature_names = feature_names
print(model)
model.fit(X_train, np.ravel(Y_train))

# save model
model_save_name = save_directory + '/antgc_' + signal + '_bdt'
model._Booster.dump_model(model_save_name + '.xgb')
model._Booster.save_model(model_save_name + '_bin.xgb')
pk.dump(model, open(model_save_name + '.pickle', 'wb'))
print('Saved model ' + model_save_name + '(*.xgb, *.pickle)')

# save train and test sets
train_save_file = save_directory + '/train_set' + signal + '.txt'
test_save_file = save_directory + '/test_set' + signal + '.txt'
train_save = np.append(X_train, Y_train, axis=1)
test_save = np.append(X_test, Y_test, axis=1)
np.savetxt(train_save_file, train_save, delimiter=",")
np.savetxt(test_save_file, test_save, delimiter=",")
learning_rate = [0.001, 0.01, 0.1, 0.2] xgb_params = dict(n_estimators=n_estimators, learning_rate=learning_rate) kfold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=10) grid_search = GridSearchCV(model, xgb_params, scoring="neg_log_loss", n_jobs=-1, cv=kfold_cv) grid_result = grid_search.fit(X_train, label_encoded_y_train) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param))''' xgb = XGBClassifier(learning_rate=0.01, n_estimators=100) xgb.fit(X_train, y_train) xgb_preds = xgb.predict(X_test) print(classification_report(y_test, xgb_preds)) conf_matrix_xgb = metrics.plot_confusion_matrix(xgb, X_test, y_test, cmap=plt.cm.Blues) conf_matrix_xgb.ax_.set_title("XGBoost Confusion Matrix") ''' LOGISITIC REGRESSION ''' '''clf_lr_10 = LogisticRegression(solver='liblinear', random_state=0) clf_lr_10.fit(X_train_10, y_train_10)''' lr = LogisticRegression(solver='liblinear', random_state=10) lr.fit(X_train, y_train)
from xgboost import XGBClassifier # Recall cv/hyper parameter tuning data sets #hyper_train_features_df, hyper_tuning_features_df, hyper_train_labels_df, hyper_tuning_labels_df xgb_model = XGBClassifier(eval_metric='logloss', random_seed=11, logging_level='Silent', nan_mode='Min') evaluation_set = [(hyper_train_features_df, hyper_train_labels_df), (hyper_tuning_features_df, hyper_tuning_labels_df)] xgb_model.fit(hyper_train_features_df, hyper_train_labels_df, eval_set=evaluation_set) print("XGboost On Test Data") print( classification_report(test_labels_df, xgb_model.predict(test_features_df))) #~~~~~~~~~~~~~~~~~~~~~~~~~~ # 5. Using a Neural Network #~~~~~~~~~~~~~~~~~~~~~~~~~~ # Set up a neural network with one input layer of 768 input neurons and an output layer of 2 neurons nnmodel = Sequential() # Add layers - defining number of input and output nodes appropriately # Note the softmax activation function on the output layer to convert output to a probability
target = df['TARGET']
del df['TARGET']
id = df_test['ID']

from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)
df_test.drop(to_remove, axis=1, inplace=True)

variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)
df_test = variance_threshold.transform(df_test)  # transform (not re-fit) the test set

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4, scale_pos_weight=.8)
m2_xgb.fit(df, target, eval_metric='auc')

param_dist = {
    "n_estimators": [80, 100, 110, 130],
    "max_depth": [3, 4, 5],
    "scale_pos_weight": [0.8, 1, 1.2],
    "learning_rate": [0.1, 0.05, 0.02],
}

randomizedSearch = RandomizedSearchCV(m2_xgb, n_iter=20, param_distributions=param_dist, verbose=2)
randomizedSearch.fit(df, target)
best = randomizedSearch.best_estimator_
print(randomizedSearch.best_params_)

scores = cross_validation.cross_val_score(best, df, target, cv=5, scoring='roc_auc')
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
y = dataset[:, 8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model)
pyplot.show()
log_model = LogisticRegression(C=1, penalty="l1", solver="liblinear", random_state=7).fit(X_train, y_train) model = SelectFromModel(log_model, prefit=True) X_new = model.transform(X_train) selected_features = pd.DataFrame(model.inverse_transform(X_new), index=X_train.index, columns=X_train.columns) sel_col = selected_features.columns[selected_features.var() != 0] #print(sel_col) #clf = XGBClassifier() clf = XGBClassifier(n_estimators=100, learning_rate=0.3) clf.fit(X_train, y_train) #prediction on the test set y_pred = clf.predict(X_val) # round float and convert to int y_pred = y_pred.round(0) y_pred = y_pred.astype(int) # Calculating F1 Score f1 = f1_score(y_val, y_pred, average='macro') print("F1 score of the model is :", f1) submission = clf.predict(final_test) submission2 = submission.round(0)
# RandomForest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train3, y_train3)
rfc_y_predict = rfc.predict(X_test3)
print('rfc accuracy:', rfc.score(X_test3, y_test3))
print(classification_report(y_test3, rfc_y_predict, target_names=['died', 'survived']))

# GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train3, y_train3)
gbc_y_predict = gbc.predict(X_test3)
print('gbc accuracy:', gbc.score(X_test3, y_test3))
print(classification_report(y_test3, gbc_y_predict, target_names=['died', 'survived']))

# xgboost
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train3, y_train3)
print('xgbc accuracy:', xgbc.score(X_test3, y_test3))
x = dataset.data
y = dataset.target
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)

model = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# model.fit(x_train, y_train, verbose=True, eval_metric="error",
#           eval_set=[(x_train, y_train), (x_test, y_test)])
model.fit(x_train, y_train, verbose=True,
          eval_metric="rmse",   # rmse, mae, logloss, error, auc
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

result = model.evals_result()
# print(result)

y_pred = model.predict(x_test)
r2 = r2_score(y_pred, y_test)
acc = accuracy_score(y_pred, y_test)
print(f"acc : {acc}")

# pickle.dump(model, open("./model/xgbsave/cancer.pickle.dat", "wb"))
def main(): args = parse_args() config = parse_config(args.config_file) if config is None: print('No configuration file is defined. ' 'Define one with `--config-file`.') sys.exit(1) # read dataset files = config['files'] if 'filepath' in config: files = [config['filepath'] + f for f in files] kwargs = config['pandas_kwargs'] print('Reading ', end='') entries = 0 for f in files: rootfile = ROOT.TFile(f) tree = rootfile.Get(kwargs['key']) entries += tree.GetEntries() maxslices = args.max_slices chunksize = kwargs['chunksize'] total = (maxslices if maxslices is not None and maxslices < (entries / chunksize) else (entries / chunksize)) print(total * chunksize, 'events.') df = pd.concat([ df for df in tqdm( islice( read_root(files, flatten=True, **kwargs), maxslices), total=total)]) # rename the tagging particle branches df.rename(columns=dict(zip(df.columns, [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_') for c in df.columns])), inplace=True) df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str) if 'invert_target' in config and config['invert_target']: df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID) else: df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID) # read features and selections try: if 'inclusive_mva_features' in config: mva_features = ['tp_' + f for f in config['inclusive_mva_features']] else: mva_features = ['tp_' + f.split(' ')[0] for f in config['selections']] except: raise ValueError('Tried to parse features for the BDT.' ' Either provide well-formatted `selections` or' ' define a `inclusive_mva_features` set.') # build BDT model and train the classifier n_cv x 3 times xgb_kwargs = config['xgb_kwargs'] n_jobs = config['n_jobs'] bootstrap_scores = [] bootstrap_d2s = [] nfold = (args.bootstrap_folds if args.bootstrap_folds is not None else config['n_cv']) print('Starting bootstrapping.') pbar = tqdm(total=nfold * 3) for _ in range(nfold): # yield 3-fold split for CV df_sets = [df.iloc[indices] for indices in NSplit(df)] cv_scores = [] for i in range(3): df1, df2, df3 = (df_sets[i % 3].copy(), df_sets[(i + 1) % 3].copy(), df_sets[(i + 2) % 3].copy()) model = XGBClassifier(nthread=n_jobs, **xgb_kwargs) sample_weight = (df1.target if 'training_weights' in config and config['training_weights'] else None) model.fit(df1[mva_features], df1.target, sample_weight=df1.SigYield_sw) df2['probas'] = model.predict_proba(df2[mva_features])[:, 1] df2.reset_index(inplace=True, drop=True) df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy() df3['probas'] = model.predict_proba(df3[mva_features])[:, 1] df3.reset_index(inplace=True, drop=True) df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy() # calibrate calibrator = PolynomialLogisticRegression(power=4, solver='lbfgs', n_jobs=n_jobs) calibrator.fit(df2_max.probas.reshape(-1, 1), df2_max.target, sample_weight=df2_max.SigYield_sw) df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1] score = tagging_power_score(df3_max.calib_probas, tot_event_number=get_event_number(df3_max), sample_weight=df3_max.SigYield_sw) bootstrap_scores.append(score) bootstrap_d2s.append(d2_score(df3_max.calib_probas, sample_weight=df3_max.SigYield_sw)) pbar.update(1) pbar.close() print(dedent("""\ Final {}-fold bootstrap performance D2 = {:<6}% ε_eff = {:<6}%""") .format(nfold, 100 * ufloat(np.mean(bootstrap_d2s), np.std(bootstrap_d2s)), 100 * ufloat(np.mean(noms(bootstrap_scores)), np.std(noms(bootstrap_scores)))))
import numpy as np

col_y = cols[-1]  # name of the y variable
# plt.scatter(x=iris[:, 0], y=iris[:, 1], s=100, c=iris[:, 4], marker='o')
# raises "'(slice(None, None, None), 0)' is an invalid key" on a DataFrame,
# so use positional indexing with .iloc instead
y = np.array(iris[col_y])
plt.scatter(x=iris.iloc[:, 0], y=iris.iloc[:, 1], c=y, marker='o')

# Step 2: create train/test datasets
train_set, test_set = train_test_split(iris, test_size=0.25)

# Step 3: build the model from the training data
xgb = XGBClassifier()
model = xgb.fit(train_set[col_x], train_set[col_y])
model

# Step 4: generate predictions on the test data
y_pred = model.predict(test_set[col_x])
y_true = test_set[col_y]

y_pred2 = model.predict_proba(test_set[col_x])
y_pred2.shape  # (38, 3)
y_pred2
'''
array([[2.1746019e-03, 9.9590498e-01, 1.9204022e-03],
       [9.9528944e-01, 3.9060446e-03, 8.0451195e-04],
       [9.9526840e-01, 3.9059622e-03, 8.2559639e-04],
       ...
'''
X = np.hstack([train[good + goodx].values, train1.values])
Xt = np.hstack([test[good + goodx].values, test1.values])

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

names_categorical = []
cand = ['v40', 'v63', 'v109']
for name in train.columns.values:
    # treat low-cardinality columns (plus a few hand-picked ones) as categorical
    if train[name].value_counts().shape[0] < 1000 or name in cand:  # and name not in good
        train[name] = train[name].astype(str)
        test[name] = test[name].astype(str)
        names_categorical.append(name)
        print(name, train[name].value_counts().shape[0])

X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values())
Xt_sparse = vec.transform(test[names_categorical].T.to_dict().values())

idx = np.array(train.index)
del train
gc.collect()

X = sparse.hstack([X, X_sparse], format='csr')    # .toarray()
Xt = sparse.hstack([Xt, Xt_sparse], format='csr')
print(X.shape, y.shape, Xt.shape)

clf = XGBClassifier(max_depth=11, colsample_bytree=0.5, learning_rate=0.01,
                    n_estimators=1200, nthread=-1)
clf.fit(X, y)

idx = np.array(test.index)  # id_test
yp = clf.predict_proba(Xt).T[1]
s = pd.DataFrame({idname: idx, 'PredictedProb': yp})
s.to_csv('xgb10.csv', index=False)
random_state=55)
print(x_train.shape, x_test.shape)   # (56000, 154) (14000, 154)
print(y_train.shape, y_test.shape)   # (56000,) (14000,)

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  train_size=0.8, random_state=55)
# print(x_train.max)
# print(x_train.min)

#2. Modeling
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

#3. Compile, train
model.fit(x_train, y_train)

#4. Evaluate, predict
# XGBClassifier has no evaluate(); score() returns the test accuracy
acc = model.score(x_test, y_test)
print('acc : ', acc)

y_pred = model.predict(x_test[:10])
# print(y_pred)
print(y_test[:10])
print(np.argmax(y_test[:10], axis=-1))

# DNN
# (784, )
# loss : [0.09116600453853607, 0.9779000282287598]
# [7 2 1 0 4 1 4 9 5 9]
print('We have %d classes and %d models TOTAL so in resulting arrays '
      'we expect to see %d columns.'
      % (n_classes, len(models_1) + len(models_2),
         n_classes * (len(models_1) + len(models_2))))

# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]

print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                      n_estimators=100, max_depth=3)

# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
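# Hedged sketch (not from the original snippet): one plausible way to produce the
# '*.npy' files consumed above, saving a [train_preds, test_preds] pair per base
# model. `base_model` is hypothetical; X_train, y_train, X_test mirror the names used above.
import numpy as np
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier

base_model = XGBClassifier(n_estimators=100, max_depth=3, n_jobs=-1)

# out-of-fold probabilities for the training set, test probabilities from a full refit
S_train_model = cross_val_predict(base_model, X_train, y_train,
                                  cv=5, method='predict_proba')
S_test_model = base_model.fit(X_train, y_train).predict_proba(X_test)

# stored as an object array because the two blocks have different row counts;
# depending on the NumPy version, np.load may then need allow_pickle=True
np.save('model_xgb.npy', np.array([S_train_model, S_test_model], dtype=object))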
x_test = x_test.reshape(-1, x_test.shape[1] * x_test.shape[2])

pca = PCA(n_components=713)
x2_train = pca.fit_transform(x_train)
x2_test = pca.transform(x_test)   # transform only: reuse the PCA fitted on the training data

x_train, x_test, y_train, y_test = train_test_split(x2_train, y_train,
                                                    train_size=0.8, random_state=77)

start = time.time()

#2. Model
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

#3. Train
model.fit(x_train, y_train, eval_metric='logloss')

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print('acc : ', acc)

sec = time.time() - start
times = str(datetime.timedelta(seconds=sec)).split(".")
times = times[0]
print("Elapsed time : ", times)

# acc : 0.9579166666666666
# Elapsed time : 892.8674330711365
y22 = np.argmax(y_pred_model2, axis=1)
y_test22 = np.argmax(y_test, axis=1)
count = 0
for i in range(y22.shape[0]):
    if y22[i] == y_test22[i]:
        count += 1
print('Accuracy for model 2 : ' + str((count / y22.shape[0]) * 100))

X_train2, X_test2, y_train2, y_test2 = train_test_split(feature_all, y,
                                                        test_size=0.3, random_state=20)

########################### MODEL 3 ###########################
model3 = XGBClassifier()
model3.fit(X_train2, y_train2)
model3.evals_result()
score = cross_val_score(model3, X_train2, y_train2, cv=5)

y_pred3 = model3.predict(X_test2)   # predict on the matching split, since it is compared to y_test2
count = 0
for i in range(y_pred3.shape[0]):
    if y_pred3[i] == y_test2[i]:
        count += 1
print('Accuracy for model 3 : ' + str((count / y_pred3.shape[0]) * 100))

########################### TESTING ###########################
test_file_path = sys.argv[2]
X, sr = librosa.load(test_file_path, sr=None)
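# Hedged alternative (not from the original snippet): the manual counting loops above
# compute the same fraction as sklearn's accuracy_score.
from sklearn.metrics import accuracy_score

acc_model3 = accuracy_score(y_test2, y_pred3)
print('Accuracy for model 3 : ' + str(acc_model3 * 100))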
# In[97]:

xg_class = XGBClassifier(learning_rate=0.3, max_delta_step=0, max_depth=3,
                         min_child_weight=1, missing=None, n_estimators=7,
                         n_jobs=1, nthread=None, objective='binary:logistic',
                         random_state=0)

# In[98]:

xg_class.fit(xtrain, ytrain.values.ravel())
print(xg_class)

# In[99]:

x.dtypes

# In[100]:

test_sub1 = test_sub_m.copy()
test_sub1['Parch'] = test_sub1['Parch'].astype('int64')
test_sub1['male'] = test_sub1['male'].astype('int64')
test_sub1['female'] = test_sub1['female'].astype('int64')
test_sub1['S'] = test_sub1['S'].astype('int64')
test_sub1['PC1'] = test_sub1['PC1'].astype('int64')
test_sub1['PC2'] = test_sub1['PC2'].astype('int64')
trainingSet = train.iloc[itr]
validationSet = train.iloc[icv]

gbm = XGBClassifier(max_depth=4,
                    learning_rate=0.01,
                    n_estimators=3000,
                    subsample=0.8,
                    colsample_bytree=0.5,
                    objective="binary:logistic",
                    silent=False,
                    min_child_weight=5,
                    nthread=-1)

gbm.fit(trainingSet[feature_names], np.array(trainingSet["TARGET"]),
        eval_metric="auc",
        eval_set=[(trainingSet[feature_names], np.array(trainingSet["TARGET"])),
                  (validationSet[feature_names], np.array(validationSet["TARGET"]))],
        early_stopping_rounds=200, verbose=20)

ll = gbm.best_score
best_iter = gbm.best_iteration
cv.append(ll)
biter.append(best_iter)
print("---auc : %0.6f\n" % ll)
print("---best_iter: %d\n" % best_iter)
gc.collect()

gbm = XGBClassifier(max_depth=4,
                    learning_rate=0.01,
                    n_estimators=370,
                    subsample=0.8,
                    colsample_bytree=0.5,
test['title_count'] = test.apply(title_count, axis=1)
test['genres_count'] = test.apply(genres_count, axis=1)
test['dow_count'] = test.apply(dow_count, axis=1)
test['tod_count'] = test.apply(tod_count, axis=1)
testdf = test.copy()
test.drop(['titles', 'genres', 'ID', 'dow', 'tod', 'cities'], axis=1, inplace=True)
print("now predicting")

# pca.transform(test)
# sc.transform(test)
# rf = RandomForestClassifier(n_estimators=460, max_depth=12, max_features=8, class_weight='balanced')

# xgb
model2 = XGBClassifier(max_depth=5, n_estimators=460, learning_rate=0.05,
                       scale_pos_weight=1, min_child_weight=2, gamma=0.0,
                       subsample=0.5, colsample_bytree=0.5, max_delta_step=1)
# model = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb)], voting='soft')

sfm = SelectFromModel(model2, threshold=0.013)
X_traindf2 = pd.DataFrame(sfm.fit_transform(X_traindf, Y_traindf))
test2 = pd.DataFrame(sfm.transform(test))

print("now grid search")
model = XGBClassifier(max_depth=5, n_estimators=460, learning_rate=0.05,
                      scale_pos_weight=1, min_child_weight=2, gamma=0.0,
                      subsample=0.5, colsample_bytree=0.5, max_delta_step=1)
model.fit(X_traindf2, Y_traindf)
# print("best_params: " + str(rf.best_params_))

probabilities = model.predict_proba(test2)
print(probabilities)
probabilities1 = pd.DataFrame(probabilities, columns=['neg', 'segment'])
probabilities1 = probabilities1.drop('neg', axis=1)
answer = pd.concat([pd.DataFrame(testdf['ID']), probabilities1], axis=1)
answer.to_csv('segmentspredanswer78.csv', index=False)

# print("best_score: " + str(cv_rfc.best_score_))
# print("best_params: " + str(cv_rfc.best_params_))
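# Hedged addition (not from the original snippet): after SelectFromModel has been
# fitted, get_support() reveals which columns survived the importance threshold,
# assuming X_traindf is a DataFrame as used above.
kept_mask = sfm.get_support()
kept_columns = X_traindf.columns[kept_mask]
print("features kept by SelectFromModel:", list(kept_columns))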
target = df['TARGET']
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)

# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

logistic_regression = LogisticRegression(penalty='l1')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)

log_reg = LogisticRegression()
log_reg.fit(combined, target)

scores = cross_validation.cross_val_score(log_reg, combined, target, cv=5, scoring='roc_auc')
print(scores.mean(), scores)
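# Hedged note (not from the original snippet): the base-model probabilities above are
# produced on the same rows the models were trained on, so the second-level
# LogisticRegression sees optimistic inputs. One common alternative is to build the
# meta-features from out-of-fold predictions, e.g.:
from sklearn.model_selection import cross_val_predict

forested_oof = cross_val_predict(random_forest, train_pcaed, target,
                                 cv=5, method='predict_proba')
m2_xgbed_oof = cross_val_predict(m2_xgb, train_pcaed, target,
                                 cv=5, method='predict_proba')
combined_oof = np.concatenate([forested_oof, m2_xgbed_oof], axis=1)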
# x, y = load_boston(return_X_y=True)
datasets = load_wine()
x = datasets.data
y = datasets['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    shuffle=True, random_state=66)

# 2. Model
model = XGBClassifier(n_estimators=100, learning_rate=0.01, n_jobs=8)

# 3. Train
model.fit(x_train, y_train, verbose=1, eval_metric='mlogloss',
          eval_set=[(x_train, y_train), (x_test, y_test)])

aaa = model.score(x_test, y_test)
print('aaa :', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

print('====================================')
# evals_result() returns a dict keyed by eval set, e.g. results['validation_0']['mlogloss']
results = model.evals_result()
print(results)