def fit(X_vec, y_vec):
    # Split the data: 10 random 80/20 train/test shuffles.
    # (sklearn.cross_validation was removed; ShuffleSplit now lives in
    # sklearn.model_selection and is iterated via cv.split().)
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

    # Random forest
    # for train, test in cv.split(X_vec):
    #     svc = RandomForestClassifier(n_estimators=100).fit(X_vec[train], y_vec[train])
    #     print("train score: %.3f, test score: %.3f\n" % (
    #         svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])))

    # GBDT
    # for train, test in cv.split(X_vec):
    #     svc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1).fit(X_vec[train], y_vec[train])
    #     print("train score: %.3f, test score: %.3f\n" % (
    #         svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])))

    # XGBoost
    for train, test in cv.split(X_vec):
        svc = XGBClassifier(max_depth=10, gamma=0.001).fit(X_vec[train], y_vec[train])
        print("train score: %.3f, test score: %.3f\n" % (
            svc.score(X_vec[train], y_vec[train]), svc.score(X_vec[test], y_vec[test])))
def xxgboost(training, cv, testing):
    # Note: `cv` here is the vector of training labels, not a CV splitter.
    xgb = XGBClassifier(max_depth=6, n_estimators=25, objective='multi:softprob',
                        subsample=0.5, colsample_bytree=0.5)
    xgb.fit(training, cv.ravel())
    XGBtrainscore = xgb.score(training, cv.ravel())  # train score

    # 5-fold cross-validation (KFold(n_splits=...) replaces the old n_folds API)
    kf = KFold(n_splits=5)
    scores = cross_val_score(xgb, training, cv.ravel(), cv=kf)
    XGBvalidation = abs(scores.mean())

    XGBy_pred = xgb.predict_proba(testing)
    le = LabelEncoder()
    y = le.fit_transform(labels)  # `labels`, `testid`, `csvname` come from the enclosing scope
    idlist = []   # id list
    listcty = []  # countries list
    for i in range(len(testid)):
        idi = testid[i]
        idlist += [idi] * 5
        # take the 5 classes with the highest predicted probability
        listcty += le.inverse_transform(np.argsort(XGBy_pred[i])[::-1])[:5].tolist()
    XGBsub = pd.DataFrame(np.column_stack((idlist, listcty)),
                          columns=['id', 'country'])
    XGBsub.to_csv('XGsub_%s.csv' % csvname, index=False)
    print("XGBtrainscore", XGBtrainscore)
    print("XGBvalidation", XGBvalidation)
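# A tiny standalone check of the top-k pattern used above (argsort ascending,
# then reverse and slice). A sketch with made-up class names and one fake
# probability row, not the competition data.
import numpy as np
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder().fit(['DE', 'FR', 'NDF', 'US'])
proba = np.array([0.1, 0.2, 0.6, 0.1])  # one row of predict_proba
print(enc.inverse_transform(np.argsort(proba)[::-1])[:3])  # ['NDF' 'FR' 'US']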
def modeling_RF():
    # Despite the name, this trains an XGBoost classifier.
    abc = None  # initialise so the return cannot raise NameError on failure
    try:
        df1 = pd.read_csv('last_total.csv', encoding='cp949')
        df_dummy = pd.get_dummies(df1)
        train, test = train_test_split(df_dummy, test_size=0.2, random_state=1234)
        train_x = train.drop('target_bool', axis=1)
        train_y = train['target_bool']
        test_x = test.drop('target_bool', axis=1)
        test_y = test['target_bool']
        xgb = XGBClassifier(random_state=1234, learning_rate=0.6,
                            max_depth=9, n_estimators=200)
        xgb.fit(train_x, train_y)
        abc = xgb.score(train_x, train_y)
    except Exception as e:
        print(e)
    return abc
def _XGBoost(self):
    clf = XGBClassifier()
    clf.fit(self.X_train, self.y_train)
    score = clf.score(self.X_test, self.y_test)
    print('Accuracy rate of XGBoost: {0:.3f}'.format(score))
    y_pred = clf.predict_proba(self.X_test)
    # y_pred.T[0] is the predicted probability of class 0 (the negative class)
    ks(y_pred.T[0], self.y_test)
def train_model(mall_id):
    # Train one model per mall.
    random_state = 10
    metrix, tar = utils.get_data(mall_id)
    x_train, x_test, y_train, y_test = train_test_split(
        metrix, tar, test_size=0.1, random_state=random_state)
    # XGBoost: boosting-tree method; slow to train with these settings.
    clf_name = "xgboost"
    save_dir = "./model/" + clf_name + "_" + mall_id + "_model.m"
    n_est = 50
    clf = XGBClassifier(
        learning_rate=0.1,       # learning rate; typical values 0.01-0.2
        n_estimators=n_est,
        max_depth=5,             # max tree depth; usually 3-10
        min_child_weight=1,      # min sum of instance weights in a leaf; larger avoids overfitting, too large underfits
        gamma=0,                 # min loss reduction required to split; larger is more conservative
        subsample=0.8,           # row subsample per tree; smaller is more conservative, too small underfits; typical 0.5-1
        colsample_bytree=0.8,    # column subsample per tree
        objective='binary:logistic',  # binary classification
        nthread=4,               # number of threads
        scale_pos_weight=1,      # set to a positive value to speed convergence on heavily imbalanced classes
        seed=0)                  # random seed, for reproducibility
    print(utils.get_time(), ' ', mall_id, ' starts...')
    train_time = time.time()
    clf.fit(x_train, y_train)
    train_time = time.time() - train_time
    score = clf.score(x_test, y_test)
    joblib.dump(clf, save_dir)
    print(utils.get_time(), ' saved a model for ', mall_id, ' score: ', score,
          ' train time : ', train_time)
    train_time = int(train_time)
    return (score, n_est, train_time)
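# A minimal sketch of reloading a model saved by train_model and reusing it.
# "m_1409" is a hypothetical mall_id; plain `import joblib` assumes a modern
# environment (older sklearn shipped it as sklearn.externals.joblib), and
# n_features_in_ assumes a recent xgboost sklearn wrapper.
import joblib
import numpy as np

clf = joblib.load("./model/xgboost_m_1409_model.m")
x_new = np.random.rand(4, clf.n_features_in_)  # stand-in rows with the right width
print(clf.predict(x_new), clf.predict_proba(x_new)[:, 1])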
def best_param_xgboost(self, estimator=10, depth=1, lr=0.1, gama=0.1,
                       subsamples=1.0, bytree=0.3, n_thread=1,
                       child_weight=1, seed_num=7):
    best_model = XGBClassifier(n_estimators=estimator, max_depth=depth,
                               learning_rate=lr, gamma=gama,
                               subsample=subsamples, colsample_bytree=bytree,
                               nthread=n_thread, min_child_weight=child_weight,
                               seed=seed_num, objective='binary:logistic')
    best_model.fit(self.x_train, self.y_train)
    y_pred = best_model.predict(self.x_val)
    acc_score = metrics.accuracy_score(self.y_val, y_pred)
    print("acc_score: {}".format(acc_score))
    print("score: {}".format(best_model.score(self.x_val, self.y_val)))
    save_path = self.model_save_path + "acc={:.6f}".format(acc_score) + ".m"
    # Remove any existing model file before saving
    if os.path.exists(save_path):
        os.remove(save_path)
    # Save the model
    joblib.dump(best_model, save_path)
    print("AUC Score: {}".format(metrics.roc_auc_score(self.y_val, y_pred)))
    # Plot the ROC curve
    self.plt_roc(best_model)
def decision_tree_algo(original_df: pd.DataFrame):
    """
    My own decision-tree experiment, kept to show how I tried to improve it
    and to share ideas with Max; the function could still be improved.
    :param original_df: the original DataFrame to build the decision tree from
    :return: nothing
    """
    copied_df = original_df.copy()
    data = copied_df.iloc[:, :-1]
    target = copied_df.iloc[:, -1]
    print()
    print("========================================================")
    print("========================================================")
    print("In decision tree algorithm.")
    d1 = dt.datetime.now()
    xtrain, xtest, ytrain, ytest = tts(data, target, train_size=0.8)
    boost = XGBClassifier(max_depth=4, n_estimators=500)
    boost.fit(xtrain, ytrain)
    boost_prediction = boost.predict(xtest)
    # score() is evaluated on the test split, so label it as the test score
    print("Score Test:", round(boost.score(xtest, ytest) * 100, 2), " %")
    d2 = dt.datetime.now()
    print("Took ", d2 - d1)
    print("End decision tree algorithm.")
    labels = 'Found', 'Not found'
    # hamming distance between label vectors == misclassification rate
    hamming = distance.hamming(ytest, boost_prediction)
    rates = [1 - hamming, hamming]
    fig1, ax1 = plt.subplots()
    ax1.pie(rates, labels=labels, autopct='%0.2f%%')
    plt.show()
    plot_tree(boost, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 50)
    # fig.savefig("tree.png")
    plt.show()
def run():
    company_news = source.get_company_news()
    cross_over_keys, cross_under_keys = source.get_cross_keywords()
    cross_over_keys = cross_over_keys[:128]
    cross_under_keys = cross_under_keys[:128]
    company_news['post_time'] = pd.to_datetime(company_news['post_time']).dt.date
    # Use the last 10% of articles for training
    company_news_train = company_news[int(len(company_news) * 0.9):]
    tmp_content = jieba_analyse.cut_to_list(company_news_train)
    row_list = []
    # Label: negative = fall, positive = rise
    for article in tqdm(tmp_content):
        xx = {}
        for _, index in cross_over_keys.iterrows():
            xx[index['key']] = float(index['weight']) if index['key'] in article[1] else 0
        for _, index in cross_under_keys.iterrows():
            xx[index['key']] = float(index['weight'] * -1) if index['key'] in article[1] else 0
        xx['date'] = article[0]
        row_list.append(xx)
    # Build the score DataFrame once, after the loop
    tmp_content_score = pd.DataFrame(row_list)
    df_score = tmp_content_score.set_index('date')
    print(df_score)

    index_b = source.get_company_higher_lower_index()
    index_b = index_b.set_index('date')
    index_b = index_b['result']
    result = pd.concat([df_score, index_b], axis=1, join='inner')
    print(result)
    print(len(result))

    feature_cols = result.columns.values.tolist()
    feature_cols.pop(len(feature_cols) - 1)  # drop the 'result' label column
    Y = result['result'].to_numpy()
    X = result[feature_cols].to_numpy()
    validation_size = 0.20
    seed = 7
    X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed, stratify=Y)

    # =============== MLP
    print('mlp===============================================================')
    model_MLP = MLPClassifier(hidden_layer_sizes=(256, 256,), max_iter=256)
    model_MLP.fit(X_train, Y_train)
    print(model_MLP.score(X_train, Y_train))
    predictions = model_MLP.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    # =============== RandomForest
    print('RandomForest======================================================')
    model_RandomForest = RandomForestClassifier()
    model_RandomForest.fit(X_train, Y_train)
    print(model_RandomForest.score(X_train, Y_train))
    predictions = model_RandomForest.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    # =============== XGBClassifier
    print('XGBClassifier=====================================================')
    model_XGBClassifier = XGBClassifier()
    model_XGBClassifier.fit(X_train, Y_train)
    print(model_XGBClassifier.score(X_train, Y_train))
    predictions = model_XGBClassifier.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')

    # =============== LogisticRegression
    print('LogisticRegression================================================')
    model_LogisticRegression = LogisticRegression()
    model_LogisticRegression.fit(X_train, Y_train)
    print(model_LogisticRegression.score(X_train, Y_train))
    predictions = model_LogisticRegression.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    print('==================================================================')
# Confusion Matrix
cm = confusion_matrix(y_test, pred)
plt.figure(figsize=(12, 8))
ax = sns.heatmap(cm, fmt="f", square=True, annot=True, cbar=False)
ax.set_xlabel('Predicted Labels', fontsize=15)
ax.set_ylabel('True Labels', fontsize=15)
plt.show()

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

xgc = XGBClassifier(max_depth=3, random_state=22)
xgc.fit(X_train, y_train)
print("Accuracy of train: ", xgc.score(X_train, y_train))
print("Accuracy of test: ", xgc.score(X_test, y_test))

# Feature importances
importances = xgc.feature_importances_
sns.barplot(x=importances, y=X_train.columns)
plt.show()

pred = xgc.predict(X_test)
print(classification_report(y_test, pred))
print("*" * 100, "\n")

# Metrics
print("Precision = {}".format(precision_score(y_test, pred, average='macro')))
print("Recall = {}".format(recall_score(y_test, pred, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, pred)))
print("F1 Score = {}\n".format(f1_score(y_test, pred, average='macro')))
def train(self, train_set, dev_set):
    logger.log('Get features from training set')
    if os.path.exists(train_features_file):
        train_features = np.load(train_features_file)
        _, _, train_labels, _, _ = self.get_minibatch(train_set, 0, len(train_set))
    else:
        train_features = None
        train_labels = []
        # integer ceil-division; int(...) / ... would yield a float under Python 3
        total_batch = (len(train_set) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                minibatch_prem_dep, minibatch_hypo_dep = \
                self.get_minibatch(train_set, i * self.batch_size,
                                   (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: minibatch_premise_vectors,
                self.model.hypothesis_x: minibatch_hypothesis_vectors,
                self.model.y: minibatch_labels,
                self.model.keep_rate_ph: 1.0
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = minibatch_prem_dep
                feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
            minibatch_features = self.sess.run([self.model.features], feed_dict)
            train_features = minibatch_features[0] if train_features is None \
                else np.concatenate((train_features, minibatch_features[0]))
            train_labels += minibatch_labels
        np.save(train_features_file, train_features)

    logger.log('Get features from dev set')
    if os.path.exists(dev_features_file):
        dev_features = np.load(dev_features_file)
        _, _, dev_labels, _, _ = self.get_minibatch(dev_set, 0, len(dev_set))
    else:
        dev_features = None
        dev_labels = []
        total_batch = (len(dev_set) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            minibatch_premise_vectors, minibatch_hypothesis_vectors, minibatch_labels, \
                minibatch_prem_dep, minibatch_hypo_dep = \
                self.get_minibatch(dev_set, i * self.batch_size,
                                   (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: minibatch_premise_vectors,
                self.model.hypothesis_x: minibatch_hypothesis_vectors,
                self.model.y: minibatch_labels,
                self.model.keep_rate_ph: 1.0
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = minibatch_prem_dep
                feed_dict[self.model.hypo_dep] = minibatch_hypo_dep
            minibatch_features = self.sess.run([self.model.features], feed_dict)
            dev_features = minibatch_features[0] if dev_features is None \
                else np.concatenate((dev_features, minibatch_features[0]))
            dev_labels += minibatch_labels
        np.save(dev_features_file, dev_features)

    # Grid search over XGBoost hyperparameters, scored on the dev set
    tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}
    best_score = 0.
    best_params = []
    for g in ParameterGrid(tuned_parameters):
        clf = XGBClassifier(nthread=24)
        clf.set_params(**g)
        clf.fit(train_features, train_labels)
        score = clf.score(dev_features, dev_labels)
        logger.log('%s: %f' % (str(g), score))
        if best_score < score:
            best_score = score
            best_params = g
            self.clf = clf
    logger.log('Best score: %s %f' % (str(best_params), best_score))
def decision_tree():
    print("Microsoft Malware Prediction using a Decision Tree Algorithm (XGBoost)")
    d1 = dt.datetime.now()
    print("Data processing started at", "%02d:%02d" % (d1.hour, d1.minute))

    # Data loading
    with open('../../data/json/datatypes.json') as file:
        dtype = json.load(file)
    df = pd.read_csv('../../data/csv/microsoft-malware.csv', dtype=dtype)

    # Dropping categorical features
    binary = []
    categorical = []
    numerical = []
    for key, value in dtype.items():
        # elif keeps int8 columns out of the numerical bucket, so their NaNs
        # get the mode fill below rather than -1
        if value in ['int8']:
            binary.append(key)
        elif value in ['int16', 'category']:
            categorical.append(key)
        else:
            numerical.append(key)
    categorical.remove('MachineIdentifier')  # already removed by iloc below
    df = df.drop(columns=list(categorical))

    # Cleaning NaN
    for i in df.columns:
        s = df.loc[:, i]
        if i in numerical:
            # set NaNs in numerical features to -1
            s.fillna(-1, inplace=True)
        elif i in binary:
            # set NaNs in binary features to the most frequent value
            s.fillna(s.mode().iloc[0], inplace=True)
        df[i] = s.values
        if df[i].dtype == "int64" or df[i].dtype == "float64":
            # collapse rare values (<5% frequency) to -1
            df.loc[df[i].value_counts(normalize=True)[df[i]].values < 0.05, i] = -1

    # Splitting dataset
    data = df.iloc[:, 1:-1]  # dropping MachineIdentifier & HasDetections
    target = df.iloc[:, -1]  # selecting HasDetections
    xtrain, xtest, ytrain, ytest = tts(data, target, train_size=0.8)

    # Training model
    boost = XGBClassifier(max_depth=2, n_estimators=200)
    boost.fit(xtrain, ytrain)
    boost_prediction = boost.predict(xtest)
    # score() is computed on the test split, so label it accordingly
    print("Score Test :", round(boost.score(xtest, ytest) * 100, 2), " %")
    d2 = dt.datetime.now()
    print("Took ", d2 - d1)

    # Plotting result
    labels = 'Found', 'Not found'
    hamming = distance.hamming(ytest, boost_prediction)  # == misclassification rate
    rates = [1 - hamming, hamming]
    fig1, ax1 = plt.subplots()
    ax1.pie(rates, labels=labels, autopct='%0.2f%%')
    plt.show()

    # Decision tree
    print('Plotting decision tree')
    plot_tree(boost, rankdir='LR')
    fig = plt.gcf()
    fig.set_size_inches(150, 50)
    # fig.savefig("tree.png")
    plt.show()
images_train.isnull().any().describe()
labels_train.isnull().any().describe()

# from xgboost import XGBClassifier
classifier = XGBClassifier(silent=0, eta=0.1, max_depth=8,
                           subsample=0.75, colsample_bytree=0.75)
classifier.fit(images, labels)

# Predicting the Test set results
y_pred = classifier.predict(test)

# Checking score (Accuracy)
classifier.score(images_test, labels_test)

# Creating the joblib file
dump(classifier, 'xgb.joblib')
"""# Getting the joblib file
classifier = load('random_forest.joblib')"""

# Generating the final dataframe
# (name the prediction column directly; y_pred[:, 1] = y_pred[:, 0] is not
# valid DataFrame indexing)
y_pred = pd.DataFrame(y_pred, columns=['Label'])
y_pred['ImageId'] = pd.Series(data=np.arange(1, 28001), index=y_pred.index)
# y_pred = y_pred.drop(columns=['ImageId'])
columnsTitles = ["ImageId", "Label"]
y_pred = y_pred.reindex(columns=columnsTitles)
# Exporting the dataframe
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)  # unused below, kept for parity
    xgb_sklearn = XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=3,
                                min_child_weight=1, gamma=0.3, subsample=0.6,
                                colsample_bytree=0.7, objective='binary:logistic',
                                nthread=4, seed=27, reg_lambda=0.01)
    xgb_params = xgb_sklearn.get_params()

    # Pick n_estimators by 5-fold CV with early stopping on AUC
    cvresult = xgb.cv(xgb_params, dtrain,
                      num_boost_round=xgb_params['n_estimators'],
                      nfold=5, metrics='auc', early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]
    # AUC
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)
    # error rate
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # Grid search over max_depth
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.1,
            n_estimators=300,
            # max_depth=3,
            min_child_weight=1,
            gamma=0.3,
            subsample=0.6,
            colsample_bytree=0.7,
            objective='binary:logistic',
            nthread=4,
            seed=27,
            reg_lambda=0.01),
        param_grid=params,
        cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    # Feature importance (requires the xgb.fmap feature-map file on disk)
    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
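# get_fscore(fmap='xgb.fmap') expects a feature-map file on disk. A minimal
# sketch of writing one in XGBoost's fmap format ("index<TAB>name<TAB>type",
# with 'q' marking a quantitative feature); assumes X_train is a pandas
# DataFrame whose columns are the model's features.
def write_fmap(columns, path='xgb.fmap'):
    with open(path, 'w') as f:
        for i, col in enumerate(columns):
            f.write('{0}\t{1}\tq\n'.format(i, col))

# write_fmap(X_train.columns)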
score = random_forest.score(X, y)
Y_pred = random_forest.predict(X_test)

# In[14]:

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
score = xgb.score(X, y)
y_pred = xgb.predict_proba(X_test)

# In[15]:

print(score)

# In[21]:

# For Random forest
# Taking the 5 classes with highest probabilities
ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27).fit(X_train, np.ravel(y_train))

print('Accuracy of XGBOOST classifier on training set: {:.2f}'
      .format(xgb1.score(X_train, np.ravel(y_train))))
print('Accuracy of XGBOOST classifier on validation set: {:.2f}'
      .format(xgb1.score(X_val, np.ravel(y_val))))

# **GridSearchCV with XGBoost**

# In[ ]:

grid_values = {'n_estimators': [300], 'learning_rate': [0.05], 'max_depth': [5],
               'min_child_weight': [1], 'colsample_bytree': [0.8],
               'subsample': [0.6], 'gamma': [0]}
clf_xgb_grid = XGBClassifier(seed=2, objective='binary:logistic',
                             nthread=-1, scale_pos_weight=1)
clf_xgb_grid_acc = GridSearchCV(clf_xgb_grid, param_grid=grid_values)
clf_xgb_grid_acc.fit(X_train, np.ravel(y_train))
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.5)

# Free memory before training
del train
del trainTopSiteid
del trainX
del trainY
del listTopSiteids

model_train_siteid = XGBClassifier(n_estimators=10, nthread=-1, silent=False,
                                   seed=125, learning_rate=0.2)
model_train_siteid.fit(X_train, y_train)
model_train_siteid.score(X_test, y_test)

# Predict the missing siteid values and write them back into train_real
pred = model_train_siteid.predict(testTopSiteid)
testTopSiteid = train_real[train_real['siteid'].isnull()].copy()
testTopSiteid['siteid'] = pred
train_real[train_real['siteid'].isnull()] = testTopSiteid
train_real.to_csv('data\\train_br_dev_site_pp4.csv', index=False)

####################################################
#
# --------------------------- For Test Data
#
####################################################

# --------------------- Loading datasets
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print("Decision Tree Classifier")
print("Train Accuracy :", clf.score(X_train, y_train))
print("Test Accuracy ", metrics.accuracy_score(y_test, clf_pred))
print("")

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("Xgboost Classifier")
print("Train Accuracy xgb:", xgb.score(X_train, y_train))
print("Test Accuracy ", metrics.accuracy_score(y_test, xgb_pred))
print("")

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print("Random Forest Classifier")
print("Train Accuracy of Random Forest C:", rfc.score(X_train, y_train))
# compare against the true labels, not against the XGBoost predictions
print("Test Accuracy ", metrics.accuracy_score(y_test, rfc_pred))
print("")
tweets_transform = pipe.fit_transform(tweets_tfidf)
send_event("Explained Variance: " +
           str(pipe.get_params()['svd'].explained_variance_ratio_.sum()))
send_event("Dimension Reduction - Execution time: %s seconds ---" %
           (time.time() - start_time))
print("Explained Variance: " +
      str(pipe.get_params()['svd'].explained_variance_ratio_.sum()))
print("Dimension Reduction - Execution time: %s seconds ---" %
      (time.time() - start_time))

print('Start model training...')
start_time = time.time()
X_train, X_test, y_train, y_test = train_test_split(tweets_transform, y,
                                                    test_size=0.3)
xgb_model = XGBClassifier(max_depth=5, min_child_weight=5, gamma=0.1,
                          subsample=0.8, colsample_bytree=0.8,
                          scale_pos_weight=1, random_state=10,
                          n_estimators=5000, learning_rate=0.01, n_jobs=-1)
xgb_model.fit(X_train, y_train)
send_event("Test Set Score: " + str(xgb_model.score(X_test, y_test)))
send_event("Train - Execution time: %s seconds ---" % (time.time() - start_time))
print("Test Set Score: " + str(xgb_model.score(X_test, y_test)))
print("Train - Execution time: %s seconds ---" % (time.time() - start_time))
datasets.target, train_size=0.8, random_state=104)

# 2. Model
# model = GradientBoostingClassifier(max_depth=4)
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

# 3. Fit
model.fit(x_train, y_train, eval_metric='mlogloss')

# 4. Evaluate
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print('acc : ', acc)

'''
def plot_feature_importances_dataset(model):
    n_features = datasets.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), datasets.feature_names)
    plt.xlabel("Feature Importances")
    plt.ylim(-1, n_features)

plot_feature_importances_dataset(model)
'''
    base_score=0.2, n_estimators=200, seed=random_seed, max_depth=8)

# In[29]:

estimator.fit(dfTr2model[columns], dfTr2model.Cod_Prod)

# **Evaluation of the test data**
#
# To observe the quality of the test predictions, the trained classifier is
# evaluated on the test subset.

# In[30]:

tsScore = estimator.score(dfTs2eval[columns], dfTs2eval.Cod_Prod)
print("Score obtained in test: " + str(tsScore))

# <a id="predict"> </a>
# ## **Prediction**
#
# We predict the future products to be contracted by the customers in the
# test dataset.

# In[31]:

Cod_Prod_predicted = estimator.predict(dfTs2predict[columns])

# **Creation of the results dataframe**
#
# In the next cell, a dataframe with the customer's ID and the product code
# to be purchased is created.
Lr_predicted = Lr.predict(x_test)

clf = neighbors.KNeighborsClassifier()
clf.fit(x_train, y_train)
clf_predicted = clf.predict(x_test)

svc_linear = SVC()
svc_linear.fit(x_train, y_train)
svc_linear_predicted = svc_linear.predict(x_test)

gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
gauss_predicted = gaussian.predict(x_test)

# Calculate the accuracy
print("XGB Classifier accuracy :", model.score(x_test, y_test),
      confusion_matrix(y_test, model_predicted),
      classification_report(y_test, model_predicted))
print("Random Forest Classifier accuracy :", Rf.score(x_test, y_test),
      confusion_matrix(y_test, Rf_predicted),
      classification_report(y_test, Rf_predicted))
print("Logistic Regression accuracy :", Lr.score(x_test, y_test),
      confusion_matrix(y_test, Lr_predicted),
      classification_report(y_test, Lr_predicted))
print("KNeighborsClassifier accuracy :", clf.score(x_test, y_test),
      confusion_matrix(y_test, clf_predicted),
      classification_report(y_test, clf_predicted))
# report the SVC's own predictions, not the GaussianNB ones
print("SVC accuracy :", svc_linear.score(x_test, y_test),
      confusion_matrix(y_test, svc_linear_predicted),
      classification_report(y_test, svc_linear_predicted))
print("GaussianNB accuracy :", gaussian.score(x_test, y_test),
from xgboost.sklearn import XGBClassifier

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3,
                                                    random_state=0)

clf = DecisionTreeClassifier()  # set max_depth to prevent overfitting
clf.fit(X_train, y_train)
print("Train Accuracy of clf:", clf.score(X_train, y_train))
print("Test Accuracy of clf", clf.score(X_test, y_test))

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
print("Train Accuracy of xgb:", xgb.score(X_train, y_train))
print("Test Accuracy of xgb:", xgb.score(X_test, y_test))

#%%
from sklearn.model_selection import GridSearchCV

# GridSearch on Xgboost Classifier
# (plain lists, not range(2, 3, 4): range(start, stop, step) there yields only [2])
param_dict = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 2, 6],
    'learning_rate': [0.00001, 0.001, 0.01, 0.1],
    'n_estimators': [10, 50, 100]
}
xgb_ = GridSearchCV(xgb, param_dict, cv=3, n_jobs=-1).fit(X_train, y_train)
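# After the fit above, the search results can be read back; a short sketch
# (xgb_ is the fitted GridSearchCV from the cell above).
print(xgb_.best_params_)
print(xgb_.best_score_)          # mean CV accuracy of the best combination
best_xgb = xgb_.best_estimator_  # refit on the full training split
print("Test Accuracy of tuned xgb:", best_xgb.score(X_test, y_test))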
# classification report
cr = classification_report(y_valid, y_pred_rf)
print(cr)

# modelling
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

model_xgb = XGBClassifier()
model_xgb.fit(x_train, y_train)
y_pred_xgb = model_xgb.predict(x_valid)

# evaluating the model
print("Training Accuracy :", model_xgb.score(x_train, y_train))
print("Validation Accuracy :", model_xgb.score(x_valid, y_valid))

# confusion matrix
cm = confusion_matrix(y_valid, y_pred_xgb)
print(cm)

# classification report
cr = classification_report(y_valid, y_pred_xgb)
print(cr)

# blending the predictions of the two models
# (note: this averages hard class labels, so the result is fractional)
boosted_predictions = 0.4 * y_pred_rf + 0.6 * y_pred_xgb
boosted_predictions
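# A blend at the probability level is usually what a weighted vote intends; a
# sketch under the assumption of binary labels. `model_rf` is a hypothetical
# name for the random-forest model that produced y_pred_rf above.
proba_rf = model_rf.predict_proba(x_valid)[:, 1]
proba_xgb = model_xgb.predict_proba(x_valid)[:, 1]
blended = 0.4 * proba_rf + 0.6 * proba_xgb
blended_labels = (blended >= 0.5).astype(int)  # back to hard 0/1 predictions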
## param_test7 = {
##     'reg_alpha': [1e-7, 1e-6, 0.05e-5, 1e-5, 1e-4, 0.5e-4]
## }
## gsearch7 = GridSearchCV(
##     estimator=XGBClassifier(learning_rate=0.1, n_estimators=234, max_depth=9,
##                             min_child_weight=2, gamma=0, subsample=0.8,
##                             colsample_bytree=0.8, objective='binary:logistic',
##                             nthread=4, scale_pos_weight=1, seed=27),
##     param_grid=param_test7, scoring='f1_macro', n_jobs=4, iid=False, cv=5)
## gsearch7.fit(train_data[predictors], train_data[target])
## print(gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_)

xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=175, max_depth=9,
                          min_child_weight=2, gamma=0.0, subsample=0.8,
                          colsample_bytree=0.8, reg_alpha=5e-05,
                          objective='binary:logistic', nthread=4,
                          scale_pos_weight=1, seed=27)
xgb_model.fit(data, data_label)
print(xgb_model)
print("score : " + str(xgb_model.score(train_data, train_label)))
pred = pd.DataFrame(xgb_model.predict(test_data))
print("Accuracy : " + str(metrics.accuracy_score(test_label, pred)))
print("F1 score : " + str(metrics.f1_score(test_label, pred)))
class XGBoostModel(BaseModel):
    """XGBoost classifier."""

    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
                 objective="binary:logistic", booster='gbtree', silent=True,
                 n_jobs=1, gamma=0, min_child_weight=1, max_delta_step=0,
                 subsample=1, colsample_bytree=1, colsample_bylevel=1,
                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                 base_score=0.5, random_state=0, missing=None):
        """"""
        super().__init__()  # super(XGBoostModel).__init__() never reached BaseModel
        self.model = XGBClassifier(max_depth=max_depth,
                                   learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   silent=silent,
                                   objective=objective,
                                   booster=booster,
                                   n_jobs=n_jobs,
                                   gamma=gamma,
                                   min_child_weight=min_child_weight,
                                   max_delta_step=max_delta_step,
                                   subsample=subsample,
                                   colsample_bytree=colsample_bytree,
                                   colsample_bylevel=colsample_bylevel,
                                   reg_alpha=reg_alpha,
                                   reg_lambda=reg_lambda,
                                   scale_pos_weight=scale_pos_weight,
                                   base_score=base_score,
                                   random_state=random_state,
                                   missing=missing)

    def predict(self, features):
        super().predict(features)
        labels = self.model.predict(features)
        return labels

    def predict_prob(self, features):
        super().predict_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def predict_log_prob(self, features):
        super().predict_log_prob(features)
        probs = self.model.predict_proba(features)
        return probs

    def train(self, features, targets):
        super().train(features, targets)
        start = time.time()
        self.model.fit(X=features, y=targets)
        print('Finished, time %s' % (time.time() - start))

    def accuracy_score(self, features, targets):
        super().accuracy_score(features, targets)
        # score(X, y) optionally takes a sample_weight vector; passing the
        # scalar scale_pos_weight here was a bug
        score = self.model.score(features, targets)
        return score

    def abs_errors(self, features, targets):
        targets_pred = self.predict(features)
        result = abs(targets_pred - targets)
        return result

    def rmse_score(self, y_pred, y_true):
        """
        Compute an RMSE-based score. To reflect that misclassifying the
        behaviours 0, 1 and 2 matters to different degrees, errors on classes
        1 and 2 are penalised more heavily: the squared errors are weighted
        by factors 1, 2 and 2.5 respectively via
        np.average((y_true - y_pred) ** 2, axis=0, weights=weights).
        :param y_pred: predicted labels
        :param y_true: true labels
        :return: score
        """
        weight_dict = {0: 1, 1: 2, 2: 2.5}  # penalty weight per true class
        weights = [weight_dict[l] for l in y_true]
        mse = np.average((y_true - y_pred) ** 2, axis=0, weights=weights)
        score = 1 / (1 + np.sqrt(mse))
        return score
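# A standalone check of rmse_score's weighting: with y_true = [0, 1, 2] and
# y_pred = [0, 1, 1], only the class-2 sample is wrong, so the weighted MSE is
# 2.5 / (1 + 2 + 2.5) ~= 0.4545 and the score is 1 / (1 + sqrt(0.4545)) ~= 0.597.
import numpy as np

y_true = np.array([0, 1, 2])
y_pred = np.array([0, 1, 1])
weights = [{0: 1, 1: 2, 2: 2.5}[l] for l in y_true]                # [1, 2, 2.5]
mse = np.average((y_true - y_pred) ** 2, axis=0, weights=weights)  # 2.5 / 5.5
print(1 / (1 + np.sqrt(mse)))                                      # ~0.597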
class Example_XGB:
    def __init__(self, filePath, cols):
        mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font so plots can render CJK text
        mpl.rcParams['axes.unicode_minus'] = False    # render minus signs correctly in saved figures
        # Read the input table
        self.all = pd.read_csv(filePath, encoding='UTF-8')
        own_feature = self.all.columns.values  # features present in the dataset, including the label
        # cols: requested features; feature_cols: intersection with what the data has
        self.feature_cols = self.get_feature(cols, own_feature)
        # if self.feature_cols == "err":
        #     err = "err&model loading error or test file could not be read!"
        self.y_pred = []
        self.model = XGBClassifier()
        print("Initialisation finished...")

    def split_file(self, test_file_path, train_file_path):
        if len(self.feature_cols) == 0:
            return "err"
        X = self.all
        y = X.pop(Label)  # pop() removes the label column and returns it
        X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
            X, y, test_size=0.3)
        X_test = X_test[self.feature_cols]
        X_train = X_train[self.feature_cols]
        train_data = pd.concat([X_train, Y_train], axis=1)
        train_data.to_csv(train_file_path, index=False, encoding='UTF-8',
                          mode='w')  # training-set file
        test_data = pd.concat([X_test, Y_test], axis=1)
        test_data.to_csv(test_file_path, index=False, encoding='UTF-8',
                         mode='w')  # test-set file
        return ""

    # Feature handling
    def get_feature(self, feature1, feature2):
        feature3 = set(feature1) - set(feature2)   # requested but unavailable features
        feature4 = list(set(feature1) - feature3)  # requested and available features
        if len(feature4) <= 1:
            return "err"
        else:
            return feature4

    # Produce the tree plot (path1) and the feature-importance bar chart (path2)
    def process(self, train, path1, path2):
        fmap_filename = "picture/xgb_2.fmap"
        self.tree_pic(self.feature_cols, fmap_filename, path1)
        self.plot_feature_importance(train, path2)

    def train_model(self, train_file, model_file):
        train_data = pd.read_csv(train_file)
        x_train = train_data
        y_train = x_train.pop(Label)
        self.model.fit(x_train, y_train)
        # 'wb' creates the file if missing and overwrites any previous model
        with open(model_file, mode='wb') as f:
            pickle.dump(self.model, f)  # save the model
        return x_train

    def result(self, all_file, model_file, test_file_path, img_path):
        # Bail out unless both a model file and a test file are provided
        if model_file == "" or test_file_path == "":
            return "err"
        with open(model_file, 'rb') as f:
            self.model = pickle.load(f)  # load the model
        train_data = pd.read_csv(test_file_path)
        x_test = train_data
        y_test = x_test.pop(Label)
        y_pred = self.model.predict(x_test)  # run the model on the test set
        # Evaluation metrics
        accuracy = self.model.score(x_test, y_test)
        accuracy = '%.4f%%' % (accuracy * 100)
        ret = "Accuracy: {0}".format(accuracy)
        print("ret:", ret)
        y_pred = y_pred.reshape(y_pred.shape[0], 1)
        res = pd.read_csv(test_file_path, encoding='utf-8')
        # a = pd.read_csv(all_file, encoding='utf-8')
        # res = a.loc[y_test.index]
        res['pred'] = y_pred
        res = res.iloc[0:500]
        res.to_csv(img_path, mode='w', index=False, encoding='UTF-8')
        return ret

    def tree_pic(self, features, fmap_filename, path_1):
        # Write the feature-map file that XGBoost's plotting utilities expect
        with open(fmap_filename, 'w') as outfile:
            for i, feat in enumerate(features):
                outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        from xgboost import plot_tree
        plot_tree(self.model, num_trees=0, fmap=fmap_filename)
        fig = plt.gcf()
        fig.set_size_inches(15, 10)
        fig.savefig(path_1)
        # im = Image.open(path_1)
        # im.show()

    def plot_feature_importance(self, x_train, path2):
        plt.clf()  # clear the canvas
        feat_labels = x_train.columns
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        for f in range(x_train.shape[1]):
            # index names through `indices` so they match the sorted importances
            print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]],
                                    importances[indices[f]]))
        plt.title('Feature importance analysis', fontsize=18)
        plt.bar(range(x_train.shape[1]), importances[indices],
                color='lightblue', align='center')
        font2 = {'size': 18}
        plt.xlabel(u'Feature', font2)
        plt.ylabel(u'Importance', font2)
        plt.xticks(range(x_train.shape[1]), feat_labels[indices],
                   rotation=0, fontsize=16)
        plt.yticks(fontsize=18)
        plt.xlim([-1, x_train.shape[1]])
        plt.tight_layout()
        plt.savefig(path2)
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    # sklearn.cross_validation was removed; cross_val_score now lives in
    # sklearn.model_selection
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

##################################
xgbc = XGBClassifier(learning_rate=0.1)
xgbc.fit(X_trainval, y_trainval)
print("XGBoost test accuracy:", xgbc.score(X_test, y_test))  # 0.7872340425531915

rfc = RandomForestClassifier()
rfc.fit(X_trainval, y_trainval)
y_predst = rfc.predict(X_test)
# accuracy = accuracy_score(y_test, y_predst)
# print("Random forest accuracy: %.2f%%" % (accuracy * 100.0))
joblib.dump(rfc, 'xgb_down_model.joblib')  # note: saves the random forest despite the file name
predd_xgb = pd.DataFrame({'y_test': y_test, 'y_pred': y_predst})
predd_xgb.to_csv('xgb_predd.csv')
print("Random forest test accuracy:", rfc.score(X_test, y_test))  # 0.7811550151975684
########################################################################
class Trainer(object):
    def __init__(self, actions, c_config, c_i, data_manager):
        self.actions = actions
        self.c_config = c_config
        self.c_i = c_i
        self.data_manager = data_manager
        self.method = None
        # param = self.getparams()
        if c_i == 0:
            self.method = RandomForestClassifier(
                n_estimators=int(actions[0]),
                max_depth=int(actions[1]),
                min_samples_split=int(actions[2]),
                min_samples_leaf=int(actions[3]),
                max_features=actions[4],
                bootstrap=True,
                n_jobs=-1)
        elif c_i == 1:
            self.method = XGBClassifier(max_depth=int(actions[0]),
                                        learning_rate=float(actions[1]),
                                        n_estimators=int(actions[2]),
                                        gamma=float(actions[3]),
                                        min_child_weight=int(actions[4]),
                                        subsample=float(actions[5]),
                                        colsample_bytree=float(actions[6]),
                                        colsample_bylevel=float(actions[7]),
                                        reg_alpha=float(actions[8]),
                                        reg_lambda=float(actions[9]),
                                        nthread=-1)
        else:
            assert False, "Trainer.__init__: unexpected classifier index!"

    def getparams(self):
        param = []
        key_value = self.c_config.methods_dict[self.c_i][1:]
        assert len(self.actions) == len(key_value), \
            "Trainer.getparams: dimensions must match!"
        for i in range(len(key_value)):
            param.append(key_value[i][1][self.actions[i]])
        return param

    def run(self):
        self.fit()
        accuracy = self.estimate()
        return accuracy

    # Cross-validated variant of the training routine
    def run_CV(self):
        results = cross_val_score(self.method,
                                  self.data_manager.data_cv['data_cv'],
                                  self.data_manager.data_cv['labels_cv'],
                                  cv=2, n_jobs=1)
        accuracy = np.mean(results)
        return accuracy

    def fit(self):
        self.method.fit(self.data_manager.data_cv['data_cv'],
                        self.data_manager.data_cv['labels_cv'])

    def predict(self, x):
        return self.method.predict(x)

    def estimate(self):
        return self.method.score(self.data_manager.data_cv["data_test"],
                                 self.data_manager.data_cv["labels_test"])
        n_estimators=n_est,
        max_depth=5,             # max tree depth; usually 3-10
        min_child_weight=1,      # min sum of instance weights in a leaf; larger avoids overfitting, too large underfits
        gamma=0,                 # min loss reduction required to split; larger is more conservative
        subsample=0.8,           # row subsample per tree; smaller is more conservative, too small underfits; typical 0.5-1
        colsample_bytree=0.8,    # column subsample per tree
        objective='binary:logistic',  # binary classification
        nthread=4,               # number of threads
        scale_pos_weight=1,      # set to a positive value to speed convergence on heavily imbalanced classes
        seed=0)                  # random seed, for reproducibility
    print(get_time(), ' ', mall_id, ' starts...')
    train_time = time.time()
    clf.fit(x_train, y_train)
    train_time = time.time() - train_time
    print('time : ', train_time)
    score = clf.score(x_test, y_test)
    joblib.dump(clf, save_dir)
    print(get_time(), ' saved a model for ', mall_id, ' score: ', score)
    train_time = int(train_time)
    sql = ("UPDATE scores SET xgb='{s}', xgb_itr_times={t}, xgb_train_time={tt} "
           "WHERE mall_id='{m}'").format(s=score, t=n_est, m=mall_id, tt=train_time)
    cur.execute(sql)
    conn.commit()
    # print('test done... spent time = {s}'.format(s=get_time() - time))
else:
    print(mall_id, ' has already been handled.')
cur.close()
conn.close()
random_forest = RandomForestClassifier(random_state=1, n_estimators=45,
                                       min_samples_split=3, min_samples_leaf=2)
random_forest.fit(X, y)
score = random_forest.score(X, y)
Y_pred = random_forest.predict(X_test)

# In[14]:

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
score = xgb.score(X, y)
y_pred = xgb.predict_proba(X_test)

# In[15]:

print(score)

# In[21]:

# For Random forest
# Taking the 5 classes with highest probabilities
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_valid = scalar.transform(X_valid)
test_X = scalar.transform(test_X)

from xgboost.sklearn import XGBClassifier
from sklearn.metrics import roc_auc_score

modelXG = XGBClassifier()
modelXG.fit(X_train, Y_train)
Y_predXG = modelXG.predict(X_valid)
print("Train Accuracy: ", modelXG.score(X_train, Y_train))
print("Validation Accuracy: ", modelXG.score(X_valid, Y_valid))
print("AUROC Score of XGBoost = ", roc_auc_score(Y_valid, Y_predXG))

from sklearn.ensemble import RandomForestClassifier

modelRF = RandomForestClassifier()
modelRF.fit(X_train, Y_train)
Y_predRF = modelRF.predict(X_valid)
print("Train Accuracy: ", modelRF.score(X_train, Y_train))
print("Validation Accuracy: ", modelRF.score(X_valid, Y_valid))
print("AUROC Score of Random Forest = ", roc_auc_score(Y_valid, Y_predRF))
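# roc_auc_score above is fed hard 0/1 predictions, which collapses the ROC
# curve to a single operating point. Passing class-1 probabilities instead (a
# small adjustment, assuming binary labels) gives the usual AUROC; this sketch
# reuses the fitted models from the block above.
probaXG = modelXG.predict_proba(X_valid)[:, 1]
probaRF = modelRF.predict_proba(X_valid)[:, 1]
print("AUROC of XGBoost (probabilities) = ", roc_auc_score(Y_valid, probaXG))
print("AUROC of Random Forest (probabilities) = ", roc_auc_score(Y_valid, probaRF))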