def xgboost_classifier(self):
    cls = XGBClassifier()
    print('xgboost cross validation score',
          cross_val_score(cls, self.x_data, self.y_data))
    start_time = time.time()
    cls.fit(self.x_train, self.y_train)
    print('score', cls.score(self.x_test, self.y_test))
    print('time cost', time.time() - start_time)
n_jobs = -1  # use n_jobs = -1 for non-deep-learning models
max_depth = 7  # TODO: be sure to use CV later, and check feature importance
model = XGBClassifier(max_depth=max_depth,
                      learning_rate=learning_rate,
                      n_estimators=n_estimators,
                      n_jobs=n_jobs,
                      colsample_bylevel=colsample_bylevel,
                      colsample_bytree=colsample_bytree)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('score : ', score)
print(model.feature_importances_)  # can be visualized with plot_importance below
# print("------------------------------------")
# print(model.best_estimator_)
# print(model.best_params_)
# print("------------------------------------")
# 0.9649122807017544
plot_importance(model)
plt.show()
''' ### 3. Model training
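# The TODO above calls for cross-validation; a minimal sketch using
# scikit-learn's cross_val_score, assuming x_train/y_train and the
# hyperparameters defined above are in scope.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(XGBClassifier(max_depth=max_depth, n_jobs=n_jobs),
                            x_train, y_train, cv=5)  # 5-fold CV on the training split
print('CV scores:', cv_scores, 'mean:', cv_scores.mean())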
def home(request):
    if request.method == 'POST':
        username = request.POST['uname']
        contact = request.POST['contact']
        age = request.POST['age']
        email = request.POST['email']
        mean_radius = request.POST['mean_radius']
        mean_texture = request.POST['mean_texture']
        mean_perimeter = request.POST['mean_perimeter']
        mean_area = request.POST['mean_area']
        mean_smoothness = request.POST['mean_smoothness']
        diagnosis = 0
        print(f"{username} {contact} {age} {email} {mean_radius} "
              f"{mean_texture} {mean_perimeter} {mean_area} {mean_smoothness}")
        rows = [[mean_radius, mean_texture, mean_perimeter,
                 mean_area, mean_smoothness, diagnosis]]
        # Append the submitted measurements to the CSV file
        with open(filename, 'a+') as csvfile:
            csvwriter = csv.writer(csvfile)
            # csvwriter.writerow(fields)  # header row, written once
            csvwriter.writerows(rows)
        print('Data Added Successfully!!')
        # print(np.any(np.isnan(df)))
        # print(np.all(np.isfinite(df)))
        x = df.drop(['diagnosis'], axis=1)
        y = df['diagnosis']
        X_train, X_test, Y_train, Y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            random_state=0)
        sc_x = StandardScaler().fit(X_train)
        X_train = sc_x.transform(X_train)
        X_test = sc_x.transform(X_test)
        classifier = XGBClassifier()
        classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        Accuracy = classifier.score(X_test, Y_test).round(2)
        Patient_Id = str(username[0:2]) + str(random.randint(100001, 99999999999))
        print(Patient_Id)
        # NOTE: this branches on the model's overall test accuracy, not on a
        # prediction for the submitted patient's measurements.
        if Accuracy >= 0.70:
            msg = "Your Test Id is: " + str(Patient_Id)
            messages.success(request, "You Have Breast Cancer")
            messages.success(request, msg)
            obj = Report_Data.objects.create(Patient_Name=username,
                                             Patient_Id=Patient_Id,
                                             Email=email,
                                             Mobile_No=contact,
                                             Age=age,
                                             mean_radius=mean_radius,
                                             mean_texture=mean_texture,
                                             mean_perimeter=mean_perimeter,
                                             mean_area=mean_area,
                                             mean_smoothness=mean_smoothness,
                                             Test_Result='Positive')
        else:
            msg = "Your Test Id is: " + str(Patient_Id)
            messages.success(request,
                             "Congratulations, You Don't Have Breast Cancer!")
            messages.success(request, msg)
            obj = Report_Data.objects.create(Patient_Name=username,
                                             Patient_Id=Patient_Id,
                                             Email=email,
                                             Mobile_No=contact,
                                             Age=age,
                                             mean_radius=mean_radius,
                                             mean_texture=mean_texture,
                                             mean_perimeter=mean_perimeter,
                                             mean_area=mean_area,
                                             mean_smoothness=mean_smoothness,
                                             Test_Result='Negative')
    return render(request, 'index.html')
x_validate_scaled = scaler.transform(x_validate)

# XGBoost -----------------------------------------------------------------
print(" XGBOOST ... ")

# Training...........................................
print("Training...........................")
xgb_model = XGBClassifier()
xgb_model.fit(x_train_scaled, y_train_new)
with open(
        '/home/mkolpe2s/rand/Classic_ML/Proper_method/XGB/DCASE2018/XGB_DCASE2018_default_parameter.pkl',
        'wb') as f:
    pickle.dump(xgb_model, f)
print("Train score:", xgb_model.score(x_train_scaled, y_train_new))

# Validation..............................
print("Validation...........................")
print("Validation score:", xgb_model.score(x_validate_scaled, y_validate))

# Testing.................................
print("Testing...........................")
# Reuse the scaler fitted on the training data; fitting a new scaler on the
# test set would apply a different transformation than the model was trained on.
x_test_scaled = scaler.transform(x_test)
print("Test score:", xgb_model.score(x_test_scaled, y_test))

# Classification report...................
print("Classification report XGB default parameter.....................")
# data = data.drop(bottom_vars, axis=1)
# train_data = train_data.drop(bottom_vars, axis=1)
# test_data = test_data.drop(bottom_vars, axis=1)
# predict_data = predict_data.drop(bottom_vars, axis=1)
xgb_model = XGBClassifier(learning_rate=0.1,
                          n_estimators=550,
                          max_depth=7,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
xgb_model.fit(train_data, train_label)
print(xgb_model)

predictions = xgb_model.predict(predict_data)
output = pd.DataFrame({'tripid': predict_index, 'prediction': predictions})
output.to_csv('sample_submission_xgboost_tuned_new.csv', index=False)

print("score : " + str(xgb_model.score(train_data, train_label)))
pred = pd.DataFrame(xgb_model.predict(test_data))
print("Accuracy : " + str(metrics.accuracy_score(test_label, pred)))
print("F1 score : " + str(metrics.f1_score(test_label, pred)))
from xgboost import XGBClassifier
import time

# In[ ]:

xgb_model = XGBClassifier(n_jobs=4, n_estimators=250, max_depth=8, eta=0.1)

# In[ ]:

# Model fitting
xgb_model.fit(X_train, up_train)

# In[ ]:

xgb_model.score(X_test, up_test)

# In[ ]:

# Fetch test set
dayss = env.get_prediction_days()

# In[ ]:

# Preprocessing the test set for submission
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in dayss:
    n_days += 1
                        random_state=0)
gb_clf2.fit(x_train, y_train)
predictions = gb_clf2.predict(x_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification Report")
print(classification_report(y_test, predictions))

gb_clf2.feature_importances_
feat_importances = pd.Series(gb_clf2.feature_importances_, index=x.columns)
feat_importances.nlargest(5).plot(kind='barh')

x_train['date_diff_level'] = pd.to_numeric(x_train['date_diff_level'])
x_test['date_diff_level'] = pd.to_numeric(x_test['date_diff_level'])

xgb_clf3 = XGBClassifier()
xgb_clf3.fit(x_train, y_train)
score = xgb_clf3.score(x_test, y_test)
print(score)

xgb_clf3.feature_importances_
feat_importances = pd.Series(xgb_clf3.feature_importances_, index=x.columns)
feat_importances.nlargest(5).plot(kind='barh')
"""END OF CODE """
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# --------------
from xgboost import XGBClassifier

# Code starts here
# rf = RandomForestClassifier()
xgb = XGBClassifier(learning_rate=0.0001)
xgb.fit(X_train, y_train)
accuracy = xgb.score(X_test, y_test)
y_pred = xgb.predict(X_test)

# Store the different evaluation values.
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Plot the auc-roc curve (note the argument order: y_true comes first)
score = roc_auc_score(y_test, y_pred)
class XGBoost_classifier:
    def __init__(self):
        self.XGBoost_clf = XGBClassifier(max_depth=5,
                                         learning_rate=0.1,
                                         n_estimators=100,
                                         silent=True,
                                         objective='binary:logistic')
        self.standardScaler = StandardScaler()
        self.train_score = None
        self.isload_ = False

    def train(self, model_folder, train_feature_add):
        """
        Train the XGBoost model.
        :param train_feature_add: path to the training feature file
        :param model_folder: folder where the model is stored
        :return:
        """
        train_df = pd.read_csv(train_feature_add, index_col=['domain_name'])
        # Fill missing values with numeric 0.0 (a string would turn the
        # columns into object dtype, which XGBoost cannot consume)
        train_df = train_df.fillna(0.0)
        x_train = train_df.drop(['label'], axis=1).values
        y_train = train_df['label'].values
        print("_______XGBoost Training_______")
        self.XGBoost_clf.fit(x_train, y_train)
        mal_scores = np.array(self.XGBoost_clf.predict_proba(x_train))[:, 1]
        mal_scores = sorted(mal_scores)
        np.save(r"{}/XGBoost_train_scores.npy".format(model_folder), mal_scores)
        pickle.dump(self.XGBoost_clf,
                    open("{}/XGBoost_model.pkl".format(model_folder), 'wb'))

    def load(self, model_folder):
        """
        Load the model file and the fitted scaler into memory.
        Note: the scaler pickle is assumed to have been fitted and saved
        elsewhere; train() above does not produce it.
        :param model_folder: folder where the model and scaler are stored
        :return:
        """
        self.XGBoost_clf = pickle.load(
            open("{}/XGBoost_model.pkl".format(model_folder), 'rb'))
        self.standardScaler = pickle.load(
            open("{}/standardscalar.pkl".format(model_folder), 'rb'))
        self.train_score = np.load(
            r"{}/XGBoost_train_scores.npy".format(model_folder))
        self.isload_ = True

    def predict(self, model_folder, test_feature_add):
        """
        Run the model on the test set and report accuracy and related metrics.
        :param test_feature_add: path to the test feature file
        :return:
        """
        self.load(model_folder)
        test_df = pd.read_csv(test_feature_add, index_col=['domain_name'])
        test_df = test_df.fillna(0.0)
        x_test = test_df.drop(['label'], axis=1).values
        y_test = test_df['label'].values
        print("_______XGBoost Predicting_______")
        y_predict = self.XGBoost_clf.predict(x_test)
        print("XGBoost accuracy: ", self.XGBoost_clf.score(x_test, y_test))
        print("XGBoost precision: ",
              precision_score(y_test, y_predict, average='macro'))
        print("XGBoost recall: ",
              recall_score(y_test, y_predict, average='macro'))
        print("XGBoost F1: ", f1_score(y_test, y_predict, average='macro'))
        print("XGBoost TPR, FPR, thresholds: ",
              roc_curve(y_test, y_predict, pos_label=1))
        plot_roc_curve(self.XGBoost_clf, x_test, y_test)
        plt.show()

    def predict_singleDN(self, model_folder, dname):
        """
        Classify a single domain name; print and return the label,
        malicious probability and p-value.
        :param dname: domain name
        :return:
        """
        if not self.isload_:
            self.load(model_folder)
        dname = dname.strip('/').strip('.')
        dname = dname.replace("http://", '')
        dname = dname.replace("www.", "")
        dname = wash_tld(dname)
        if dname == "":
            label = 0
            prob = 0.0000
            p_value = 1.0000
            print("\nxgboost sld:", dname)
            # print("label:", label)
            # print("mal_prob:", prob)
            # print("p_value:", p_value)
            print('label:{}, pro:{}, p_value:{}'.format(label, prob, p_value))
            return label, prob, p_value
        else:
            feature = self.standardScaler.transform(
                pd.DataFrame([phishing_get_feature(dname)]))
            label = self.XGBoost_clf.predict(feature)
            prob = self.XGBoost_clf.predict_proba(feature)
            p_value = cal_pValue(self.train_score, prob[0][1], label[0])
            print("\nxgboost sld:", dname)
            # print("label:", label[0])
            # print("mal_prob:", prob[0][1])
            # print("p_value:", p_value)
            print('label:{}, pro:{}, p_value:{}'.format(
                label[0], prob[0][1], p_value))
            return label[0], prob[0][1], p_value
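# Hedged usage sketch for the class above; the folder and CSV paths are
# hypothetical placeholders. predict_singleDN() additionally expects the
# standardscalar.pkl produced elsewhere to be present in the model folder.
clf = XGBoost_classifier()
clf.train("model_folder", "train_features.csv")
clf.predict("model_folder", "test_features.csv")
label, prob, p_value = clf.predict_singleDN("model_folder", "http://www.example.com/")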
from sklearn.model_selection import train_test_split
from baseline.utils import fetch_data

X, y, ot = fetch_data("../data/features")
print(X.shape, y.shape)
X_train, X_test, y_train, y_test, _, ot_test = train_test_split(
    X, y, ot, test_size=0.2, shuffle=True)
print(X_train.shape, y_train.shape)

model = XGBClassifier(objective="multi:softprob")
model.fit(X_train, y_train)
# score() returns a fraction, so scale by 100 for a percentage
print("Samples: %d, Accuracy: %.2f%%" %
      (len(X_test), model.score(X_test, y_test) * 100))

pred = model.predict(X_test)
ot_num, ot_acc = [0] * 12, [0] * 12
for i in range(len(pred)):
    ot_num[ot_test[i]] += 1
    if pred[i] == y_test[i]:
        ot_acc[ot_test[i]] += 1
ot_acc = [
    round(ot_acc[i] / ot_num[i], 4) if ot_num[i] else 0
    for i in range(len(ot_num))
]
print("ot-acc distribution")
print(ot_acc)
plt.show()

# XGBOOST
from xgboost import XGBClassifier

# let's find the best n_estimators parameter
n_estimators = {}
for n in range(10, 160, 10):
    XGB = XGBClassifier(n_estimators=n)
    XGB.fit(X_train, y_train)
    n_estimators[n] = [XGB.score(X_test, y_test),
                       roc_auc_score(y_test, XGB.predict(X_test))]

n_estimators = pd.DataFrame(list(n_estimators.items()),
                            columns=["n", "Accuracy"])
n_estimators["Roc Score"] = n_estimators["Accuracy"].apply(lambda x: x[1])
n_estimators["Accuracy"] = n_estimators["Accuracy"].apply(lambda x: x[0])
n_estimators.set_index("n", inplace=True)
ax = n_estimators.plot()
ax.set_title("ROC/ACC scores for different n_estimators parameters",
             fontdict={"fontsize": 12, "fontweight": "bold"})
# n_estimators = 30 looks good
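# Note: the loop above selects n_estimators by scoring on the test set, which
# leaks test information into model selection. A hedged alternative sketch
# using GridSearchCV on the training split (assumes X_train/y_train above):
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(XGBClassifier(),
                    {"n_estimators": list(range(10, 160, 10))},
                    cv=5, scoring="roc_auc")
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)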
        c[1] += 1
print(c)

# KNN
knn = KNeighborsClassifier()
knn.fit(Train_data, Train_label)
knn_pred = knn.predict(Test_data)
print('KNN_Acc:', accuracy_score(Test_label, knn_pred))
print('KNN_F1:', f1_score(Test_label, knn_pred, average='micro'))
knn.fit(New_Data, New_Label)
New_knn_pred = knn.predict(Test_data)
print('New_KNN_Acc:', accuracy_score(Test_label, New_knn_pred))
print('New_Knn_F1:', f1_score(Test_label, New_knn_pred, average='micro'))

# XGB
XGB = XGBClassifier()
XGB.fit(Train_data, Train_label)
print('XGB_Acc:', XGB.score(Test_data, Test_label))
print('XGB_F1:', f1_score(Test_label, XGB.predict(Test_data), average='micro'))
XGB.fit(New_Data, New_Label)
print('New_XGB_Acc:', XGB.score(Test_data, Test_label))

# RFC
rfc = RandomForestClassifier()
rfc.fit(Train_data, Train_label)
print('RFC_Acc:', rfc.score(Test_data, Test_label))
print('RFC_F1:', f1_score(Test_label, rfc.predict(Test_data), average='micro'))
rfc.fit(New_Data, New_Label)
print('New_Rfc_Acc:', rfc.score(Test_data, Test_label))

# SVM
svm = SVC(kernel='linear', probability=True)
svm.fit(Train_data, Train_label)
svm_pred = svm.predict(Test_data)
print('SVM_Acc', accuracy_score(Test_label, svm_pred))
print(datetime.datetime.now())
print('\n')
print('XGB Classifier')
print('\n')
xgb_cls = XGBClassifier(objective="multi:softprob",
                        num_class=20,
                        random_state=61,
                        colsample_bytree=0.6,
                        learning_rate=0.1,
                        n_estimators=200,
                        max_depth=8,
                        alpha=0.01,
                        gamma=0.001,
                        subsample=0.6)
xgb_cls.fit(X_train, y_train)
print("Accuracy on training set is : {}".format(xgb_cls.score(X_train, y_train)))
print("Accuracy on test set is : {}".format(xgb_cls.score(X_test, y_test)))
y_pred = xgb_cls.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# plot statistics on the results of the model run
plot_accuracy_and_loss(model, training_fit, test_features,
                       test_labels_one_hot, test_labels)
print(datetime.datetime.now())
"max_depth": [4, 5, 6], "colsample_bytree": [0.6, 0.9, 1], "colsample_bylevel": [0.6, 0.7, 0.9] }] n_jobs = -1 # CV 써라 # XGB 속도가 굉장히 빠름, 전처리 결측치 제거 안해줘도 됨 model = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, colsample_bylevel=colsample_bylevel, colsample_bytree=colsample_bytree) model = GridSearchCV(XGBClassifier(), parameters, cv=5, n_jobs=-1) model.fit(x_train, y_train) print("=================================") print(model.best_estimator_) print("=================================") print(model.best_params_) print("=================================") score = model.score(x_test, y_test) # score는 evaluate print('점수 :', score) # print(model.feature_importances_) # plot_importance(model) # plt.show()
class BoostingDecisionMaker:

    def __init__(self, folder="", n_estimators=50, max_depth=5,
                 monotonous_features=""):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.folder = folder
        self.monotonous_features = utils.transform_string_feature_range_into_list(
            monotonous_features)
        if not folder.strip():
            self.xg_boost = XGBClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          random_state=43)
        else:
            self.load_model(folder)

    def get_feature_ids(self):
        return utils.transform_string_feature_range_into_list(self.feature_ids)\
            if isinstance(self.feature_ids, str) else self.feature_ids

    def add_config_info(self, full_config, features):
        self.full_config = full_config
        self.feature_ids = features

    def load_model(self, folder):
        self.folder = folder
        with open(os.path.join(folder, "boost_model.pickle"), "rb") as f:
            self.n_estimators, self.max_depth, self.xg_boost = pickle.load(f)
        with open(os.path.join(folder, "data_features_config.pickle"), "rb") as f:
            self.full_config, self.feature_ids, self.monotonous_features = pickle.load(f)

    def save_model(self, folder):
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(os.path.join(folder, "boost_model.pickle"), "wb") as f:
            pickle.dump([self.n_estimators, self.max_depth, self.xg_boost], f)
        with open(os.path.join(folder, "data_features_config.pickle"), "wb") as f:
            pickle.dump(
                [self.full_config, self.feature_ids, self.monotonous_features], f)

    def train_model(self, train_data, labels):
        # Build the monotone-constraints string, e.g. "(1,0,1)"
        mon_features = [(1 if feature in self.monotonous_features else 0)
                        for feature in self.get_feature_ids()]
        mon_features_prepared = "(" + ",".join([str(f) for f in mon_features]) + ")"
        self.xg_boost = XGBClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=43,
            monotone_constraints=mon_features_prepared)
        self.xg_boost.fit(train_data, labels)
        logger.info("Train score: %s", self.xg_boost.score(train_data, labels))
        logger.info("Feature importances: %s", self.xg_boost.feature_importances_)

    def validate_model(self, valid_test_set, valid_test_labels):
        res, res_prob = self.predict(valid_test_set)
        # note: score() returns mean accuracy, not F1
        logger.info("Valid dataset score: %s",
                    self.xg_boost.score(valid_test_set, valid_test_labels))
        logger.info(confusion_matrix(valid_test_labels, res))
        logger.info(classification_report(valid_test_labels, res))

    def predict(self, data):
        if not len(data):
            return [], []
        return self.xg_boost.predict(data), self.xg_boost.predict_proba(data)
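# Hedged usage sketch for BoostingDecisionMaker; `train_data`, `labels`,
# `valid_data`, `valid_labels` and the folder name are hypothetical
# placeholders, and the feature ids are illustrative.
dm = BoostingDecisionMaker(n_estimators=50, max_depth=5)
dm.add_config_info(full_config={}, features=[0, 1, 2])
dm.train_model(train_data, labels)           # 2-D features, binary labels
dm.validate_model(valid_data, valid_labels)
dm.save_model("boost_model_folder")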
import pandas as pd

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
X['age'].fillna(X['age'].mean(), inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=33)

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('The accuracy of Random Forest Classifier on testing set:',
      rfc.score(X_test, y_test))

from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print('The accuracy of eXtreme Gradient Boosting Classifier on testing set:',
      xgbc.score(X_test, y_test))
# Train on all features
X = data_20[feature_names]
y = data_20[data_20.columns[-1]]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=66)

# Train the model
model = XGBClassifier(learning_rate=0.09,
                      n_estimators=110,
                      max_depth=7,
                      min_child_weight=1,
                      gamma=0.3,
                      reg_alpha=1e-05,
                      subsample=0.61,
                      colsample_bytree=0.7,
                      seed=150)
model.fit(X_train, y_train)

# Evaluate
print("Training set accuracy: {:.3f}".format(model.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(model.score(X_test, y_test)))

# # Prediction
# X_data = []
# X_new = pd.DataFrame(X_data, columns=columns)
# # The shape of incoming values is unknown, so for now assume the x and y
# # values arrive on separate rows and set the columns accordingly.
# # Revisit once the actual input format is known.
# pre = model.predict(X_new)
# print(pre)
# Confusion Matrix
print(confusion_matrix(y_test, y_pred))

"""#### XGBoost"""

from xgboost import XGBClassifier

modelXGB = XGBClassifier()
modelXGB.fit(X_train, y_train)
y_pred = modelXGB.predict(X_test)

# Accuracy Score
modelXGB.score(X_test, y_test)

# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))

"""### Test Data and Submission File"""

test = pd.read_csv("/content/drive/MyDrive/dataset/test.csv",
                   na_values=['?', '-999', 'Error', 'xxxxxxxx'])
test_1 = pd.read_csv("/content/drive/MyDrive/dataset/test.csv")
test.head()

test = test.drop(["customer_id", "Name", "security_no",
                  "referral_id", "last_visit_time"], axis=1)
#                     colsample_bynode=1, colsample_bytree=0.8, gamma=0.1,
#                     learning_rate=0.05, max_delta_step=0, max_depth=5,
#                     min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
#                     nthread=-1, objective='binary:logistic', random_state=0,
#                     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=10,
#                     silent=None, subsample=0.8, verbosity=1)

# XGB = xgb.XGBClassifier(learning_rate=0.02, max_bin=10, num_leaves=16,
#                         subsample=0.7, max_depth=4, subsample_freq=2,
#                         colsample_bytree=0.3, min_child_samples=500, seed=99,
#                         n_estimators=300, objective='binary:logistic', n_jobs=-1)
XGB = XGBClassifier(gamma=0.1,
                    subsample=1.0,
                    colsample_bytree=1.0,
                    n_estimators=500,
                    max_depth=10,
                    min_child_weight=10,
                    learning_rate=0.01,
                    objective='binary:logistic',
                    n_jobs=-1)
XGB.fit(X_train, y_train)

# Make predictions
predictions = XGB.predict(X_val)
probs = XGB.predict_proba(X_val)
display(predictions)

score = XGB.score(X_val, y_val)
print("Accuracy: ", score)
print(classification_report(y_val, predictions))

data['churn'].value_counts()

# confusion_matrix(y_val, predictions)
# display(confusion_matrix)

# Plot the ROC curve
fpr, tpr, threshold = roc_curve(y_val, probs[:, 1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
titanic = pd.read_csv("../data/titanic/titanic.txt") # 抽取pclass age 和 sex 作为训练样本 x = titanic[["pclass", "age", "sex"]] y = titanic["survived"] # 采集的age空的用平均数补全 x["age"].fillna(x["age"].mean(), inplace=True) # 分割训练数据和测试数据 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=33) # 提取字典特征 进行 向量化 vec = DictVectorizer() x_train = vec.fit_transform(x_train.to_dict(orient="record")) x_test = vec.transform(x_test.to_dict(orient="record")) # 采用默认配置的随机森林进行预测 rfc = RandomForestClassifier() rfc.fit(x_train, y_train) print("随机森林预测准确率:", rfc.score(x_test, y_test)) # 0.7811550151975684 # 采用XGBoost模型进行预测 xgbc = XGBClassifier() xgbc.fit(x_train, y_train) print("XGBoost预测准确率:", xgbc.score(x_test, y_test)) # 0.7872340425531915
data.info()
print(data.groupby("cardio").size())
data.dtypes
print(data.dtypes)

y = data['cardio']
X = data.drop(['cardio'], axis=1)
# Inspect the data shape and class distribution
print("Shape of X: {0}; positive example: {1}; negative: {2}".format(
    X.shape, y[y == 1].shape[0], y[y == 0].shape[0]))

# split the data
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('cardio', axis=1), data['cardio'],
    test_size=.2, random_state=10)

model = XGBClassifier()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("train score: {train_score:.6f}; test score: {test_score:.6f}".format(
    train_score=train_score, test_score=test_score))

# Model prediction
y_pred = model.predict(X_test)
print("matches: {0}/{1}".format(np.sum(y_pred == y_test), y_test.shape[0]))
xg_result = accuracy_score(y_test, y_pred)
print("Accuracy:", xg_result)
f1_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the imported function
# start = time.time()
# model = XGBClassifier(n_jobs=i)
# model.fit(x_train, y_train)
# acc = model.score(x_test, y_test)
# print('with n_jobs =', f'{i}')
# print(time.time() - start, 'seconds')

model = XGBClassifier(n_jobs=-1)
model.fit(x_train, y_train)

# 4. Evaluate and predict
acc = model.score(x_test, y_test)
print(model.feature_importances_)
print("acc :", acc)

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
new_data = []
feature = []
# Keep only the features whose importance is above the 25th percentile
a = np.percentile(model.feature_importances_, q=25)
for i in range(len(dataset.data[0])):
    if model.feature_importances_[i] > a:
        new_data.append(df.iloc[:, i])
        feature.append(dataset.feature_names[i])
new_data = pd.concat(new_data, axis=1)
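# Hedged alternative to the manual percentile filter above, using
# scikit-learn's SelectFromModel with the already-fitted model. Note that
# SelectFromModel keeps features with importance >= threshold, while the loop
# above uses a strict >.
from sklearn.feature_selection import SelectFromModel

threshold = np.percentile(model.feature_importances_, 25)
selector = SelectFromModel(model, threshold=threshold, prefit=True)
x_selected = selector.transform(df.values)  # columns above the threshold
print(x_selected.shape)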
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_all, Y_all,
                                                    test_size=0.2,
                                                    random_state=0)

# In[55]:

from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, Y_train)

# In[56]:

model.score(X_test, Y_test)

# In[57]:

predictions = model.predict(X_test)
predictions

# In[58]:

submission = pd.DataFrame({
    'shot_id_number': X_test['shot_id_number'],
    'is_goal': predictions
})

# In[59]:
from xgboost import XGBClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Data
datasets = load_wine()
x = datasets.data
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

# 2. Model
model = XGBClassifier(n_estimators=100, learning_rate=0.01, n_jobs=-1)

# 3. Training
model.fit(x_train, y_train,
          verbose=1,
          eval_metric=['merror', 'mlogloss'],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=5)

# 4. Evaluation
result1 = model.score(x_test, y_test)
print("result1 : ", result1)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)

result2 = model.evals_result()
print("result2 : ", result2)

# result1 : 0.9722222222222222
# acc : 0.9722222222222222
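# Hedged sketch: plot the per-round metrics returned by evals_result() above.
# 'validation_0'/'validation_1' are the keys xgboost assigns to the two
# eval_set entries; assumes matplotlib is installed.
import matplotlib.pyplot as plt

results = model.evals_result()
rounds = range(len(results['validation_0']['mlogloss']))
plt.plot(rounds, results['validation_0']['mlogloss'], label='train')
plt.plot(rounds, results['validation_1']['mlogloss'], label='test')
plt.xlabel('boosting round')
plt.ylabel('mlogloss')
plt.legend()
plt.show()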
# Fitting XGBoost classification to the training set
classifier = XGBClassifier(learning_rate=0.5,
                           n_estimators=1000,
                           max_depth=5,
                           min_child_weight=2,
                           gamma=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='binary:logistic',
                           scale_pos_weight=1,
                           seed=27)
start = time()
classifier.fit(X_train, y_train)
train_time = time() - start

start = time()
score = classifier.score(X_test, y_test)
score_time = time() - start

print("score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
    score, train_time, score_time))

# Calculating feature importance
feature_name = cv.get_feature_names()
feature_name = np.array(feature_name)
feature_name = np.insert(feature_name, 0, "avg_star_rating", axis=0)
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
feature_name = np.array(feature_name)
for f in range(100):
    print("%2d) %-*s %f" % (f + 1, 30, feature_name[indices[f]],
                            importances[indices[f]]))
# 1) love 0.074280
model = XGBClassifier(n_estimators=200,
                      max_depth=5,
                      # objective='multi:softprob',
                      nthread=4)
# model = XGBClassifier()  # default-parameter alternative

if not os.path.isfile('model.bin'):
    print('start training')
    model.fit(x_train, y_train, eval_set=evallist,
              verbose=True, early_stopping_rounds=20)
    print('validation result')
    print(model.score(x_valid, y_valid))
    model.save_model('model.bin')
else:
    booster = Booster()
    booster.load_model('model.bin')
    model._Booster = booster

print('check Nematostella vectensis seq')
nv_seq = "TSPDIMSSSFYIDSLISKAKSVPTSTSEPRHTYESPVPCSCCWTPTQPDPSSLCQLCIPTSASVHPYMHHVRGASIPSGAGLYSRELQKDHILLQQHYAATEEERLHLASYASSRDPDSPSRGGNSRSKRIRTAYTSMQLLELEKEFSQNRYLSRLRRIQIAALLDLSEKQVKIWFQNRRVKWKKDKKAAQHGTTTETSSCPSSPASTGRMDGV"
nv_vec = dataset.one_hot(nv_seq)
predict_data = np.asarray([nv_vec] * 2)
prediction = model.predict_proba(predict_data)
print('prediction: ')
print(prediction)
ipdb.set_trace()
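# Hedged sketch: newer xgboost releases expose load_model() directly on the
# sklearn wrapper, which avoids assigning to the private `_Booster` attribute
# as done in the else-branch above.
model = XGBClassifier()
model.load_model('model.bin')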
# In[ ]:

logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, Y_train)
Y_prediction = logisticRegression.predict(X_test)
logisticRegression.score(X_train, Y_train)

# In[ ]:

xgBoost = XGBClassifier()
xgBoost.fit(X_train, Y_train)
Y_prediction = xgBoost.predict(X_test)
xgBoost.score(X_train, Y_train)

# ----
# <a id='validatingModel'></a>
# ## Validating Model
# I have evaluated 3 models and their scores. You can see from the scores above
# that the random forest classifier is the best of the 3 for this dataset.

# In[ ]:

acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(round(acc_random_forest, 2), "%")
plt.show()

final_xgb = XGBClassifier(learning_rate=0.05,
                          n_estimators=450,
                          max_depth=1,
                          min_child_weight=4,
                          gamma=0,
                          subsample=0.9,
                          colsample_bytree=0.1,
                          objective='multi:softmax',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)
final_xgb.fit(X_train, Y_train)
print(final_xgb.score(X_test, Y_test))

xgb_feat_imps = final_xgb.feature_importances_
importances = list(xgb_feat_imps)
feature_importances = [
    (feature, round(importance, 4))
    for feature, importance in zip(X_train.columns, importances)
]
# [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
feature_importances = sorted(feature_importances,
                             key=lambda x: x[1],
                             reverse=True)
print(len(feature_importances))
[
    print('Variable: {:40} Importance: {}'.format(*pair))
    for pair in feature_importances
]
model_xgb_rs.fit(X_train, y_train)
print('The best parameters for XG Boost are : ', model_xgb_rs.best_params_)

# In[99]:

model_xgb_best = XGBClassifier(learning_rate=0.5,
                               max_depth=3,
                               n_estimators=30,
                               booster='gbtree',
                               random_state=21)
model_xgb_best.fit(X_train, y_train)
print('The train score is : ',
      "{0:.2f}%".format(round(model_xgb_best.score(X_train, y_train), 4) * 100))
print('The Test score is : ',
      "{0:.2f}%".format(round(model_xgb_best.score(X_test, y_test), 4) * 100))
train_acc_xgb = "{0:.2f}%".format(
    round(model_xgb_best.score(X_train, y_train), 4) * 100)
test_acc_xgb = "{0:.2f}%".format(
    round(model_xgb_best.score(X_test, y_test), 4) * 100)

# ## Confusion Matrix XGB

# In[100]:

y_test_predicted_xgb = model_xgb_best.predict(X_test)
cf_xgb = confusion_matrix(y_test, y_test_predicted_xgb)
cf_xgb
                                                    train_size=0.8,
                                                    random_state=44)

# Timing (compare n_jobs = -1, 1, 4, 8)
# ------------------------------------------------------------------------------
start_time = timeit.default_timer()  # record the start time

# 2. Model
# model = DecisionTreeClassifier(max_depth=4)
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

# Training
model.fit(x_train, y_train, eval_metric='logloss')

# 4. Evaluate and predict
acc = model.score(x_test, y_test)  # same as model.evaluate
print(model.feature_importances_)  # more features is not always better (can overfit)
print('acc: ', acc)
# feature = x_train, x_test, y_train, y_test (max_depth=4)

# ============= timing ====================================
terminate_time = timeit.default_timer()  # record the end time
print("Took %f seconds." % (terminate_time - start_time))
# ==========================================================

import matplotlib.pyplot as plt
import numpy as np


def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]
    # y
delimiter=";") X = dataset[:, 0:-1] y = dataset[:, -1] Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42) kfold = KFold(Xtrain.shape[0], n_folds=10, random_state=42) best_model = None best_score = 0.0 for curr_fold, (train_cv, test_cv) in enumerate(kfold): Xtrain_cv, Xtest_cv, ytrain_cv, ytest_cv = \ Xtrain[train_cv], Xtrain[test_cv], ytrain[train_cv], ytrain[test_cv] clf = XGBClassifier() clf.fit(Xtrain_cv, ytrain_cv) score = clf.score(Xtest_cv, ytest_cv) print("Fold {:d}, score: {:.3f}".format(curr_fold, score)) if score > best_score: best_score = score best_model = clf y_ = best_model.predict(Xtest) print("Accuracy: {:.3f}".format(accuracy_score(ytest, y_))) print() print("Confusion Matrix") print(confusion_matrix(ytest, y_)) print() print("Classification Report") print(classification_report(ytest, y_)) with open(os.path.join(DATA_DIR, "best-model.pkl"), "wb") as fmod:
np.random.seed(23)

# for i in range(1, len(CHANNEL_RANGE)):
#     print(i)
train_ori_vec = ori_feature_1
train_steg_vec = stego_feature_1
train_sample = np.concatenate((train_ori_vec, train_steg_vec), axis=0)
train_label = np.concatenate(
    (0 * np.ones(len(train_ori_vec)), 1 * np.ones(len(train_steg_vec))),
    axis=0)

# XGB
xgb = XGBClassifier(n_estimators=1000,
                    learning_rate=0.1,
                    min_child_weight=5,
                    max_depth=4,
                    gamma=0.1,
                    subsample=0.7,
                    colsample_bytree=0.7)

# Shuffle the samples and labels together
idx = np.random.permutation(len(train_sample))
train_sample = train_sample[idx]
train_label = train_label[idx]

xgb.fit(train_sample, train_label)
print("TRAIN SCORE:", xgb.score(train_sample, train_label))

# f = open("ensemble_clf.pkl", "wb")  # for 3 pixelHOP with 1000 training samples
f = open("ensemble_clf_singPH1_holistic.pkl", "wb")
pickle.dump(xgb, f)
f.close()
        n_iter=1,
        train_size=0.75,
        test_size=0.25,
        random_state=dataset_repeat)))

training_features = input_data.loc[training_indices].drop('class', axis=1).values
training_classes = input_data.loc[training_indices, 'class'].values
testing_features = input_data.loc[testing_indices].drop('class', axis=1).values
testing_classes = input_data.loc[testing_indices, 'class'].values

# Create and fit the model on the training data
try:
    clf = XGBClassifier(learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        max_depth=max_depth)
    clf.fit(training_features, training_classes)
    testing_score = clf.score(testing_features, testing_classes)
except Exception:
    continue

param_string = ''
param_string += 'learning_rate={},'.format(learning_rate)
param_string += 'n_estimators={},'.format(n_estimators)
param_string += 'max_depth={}'.format(max_depth)

out_text = '\t'.join([dataset.split('/')[-1][:-7],
                      'XGBClassifier',
                      param_string,
                      str(testing_score)])
print(out_text)
x = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
x['age'].fillna(x['age'].mean(), inplace=True)

# sklearn.cross_validation was removed; train_test_split now lives in
# sklearn.model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=33)

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# Convert the DataFrame rows into dicts before vectorizing
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('The accuracy of random forest classifier on testing set:',
      rfc.score(X_test, y_test))

from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print('The accuracy of extreme gradient boosting classifier on testing set:',
      xgbc.score(X_test, y_test))