Example #1
 def xgboost_classifier(self):
     # Assumes XGBClassifier, cross_val_score, and time are imported at module level.
     cls = XGBClassifier()
     print('xgboost cross validation score', cross_val_score(cls, self.x_data, self.y_data))
     start_time = time.time()
     cls.fit(self.x_train, self.y_train)
     print('score', cls.score(self.x_test, self.y_test))
     print('time cost', time.time() - start_time)
Example #2
n_jobs = -1  # use n_jobs = -1 when not training a deep-learning model
max_depth = 7

# TODO: be sure to add cross-validation later, and use feature importance too
# (see the cross-validation sketch at the end of this example).

# learning_rate, n_estimators, colsample_bylevel, and colsample_bytree are
# assumed to be defined earlier (not shown in this fragment).
model = XGBClassifier(max_depth=max_depth,
                      learning_rate=learning_rate,
                      n_estimators=n_estimators,
                      n_jobs=n_jobs,
                      colsample_bylevel=colsample_bylevel,
                      colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)
print('score : ', score)

print(model.feature_importances_)  # the same information can be shown with plot_importance below

#print("------------------------------------")
##print(model.best_estimator_)
#print(model.best_params_)
#print("------------------------------------")

#0.9649122807017544

plot_importance(model)
plt.show()
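
# The TODO above asks for cross-validation; a minimal sketch, assuming the
# same model and x_train/y_train from this example are in scope:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, x_train, y_train, cv=5)
print('CV scores:', cv_scores, 'mean:', cv_scores.mean())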
Example #3
def home(request):

    if request.method == 'POST':

        username = request.POST['uname']
        contact = request.POST['contact']
        age = request.POST['age']
        email = request.POST['email']
        mean_radius = request.POST['mean_radius']
        mean_texture = request.POST['mean_texture']
        mean_perimeter = request.POST['mean_perimeter']
        mean_area = request.POST['mean_area']
        mean_smoothness = request.POST['mean_smoothness']
        diagnosis = 0

        print(
            f"{username} {contact} {age} {email} {mean_radius} {mean_texture} {mean_perimeter} {mean_area} {mean_smoothness}"
        )

        rows = [[
            mean_radius, mean_texture, mean_perimeter, mean_area,
            mean_smoothness, diagnosis
        ]]

        # name of csv file: 'filename' is assumed to be defined at module level

        # writing to csv file
        with open(filename, 'a+') as csvfile:
            # creating a csv writer object
            csvwriter = csv.writer(csvfile)

            # writing the fields
            # csvwriter.writerow(fields)

            # writing the data rows
            csvwriter.writerows(rows)
            print('Data Added Successfully!!')

        # print(np.any(np.isnan(df)))
        # print(np.all(np.isfinite(df)))
        x = df.drop(['diagnosis'], axis=1)  # 'df' is assumed to be loaded at module level
        y = df['diagnosis']
        X_train, X_test, Y_train, Y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        sc_x = StandardScaler().fit(X_train)
        X_train = sc_x.transform(X_train)
        X_test = sc_x.transform(X_test)

        classifier = XGBClassifier()
        classifier.fit(X_train, Y_train)
        predictions = classifier.predict(X_test)
        Accuracy = classifier.score(X_test, Y_test).round(2)

        Patient_Id = str(username[0:2]) + \
            str(random.randint(100001, 99999999999))
        print(Patient_Id)

        # NOTE: this branches on the model's overall test accuracy rather than
        # on a prediction for the submitted record, which looks like a bug in
        # the original source.
        if Accuracy >= 0.70:
            msg = "Your Test Id is: " + str(Patient_Id)
            messages.success(request, "You Have Breast Cancer")
            messages.success(request, msg)
            obj = Report_Data.objects.create(Patient_Name=username,
                                             Patient_Id=Patient_Id,
                                             Email=email,
                                             Mobile_No=contact,
                                             Age=age,
                                             mean_radius=mean_radius,
                                             mean_texture=mean_texture,
                                             mean_perimeter=mean_perimeter,
                                             mean_area=mean_area,
                                             mean_smoothness=mean_smoothness,
                                             Test_Result='Positive')
        else:

            msg = "Your Test Id is: " + str(Patient_Id)
            messages.success(request,
                             "Congratulation You Don't Have Breast Cancer!")
            messages.success(request, msg)
            obj = Report_Data.objects.create(Patient_Name=username,
                                             Patient_Id=Patient_Id,
                                             Email=email,
                                             Mobile_No=contact,
                                             Age=age,
                                             mean_radius=mean_radius,
                                             mean_texture=mean_texture,
                                             mean_perimeter=mean_perimeter,
                                             mean_area=mean_area,
                                             mean_smoothness=mean_smoothness,
                                             Test_Result='Negative')

    return render(request, 'index.html')
Example #4
x_validate_scaled = scaler.transform(x_validate)

# XGBoost ---------------------------------------------------------------------
print(" XGBOOST ... ")
# Training...........................................
print("Training...........................")

xgb_model = XGBClassifier()
xgb_model.fit(x_train_scaled, y_train_new)

with open(
        '/home/mkolpe2s/rand/Classic_ML/Proper_method/XGB/DCASE2018/XGB_DCASE2018_default_parameter.pkl',
        'wb') as f:
    pickle.dump(xgb_model, f)

print("Train score:", xgb_model.score(x_train_scaled, y_train_new))

#Validation..............................
print("Validation...........................")
print("Validation score:", xgb_model.score(x_validate_scaled, y_validate))

#Testing.................................
print("Testing...........................")
# Reuse the scaler fitted on the training data; refitting a fresh
# StandardScaler on the test set (as the original did) would leak test-set
# statistics into the scaling.
x_test_scaled = scaler.transform(x_test)

print("Test score:", xgb_model.score(x_test_scaled, y_test))

# Classification report...................
print("Classification report XGB default parameter.....................")
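
# The report itself is truncated in the source; a minimal sketch, assuming
# x_test_scaled and y_test as above:
from sklearn.metrics import classification_report

y_pred = xgb_model.predict(x_test_scaled)
print(classification_report(y_test, y_pred))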
Example #5
##data=data.drop(bottom_vars,axis=1)
##train_data=train_data.drop(bottom_vars,axis=1)
##test_data=test_data.drop(bottom_vars,axis=1)
##predict_data=predict_data.drop(bottom_vars,axis=1)

xgb_model = XGBClassifier(learning_rate=0.1,
                          n_estimators=550,
                          max_depth=7,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)

xgb_model.fit(train_data, train_label)

print(xgb_model)

predictions = xgb_model.predict(predict_data)

output = pd.DataFrame({'tripid': predict_index, 'prediction': predictions})  # predict_index is defined earlier (not shown)
output.to_csv('sample_submission_xgboost_tuned_new.csv', index=False)
print("score : " + str(xgb_model.score(train_data, train_label)))
pred = pd.DataFrame(xgb_model.predict(test_data))
print("Accuracy : " + str(metrics.accuracy_score(test_label, pred)))
print("F1 score : " + str(metrics.f1_score(test_label, pred)))
Example #6
from xgboost import XGBClassifier
import time

# In[ ]:

xgb_model = XGBClassifier(n_jobs=4, n_estimators=250, max_depth=8, eta=0.1)

# In[ ]:

# Model fitting
xgb_model.fit(X_train, up_train)

# In[ ]:

xgb_model.score(X_test, up_test)

# In[ ]:

# Fetch Test set
dayss = env.get_prediction_days()

# In[ ]:

# Preprocessing the test set for submission
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in dayss:
    n_days += 1
Example #7
                                     random_state=0)
gb_clf2.fit(x_train, y_train)
predictions = gb_clf2.predict(x_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

gb_clf2.feature_importances_

feat_importances = pd.Series(gb_clf2.feature_importances_, index=x.columns)
feat_importances.nlargest(5).plot(kind='barh')

x_train['date_diff_level'] = pd.to_numeric(x_train['date_diff_level'])
x_test['date_diff_level'] = pd.to_numeric(x_test['date_diff_level'])

xgb_clf3 = XGBClassifier()
xgb_clf3.fit(x_train, y_train)

score = xgb_clf3.score(x_test, y_test)
print(score)

xgb_clf3.feature_importances_

feat_importances = pd.Series(xgb_clf3.feature_importances_, index=x.columns)
feat_importances.nlargest(5).plot(kind='barh')
Example #8
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

# --------------
from xgboost import XGBClassifier

# Code starts here
# rf = RandomForestClassifier()

xgb = XGBClassifier(learning_rate=0.0001)

xgb.fit(X_train, y_train)

accuracy = xgb.score(X_test, y_test)

y_pred = xgb.predict(X_test)

# Store the different evaluation values.

f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Plot the auc-roc curve

score = roc_auc_score(y_test, y_pred)  # y_true must come first
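
# The announced ROC plot is missing from this fragment; a minimal sketch using
# predicted probabilities (assumes roc_curve and matplotlib.pyplot as plt are
# imported, as in the plotting code at the top of this example):
fpr, tpr, _ = roc_curve(y_test, xgb.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, 'b', label='XGB (AUC = %.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()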
Example #9
class XGBoost_classifier:
    def __init__(self):
        self.XGBoost_clf = XGBClassifier(max_depth=5,
                                         learning_rate=0.1,
                                         n_estimators=100,  # was misspelled 'n_estimator'
                                         silent=True,
                                         objective='binary:logistic')
        self.standardScaler = StandardScaler()
        self.train_score = None
        self.isload_ = False

    def train(self, model_folder, train_feature_add):
        """
        Train the XGBoost model.
        :param train_feature_add: path to the training data
        :param model_folder: directory in which to store the model
        :return:
        """
        train_df = pd.read_csv(train_feature_add, index_col=['domain_name'])
        train_df = train_df.fillna('0.0')
        x_train = train_df.drop(['label'], axis=1).values
        y_train = train_df['label'].values
        print("_______XGBoost Training_______")
        self.XGBoost_clf.fit(x_train, y_train)
        mal_scores = np.array(self.XGBoost_clf.predict_proba(x_train))[:, 1]
        mal_scores = sorted(mal_scores)
        np.save(r"{}/XGBoost_train_scores.npy".format(model_folder),
                mal_scores)
        pickle.dump(self.XGBoost_clf,
                    open("{}/XGBoost_model.pkl".format(model_folder), 'wb'))

    def load(self, model_folder):
        """
        Load the model file and the fitted scaler into memory.
        :param model_folder: directory containing the stored model and scaler
        :return:
        """
        self.XGBoost_clf = pickle.load(
            open("{}/XGBoost_model.pkl".format(model_folder), 'rb'))
        self.standardScaler = pickle.load(
            open("{}/standardscalar.pkl".format(model_folder), 'rb'))
        self.train_score = np.load(
            r"{}/XGBoost_train_scores.npy".format(model_folder))
        self.isload_ = True

    def predict(self, model_folder, test_feature_add):
        """
        Evaluate on the test set, computing accuracy and related metrics.
        :param test_feature_add: path to the test data
        :return:
        """
        self.load(model_folder)
        test_df = pd.read_csv(test_feature_add, index_col=['domain_name'])
        test_df = test_df.fillna('0.0')
        x_test = test_df.drop(['label'], axis=1).values
        y_test = test_df['label'].values
        print("_______XGBoost Predicting_______")
        y_predict = self.XGBoost_clf.predict(x_test)
        print("XGBoost accuracy: ", self.XGBoost_clf.score(x_test, y_test))
        print("XGBoost precision: ",
              precision_score(y_test, y_predict, average='macro'))
        print("XGBoost recall: ",
              recall_score(y_test, y_predict, average='macro'))
        print("XGBoost F1: ", f1_score(y_test, y_predict, average='macro'))
        print("XGBoost TPR, FPR, thresholds: ",
              roc_curve(y_test, y_predict, pos_label=1))

        plot_roc_curve(self.XGBoost_clf, x_test, y_test)
        plt.show()

    def predict_singleDN(self, model_folder, dname):
        """
        Classify a single domain name, printing the result and malicious probability.
        :param dname: the domain name
        :return:
        """
        if not self.isload_:
            self.load(model_folder)
        dname = dname.strip('/').strip('.')
        dname = dname.replace("http://", '')
        dname = dname.replace("www.", "")
        dname = wash_tld(dname)
        if dname == "":
            label = 0
            prob = 0.0000
            p_value = 1.0000
            print("\nxgboost sld:", dname)
            # print("label:", label)
            # print("mal_prob:", prob)
            # print("p_value:", p_value)
            print('label:{}, prob:{}, p_value:{}'.format(label, prob, p_value))
            return label, prob, p_value
        else:
            feature = self.standardScaler.transform(
                pd.DataFrame([phishing_get_feature(dname)]))
            label = self.XGBoost_clf.predict(feature)
            prob = self.XGBoost_clf.predict_proba(feature)
            p_value = cal_pValue(self.train_score, prob[0][1], label[0])
            print("\nxgboost sld:", dname)
            # print("label:", label[0])
            # print("mal_prob:", prob[0][1])
            # print("p_value:", p_value)
            print('label:{}, prob:{}, p_value:{}'.format(
                label[0], prob[0][1], p_value))
            return label[0], prob[0][1], p_value
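
# Note: load() above expects '{model_folder}/standardscalar.pkl', but train()
# never fits or saves self.standardScaler. A minimal sketch of the missing
# step (inside train(), before saving the model; x_train as defined there):
#
#     self.standardScaler.fit(x_train)
#     pickle.dump(self.standardScaler,
#                 open("{}/standardscalar.pkl".format(model_folder), 'wb'))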
Example #10
from sklearn.model_selection import train_test_split
from baseline.utils import fetch_data

X, y, ot = fetch_data("../data/features")
print(X.shape, y.shape)

X_train, X_test, y_train, y_test, _, ot_test = train_test_split(X,
                                                                y,
                                                                ot,
                                                                test_size=0.2,
                                                                shuffle=True)
print(X_train.shape, y_train.shape)

model = XGBClassifier(objective="multi:softprob")
model.fit(X_train, y_train)
print("Samples: %d, Accuracy: %.2f%%" %
      (len(X_test), model.score(X_test, y_test) * 100))

pred = model.predict(X_test)
ot_num, ot_acc = [0] * 12, [0] * 12
for i in range(len(pred)):
    ot_num[ot_test[i]] += 1
    if pred[i] == y_test[i]:
        ot_acc[ot_test[i]] += 1
ot_acc = [
    round(ot_acc[i] / ot_num[i], 4) if ot_num[i] else 0
    for i in range(len(ot_num))
]
print("ot-acc distribution")
print(ot_acc)
Example #11
plt.show()


# XGBOOST

from xgboost import XGBClassifier

# let's find the best n_estimators parameter
n_estimators = {}

for n in range(10,160,10):
    
    XGB = XGBClassifier(n_estimators=n)
    XGB.fit(X_train,y_train)

    n_estimators[n] = [XGB.score(X_test,y_test),roc_auc_score(y_test,XGB.predict(X_test))]


n_estimators = pd.DataFrame(n_estimators.items(),columns=["n","Accuracy"])

n_estimators["Roc Score"] = n_estimators["Accuracy"].apply(lambda x: x[1])

n_estimators["Accuracy"] = n_estimators["Accuracy"].apply(lambda x: x[0])

n_estimators.set_index("n",inplace=True)

ax = n_estimators.plot()
ax.set_title("ROC/ACC scores for different n_estimators parameters",fontdict={"fontsize":12,"fontweight":"bold"})

# n_estimators = 30 looks good
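
# The same sweep could be done with GridSearchCV instead of a manual loop;
# a minimal sketch under the same split (the cv/scoring choices are assumptions):
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": list(range(10, 160, 10))}
grid = GridSearchCV(XGBClassifier(), param_grid, scoring="roc_auc", cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)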
Example #12
        c[1] += 1
print(c)
#KNN
knn = KNeighborsClassifier()
knn.fit(Train_data, Train_label)
knn_pred = knn.predict(Test_data)
print('KNN_Acc:', accuracy_score(Test_label, knn_pred))
print('KNN_F1:', f1_score(Test_label, knn_pred, average='micro'))
knn.fit(New_Data, New_Label)
New_knn_pred = knn.predict(Test_data)
print('New_KNN_Acc:', accuracy_score(Test_label, New_knn_pred))
print('New_Knn_F1:', f1_score(Test_label, New_knn_pred, average='micro'))
#XGB
XGB = XGBClassifier()
XGB.fit(Train_data, Train_label)
print('XGB_Acc:', XGB.score(Test_data, Test_label))
print('XGB_F1:', f1_score(Test_label, XGB.predict(Test_data), average='micro'))
XGB.fit(New_Data, New_Label)
print('New_XGB_Acc:', XGB.score(Test_data, Test_label))
#RFC
rfc = RandomForestClassifier()
rfc.fit(Train_data, Train_label)
print('RFC_Acc:', rfc.score(Test_data, Test_label))
print('RFC_F1:', f1_score(Test_label, rfc.predict(Test_data), average='micro'))
rfc.fit(New_Data, New_Label)
print('New_Rfc_Acc:', rfc.score(Test_data, Test_label))
#SVM
svm = SVC(kernel='linear', probability=True)
svm.fit(Train_data, Train_label)
svm_pred = svm.predict(Test_data)
print('SVM_Acc', accuracy_score(Test_label, svm_pred))
Example #13
print(datetime.datetime.now())
print('\n')
print('XGB Classifier')
print('\n')

xgb_cls = XGBClassifier(objective="multi:softprob",
                        num_class=20,
                        random_state=61,
                        colsample_bytree=0.6,
                        learning_rate=0.1,
                        n_estimators=200,
                        max_depth=8,
                        alpha=0.01,
                        gamma=0.001,
                        subsample=0.6)  # 'subsamples' in the original was a typo

xgb_cls.fit(X_train, y_train)
print("Accuracy on training set is : {}".format(xgb_cls.score(
    X_train, y_train)))
print("Accuracy on test set is : {}".format(xgb_cls.score(X_test, y_test)))
y_pred = xgb_cls.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# plot statistics on the results of the model run
# (plot_accuracy_and_loss and its arguments are defined elsewhere, not shown)
plot_accuracy_and_loss(model, training_fit, test_features, test_labels_one_hot,
                       test_labels)

print(datetime.datetime.now())
Example #14
    "max_depth": [4, 5, 6],
    "colsample_bytree": [0.6, 0.9, 1],
    "colsample_bylevel": [0.6, 0.7, 0.9]
}]
n_jobs = -1

# Use cross-validation.
# XGB is very fast, and preprocessing does not need to drop missing values.

# This first model is immediately overwritten by the GridSearchCV below, and
# its hyperparameter variables are not defined in this fragment:
# model = XGBClassifier(max_depth=max_depth,
#                       learning_rate=learning_rate,
#                       n_estimators=n_estimators,
#                       colsample_bylevel=colsample_bylevel,
#                       colsample_bytree=colsample_bytree)

model = GridSearchCV(XGBClassifier(), parameters, cv=5, n_jobs=-1)
model.fit(x_train, y_train)

print("=================================")
print(model.best_estimator_)
print("=================================")
print(model.best_params_)
print("=================================")

score = model.score(x_test, y_test)  # score plays the role of evaluate
print('score :', score)

# print(model.feature_importances_)
# plot_importance(model)
# plt.show()
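
# Note that plot_importance cannot take the GridSearchCV wrapper directly; a
# minimal sketch using the refitted best estimator (assumes plot_importance
# and matplotlib.pyplot as plt are imported):
best = model.best_estimator_
print(best.feature_importances_)
plot_importance(best)
plt.show()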
Example #15
class BoostingDecisionMaker:
    def __init__(self,
                 folder="",
                 n_estimators=50,
                 max_depth=5,
                 monotonous_features=""):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.folder = folder
        self.monotonous_features = utils.transform_string_feature_range_into_list(
            monotonous_features)
        if not folder.strip():
            self.xg_boost = XGBClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          random_state=43)
        else:
            self.load_model(folder)

    def get_feature_ids(self):
        return utils.transform_string_feature_range_into_list(self.feature_ids)\
            if isinstance(self.feature_ids, str) else self.feature_ids

    def add_config_info(self, full_config, features):
        self.full_config = full_config
        self.feature_ids = features

    def load_model(self, folder):
        self.folder = folder
        with open(os.path.join(folder, "boost_model.pickle"), "rb") as f:
            self.n_estimators, self.max_depth, self.xg_boost = pickle.load(f)
        with open(os.path.join(folder, "data_features_config.pickle"),
                  "rb") as f:
            self.full_config, self.feature_ids, self.monotonous_features = pickle.load(
                f)

    def save_model(self, folder):
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(os.path.join(folder, "boost_model.pickle"), "wb") as f:
            pickle.dump([self.n_estimators, self.max_depth, self.xg_boost], f)
        with open(os.path.join(folder, "data_features_config.pickle"),
                  "wb") as f:
            pickle.dump(
                [self.full_config, self.feature_ids, self.monotonous_features],
                f)

    def train_model(self, train_data, labels):
        mon_features = [(1 if feature in self.monotonous_features else 0)
                        for feature in self.get_feature_ids()]
        mon_features_prepared = "(" + ",".join([str(f)
                                                for f in mon_features]) + ")"
        self.xg_boost = XGBClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=43,
            monotone_constraints=mon_features_prepared)
        self.xg_boost.fit(train_data, labels)
        logger.info("Train score: %s", self.xg_boost.score(train_data, labels))
        logger.info("Feature importances: %s",
                    self.xg_boost.feature_importances_)

    def validate_model(self, valid_test_set, valid_test_labels):
        res, res_prob = self.predict(valid_test_set)
        logger.info("Valid dataset accuracy: %s",
                    self.xg_boost.score(valid_test_set, valid_test_labels))
        logger.info(confusion_matrix(valid_test_labels, res))
        logger.info(classification_report(valid_test_labels, res))

    def predict(self, data):
        if not len(data):
            return [], []
        return self.xg_boost.predict(data), self.xg_boost.predict_proba(data)
Example #16
import pandas as pd
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']
X['age'].fillna(X['age'].mean(), inplace=True)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('The accuracy of Random Forest Classifier on testing set:', rfc.score(X_test, y_test))

from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)

print('The accuracy of eXtreme Gradient Boosting Classifier on testing set:', xgbc.score(X_test, y_test))
Example #17
# train on all features
X = data_20[feature_names]
y = data_20[data_20.columns[-1]]

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=66)

# train the model
model = XGBClassifier(learning_rate=0.09,
                      n_estimators=110,
                      max_depth=7,
                      min_child_weight=1,
                      gamma=0.3,
                      reg_alpha=1e-05,
                      subsample=0.61,
                      colsample_bytree=0.7,
                      seed=150)
model.fit(X_train, y_train)

# evaluate
print("Training set accuracy: {:.3f}".format(model.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(model.score(X_test, y_test)))

# # Prediction
# X_data = []
# X_new = pd.DataFrame(X_data, columns = columns)
# # The incoming format is unknown, so columns assume the x and y values
# # arrive in separate rows; revise once the actual input format is known.
# pre = model.predict(X_new)
# print(pre)
# Confusion Matrix (tail of a preceding, truncated model section; its y_pred
# is not defined in this fragment)
print(confusion_matrix(y_test,y_pred))

"""#### XGBoost"""

from xgboost import XGBClassifier

modelXGB = XGBClassifier()

modelXGB.fit(X_train,y_train)

y_pred = modelXGB.predict(X_test)

# Accurracy Score
modelXGB.score(X_test,y_test)

# Classification Report
print(classification_report(y_test,y_pred))

# Confusion Matrix
print(confusion_matrix(y_test,y_pred))

"""### Test Data and Submission File"""

test = pd.read_csv("/content/drive/MyDrive/dataset/test.csv",na_values=['?','-999','Error','xxxxxxxx'])
test_1 = pd.read_csv("/content/drive/MyDrive/dataset/test.csv")

test.head()

test = test.drop(["customer_id","Name","security_no","referral_id","last_visit_time"],axis=1)
Example #19
#               colsample_bynode=1, colsample_bytree=0.8, gamma=0.1,
#               learning_rate=0.05, max_delta_step=0, max_depth=5,
#               min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
#               nthread=-1, objective='binary:logistic', random_state=0,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=10,
#               silent=None, subsample=0.8, verbosity=1)
#XGB= xgb.XGBClassifier( learning_rate= 0.02, max_bin=10,num_leaves=16, subsample=0.7, max_depth=4, subsample_freq=2, colsample_bytree= 0.3, min_child_samples=500,seed=99,n_estimators=300, objective= 'binary:logistic',n_jobs=-1)
XGB=XGBClassifier(gamma=0.1, subsample=1.0, colsample_bytree=1.0, n_estimators=500,max_depth=10, min_child_weight=10, learning_rate=0.01, objective= 'binary:logistic', n_jobs=-1)
XGB.fit(X_train, y_train)

# Make predictions
predictions = XGB.predict(X_val)
probs = XGB.predict_proba(X_val)
display(predictions)

score = XGB.score(X_val, y_val)
print("Accuracy: ", score)
print(classification_report(y_val, predictions))
data['churn'].value_counts()


# confusion_matrix(y_val, predictions)
# display(confusion_matrix)

# draw the ROC curve
fpr, tpr, threshold = roc_curve(y_val, probs[:,1])
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
titanic = pd.read_csv("../data/titanic/titanic.txt")
# use pclass, age, and sex as training features
x = titanic[["pclass", "age", "sex"]]
y = titanic["survived"]
# fill empty age values with the mean
x["age"].fillna(x["age"].mean(), inplace=True)

# split into training and test data
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)
# extract dict features and vectorize them
vec = DictVectorizer()
x_train = vec.fit_transform(x_train.to_dict(orient="records"))
x_test = vec.transform(x_test.to_dict(orient="records"))

# predict with a random forest using default settings
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
print("Random forest accuracy:", rfc.score(x_test, y_test))  # 0.7811550151975684

# predict with an XGBoost model
xgbc = XGBClassifier()
xgbc.fit(x_train, y_train)
print("XGBoost accuracy:", xgbc.score(x_test, y_test))  # 0.7872340425531915



Example #21
data.info()
print(data.groupby("cardio").size())
data.dtypes
print(data.dtypes)

y = data['cardio']
X = data.drop(['cardio'], axis=1)
print("Shape of X: {0}; positive examples: {1}; negative: {2}".format(
    X.shape, y[y == 1].shape[0], y[y == 0].shape[0]))  # inspect shape and class balance
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('cardio', axis=1), data['cardio'], test_size=.2,
    random_state=10)  # split the data

model = XGBClassifier()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("train score: {train_score:.6f}; test score: {test_score:.6f}".format(
    train_score=train_score, test_score=test_score))

# model prediction
y_pred = model.predict(X_test)
print("matches: {0}/{1}".format(
    np.equal(y_pred, y_test).sum(), y_test.shape[0]))
xg_result = accuracy_score(y_test, y_pred)
print("Accuracy:", xg_result)

f1_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the confusion_matrix function
#     start = time.time()
#     model = XGBClassifier(n_jobs=i)
#     model.fit(x_train,y_train)
#     acc = model.score(x_test,y_test)
#     print('with n_jobs =', i)
#     print(time.time()-start, 'seconds')




model = XGBClassifier(n_jobs=-1)

model.fit(x_train,y_train)

#4. Evaluate, predict
acc = model.score(x_test, y_test)

print(model.feature_importances_)
print("acc :", acc)

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
new_data=[]
feature=[]
a = np.percentile(model.feature_importances_, q=25)

for i in range(len(dataset.data[0])):
    if model.feature_importances_[i] > a:
       new_data.append(df.iloc[:,i])
       feature.append(dataset.feature_names[i])

new_data = pd.concat(new_data, axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_all,
                                                    Y_all,
                                                    test_size=0.2,
                                                    random_state=0)

# In[55]:

from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train, Y_train)

# In[56]:

model.score(X_test, Y_test)

# In[57]:

predictions = model.predict(X_test)
predictions

# In[58]:

submission = pd.DataFrame({
    'shot_id_number': X_test['shot_id_number'],
    'is_goal': predictions
})

# In[59]:
Example #24
from xgboost import XGBClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#1. Data
datasets = load_wine()
x = datasets.data
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = True, random_state = 66)

#2. Modeling
model = XGBClassifier(n_estimators = 100, learning_rate = 0.01, n_jobs = -1)

#3. Train
model.fit(x_train, y_train, verbose = 1, eval_metric=['merror', 'mlogloss'], eval_set=[(x_train, y_train), (x_test, y_test)], early_stopping_rounds=5)

#4. Evaluate
result1 = model.score(x_test, y_test)
print("result1 : ", result1)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)

result2 = model.evals_result()
print("result2 : ", result2)

# result1 :  0.9722222222222222
# acc :  0.9722222222222222
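
# result2 holds the per-round curves recorded via eval_set; a minimal sketch
# of plotting the mlogloss curves (assumes matplotlib is available):
import matplotlib.pyplot as plt

rounds = range(len(result2['validation_0']['mlogloss']))
plt.plot(rounds, result2['validation_0']['mlogloss'], label='train')
plt.plot(rounds, result2['validation_1']['mlogloss'], label='test')
plt.xlabel('boosting round')
plt.ylabel('mlogloss')
plt.legend()
plt.show()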
Example #25
# Fitting an XGBoost classifier to the training set
classifier = XGBClassifier(learning_rate=0.5,
                           n_estimators=1000,
                           max_depth=5,
                           min_child_weight=2,
                           gamma=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='binary:logistic',
                           scale_pos_weight=1,
                           seed=27)
start = time()  # assumes 'from time import time'
classifier.fit(X_train, y_train)
train_time = time() - start
start = time()
score = classifier.score(X_test, y_test)
score_time = time() - start
print("score = {:.3f} | time = {:,.3f}s/{:,.3f}s".format(
    score, train_time, score_time))

# Calculating feature importance
feature_name = cv.get_feature_names()  # cv is a fitted CountVectorizer (defined elsewhere)
feature_name = np.array(feature_name)
feature_name = np.insert(feature_name, 0, "avg_star_rating", axis=0)
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(100):
    print("%2d) %-*s %f" %
          (f + 1, 30, feature_name[indices[f]], importances[indices[f]]))
# 1) love                           0.074280
Example #26
# An alternative parameterization, disabled in the original source:
# model = XGBClassifier(n_estimators=200,
#                       max_depth=5,
#                       #objective='multi:softprob',
#                       nthread=4)
model = XGBClassifier()
if not os.path.isfile('model.bin'):
    print('start training')
    model.fit(x_train,
              y_train,
              eval_set=evallist,  # evallist is defined elsewhere (not shown)
              verbose=True,
              early_stopping_rounds=20)

    print('validation result')
    print(model.score(x_valid, y_valid))
    model.save_model('model.bin')

else:
    booster = Booster()  # requires 'from xgboost import Booster'
    booster.load_model('model.bin')
    model._Booster = booster
print('check Nematostella vectensis seq')
nv_seq = "TSPDIMSSSFYIDSLISKAKSVPTSTSEPRHTYESPVPCSCCWTPTQPDPSSLCQLCIPTSASVHPYMHHVRGASIPSGAGLYSRELQKDHILLQQHYAATEEERLHLASYASSRDPDSPSRGGNSRSKRIRTAYTSMQLLELEKEFSQNRYLSRLRRIQIAALLDLSEKQVKIWFQNRRVKWKKDKKAAQHGTTTETSSCPSSPASTGRMDGV"
nv_vec = dataset.one_hot(nv_seq)
predict_data = np.asarray([nv_vec] * 2)
prediction = model.predict_proba(predict_data)
print('prediction: ')
print(prediction)

ipdb.set_trace()
Example #27
# In[ ]:


logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, Y_train)
Y_prediction = logisticRegression.predict(X_test)
logisticRegression.score(X_train, Y_train)


# In[ ]:


xgBoost = XGBClassifier()
xgBoost.fit(X_train, Y_train)
Y_prediction = xgBoost.predict(X_test)
xgBoost.score(X_train, Y_train)


# ----
# <a id='validatingModel'></a>
# ## Validating Model

# I have evaluated 3 models and their scores. You can see from the scores above that the random forest classifier is the best of the 3 for this dataset.

# In[ ]:


acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)  # random_forest is fitted earlier (not shown)
print(acc_random_forest, "%")

Example #28
plt.show()

final_xgb = XGBClassifier(learning_rate=0.05,
                          n_estimators=450,
                          max_depth=1,
                          min_child_weight=4,
                          gamma=0,
                          subsample=0.9,
                          colsample_bytree=0.1,
                          objective='multi:softmax',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=27)

final_xgb.fit(X_train, Y_train)
print(final_xgb.score(X_test, Y_test))
xgb_feat_imps = final_xgb.feature_importances_
importances = list(xgb_feat_imps)
feature_importances = [
    (feature, round(importance, 4))
    for feature, importance in zip(X_train.columns, importances)
]
#[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
feature_importances = sorted(feature_importances,
                             key=lambda x: x[1],
                             reverse=True)
print(len(feature_importances))
[
    print('Variable: {:40} Importance: {}'.format(*pair))
    for pair in feature_importances
]
Example #29
model_xgb_rs.fit(X_train, y_train)

print('The best parameters for XG Boost are : ', model_xgb_rs.best_params_)

# In[99]:

model_xgb_best = XGBClassifier(learning_rate=0.5,
                               max_depth=3,
                               n_estimators=30,
                               booster='gbtree',
                               random_state=21)
model_xgb_best.fit(X_train, y_train)
print(
    'The train score is : ',
    "{0:.2f}%".format(round(model_xgb_best.score(X_train, y_train), 4) * 100))
print('The Test score is : ',
      "{0:.2f}%".format(round(model_xgb_best.score(X_test, y_test), 4) * 100))

train_acc_xgb = "{0:.2f}%".format(
    round(model_xgb_best.score(X_train, y_train), 4) * 100)
test_acc_xgb = "{0:.2f}%".format(
    round(model_xgb_best.score(X_test, y_test), 4) * 100)

# ## Confusion Matrix XGB

# In[100]:

y_test_predicted_xgb = model_xgb_best.predict(X_test)
cf_xgb = confusion_matrix(y_test, y_test_predicted_xgb)
cf_xgb
Example #30
                                                    train_size=0.8,
                                                    random_state=44)

# timing (compare n_jobs = -1, 1, 4, 8)
#--------------------------------------------------------------------------------
start_time = timeit.default_timer()  # record the start time

#2. Model
# model = DecisionTreeClassifier(max_depth = 4)
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

# train
model.fit(x_train, y_train, eval_metric='logloss')

#4. Evaluate, predict
acc = model.score(x_test, y_test)  # equivalent to model.evaluate

print(model.feature_importances_)  # more features is not always better (can lead to overfitting)
print('acc: ', acc)
# feature = x_train, x_test, y_train, y_test(max_depth = 4)

####============= timing ====================================
terminate_time = timeit.default_timer()  # record the end time
print("Took %f seconds." % (terminate_time - start_time))
#============================================================
import matplotlib.pyplot as plt
import numpy as np


def plot_feature_importances_dataset(model):
    n_features = dataset.data.shape[1]  # y
Example #31
                     delimiter=";")
X = dataset[:, 0:-1]
y = dataset[:, -1]

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, 
                                                random_state=42)

kfold = KFold(n_splits=10, shuffle=True, random_state=42)  # modern sklearn.model_selection API
best_model = None
best_score = 0.0
for curr_fold, (train_cv, test_cv) in enumerate(kfold.split(Xtrain)):
    Xtrain_cv, Xtest_cv, ytrain_cv, ytest_cv = \
        Xtrain[train_cv], Xtrain[test_cv], ytrain[train_cv], ytrain[test_cv]
    clf = XGBClassifier()
    clf.fit(Xtrain_cv, ytrain_cv)
    score = clf.score(Xtest_cv, ytest_cv)
    print("Fold {:d}, score: {:.3f}".format(curr_fold, score))
    if score > best_score:
        best_score = score
        best_model = clf

y_ = best_model.predict(Xtest)
print("Accuracy: {:.3f}".format(accuracy_score(ytest, y_)))
print()
print("Confusion Matrix")
print(confusion_matrix(ytest, y_))
print()
print("Classification Report")
print(classification_report(ytest, y_))

with open(os.path.join(DATA_DIR, "best-model.pkl"), "wb") as fmod:
Example #32
np.random.seed(23)

# for i in range(1, len(CHANNEL_RANGE)):
#     print(i)
train_ori_vec = ori_feature_1
train_steg_vec = stego_feature_1
train_sample = np.concatenate((train_ori_vec, train_steg_vec), axis=0)
train_label = np.concatenate(
    (0 * np.ones(len(train_ori_vec)), 1 * np.ones(len(train_steg_vec))),
    axis=0)
# XGB
xgb = XGBClassifier(n_estimators=1000,
                    learning_rate=0.1,
                    min_child_weight=5,
                    max_depth=4,
                    gamma=0.1,
                    subsample=0.7,
                    colsample_bytree=0.7)

idx = np.random.permutation(len(train_sample))
train_sample = train_sample[idx]
train_label = train_label[idx]

xgb.fit(train_sample, train_label)
print("TRAIN SCORE:", xgb.score(train_sample, train_label))

# f = open("ensemble_clf.pkl", "wb")  # for 3 pixelHOP with 1000 training samples
f = open("ensemble_clf_singPH1_holistic.pkl", "wb")
pickle.dump(xgb, f)
f.close()
Example #33
                                                                             n_iter=1,
                                                                             train_size=0.75,
                                                                             test_size=0.25,
                                                                             random_state=dataset_repeat)))
    
        training_features = input_data.loc[training_indices].drop('class', axis=1).values
        training_classes = input_data.loc[training_indices, 'class'].values
    
        testing_features = input_data.loc[testing_indices].drop('class', axis=1).values
        testing_classes = input_data.loc[testing_indices, 'class'].values
    
        # Create and fit the model on the training data
        try:
            clf = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
            clf.fit(training_features, training_classes)
            testing_score = clf.score(testing_features, testing_classes)
        except Exception:
            continue
    
        param_string = ''
        param_string += 'learning_rate={},'.format(learning_rate)
        param_string += 'n_estimators={},'.format(n_estimators)
        param_string += 'max_depth={}'.format(max_depth)
    
        out_text = '\t'.join([dataset.split('/')[-1][:-7],
                              'XGBClassifier',
                              param_string,
                              str(testing_score)])
    
        print(out_text)
Example #34
x = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

x['age'].fillna(x['age'].mean(), inplace=True)
from sklearn.model_selection import train_test_split  # updated from the removed sklearn.cross_validation

X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(
    X_train.to_dict(orient='records'))  # convert the DataFrame rows to dicts
X_test = vec.transform(X_test.to_dict(orient='records'))

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('The accuracy of random forest classifier on testing set:',
      rfc.score(X_test, y_test))

from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print('The accuracy of extreme gradient boosting classifier on testing set:',
      xgbc.score(X_test, y_test))