Example no. 1
    def train_all(self, data):
        """
        Train XGBoost models for all clusters.

        Args:
            data: a DataFrame used for training; it must contain a 'cluster' column.

        Returns:
            None. Trained boosters are appended to self._xgb_boosters.
        """
        if 'cluster' not in data.columns:
            raise AttributeError("data has no column 'cluster'.")
        for cluster in range(self._n_clusters):
            model_path = 'models/xgb_cluster_{}.model'.format(cluster)
            if os.path.exists(model_path):
                logger.info('Loading xgb_cluster_{}.model'.format(cluster))
                # load_model is a Booster method, not a module-level function.
                xgb_reg = xgb.Booster()
                xgb_reg.load_model(model_path)
                self._xgb_boosters.append(xgb_reg)
            else:
                logger.info('Model does not exist, training model...')
                _train = data[data.cluster == cluster]
                logger.info('There are {} training samples.'.format(len(_train)))
                xgb_reg = self.train(data=_train)
                xgb_reg.save_model(model_path)
                self._xgb_boosters.append(xgb_reg)
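A note on loading: xgboost exposes load_model only as a method of Booster (and of the sklearn wrappers), never as a module-level function, which is why this example loads through a Booster instance. A minimal usage sketch, assuming a hypothetical ClusterTrainer class that defines train_all as above:

import pandas as pd

# ClusterTrainer is hypothetical; it stands in for whatever class defines train_all.
trainer = ClusterTrainer(n_clusters=2)
df = pd.DataFrame({'feature': [0.1, 0.2, 0.9, 1.1],
                   'target': [0, 0, 1, 1],
                   'cluster': [0, 0, 1, 1]})
trainer.train_all(df)  # trains (or loads from disk) one booster per cluster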
Example no. 2
from typing import List

import numpy as np
import pandas as pd
import xgboost as xgb


def predict_xgboost(X: pd.DataFrame, config: Config) -> List:
    # Config is a project-specific mapping holding the fold count and model paths.
    preds = np.zeros((config["n_split_xgb"], X.shape[0]))

    for i, mdl_fname in enumerate(config["xgb_models"]):
        mdl = xgb.Booster({'nthread': 4})
        mdl.load_model(mdl_fname)  # load_model mutates the Booster and returns None

        preds[i, :] = mdl.predict(xgb.DMatrix(X),
                                  ntree_limit=mdl.best_ntree_limit)
    # Average the predictions across the per-fold models.
    return list(np.mean(preds, axis=0))
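A minimal sketch of a call site, assuming config behaves like a dict and the model file names (hypothetical here) point at boosters saved during cross-validation:

config = {"n_split_xgb": 3,
          "xgb_models": ["xgb_fold0.model", "xgb_fold1.model", "xgb_fold2.model"]}
mean_preds = predict_xgboost(X_test, config)  # one fold-averaged prediction per row of X_test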
Example no. 3
def predict(model_file):
    booster = xgb.Booster()
    booster.load_model(model_file)
    return booster
Example no. 4
import pickle

from sklearn.metrics import accuracy_score

# booster, test_data, test_labels and doc2vec_model_name are assumed to be defined earlier.
dtest = xgb.DMatrix(test_data)
y_pred = booster.predict(dtest)
y_pred = y_pred > 0.5
y_pred = y_pred.astype(int)
accuracy = accuracy_score(test_labels, y_pred)  # accuracy_score expects (y_true, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))  # prediction accuracy

# Persist the booster with pickle and reload it.
pickle.dump(booster, open("pima.pickle.dat", "wb"))
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
y_pred = loaded_model.predict(dtest)

# dump_model writes a human-readable text dump that cannot be loaded back;
# use save_model / Booster.load_model to persist and restore a model.
booster.save_model(doc2vec_model_name + '.xgboost')
bst = xgb.Booster()
bst.load_model(doc2vec_model_name + '.xgboost')

#scale_pos_weight = len(software_arrays) / len(malware_arrays)  # ratio of 0/1 labels
# xgboost parameter notes:
'''
params = {
    # general parameters
    'booster': 'gbtree',  # gbtree: tree-based models; gblinear: linear models; dart is also available
    'n_jobs': 'default',  # number of threads; the default uses the maximum available
    'silent': 0,  # print run-time messages; set to 1 to suppress them

    # booster parameters
    'eta': 0.02,  # learning rate; 0.01-0.2 is typical; lowering it helps prevent overfitting
    'min_child_weight': 1,  # default is 1
    'gamma': 0,  # minimum loss reduction (information gain) required to split; default 0
    'max_depth': 6,  # maximum tree depth; default is 6
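Recent xgboost releases can also serialize to a JSON model file, which is more portable across library versions than pickling the Booster; a minimal sketch with a hypothetical file name:

booster.save_model("pima_model.json")  # JSON format is inferred from the .json extension
restored = xgb.Booster()
restored.load_model("pima_model.json")
assert (restored.predict(dtest) == booster.predict(dtest)).all()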
Example no. 5
# This program is for testing a trained BTD
# By Zach Shelton
# 9/9/2021
# Running this will test on a f
import awkward as ak
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data is stored in pandas -> Each
from sklearn.model_selection import train_test_split
import xgboost as xgb
from numpy.random import choice
import argparse

parser = argparse.ArgumentParser(
    description=
    'run boosted decision tree on data; note this file grabs only the data, not validation; this is an experimental set'
)
parser.add_argument('file', metavar='f', type=str)
parser.add_argument('BTD', metavar='d', type=str)
parser.add_argument('result', metavar='r', type=str)
args = parser.parse_args()

# Load the trained booster (xgboost has no module-level load_model).
xg_reg = xgb.Booster()
xg_reg.load_model(args.BTD)

rawdata = pd.read_csv(args.file)  # pandas is imported as pd above
etruth = rawdata[["event", "truth"]]
cleandata = rawdata.drop(["event", "truth"], axis=1)
Dexp = xgb.DMatrix(data=cleandata)
predictions = xg_reg.predict(Dexp)
preddf = pd.Series(predictions)
preddf.to_csv("ExperimentalPred/%s.csv" % args.result)
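A hypothetical invocation, assuming the script is saved as test_btd.py and that the CSV, model, and result names are placeholders: `python test_btd.py experimental_data.csv btd_model.json run1`, which writes the per-event predictions to ExperimentalPred/run1.csv (the output directory must already exist, or to_csv raises FileNotFoundError).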
Example no. 6
def dumpBin(model_path, feature_map='', out_put='', ftype=''):
    # Load the saved model into a Booster, then write a dump with split statistics.
    bst = xgb.Booster()
    bst.load_model(model_path)
    bst.dump_model(out_put,
                   fmap=feature_map,
                   with_stats=True,
                   dump_format=ftype)
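A hypothetical call, assuming a saved model file exists and choosing the JSON dump format (Booster.dump_model accepts 'text' or 'json' for dump_format):

dumpBin('models/xgb_cluster_0.model', out_put='model_dump.json', ftype='json')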
Example no. 7
import pandas as pd
import xgboost as xgb

from data import load_data

_, test_data = load_data.load_data_with_header()

user_id = test_data.pop('user_id')  # DataFrame.pop takes a single column label
label_vocabulary = {
    0: '99999825',
    1: '90063345',
    2: '90109916',
    3: '89950166',
    4: '89950168',
    5: '99104722',
    6: '89950167',
    7: '89016252',
    8: '90155946',
    9: '99999828',
    10: '99999826',
    11: '99999827',
    12: '89016259',
    13: '99999830',
    14: '89016253'
}

# Predict
model = xgb.Booster()
model.load_model('./xgb.model')
xgb_test = xgb.DMatrix(test_data)
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

result = pd.DataFrame(preds, columns=['predict'])
result['predict'] = result['predict'].astype(int).map(label_vocabulary)  # class indices come back as floats; cast before mapping
xgb_submission = pd.concat([user_id, result], axis=1)
xgb_submission.to_csv('./xgb_submission.csv', index=False)
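The integer-to-string mapping assumes the model was trained with a multiclass objective such as 'multi:softmax', where predict returns one class index per row. A minimal sketch of that training side (all parameter values and data names here are illustrative, not from this example):

params = {'objective': 'multi:softmax', 'num_class': 15, 'eta': 0.1}
dtrain = xgb.DMatrix(train_features, label=train_labels)  # hypothetical training data
bst = xgb.train(params, dtrain, num_boost_round=100)
bst.save_model('./xgb.model')  # reloaded above via Booster.load_model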