Example #1
 def load(self):
     model_file = os.path.join(seldon_core.Storage.download(self.model_uri),
                               BOOSTER_FILE)
     self._booster = xgb.Booster(model_file=model_file)
     self.ready = True
Example #2
def xgbmodel_testing_patch():
    bst_filename = './Patch.model'
    bst = xgb.Booster({'nthread': 40})
    bst.load_model(bst_filename)
    bst_assemble = []
    bst_assemble.append(bst)
Example #3
    def train(self, dtrain, num_rounds=100, skip_rounds=10, evals=[], silent=False, plot=False):
        """
        HitBoost model training or watching learning curve on evaluation set.

        Parameters
        ----------
        dtrain: xgboost.DMatrix
            Training data for survival analysis. It is suggested that you use the tools in the
            `datasets` module to convert a pd.DataFrame into an xgboost.DMatrix.
        num_rounds: int
            The number of iterations.
        skip_rounds: int
            Interval (in rounds) between printed progress messages.
        evals: list of pairs (xgb.DMatrix, string)
            Evaluation sets used to watch the learning curve. If left as the default empty
            list, the training data is used as the evaluation set.
        silent: boolean
            Whether to suppress progress output.
        plot: boolean
            Whether to plot the learning curve.

        Returns
        -------
        dict:
            Evaluation result during training, which is formatted as `{'td-CI': [], 'Loss': []}`.
        """
        # First to check the args
        _check_params(self.model_params)

        if not isinstance(dtrain, xgb.DMatrix):
            raise TypeError("The type of dtrain must be 'xgb.DMatrix'")

        if len(evals) == 0:
            eval_labels = ['train']
            eval_datas = [dtrain]
        else:
            if not isinstance(evals[0], tuple):
                raise TypeError("The elements of evals must be (xgb.DMatrix, string) tuples")
            eval_labels = [c[1] for c in evals]
            eval_datas = [c[0] for c in evals]
        
        # Logging for result
        eval_result = {'td-CI': [], 'Loss': []}
        self._model = xgb.Booster(self.model_params, [dtrain])
        for _ in range(num_rounds):
            # Note: since the default value of `output_margin` is `False`,
            # the prediction is returned after the softmax transformation.
            pred = self._model.predict(dtrain)
            # Note: the gradients passed to `model.boost()` must be the
            # gradients of the objective function with respect to the raw
            # (margin) output of the boosting trees, even if `output_margin`
            # is set to `True` at prediction time.
            g, h = _hit_grads(pred, dtrain)
            self._model.boost(dtrain, g, h)

            # Append to eval_result
            # returns a list of values
            res_loss, res_ci = _hit_eval(self._model, eval_datas)
            eval_result['Loss'].append(res_loss)
            eval_result['td-CI'].append(res_ci)
            if not silent and (_ + 1) % skip_rounds == 0:
                _print_eval(_ + 1, res_loss, res_ci, eval_labels)

        # plot learning curve
        if plot:
            plot_train_curve(eval_result['Loss'], eval_labels, "Loss function")
            plot_train_curve(eval_result['td-CI'], eval_labels, "Time-Dependent C-index")

        return eval_result
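
A minimal usage sketch for the method above (hedged: the `HitBoost` class name and `model_params` dict are assumed to come from the surrounding project; the data here is synthetic):

import numpy as np
import xgboost as xgb

X = np.random.rand(100, 5)
y = np.random.randint(0, 3, size=100)
dtrain = xgb.DMatrix(X, label=y)
dval = xgb.DMatrix(X[:20], label=y[:20])

model = HitBoost(model_params)  # HitBoost and model_params assumed from this project
result = model.train(dtrain, num_rounds=50, skip_rounds=10,
                     evals=[(dtrain, 'train'), (dval, 'val')])
print(result['td-CI'][-1], result['Loss'][-1])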
Example #4
    def test_load_file_invalid(self):
        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file='incorrect_path')

        with pytest.raises(xgb.core.XGBoostError):
            xgb.Booster(model_file=u'不正なパス')
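
For contrast with the invalid-path cases above, a minimal save/load round trip using only the public xgboost API (the file name is illustrative):

import numpy as np
import xgboost as xgb

X = np.random.rand(50, 4)
y = np.random.randint(0, 2, size=50)
dtrain = xgb.DMatrix(X, label=y)

bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)
bst.save_model('tiny.model')

loaded = xgb.Booster(model_file='tiny.model')
np.testing.assert_allclose(bst.predict(dtrain), loaded.predict(dtrain))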
Example #5
File: train.py  Project: zlb1028/sqlflow
def local_train(original_sql,
                model_image,
                estimator_string,
                datasource,
                select,
                validation_select,
                model_params,
                train_params,
                feature_metas,
                feature_column_names,
                feature_column_map,
                label_column,
                transform_fn,
                save,
                load="",
                is_pai=False,
                pai_train_table="",
                pai_validate_table="",
                oss_model_dir=""):
    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    if batch_size is not None and batch_size < 0:
        batch_size = None

    epoch = train_params.pop("epoch", 1)
    num_workers = train_params.pop("num_workers", 1)
    label_meta_dict = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    file_name = "my_model"
    bst = None
    if load:
        with temp_file.TemporaryDirectory(as_cwd=True):
            Model.load_from_db(datasource, load)
            bst = xgb.Booster()
            bst.load_model(file_name)

    def build_dataset(fn, slct, pai_table):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta_dict,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn,
                           is_pai=is_pai,
                           pai_table=pai_table,
                           feature_column_code=feature_column_map)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select, pai_train_table)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select,
                                        pai_validate_table)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

    meta = collect_metadata(original_sql=original_sql,
                            select=select,
                            validation_select=validation_select,
                            model_repo_image=model_image,
                            class_name=estimator_string,
                            attributes=model_params,
                            features=feature_column_map,
                            label=label_column,
                            evaluation=eval_result,
                            num_workers=num_workers)

    save_model_to_local_file(bst, model_params, file_name)
    model = Model(EstimatorType.XGBOOST, meta)
    model.save_to_db(datasource, save)
    return eval_result
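
The loop above keeps passing the previous Booster back through `xgb_model`, so boosting continues across batches. A condensed, self-contained sketch of that pattern (synthetic data and parameters):

import numpy as np
import xgboost as xgb

params = {'objective': 'reg:squarederror', 'max_depth': 3}
bst = None
for _ in range(3):  # three incoming "batches"
    X = np.random.rand(200, 6)
    y = np.random.rand(200)
    dbatch = xgb.DMatrix(X, label=y)
    eval_result = {}
    bst = xgb.train(params, dbatch,
                    num_boost_round=10,
                    evals=[(dbatch, 'train')],
                    evals_result=eval_result,
                    xgb_model=bst)  # continue from the previous rounds
    print(eval_result['train']['rmse'][-1])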
Example #6
def train(datasource,
          select,
          model_params,
          train_params,
          feature_metas,
          feature_column_names,
          label_meta,
          validation_select,
          disk_cache=False,
          batch_size=None,
          epoch=1,
          load_pretrained_model=False,
          is_pai=False,
          pai_train_table="",
          pai_validate_table="",
          rank=0,
          nworkers=1,
          oss_model_dir="",
          transform_fn=None,
          feature_column_code="",
          model_repo_image="",
          original_sql=""):
    if batch_size == -1:
        batch_size = None
    print("Start training XGBoost model...")
    dtrain = xgb_dataset(datasource,
                         'train.txt',
                         select,
                         feature_metas,
                         feature_column_names,
                         label_meta,
                         is_pai,
                         pai_train_table,
                         cache=disk_cache,
                         batch_size=batch_size,
                         epoch=epoch,
                         rank=rank,
                         nworkers=nworkers,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    if len(validation_select.strip()) > 0:
        dvalidate = list(
            xgb_dataset(datasource,
                        'validate.txt',
                        validation_select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_validate_table,
                        rank=rank,
                        nworkers=nworkers,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code))[0]

    filename = "my_model"
    if load_pretrained_model:
        bst = xgb.Booster()
        bst.load_model(filename)
    else:
        bst = None

    re = None
    for per_batch_dmatrix in dtrain:
        watchlist = [(per_batch_dmatrix, "train")]
        if len(validation_select.strip()) > 0:
            watchlist.append((dvalidate, "validate"))

        re = dict()
        bst = xgb.train(model_params,
                        per_batch_dmatrix,
                        evals=watchlist,
                        evals_result=re,
                        xgb_model=bst,
                        **train_params)
        print("Evaluation result: %s" % re)

    if rank == 0:
        # TODO(sneaxiy): collect features and label
        metadata = collect_metadata(original_sql=original_sql,
                                    select=select,
                                    validation_select=validation_select,
                                    model_repo_image=model_repo_image,
                                    class_name=model_params.get("booster"),
                                    attributes=model_params,
                                    features=None,
                                    label=None,
                                    evaluation=re)
        save_model_to_local_file(bst, model_params, filename)
        save_metadata("model_meta.json", metadata)
        if is_pai and len(oss_model_dir) > 0:
            save_model(oss_model_dir, filename, model_params, train_params,
                       feature_metas, feature_column_names, label_meta,
                       feature_column_code)
Example #7

from sklearn.metrics import roc_auc_score

# model = xgb.Booster(model_file='xgb_xk.model')
# x_test = xgb.DMatrix(x_test)
offline_result = model.predict(x_test)
print('auc:', roc_auc_score(y_test, offline_result))

# The following commands compress and decompress tar.bz2 archives; they work remarkably well.
# !tar -jcvf <archive name to create> <files to compress>  # compress
# !tar -jxvf <archive to extract>  # decompress



model = xgb.Booster(model_file='xgb_xk.model')
# start predicting
feature_pre = pd.read_csv('xk_feature_1_30.csv')
# feature_pre.pop('activity_days_nums')
feature_pre['action_rate1'] = (feature_pre['actions_pre_1_numbers']/
    feature_pre['actions_numbers']).map(lambda x:round(x, 2))
feature_pre['action_rate3'] = (feature_pre['actions_pre_3_numbers']/
    feature_pre['actions_numbers']).map(lambda x:round(x, 2))
feature_pre['action_rate5'] = (feature_pre['actions_pre_5_numbers']/
    feature_pre['actions_numbers']).map(lambda x:round(x, 2))
feature_pre['action_rate7'] = (feature_pre['actions_pre_7_numbers']/
    feature_pre['actions_numbers']).map(lambda x:round(x, 2))

feature_pre['action_rate13'] = (feature_pre['actions_pre_1_numbers']/
    feature_pre['actions_pre_3_numbers']).map(lambda x:round(x, 2))
feature_pre['action_rate15'] = (feature_pre['actions_pre_1_numbers']/
Example #8
# read train data and test data
# pd.DataFrame.from_csv was removed from pandas; read_csv with index_col is the replacement
train = pd.read_csv(train_value_path, index_col=0, parse_dates=True)
test = pd.read_csv(test_path, index_col=0, parse_dates=True)
train, test = date_parser(train, test)

# read train labels
train_labels = pd.read_csv(train_label_path, index_col=0)
label_encoder = LabelEncoder()
train_labels.iloc[:, 0] = label_encoder.fit_transform(train_labels.values.flatten())

if load_best_model:
	# build final model
	xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten())
	xg_test = xgboost.DMatrix(test)

	xgclassifier = xgboost.Booster(params)
	xgclassifier.load_model(best_model_path)
else:
	# find best boost round
	all_best_rounds = []
	kf = StratifiedKFold(train_labels.values.flatten(), n_folds=4, shuffle=True, random_state=0)
	for cv_train_index, cv_test_index in kf:
		xg_train = xgboost.DMatrix(train.values[cv_train_index, :], label=train_labels.iloc[cv_train_index].values.flatten())
		xg_test = xgboost.DMatrix(train.values[cv_test_index, :], label=train_labels.iloc[cv_test_index].values.flatten())

		xgclassifier = 	xgboost.train(
Example #9
#
#
# Author : fcbruce <*****@*****.**>
#
# Time : Thu 01 Dec 2016 17:01:56
#
#

import xgboost as xgb
import numpy as np
import math

rows_file = '../data/1201.npy'
model_file = '../models/RiskModel20161124.model'

def sigmoid(x):
    return 1. / (1 + np.exp(-x))

rows = np.load(rows_file)

mat = xgb.DMatrix(rows)

bst = xgb.Booster(model_file=model_file)
print(sigmoid(bst.predict(mat)))
    ('open_to_new_job_I am actively looking for a new job',
     'look_postings_frequent'),
    ('interview_likelihood', 'interview'),
]

FEATS_2017 = [
    ('CareerSatisfaction', 'like_developer'),
    ('HoursPerWeek', 'hours_per_week'),
    ('Overpaid', 'overpaid'),
    ('LastNewJob_Less than a year ago', 'curr_job_less_than_year'),
    ('InfluenceWorkstation', 'choose_equip'),
    ('Salary', 'salary'),
]

CURR_FOLDER = os.path.dirname(os.path.realpath(__file__))
XGB_2015 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2015.model'))
XGB_2016 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2016.model'))
XGB_2017 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2017.model'))

MODELS = (
    [XGB_2015, FEATS_2015],
    [XGB_2016, FEATS_2016],
    [XGB_2017, FEATS_2017],
)

NUM_MODELS = len(MODELS)
REQUIRED_KEYS = set(feat[1] for model in MODELS for feat in model[1])

TRAIN_OPTIONS = {
Example #11
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
###################################################
import time 
start_time = time.time()
test_data = pd.read_csv("test_data.csv")
test_y = test_data.y
test_X = test_data.drop(['y'], axis=1)
xgb_test = xgb.DMatrix(test_X, label=test_y)
model = xgb.Booster(model_file='./model/xgb.model')
y_hat = model.predict(xgb_test)
print(y_hat)
Example #12
def xgboost_train(
    training_data_path: InputPath('CSV'),  # Also supports LibSVM
    model_path: OutputPath('XGBoostModel'),
    model_config_path: OutputPath('XGBoostModelConfig'),
    starting_model_path: InputPath('XGBoostModel') = None,
    label_column: int = 0,
    num_iterations: int = 10,
    booster_params: dict = None,

    # Booster parameters
    objective: str = 'reg:squarederror',
    booster: str = 'gbtree',
    learning_rate: float = 0.3,
    min_split_loss: float = 0,
    max_depth: int = 6,
):
    '''Train an XGBoost model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary XGBoost format.
        model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.
        num_iterations: Number of boosting iterations.
        booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
        objective: The learning task and the corresponding learning objective.
            See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
            The most common values are:
            "reg:squarederror" - Regression with squared loss (default).
            "reg:logistic" - Logistic regression.
            "binary:logistic" - Logistic regression for binary classification, output probability.
            "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation
            "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
            "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas
    import xgboost

    df = pandas.read_csv(training_data_path)

    training_data = xgboost.DMatrix(
        data=df.drop(columns=[df.columns[label_column]]),
        label=df[df.columns[label_column]],
    )

    booster_params = booster_params or {}
    booster_params.setdefault('objective', objective)
    booster_params.setdefault('booster', booster)
    booster_params.setdefault('learning_rate', learning_rate)
    booster_params.setdefault('min_split_loss', min_split_loss)
    booster_params.setdefault('max_depth', max_depth)

    starting_model = None
    if starting_model_path:
        starting_model = xgboost.Booster(model_file=starting_model_path)

    model = xgboost.train(params=booster_params,
                          dtrain=training_data,
                          num_boost_round=num_iterations,
                          xgb_model=starting_model)

    # Saving the model in binary format
    model.save_model(model_path)

    model_config_str = model.save_config()
    with open(model_config_path, 'w') as model_config_file:
        model_config_file.write(model_config_str)
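
A small illustration of how the individual keyword arguments above interact with `booster_params`: `setdefault` only fills missing keys, so anything passed explicitly in `booster_params` takes precedence over the keyword defaults.

booster_params = {'max_depth': 10}
defaults = {'objective': 'reg:squarederror', 'learning_rate': 0.3, 'max_depth': 6}
for key, value in defaults.items():
    booster_params.setdefault(key, value)   # fills only keys that are absent
print(booster_params)
# {'max_depth': 10, 'objective': 'reg:squarederror', 'learning_rate': 0.3}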
def predictmodel(file,filefullpath,modelfile,scalefile,stationlist,demdict,origintime,foretime):
    os.environ["CUDA_VISIBLE_DEVICES"] = '0,1'
    allvaluelist=[]
    if file[-3:]=='001' and file[:3]=='D1D':
        print(file)
        grbs=pygrib.open(filefullpath)
        grb_2t = grbs.select(name='2 metre temperature')
        tempArray = grb_2t[0].values
        grb_2d = grbs.select(name='2 metre dewpoint temperature')
        dewpointArray = grb_2d[0].values
        grb_10u = grbs.select(name='10 metre U wind component')
        u10Array = grb_10u[0].values
        grb_10v = grbs.select(name='10 metre V wind component')
        v10Array = grb_10v[0].values
        grb_tcc = grbs.select(name='Total cloud cover')
        tccArray = grb_tcc[0].values
        grb_lcc = grbs.select(name='Low cloud cover')
        lccArray = grb_lcc[0].values
        grb_z = grbs.select(name='Geopotential')
        geoArray=grb_z[0].values
        grb_500rh = grbs.select(name='Relative humidity', level=500)
        rh500Array = grb_500rh[0].values
        grb_850rh = grbs.select(name='Relative humidity', level=850)
        rh850Array = grb_850rh[0].values
        # iterate over stations -> then over the grid variables
        for i in range(len(stationlist)):
            #print len(stationlist)
            perlist=stationlist[i]
            stationid=perlist[0]
            latitude=float(perlist[1])
            longitude=float(perlist[2])
            alti=float(perlist[3])
            # index of the grid point at the station's upper-left corner
            indexlat = int((90 - latitude) / 0.1)
            indexlon = int((longitude + 180) / 0.1)
            per_station_value_list=[]
            calculate16gribvalue(tempArray,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(dewpointArray,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(u10Array,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(v10Array,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(tccArray,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(lccArray,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(geoArray,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(rh500Array,indexlat,indexlon,per_station_value_list)
            calculate16gribvalue(rh850Array,indexlat,indexlon,per_station_value_list)
            per_station_value_list.append(latitude)
            per_station_value_list.append(longitude)
            per_station_value_list.append(alti)
            # station elevation: use the pre-computed elevations of the 16 surrounding grid points
            demlist = demdict[stationlist[i][0]]
            for u in range(1, len(demlist), 1):
                per_station_value_list.append(float(demlist[u]))
            allvaluelist.append(per_station_value_list)
            #print(per_station_value_list)
    trainarray=numpy.array(allvaluelist)
    params001 = {
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'objective': 'reg:linear',  # linear regression
        'gamma': 0.2,  # post-pruning control; larger is more conservative, typically 0.1-0.2
        'max_depth': 12,  # tree depth; deeper trees overfit more easily
        'lambda': 2,  # L2 regularization on leaf weights; larger values make overfitting less likely
        'subsample': 0.7,  # row subsampling ratio for each boosting round
        'colsample_bytree': 0.7,  # column subsampling ratio per tree
        'min_child_weight': 3,
        # Defaults to 1: the minimum sum of instance weights (hessian) required in a leaf.
        # For imbalanced 0-1 classification with h around 0.01, min_child_weight=1 means a
        # leaf needs roughly 100 samples. Smaller values make overfitting more likely.
        'silent': 0,  # 1 suppresses run-time messages; 0 is recommended
        'eta': 0.01,  # behaves like a learning rate
        'seed': 1000,
        # 'nthread': 3,  # number of CPU threads; defaults to all cores if unset
        # 'eval_metric': 'auc'
        'scale_pos_weight': 1,
        'n_gpus': 2
    }
    xgbst=xgboost.Booster(params001)
    xgbst.load_model(modelfile)
    scaler=joblib.load(scalefile)
    #print(modelfile,scalefile)
    trainarray_t=scaler.transform(trainarray)
    # the scaled matrix has tripped me up twice: make sure to use the scaled data, not the raw data
    xgbtrain=xgboost.DMatrix(trainarray_t)
    result=xgbst.predict(xgbtrain)
    #print(result)
    logger.info(result)
    # write the results to the database
    db = MySQLdb.connect('172.16.8.28', 'admin', 'moji_China_123', 'moge',3307)
    #db = MySQLdb.connect('192.168.10.84', 'admin', 'moji_China_123','moge')
    cursor = db.cursor()
    origin = datetime.datetime.strftime(origintime, '%Y-%m-%d %H:%M:%S')
    forecast = datetime.datetime.strftime(foretime, '%Y-%m-%d %H:%M:%S')
    forecast_year = foretime.year
    forecast_month = foretime.month
    forecast_day = foretime.day
    forecast_hour = foretime.hour
    forecast_minute = foretime.minute
    timestr = datetime.datetime.strftime(origintime, '%Y%m%d%H%M%S')
    # csv = os.path.join(outpath, origin+'_'+forecast + '.csv')
    # csvfile = open(csv, 'w')
    sql = 'replace into t_r_ec_city_forecast_ele_mos_dem_winter3(city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'
    L = []
    for j in range(len(stationlist)):
        perstationlist = []
        stationid = stationlist[j][0]
        temp = result[j]
        # build one record per station
        perstationlist.append(stationid)
        perstationlist.append(origin)
        perstationlist.append(forecast)
        perstationlist.append(forecast_year)
        perstationlist.append(forecast_month)
        perstationlist.append(forecast_day)
        perstationlist.append(forecast_hour)
        perstationlist.append(temp)
        L.append(perstationlist)
        logger.info(perstationlist)
        #             # sql='insert into t_r_ec_mos_city_forecast_ele(city_id,initial_time,forecast_time,forecsat_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES ()'
        #             # sql = 'insert into t_r_ec_city_forecast_ele_mos (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature,temp_max_6h,temp_min_6h,rainstate,precipitation)VALUES ("' + stationid + '","' + origin + '","' + str(
        #             #     forecast) + '","' + str(forecast_year) + '","' + str(
        #             #     forecast_month) + '","' + str(forecast_day) + '","' + str(
        #             #     forecast_hour) + '","' + str(temp) + '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)+ '")
        #             # csvfile.write(stationid + '","' + origin + '","' + str(
        #             #     forecast) + '","' + str(forecast_year) + '","' + str(
        #             #     forecast_month) + '","' + str(forecast_day) + '","' + str(
        #             #     forecast_hour) + '","' + str(forecast_minute) + '","' + str(
        #             #     temp)+ '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue))
        #             # csvfile.write('\n')
        #             # print sql
        #             # cursor.execute(sql)
    cursor.executemany(sql, L)
    db.commit()
    db.close()
Example #14
def xgb_shap_values(x):
    bst = xgb.Booster()
    bst.load_model("my_model")
    explainer = shap.TreeExplainer(bst)
    return explainer.shap_values(x)
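
A hedged usage sketch for the helper above; it assumes a booster has already been saved as "my_model", that the `shap` package is installed, and that the feature matrix matches the columns the model was trained on:

import numpy as np

X = np.random.rand(10, 8)          # 10 rows, 8 features (must match the saved model)
shap_values = xgb_shap_values(X)   # per-feature contributions, one row per sample
print(np.array(shap_values).shape)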
import gc
import numpy as np
import feather
import xgboost as xgb

print('+ Loading trained models...')
model_fold0 = xgb.Booster({'nthread': 4})
model_fold0.load_model('tmp/xgb_model_0.model')
model_fold1 = xgb.Booster({'nthread': 4})
model_fold1.load_model('tmp/xgb_model_1.model')

print('+ Loading test data...')
df_test = feather.read_dataframe('tmp/mtv_df_test.feather')
features = sorted(set(df_test.columns) - {'display_id', 'clicked'})

X_test = df_test[features].values
del df_test
gc.collect()

dtest = xgb.DMatrix(X_test, feature_names=features)
del X_test
gc.collect()

print('+ Predicting using test data...')
pred0_test = model_fold0.predict(dtest)
pred1_test = model_fold1.predict(dtest)
pred_test = (pred0_test + pred1_test) / 2

np.save('predictions/xgb_mtv_pred_test.npy', pred_test)
del pred0_test, pred1_test, pred_test
gc.collect()
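
The two-fold averaging above generalizes directly to any number of fold models; a sketch (model paths are illustrative, `dtest` is the DMatrix built above):

import numpy as np
import xgboost as xgb

fold_paths = ['tmp/xgb_model_0.model', 'tmp/xgb_model_1.model']
fold_preds = []
for path in fold_paths:
    booster = xgb.Booster({'nthread': 4})
    booster.load_model(path)
    fold_preds.append(booster.predict(dtest))
pred_test = np.mean(fold_preds, axis=0)   # average the per-fold predictions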
Example #16
def xgboost_train(start_day, end_day):
    model_path = './sub/bst_%s_%s.model' % (start_day, end_day)
    # print model_path
    # exit()
    user_index, training_data, label = processed_train_set(start_day, end_day)
    # user_index, training_data, label = data_set()
    # user_index, training_data, label = make_train_set(train_start_date, train_end_date)
    # Randomly split the samples into train and test data. test_size is the fraction of samples
    # (or an absolute count if an integer); random_state seeds the split.
    x_train, x_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)
    # The block below extracts the user index of x_test so accuracy can be measured.
    x_train_df = pd.DataFrame(x_train)
    x_test_df = pd.DataFrame(x_test)
    # x_test_df.to_csv('./sub/x_test.csv', index=False, index_label=False)
    x_train = x_train_df.iloc[:, 2:].copy()
    test_index = x_test_df.iloc[:, [0, 1]].copy()
    test_index.columns = ['user_id', 'sku_id']
    x_test = x_test_df.iloc[:, 2:].copy()

    del training_data['user_id']
    del training_data['sku_id']

    if os.path.exists(model_path):
        print('Model Loading !')
        bst = xgb.Booster()  # note: the model file name must match the saved one, otherwise loading fails
        bst.load_model(model_path)
    else:
        dtrain = xgb.DMatrix(x_train.values, label=y_train)
        dtest = xgb.DMatrix(x_test.values, label=y_test)
        # 'max_delta_step':1
        # param = {'learning_rate': 0.15, 'n_estimators': 1000, 'max_depth': 3, 'max_delta_step': 1,
        #          'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
        #          'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
        # param = {'n_estimators': 1000, 'max_depth': 3, 'max_delta_step': 1,
        #          'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
        #          'scale_pos_weight': 13, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
        param = {
            'learning_rate': 0.15,
            'n_estimators': 1000,
            'max_depth': 3,
            'max_delta_step': 1,
            'min_child_weight': 5,
            'gamma': 0,
            'subsample': 1.0,
            'colsample_bytree': 0.8,
            'scale_pos_weight': 1,
            'eta': 0.05,
            'silent': 1,
            'objective': 'binary:logistic'
        }

        # num_round: number of boosting rounds
        # num_round = 290
        num_round = 290
        # parallelism: how many CPUs to use
        # param['nthread'] = 6
        # param['eval_metric'] = "auc"
        # items() returns key-value pairs; list() is needed so the result can be extended
        plst = list(param.items())
        plst += [('eval_metric', 'auc')]
        # evallist holds the evaluation sets; both the test and train sets are watched here
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        bst = xgb.train(plst, dtrain, num_round, evallist)
        bst.save_model(model_path)

    print("-------------- 测试集准确度-----------")
    # 测试集实际
    test_index['label'] = y_test
    test_true = test_index[test_index['label'] == 1]
    test_true = test_true[['user_id', 'sku_id']]
    # test_true.to_csv('./sub/testtrue1.csv', index=False, index_label=False)
    # Some users buy multiple SKUs, but the final result allows only one item per user; which one should be picked?
    # test_true = test_true.first().reset_index(drop=True)
    # test_true.to_csv('./sub/testtrue2.csv', index=False, index_label=False)
    del test_index['label']

    # predictions on the test set
    test_input = xgb.DMatrix(x_test.values)
    test_index['label'] = bst.predict(test_input)

    x_test['label'] = y_test
    for lv in label_level:
        print("this Probality = " + str(lv))
        test_pred = test_index[test_index['label'] >= lv]
        test_pred = test_pred[['user_id', 'sku_id']]
        test_pred = test_pred.groupby(
            'user_id', as_index=False).first().reset_index(drop=True)
        res = report(test_pred, test_true)
        if res == 1:
            break
        print("")
def model_load(model_path):
    # Load an already trained model (model_path: path to the model file)
    booster_model = xgb.Booster(model_file=model_path)
    return booster_model
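
Note that `xgb.Booster`'s first positional argument is a params dict, not a file path, which is why the fix above passes `model_file=` explicitly. For reference, the two equivalent ways to load a saved booster (file name illustrative):

import xgboost as xgb

bst_a = xgb.Booster(model_file='my_model.bin')   # load directly via the constructor

bst_b = xgb.Booster()                            # or construct first...
bst_b.load_model('my_model.bin')                 # ...then load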
Example #18
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import confusion_matrix


USE_MEMMAP = True


data = pd.read_csv('dataset.csv').to_numpy()  # .as_matrix() was removed from pandas

X = data[:, 0:-1]
y = data[:, -1]

if USE_MEMMAP:
	Xmm = np.memmap( 'X.mmap', dtype=X.dtype, mode='w+', shape=X.shape )
	ymm = np.memmap( 'y.mmap', dtype=y.dtype, mode='w+', shape=y.shape )
	np.copyto( Xmm, X )
	np.copyto( ymm, y )
	del( data )
	del( X )
	del( y )
	X = Xmm
	y = ymm

d = xgb.DMatrix( X, label=y )

model = xgb.Booster({'nthread':1})
model.load_model('xgb-model.bin')
cm = confusion_matrix(y, model.predict(d) > 0.5)
print(cm)


Example #19
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# most important code here
import pandas as pd
import xgboost as xgb
data = pd.read_csv('example_input.csv')  # read the input file
data.info()
data.columns
bst = xgb.Booster()  #initial instance
bst.load_model('xgb_model')  #load the xgboost model
xgb_data = xgb.DMatrix(data)  #transfer input data to xgb matrix
result = bst.predict(xgb_data)  #get the result
Example #20
def process(input_path, output_path):

    jieba.enable_parallel(4)

    df_predict = pd.DataFrame()
    #input file:  id  s1  s2
    df_input = pd.read_table(input_path)

    stopwords_path = "stopwords.txt"
    stopwords = []
    for word in open(stopwords_path, "r").readlines():
        stopwords.append(word.strip('\n'))

    def word_match_share(row):
        q1words = {}
        q2words = {}
        for word in jieba_cut(row['s1']):
            if word not in stopwords:
                q1words[word] = 1
        for word in jieba_cut(row['s2']):
            if word not in stopwords:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            # The computer-generated chaff includes a few questions that are nothing but stopwords
            return 0
        shared_words_in_q = [w for w in q1words.keys() if w in q2words]

        # Be sure to cast to float here, otherwise (under Python 2 division) the result would be an int
        R = float(len(shared_words_in_q) * 2) / (len(q1words) + len(q2words))
        return R

    input_qs = pd.Series(df_input['s1'].tolist() +
                         df_input['s2'].tolist()).astype(str)
    from collections import Counter

    def get_weight(count, eps=10000, min_count=2):
        if count < min_count:
            return 0
        else:
            return 1.0 / (count + eps)

    eps = 5000
    total_words = jieba_cut(" ".join(input_qs))
    counts = Counter(total_words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    def tfidf_word_match_share(row):
        q1words = {}
        q2words = {}
        for word in jieba_cut(row['s1']):
            if word not in stopwords:
                q1words[word] = 1
        for word in jieba_cut(row['s2']):
            if word not in stopwords:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            # The computer-generated chaff includes a few questions that are nothing but stopwords
            return 0

        shared_weights = [
            weights.get(w, 0) for w in q1words.keys() if w in q2words
        ] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
        total_weights = [weights.get(w, 0) for w in q1words
                         ] + [weights.get(w, 0) for w in q2words]

        R = sum(shared_weights) / sum(total_weights)
        return R

    df_predict['word_match'] = df_input.apply(word_match_share,
                                              axis=1,
                                              raw=True)
    df_predict['tfidf_word_match'] = df_input.apply(tfidf_word_match_share,
                                                    axis=1,
                                                    raw=True)

    bst = xgboost.Booster(model_file="./model/xgboost.model")
    d_predict = xgboost.DMatrix(df_predict)
    predict = bst.predict(d_predict)

    df_output = pd.DataFrame()
    df_output['id'] = df_input['id']
    df_output['predict'] = [int(x + 0.5) for x in predict]
    with open(output_path, 'w') as fout:
        for index in df_output.index:
            fout.write(
                str(df_output['id'].loc[index]) + '\t' +
                str(df_output['predict'].loc[index]) + '\n')
Example #21
def Predict(outfilename, modelname, tempscalerfile, origintime, foretime,
            csvfile, demcsv):
    try:
        # Build the training matrices for temperature, max temperature and min temperature. Reading the
        # files is slow, so precipitation is read at the same time; rain/no-rain and precipitation are trained together.
        logger.info('----------------------------------------------')
        tempvariablelist = []
        stationlist = []
        calculateStationVariable(tempvariablelist, outfilename, stationlist,
                                 csvfile, demcsv)
        # load the trained model
        params = {
            'booster': 'gbtree',
            'objective': 'reg:linear',  # linear regression
            'gamma': 0.2,  # post-pruning control; larger is more conservative, typically 0.1-0.2
            'max_depth': 12,  # tree depth; deeper trees overfit more easily
            'lambda': 2,  # L2 regularization on leaf weights; larger values make overfitting less likely
            'subsample': 0.7,  # row subsampling ratio for each boosting round
            'colsample_bytree': 0.7,  # column subsampling ratio per tree
            'min_child_weight': 3,
            # Defaults to 1: the minimum sum of instance weights (hessian) required in a leaf.
            # For imbalanced 0-1 classification with h around 0.01, min_child_weight=1 means a
            # leaf needs roughly 100 samples. Smaller values make overfitting more likely.
            'silent': 0,  # 1 suppresses run-time messages; 0 is recommended
            'eta': 0.02,  # behaves like a learning rate
            'seed': 1000,
            # 'nthread': 3,  # number of CPU threads; defaults to all cores if unset
            # 'eval_metric': 'auc'
            'scale_pos_weight': 1
        }
        bst = xgboost.Booster(params)
        bst.load_model(modelname)
        # predict with the temperature model
        ecvaluelist = numpy.array(tempvariablelist)
        # ecvaluelist=ecvaluelist.astype('float64')
        # logger.info('ecvaluelist')
        # logger.info(ecvaluelist)
        # load the scaler saved during training and standardize the data the same way as the model expects
        scaler = joblib.load(tempscalerfile)
        # transform returns a new array and leaves the original unchanged, so reassign the result
        ecvaluelist_t = scaler.transform(ecvaluelist)
        # logger.info(ecvaluelist)
        # logger.info(ecvaluelist_t)
        xgbtrain = xgboost.DMatrix(ecvaluelist_t)
        result = bst.predict(xgbtrain)
        # logger.info('result')
        # logger.info(result)
        #
        db = MySQLdb.connect('172.16.8.28', 'admin', 'moji_China_123', 'moge',
                             3307)
        # db = MySQLdb.connect('192.168.10.84', 'admin', 'moji_China_123','moge')
        cursor = db.cursor()
        origin = datetime.datetime.strftime(origintime, '%Y-%m-%d %H:%M:%S')
        forecast = datetime.datetime.strftime(foretime, '%Y-%m-%d %H:%M:%S')
        forecast_year = foretime.year
        forecast_month = foretime.month
        forecast_day = foretime.day
        forecast_hour = foretime.hour
        forecast_minute = foretime.minute
        timestr = datetime.datetime.strftime(origintime, '%Y%m%d%H%M%S')
        # csv = os.path.join(outpath, origin+'_'+forecast + '.csv')
        # csvfile = open(csv, 'w')
        sql = 'replace into t_r_ec_city_forecast_ele_mos_dem (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'
        print(sql)
        L = []
        for j in range(len(stationlist)):
            perstationlist = []
            stationid = stationlist[j][0]
            temp = result[j]
            # build one record per station
            perstationlist.append(stationid)
            perstationlist.append(origin)
            perstationlist.append(forecast)
            perstationlist.append(forecast_year)
            perstationlist.append(forecast_month)
            perstationlist.append(forecast_day)
            perstationlist.append(forecast_hour)
            perstationlist.append(temp)
            L.append(perstationlist)
            # logger.info(perstationlist)
            # sql='insert into t_r_ec_mos_city_forecast_ele(city_id,initial_time,forecast_time,forecsat_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES ()'
            # sql = 'insert into t_r_ec_city_forecast_ele_mos (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature,temp_max_6h,temp_min_6h,rainstate,precipitation)VALUES ("' + stationid + '","' + origin + '","' + str(
            #     forecast) + '","' + str(forecast_year) + '","' + str(
            #     forecast_month) + '","' + str(forecast_day) + '","' + str(
            #     forecast_hour) + '","' + str(temp) + '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)+ '")'
            # csvfile.write(stationid + '","' + origin + '","' + str(
            #     forecast) + '","' + str(forecast_year) + '","' + str(
            #     forecast_month) + '","' + str(forecast_day) + '","' + str(
            #     forecast_hour) + '","' + str(forecast_minute) + '","' + str(
            #     temp)+ '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue))
            # csvfile.write('\n')
            # print sql
            # cursor.execute(sql)
        cursor.executemany(sql, L)
        db.commit()
        db.close()
        # csvfile.close()
        # os.remove(outfilename)
        logger.info(outfilename)
    except Exception as e:
        logger.info(e.message)
def XGB(opts):
    reDirect = False
    FOLDER = 'clean_vpn12_xgb'
    if not os.path.exists(FOLDER):
        os.mkdir(FOLDER)
    MODEL_PATH = FOLDER + '/model.h5'
    FIG_PATH = FOLDER + '/Confusion_Matrix.png'
    FIG_PATH_N = FOLDER + '/Confusion_Matrix_Norm.png'

    import sys
    if (reDirect):
        old_stdout = sys.stdout
        sys.stdout = open(FOLDER + '/log', 'w')

    X_train = np.load(opts.source_data_folder + '/X_train.npy')
    y_train = np.load(opts.source_data_folder + '/y_train.npy')
    X_train = X_train.astype('float32')

    print('X_train:', np.shape(X_train))
    print('y_train:', np.shape(y_train))

    maxsize = 0
    print('-' * 20)
    for cat in np.unique(y_train):
        size = np.shape(np.where(y_train == cat))[1]
        print(str(cat) + ": " + str(np.shape(np.where(y_train == cat))[1]))
        if (size > maxsize):
            maxsize = size
    print('-' * 20)

    y = y_train

    X_train = normalize(X_train,
                        norm='l2',
                        axis=0,
                        copy=True,
                        return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.33,
                                                        random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.1,
                                                      random_state=42)

    dim = np.shape(X_train)[1]
    print(dim)

    #Setting Classifier
    xgbc = XGBClassifier(max_depth=20,
                         tree_method='exact',
                         n_estimators=180,
                         n_jobs=-1)
    #training
    xgbc.fit(X_train,
             y_train,
             eval_set=[(X_train, y_train), (X_val, y_val)],
             early_stopping_rounds=30,
             verbose=True)

    results = xgbc.score(X_test, y_test)

    print('Test accuracy: ', results)

    if (reDirect):
        sys.stdout = old_stdout
    print('Test accuracy: ', results)

    xgbc.get_booster().save_model(MODEL_PATH)

    y_pred = xgbc.predict(X_test)

    #load the best model
    import xgboost as xgb
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model(MODEL_PATH)  # load the saved model
    y_pred = bst.predict(xgb.DMatrix(X_test))  # Booster.predict expects a DMatrix

    y_p = y_pred
    y_t = y_test
    class_names = [DIG2LABEL[i] for i in range(nclass)]
    cnf_matrix = confusion_matrix(y_t, y_p)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Confusion matrix, without normalization')
    plt.savefig(FIG_PATH)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig(FIG_PATH_N)

    print('f1-score = {}'.format(f1_score(y_t, y_p, average=None)))
    print('precision = {}'.format(precision_score(y_t, y_p, average=None)))
    print('recall = {}'.format(recall_score(y_t, y_p, average=None)))
    print('macro f1 = {}'.format(f1_score(y_t, y_p, average='macro')))
Example #23
 def test_Booster_init_invalid_path(self):
     """An invalid model_file path should raise XGBoostError."""
     with pytest.raises(xgb.core.XGBoostError):
         xgb.Booster(model_file=Path("invalidpath"))
 def load(self, model_fp):
     self.model = xgb.Booster(self.params)
     self.model.load_model(model_fp)
Example #25
'cost_amount_mean食堂',
'cost_amount_sum教务处',
'cost_reason_count淋浴',
'cost_amount_sum1',
'cost_amount_mean教务处',
'cost_amount_sum图书馆',
'rank',
'cost_amount_sum淋浴',
'cost_amount_mean卡充值',
'cost_amount_sum洗衣房',
'cost_amount_mean淋浴',
'cost_amount_mean超市',
'libraryCount',
'borrow_count',
'cost_amount_sum文印中心',
'cost_amount_mean图书馆',
'cost_amount_mean文印中心',
'stu_id'
]]
clf = xgb.Booster({'nthread': 4})  # init model
clf.load_model(r'.\model\myxgb_5.m')  # load the saved model
Dtest = xgb.DMatrix(test_data1)
predict = clf.predict(Dtest)  #studentid,subsidy
print(sum(predict))
student['subsidy'] = predict
student['subsidy'] = student['subsidy'].replace({1:1000,2:1500,3:2000}).astype('int64')
studentall = pd.read_csv(r'.\data\test\studentID_test.txt',header=None,names=['stu_id'])  # the test table
studentall = studentall.merge(student,on='stu_id',how = 'left').fillna(0).astype('int64')
studentall.rename(columns = {'stu_id':'studentid'},inplace = True)
studentall[['studentid','subsidy']].to_csv(r'.\ans\anwser_1120_1.csv',index = False,header = True)
Example #26
import gauss3d


# --------------------- CUSTOM FUNC -------------------------
def im2double(im):
    min_val = np.min(im.ravel())
    max_val = np.max(im.ravel())
    if max_val != min_val:
        out = (im.astype('float') - min_val) / (max_val - min_val)
    else:
        out = im.astype('float') / 255
    return out


## -------------------- LOAD MODEL ----------------------------
dst = xgb.Booster()
dst.load_model("./hog3d.model")
## -------------------- DATASET -------------------------------
data_path = "D:/Proj/UAV/dataset/drones/"
data_postfix = ".avi"
data_num = 1
cap = cv.VideoCapture(data_path + "Video_%s" % data_num + data_postfix)
# ---------------------- PARAMS -------------------------------

CUBE_T, CUBE_Y, CUBE_X = (4, 64, 64)  # define the size of each st-cube to be processed
HOG_SIZE = (int(np.ceil(CUBE_X / 4)), int(np.ceil(CUBE_T / 2)))
HOG_STEP = (int(np.ceil(CUBE_X / 4)), int(np.ceil(CUBE_T / 2)))
BCDIV = 3

GAU_SIGMA = (1, 3, 3)  #(t,y,x)
Example #27
def load_model(model_path):
    model = xgb.Booster()
    model.load_model(model_path)
    return model
Example #28
clause_dir = "{}/clauses".format(args.outdir)
proof_dir = "{}/proofs".format(args.outdir)
os.makedirs(value_train_dir, exist_ok=True)
os.makedirs(policy_train_dir, exist_ok=True)
os.makedirs(clause_dir, exist_ok=True)
os.makedirs(proof_dir, exist_ok=True)




if args.model_type == "xgboost" and args.guided > 0:
    assert args.guidance_dir is not None
    value_modelfile = "{}/value_xgb".format(args.guidance_dir)
    policy_modelfile = "{}/policy_xgb".format(args.guidance_dir)
    if args.guided == 1: # using python to access xgboost    
        value_model = xgb.Booster() 
        value_model.load_model(value_modelfile)
        policy_model = xgb.Booster()
        policy_model.load_model(policy_modelfile)
elif args.model_type == "Simple Dense":
    assert args.guidance_dir is not None
    value_modelfile = "{}/value_xgb".format(args.guidance_dir)
    policy_modelfile = "{}/policy_xgb".format(args.guidance_dir)
    if args.guided == 1: # using python to access xgboost    
        value_model = tf.keras.models.load_model(value_modelfile)
        policy_model = tf.keras.models.load_model(policy_modelfile)


n_features = get_max_fea(args)

def conv_state(state):
Example #29
def main():
    with open('players.dictionary', 'rb') as f:
        players = pickle.load(f)

    with open('20players.dictionary', 'rb') as f:
        players_this_season = pickle.load(f)

    with open('20teams.dictionary', 'rb') as f:
        teams = pickle.load(f)

    players_this_postseason = {}
    matchups = {}
    results = {'champ':{}, \
               'finals':{}, \
               'conf':{}, \
               'semi':{}, \
               'playoffs':{}}

    model = xgb.Booster({'nthread': 4})  # init model
    model.load_model('basketball.model')
    lineups = {'LAL':['jamesle01', 'davisan02', 'greenda02', 'caldwke01', 'mcgeeja01', 'kuzmaky01', 'howardw01', 'carusal01', 'cookqu01', 'smithjr01', 'morrima02', 'waitedi01', 'dudleja01'],\
               'LAC':['leonaka01', 'georgpa01', 'morrima03', 'beverpa01', 'zubaciv01', 'shamela01', 'harremo01', 'willilo02', 'jacksre01', 'greenja01', 'mcgruro01', 'noahjo01', 'pattepa01'], \
               'MIL':['antetgi01', 'middlkh01', 'bledser01', 'lopezbr01', 'matthwe02', 'divindo01', 'ilyaser01', 'korveky01', 'lopezro01', 'hillge01', 'connapa01', 'brownst02'], \
               'PHI':['simmobe01', 'harrito02', 'embiijo01', 'richajo01', 'thybuma01', 'horfoal01', 'korkmfu01', 'robingl02', 'burksal01', 'scottmi01', 'netora01', 'miltosh01', 'oquinky01'], \
               'HOU':['hardeja01', 'westbru01', 'tuckepj01', 'covinro01', 'houseda01', 'gordoer01', 'riverau01', 'mclembe01', 'greenje02', 'sefolth01', 'carrode01', 'mbahalu01', 'nwabada01'], \
               'BOS':['walkeke02', 'haywago01', 'tatumja01', 'brownja02', 'theisda01', 'smartma01', 'kanteen01', 'willigr01', 'wanambr01', 'ojelese01', 'williro04'], \
               'TOR':['lowryky01', 'siakapa01', 'anunoog01', 'gasolma01', 'vanvlfr01', 'powelno01', 'ibakase01', 'holliro01', 'daviste02', 'mccawpa01', 'bouchch01', 'thomama02'], \
               'DEN':['jokicni01', 'murraja01', 'harriga01', 'millspa01', 'bartowi01', 'grantje01', 'craigto01', 'morrimo01', 'plumlma01', 'doziepj01', 'bateske01', 'vonleno01', 'portemi01'], \
               'DAL':['doncilu01', 'porzikr01', 'finnedo01', 'hardati02', 'curryse01', 'klebima01', 'wrighde01', 'jacksju01', 'kiddgmi01', 'caulewi01', 'burketr01', 'marjabo01', 'bareajo01'], \
               'OKC':['paulch01', 'gallida01', 'adamsst01', 'gilgesh01', 'dortlu01', 'schrode01', 'fergute01', 'noelne01', 'roberan03', 'bazleda01', 'diallha01', 'naderab01', 'muscami01'], \
               'UTA':['goberru01', 'mitchdo01', 'conlemi01', 'inglejo01', 'onealro01', 'clarkjo01', 'niangge01', 'davised01', 'bradlto01', 'morgaju01', 'brantja01', 'tuckera01'], \
               'MIA':['butleji01', 'adebaba01', 'nunnke01', 'robindu01', 'crowdja01', 'dragigo01', 'herroty01', 'leoname01', 'jonesde02', 'iguodan01', 'olynyke01', 'hillso01'], \
               'IND':['warretj01', 'brogdma01', 'turnemy01', 'oladivi01', 'holidaa01', 'sabondo01', 'holidju01', 'mcderdo01', 'sumneed01', 'mccontj01', 'bitadgo01', 'leaftj01', 'johnsal02'], \
               'POR':['lillada01', 'mccolcj01', 'nurkiju01', 'anthoca01', 'colliza01', 'hoodro01', 'whiteha01', 'trentga02', 'hezonma01', 'simonan01', 'adamsja01', 'littlna01'], \
               'ORL':['vucevni01', 'fournev01', 'gordoaa01', 'augusdj01', 'ennisja01', 'isaacjo01', 'fultzma01', 'rosste01', 'birchkh01', 'cartemi01', 'iwundwe01', 'clarkga01'], \
               'BRK':['allenja01', 'harrijo01', 'leverca01', 'thomala01', 'johnsty01', 'templga01', 'kurucro01', 'anderju01', 'luwawti01', 'musadz01', 'chiozch01', 'martije02'], \
               'MEM':['moranja01', 'jacksja02', 'brookdi01', 'valanjo01', 'anderky01', 'meltode01', 'clarkbr01', 'jacksjo02', 'dienggo01', 'gudurma01', 'tollian01', 'konchjo01', 'watanyu01'], \
               'NOP':['holidjr01', 'ingrabr01', 'willizi01', 'favorde01', 'redicjj01', 'balllo01', 'hartjo01', 'mellini01', 'mooreet01', 'willike04', 'hayesja02', 'okafoja01'], \
               'SAC':['barneha02', 'foxde01', 'bjeline01','holmeri01', 'bogdabo01', 'hieldbu01', 'parkeja01', 'josepco01', 'bazemke01', 'ferreyo01', 'gilesha01', 'lenal01', 'breweco01'], \
               'WAS':['hachiru01', 'bryanth01', 'browntr01', 'napiesh01', 'bongais01', 'smithis01', 'mahinia01', 'wagnemo01', 'robinje01', 'pasecan01', 'paytoga02', 'grantje02', 'schofad01'], \
               'SAS':['derozde01', 'murrade01', 'whitede01', 'poeltja01', 'walkelo01', 'gayru01', 'forbebr01', 'millspa02', 'zellety01', 'belinma01', 'metuch01', 'samanlu01', 'eubandr01'], \
               'PHO':['bookede01', 'aytonde01', 'rubiori01', 'bridgmi01', 'johnsca02', 'saricda01', 'kaminfr01', 'baynear01', 'carteje01', 'payneca01', 'okoboel01', 'diallch01', 'jeromty01']}

    schedule = [{'team1':'UTA', 'team2':'NOP', 'date':datetime.date(2020, 7, 30)},\
                {'team1':'LAC', 'team2':'LAL', 'date':datetime.date(2020, 7, 30)},\
                {'team1':'ORL', 'team2':'BRK', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'PHO', 'team2':'WAS', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'MEM', 'team2':'POR', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'BOS', 'team2':'MIL', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'SAC', 'team2':'SAS', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'DAL', 'team2':'HOU', 'date':datetime.date(2020, 7, 31)},\
                {'team1':'MIA', 'team2':'DEN', 'date':datetime.date(2020, 8, 1)},\
                {'team1':'UTA', 'team2':'OKC', 'date':datetime.date(2020, 8, 1)},\
                {'team1':'NOP', 'team2':'LAC', 'date':datetime.date(2020, 8, 1)},\
                {'team1':'PHI', 'team2':'IND', 'date':datetime.date(2020, 8, 1)},\
                {'team1':'LAL', 'team2':'TOR', 'date':datetime.date(2020, 8, 1)},\
                {'team1':'WAS', 'team2':'BRK', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'POR', 'team2':'BOS', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'SAS', 'team2':'MEM', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'SAC', 'team2':'ORL', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'MIL', 'team2':'HOU', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'DAL', 'team2':'PHO', 'date':datetime.date(2020, 8, 2)},\
                {'team1':'TOR', 'team2':'MIA', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'IND', 'team2':'WAS', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'DEN', 'team2':'OKC', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'MEM', 'team2':'NOP', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'SAS', 'team2':'PHI', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'LAL', 'team2':'UTA', 'date':datetime.date(2020, 8, 3)},\
                {'team1':'BRK', 'team2':'MIL', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'DAL', 'team2':'SAC', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'PHO', 'team2':'LAC', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'ORL', 'team2':'IND', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'BOS', 'team2':'MIA', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'HOU', 'team2':'POR', 'date':datetime.date(2020, 8, 4)},\
                {'team1':'MEM', 'team2':'UTA', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'PHI', 'team2':'WAS', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'DEN', 'team2':'SAS', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'OKC', 'team2':'LAL', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'TOR', 'team2':'ORL', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'BRK', 'team2':'BOS', 'date':datetime.date(2020, 8, 5)},\
                {'team1':'NOP', 'team2':'SAC', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'MIA', 'team2':'MIL', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'IND', 'team2':'PHO', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'LAC', 'team2':'DAL', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'POR', 'team2':'DEN', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'LAL', 'team2':'HOU', 'date':datetime.date(2020, 8, 6)},\
                {'team1':'UTA', 'team2':'SAS', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'OKC', 'team2':'MEM', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'SAC', 'team2':'BRK', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'ORL', 'team2':'PHI', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'WAS', 'team2':'NOP', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'BOS', 'team2':'TOR', 'date':datetime.date(2020, 8, 7)},\
                {'team1':'LAC', 'team2':'POR', 'date':datetime.date(2020, 8, 8)},\
                {'team1':'UTA', 'team2':'DEN', 'date':datetime.date(2020, 8, 8)},\
                {'team1':'LAL', 'team2':'IND', 'date':datetime.date(2020, 8, 8)},\
                {'team1':'PHO', 'team2':'MIA', 'date':datetime.date(2020, 8, 8)},\
                {'team1':'MIL', 'team2':'DAL', 'date':datetime.date(2020, 8, 8)},\
                {'team1':'WAS', 'team2':'OKC', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'MEM', 'team2':'TOR', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'SAS', 'team2':'NOP', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'ORL', 'team2':'BOS', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'PHI', 'team2':'POR', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'HOU', 'team2':'SAC', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'BRK', 'team2':'LAC', 'date':datetime.date(2020, 8, 9)},\
                {'team1':'OKC', 'team2':'PHO', 'date':datetime.date(2020, 8, 10)},\
                {'team1':'DAL', 'team2':'UTA', 'date':datetime.date(2020, 8, 10)},\
                {'team1':'TOR', 'team2':'MIL', 'date':datetime.date(2020, 8, 10)},\
                {'team1':'IND', 'team2':'MIA', 'date':datetime.date(2020, 8, 10)},\
                {'team1':'DEN', 'team2':'LAL', 'date':datetime.date(2020, 8, 10)},\
                {'team1':'BRK', 'team2':'ORL', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'HOU', 'team2':'SAS', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'PHO', 'team2':'PHI', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'POR', 'team2':'DAL', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'BOS', 'team2':'MEM', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'MIL', 'team2':'WAS', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'NOP', 'team2':'SAC', 'date':datetime.date(2020, 8, 11)},\
                {'team1':'IND', 'team2':'HOU', 'date':datetime.date(2020, 8, 12)},\
                {'team1':'TOR', 'team2':'PHI', 'date':datetime.date(2020, 8, 12)},\
                {'team1':'MIA', 'team2':'OKC', 'date':datetime.date(2020, 8, 12)},\
                {'team1':'LAC', 'team2':'DEN', 'date':datetime.date(2020, 8, 12)},\
                {'team1':'SAS', 'team2':'UTA', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'SAC', 'team2':'LAL', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'MIL', 'team2':'MEM', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'WAS', 'team2':'BOS', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'POR', 'team2':'BRK', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'NOP', 'team2':'ORL', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'DAL', 'team2':'PHO', 'date':datetime.date(2020, 8, 13)},\
                {'team1':'PHI', 'team2':'HOU', 'date':datetime.date(2020, 8, 14)},\
                {'team1':'DEN', 'team2':'TOR', 'date':datetime.date(2020, 8, 14)},\
                {'team1':'OKC', 'team2':'LAC', 'date':datetime.date(2020, 8, 14)},\
                {'team1':'MIA', 'team2':'IND', 'date':datetime.date(2020, 8, 14)}]

    for i in range(1, 100001):
        simulation = simulate_season.Simulation(teams, players, players_this_season, players_this_postseason, schedule, model, lineups, matchups, results)
        simulation.simulate_reg_season()
        simulation.simulate_playoffs(datetime.date(2020, 8, 17))
        if i % 50 == 0:
            print(i)
            for matchup in matchups.keys():
                print(matchup[0] + ',' + matchup[1] + ',' + str(matchups[matchup]))

            for team in teams.keys():
                print(team + ',' + str(get_result(results, 'playoffs', team)) + ',' + str(get_result(results, 'semi', team)) + ',' + str(get_result(results, 'conf', team)) + ',' + str(get_result(results, 'finals', team)) + ',' + str(get_result(results, 'champ', team)))

    # print(matchups)
    # print(simulation.results)
    for matchup in matchups.keys():
        print(str(matchup) + ',' + str(matchups[matchup]))

    for team in teams.keys():
        print(team + ',' + str(get_result(results, 'playoffs', team)) + ',' + str(get_result(results, 'semi', team)) + ',' + str(get_result(results, 'conf', team)) + ',' + str(get_result(results, 'finals', team)) + ',' + str(get_result(results, 'champ', team)))
import gc
import sys
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split

# num = sys.argv[1]
num = 1

path = '../../output/stack-data/'

out_path = '../../output/results/xgb/'

bst = xgb.Booster(model_file='xgb{0}.model'.format(num))  # load model


# valid = pd.read_csv(path + 'valid{0}.csv'.format(num))
# label_valid = np.array(valid['label'])
# valid.drop(['label'], axis=1, inplace=True)
#
# xgb_val = xgb.DMatrix(valid)
#
# del valid
# gc.collect()
#
# val_pred = bst.predict(xgb_val)
# output = open(out_path + 'subval{0}.csv'.format(num), 'w')
# output.write('label,xgb_prob\n')
# for t, p in enumerate(val_pred, start=1):