def load(self):
    model_file = os.path.join(
        seldon_core.Storage.download(self.model_uri), BOOSTER_FILE)
    self._booster = xgb.Booster(model_file=model_file)
    self.ready = True
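# A minimal companion sketch (not part of the original class; the method name
# and signature are assumptions): once load() has run, predictions can be
# served by wrapping the incoming array in a DMatrix.
def predict(self, X, names=None):
    if not self.ready:
        self.load()
    return self._booster.predict(xgb.DMatrix(X))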
def xgbmodel_testing_patch():
    bst_filename = './Patch.model'
    bst = xgb.Booster({'nthread': 40})
    bst.load_model(bst_filename)
    bst_assemble = []
    bst_assemble.append(bst)
def train(self, dtrain, num_rounds=100, skip_rounds=10, evals=[],
          silent=False, plot=False):
    """
    HitBoost model training or watching the learning curve on an evaluation set.

    Parameters
    ----------
    dtrain: xgboost.DMatrix
        Training data for survival analysis. It's suggested that you utilize
        tools of the `datasets` module to convert a pd.DataFrame to xgboost.DMatrix.
    num_rounds: int
        The number of iterations.
    skip_rounds: int
        The number of skipped rounds if you want to print infos.
    evals: list of pairs (xgb.DMatrix, string)
        Evaluation set to watch the learning curve. If it is left as an empty
        list (the default), the training data becomes the evaluation set.
    silent: boolean
        Keep silence or print information.
    plot: boolean
        Plot the learning curve.

    Returns
    -------
    dict:
        Evaluation result during training, which is formatted as
        `{'td-CI': [], 'Loss': []}`.
    """
    # First check the arguments
    _check_params(self.model_params)
    if not isinstance(dtrain, xgb.DMatrix):
        raise TypeError("The type of dtrain must be 'xgb.DMatrix'")
    if len(evals) == 0:
        eval_labels = ['train']
        eval_datas = [dtrain]
    else:
        if not isinstance(evals[0], tuple):
            raise TypeError("evals must be a list of (xgb.DMatrix, string) pairs")
        eval_labels = [c[1] for c in evals]
        eval_datas = [c[0] for c in evals]
    # Logging for result
    eval_result = {'td-CI': [], 'Loss': []}
    self._model = xgb.Booster(self.model_params, [dtrain])
    for _ in range(num_rounds):
        # Note: Since the default setting of `output_margin` is `False`,
        # the prediction is output after the softmax transformation.
        pred = self._model.predict(dtrain)
        # Note: The gradient you provide for `model.boost()` must be the
        # gradient of the objective function with respect to the direct
        # output of the boosting tree (even if you set `output_margin` to
        # `True`).
        g, h = _hit_grads(pred, dtrain)
        self._model.boost(dtrain, g, h)
        # Append to eval_result (returns a list of values per evaluation set)
        res_loss, res_ci = _hit_eval(self._model, eval_datas)
        eval_result['Loss'].append(res_loss)
        eval_result['td-CI'].append(res_ci)
        if not silent and (_ + 1) % skip_rounds == 0:
            _print_eval(_ + 1, res_loss, res_ci, eval_labels)
    # Plot learning curve
    if plot:
        plot_train_curve(eval_result['Loss'], eval_labels, "Loss function")
        plot_train_curve(eval_result['td-CI'], eval_labels,
                         "Time-Dependent C-index")
    return eval_result
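# Hedged usage sketch for the train() method above; the HitBoost constructor
# call, its parameters, and the DMatrix file names are assumptions, not taken
# from the source.
dtrain = xgb.DMatrix('survival_train.buffer')
dvalid = xgb.DMatrix('survival_valid.buffer')
model = HitBoost({'eta': 0.1, 'max_depth': 3})  # hypothetical instantiation
history = model.train(dtrain,
                      num_rounds=200,
                      skip_rounds=20,
                      evals=[(dtrain, 'train'), (dvalid, 'valid')],
                      plot=True)
print(history['td-CI'][-1])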
def test_load_file_invalid(self):
    with pytest.raises(xgb.core.XGBoostError):
        xgb.Booster(model_file='incorrect_path')

    with pytest.raises(xgb.core.XGBoostError):
        xgb.Booster(model_file=u'不正なパス')
def local_train(original_sql,
                model_image,
                estimator_string,
                datasource,
                select,
                validation_select,
                model_params,
                train_params,
                feature_metas,
                feature_column_names,
                feature_column_map,
                label_column,
                transform_fn,
                save,
                load="",
                is_pai=False,
                pai_train_table="",
                pai_validate_table="",
                oss_model_dir=""):
    disk_cache = train_params.pop("disk_cache", False)
    batch_size = train_params.pop("batch_size", None)
    if batch_size is not None and batch_size < 0:
        batch_size = None
    epoch = train_params.pop("epoch", 1)
    num_workers = train_params.pop("num_workers", 1)
    label_meta_dict = label_column.get_field_desc()[0].to_dict(
        dtype_to_string=True)

    file_name = "my_model"
    bst = None
    if load:
        with temp_file.TemporaryDirectory(as_cwd=True):
            Model.load_from_db(datasource, load)
            bst = xgb.Booster()
            bst.load_model(file_name)

    def build_dataset(fn, slct, pai_table):
        return xgb_dataset(datasource,
                           fn,
                           slct,
                           feature_metas,
                           feature_column_names,
                           label_meta_dict,
                           cache=disk_cache,
                           batch_size=batch_size,
                           epoch=epoch,
                           transform_fn=transform_fn,
                           is_pai=is_pai,
                           pai_table=pai_table,
                           feature_column_code=feature_column_map)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        train_fn = os.path.join(tmp_dir_name, 'train.txt')
        val_fn = os.path.join(tmp_dir_name, 'val.txt')
        train_dataset = build_dataset(train_fn, select, pai_train_table)
        if validation_select:
            val_dataset = build_dataset(val_fn, validation_select,
                                        pai_validate_table)
        else:
            val_dataset = None

        eval_result = dict()
        watchlist = [None]
        if val_dataset:
            # The `xgboost.train` API only accepts the XGBoost DMatrix
            # object as the training or validation dataset, so we should
            # convert the generator to DMatrix.
            if isinstance(val_dataset, types.GeneratorType):
                val_dataset = list(val_dataset)[0]
            watchlist.append((val_dataset, "validate"))

        for per_batch_dmatrix in train_dataset:
            watchlist[0] = (per_batch_dmatrix, "train")
            bst = xgb.train(model_params,
                            per_batch_dmatrix,
                            evals=watchlist,
                            evals_result=eval_result,
                            xgb_model=bst,
                            **train_params)
            print("Evaluation result: %s" % eval_result)

        meta = collect_metadata(original_sql=original_sql,
                                select=select,
                                validation_select=validation_select,
                                model_repo_image=model_image,
                                class_name=estimator_string,
                                attributes=model_params,
                                features=feature_column_map,
                                label=label_column,
                                evaluation=eval_result,
                                num_workers=num_workers)

        save_model_to_local_file(bst, model_params, file_name)
        model = Model(EstimatorType.XGBOOST, meta)
        model.save_to_db(datasource, save)
    return eval_result
def train(datasource,
          select,
          model_params,
          train_params,
          feature_metas,
          feature_column_names,
          label_meta,
          validation_select,
          disk_cache=False,
          batch_size=None,
          epoch=1,
          load_pretrained_model=False,
          is_pai=False,
          pai_train_table="",
          pai_validate_table="",
          rank=0,
          nworkers=1,
          oss_model_dir="",
          transform_fn=None,
          feature_column_code="",
          model_repo_image="",
          original_sql=""):
    if batch_size == -1:
        batch_size = None
    print("Start training XGBoost model...")
    dtrain = xgb_dataset(datasource,
                         'train.txt',
                         select,
                         feature_metas,
                         feature_column_names,
                         label_meta,
                         is_pai,
                         pai_train_table,
                         cache=disk_cache,
                         batch_size=batch_size,
                         epoch=epoch,
                         rank=rank,
                         nworkers=nworkers,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)

    if len(validation_select.strip()) > 0:
        dvalidate = list(
            xgb_dataset(datasource,
                        'validate.txt',
                        validation_select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_validate_table,
                        rank=rank,
                        nworkers=nworkers,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code))[0]

    filename = "my_model"
    if load_pretrained_model:
        bst = xgb.Booster()
        bst.load_model(filename)
    else:
        bst = None

    re = None
    for per_batch_dmatrix in dtrain:
        watchlist = [(per_batch_dmatrix, "train")]
        if len(validation_select.strip()) > 0:
            watchlist.append((dvalidate, "validate"))

        re = dict()
        bst = xgb.train(model_params,
                        per_batch_dmatrix,
                        evals=watchlist,
                        evals_result=re,
                        xgb_model=bst,
                        **train_params)
        print("Evaluation result: %s" % re)

    if rank == 0:
        # TODO(sneaxiy): collect features and label
        metadata = collect_metadata(original_sql=original_sql,
                                    select=select,
                                    validation_select=validation_select,
                                    model_repo_image=model_repo_image,
                                    class_name=model_params.get("booster"),
                                    attributes=model_params,
                                    features=None,
                                    label=None,
                                    evaluation=re)

        save_model_to_local_file(bst, model_params, filename)
        save_metadata("model_meta.json", metadata)
        if is_pai and len(oss_model_dir) > 0:
            save_model(oss_model_dir, filename, model_params, train_params,
                       feature_metas, feature_column_names, label_meta,
                       feature_column_code)
from sklearn.metrics import roc_auc_score

# model = xgb.Booster(model_file='xgb_xk.model')
# x_test = xgb.DMatrix(x_test)
offline_result = model.predict(x_test)
print('auc:', roc_auc_score(y_test, offline_result))

# The following commands compress and decompress tar.bz2 archives; they work remarkably well.
# !tar -jcvf <archive name to create> <files to compress>  # compress
# !tar -jxvf <archive to extract>                          # decompress

model = xgb.Booster(model_file='xgb_xk.model')

# Start predicting
feature_pre = pd.read_csv('xk_feature_1_30.csv')
# feature_pre.pop('activity_days_nums')
feature_pre['action_rate1'] = (feature_pre['actions_pre_1_numbers'] /
                               feature_pre['actions_numbers']).map(lambda x: round(x, 2))
feature_pre['action_rate3'] = (feature_pre['actions_pre_3_numbers'] /
                               feature_pre['actions_numbers']).map(lambda x: round(x, 2))
feature_pre['action_rate5'] = (feature_pre['actions_pre_5_numbers'] /
                               feature_pre['actions_numbers']).map(lambda x: round(x, 2))
feature_pre['action_rate7'] = (feature_pre['actions_pre_7_numbers'] /
                               feature_pre['actions_numbers']).map(lambda x: round(x, 2))
feature_pre['action_rate13'] = (feature_pre['actions_pre_1_numbers'] /
                                feature_pre['actions_pre_3_numbers']).map(lambda x: round(x, 2))
feature_pre['action_rate15'] = (feature_pre['actions_pre_1_numbers'] /
# read train data and test data
# NOTE: pd.DataFrame.from_csv and the n_folds-style StratifiedKFold below are
# legacy APIs (pandas < 0.21, scikit-learn < 0.18).
train = pd.DataFrame.from_csv(train_value_path)
test = pd.DataFrame.from_csv(test_path)
train, test = date_parser(train, test)

# read train labels
train_labels = pd.DataFrame.from_csv(train_label_path)
label_encoder = LabelEncoder()
train_labels.iloc[:, 0] = label_encoder.fit_transform(train_labels.values.flatten())

if load_best_model:
    # build final model
    xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten())
    xg_test = xgboost.DMatrix(test)
    xgclassifier = xgboost.Booster(params)
    xgclassifier.load_model(best_model_path)
else:
    # find best boost round
    all_best_rounds = []
    kf = StratifiedKFold(
        train_labels.values.flatten(),
        n_folds=4,
        shuffle=True,
        random_state=0
    )
    for cv_train_index, cv_test_index in kf:
        xg_train = xgboost.DMatrix(
            train.values[cv_train_index, :],
            label=train_labels.iloc[cv_train_index].values.flatten())
        xg_test = xgboost.DMatrix(
            train.values[cv_test_index, :],
            label=train_labels.iloc[cv_test_index].values.flatten())
        xgclassifier = xgboost.train(
#
# Author : fcbruce <*****@*****.**>
#
# Time : Thu 01 Dec 2016 17:01:56
#
#

import xgboost as xgb
import numpy as np
import math

rows_file = '../data/1201.npy'
model_file = '../models/RiskModel20161124.model'


def sigmoid(x):
    return 1. / (1 + np.exp(-x))


rows = np.load(rows_file)
mat = xgb.DMatrix(rows)

bst = xgb.Booster(model_file=model_file)

# Apply the sigmoid to the predictions before printing them.
print(sigmoid(bst.predict(mat)))
    ('open_to_new_job_I am actively looking for a new job',
     'look_postings_frequent'),
    ('interview_likelihood', 'interview'),
]

FEATS_2017 = [
    ('CareerSatisfaction', 'like_developer'),
    ('HoursPerWeek', 'hours_per_week'),
    ('Overpaid', 'overpaid'),
    ('LastNewJob_Less than a year ago', 'curr_job_less_than_year'),
    ('InfluenceWorkstation', 'choose_equip'),
    ('Salary', 'salary'),
]

CURR_FOLDER = os.path.dirname(os.path.realpath(__file__))

XGB_2015 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2015.model'))
XGB_2016 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2016.model'))
XGB_2017 = xgb.Booster(
    model_file=os.path.join(CURR_FOLDER, 'output2017.model'))

MODELS = (
    [XGB_2015, FEATS_2015],
    [XGB_2016, FEATS_2016],
    [XGB_2017, FEATS_2017],
)
NUM_MODELS = len(MODELS)
REQUIRED_KEYS = set(feat[1] for model in MODELS for feat in model[1])

TRAIN_OPTIONS = {
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

###################################################
import time

start_time = time.time()

test_data = pd.read_csv("test_data.csv")
test_y = test_data.y
test_X = test_data.drop(['y'], axis=1)
xgb_test = xgb.DMatrix(test_X, label=test_y)

model = xgb.Booster(model_file='./model/xgb.model')
y_hat = model.predict(xgb_test)
print(y_hat)
def xgboost_train(
    training_data_path: InputPath('CSV'),  # Also supports LibSVM
    model_path: OutputPath('XGBoostModel'),
    model_config_path: OutputPath('XGBoostModelConfig'),
    starting_model_path: InputPath('XGBoostModel') = None,

    label_column: int = 0,
    num_iterations: int = 10,
    booster_params: dict = None,

    # Booster parameters
    objective: str = 'reg:squarederror',
    booster: str = 'gbtree',
    learning_rate: float = 0.3,
    min_split_loss: float = 0,
    max_depth: int = 6,
):
    '''Train an XGBoost model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary XGBoost format.
        model_config_path: Output path for the internal parameter configuration of Booster as a JSON string.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.
        num_iterations: Number of boosting iterations.
        booster_params: Parameters for the booster. See https://xgboost.readthedocs.io/en/latest/parameter.html
        objective: The learning task and the corresponding learning objective.
            See https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
            The most common values are:
            "reg:squarederror" - Regression with squared loss (default).
            "reg:logistic" - Logistic regression.
            "binary:logistic" - Logistic regression for binary classification, output probability.
            "binary:logitraw" - Logistic regression for binary classification, output score before logistic transformation.
            "rank:pairwise" - Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized.
            "rank:ndcg" - Use LambdaMART to perform list-wise ranking where Normalized Discounted Cumulative Gain (NDCG) is maximized.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import pandas
    import xgboost

    df = pandas.read_csv(training_data_path)
    training_data = xgboost.DMatrix(
        data=df.drop(columns=[df.columns[label_column]]),
        label=df[df.columns[label_column]],
    )

    booster_params = booster_params or {}
    booster_params.setdefault('objective', objective)
    booster_params.setdefault('booster', booster)
    booster_params.setdefault('learning_rate', learning_rate)
    booster_params.setdefault('min_split_loss', min_split_loss)
    booster_params.setdefault('max_depth', max_depth)

    starting_model = None
    if starting_model_path:
        starting_model = xgboost.Booster(model_file=starting_model_path)

    model = xgboost.train(
        params=booster_params,
        dtrain=training_data,
        num_boost_round=num_iterations,
        xgb_model=starting_model,
    )

    # Saving the model in binary format
    model.save_model(model_path)

    model_config_str = model.save_config()
    with open(model_config_path, 'w') as model_config_file:
        model_config_file.write(model_config_str)
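# Hedged usage sketch: invoking the component function above directly (outside
# of a Kubeflow pipeline) on local files; every file name here is a placeholder.
if __name__ == '__main__':
    xgboost_train(
        training_data_path='train.csv',
        model_path='model.bin',
        model_config_path='model_config.json',
        num_iterations=50,
        booster_params={'max_depth': 4, 'objective': 'reg:squarederror'},
    )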
def predictmodel(file,filefullpath,modelfile,scalefile,stationlist,demdict,origintime,foretime): os.environ["CUDA_VISIBLE_DEVICES"] = '0,1' allvaluelist=[] if file[-3:]=='001' and file[:3]=='D1D': print file grbs=pygrib.open(filefullpath) grb_2t = grbs.select(name='2 metre temperature') tempArray = grb_2t[0].values grb_2d = grbs.select(name='2 metre dewpoint temperature') dewpointArray = grb_2d[0].values grb_10u = grbs.select(name='10 metre U wind component') u10Array = grb_10u[0].values grb_10v = grbs.select(name='10 metre V wind component') v10Array = grb_10v[0].values grb_tcc = grbs.select(name='Total cloud cover') tccArray = grb_tcc[0].values grb_lcc = grbs.select(name='Low cloud cover') lccArray = grb_lcc[0].values grb_z = grbs.select(name='Geopotential') geoArray=grb_z[0].values grb_500rh = grbs.select(name='Relative humidity', level=500) rh500Array = grb_500rh[0].values grb_850rh = grbs.select(name='Relative humidity', level=850) rh850Array = grb_850rh[0].values #遍历站点->要素遍历、 for i in range(len(stationlist)): #print len(stationlist) perlist=stationlist[i] stationid=perlist[0] latitude=float(perlist[1]) longitude=float(perlist[2]) alti=float(perlist[3]) #站点左上角点的索引 indexlat = int((90 - latitude) / 0.1) indexlon = int((longitude + 180) / 0.1) per_station_value_list=[] calculate16gribvalue(tempArray,indexlat,indexlon,per_station_value_list) calculate16gribvalue(dewpointArray,indexlat,indexlon,per_station_value_list) calculate16gribvalue(u10Array,indexlat,indexlon,per_station_value_list) calculate16gribvalue(v10Array,indexlat,indexlon,per_station_value_list) calculate16gribvalue(tccArray,indexlat,indexlon,per_station_value_list) calculate16gribvalue(lccArray,indexlat,indexlon,per_station_value_list) calculate16gribvalue(geoArray,indexlat,indexlon,per_station_value_list) calculate16gribvalue(rh500Array,indexlat,indexlon,per_station_value_list) calculate16gribvalue(rh850Array,indexlat,indexlon,per_station_value_list) per_station_value_list.append(latitude) per_station_value_list.append(longitude) per_station_value_list.append(alti) # 站点高程:取计算好的站点周边16个点的高程值 demlist = demdict[stationlist[i][0]] for u in range(1, len(demlist), 1): per_station_value_list.append(float(demlist[u])) allvaluelist.append(per_station_value_list) #print(per_station_value_list) trainarray=numpy.array(allvaluelist) params001 = { 'tree_method': 'gpu_hist', 'booster': 'gbtree', 'objective': 'reg:linear', # 线性回归 'gamma': 0.2, # 用于控制是否后剪枝的参数,越大越保守,一般0.1、0.2这样子。 'max_depth': 12, # 构建树的深度,越大越容易过拟合 'lambda': 2, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 'subsample': 0.7, # 随机采样训练样本 'colsample_bytree': 0.7, # 生成树时进行的列采样 'min_child_weight': 3, # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言 # ,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。 # 这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。 'silent': 0, # 设置成1则没有运行信息输出,最好是设置为0. 
'eta': 0.01, # 如同学习率 'seed': 1000, # 'nthread':3,# cpu 线程数,不设置取最大值 # 'eval_metric': 'auc' 'scale_pos_weight': 1, 'n_gpus': 2 } xgbst=xgboost.Booster(params001) xgbst.load_model(modelfile) scaler=joblib.load(scalefile) #print(modelfile,scalefile) trainarray_t=scaler.transform(trainarray) #标准化后的矩阵坑我2次了:看好是标准化后的还是标准化前的 xgbtrain=xgboost.DMatrix(trainarray_t) result=xgbst.predict(xgbtrain) #print(result) logger.info(result) #结果入库 db = MySQLdb.connect('172.16.8.28', 'admin', 'moji_China_123', 'moge',3307) #db = MySQLdb.connect('192.168.10.84', 'admin', 'moji_China_123','moge') cursor = db.cursor() origin = datetime.datetime.strftime(origintime, '%Y-%m-%d %H:%M:%S') forecast = datetime.datetime.strftime(foretime, '%Y-%m-%d %H:%M:%S') forecast_year = foretime.year forecast_month = foretime.month forecast_day = foretime.day forecast_hour = foretime.hour forecast_minute = foretime.minute timestr = datetime.datetime.strftime(origintime, '%Y%m%d%H%M%S') # csv = os.path.join(outpath, origin+'_'+forecast + '.csv') # csvfile = open(csv, 'w') sql = 'replace into t_r_ec_city_forecast_ele_mos_dem_winter3(city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' L = [] for j in range(len(stationlist)): perstationlist = [] stationid = stationlist[j][0] temp = result[j] # 每个站点存储 perstationlist.append(stationid) perstationlist.append(origin) perstationlist.append(forecast) perstationlist.append(forecast_year) perstationlist.append(forecast_month) perstationlist.append(forecast_day) perstationlist.append(forecast_hour) perstationlist.append(temp) L.append(perstationlist) logger.info(perstationlist) # # sql='insert into t_r_ec_mos_city_forecast_ele(city_id,initial_time,forecast_time,forecsat_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES ()' # # sql = 'insert into t_r_ec_city_forecast_ele_mos (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature,temp_max_6h,temp_min_6h,rainstate,precipitation)VALUES ("' + stationid + '","' + origin + '","' + str( # # forecast) + '","' + str(forecast_year) + '","' + str( # # forecast_month) + '","' + str(forecast_day) + '","' + str( # # forecast_hour) + '","' + str(temp) + '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)+ '") # # csvfile.write(stationid + '","' + origin + '","' + str( # # forecast) + '","' + str(forecast_year) + '","' + str( # # forecast_month) + '","' + str(forecast_day) + '","' + str( # # forecast_hour) + '","' + str(forecast_minute) + '","' + str( # # temp)+ '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)) # # csvfile.write('\n') # # print sql # # cursor.execute(sql) cursor.executemany(sql, L) db.commit() db.close()
def xgb_shap_values(x):
    bst = xgb.Booster()
    bst.load_model("my_model")
    explainer = shap.TreeExplainer(bst)
    return explainer.shap_values(x)
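# Hedged usage sketch for xgb_shap_values(); the feature file is a placeholder
# and "my_model" is assumed to exist next to it.
import pandas as pd
X = pd.read_csv('features.csv')
shap_values = xgb_shap_values(X)
print(type(shap_values))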
import gc

import numpy as np
import feather
import xgboost as xgb

print('+ Loading trained models...')

model_fold0 = xgb.Booster({'nthread': 4})
model_fold0.load_model('tmp/xgb_model_0.model')

model_fold1 = xgb.Booster({'nthread': 4})
model_fold1.load_model('tmp/xgb_model_1.model')

print('+ Loading test data...')

df_test = feather.read_dataframe('tmp/mtv_df_test.feather')

features = sorted(set(df_test.columns) - {'display_id', 'clicked'})

X_test = df_test[features].values
del df_test
gc.collect()

dtest = xgb.DMatrix(X_test, feature_names=features)
del X_test
gc.collect()

print('+ Predicting using test data...')

pred0_test = model_fold0.predict(dtest)
pred1_test = model_fold1.predict(dtest)
pred_test = (pred0_test + pred1_test) / 2

np.save('predictions/xgb_mtv_pred_test.npy', pred_test)

del pred0_test, pred1_test, pred_test
gc.collect()
def xgboost_train(start_day, end_day):
    model_path = './sub/bst_%s_%s.model' % (start_day, end_day)
    # print model_path
    # exit()
    user_index, training_data, label = processed_train_set(start_day, end_day)
    # user_index, training_data, label = data_set()
    # user_index, training_data, label = make_train_set(train_start_date, train_end_date)

    # Randomly split the samples into train and test data. test_size is the test
    # fraction (or an absolute count if an integer); random_state seeds the RNG.
    x_train, x_test, y_train, y_test = train_test_split(training_data.values,
                                                        label.values,
                                                        test_size=0.2,
                                                        random_state=0)

    # The block below extracts the user index of x_test so accuracy can be measured later.
    x_train_df = pd.DataFrame(x_train)
    x_test_df = pd.DataFrame(x_test)
    # x_test_df.to_csv('./sub/x_test.csv', index=False, index_label=False)
    x_train = x_train_df.iloc[:, 2:].copy()
    test_index = x_test_df.iloc[:, [0, 1]].copy()
    test_index.columns = ['user_id', 'sku_id']
    x_test = x_test_df.iloc[:, 2:].copy()

    del training_data['user_id']
    del training_data['sku_id']

    if os.path.exists(model_path):
        print('Model Loading !')
        bst = xgb.Booster()  # Note: the name must stay consistent, otherwise an error is raised!
        bst.load_model(model_path)
    else:
        dtrain = xgb.DMatrix(x_train.values, label=y_train)
        dtest = xgb.DMatrix(x_test.values, label=y_test)
        # 'max_delta_step': 1
        # param = {'learning_rate': 0.15, 'n_estimators': 1000, 'max_depth': 3, 'max_delta_step': 1,
        #          'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
        #          'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
        # param = {'n_estimators': 1000, 'max_depth': 3, 'max_delta_step': 1,
        #          'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 'colsample_bytree': 0.8,
        #          'scale_pos_weight': 13, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
        param = {
            'learning_rate': 0.15,
            'n_estimators': 1000,
            'max_depth': 3,
            'max_delta_step': 1,
            'min_child_weight': 5,
            'gamma': 0,
            'subsample': 1.0,
            'colsample_bytree': 0.8,
            'scale_pos_weight': 1,
            'eta': 0.05,
            'silent': 1,
            'objective': 'binary:logistic'
        }
        # num_round: number of training rounds
        # num_round = 290
        num_round = 290
        # Degree of parallelism: how many CPUs to use
        # param['nthread'] = 6
        # param['eval_metric'] = "auc"
        # items() returns the key/value pairs
        plst = list(param.items())
        plst += [('eval_metric', 'auc')]
        # evallist is the watch list; both the test set and the training set are evaluated here
        evallist = [(dtest, 'eval'), (dtrain, 'train')]
        bst = xgb.train(plst, dtrain, num_round, evallist)
        bst.save_model(model_path)

    print("-------------- test-set accuracy -----------")
    # Actual positives in the test set
    test_index['label'] = y_test
    test_true = test_index[test_index['label'] == 1]
    test_true = test_true[['user_id', 'sku_id']]
    # test_true.to_csv('./sub/testtrue1.csv', index=False, index_label=False)
    # Some users buy multiple SKUs, but the final answer allows only one item per user;
    # which of the bought items should be kept?
    # test_true = test_true.first().reset_index(drop=True)
    # test_true.to_csv('./sub/testtrue2.csv', index=False, index_label=False)
    del test_index['label']

    # Test-set predictions
    test_input = xgb.DMatrix(x_test.values)
    test_index['label'] = bst.predict(test_input)
    x_test['label'] = y_test
    for lv in label_level:
        print("this Probability = " + str(lv))
        test_pred = test_index[test_index['label'] >= lv]
        test_pred = test_pred[['user_id', 'sku_id']]
        test_pred = test_pred.groupby(
            'user_id', as_index=False).first().reset_index(drop=True)
        res = report(test_pred, test_true)
        if res == 1:
            break
        print("")
def model_load(model_path):
    # Load an already trained model (model_path: path to the model file).
    # Note: the path must be passed as `model_file=`; the first positional
    # argument of xgb.Booster() is the params dict.
    booster_model = xgb.Booster(model_file=model_path)
    return booster_model
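# Hedged usage sketch for model_load(); the model path and the 10-column dummy
# feature block are placeholders, not from the original code.
import numpy as np
booster = model_load('./model/xgb.model')
preds = booster.predict(xgb.DMatrix(np.random.rand(5, 10)))
print(preds)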
from sklearn.metrics import confusion_matrix

USE_MEMMAP = True

data = pd.read_csv('dataset.csv').values
X = data[:, 0:-1]
y = data[:, -1]

if USE_MEMMAP:
    Xmm = np.memmap('X.mmap', dtype=X.dtype, mode='w+', shape=X.shape)
    ymm = np.memmap('y.mmap', dtype=y.dtype, mode='w+', shape=y.shape)
    np.copyto(Xmm, X)
    np.copyto(ymm, y)
    del data
    del X
    del y
    X = Xmm
    y = ymm

d = xgb.DMatrix(X, label=y)

model = xgb.Booster({'nthread': 1})
model.load_model('xgb-model.bin')

cm = confusion_matrix(y, model.predict(d) > 0.5)
print(cm)
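# Hedged follow-up sketch: summary metrics derived from the confusion matrix
# computed above (assumes numpy is imported as np, as in the script).
accuracy = np.trace(cm) / cm.sum()
recall_per_class = np.diag(cm) / cm.sum(axis=1)
print('accuracy:', accuracy)
print('recall per class:', recall_per_class)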
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# most important code here
import pandas as pd
import xgboost as xgb

data = pd.read_csv('example_input.csv')  # read the input file
data.info()
data.columns

bst = xgb.Booster()               # initialize a Booster instance
bst.load_model('xgb_model')       # load the xgboost model
xgb_data = xgb.DMatrix(data)      # transfer input data to an xgb matrix
result = bst.predict(xgb_data)    # get the result
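# Hedged follow-up sketch: persist the predictions next to the input rows
# (assumes one prediction per row); the output file name is a placeholder.
data['prediction'] = result
data.to_csv('example_output.csv', index=False)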
def process(input_path, output_path):
    jieba.enable_parallel(4)

    df_predict = pd.DataFrame()
    # input file: id s1 s2
    df_input = pd.read_table(input_path)

    stopwords_path = "stopwords.txt"
    stopwords = []
    for word in open(stopwords_path, "r").readlines():
        stopwords.append(word.strip('\n'))

    def word_match_share(row):
        q1words = {}
        q2words = {}
        for word in jieba_cut(row['s1']):
            if word not in stopwords:
                q1words[word] = 1
        for word in jieba_cut(row['s2']):
            if word not in stopwords:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            # The computer-generated chaff includes a few questions that are nothing but stopwords
            return 0
        shared_words_in_q = [w for w in q1words.keys() if w in q2words]
        # Be sure to use float here, otherwise the final result is an int
        R = float(len(shared_words_in_q) * 2) / (len(q1words) + len(q2words))
        return R

    input_qs = pd.Series(df_input['s1'].tolist() +
                         df_input['s2'].tolist()).astype(str)

    from collections import Counter

    def get_weight(count, eps=10000, min_count=2):
        if count < min_count:
            return 0
        else:
            return 1.0 / (count + eps)

    eps = 5000
    total_words = jieba_cut(" ".join(input_qs))
    counts = Counter(total_words)
    weights = {word: get_weight(count) for word, count in counts.items()}

    def tfidf_word_match_share(row):
        q1words = {}
        q2words = {}
        for word in jieba_cut(row['s1']):
            if word not in stopwords:
                q1words[word] = 1
        for word in jieba_cut(row['s2']):
            if word not in stopwords:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            # The computer-generated chaff includes a few questions that are nothing but stopwords
            return 0
        shared_weights = [
            weights.get(w, 0) for w in q1words.keys() if w in q2words
        ] + [weights.get(w, 0) for w in q2words.keys() if w in q1words]
        total_weights = [weights.get(w, 0) for w in q1words
                         ] + [weights.get(w, 0) for w in q2words]
        R = sum(shared_weights) / sum(total_weights)
        return R

    df_predict['word_match'] = df_input.apply(word_match_share,
                                              axis=1,
                                              raw=True)
    df_predict['tfidf_word_match'] = df_input.apply(tfidf_word_match_share,
                                                    axis=1,
                                                    raw=True)

    bst = xgboost.Booster(model_file="./model/xgboost.model")
    d_predict = xgboost.DMatrix(df_predict)
    predict = bst.predict(d_predict)

    df_output = pd.DataFrame()
    df_output['id'] = df_input['id']
    df_output['predict'] = [int(x + 0.5) for x in predict]
    with open(output_path, 'w') as fout:
        for index in df_output.index:
            fout.write(
                str(df_output['id'].loc[index]) + '\t' +
                str(df_output['predict'].loc[index]) + '\n')
def Predict(outfilename, modelname, tempscalerfile, origintime, foretime, csvfile, demcsv): try: # 取气温、最高气温、最低气温的训练训练矩阵。读文件费劲,连降水一起取了。晴雨、降水一块训 logger.info('----------------------------------------------') tempvariablelist = [] stationlist = [] calculateStationVariable(tempvariablelist, outfilename, stationlist, csvfile, demcsv) # 加载训练模型 params = { 'booster': 'gbtree', 'objective': 'reg:linear', # 线性回归 'gamma': 0.2, # 用于控制是否后剪枝的参数,越大越保守,一般0.1、0.2这样子。 'max_depth': 12, # 构建树的深度,越大越容易过拟合 'lambda': 2, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 'subsample': 0.7, # 随机采样训练样本 'colsample_bytree': 0.7, # 生成树时进行的列采样 'min_child_weight': 3, # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言 # ,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。 # 这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。 'silent': 0, # 设置成1则没有运行信息输出,最好是设置为0. 'eta': 0.02, # 如同学习率 'seed': 1000, # 'nthread': 3, # cpu 线程数 # 'eval_metric': 'auc' 'scale_pos_weight': 1 } bst = xgboost.Booster(params) bst.load_model(modelname) # 气温模型预测 ecvaluelist = numpy.array(tempvariablelist) # ecvaluelist=ecvaluelist.astype('float64') # logger.info('ecvaluelist') # logger.info(ecvaluelist) # 加载标准化预处理文件,对数据进行与模型一致的标准化 scaler = joblib.load(tempscalerfile) # transform后必须重新复制,原来矩阵是不变的 ecvaluelist_t = scaler.transform(ecvaluelist) # logger.info(ecvaluelist) # logger.info(ecvaluelist_t) xgbtrain = xgboost.DMatrix(ecvaluelist_t) result = bst.predict(xgbtrain) # logger.info('result') # logger.info(result) # db = MySQLdb.connect('172.16.8.28', 'admin', 'moji_China_123', 'moge', 3307) # db = MySQLdb.connect('192.168.10.84', 'admin', 'moji_China_123','moge') cursor = db.cursor() origin = datetime.datetime.strftime(origintime, '%Y-%m-%d %H:%M:%S') forecast = datetime.datetime.strftime(foretime, '%Y-%m-%d %H:%M:%S') forecast_year = foretime.year forecast_month = foretime.month forecast_day = foretime.day forecast_hour = foretime.hour forecast_minute = foretime.minute timestr = datetime.datetime.strftime(origintime, '%Y%m%d%H%M%S') # csv = os.path.join(outpath, origin+'_'+forecast + '.csv') # csvfile = open(csv, 'w') sql = 'replace into t_r_ec_city_forecast_ele_mos_dem (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES(%s,%s,%s,%s,%s,%s,%s,%s)' print sql L = [] for j in range(len(stationlist)): perstationlist = [] stationid = stationlist[j][0] temp = result[j] # 每个站点存储 perstationlist.append(stationid) perstationlist.append(origin) perstationlist.append(forecast) perstationlist.append(forecast_year) perstationlist.append(forecast_month) perstationlist.append(forecast_day) perstationlist.append(forecast_hour) perstationlist.append(temp) L.append(perstationlist) # logger.info(perstationlist) # sql='insert into t_r_ec_mos_city_forecast_ele(city_id,initial_time,forecast_time,forecsat_year,forecast_month,forecast_day,forecast_hour,temperature)VALUES ()' # sql = 'insert into t_r_ec_city_forecast_ele_mos (city_id,initial_time,forecast_time,forecast_year,forecast_month,forecast_day,forecast_hour,temperature,temp_max_6h,temp_min_6h,rainstate,precipitation)VALUES ("' + stationid + '","' + origin + '","' + str( # forecast) + '","' + str(forecast_year) + '","' + str( # forecast_month) + '","' + str(forecast_day) + '","' + str( # forecast_hour) + '","' + str(temp) + '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)+ '")' # csvfile.write(stationid + '","' + origin + '","' + str( # forecast) + '","' + str(forecast_year) + '","' + str( # forecast_month) + '","' + 
str(forecast_day) + '","' + str( # forecast_hour) + '","' + str(forecast_minute) + '","' + str( # temp)+ '","' + str(maxtemp)+ '","' + str(mintemp)+'","' + str(rainstate)+'","' + str(prevalue)) # csvfile.write('\n') # print sql # cursor.execute(sql) cursor.executemany(sql, L) db.commit() db.close() # csvfile.close() # os.remove(outfilename) logger.info(outfilename) except Exception as e: logger.info(e.message)
def XGB(opts):
    reDirect = False
    FOLDER = 'clean_vpn12_xgb'
    if not os.path.exists(FOLDER):
        os.mkdir(FOLDER)
    MODEL_PATH = FOLDER + '/model.h5'
    FIG_PATH = FOLDER + '/Confusion_Matrix.png'
    FIG_PATH_N = FOLDER + '/Confusion_Matrix_Norm.png'

    import sys
    if (reDirect):
        old_stdout = sys.stdout
        sys.stdout = open(FOLDER + '/log', 'w')

    X_train = np.load(opts.source_data_folder + '/X_train.npy')
    y_train = np.load(opts.source_data_folder + '/y_train.npy')
    X_train = X_train.astype('float32')

    print('X_train:', np.shape(X_train))
    print('y_train:', np.shape(y_train))

    maxsize = 0
    print('-' * 20)
    for cat in np.unique(y_train):
        size = np.shape(np.where(y_train == cat))[1]
        print(str(cat) + ": " + str(np.shape(np.where(y_train == cat))[1]))
        if (size > maxsize):
            maxsize = size
    print('-' * 20)

    y = y_train
    X_train = normalize(X_train, norm='l2', axis=0, copy=True,
                        return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.33,
                                                        random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                      y_train,
                                                      test_size=0.1,
                                                      random_state=42)
    dim = np.shape(X_train)[1]
    print(dim)

    # Setting Classifier
    xgbc = XGBClassifier(max_depth=20,
                         tree_method='exact',
                         n_estimators=180,
                         n_jobs=-1)
    # training
    xgbc.fit(X_train,
             y_train,
             eval_set=[(X_train, y_train), (X_val, y_val)],
             early_stopping_rounds=30,
             verbose=True)

    results = xgbc.score(X_test, y_test)
    print('Test accuracy: ', results)

    if (reDirect):
        sys.stdout = old_stdout
        print('Test accuracy: ', results)

    xgbc.get_booster().save_model(MODEL_PATH)
    y_pred = xgbc.predict(X_test)

    # load the best model
    import xgboost as xgb
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model(MODEL_PATH)  # load the saved model
    # Booster.predict requires a DMatrix, not a raw ndarray.
    y_pred = bst.predict(xgb.DMatrix(X_test))

    y_p = y_pred
    y_t = y_test
    class_names = [DIG2LABEL[i] for i in range(nclass)]
    cnf_matrix = confusion_matrix(y_t, y_p)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          title='Confusion matrix, without normalization')
    plt.savefig(FIG_PATH)

    plt.figure()
    plot_confusion_matrix(cnf_matrix,
                          classes=class_names,
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.savefig(FIG_PATH_N)

    print('f1-score = {}'.format(f1_score(y_t, y_p, average=None)))
    print('precision = {}'.format(precision_score(y_t, y_p, average=None)))
    print('recall = {}'.format(recall_score(y_t, y_p, average=None)))
    print('macro f1 = {}'.format(f1_score(y_t, y_p, average='macro')))
def test_Booster_init_invalid_path(self):
    """An invalid model_file path should raise XGBoostError."""
    with pytest.raises(xgb.core.XGBoostError):
        xgb.Booster(model_file=Path("invalidpath"))
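# Hedged companion sketch in the same pytest style: a save/load round-trip
# check. The use of numpy, the tmp_path fixture, and the training call are
# assumptions, not part of the original test suite.
def test_save_load_roundtrip(self, tmp_path):
    rng = np.random.RandomState(0)
    dtrain = xgb.DMatrix(rng.rand(20, 3), label=rng.randint(2, size=20))
    bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=2)
    model_path = str(tmp_path / 'model.json')
    bst.save_model(model_path)
    restored = xgb.Booster(model_file=model_path)
    np.testing.assert_allclose(bst.predict(dtrain), restored.predict(dtrain))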
def load(self, model_fp):
    self.model = xgb.Booster(self.params)
    self.model.load_model(model_fp)
    'cost_amount_mean食堂', 'cost_amount_sum教务处', 'cost_reason_count淋浴',
    'cost_amount_sum1', 'cost_amount_mean教务处', 'cost_amount_sum图书馆', 'rank',
    'cost_amount_sum淋浴', 'cost_amount_mean卡充值', 'cost_amount_sum洗衣房',
    'cost_amount_mean淋浴', 'cost_amount_mean超市', 'libraryCount', 'borrow_count',
    'cost_amount_sum文印中心', 'cost_amount_mean图书馆', 'cost_amount_mean文印中心',
    'stu_id'
]]

clf = xgb.Booster({'nthread': 4})  # init model
clf.load_model(r'.\model\myxgb_5.m')  # load the saved model
Dtest = xgb.DMatrix(test_data1)
predict = clf.predict(Dtest)

# studentid, subsidy
print(sum(predict))
student['subsidy'] = predict
student['subsidy'] = student['subsidy'].replace({1: 1000, 2: 1500, 3: 2000}).astype('int64')

studentall = pd.read_csv(r'.\data\test\studentID_test.txt', header=None,
                         names=['stu_id'])  # test table
studentall = studentall.merge(student, on='stu_id', how='left').fillna(0).astype('int64')
studentall.rename(columns={'stu_id': 'studentid'}, inplace=True)
studentall[['studentid', 'subsidy']].to_csv(r'.\ans\anwser_1120_1.csv', index=False, header=True)
import gauss3d

# --------------------- CUSTOM FUNC -------------------------
def im2double(im):
    min_val = np.min(im.ravel())
    max_val = np.max(im.ravel())
    if max_val != min_val:
        out = (im.astype('float') - min_val) / (max_val - min_val)
    else:
        out = im.astype('float') / 255
    return out

## -------------------- LOAD MODEL ----------------------------
dst = xgb.Booster()
dst.load_model("./hog3d.model")

## -------------------- DATASET -------------------------------
data_path = "D:/Proj/UAV/dataset/drones/"
data_postfix = ".avi"
data_num = 1
cap = cv.VideoCapture(data_path + "Video_%s" % data_num + data_postfix)

# ---------------------- PARAMS -------------------------------
CUBE_T, CUBE_Y, CUBE_X = (4, 64, 64)  # define the size of each st-cube to be processed
HOG_SIZE = (int(np.ceil(CUBE_X / 4)), int(np.ceil(CUBE_T / 2)))
HOG_STEP = (int(np.ceil(CUBE_X / 4)), int(np.ceil(CUBE_T / 2)))
BCDIV = 3
GAU_SIGMA = (1, 3, 3)  # (t, y, x)
def load_model(model_path):
    model = xgb.Booster()
    model.load_model(model_path)
    return model
clause_dir = "{}/clauses".format(args.outdir)
proof_dir = "{}/proofs".format(args.outdir)
os.makedirs(value_train_dir, exist_ok=True)
os.makedirs(policy_train_dir, exist_ok=True)
os.makedirs(clause_dir, exist_ok=True)
os.makedirs(proof_dir, exist_ok=True)

if args.model_type == "xgboost" and args.guided > 0:
    assert args.guidance_dir is not None
    value_modelfile = "{}/value_xgb".format(args.guidance_dir)
    policy_modelfile = "{}/policy_xgb".format(args.guidance_dir)
    if args.guided == 1:  # using python to access xgboost
        value_model = xgb.Booster()
        value_model.load_model(value_modelfile)
        policy_model = xgb.Booster()
        policy_model.load_model(policy_modelfile)
elif args.model_type == "Simple Dense":
    assert args.guidance_dir is not None
    value_modelfile = "{}/value_xgb".format(args.guidance_dir)
    policy_modelfile = "{}/policy_xgb".format(args.guidance_dir)
    if args.guided == 1:  # using python to access xgboost
        value_model = tf.keras.models.load_model(value_modelfile)
        policy_model = tf.keras.models.load_model(policy_modelfile)

n_features = get_max_fea(args)

def conv_state(state):
def main(): with open('players.dictionary', 'rb') as f: players = pickle.load(f) with open('20players.dictionary', 'rb') as f: players_this_season = pickle.load(f) with open('20teams.dictionary', 'rb') as f: teams = pickle.load(f) players_this_postseason = {} matchups = {} results = {'champ':{}, \ 'finals':{}, \ 'conf':{}, \ 'semi':{}, \ 'playoffs':{}} model = xgb.Booster({'nthread': 4}) # init model model.load_model('basketball.model') lineups = {'LAL':['jamesle01', 'davisan02', 'greenda02', 'caldwke01', 'mcgeeja01', 'kuzmaky01', 'howardw01', 'carusal01', 'cookqu01', 'smithjr01', 'morrima02', 'waitedi01', 'dudleja01'],\ 'LAC':['leonaka01', 'georgpa01', 'morrima03', 'beverpa01', 'zubaciv01', 'shamela01', 'harremo01', 'willilo02', 'jacksre01', 'greenja01', 'mcgruro01', 'noahjo01', 'pattepa01'], \ 'MIL':['antetgi01', 'middlkh01', 'bledser01', 'lopezbr01', 'matthwe02', 'divindo01', 'ilyaser01', 'korveky01', 'lopezro01', 'hillge01', 'connapa01', 'brownst02'], \ 'PHI':['simmobe01', 'harrito02', 'embiijo01', 'richajo01', 'thybuma01', 'horfoal01', 'korkmfu01', 'robingl02', 'burksal01', 'scottmi01', 'netora01', 'miltosh01', 'oquinky01'], \ 'HOU':['hardeja01', 'westbru01', 'tuckepj01', 'covinro01', 'houseda01', 'gordoer01', 'riverau01', 'mclembe01', 'greenje02', 'sefolth01', 'carrode01', 'mbahalu01', 'nwabada01'], \ 'BOS':['walkeke02', 'haywago01', 'tatumja01', 'brownja02', 'theisda01', 'smartma01', 'kanteen01', 'willigr01', 'wanambr01', 'ojelese01', 'williro04'], \ 'TOR':['lowryky01', 'siakapa01', 'anunoog01', 'gasolma01', 'vanvlfr01', 'powelno01', 'ibakase01', 'holliro01', 'daviste02', 'mccawpa01', 'bouchch01', 'thomama02'], \ 'DEN':['jokicni01', 'murraja01', 'harriga01', 'millspa01', 'bartowi01', 'grantje01', 'craigto01', 'morrimo01', 'plumlma01', 'doziepj01', 'bateske01', 'vonleno01', 'portemi01'], \ 'DAL':['doncilu01', 'porzikr01', 'finnedo01', 'hardati02', 'curryse01', 'klebima01', 'wrighde01', 'jacksju01', 'kiddgmi01', 'caulewi01', 'burketr01', 'marjabo01', 'bareajo01'], \ 'OKC':['paulch01', 'gallida01', 'adamsst01', 'gilgesh01', 'dortlu01', 'schrode01', 'fergute01', 'noelne01', 'roberan03', 'bazleda01', 'diallha01', 'naderab01', 'muscami01'], \ 'UTA':['goberru01', 'mitchdo01', 'conlemi01', 'inglejo01', 'onealro01', 'clarkjo01', 'niangge01', 'davised01', 'bradlto01', 'morgaju01', 'brantja01', 'tuckera01'], \ 'MIA':['butleji01', 'adebaba01', 'nunnke01', 'robindu01', 'crowdja01', 'dragigo01', 'herroty01', 'leoname01', 'jonesde02', 'iguodan01', 'olynyke01', 'hillso01'], \ 'IND':['warretj01', 'brogdma01', 'turnemy01', 'oladivi01', 'holidaa01', 'sabondo01', 'holidju01', 'mcderdo01', 'sumneed01', 'mccontj01', 'bitadgo01', 'leaftj01', 'johnsal02'], \ 'POR':['lillada01', 'mccolcj01', 'nurkiju01', 'anthoca01', 'colliza01', 'hoodro01', 'whiteha01', 'trentga02', 'hezonma01', 'simonan01', 'adamsja01', 'littlna01'], \ 'ORL':['vucevni01', 'fournev01', 'gordoaa01', 'augusdj01', 'ennisja01', 'isaacjo01', 'fultzma01', 'rosste01', 'birchkh01', 'cartemi01', 'iwundwe01', 'clarkga01'], \ 'BRK':['allenja01', 'harrijo01', 'leverca01', 'thomala01', 'johnsty01', 'templga01', 'kurucro01', 'anderju01', 'luwawti01', 'musadz01', 'chiozch01', 'martije02'], \ 'MEM':['moranja01', 'jacksja02', 'brookdi01', 'valanjo01', 'anderky01', 'meltode01', 'clarkbr01', 'jacksjo02', 'dienggo01', 'gudurma01', 'tollian01', 'konchjo01', 'watanyu01'], \ 'NOP':['holidjr01', 'ingrabr01', 'willizi01', 'favorde01', 'redicjj01', 'balllo01', 'hartjo01', 'mellini01', 'mooreet01', 'willike04', 'hayesja02', 'okafoja01'], \ 'SAC':['barneha02', 
'foxde01', 'bjeline01','holmeri01', 'bogdabo01', 'hieldbu01', 'parkeja01', 'josepco01', 'bazemke01', 'ferreyo01', 'gilesha01', 'lenal01', 'breweco01'], \ 'WAS':['hachiru01', 'bryanth01', 'browntr01', 'napiesh01', 'bongais01', 'smithis01', 'mahinia01', 'wagnemo01', 'robinje01', 'pasecan01', 'paytoga02', 'grantje02', 'schofad01'], \ 'SAS':['derozde01', 'murrade01', 'whitede01', 'poeltja01', 'walkelo01', 'gayru01', 'forbebr01', 'millspa02', 'zellety01', 'belinma01', 'metuch01', 'samanlu01', 'eubandr01'], \ 'PHO':['bookede01', 'aytonde01', 'rubiori01', 'bridgmi01', 'johnsca02', 'saricda01', 'kaminfr01', 'baynear01', 'carteje01', 'payneca01', 'okoboel01', 'diallch01', 'jeromty01']} schedule = [{'team1':'UTA', 'team2':'NOP', 'date':datetime.date(2020, 7, 30)},\ {'team1':'LAC', 'team2':'LAL', 'date':datetime.date(2020, 7, 30)},\ {'team1':'ORL', 'team2':'BRK', 'date':datetime.date(2020, 7, 31)},\ {'team1':'PHO', 'team2':'WAS', 'date':datetime.date(2020, 7, 31)},\ {'team1':'MEM', 'team2':'POR', 'date':datetime.date(2020, 7, 31)},\ {'team1':'BOS', 'team2':'MIL', 'date':datetime.date(2020, 7, 31)},\ {'team1':'SAC', 'team2':'SAS', 'date':datetime.date(2020, 7, 31)},\ {'team1':'DAL', 'team2':'HOU', 'date':datetime.date(2020, 7, 31)},\ {'team1':'MIA', 'team2':'DEN', 'date':datetime.date(2020, 8, 1)},\ {'team1':'UTA', 'team2':'OKC', 'date':datetime.date(2020, 8, 1)},\ {'team1':'NOP', 'team2':'LAC', 'date':datetime.date(2020, 8, 1)},\ {'team1':'PHI', 'team2':'IND', 'date':datetime.date(2020, 8, 1)},\ {'team1':'LAL', 'team2':'TOR', 'date':datetime.date(2020, 8, 1)},\ {'team1':'WAS', 'team2':'BRK', 'date':datetime.date(2020, 8, 2)},\ {'team1':'POR', 'team2':'BOS', 'date':datetime.date(2020, 8, 2)},\ {'team1':'SAS', 'team2':'MEM', 'date':datetime.date(2020, 8, 2)},\ {'team1':'SAC', 'team2':'ORL', 'date':datetime.date(2020, 8, 2)},\ {'team1':'MIL', 'team2':'HOU', 'date':datetime.date(2020, 8, 2)},\ {'team1':'DAL', 'team2':'PHO', 'date':datetime.date(2020, 8, 2)},\ {'team1':'TOR', 'team2':'MIA', 'date':datetime.date(2020, 8, 3)},\ {'team1':'IND', 'team2':'WAS', 'date':datetime.date(2020, 8, 3)},\ {'team1':'DEN', 'team2':'OKC', 'date':datetime.date(2020, 8, 3)},\ {'team1':'MEM', 'team2':'NOP', 'date':datetime.date(2020, 8, 3)},\ {'team1':'SAS', 'team2':'PHI', 'date':datetime.date(2020, 8, 3)},\ {'team1':'LAL', 'team2':'UTA', 'date':datetime.date(2020, 8, 3)},\ {'team1':'BRK', 'team2':'MIL', 'date':datetime.date(2020, 8, 4)},\ {'team1':'DAL', 'team2':'SAC', 'date':datetime.date(2020, 8, 4)},\ {'team1':'PHO', 'team2':'LAC', 'date':datetime.date(2020, 8, 4)},\ {'team1':'ORL', 'team2':'IND', 'date':datetime.date(2020, 8, 4)},\ {'team1':'BOS', 'team2':'MIA', 'date':datetime.date(2020, 8, 4)},\ {'team1':'HOU', 'team2':'POR', 'date':datetime.date(2020, 8, 4)},\ {'team1':'MEM', 'team2':'UTA', 'date':datetime.date(2020, 8, 5)},\ {'team1':'PHI', 'team2':'WAS', 'date':datetime.date(2020, 8, 5)},\ {'team1':'DEN', 'team2':'SAS', 'date':datetime.date(2020, 8, 5)},\ {'team1':'OKC', 'team2':'LAL', 'date':datetime.date(2020, 8, 5)},\ {'team1':'TOR', 'team2':'ORL', 'date':datetime.date(2020, 8, 5)},\ {'team1':'BRK', 'team2':'BOS', 'date':datetime.date(2020, 8, 5)},\ {'team1':'NOP', 'team2':'SAC', 'date':datetime.date(2020, 8, 6)},\ {'team1':'MIA', 'team2':'MIL', 'date':datetime.date(2020, 8, 6)},\ {'team1':'IND', 'team2':'PHO', 'date':datetime.date(2020, 8, 6)},\ {'team1':'LAC', 'team2':'DAL', 'date':datetime.date(2020, 8, 6)},\ {'team1':'POR', 'team2':'DEN', 'date':datetime.date(2020, 8, 6)},\ {'team1':'LAL', 'team2':'HOU', 
'date':datetime.date(2020, 8, 6)},\ {'team1':'UTA', 'team2':'SAS', 'date':datetime.date(2020, 8, 7)},\ {'team1':'OKC', 'team2':'MEM', 'date':datetime.date(2020, 8, 7)},\ {'team1':'SAC', 'team2':'BRK', 'date':datetime.date(2020, 8, 7)},\ {'team1':'ORL', 'team2':'PHI', 'date':datetime.date(2020, 8, 7)},\ {'team1':'WAS', 'team2':'NOP', 'date':datetime.date(2020, 8, 7)},\ {'team1':'BOS', 'team2':'TOR', 'date':datetime.date(2020, 8, 7)},\ {'team1':'LAC', 'team2':'POR', 'date':datetime.date(2020, 8, 8)},\ {'team1':'UTA', 'team2':'DEN', 'date':datetime.date(2020, 8, 8)},\ {'team1':'LAL', 'team2':'IND', 'date':datetime.date(2020, 8, 8)},\ {'team1':'PHO', 'team2':'MIA', 'date':datetime.date(2020, 8, 8)},\ {'team1':'MIL', 'team2':'DAL', 'date':datetime.date(2020, 8, 8)},\ {'team1':'WAS', 'team2':'OKC', 'date':datetime.date(2020, 8, 9)},\ {'team1':'MEM', 'team2':'TOR', 'date':datetime.date(2020, 8, 9)},\ {'team1':'SAS', 'team2':'NOP', 'date':datetime.date(2020, 8, 9)},\ {'team1':'ORL', 'team2':'BOS', 'date':datetime.date(2020, 8, 9)},\ {'team1':'PHI', 'team2':'POR', 'date':datetime.date(2020, 8, 9)},\ {'team1':'HOU', 'team2':'SAC', 'date':datetime.date(2020, 8, 9)},\ {'team1':'BRK', 'team2':'LAC', 'date':datetime.date(2020, 8, 9)},\ {'team1':'OKC', 'team2':'PHO', 'date':datetime.date(2020, 8, 10)},\ {'team1':'DAL', 'team2':'UTA', 'date':datetime.date(2020, 8, 10)},\ {'team1':'TOR', 'team2':'MIL', 'date':datetime.date(2020, 8, 10)},\ {'team1':'IND', 'team2':'MIA', 'date':datetime.date(2020, 8, 10)},\ {'team1':'DEN', 'team2':'LAL', 'date':datetime.date(2020, 8, 10)},\ {'team1':'BRK', 'team2':'ORL', 'date':datetime.date(2020, 8, 11)},\ {'team1':'HOU', 'team2':'SAS', 'date':datetime.date(2020, 8, 11)},\ {'team1':'PHO', 'team2':'PHI', 'date':datetime.date(2020, 8, 11)},\ {'team1':'POR', 'team2':'DAL', 'date':datetime.date(2020, 8, 11)},\ {'team1':'BOS', 'team2':'MEM', 'date':datetime.date(2020, 8, 11)},\ {'team1':'MIL', 'team2':'WAS', 'date':datetime.date(2020, 8, 11)},\ {'team1':'NOP', 'team2':'SAC', 'date':datetime.date(2020, 8, 11)},\ {'team1':'IND', 'team2':'HOU', 'date':datetime.date(2020, 8, 12)},\ {'team1':'TOR', 'team2':'PHI', 'date':datetime.date(2020, 8, 12)},\ {'team1':'MIA', 'team2':'OKC', 'date':datetime.date(2020, 8, 12)},\ {'team1':'LAC', 'team2':'DEN', 'date':datetime.date(2020, 8, 12)},\ {'team1':'SAS', 'team2':'UTA', 'date':datetime.date(2020, 8, 13)},\ {'team1':'SAC', 'team2':'LAL', 'date':datetime.date(2020, 8, 13)},\ {'team1':'MIL', 'team2':'MEM', 'date':datetime.date(2020, 8, 13)},\ {'team1':'WAS', 'team2':'BOS', 'date':datetime.date(2020, 8, 13)},\ {'team1':'POR', 'team2':'BRK', 'date':datetime.date(2020, 8, 13)},\ {'team1':'NOP', 'team2':'ORL', 'date':datetime.date(2020, 8, 13)},\ {'team1':'DAL', 'team2':'PHO', 'date':datetime.date(2020, 8, 13)},\ {'team1':'PHI', 'team2':'HOU', 'date':datetime.date(2020, 8, 14)},\ {'team1':'DEN', 'team2':'TOR', 'date':datetime.date(2020, 8, 14)},\ {'team1':'OKC', 'team2':'LAC', 'date':datetime.date(2020, 8, 14)},\ {'team1':'MIA', 'team2':'IND', 'date':datetime.date(2020, 8, 14)}] for i in range(1, 100001): simulation = simulate_season.Simulation(teams, players, players_this_season, players_this_postseason, schedule, model, lineups, matchups, results) simulation.simulate_reg_season() simulation.simulate_playoffs(datetime.date(2020, 8, 17)) if i % 50 == 0: print(i) for matchup in matchups.keys(): print(matchup[0] + ',' + matchup[1] + ',' + str(matchups[matchup])) for team in teams.keys(): print(team + ',' + str(get_result(results, 'playoffs', team)) 
+ ',' + str(get_result(results, 'semi', team)) + ',' + str(get_result(results, 'conf', team)) + ',' + str(get_result(results, 'finals', team)) + ',' + str(get_result(results, 'champ', team))) # print(matchups) # print(simulation.results) for matchup in matchups.keys(): print(str(matchup) + ',' + str(matchups[matchup])) for team in teams.keys(): print(team + ',' + str(get_result(results, 'playoffs', team)) + ',' + str(get_result(results, 'semi', team)) + ',' + str(get_result(results, 'conf', team)) + ',' + str(get_result(results, 'finals', team)) + ',' + str(get_result(results, 'champ', team)))
import gc
import sys

import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split

# num = sys.argv[1]
num = 1
path = '../../output/stack-data/'
out_path = '../../output/results/xgb/'

bst = xgb.Booster(model_file='xgb{0}.model'.format(num))  # load model

# valid = pd.read_csv(path + 'valid{0}.csv'.format(num))
# label_valid = np.array(valid['label'])
# valid.drop(['label'], axis=1, inplace=True)
#
# xgb_val = xgb.DMatrix(valid)
#
# del valid
# gc.collect()
#
# val_pred = bst.predict(xgb_val)

# output = open(out_path + 'subval{0}.csv'.format(num), 'w')
# output.write('label,xgb_prob\n')
# for t, p in enumerate(val_pred, start=1):