def main():
    """Train an XGBoost regressor on the house-price CSVs and print its errors.

    Reads ``train.csv`` and ``test.csv`` from the working directory, stacks
    them so both go through the same preprocessing, tunes an ``XGBRegressor``
    through the project's ``datastrain`` helper, prints cross-validation
    statistics, saves the model to ``001.model`` and prints the absolute
    training errors.
    """
    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')
    # The target is taken to be the last column of the training file.
    train_y = train_data.iloc[:, -1]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent way to stack the two frames with a fresh index.
    datas = pd.concat([train_data, test_data], ignore_index=True)
    datas.drop(['SalePrice'], axis=1, inplace=True)
    datas.drop(['Id'], axis=1, inplace=True)
    # train_data.info()
    # train_data.describe()
    # Inspect the missing values in the data
    # print(datas.isnull().sum().sort_values(ascending=True))
    # print(train_data['MSZoning'].mode())

    datas = data_value_deal(datas)
    # NOTE(review): datapca presumably splits the processed frame back into
    # train/test feature matrices -- confirm against its definition.
    train_data, test_data = datapca(train_data, datas)
    # print(train_data[:5],test_data[:5])

    model = XGBRegressor()
    grid = datastrain(model).gradient_get(train_data, train_y, {
        'max_depth': [8],
        'learning_rate': [0.01],
        'n_estimators': [10000]
    })
    model = grid.best_estimator_

    result = rmse_cv(model, train_data, train_y)
    cv_mean = result.mean()
    cv_std = result.std()
    print('cv_mean:', cv_mean, 'cv_std:', cv_std)

    prey = model.predict(train_data)
    model.save_model('001.model')
    # sqrt(x ** 2) is the element-wise absolute error on the training set.
    acc = np.sqrt(np.power(prey - train_y, 2))
    print(acc[:5], acc.sum())
def train_model(train_set_path, model_out_file):
    """
    Train the wine predictor, with parameters discovered in hyper-parameter
    tuning phase. The model is then saved for future use.

    Args:
        train_set_path: path to the training CSV; must contain a ``points``
            column, which is used as the regression target.
        model_out_file: path the fitted model is written to. Its parent
            directory is created when it does not exist.
    """
    assert '.csv' in train_set_path, f'Received {train_set_path}! ' \
                                     f'Please provide a .csv file'
    hp = {
        'colsample_bytree': 0.3,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'max_depth': 12,
        'min_child_weight': 7
    }

    train_set = pd.read_csv(train_set_path)
    train_y = train_set[['points']]
    train_x = train_set.drop(columns=['points'])

    logger.info(f'XGBoost Regression with parameters: {hp}')
    model = XGBRegressor(random_state=42, **hp)

    logger.info('Training model...')
    started = time()
    model.fit(train_x, train_y)
    logger.info(f'Model trained in {time() - started} seconds')

    # A bare file name has an empty dirname, and os.makedirs('') raises;
    # only create the directory when the path actually has one.
    out_dir = os.path.dirname(model_out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    model.save_model(Path(model_out_file))
    logger.info(f'Models saved to {model_out_file}')
def train(name="features.pkl"):
    """Train the monthly-sales regressor and write a submission CSV.

    Loads the pickled feature frame, keeps the columns listed in
    ``settings.FEATURES`` that are present, and splits it by
    ``date_block_num``: blocks below 33 train, block 33 validates and
    block 34 is the test month. The model is saved to ``model.pkl`` and the
    clipped test predictions to ``xgb_submission.csv``.

    Args:
        name: path of the pickled feature DataFrame.
    """
    data = pd.read_pickle(name)
    # Select columns in the order given by settings.FEATURES. Iterating a set
    # of strings (as before) yields an order that changes with the hash seed,
    # which made the feature order differ between runs.
    available = set(data.columns)
    data = data[[
        col for col in dict.fromkeys(settings.FEATURES) if col in available
    ]]

    train_rows = data[data.date_block_num < 33]
    valid_rows = data[data.date_block_num == 33]
    X_train = train_rows.drop(['item_cnt_month'], axis=1)
    Y_train = train_rows['item_cnt_month']
    X_valid = valid_rows.drop(['item_cnt_month'], axis=1)
    Y_valid = valid_rows['item_cnt_month']
    X_test = data[data.date_block_num == 34].drop(['item_cnt_month'], axis=1)
    # Drop the references to the full frame before training.
    del data, train_rows, valid_rows
    gc.collect()

    ts = time.time()
    model = XGBRegressor(**settings.REGRESSOR_PARAMS)
    model.fit(X_train,
              Y_train,
              eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
              **settings.FIT_PARAMS)
    model.save_model("model.pkl")
    print(f"Training the model in {time.time() - ts}s")

    # Predictions are clipped to the [0, 20] range.
    Y_test = model.predict(X_test).clip(0, 20)

    test = pd.read_csv(os.path.join(settings.DATA_PATH, 'sales_test.csv'))
    test = remove_duplicates(test)
    submission = pd.DataFrame({'ID': test.index, 'item_cnt_month': Y_test})
    submission.to_csv('xgb_submission.csv', index=False)
def train_trainable(data):
    """Fit an XGBoost regressor on the lag features and save it.

    The target is the ``item_cnt_month`` column of ``data``; the fitted model
    is written to ``./models/xgbmodelprime``.
    """
    feature_columns = [
        "date_block_num", "shop_id", "item_id", "id_struct", "item_category",
        "Price_agg", "keyz", "item_cnt_month_lag1", "item_cnt_month_lag2",
        "item_cnt_month_lag3", "item_cnt_month_lag4", "item_cnt_month_lag5",
        "item_cnt_month_lag6", "item_cnt_month_lag7", "Price_agg_lag1",
        "Price_agg_lag2"
    ]
    features = data[feature_columns]
    target = data[["item_cnt_month"]]

    model = XGBRegressor(colsample_bytree=0.8,
                         subsample=0.75,
                         eta=0.02,
                         n_estimators=1100,
                         max_depth=7,
                         min_child_weight=1)
    # The evaluation set is the training data itself.
    model.fit(features,
              target,
              eval_metric="rmse",
              eval_set=[(features, target)],
              verbose=False,
              early_stopping_rounds=1)
    model.save_model("./models/xgbmodelprime")
def predict(course_code, user_id):
    """Predict final-exam scores for a course and for one user.

    The per-course scaler and model are cached in ``data_transformer`` and
    ``model_cache``. The model is loaded from disk when its file exists,
    otherwise it is trained on the scaled course data and saved.

    Args:
        course_code: course identifier used to locate data and model files.
        user_id: label of the user's row in the course feature table.

    Returns:
        dict with the 10-bin histogram of predicted scores over [0, 1], the
        user's chapter grades, and the user's predicted final-exam score.
    """
    filename = get_path(course_code, '%s_model.xgb' % course_code)
    X, y = load_data(course_code)
    # Keep the user's row two-dimensional so it can go through the scaler.
    # assumes X is a DataFrame indexed by user id -- TODO confirm
    user_X = X.loc[[user_id]]

    # Normalization
    if course_code not in data_transformer:
        scaler = MinMaxScaler()
        scaler.fit(X)
        data_transformer[course_code] = scaler
    scaler = data_transformer[course_code]

    # Scale exactly once; previously the training path scaled X a second
    # time before predicting.
    X_scaled = scaler.transform(X)

    if course_code not in model_cache:
        model = XGBRegressor()
        if os.path.isfile(filename):
            model.load_model(filename)
        else:
            model.fit(X_scaled, y)
            model.save_model(filename)
        model_cache[course_code] = model
    model = model_cache[course_code]

    y_ = model.predict(X_scaled)
    hist, _ = np.histogram(y_, bins=10, range=[0, 1])

    # The model is fitted on scaled features, so the user's row must be
    # scaled the same way before predicting.
    user_score = model.predict(scaler.transform(user_X))[0]
    return {
        "classFinalExamDistribution": hist.tolist(),
        "myChapterScore": get_user_chapter_grades(course_code, user_id),
        "myPredictedFinalExamScore": float(user_score)
    }
def generate_models(self):
    """Train one XGBoost regressor per label and save each as JSON.

    For every entry of ``labels`` the training set is split with
    ``self.prepare_data`` and the fitted model is written to
    ``resources/<label>.json``.
    """
    training_set = self.load_training_set()
    for label in labels:
        train, test = self.prepare_data(training_set, label)
        model = XGBRegressor(n_estimators=1000,
                             max_depth=7,
                             eta=0.1,
                             subsample=0.7,
                             colsample_bytree=0.8,
                             early_stopping_rounds=20)
        # The held-out split is the evaluation set for early stopping.
        model.fit(train.X, train.y, eval_set=[(test.X, test.y)])
        model.save_model(f"resources/{label}.json")
def main():
    """Train or load the example model, score it per era and write a submission."""
    print("Loading data...")
    # Training data is what the model is fitted on.
    training_data = read_csv("numerai_training_data.csv")
    # Tournament data is what Numerai evaluates the model on.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        column for column in training_data.columns
        if column.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if not MODEL_FILE.is_file():
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)
    else:
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)

    # Predict on both the training and the tournament data.
    print("Generating predictions...")
    train_predictions = model.predict(training_data[feature_names])
    training_data[PREDICTION_NAME] = train_predictions
    tournament_predictions = model.predict(tournament_data[feature_names])
    tournament_data[PREDICTION_NAME] = tournament_predictions

    # In-sample check: per-era correlations on the training set.
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )

    # Out-of-sample check: per-era correlations on the validation rows.
    is_validation = tournament_data.data_type == "validation"
    validation_data = tournament_data[is_validation]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std()}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Save predictions as a CSV and upload to https://numer.ai
    submission_path = TOURNAMENT_NAME + "_submission.csv"
    tournament_data[PREDICTION_NAME].to_csv(submission_path)
def xgb_model(n_estimators=[],
              learning_rate=[],
              validation_data=(),
              training_data=(),
              testing_data=(),
              directory='',
              filename=''):
    '''
    Grid-train XGB regressors and record their test error.

    Takes a list of estimators and learning rates along with
    train/valid/test data, each given as an ``(X, y)`` pair. For every
    combination an XGB regressor is fitted with early stopping on the
    validation data, saved in .model format under
    ``../src/models/xgb_weights/`` and scored on the test data.

    The performances are written to
    ``../src/models/<directory>/<filename>.csv`` and returned as a DataFrame
    indexed by MSE with ``n_estimator`` and ``learning_rate`` columns.
    Results are keyed by their MSE, so two runs with exactly the same error
    keep only the later one.
    '''
    # Make sure the weights directory exists before the first save.
    os.makedirs('../src/models/xgb_weights', exist_ok=True)

    mse = {}
    for estimator in n_estimators:
        for rate in learning_rate:
            # Initiating the model
            model = XGBRegressor(n_estimators=estimator, learning_rate=rate)
            # Training the model
            model.fit(training_data[0],
                      training_data[1],
                      early_stopping_rounds=50,
                      eval_set=[(validation_data[0], validation_data[1])],
                      verbose=False)
            # Evaluating the model
            prediction = model.predict(testing_data[0])
            # Saving the model
            model.save_model(
                '../src/models/xgb_weights/n_estimator{}_learning_rate{}.model'
                .format(estimator, rate))
            # Calculating the error
            error = mean_squared_error(prediction, testing_data[1])
            mse[error] = [estimator, rate]

    # Converting the dict to a DataFrame
    xgb_performance = pd.DataFrame(data=mse)
    xgb_performance = xgb_performance.transpose()
    xgb_performance.columns = ['n_estimator', 'learning_rate']
    xgb_performance.index.name = 'mse'

    # Saving the performances in a CSV file. The directory that is created
    # must be the one the CSV is written to; previously ``directory`` was
    # checked and created relative to the working directory instead.
    os.makedirs(os.path.join('../src/models', directory), exist_ok=True)
    xgb_performance.to_csv('../src/models/{}/{}.csv'.format(
        directory, filename))
    return xgb_performance
class GDPGrowthPredictor:
    """XGBoost-based GDP growth model with train/test/predict/save/load helpers."""

    def __init__(self, *args, **kwargs):
        """Build the underlying XGBRegressor, forwarding all arguments to it."""
        self.model = XGBRegressor(*args, **kwargs)

    def train(self, filename, split, previous_year, plot, *args, **kwargs):
        """Fit the model, save it to ``filename`` and evaluate when ``split`` is non-zero.

        Extra positional and keyword arguments are forwarded to ``fit``.
        """
        dataset = _io.retrieve_training_dataset(split, previous_year)
        X_train, X_test, y_train, y_test, features = dataset
        self.model.fit(X_train, y_train, *args, **kwargs)
        self.save(filename)
        if split == 0:
            # Nothing to evaluate when split is zero.
            return
        self.test(X_test, y_test, features, split, plot)

    def test(self, X_test, y_test, features, split, plot):
        """Log RMSE and R^2 on the test data and optionally plot the results."""
        model_y_pred = self.model.predict(X_test)

        # Table of real/predicted values and errors, without the feature
        # columns. It is built but not returned or logged here.
        results_df = X_test.drop(columns=features)
        results_df["y_real"] = y_test
        results_df["y_pred"] = model_y_pred
        absolute_error = np.absolute(results_df["y_real"] -
                                     results_df["y_pred"])
        results_df["err"] = absolute_error
        results_df["%_err"] = (results_df["err"] /
                               np.absolute(results_df["y_real"]) * 100)

        logging.info("Test results with %s split:", split)
        rmse = mean_squared_error(y_test, model_y_pred)**0.5
        logging.info("\t RMSE: %.3f", rmse)
        logging.info("\t R^2: %.3f", r2_score(y_test, model_y_pred))

        if not plot:
            return
        logging.info("Generating plots")
        plots.plot_performance_results(y_test, model_y_pred)
        plots.plot_shap_results(X_test, features, self.model)

    def predict(self, filename, previous_year, year, *args, **kwargs):
        """Load the model from ``filename`` and return predictions as a pandas df.

        The predicted values are stored in the ``Value`` column. Extra
        arguments are forwarded to the model's ``predict``.
        """
        self.load(filename)
        predictions, X_predict = _io.retrieve_predict_dataset(
            previous_year, year)
        predicted_values = self.model.predict(X_predict, *args, **kwargs)
        predictions["Value"] = predicted_values
        return predictions

    def save(self, filename):
        """Write the model to ``filename``."""
        self.model.save_model(filename)
        logging.info("Model saved")

    def load(self, filename):
        """Read the model from ``filename``."""
        self.model.load_model(filename)
        logging.info("Model loaded")
def train_xgbr(X,
               y,
               param,
               param1,
               param2,
               model_path='./model',
               test_size=0.2,
               estimator=XGBRegressor,
               score=mean_absolute_error):
    '''
    Train the model with the best parameters found by the search.

    Inputs:
        X, y, param, param1, param2: see the parameter search function
            search_best_param(). The 'name' and 'date' columns are dropped
            from X (the date could be feature-engineered further, but the
            current version does not do so).
        test_size: fraction of the data held out for the parameter search
        model_path: directory the model is saved to
        estimator: model class used for the parameter search
        score: scoring metric used for the parameter search

    *Note: the final model is optimized with a Huber loss, which is less
    sensitive to outliers than the common squared loss and so more robust.

    Output:
        Saves the trained xgb model to <model_path>/xgb_model.json and
        returns it. A failure while saving is reported but not raised.
    '''
    X = X.drop(['name', 'date'], axis=1)

    # CV dataset split without shuffling (not sure whether shuffling is
    # better). test_size is now honoured instead of a hard-coded 0.2.
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=test_size,
                                                      shuffle=False,
                                                      random_state=None)

    # Find the best parameters, using the estimator and metric the caller
    # asked for (previously these two arguments were ignored).
    _, best_param = search_best_param(X_train,
                                      y_train,
                                      X_val,
                                      y_val,
                                      param,
                                      param1,
                                      param2,
                                      estimator=estimator,
                                      score=score)

    # Initialize with the best parameters and fit on the full X, y.
    best_xgbr = XGBRegressor(objective=huber_approx_obj, **best_param)
    best_xgbr.fit(X, y)

    # Output the trained model; saving stays best-effort.
    os.makedirs(model_path, exist_ok=True)
    try:
        best_xgbr.save_model(os.path.join(model_path, 'xgb_model.json'))
    except Exception as exc:
        print("error saving the model")
        print(exc)
    return best_xgbr
class XGBModel(GenericModel):
    """GenericModel wrapper around an XGBoost classifier or regressor."""

    def __init__(self, name, version=1, classifier=True, xgb_kwargs=None):
        """Create the wrapped model.

        Args:
            name: model name, passed to GenericModel.
            version: model version, passed to GenericModel.
            classifier: build an XGBClassifier when true, else an
                XGBRegressor.
            xgb_kwargs: keyword arguments for the XGBoost constructor;
                ``None`` means no extra arguments.
        """
        super().__init__(name, version)
        # ``**None`` raises TypeError, so normalise the default to a dict.
        self.xgb_kwargs = {} if xgb_kwargs is None else xgb_kwargs
        if classifier:
            self.model = XGBClassifier(**self.xgb_kwargs)
        else:
            self.model = XGBRegressor(**self.xgb_kwargs)

    def train(self):
        print(
            'No custom train method implemented. Instead call self.model.fit(...)'
        )

    def save_model(self,
                   notes=None,
                   update_version=False,
                   config=None,
                   save_attributes=True):
        """Save the model as ``v<version>.json`` under ``self.model_dir``.

        Optionally bumps the version first and saves attributes, notes and
        config through the base-class helpers. Errors while saving the model
        are printed and re-raised.
        """
        if update_version:
            self.version += 1
        try:
            model_path = self.model_dir / Path(f'v{self.version}.json')
            self.model.save_model(model_path.as_posix())
        except Exception as e:
            print('Error saving model')
            print(e)
            raise
        if save_attributes:
            self._save_attributes()
        if notes is not None:
            self._save_notes(notes)
        if config is not None:
            self._save_config(config)

    def load_model(self, version, load_attributes=True):
        """Rebuild the wrapped model and load ``v<self.version>.json``.

        NOTE(review): ``version`` and ``load_attributes`` are not used; the
        file is chosen from ``self.version`` as it stands after
        ``_load_attributes`` -- confirm the intended behaviour with callers.
        """
        # First load the xgb_kwargs so that we can create a new instance of XGB
        self._load_attributes(self.attr_dir)
        if hasattr(self, 'xgb_kwargs'):
            # Re-create a model of the same class. The old code called the
            # model *instance*, which is not callable.
            self.model = type(self.model)(**(self.xgb_kwargs or {}))
        # Next load the model
        model_path = self.model_dir / Path(f'v{self.version}.json')
        assert model_path.exists(
        ), f'No model exists at {model_path.as_posix()}'
        self.model.load_model(model_path.as_posix())
class XGBConfidenceIntervalBootstrap:
    """Bootstrap ensemble of XGBoost regressors giving a 5%-95% interval."""

    def __init__(self,
                 n_regressors=100,
                 n_common_trees=0,
                 sample_rate=1.0,
                 **xgb_args):
        """Store the ensemble settings.

        Args:
            n_regressors: number of bootstrap regressors to fit.
            n_common_trees: when non-zero, a base regressor with this many
                trees is fitted first and every bootstrap regressor continues
                training from it.
            sample_rate: bootstrap sample size as a fraction of ``len(X)``.
            **xgb_args: keyword arguments for each bootstrap XGBRegressor.
        """
        self.n_regressors = n_regressors
        self.n_common_trees = n_common_trees
        self.sample_rate = sample_rate
        self.xgb_args = xgb_args
        self.base_regressor = None
        self.regressors = []

    def fit(self, X, y):
        """Fit the ensemble on samples of (X, y) drawn with replacement.

        Each call starts from an empty ensemble, so calling ``fit`` again
        replaces the earlier regressors instead of adding to them.
        """
        # assumes X and y support integer-array indexing (e.g. numpy arrays)
        # -- TODO confirm
        # gpu_args = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'gpu_id': 0, 'n_jobs': 16}
        self.regressors = []

        if self.n_common_trees:
            base_regressor_args = {
                'objective': 'reg:squarederror',
                'n_estimators': self.n_common_trees
            }
            self.base_regressor = XGBRegressor(**base_regressor_args)
            self.base_regressor.fit(X, y, verbose=False)
            self.base_regressor.save_model('base.model')

        # Both values are the same for every bootstrap round.
        n_samples = int(len(X) * self.sample_rate)
        train_args = {}
        if self.n_common_trees:
            # Continue training from the shared base model saved above.
            train_args['xgb_model'] = 'base.model'

        for _ in tqdm(range(self.n_regressors)):
            regressor = XGBRegressor(**self.xgb_args)
            sample_indexes = np.random.choice(len(X),
                                              n_samples,
                                              replace=True)
            regressor.fit(X[sample_indexes],
                          y[sample_indexes],
                          verbose=False,
                          **train_args)
            self.regressors.append(regressor)

    def predict(self, X):
        """Return ``(mean, lower, upper)`` across the ensemble.

        ``lower`` and ``upper`` are the 0.05 and 0.95 quantiles of the
        individual regressors' predictions for each sample.
        """
        result = np.array([r.predict(X) for r in self.regressors])
        mean = result.mean(axis=0)
        lower = np.quantile(result, 0.05, axis=0)
        upper = np.quantile(result, 0.95, axis=0)
        return mean, lower, upper
def main():
    """Load or train the model for one fixed course and print its predictions."""
    course = 'VJx__VJx_2__3T2016'
    filename = 'model.xgb'

    X, y = load_data(course)

    # Normalization: fit the scaler on X and scale X with it.
    X = MinMaxScaler().fit_transform(X)

    model = XGBRegressor()
    if not os.path.isfile(filename):
        # No saved model yet: train on the scaled data and save it.
        model.fit(X, y)
        model.save_model(filename)
    else:
        model.load_model(filename)

    y_ = model.predict(X)
    print(y_)
def main():
    """Train the example model on the round-233 data and save it.

    Reads the training and tournament CSVs from the hard-coded contest
    directory, fits an XGBRegressor on the ``feature*`` columns and writes
    the model to ``example_model.xgb`` in that directory.
    """
    print(MODEL_FILE)
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    #training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    #tournament_data = read_csv("numerai_tournament_data.csv")
    contest = str(233)
    directory = 'F:\\Numerai\\numerai' + contest + '\\'

    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = pd.read_csv(directory +
                                "numerai_training_data.csv").set_index("id")
    # The tournament data is the data that Numerai uses to evaluate your model.
    # It is loaded here but not used further in this function.
    tournament_data = pd.read_csv(
        directory + "numerai_tournament_data.csv").set_index("id")
    #MODEL_FILE = directory + "example_model.xgb"

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    print("Training model...")
    model.fit(training_data[feature_names], training_data[TARGET_NAME])
    # The message lacked its f-prefix and printed the placeholder verbatim.
    print(f"Training model... {MODEL_FILE}")
    # NOTE(review): the model is saved to this fixed path, not to MODEL_FILE.
    model.save_model("F:\\Numerai\\numerai233\\example_model.xgb")
class XGBModel(Model):
    """Model implementation backed by a GPU-trained XGBRegressor."""

    def Build(self):
        """Create a fresh, unfitted regressor with the fixed hyper-parameters."""
        self.model = XGBRegressor(
            max_depth=10,
            n_estimators=1000,
            objective='reg:squarederror',
            seed=config.random_state,
            nthread=12,
            tree_method='gpu_hist',
        )

    def Load(self, fileName):
        """Rebuild the regressor and load its state from ``fileName + '.xgb'``."""
        self.Build()
        self.model.load_model(fileName + '.xgb')

    def Save(self, fileName):
        """Write the regressor to ``fileName + '.xgb'``."""
        self.model.save_model(fileName + '.xgb')

    def Fit(self, X_trn, y_trn, X_tst, y_tst, plot=False):
        """Fit with RMSE early stopping, optionally plotting both loss curves."""
        # Evaluation order matters: validation_0 is train, validation_1 is test.
        evaluation_sets = [(X_trn, y_trn), (X_tst, y_tst)]
        self.model.fit(X_trn,
                       y_trn,
                       eval_metric='rmse',
                       eval_set=evaluation_sets,
                       verbose=True,
                       early_stopping_rounds=50)
        if not plot:
            return
        results = self.model.evals_result()
        plot_loss(results['validation_0']['rmse'],
                  results['validation_1']['rmse'])

    def Predict(self, X):
        """Return predictions for ``X`` as a column vector."""
        predictions = self.model.predict(X)
        return predictions.reshape(-1, 1)
class HousePricePredictor(BaseModel):
    """House price model: an XGBRegressor tuned with a small grid search."""

    def __init__(self):
        self.model = XGBRegressor()

    def predict(self, X):
        """Predict for ``X`` after shaping it into a FEATURES-column frame."""
        X = self._prepare_data(X)
        return self.model.predict(X)

    def _prepare_data(self, X):
        """Wrap ``X`` in a DataFrame whose columns are ``FEATURES``."""
        return pd.DataFrame(X, columns=FEATURES)

    def fit(self, X, y):
        """Grid-search ``n_estimators`` and keep the best estimator.

        Runs a 3-fold GridSearchCV on 4 jobs, logs the best score and
        parameters, stores the best estimator as ``self.model`` and returns
        it.
        """
        model = XGBRegressor()
        clf = GridSearchCV(
            model,
            {
                'max_depth': [6, ],
                'learning_rate': [0.05, ],
                'n_estimators': [450, 470, 475, 480, 485, ]
            },
            n_jobs=4,
            cv=3,
            verbose=1
        )
        clf.fit(X, y)
        logging.info("Best Score: {}".format(clf.best_score_))
        logging.info("Best Params: {}".format(clf.best_params_))
        self.model = clf.best_estimator_
        return self.model

    def dump(self, path):
        """Save the model to ``path``."""
        self.model.save_model(path)

    @classmethod
    def load(cls, path):
        """Return a new predictor whose model is loaded from ``path``."""
        # Use ``cls`` so subclasses get an instance of their own type.
        house_model = cls()
        house_model.model.load_model(path)
        return house_model
def bulid_models(x_train, y_train, x_test, y_test, best_grida, best_gridb):
    """Retrain models A and B and overwrite the stored ones if not worse.

    Column 0 of ``y_train``/``y_test`` is target A and column 1 is target B.
    For each target the stored model is loaded and scored by mean absolute
    error on the test data, a new model is trained with the given grid
    parameters (early stopping on the test data), and the new model replaces
    the stored file when its error is less than or equal to the baseline.

    Args:
        x_train, y_train: training features and two-column targets.
        x_test, y_test: evaluation features and two-column targets.
        best_grida, best_gridb: XGBRegressor keyword arguments for A and B.
    """
    root_folder = lib.features.STORAGE
    file_patha = os.path.join(root_folder, "modela.xgb")
    file_pathb = os.path.join(root_folder, "modelb.xgb")

    # Baseline scores of the models currently on disk.
    modela = XGBRegressor()
    modela.load_model(file_patha)
    y_preda = modela.predict(x_test)
    base_scorea = mean_absolute_error(y_test[:, 0], y_preda)

    modelb = XGBRegressor()
    modelb.load_model(file_pathb)
    # Baseline B must come from model B; it used to be predicted with
    # model A, so the comparison for B was made against the wrong model.
    y_predb = modelb.predict(x_test)
    base_scoreb = mean_absolute_error(y_test[:, 1], y_predb)

    # Candidate A
    modela = XGBRegressor(**best_grida)
    modela = modela.fit(x_train,
                        y_train[:, 0],
                        eval_set=[(x_test, y_test[:, 0])],
                        early_stopping_rounds=100,
                        verbose=False)
    y_preda = modela.predict(x_test)
    scorea = mean_absolute_error(y_test[:, 0], y_preda)
    print("score A : {} vs {}".format(scorea, base_scorea))
    if scorea <= base_scorea:
        modela.save_model(file_patha)
        print("model A saved !")

    # Candidate B
    modelb = XGBRegressor(**best_gridb)
    modelb = modelb.fit(x_train,
                        y_train[:, 1],
                        eval_set=[(x_test, y_test[:, 1])],
                        early_stopping_rounds=100,
                        verbose=False)
    y_predb = modelb.predict(x_test)
    scoreb = mean_absolute_error(y_test[:, 1], y_predb)
    print("score B : {} vs {}".format(scoreb, base_scoreb))
    if scoreb <= base_scoreb:
        modelb.save_model(file_pathb)
        print("model B saved !")
# Script fragment: fit a linear-booster XGBoost regressor, save it, and write
# its test predictions to a CSV. x_train/y_train, x_val/y_val, x_test and
# proton_test come from earlier in the script.
y_test = proton_test
kfold = KFold(n_splits=5, shuffle=True, random_state=41)
# The model is fitted on the training data right where it is constructed.
model = XGBRegressor(learning_rate=0.01, n_estimators=1000, booster='gblinear', colsample_bytree=0.8, n_jobs=-1, objective='reg:squaredlogerror', gpu_id=0, tree_method='gpu_hist').fit(x_train, y_train)
# NOTE(review): the 5-fold scores are computed on x_val/y_val, not on the
# training data -- confirm this is intended.
scores = cross_val_score(model, x_val, y_val, cv=kfold, verbose=2)
model.save_model("./AI_2020/task19/model")
y_predict = model.predict(x_test)
print("====================")
print(scores)
print(y_predict)
y_predict = pd.DataFrame(y_predict)
y_test = pd.DataFrame(y_test)
# The predictions are appended below y_test and only rows from 575136 on are
# kept -- presumably 575136 is the original length of y_test, so that only
# the predictions remain; TODO confirm.
y_test = np.append(y_test, y_predict, axis=0)
y_test = pd.DataFrame(y_test[575136:])
# header=0 / index=0 are falsy: no header row and no index column.
y_test.to_csv('./AI_2020/task19/predict.csv', header=0, index=0)
(select_x_test, y_test)], early_stopping_rounds=20) y_pred = selection_model.predict(select_x_test) r2 = r2_score(y_test, y_pred) print("Thresh=%.3f, n = %d, R2 : %.2f%%" % (thres, select_x_train.shape[1], r2 * 100.0)) result = selection_model.evals_result() # print("eval's result : ", result) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # model.save_model("./model/xgb_save/boston_thresh=%.3f-r2=%.2f.model"%(thres, r2)) model.save_model("./model/xgb_save/boston_rmse=%.3f-r2=%.2f.model" % (rmse, r2)) # Thresh=0.003, n = 13, R2 : 93.54% # Thresh=0.005, n = 12, R2 : 93.71% # Thresh=0.006, n = 11, R2 : 93.69% # Thresh=0.009, n = 10, R2 : 93.78% # Thresh=0.012, n = 9, R2 : 94.11% # Thresh=0.014, n = 8, R2 : 94.31% # Thresh=0.015, n = 7, R2 : 93.76% # Thresh=0.017, n = 6, R2 : 92.80% # Thresh=0.017, n = 5, R2 : 93.63% # Thresh=0.039, n = 4, R2 : 92.26% # Thresh=0.045, n = 3, R2 : 89.30% # Thresh=0.248, n = 2, R2 : 81.05% # Thresh=0.569, n = 1, R2 : 69.21%
# Script fragment: rank features with an XGBoost model, then for each
# importance threshold refit a LightGBM regressor on the selected features.
# NOTE(review): the source had lost its indentation; the extent of the loop
# body below is inferred -- confirm against the original file.
model = XGBRegressor(n_estimators = 100, learning_rate = 0.05, n_jobs = -1)
model.fit(x_train, y_train)
# Every feature importance, in ascending order, is tried as a threshold.
threshold = np.sort(model.feature_importances_)
for thres in threshold:
    # prefit=True: reuse the already fitted model for the selection.
    selection = SelectFromModel(model, threshold = thres, prefit = True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model =LGBMRegressor(n_estimators = 100, learning_rate = 0.05, n_jobs = -1)
    selection_model.fit(select_x_train, y_train, verbose= False, eval_metric= ['logloss', 'rmse'], eval_set= [(select_x_train, y_train), (select_x_test, y_test)], early_stopping_rounds= 20)
    y_pred = selection_model.predict(select_x_test)
    r2 = r2_score(y_test, y_pred)
    print("Thresh=%.3f, n = %d, R2 : %.2f%%" %(thres, select_x_train.shape[1], r2*100.0))
    # result = selection_model.evals_result()
    # print("eval's result : ", result)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # model.save_model("./model/xgb_save/boston_thresh=%.3f-r2=%.2f.model"%(thres, r2))
    # NOTE(review): this saves the full XGBoost `model`, not the
    # `selection_model` whose rmse/r2 appear in the file name -- confirm.
    model.save_model("./model/sample/boston/boston_rmse=%.3f-r2=%.2f.model"%(rmse, r2))
# Script fragment: fit an XGBoost regressor on columns 6-23 of `train`,
# plot feature importance and predicted-vs-actual, report RMSE and save the
# model. `train` and `name` come from earlier in the script.
model_input = train
print(model_input.head())

# Features are columns 6..23 (by position); the target is column 3.
X, y = model_input.iloc[:, np.r_[6:24]], model_input.iloc[:, 3]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

# NOTE(review): `silent`, objective='reg:linear' and `missing=None` are
# legacy XGBoost arguments -- confirm they are accepted by the installed
# version.
xg_reg = XGBRegressor(max_depth=10,
                      learning_rate=0.1,
                      n_estimators=1000,
                      silent=True,
                      objective='reg:linear',
                      nthread=None,
                      gamma=0,
                      min_child_weight=1,
                      max_delta_step=0,
                      subsample=1,
                      colsample_bytree=1,
                      colsample_bylevel=1,
                      reg_alpha=0,
                      reg_lambda=1,
                      scale_pos_weight=1,
                      base_score=0.5,
                      missing=None)
xg_reg.fit(X_train, y_train)

plt.rcParams['figure.figsize'] = [50, 10]
plt.show()
xgb.plot_importance(xg_reg, max_num_features=3)
plt.rcParams['figure.figsize'] = [5, 5]
plt.show()

preds = xg_reg.predict(X_test)
predictions = preds.reshape(preds.shape[0], 1)
plt.plot(y_test, predictions, 'ro')
# `plt.show` without parentheses only referenced the function, so the
# scatter plot was never displayed.
plt.show()

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

modelName = name + '_XGB.model'
xg_reg.save_model(modelName)
class Regressor:
    """Collects train/test points one at a time around an XGBRegressor."""

    # for initializing train and test sets, classifier and accuracy score
    # Change method to gpu_hist if you want xgboost to run on a GPU
    def __init__(self, params=None):
        """Create empty data sets and the model.

        Args:
            params: keyword arguments for XGBRegressor; defaults to
                ``{'objective': 'reg:squarederror', 'verbosity': 0}``.
        """
        # A None sentinel avoids sharing one default dict between instances.
        if params is None:
            params = {'objective': 'reg:squarederror', 'verbosity': 0}
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBRegressor(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        """Return the training-set size.

        This is ``ndarray.size`` (total element count) once the data has
        been converted to an array, and ``len`` while it is still a list.
        """
        if isinstance(self.X_train, np.ndarray):
            return self.X_train.size
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        """Append one training sample (``features``) and its label (``feature``)."""
        # train()/train_eval() turn the lists into arrays; turn them back so
        # append works. Empty arrays are converted too -- they used to slip
        # through and fail on ``ndarray.append``.
        if isinstance(self.X_train, np.ndarray):
            self.X_train = self.X_train.tolist()
        if isinstance(self.X_labels, np.ndarray):
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        """Fit the model on all collected training data."""
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        """Fit on a 67/33 split of the training data and report ``metric``.

        Returns:
            For ``metric == 'error'``: one minus the mean, over both
            evaluation sets, of the per-round error averages. For any other
            metric: a list with the final-round value for each evaluation
            set (train first, then the held-out part).
        """
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        validations = []
        for val in evals_result.values():
            lst = val.get(metric)
            validations.append(lst[-1])
        return validations

    # input test labels if you want to check accuracy
    def label(self, label):
        """Append one expected label for the test data."""
        self.test_labels.append(label)

    def input_test(self, features):
        """Append one test sample."""
        # Same list/array handling as input_train, including empty arrays.
        if isinstance(self.test, np.ndarray):
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        """Predict for the collected test data, store and return the result."""
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        """Return the mean absolute error of the last prediction against the test labels."""
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        """Save the model to ``file``."""
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        """Load the model from ``file``."""
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        """Discard all training samples and labels."""
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        """Discard all test samples and labels."""
        self.test = []
        self.test_labels = []
early_stopping_rounds=10) aaa = model.score(x_test, y_test) print('aaa :', aaa) y_pred = model.predict(x_test) r2 = r2_score(y_test, y_pred) print('r2 :', r2) print('====================================') results = model.evals_result() # print(results) # 저장 import pickle # pickle.dump(model, open('../data/xgb_save/m39.pickle.dat', 'wb')) # print('저장완료') import joblib # joblib.dump(model, '../data/xgb_save/m39.joblib.dat') model.save_model('../data/xgb_save/m39.xgb.model') print('================ xgb model 불러오기 ====================') # 불러오기 # model2 = pickle.load('../data/xgb_save/m39.pickle.dat', 'wb') # model2 = joblib.load('../data/xgb_save/m39.joblib.dat') model2 = XGBRegressor() model2.load_model('../data/xgb_save/m39.xgb.model') print('불러옴!') r22 = model2.score(x_test, y_test) print('r22 :', r22)
from xgboost import XGBRegressor
import xgboost as xgb
# Data preprocessing: the same CSV is read twice so that one copy can serve
# as the target frame and the other as the feature frame.
data = pd.read_csv('Lyangchi1.csv')
data2 = pd.read_csv('Lyangchi1.csv')
# pop() removes the column in place: `data` keeps everything except
# 'Frequency', `data2` keeps everything except 'Data'.
Frequency = data.pop('Frequency')
x = data2.pop('Data')
#print(data2)
# Regression: fit with `data2` as features and `data` as target.
reg = XGBRegressor()
reg.fit(data2, data)
#joblib.dump(reg,"train.m")
reg.save_model('lyangchi1.model')
#bst2 = xgb.Booster(model_file='001.model')
#fig,ax = plt.subplots()
#fig.set_size_inches(60,30)
#xgb.plot_tree(reg,ax=ax, num_trees=2, rankdir='LR')
#fig.savefig('xgb_tree22.jpg')
#plt.show()
#tar = xgb.Booster(model_file='001.model')
#dtest = xgb.DMatrix(data2)
#preds = tar.predict(dtest)
# Predictions on the same data the model was fitted on.
y_pred = reg.predict(data2)
#plt.scatter(Frequency, data, s=5, label='True dates')
#plt.plot(data2,y_pred, lw=2, color='g',alpha=0.2,label='Model')
#plt.title("L-Shenmen-Train")
selec_model = XGBRegressor() # selec_model = GridSearchCV(model,parameters,cv=3, n_jobs=n_jobs) selec_model.fit(selec_x_train, y_train) ''' 여기서 새로운 모델을 생성하는 것과 반복문에 들어오기전의 모델을 그대로 쓰는것과 차이가 있을까? ''' # print(thresh) selec_x_test = selection.transform(x_test) y_pred = selec_model.predict(selec_x_test) score = r2_score(y_test, y_pred) # print(f'select model score : {score}') # print(f"model.feature_importances_ : {model.feature_importances_}") if max <= score: selec_model.save_model( f'./model/xgb_save/model_{filename}_save_{selec_x_train.shape[1]}_{np.round(score*100,2)}.dat' ) max = score # selec_model.save_model(f'./model/xgb_save/{__file__}_{np.round(thresh,2)}_{np.round(score*100,2)}.data') print( f"select model score : Thresh={np.round(thresh,2)} \t n={selec_x_train.shape[1]} \t r2={np.round(score*100,2)}" ) # 메일 제목 : 아무개 **등 # model.fit(x_train,y_train, verbose=True, eval_metric='error',eval_set=[(x_train, y_train), (x_test, y_test)]) # model.fit(x_train,y_train, verbose=True, eval_metric=['rmse','logloss'],eval_set=[(x_train, y_train), (x_test, y_test)], # early_stopping_rounds=500) # selec_model.fit(x_train,y_train, verbose=True, eval_metric=['rmse','logloss'],eval_set=[(x_train, y_train), (x_test, y_test)], # early_stopping_rounds=500)
# Script fragment: score a fresh regressor for every feature-importance
# threshold, then save the original model and check that it reloads.
# NOTE(review): the source had lost its indentation; the loop is assumed to
# end after the per-threshold print -- confirm against the original file.
thresholds = np.sort(model.feature_importances_)
print(thresholds)
for thresh in thresholds:
    # prefit=True: reuse the already fitted model for the selection.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_pred)
    print("thresh=%.3f, n = %d, R2 : %2.f%%" %(thresh, select_x_train.shape[1], score*100.0))
# Save the full model (not any selection_model) and load it back.
model.save_model('./model/xgb_save/boston_rmse')
print("저장 됬다.")
model2=XGBRegressor()
model2.load_model('./model/xgb_save/boston_rmse')
print("불러왔다.")
y_pred = model2.predict(x_test)
# NOTE(review): arguments are passed as (y_pred, y_test) here, the reverse
# of the r2_score call inside the loop.
score = r2_score(y_pred, y_test)
print("score : ", score)
selection = SelectFromModel(xgb, threshold=thresholds[idx_max], prefit=True) selection_x_train = selection.transform(x_train) selection_x_test = selection.transform(x_test) #2)모델구성 selection_model = XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1) #3)훈련 selection_model.fit(selection_x_train, y_train, verbose=False, eval_metric=["logloss", "rmse"], eval_set=[(selection_x_train, y_train), (selection_x_test, y_test)], early_stopping_rounds=20) path = f"./model/xgb_save/{__file__[-24:-3]}-idx{idx_max}.dat" selection_model.save_model(path) ''' r2 0.9328556062354909 score 0.9328556062354909 idx 0 r2 0.9328556062354909 idx 1 r2 0.932501384781691 idx 2
'max_depth': [100, 150, 200, 250, 300, 350, 400, 450], #'min_child_weight': [6, 7, 8], # 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] } optimized_GBM = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=3, verbose=1, n_jobs=-1) optimized_GBM.fit(x_train, y_train) evalute_result = optimized_GBM.return_train_score print('每轮迭代运行结果:{0}'.format(evalute_result)) print('参数的最佳取值:{0}'.format(optimized_GBM.best_params_)) print('最佳模型得分:{0}'.format(optimized_GBM.best_score_)) model.save_model('xgboost.model') y_pred = model.predict(x_test) # 兰州 雪佛兰 2.4L 2016 8.9 0 20.61 自动挡 8.1 # 上海 福特 1.8L 2009 5 0 14.64 自动挡 3.2 # 待预测的数据feature 下例真实值[8.1,3.2] # t = ['兰州', '雪佛兰', '2.4L', 2016, 8.9, 0, 20.61, 0] # city_id = list(city_le.classes_).index('兰州') # band_id = list(brand_le.classes_).index('雪佛兰') # t = [city_id, band_id, 2.4, 2016, 8.9, 0, 20.61, 0] t = ['上海', '福特', '1.8', 2016, 8.9, 0, 20.61, 0] city_id = list(city_le.classes_).index('上海') band_id = list(brand_le.classes_).index('福特') t = [city_id, band_id, 1.8, 2009, 5, 0, 14.64, 0] tu = tuple(t)
search = GridSearchCV(XGBRegressor(), parameter, cv=5, n_jobs=-1) select_x_train = selection.transform(x_train) search.fit(select_x_train, y_train) select_x_test = selection.transform(x_test) x_pred = search.predict(select_x_test) score = r2_score(y_test, x_pred) # print('R2는',score) print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0)) model.save_model('./model/xgb_save/m34sfm/cancer.xgb' + str(thresh) + '.model') ''' [0.00497141 0.00802845 0.00874821 0.00903318 0.00930241 0.01546535 0.02015997 0.02073635 0.02245208 0.03186943 0.03302769 0.15981925 0.65638626] Thresh=0.005, n=13, R2: 90.59% Thresh=0.008, n=12, R2: 90.09% Thresh=0.009, n=11, R2: 90.93% Thresh=0.009, n=10, R2: 90.57% Thresh=0.009, n=9, R2: 91.49% Thresh=0.015, n=8, R2: 92.46% Thresh=0.020, n=7, R2: 92.93% Thresh=0.021, n=6, R2: 90.61% Thresh=0.032, n=4, R2: 88.70% Thresh=0.033, n=3, R2: 87.77% Thresh=0.160, n=2, R2: 70.24%
def main():
    """Train (or load) the example XGBoost model and report its metrics.

    Reads "numerai_training_data.csv" and "numerai_tournament_data.csv",
    loads the model from MODEL_FILE when that file exists (otherwise fits a
    new model and saves it there), prints in-sample and validation
    diagnostics, and writes the tournament predictions to "submission.csv".
    """
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make
    # this run faster.
    # Remember to delete example_model.xgb if you change any of the
    # parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if MODEL_FILE.is_file():
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)
    else:
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    training_data[PREDICTION_NAME] = model.predict(
        training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(
        tournament_data[feature_names])

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )

    # --- Validation metrics ---
    # Check the per-era correlations on the validation set (out of sample).
    # An explicit copy is taken because a column is added to this frame
    # further down; assigning into a slice of tournament_data would trigger
    # pandas' chained-assignment warning.
    validation_data = tournament_data[tournament_data.data_type ==
                                      "validation"].copy()
    validation_correlations = validation_data.groupby("era").apply(score)
    # NOTE: the validation std uses ddof=0 while the training std above uses
    # the pandas default; kept as-is so the printed values do not change.
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std(ddof=0)}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Check the "sharpe" ratio on the validation set
    validation_sharpe = validation_correlations.mean(
    ) / validation_correlations.std(ddof=0)
    print(f"Validation Sharpe: {validation_sharpe}")

    print("checking max drawdown...")
    # Cumulative value of compounding the per-era correlations; computed once
    # and reused for both the rolling maximum and the drawdown.
    daily_value = (validation_correlations + 1).cumprod()
    rolling_max = daily_value.rolling(window=100, min_periods=1).max()
    max_drawdown = -(rolling_max - daily_value).max()
    print(f"max drawdown: {max_drawdown}")

    # Check the feature exposure of your validation predictions: for each
    # era take the largest absolute feature/prediction correlation, then
    # average those maxima across eras.
    max_per_era = validation_data.groupby("era").apply(
        lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max())
    max_feature_exposure = max_per_era.mean()
    print(f"Max Feature Exposure: {max_feature_exposure}")

    # Check feature neutral mean
    print("Calculating feature neutral mean...")
    feature_neutral_mean = get_feature_neutral_mean(validation_data)
    print(f"Feature Neutral Mean is {feature_neutral_mean}")

    # Load example preds to get MMC metrics
    example_preds = pd.read_csv("example_predictions.csv").set_index(
        "id")["prediction"]
    validation_example_preds = example_preds.loc[validation_data.index]
    validation_data["ExamplePreds"] = validation_example_preds

    print("calculating MMC stats...")
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, era_df in validation_data.groupby("era"):
        series = neutralize_series(pd.Series(unif(era_df[PREDICTION_NAME])),
                                   pd.Series(unif(era_df["ExamplePreds"])))
        # NOTE(review): 0.29**2 is presumably the variance of a uniform
        # [0, 1] variable (1/12) used as a scaling constant -- confirm.
        mmc_scores.append(
            np.cov(series, era_df[TARGET_NAME])[0, 1] / (0.29**2))
        corr_scores.append(
            correlation(unif(era_df[PREDICTION_NAME]), era_df[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe

    print(f"MMC Mean: {val_mmc_mean}\n"
          f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n"
          f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}")

    # Check correlation with example predictions
    full_df = pd.concat([
        validation_example_preds, validation_data[PREDICTION_NAME],
        validation_data["era"]
    ],
                        axis=1)
    full_df.columns = ["example_preds", "prediction", "era"]
    per_era_corrs = full_df.groupby('era').apply(
        lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"])))
    corr_with_example_preds = per_era_corrs.mean()
    print(f"Corr with example preds: {corr_with_example_preds}")

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data[PREDICTION_NAME].to_csv("submission.csv", header=True)
import pickle

# Train an XGBoost regressor on the car dataset and write it to disk.
data = pd.read_csv('dataset/car data.csv')

# The car name does not seem to add much value to the analysis, so the
# column is dropped.
data = data.drop(columns=['Car_Name'])

# What matters is how many years old the car is (measured against 2020),
# so the age replaces the raw year.
data['Car_age'] = 2020 - data['Year']
data = data.drop(columns=['Year'])

# One-hot encode each categorical column, then remove the raw columns so
# only the indicator columns remain.
fuel, transmission, seller = (
    pd.get_dummies(data[column])
    for column in ('Fuel_Type', 'Transmission', 'Seller_Type')
)
data = data.drop(columns=['Fuel_Type', 'Transmission', 'Seller_Type'])

data_final = pd.concat([data, fuel, transmission, seller], axis=1)

# The first column is used as the regression target; every remaining
# column is a feature.
y = data_final.iloc[:, 0]
X = data_final.iloc[:, 1:]

model = XGBRegressor()
model.fit(X.values, y.values)

# NOTE(review): save_model writes XGBoost's own model format even though
# the path ends in ".pkl"; confirm that whatever reads this file uses
# load_model rather than pickle.load. The pickle-based alternative below
# was left disabled by the original author.
model.save_model('model.pkl')
# pickle.dump(model, open('model.pkl', 'wb'))