Пример #1
0
 def test_regression(self):
     """Check that an NGBoost regressor reaches a test MSE of at most 8.0.

     The Boston housing data is split 80/20 with a fixed seed, a model with
     a Normal output distribution and the MLE score is fitted on the
     training part, and its point predictions on the held-out part are
     scored with mean squared error.
     """
     # Pass ``return_X_y`` by keyword: the argument is keyword-only in recent
     # scikit-learn releases, and the keyword form works on older ones too.
     data, target = load_boston(return_X_y=True)
     x_train, x_test, y_train, y_test = train_test_split(
         data, target, test_size=0.2, random_state=42)

     ngb = NGBoost(Base=default_tree_learner,
                   Dist=Normal,
                   Score=MLE,
                   natural_gradient=True,
                   verbose=False)
     ngb.fit(x_train, y_train)

     preds = ngb.predict(x_test)
     score = mean_squared_error(y_test, preds)
     assert score <= 8.0
Пример #2
0
    def fold_run(self,
                 src_dir: str,
                 X_train: pd.DataFrame,
                 y_train,
                 X_test: pd.DataFrame,
                 n_folds: int,
                 col: str,
                 parameters=None,
                 categorical_features=None):
        """Train one model per cross-validation fold and collect predictions.

        The model family is the first enabled flag among ``self.train_gbm``
        (LightGBM), ``self.train_xg`` (XGBoost), ``self.train_cat`` (CatBoost)
        and ``self.train_ng`` (NGBoost). All predictions are clipped to be
        non-negative before they are stored or scored.

        # Arguments:
        : src_dir - str - directory for saved models and evaluation history;
          created when it does not exist. Also used to build the name of the
          JSON parameter file when ``self.jsonize`` is set.

        : X_train, y_train - training features and labels. DataFrame/Series
          labels are converted to a plain array because folds are addressed
          by position.

        : X_test - test features, or None to skip test predictions.

        : n_folds - number of folds to split the training data into.

        : col - name of the ``X_train`` column used as stratification labels
          when ``self.stratify`` is set (StratifiedKFold does not support
          continuous targets).

        : parameters - dict of model parameters. The LightGBM branch reads the
          keys 'num_boost_round', 'early_stopping_rounds' and 'verbose_eval';
          the XGBoost branch reads 'boost_round', 'early_stopping' and
          'verbose_eval'; the CatBoost branch passes the dict as keyword
          arguments to ``CatBoostRegressor``. Unused by the NGBoost branch.

        : categorical_features - list of categorical feature names, forwarded
          to the LightGBM datasets only.

        # Returns:
        valid_predictions, test_predictions - arrays with one column per fold.
        Column ``k`` of ``valid_predictions`` is filled only at the rows of
        fold ``k``'s validation split (zero elsewhere). ``test_predictions``
        is None when ``X_test`` is None.
        """

        def report_validation_metrics(y_true, y_pred):
            # Print R2 and RMSLE for one fold's validation predictions.
            y_true = np.nan_to_num(y_true)
            y_pred = np.nan_to_num(y_pred)
            r2 = r2_score(y_true, y_pred)
            log_error = np.sqrt(mean_squared_log_error(y_true, y_pred))
            print(f"R2 Score for current validation set:{r2}")
            print(f"RMSLE for current val set:{log_error}")

        # Fold indices are positional; a pandas container would be indexed by
        # label instead, so switch to the underlying array once, up front.
        if isinstance(y_train, (pd.DataFrame, pd.Series)):
            y_train = y_train.values

        if src_dir and not os.path.isdir(src_dir):
            print(f"Making dir:{src_dir}")
            os.makedirs(src_dir)

        print("X_train_shape", X_train.shape)
        if X_test is not None:
            print("X_test_shape", X_test.shape)

        if self.stratify:
            print(f"Make {n_folds} stratified folds")
            kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.seed)
            splits = kf.split(X_train, X_train[col])
        elif self.time_series:
            # TimeSeriesSplit is deterministic and takes no random_state.
            kf = TimeSeriesSplit(n_splits=n_folds)
            splits = kf.split(X_train, y_train)
        else:
            # random_state only has a meaning together with shuffle=True and
            # recent scikit-learn rejects it otherwise; folds stay unshuffled.
            kf = KFold(n_splits=n_folds)
            splits = kf.split(X_train, y_train)

        valid_predictions = np.zeros((X_train.shape[0], n_folds))
        print("Vaild_predict", valid_predictions.shape[0])

        # Stays None without test data so the final return is always bound.
        test_predictions = None
        if X_test is not None:
            test_predictions = np.zeros((X_test.shape[0], n_folds))
            print("Test_predict", test_predictions.shape[0])

        folds_done = 0
        for fold, (train_index, val_index) in enumerate(splits):
            if self.train_gbm:
                print("Train LightGBM")
                train_X = X_train.iloc[train_index]
                val_X = X_train.iloc[val_index]
                train_y = y_train[train_index]
                val_y = y_train[val_index]

                lgb_train = lgb.Dataset(train_X, train_y,
                                        categorical_feature=categorical_features)
                lgb_val = lgb.Dataset(val_X, val_y,
                                      categorical_feature=categorical_features,
                                      reference=lgb_train)

                gbm = lgb.train(params=parameters,
                                train_set=lgb_train,
                                num_boost_round=parameters['num_boost_round'],
                                valid_sets=[lgb_train, lgb_val],
                                early_stopping_rounds=parameters['early_stopping_rounds'],
                                evals_result=self.history,
                                verbose_eval=parameters['verbose_eval'],
                                feval=self.eval_metric)
                valid_predictions[val_index, fold] = np.clip(
                    gbm.predict(val_X, num_iteration=gbm.best_iteration),
                    a_min=0, a_max=None)

                # val_y is already the validation slice; it must not be
                # indexed with val_index a second time.
                report_validation_metrics(val_y, valid_predictions[val_index, fold])

                if self.save_model:
                    print("Saving model")
                    gbm.save_model(f'{src_dir}/fold_{fold}_{self.name}_eval_history.txt')

                if self.save_history:
                    print("Saving Hisotry")
                    pd.to_pickle(self.history,
                                 f'{src_dir}/fold_{fold}_{self.name}_pickle_eval_history.pkl')

                if self.test_predict and test_predictions is not None:
                    test_predictions[:, fold] = np.clip(
                        self.predict_test(X_test, fold, src_dir),
                        a_min=0, a_max=None)

                if self.importance:
                    self.visualize_importance(fold, src_dir)

                if self.show_metric_results:
                    self.show_results(fold)

            elif self.train_xg:
                print("Train XGBooost")
                train_X = np.nan_to_num(X_train.iloc[train_index])
                val_X = np.nan_to_num(X_train.iloc[val_index])
                train_y = np.nan_to_num(y_train[train_index])
                val_y = np.nan_to_num(y_train[val_index])

                xg_train = xgb.DMatrix(train_X, label=train_y, feature_names=X_train.columns)
                xg_val = xgb.DMatrix(val_X, label=val_y, feature_names=X_train.columns)
                eval_list = [(xg_train, 'train'), (xg_val, 'val')]

                xgboost_train = xgb.train(parameters, xg_train, evals=eval_list,
                                          evals_result=self.history,
                                          num_boost_round=parameters['boost_round'],
                                          early_stopping_rounds=parameters['early_stopping'],
                                          verbose_eval=parameters['verbose_eval'])

                valid_predictions[val_index, fold] = np.clip(
                    xgboost_train.predict(xg_val,
                                          ntree_limit=xgboost_train.best_ntree_limit),
                    a_min=0, a_max=None)

                report_validation_metrics(val_y, valid_predictions[val_index, fold])

                if self.save_model:
                    print("Saving model")
                    xgboost_train.save_model(
                        f'{src_dir}/fold_{fold}_{self.name}_eval_history.txt')

                if self.save_history:
                    print("Saving Hisotry")
                    pd.to_pickle(self.history,
                                 f'{src_dir}/fold_{fold}_{self.name}_pickle_eval_history.pkl')

                if self.test_predict and test_predictions is not None:
                    test_predictions[:, fold] = np.clip(
                        self.predict_test(X_test, fold, src_dir, xgboost_train),
                        a_min=0, a_max=None)

                if self.importance:
                    self.visualize_importance(fold, src_dir)

                if self.show_metric_results:
                    self.show_results(fold)

            elif self.train_cat:
                print("Training CatBoost")
                train_X = np.array(X_train.iloc[train_index], dtype=np.float32)
                val_X = np.array(X_train.iloc[val_index], dtype=np.float32)
                train_y = np.array(y_train[train_index], dtype=np.float32)
                val_y = np.array(y_train[val_index], dtype=np.float32)

                cat_train = catboost.Pool(train_X, label=train_y)
                cat_test = catboost.Pool(val_X, label=val_y)
                self.cat = catboost.CatBoostRegressor(**parameters).fit(
                    cat_train, use_best_model=True,
                    eval_set=cat_test, verbose_eval=True)
                self.history = self.cat.get_evals_result()

                valid_predictions[val_index, fold] = np.clip(
                    self.cat.predict(cat_test), a_min=0, a_max=None)

                report_validation_metrics(val_y, valid_predictions[val_index, fold])

                if self.save_model:
                    print("Saving model")
                    self.cat.save_model(
                        f'{src_dir}/fold_{fold}_{self.name}_eval_history', format='json')

                if self.test_predict and test_predictions is not None:
                    test_predictions[:, fold] = np.clip(
                        self.predict_test(X_test, fold, src_dir),
                        a_min=0, a_max=None)

            elif self.train_ng:
                print("Train NGBooost")
                train_X = np.nan_to_num(X_train.iloc[train_index])
                val_X = np.nan_to_num(X_train.iloc[val_index])
                train_y = np.nan_to_num(y_train[train_index])
                val_y = np.nan_to_num(y_train[val_index])

                # Hyper-parameters are fixed here; ``parameters`` is not used.
                ng = NGBoost(Dist=Normal, Score=MLE,
                             Base=default_tree_learner, natural_gradient=True,
                             n_estimators=150, learning_rate=0.01, verbose=True,
                             verbose_eval=50).fit(train_X, train_y)
                valid_predictions[val_index, fold] = np.clip(
                    ng.predict(val_X), a_min=0, a_max=None)

                fold_true = np.nan_to_num(val_y)
                fold_pred = np.nan_to_num(valid_predictions[val_index, fold])
                rmse = np.sqrt(mean_squared_error(fold_true, fold_pred))
                r2 = r2_score(fold_true, fold_pred)
                log_error = np.sqrt(mean_squared_log_error(fold_true, fold_pred))
                print(f"RMSE for current fold:{rmse}")
                print(f"R2 Score for current fold:{r2}")
                print(f"RMSLE for current val set:{log_error}")

                # This branch predicts on the test set regardless of
                # self.test_predict; only the missing-data case is guarded.
                if test_predictions is not None:
                    test_predictions[:, fold] = np.clip(
                        ng.predict(X_test), a_min=0, a_max=None)

            folds_done = fold + 1

        if self.jsonize:
            print("Saving model parameters to json")
            if not os.path.isdir('parameters'):
                print("Making Dir: parameters")
                os.makedirs('parameters')
            model_dict = {"model": f"{src_dir}_{folds_done}_folds",
                          "parameters": parameters}
            with open(f"./parameters/{src_dir}_{folds_done}_fold.json", 'w+') as model_param:
                json.dump(model_dict, model_param)

        if self.prepare_submission:
            # NOTE(review): ``self`` is passed explicitly to what looks like a
            # bound method, so it would arrive twice - confirm against the
            # signature of output_submission before changing this call.
            self.output_submission(self, test_predictions, folds_done)

        return valid_predictions, test_predictions
Пример #3
0
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
# Randomly split the data set into a training part and a 20% hold-out part
X_train, X_test, Y_train, Y_test = train_test_split(features.values,
                                                    train_data.label.values,
                                                    test_size=0.2)
ngb = NGBoost(Base=default_tree_learner,
              Dist=Normal,
              Score=MLE(),
              natural_gradient=True,
              verbose=False)
# Fit
ngb.fit(X_train, Y_train)
# Predict on the hold-out part
Y_preds = ngb.predict(X_test)

# Predict on the separate test file and write the submission
test_data = pd.read_csv("jet_simple_data/simple_test_R04_jet.csv")
features = test_data[[
    'number_of_particles_in_this_jet', 'jet_px', 'jet_py', 'jet_pz',
    'jet_energy', 'jet_mass'
]]
Y_test_data = ngb.predict(features)
# Open for writing: an empty mode string makes open() raise ValueError.
with open("submmission.csv", "w") as f:
    f.write("id,label\n")
    # Format through an f-string so ids and predictions do not have to be
    # str already; "+" concatenation raises TypeError for non-str values.
    for jet_id, label in zip(test_data['jet_id'], Y_test_data):
        f.write(f"{jet_id},{label}\n")

Y_dists = ngb.pred_dist(X_test)
# Test Mean Squared Error
test_MSE = mean_squared_error(Y_preds, Y_test)
print('Test MSE', test_MSE)
# Test Negative Log Likelihood