def get_MAE(arg_list):
    """Fit a multi-output LightGBM model for one hyper-parameter set and score it.

    Args:
        arg_list: sequence of exactly seven values, in this order:
            learning_rate, num_leaves, reg_lambda, reg_alpha,
            min_split_gain, min_child_weight, min_child_samples.

    Returns:
        The value returned by ``evaluate`` on the last 20% of the rows
        (called MAE throughout this function).

    Side effects:
        When the score is lower than the module-level ``best_MAE``, that
        global is updated and a summary message is sent through ``Melissa``.
    """
    global best_MAE

    (learning_rate, num_leaves, reg_lambda, reg_alpha, min_split_gain,
     min_child_weight, min_child_samples) = arg_list

    # NOTE: a "search started" Melissa notification used to live here; it was
    # already disabled in the original code and has been dropped.

    params_dict = {
        'boosting_type': 'gbdt',
        'num_leaves': num_leaves,
        'max_depth': -1,
        'n_estimators': 10000,
        'learning_rate': learning_rate,
        'subsample_for_bin': 200000,
        'class_weights': None,
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'min_child_samples': min_child_samples,
        'subsample': 1,
        'subsample_freq': 0,
        'colsample_bytree': 1,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'random_state': None,
        'n_jobs': -1,
        'silent': False,
        'importance_type': 'split',
        'metric': 'None',
        'print_every': 100,
        'mode': 'local',
    }

    X, y = data.dataset(onehot=False)
    X = to_cat(X)
    # The project wrapper receives the feature frame through its params.
    params_dict['X'] = X

    model = lightGBM(params_dict=params_dict)
    model_wrapper = MultiOutputRegressor(model, n_jobs=1)
    model_wrapper.fit(X, y)

    # NOTE(review): the model is fitted on the full X/y and then scored on the
    # last 20% of the same rows. Presumably the lightGBM wrapper holds out that
    # tail internally (it is given 'X' and 'mode') -- confirm, otherwise the
    # score is optimistic.
    _, X_val, _, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)
    MAE = evaluate(model_wrapper, X_val, y_val)

    # Best boosting round of each of the four fitted per-target estimators.
    iterations = [
        model_wrapper.estimators_[i].model._Booster.best_iteration
        for i in range(4)
    ]

    # NOTE(review): the original source had lost its indentation; the
    # notification is assumed to be sent only on a new best score -- confirm.
    if MAE < best_MAE:
        best_MAE = MAE
        Melissa.send_message(
            f'LIGHTGBM\n MAE: {MAE}\n'
            f'params:\n'
            f'iterations:{iterations}, learning_rate:{learning_rate}, num_leaves:{num_leaves}, '
            f'reg_lambda:{reg_lambda}, reg_alpha:{reg_alpha}, min_split_gain:{min_split_gain}, '
            f'min_child_weight:{min_child_weight}, min_child_samples:{min_child_samples}'
        )
    return MAE
def get_MAE(arg_list):
    """Fit a multi-output CatBoost model for one hyper-parameter set and score it.

    Args:
        arg_list: indexable sequence whose first four items are, in order,
            learning_rate, depth, l2_leaf_reg and random_strength.

    Returns:
        The value returned by ``inout.evaluate`` on the last 20% of the rows.

    Side effects:
        When the score is lower than the module-level ``_best_MAE``, that
        global is updated and a summary message is sent through ``Melissa``.
    """
    global _best_MAE

    keys = ['learning_rate', 'depth', 'l2_leaf_reg', 'random_strength']
    val_params = {name: arg_list[pos] for pos, name in enumerate(keys)}
    # learning_rate, depth, l2_leaf_reg, num_leaves, random_strength = arg_list

    # Disabled start-of-validation notification, kept as a no-op string.
    """
    Melissa.send_message(f'starting val CATBOOST\n so fermo nmezzo alla strada... ovviamente\n'
                         f'{val_params}')
    """

    X, Y = data.dataset('local', 'train', onehot=False)

    # Missing values in the lagged weather columns become an explicit category.
    lagged_weather = ['WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1']
    X[lagged_weather] = X[lagged_weather].fillna('Unknown')

    # Every column whose name starts with 'WEATHER_' is treated as categorical.
    weather_cols = [name for name in X.columns if name.startswith('WEATHER_')]
    categorical_cols = [
        'EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL', 'EVENT_TYPE',
        *weather_cols,
        'WEEK_DAY', 'IS_WEEKEND',
    ]

    weather_clusters_cols = [
        'WEATHER_-4_CL', 'WEATHER_-3_CL', 'WEATHER_-2_CL', 'WEATHER_-1_CL'
    ]
    X[weather_clusters_cols] = X[weather_clusters_cols].fillna('Unknown')

    # Fixed defaults first, then the values under validation on top.
    params = {
        **{
            'X': X,
            'mode': 'local',
            'n_estimators': 10000,
            'loss_function': 'MAE',
            'eval_metric': 'MAE',
            'early_stopping_rounds': 100,
            'cat_features': categorical_cols,
        },
        **val_params,
    }

    model = MultiOutputRegressor(CatBoost(params), n_jobs=-1)
    model.fit(X, Y)

    _, X_test, _, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
    MAE = inout.evaluate(model, X_test, y_test)

    # Best iteration of each of the four fitted per-target estimators.
    iterations = [model.estimators_[i].model.best_iteration_ for i in range(4)]

    # NOTE(review): the original source had lost its indentation; the
    # notification is assumed to be sent only on a new best score -- confirm.
    if MAE < _best_MAE:
        _best_MAE = MAE
        Melissa.send_message(
            f'CATBOOST\n ITERATIONS: {iterations} MAE: {MAE}\nparams:{val_params}\n'
        )
    return MAE
def train_model():
    """Interactively train, evaluate and save a CatBoost multi-target model.

    Prompts for the run mode (via ``menu.mode_selection``), the multi-target
    strategy and a filename suffix. The model is fitted on the local training
    set, scored with ``inout.evaluate`` on the last 20% of the rows, and saved
    under ``saved_models/``.

    Answering ``m`` at the strategy prompt selects ``MultiOutputRegressor``;
    any other answer selects ``RegressorChain``. The saved filename is
    labelled from that same decision, so it always names the wrapper that
    was actually trained.

    Returns:
        None.
    """
    print()
    mode = menu.mode_selection()
    chain_mode = input(
        'Choose the chain mode (m: multioutput / c: regressorchain): '
    ).strip().lower()
    # Single source of truth for both the wrapper class and the filename label.
    use_multiout = chain_mode == 'm'
    M = MultiOutputRegressor if use_multiout else RegressorChain

    X, Y = data.dataset('local', 'train', onehot=False)
    print(X.shape, Y.shape)

    # NOTE: an optional filter dropping rows whose four SPEED_AVG_-k lags are
    # all null was already disabled in the original code and is not applied.

    # Missing values in the lagged weather columns become an explicit category.
    weather_cols = ['WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1']
    X[weather_cols] = X[weather_cols].fillna('Unknown')

    # Every column whose name starts with 'WEATHER_' is treated as categorical.
    weather_cols = [col for col in X.columns if col.startswith('WEATHER_')]
    categorical_cols = [
        'EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL', 'EVENT_TYPE'
    ] + weather_cols
    categorical_cols.extend(['WEEK_DAY', 'IS_WEEKEND'])

    weather_clusters_cols = [
        'WEATHER_-4_CL', 'WEATHER_-3_CL', 'WEATHER_-2_CL', 'WEATHER_-1_CL'
    ]
    X[weather_clusters_cols] = X[weather_clusters_cols].fillna('Unknown')

    catboost = CatBoost({
        'X': X,
        'mode': mode,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'n_estimators': 5000,
        'depth': 6,
        'learning_rate': 0.1,
        'early_stopping_rounds': 100,
        'cat_features': categorical_cols,
    })

    model = M(catboost)
    model.fit(X, Y)

    # Score on the chronological tail (no shuffling) of the same dataset.
    _, X_test, _, y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
    mae, mae_4 = inout.evaluate(model, X_test, y_test, intermediate=True)
    print()
    print(mae)
    print(mae_4)

    # Save the model.
    mae = round(mae, 5)
    suffix = input('Insert model name suffix: ')
    model_folder = 'saved_models'
    folder.create_if_does_not_exist(model_folder)
    chain_label = 'multiout' if use_multiout else 'chain'
    filename = f'catboost_{chain_label}_{mae}_{suffix}.jl'
    inout.save(model, os.path.join(model_folder, filename))
# NOTE(review): the four functions below are methods of an estimator class
# whose header lies outside this chunk; they are laid out at the column the
# original line started on.
def get_params(self, deep):
    """Return the estimator's parameters as a dict holding only ``alpha``.

    ``deep`` is accepted but not used.
    """
    params = {'alpha': self.alpha}
    return params


def set_params(self):
    """Do nothing; parameters cannot be changed through this method."""
    return None


def fit(self, X, y):
    """Fit the wrapped model on the rows with a strictly positive target.

    The filtered rows are split 80/20 without shuffling. The model is fitted
    on the first part, and the mean absolute error on the remaining part is
    printed. Nothing is returned.
    """
    print('fitting lasso model')

    # Keep only the samples whose target is > 0.
    positive = y > 0
    X, y = X[positive], y[positive]

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, shuffle=False)

    self.model.fit(X_fit, y_fit)
    holdout_pred = self.model.predict(X_holdout)
    print(mean_absolute_error(y_holdout, holdout_pred))


def predict(self, X):
    """Return the wrapped model's predictions for ``X``."""
    predictions = self.model.predict(X)
    return predictions


if __name__ == '__main__':
    # Load the training data and replace missing values with zeros.
    X, y = data.dataset('train')
    X, y = X.fillna(0), y.fillna(0)

    base_model = LassoRegression(alpha=2)
    model_wrapper = MultiOutputRegressorWrapper(base_model, X, y)
    model_wrapper.fit()
return df def seghe_del_catboost(df): weather_cols = ['WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1'] df[weather_cols] = df[weather_cols].fillna('Unknown') weather_clusters_cols = [ 'WEATHER_-4_CL', 'WEATHER_-3_CL', 'WEATHER_-2_CL', 'WEATHER_-1_CL' ] df[weather_clusters_cols] = df[weather_clusters_cols].fillna('Unknown') return df if __name__ == '__main__': X, y = data.dataset('local', 'train', onehot=False) weather_cols = [col for col in X.columns if col.startswith('WEATHER_')] categorical_cols = [ 'EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL', 'EVENT_TYPE' ] + weather_cols categorical_cols.extend(['WEEK_DAY', 'IS_WEEKEND']) X = seghe_del_catboost(X) X = to_cat(X) X_test, y_test, sub_base_structure = data.dataset('local', 'test', onehot=False, export=True) X_test = seghe_del_catboost(X_test)
if __name__ == '__main__': import src.data as data import numpy as np from src.algorithms.multioutput import MultiOutputRegressor, RegressorChain from sklearn.utils import shuffle from sklearn.metrics import mean_absolute_error from sklearn.model_selection import train_test_split print() chain_mode = input('Choose the chain mode (m: multioutput / c: regressorchain): ').lower() M = MultiOutputRegressor if chain_mode == 'm' else RegressorChain X, Y = data.dataset(onehot=False, drop_index_columns=True) # add features import src.preprocessing.other_features as feat avg_speed_road_event = feat.avg_speed_for_roadtype_event() X = X.merge(avg_speed_road_event, how='left', on=['EVENT_TYPE','ROAD_TYPE']) del avg_speed_road_event X.fillna(0, inplace=True) weather_cols = [f'WEATHER_{i}' for i in range(-10,0)] categorical_cols = ['EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL','EVENT_TYPE'] + weather_cols xgboost = XGBoost({ 'X':X 'objective' :'reg:linear',