def lightgbm(X_train, y_train, X_test, y_test):
    """Train a default LightGBM regressor and time training/prediction.

    ``y_test`` is unused but kept for signature compatibility with sibling
    model-runner functions.

    Returns
    -------
    tuple
        (pred_train, pred_test, time_train_seconds, time_test_seconds,
        feature_importances)
    """
    reg = LGBMModel(objective='regression')

    start = time.time()
    reg.fit(X_train, y_train)
    time_train = time.time() - start

    pred_train = reg.predict(X_train)

    # Only the test-set prediction is timed, matching the training timing.
    start = time.time()
    pred_test = reg.predict(X_test)
    time_test = time.time() - start

    return pred_train, pred_test, time_train, time_test, reg.feature_importances_
def objective(
        num_leaves,
        scale_pos_weight,
        min_child_samples,
        bin_construct_sample_cnt,
        max_bin,
        min_sum_hessian_in_leaf,
        max_depth,
        min_split_gain,
        min_child_weight,
):
    """Cross-validated log-loss for one LightGBM hyper-parameter point.

    NOTE(review): this is a closure — it reads ``self`` (optimizer
    configuration) and ``x``/``y`` (training data) from the enclosing scope.

    Integer-valued hyper-parameters arrive as floats from the optimizer and
    are rounded before use. Returns the mean validation log-loss over the
    stratified folds, or the sentinel 999.99 if any fold fails.
    """
    try:
        scores = []
        params = {
            'num_leaves': int(round(num_leaves, ndigits=0)),
            'scale_pos_weight': scale_pos_weight,
            'min_child_samples': int(round(min_child_samples, ndigits=0)),
            'bin_construct_sample_cnt': int(round(bin_construct_sample_cnt, ndigits=0)),
            'max_bin': int(round(max_bin, ndigits=0)),
            'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
            'max_depth': int(round(max_depth, ndigits=0)),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
            'n_jobs': self.n_jobs,
            'silent': self.verbose < 1,
            'random_state': self.random_state}
        if isinstance(self.fixed_parameters, dict):
            params.update(self.fixed_parameters)
        if self.use_gpu:
            params.update({'device': 'gpu',
                           'gpu_platform_id': 1,
                           'gpu_device_id': 0})
        # Loop-invariant: the task is always binary classification.
        params['objective'] = 'binary'
        skf = StratifiedKFold(
            self.n_folds, shuffle=self.shuffle, random_state=self.random_state)
        for train_index, valid_index in skf.split(x, y):
            x_train, y_train = x[train_index, :], y[train_index]
            x_valid, y_valid = x[valid_index, :], y[valid_index]
            gbm = LGBMModel(**params)
            gbm.fit(x_train, y_train,
                    eval_set=[(x_valid, y_valid)],
                    early_stopping_rounds=self.early_stopping_rounds,
                    verbose=int(self.verbose > 0))
            y_valid_hat = gbm.predict(x_valid, num_iteration=gbm.best_iteration_)
            scores.append(log_loss(y_valid, y_valid_hat))
        result = np.mean(scores)
        self.iterations.append((params, result))
        return result
    # BUG FIX: was a bare ``except:`` which also swallowed KeyboardInterrupt
    # and SystemExit.
    except Exception:
        # Failed fits (e.g. invalid parameter combinations) get a large
        # sentinel loss so the optimizer steers away from them.
        return 999.99
class Stack(object):
    """Two-level stacking classifier.

    LightGBM, MLP, KNN and SVM base models are tuned and fitted on the raw
    features; their positive-class scores are stacked column-wise and fed to
    a LightGBM meta-model.
    """

    def __init__(self, random_state=None, test_size=0.2, verbose=None,
                 optimization_n_call=50, optimization_n_folds=2,
                 optimization_early_stopping_rounds=1,
                 optimization_shuffle=True):
        # Meta-model hyper-parameter optimizer.
        self.opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        # Base-model hyper-parameter optimizers.
        self.lgb_opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        self.mlp_opt = MLPOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)
        self.knn_opt = KNNOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)
        self.svm_opt = SVMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)
        # Fitted models (populated by fit()).
        self.model = None
        self.lgb_model = None
        self.mlp_model = None
        self.knn_model = None
        self.svm_model = None
        self.random_state = random_state
        self.test_size = test_size
        self.verbose = verbose

    def stack_predict(self, x):
        """Return the base models' positive-class scores as an (n, 4) array."""
        lgb_y_hat = self.lgb_model.predict(
            x, num_iteration=self.lgb_model.best_iteration_)
        print(lgb_y_hat.shape)
        mlp_y_hat = self.mlp_model.predict_proba(x)[:, -1]
        print(mlp_y_hat.shape)
        knn_y_hat = self.knn_model.predict_proba(x)[:, -1]
        print(knn_y_hat.shape)
        svm_y_hat = self.svm_model.predict_proba(x)[:, -1]
        print(svm_y_hat.shape)
        return np.array([lgb_y_hat, mlp_y_hat, knn_y_hat, svm_y_hat]).T

    def fit(self, x, y, early_stopping_rounds=None):
        """Fit all base models on (x, y), then the meta-model on x_stack.

        BUG FIX: the early-stopping branch previously re-fit
        ``self.lgb_model`` on the raw features (a copy-paste of
        fit_lightgbm); it now fits the meta-model ``self.model`` on a split
        of ``x_stack`` and forwards ``early_stopping_rounds`` to ``fit``.
        """
        self.fit_lightgbm(x, y, early_stopping_rounds)
        self.fit_knn(x, y)
        self.fit_mlp(x, y, early_stopping_rounds)
        self.fit_svm(x, y)
        x_stack = self.stack_predict(x)
        print('fit stack')
        # NOTE(review): meta-model hyper-parameters are tuned on the raw
        # features, not on x_stack — confirm this is intended.
        optimized_params = self.opt.optimize(x, y)
        optimized_params['objective'] = 'binary'
        self.model = LGBMModel(**optimized_params)
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            x_train, x_valid, y_train, y_valid = train_test_split(
                x_stack, y, stratify=y, shuffle=True,
                test_size=self.test_size, random_state=self.random_state)
            self.model.fit(x_train, y_train,
                           eval_set=[(x_valid, y_valid)],
                           early_stopping_rounds=early_stopping_rounds,
                           verbose=self.verbose)
        else:
            self.model.fit(x_stack, y)

    def fit_lightgbm(self, x, y, early_stopping_rounds):
        """Tune and fit the LightGBM base model.

        BUG FIX: the model was previously fit unconditionally on the full
        data and then a second time inside the early-stopping branch; the
        redundant first fit is removed and ``early_stopping_rounds`` is now
        forwarded to ``fit`` so early stopping actually triggers.
        """
        print('fit lightgbm')
        optimized_params = self.lgb_opt.optimize(x, y)
        optimized_params['objective'] = 'binary'
        optimized_params['random_state'] = self.random_state
        optimized_params['n_jobs'] = -1
        self.lgb_model = LGBMModel(**optimized_params)
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            x_train, x_valid, y_train, y_valid = train_test_split(
                x, y, stratify=y, shuffle=True,
                test_size=self.test_size, random_state=self.random_state)
            self.lgb_model.fit(x_train, y_train,
                               eval_set=[(x_valid, y_valid)],
                               early_stopping_rounds=early_stopping_rounds,
                               verbose=self.verbose)
        else:
            self.lgb_model.fit(x, y)

    def fit_svm(self, x, y):
        """Tune and fit the SVM base model with probability outputs enabled
        (stack_predict needs predict_proba)."""
        print('fit svm')
        optimized_params = self.svm_opt.optimize(x, y)
        optimized_params['random_state'] = self.random_state
        self.svm_model = SVC(**optimized_params, probability=True)
        self.svm_model.fit(x, y)

    def fit_mlp(self, x, y, early_stopping_rounds):
        """Tune and fit the MLP base model; sklearn's internal early stopping
        is enabled when a positive ``early_stopping_rounds`` is given."""
        print('fit mlp')
        optimized_params = self.mlp_opt.optimize(x, y)
        optimized_params['random_state'] = self.random_state
        esr = early_stopping_rounds is not None and early_stopping_rounds > 0
        self.mlp_model = MLPClassifier(**optimized_params,
                                       early_stopping=esr,
                                       validation_fraction=self.test_size)
        self.mlp_model.fit(x, y)

    def fit_knn(self, x, y):
        """Tune and fit the KNN base model."""
        print('fit knn')
        optimized_params = self.knn_opt.optimize(x, y)
        optimized_params['n_jobs'] = -1
        self.knn_model = KNeighborsClassifier(**optimized_params)
        self.knn_model.fit(x, y)

    def predict(self, x):
        """Predict with the meta-model on the stacked base-model outputs."""
        x_stack = self.stack_predict(x)
        return self.model.predict(x_stack,
                                  num_iteration=self.model.best_iteration_)
class LGBMPredictor:
    """Load zipped train/val/test CSVs, train a LightGBM revenue regressor on
    log1p-transformed revenue, and write per-visitor predictions to zipped
    CSVs."""

    def __init__(self):
        self.data_dir = '../../datasets'
        if not path.exists(self.data_dir):
            raise Exception(
                '{} directory not found.'.format(self.data_dir)
            )
        self.train_file = '{}/{}'.format(self.data_dir, 'train.zip')
        self.val_file = '{}/{}'.format(self.data_dir, 'val.zip')
        self.pred_val_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_val.zip'
        )
        self.test_file = '{}/{}'.format(self.data_dir, 'test.zip')
        self.pred_test_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_test.zip'
        )

    def load_data(self, zip_path):
        """Read one zipped CSV; fullVisitorId stays a string to avoid losing
        leading zeros / precision on large ids."""
        df = pd.read_csv(
            zip_path, dtype={'fullVisitorId': 'str'}, compression='zip'
        )
        [rows, columns] = df.shape
        print('\nLoaded {} rows with {} columns from {}.\n'.format(
            rows, columns, zip_path
        ))
        return df

    def load(self):
        """Load the three datasets into train_df / val_df / test_df."""
        print('Loading train data from {}'.format(self.train_file))
        self.train_df = self.load_data(self.train_file)
        print('Loading val data from {}'.format(self.val_file))
        self.val_df = self.load_data(self.val_file)
        print('Loading test data from {}'.format(self.test_file))
        self.test_df = self.load_data(self.test_file)

    def prepare_data(self):
        """Split ids, targets (raw and log1p) and feature matrices out of the
        loaded dataframes."""
        train_df = self.train_df
        val_df = self.val_df
        test_df = self.test_df
        self.train_id = train_df['fullVisitorId'].values
        self.val_id = val_df['fullVisitorId'].values
        self.test_id = test_df['fullVisitorId'].values
        self.train_y = train_df['totals.transactionRevenue'].values
        self.train_log_y = np.log1p(self.train_y)
        self.val_y = val_df['totals.transactionRevenue'].values
        self.val_log_y = np.log1p(self.val_y)
        self.train_X = train_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'], axis=1
        )
        self.val_X = val_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'], axis=1
        )
        self.test_X = test_df.drop(['fullVisitorId'], axis=1)
        print('\nShape of the train dataset: {}'.format(self.train_X.shape))
        print('\nShape of the val dataset: {}'.format(self.val_X.shape))
        print('\nShape of the test dataset: {}\n'.format(self.test_X.shape))

    def lgbm_model(self):
        """Train the LightGBM regressor with early stopping on the val set."""
        self.model = LGBMModel(
            objective='regression',
            metric='rmse',
            n_estimators=1000,
            learning_rate=0.01,
            min_child_samples=100,
            bagging_fraction=0.7,
            feature_fraction=0.5,
            bagging_freq=5,
            bagging_seed=2020
        )
        self.model = self.model.fit(
            self.train_X, self.train_log_y,
            eval_set=(self.val_X, self.val_log_y),
            early_stopping_rounds=100, verbose=100
        )

    def lgbm_predict(self, X):
        """Predict log1p revenue using the best early-stopped iteration.

        BUG FIX: ``best_iteration_`` was passed positionally, which bound it
        to the ``raw_score`` parameter of ``LGBMModel.predict``.
        """
        return self.model.predict(X, num_iteration=self.model.best_iteration_)

    def lgbm_train(self):
        self.lgbm_model()

    def predict(self):
        # BUG FIX: results were stored as ``prev_*`` while the evaluation
        # methods read ``pred_*``.
        self.pred_val = self.lgbm_predict(self.val_X)
        self.pred_test = self.lgbm_predict(self.test_X)

    def evaluate_val_prediction(self):
        """Aggregate validation predictions per visitor and compute RMSE on
        log1p revenue; stores ``rsme_val`` and ``pred_val_df``."""
        pred_val = self.pred_val
        pred_val[pred_val < 0] = 0  # revenue cannot be negative
        pred_val_data = {
            'fullVisitorId': self.val_id,
            'transactionRevenue': self.val_y,
            'predictedRevenue': np.expm1(pred_val)
        }
        pred_val_df = pd.DataFrame(pred_val_data)
        pred_val_df = pred_val_df.groupby('fullVisitorId')
        # BUG FIX: tuple-style column selection on a GroupBy is deprecated
        # and removed in pandas 2.x; select with a list instead.
        pred_val_df = pred_val_df[['transactionRevenue', 'predictedRevenue']]\
            .sum().reset_index()
        rsme_val = np.sqrt(
            mean_squared_error(
                np.log1p(pred_val_df['transactionRevenue'].values),
                np.log1p(pred_val_df['predictedRevenue'].values)
            )
        )
        self.rsme_val = rsme_val
        # BUG FIX: was stored as ``prev_val_df`` although write_to_csv reads
        # ``pred_val_df``.
        self.pred_val_df = pred_val_df

    def evaluate_test_prediction(self):
        """Aggregate test predictions per visitor; stores ``pred_test_df``."""
        pred_test = self.pred_test
        pred_test[pred_test < 0] = 0  # revenue cannot be negative
        pred_test_data = {
            'fullVisitorId': self.test_id,
            'predictedRevenue': np.expm1(pred_test)
        }
        pred_test_df = pd.DataFrame(pred_test_data)
        pred_test_df = pred_test_df.groupby('fullVisitorId')
        pred_test_df = pred_test_df['predictedRevenue'].sum().reset_index()
        self.pred_test_df = pred_test_df

    def write_to_csv(self):
        """Write the per-visitor val and test prediction frames as zipped
        CSVs."""
        self.pred_val_df.to_csv(
            self.pred_val_file, index=False, compression='zip'
        )
        self.pred_test_df.to_csv(
            self.pred_test_file, index=False, compression='zip'
        )