def _train_sklgb(self, data):
    """Train a model using the sklearn-style lightgbm API.

    Args:
        data (tuple of DataFrame): (X_train, X_test, y_train, y_test).

    Returns:
        model_result (dict): result of trained model.
            - model (:obj:): trained model,
            - importances (DataFrame): feature importance table,
            - eval_score (float): score of model using `eval_func`,
              or None when no `eval_func` is configured.

    Raises:
        ValueError: if `self.task` is neither "regression" nor
            "classification".
    """
    logging.info("using lightgbm sklearn api...")
    X_train, X_test, y_train, y_test = data
    # Restrict both splits to the configured feature subset.
    X_train, X_test = X_train[self.features], X_test[self.features]

    if self.task == "regression":
        model = LGBMRegressor(**self.trn_params, n_jobs=-1)
    elif self.task == "classification":
        model = LGBMClassifier(**self.trn_params, n_jobs=-1)
    else:
        # Fail fast with a clear message instead of hitting an
        # UnboundLocalError on `model.fit` below.
        raise ValueError("unsupported task type: %r" % self.task)

    # Train model; both splits are monitored so early stopping is driven
    # by the held-out (test) fold.
    model.fit(X_train, y_train,
              feature_name=self.features,
              categorical_feature=self.cat_features,
              eval_set=[(X_train, y_train), (X_test, y_test)],
              early_stopping_rounds=self.early_stopping_rounds,
              verbose=self.verbose_eval)

    # Collect per-feature importances reported by the fitted booster.
    importances = pd.DataFrame()
    importances["feature"] = X_train.columns.values.tolist()
    importances["importance"] = model.feature_importances_

    # Score the held-out split with the user-supplied metric, if any.
    if self.eval_func:
        if self.task == "classification":
            # eval_func receives per-class probabilities for classification.
            y_test_pred = model.predict_proba(X_test)
        else:
            y_test_pred = model.predict(X_test)
        eval_score = self.eval_func(y_test, y_test_pred)
        logging.info("score on validation is %f" % eval_score)
    else:
        eval_score = None

    model_result = {
        "model": model,
        "importances": importances,
        "eval_score": eval_score
    }
    return model_result
class Stacking(BaseEnsembleModel):
    """Stacked-generalization ensemble.

    Out-of-fold predictions of the selected base models form the feature
    matrix (`feature_p2`) on which a meta-learner is trained; fold-wise
    base estimators are pickled so they can be re-applied at predict time.
    """

    def __init__(self, stats, data_node,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm',
                 kfold=5):
        super().__init__(stats=stats,
                         data_node=data_node,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        # Number of folds used both to build out-of-fold meta-features
        # and to average fold models during prediction.
        self.kfold = kfold
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            warnings.warn(
                "Lightgbm is not imported! Stacking will use linear model instead!"
            )
            meta_learner = 'linear'
        self.meta_method = meta_learner

        # We use LightGBM as the default meta-learner.
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                # Import from the public path; the private module
                # `sklearn.linear_model.logistic` was removed in sklearn 0.24.
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                # Public path; `sklearn.ensemble.gradient_boosting` was removed.
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4,
                                                   learning_rate=0.05,
                                                   n_estimators=150,
                                                   n_jobs=1)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=70,
                                                  n_jobs=1)

    def fit(self, data):
        """Train fold-wise base models, then fit the meta-learner.

        For each selected base configuration, `kfold` estimators are
        trained on the training folds; their held-out predictions fill
        `feature_p2`, and each fold estimator is pickled for reuse in
        `get_feature`.
        """
        # Split training data for phase 1 (base models) and phase 2 (meta).
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data.
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = data.copy_()
                _node = construct_node(_node, op_list, mode='train')
                X, y = _node.data
                if self.base_model_mask[model_cnt] == 1:
                    for j, (train, test) in enumerate(kf.split(X, y)):
                        x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[
                            test]
                        estimator = fetch_predict_estimator(
                            self.task_type,
                            algo_id,
                            config,
                            x_p1,
                            y_p1,
                            weight_balance=data.enable_balance,
                            data_balance=data.data_balance)
                        # Persist the fold estimator so get_feature() can
                        # average its predictions later.
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'wb') as f:
                            pkl.dump(estimator, f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(x_p2)
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary classification: keep only the
                                # positive-class column.
                                n_dim = 1
                            # Initialize training matrix for phase 2.
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            if n_dim == 1:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1)
                                           * n_dim] = pred[:, 1:2]
                            else:
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1)
                                           * n_dim] = pred
                        else:
                            pred = estimator.predict(x_p2).reshape(-1, 1)
                            n_dim = 1
                            # Initialize training matrix for phase 2.
                            if feature_p2 is None:
                                num_samples = len(train) + len(test)
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) *
                                       n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        # `y` here is the label vector of the last processed node; all nodes
        # are presumed to carry the same labels — TODO confirm.
        self.meta_learner.fit(feature_p2, y)

        return self

    def get_feature(self, data):
        """Build the meta-feature matrix for `data`.

        Averages, over the `kfold` pickled fold estimators of each selected
        base model, the (probability) predictions on `data`.
        """
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats.keys():
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, path) in enumerate(model_to_eval):
                with open(path, 'rb') as f:
                    op_list, model, _ = pkl.load(f)
                _node = data.copy_()
                _node = construct_node(_node, op_list)
                if self.base_model_mask[model_cnt] == 1:
                    for j in range(self.kfold):
                        with open(
                                os.path.join(
                                    self.output_dir, '%s-model%d_part%d' %
                                    (self.timestamp, model_cnt, j)),
                                'rb') as f:
                            estimator = pkl.load(f)
                        if self.task_type in CLS_TASKS:
                            pred = estimator.predict_proba(_node.data[0])
                            n_dim = np.array(pred).shape[1]
                            if n_dim == 2:
                                # Binary: single positive-class column.
                                n_dim = 1
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Accumulate the fold-average predictions.
                            if n_dim == 1:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred[:, 1:2] / self.kfold
                            else:
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred / self.kfold
                        else:
                            pred = estimator.predict(_node.data[0]).reshape(
                                -1, 1)
                            n_dim = 1
                            # Initialize matrix for phase-2 features.
                            if feature_p2 is None:
                                num_samples = len(_node.data[0])
                                feature_p2 = np.zeros(
                                    (num_samples, self.ensemble_size * n_dim))
                            # Accumulate the fold-average predictions.
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = \
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] + pred / self.kfold
                    suc_cnt += 1
                model_cnt += 1
        return feature_p2

    def predict(self, data):
        """Predict on `data` by feeding the meta-features to the meta-learner."""
        feature_p2 = self.get_feature(data)
        # Get predictions from meta-learner.
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def get_ens_model_info(self):
        """Summarize the ensemble: method, member configs/paths, meta-learner.

        NOTE(review): the reported path pattern '%s-stacking-model%d' does not
        match the '%s-model%d_part%d' files written by fit() — verify which
        one downstream consumers expect.
        """
        model_cnt = 0
        ens_info = {}
        ens_config = []
        for algo_id in self.stats:
            model_to_eval = self.stats[algo_id]
            for idx, (config, _, _) in enumerate(model_to_eval):
                if not hasattr(self, 'base_model_mask'
                               ) or self.base_model_mask[model_cnt] == 1:
                    model_path = os.path.join(
                        self.output_dir,
                        '%s-stacking-model%d' % (self.timestamp, model_cnt))
                    ens_config.append((algo_id, config, model_path))
                model_cnt += 1
        ens_info['ensemble_method'] = 'stacking'
        ens_info['config'] = ens_config
        ens_info['meta_learner'] = self.meta_method
        return ens_info
class LightGBM(BaseRegressionModel):
    """LightGBM regressor wrapper exposing the project's model interface."""

    def __init__(self, n_estimators, learning_rate, num_leaves,
                 min_child_weight, subsample, colsample_bytree, reg_alpha,
                 reg_lambda, random_state):
        # Hyperparameters are stored verbatim; the estimator itself is
        # built lazily in fit().
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.num_leaves = num_leaves
        self.subsample = subsample
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.min_child_weight = min_child_weight
        self.colsample_bytree = colsample_bytree
        self.n_jobs = -1
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        """Fit an LGBMRegressor with the stored hyperparameters; return self."""
        self.estimator = LGBMRegressor(num_leaves=self.num_leaves,
                                       learning_rate=self.learning_rate,
                                       n_estimators=self.n_estimators,
                                       min_child_weight=self.min_child_weight,
                                       subsample=self.subsample,
                                       colsample_bytree=self.colsample_bytree,
                                       reg_alpha=self.reg_alpha,
                                       reg_lambda=self.reg_lambda,
                                       random_state=self.random_state,
                                       n_jobs=self.n_jobs)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Predict targets for X; raises NotImplementedError if not fitted."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Delegate to the underlying estimator's predict_proba.

        NOTE(review): LGBMRegressor has no predict_proba, so a call on a
        fitted model raises AttributeError — presumably kept only for
        interface parity with classifier components; confirm.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Describe the component's capabilities for the framework."""
        return {
            'shortname': 'LightGBM Regressor',
            'name': 'LightGBM Regressor',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Return the hyperparameter search space for `optimizer`.

        Args:
            dataset_properties: unused, kept for interface compatibility.
            optimizer (str): 'smac' (ConfigSpace) or 'tpe' (hyperopt).

        Returns:
            ConfigurationSpace for 'smac', a hyperopt space dict for 'tpe'.

        Raises:
            ValueError: for an unknown optimizer name (previously this
                silently returned None).
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            n_estimators = UniformIntegerHyperparameter("n_estimators",
                                                        100,
                                                        1000,
                                                        default_value=500)
            num_leaves = UniformIntegerHyperparameter("num_leaves",
                                                      31,
                                                      1023,
                                                      default_value=31)
            learning_rate = UniformFloatHyperparameter("learning_rate",
                                                       0.025,
                                                       0.3,
                                                       default_value=0.1,
                                                       log=True)
            min_child_weight = UniformIntegerHyperparameter(
                "min_child_weight", 1, 10, default_value=1)
            subsample = UniformFloatHyperparameter("subsample",
                                                   0.5,
                                                   1,
                                                   default_value=1)
            colsample_bytree = UniformFloatHyperparameter("colsample_bytree",
                                                          0.5,
                                                          1,
                                                          default_value=1)
            reg_alpha = UniformFloatHyperparameter('reg_alpha',
                                                   1e-10,
                                                   10,
                                                   log=True,
                                                   default_value=1e-10)
            reg_lambda = UniformFloatHyperparameter("reg_lambda",
                                                    1e-10,
                                                    10,
                                                    log=True,
                                                    default_value=1e-10)
            cs.add_hyperparameters([
                n_estimators, num_leaves, learning_rate, min_child_weight,
                subsample, colsample_bytree, reg_alpha, reg_lambda
            ])
            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            # hp.randint draws in [0, n), hence the additive offsets to
            # match the SMAC ranges above.
            space = {
                'n_estimators': hp.randint('lgb_n_estimators', 901) + 100,
                'num_leaves': hp.randint('lgb_num_leaves', 993) + 31,
                'learning_rate': hp.loguniform('lgb_learning_rate',
                                               np.log(0.025), np.log(0.3)),
                'min_child_weight': hp.randint('lgb_min_child_weight', 10) + 1,
                'subsample': hp.uniform('lgb_subsample', 0.5, 1),
                'colsample_bytree': hp.uniform('lgb_colsample_bytree', 0.5, 1),
                'reg_alpha': hp.loguniform('lgb_reg_alpha', np.log(1e-10),
                                           np.log(10)),
                'reg_lambda': hp.loguniform('lgb_reg_lambda', np.log(1e-10),
                                            np.log(10))
            }
            # Removed: an `init_trial` dict that was built here but never
            # used or returned (dead code).
            return space
        else:
            raise ValueError("Unknown optimizer: %r" % optimizer)
# NOTE(review): this chunk is a fragment of a per-month cross-validation
# script. The first half clearly belongs inside a loop over `mes` (month
# codes) — the loop header, `Xt`, `drop_cols`, `test_probs`, `train_probs`
# and `test_preds` are defined outside this chunk; verify against the full
# file. The last statement is truncated mid-expression.

# Labels aligned to the current training slice via its index.
yt = y_train.loc[Xt.index, "target"]
Xt = Xt.drop(drop_cols, axis=1)
# Validation slice: rows of the current month `mes`.
Xv = X_train[X_train.codmes == mes]
yv = y_train.loc[Xv.index, "target"]
learner = LGBMClassifier(n_estimators=5000)
# Early stopping on AUC using the month hold-out as eval fold.
learner.fit(Xt, yt, early_stopping_rounds=50, eval_metric="auc",
            eval_set=[(Xt, yt), (Xv.drop(drop_cols, axis=1), yv)],
            verbose=50)
gc.collect()
# Positive-class probability on the test set, one column per month fold.
test_probs.append(
    pd.Series(learner.predict_proba(X_test.drop(drop_cols, axis=1))[:, -1],
              index=X_test.index, name="fold_" + str(mes)))
# Out-of-fold probabilities for the validation month.
train_probs.append(
    pd.Series(learner.predict_proba(Xv.drop(drop_cols, axis=1))[:, -1],
              index=Xv.index, name="probs"))
gc.collect()
# After the loop: average the per-month test columns; concatenate the
# out-of-fold train probabilities.
test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
train_probs = pd.concat(train_probs)
test = pd.concat([test_probs.rename("probs"), test_preds.rename("preds")],
                 axis=1)
# NOTE(review): statement truncated in this chunk — continues elsewhere.
train = pd.concat([train_probs.rename("probs"),
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    """Train LightGBM on the 'Next_Premium' dataset and write submissions.

    Stage 1 trains a single hold-out LGBMRegressor and writes two
    submission CSVs; stage 2 attempts a k-fold LGBMClassifier run.

    NOTE(review): several defects in this function — flagged below rather
    than fixed, because the intended semantics are contradictory:
      * `cross_validation.train_test_split` uses the pre-0.18 sklearn
        module, removed long ago (now `sklearn.model_selection`).
      * The fold loop reads a 'TARGET' column, but this dataframe's
        target is 'Next_Premium' — likely KeyError.
      * `oof_preds` / `sub_preds` are reused from stage 1 with different
        shapes/meanings (`oof_preds` has hold-out length but is indexed
        by `valid_idx` positions of the full train set; `sub_preds`
        accumulates on top of stage-1 regression predictions).
      * A classifier with AUC is applied to what appears to be a
        continuous regression target.
    """
    # Divide in training/validation and test data: test rows are those
    # with a missing target.
    train_df = df[df['Next_Premium'].notnull()]
    test_df = df[df['Next_Premium'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    feature_importance_df = pd.DataFrame()
    # Feature list: everything except identifiers, the target, and raw
    # policy/claim columns.
    feats = [f for f in train_df.columns if f not in ['Policy_Number','Next_Premium',"Insured's_ID",
                                                      'Prior_Policy_Number', 'Cancellation',
                                                      'Vehicle_identifier_number', 'Vehicle_Make_and_Model1',
                                                      'Vehicle_Make_and_Model2', 'Imported_or_Domestic_Car',
                                                      'Coding_of_Vehicle_Branding_&_Type', 'qpt', 'fpt',
                                                      'Main_Insurance_Coverage_Group', 'Insurance_Coverage',
                                                      'Distribution_Channel', 'pdmg_acc', 'fassured', 'ibirth',
                                                      'fsex', 'fmarriage', 'aassured_zip', 'iply_area', 'dbirth',
                                                      'fequipment1', 'fequipment2', 'fequipment3', 'fequipment4',
                                                      'fequipment5', 'fequipment6', 'fequipment9', 'nequipment9',
                                                      'Claim_Number', 'Nature_of_the_claim', "Driver's_Gender",
                                                      "Driver's_Relationship_with_Insured", 'DOB_of_Driver',
                                                      'Marital_Status_of_Driver', 'Accident_Date', 'Cause_of_Loss',
                                                      'Coverage', 'Vehicle_identifier_claim',
                                                      'Claim_Status_(close,_open,_reopen_etc)', 'Accident_area',
                                                      'number_of_claimants', 'Accident_Time']]
    seed = 7
    test_size = 0.3
    n_fold = 1
    submission_file_name = "submission_kernel01.csv"
    submission_file_name_agg = "submission_kernel_agg.csv"
    # NOTE(review): `cross_validation` is the removed pre-0.18 sklearn
    # module — this line fails on any modern sklearn; should be
    # sklearn.model_selection.train_test_split.
    train_x, valid_x, train_y, valid_y = cross_validation.train_test_split(train_df[feats], train_df['Next_Premium'], test_size=test_size, random_state=seed)
    clf = LGBMRegressor(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=-1,
        verbose=-1)
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'mae', verbose= 100, early_stopping_rounds= 200)
    oof_preds = clf.predict(valid_x, num_iteration=clf.best_iteration_)
    sub_preds = clf.predict(test_df[feats], num_iteration=clf.best_iteration_)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # NOTE(review): label says "AUC" but the value printed is the MAE.
    print('Fold %2d AUC : %.6f' % (n_fold + 1, mean_absolute_error(valid_y, oof_preds)))
    test_df['Next_Premium'] = sub_preds
    #est_df[['Policy_Number', 'Next_Premium']].to_csv(submission_file_name, index= False)
    test_df.to_csv(submission_file_name)
    # Aggregate predictions to one mean premium per policy for submission.
    NP_aggregations = {'Next_Premium': ['mean']}
    test_df_agg = test_df.groupby('Policy_Number').agg(NP_aggregations)
    test_df_submit = pd.read_csv('../data/testing-set.csv')
    test_df_submit = test_df_submit.join(test_df_agg, how='left', on='Policy_Number', rsuffix='_agg')
    test_df_submit.to_csv(submission_file_name_agg, index= False)
    del clf, train_x, train_y, valid_x, valid_y
    # NOTE(review): the k-fold stage below looks copy-pasted from a binary
    # classification kernel ('TARGET', 'SK_ID_CURR', AUC) and is unlikely
    # to run against this regression dataset — verify before relying on it.
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['Next_Premium'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1)
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric= 'auc', verbose= 100, early_stopping_rounds= 200)
        # NOTE(review): oof_preds/sub_preds are leftovers from stage 1 with
        # incompatible shapes — likely IndexError / silently wrong values.
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()
    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df
def fit(self, x):
    """
    X can be pd.Dataframe, np.ndarray or sparse. y has to be pd.series

    Runs k-fold CV with a LightGBM regressor/classifier (per
    `self.objective`), keeps every fold model in `self.models`, collects
    out-of-fold predictions and fold-averaged predictions on the separate
    validation set, and accumulates per-fold feature importances.

    NOTE(review): this method relies on bare `except:` probing in several
    places (sparse `.todense()` conversion, `predict_proba` vs `predict`
    fallback). For classifiers, `oof_preds[val_idx] = predict_proba(...)`
    raises a shape ValueError, which the bare except converts into a
    `predict` call — the control flow is exception-driven by design (or
    by accident); do not narrow these excepts without testing.
    """
    train_data = x['train_data']
    val_data = x['val_data']
    self.models = []
    # For CV: one out-of-fold slot per training row.
    oof_preds = np.zeros(len(train_data[0]))
    X_data = train_data[0]
    y_data = train_data[1]
    # Validate after CV
    X_val = val_data[0]
    # Densify sparse labels if needed; otherwise plain conversion.
    try:
        y_val = np.array(val_data[1].todense()).ravel()
    except:
        y_val = np.array(val_data[1]).ravel()
    is_sparse = scipy.sparse.issparse(X_data)
    # Create dataframe to keep feature importances for each fold
    feature_importances = pd.DataFrame()
    if not is_sparse:
        # DataFrame input: take feature names from its columns.
        self.features = X_data.columns
    if self.features is not None:
        if not len(self.features) == X_data.shape[1]:
            raise ValueError(
                'Number of features must be the same as n_columns in X.')
        # Create column for features
        feature_importances['feature'] = self.features
    cv_metrics = list()  # NOTE(review): collected nowhere; currently unused.
    n_folds = 0
    folds = None
    val_preds = None
    # `self.folds` may be a splitter object or a precomputed list of
    # (train_idx, val_idx) pairs.
    if not isinstance(self.folds, list):
        folds = self.folds.split(X_data, y_data)
    else:
        folds = self.folds
    for i_fold, (trn_idx, val_idx) in enumerate(folds):
        n_folds += 1
        # Sparse matrices are indexed positionally; DataFrames via iloc.
        X_trn_fold = X_data[trn_idx] if is_sparse else X_data.iloc[trn_idx]
        X_val_fold = X_data[val_idx] if is_sparse else X_data.iloc[val_idx]
        y_val_fold = None
        y_trn_fold = None
        if isinstance(y_data, pd.Series):
            y_trn_fold = y_data.iloc[trn_idx]
            y_val_fold = y_data.iloc[val_idx]
        else:
            y_trn_fold = y_data[trn_idx]
            y_val_fold = y_data[val_idx]
        # Densify sparse label slices; fall back to plain conversion.
        try:
            y_trn_fold = np.array(y_trn_fold.todense()).ravel()
            y_val_fold = np.array(y_val_fold.todense()).ravel()
        except:
            y_trn_fold = np.array(y_trn_fold).ravel()
            y_val_fold = np.array(y_val_fold).ravel()
        logger.info('Training on fold {}'.format(i_fold))
        """trn_data = lgb.Dataset(X_trn_fold, label = y_trn_fold)
        val_data = lgb.Dataset(X_val_fold, label = y_val_fold)"""
        # This is validation in CV, not validation set
        # Training for this fold
        #print(self.lgbm_hparams)
        clf = LGBMRegressor(
            **self.lgbm_hparams
        ) if self.objective == 'regression' else LGBMClassifier(
            **self.lgbm_hparams)
        clf = clf.fit(X=X_trn_fold,
                      y=y_trn_fold,
                      eval_set=[(X_trn_fold, y_trn_fold),
                                (X_val_fold, y_val_fold)],
                      early_stopping_rounds=250,
                      verbose=200)
        # Keep models of each fold
        self.models.append(clf)
        feature_importances['fold_{}'.format(
            i_fold)] = clf.feature_importances_
        # Out-of-fold predictions; see NOTE(review) in the docstring about
        # the exception-driven predict_proba fallback.
        try:
            oof_preds[val_idx] = clf.predict_proba(X_val_fold)
        except:
            oof_preds[val_idx] = clf.predict(X_val_fold)
        # Validation for this fold: sum now, divide by n_folds after loop.
        if X_val is not None:
            if val_preds is None:
                try:
                    val_preds = clf.predict_proba(X_val)
                except:
                    val_preds = clf.predict(X_val)
            else:
                try:
                    val_preds += clf.predict_proba(X_val)
                except:
                    val_preds += clf.predict(X_val)
    logger.info('Training has finished.')
    #logger.info(f'Mean CV {params["metric"]}: {np.mean(cv_metrics)}')
    # Validation
    val_metric = None
    if X_val is not None:
        # Average the fold predictions on the validation set.
        val_preds /= n_folds
        logger.info('Calculating validation metric...')
        val_metric = self.get_metric(y_val, val_preds)
        logger.info(f'Validation {self.metric}: {val_metric}')
    # Total importance = sum over the per-fold columns.
    feature_importances['importance'] = \
        feature_importances[[f'fold_{i}' for i in range(n_folds)]].sum(axis = 1)
    # Keep only non-fold columns ('feature' and 'importance').
    cols_to_keep = [
        col for col in feature_importances.columns if 'fold' not in col
    ]
    self.feature_importances = feature_importances[cols_to_keep]
    if 'feature' in self.feature_importances.columns:
        self.feature_importances.sort_values(by='importance',
                                             ascending=False,
                                             inplace=True)
    return {
        #'cv_metrics': cv_metrics,
        'feature_importances': feature_importances,
        'val_preds': val_preds,
        'oof_preds': oof_preds,
        'metric': val_metric
    }
class Blending(BaseEnsembleModel):
    """Blending ensemble.

    Each base model is trained on one part of the data; its predictions on
    the held-out part become features for a meta-learner. Trained base
    estimators are pickled so predictions can be reproduced later.
    """

    def __init__(self, stats,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='lightgbm'):
        super().__init__(stats=stats,
                         ensemble_method='blending',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            warnings.warn("Lightgbm is not imported! Blending will use linear model instead!")
            meta_learner = 'linear'
        self.meta_method = meta_learner

        # We use LightGBM as the default meta-learner.
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                # Import from the public path; the private module
                # `sklearn.linear_model.logistic` was removed in sklearn 0.24.
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                # Public path; `sklearn.ensemble.gradient_boosting` was removed.
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(learning_rate=0.05,
                                                               subsample=0.7,
                                                               max_depth=4,
                                                               n_estimators=250)
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMClassifier
                self.meta_learner = LGBMClassifier(max_depth=4,
                                                   learning_rate=0.05,
                                                   n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'lightgbm':
                from lightgbm import LGBMRegressor
                self.meta_learner = LGBMRegressor(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=70)

    def fit(self, data):
        """Train the base models on phase-1 data and the meta-learner on
        their phase-2 predictions."""
        # Split training data for phase 1 and phase 2.
        test_size = 0.2

        # Train basic models using a part of training data.
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                X, y = node.data
                if self.task_type in CLS_TASKS:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              stratify=data.data[1],
                                                              random_state=self.seed)
                else:
                    x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                              random_state=self.seed)
                if self.base_model_mask[model_cnt] == 1:
                    estimator = fetch_predict_estimator(self.task_type, config, x_p1, y_p1,
                                                        weight_balance=node.enable_balance,
                                                        data_balance=node.data_balance
                                                        )
                    # Persist the trained base model for get_feature().
                    with open(os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: keep only the
                            # positive-class column.
                            n_dim = 1
                        # Initialize training matrix for phase 2.
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1
                        # Initialize training matrix for phase 2.
                        if feature_p2 is None:
                            num_samples = len(x_p2)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        # `y_p2` is the phase-2 label vector of the last split; all nodes
        # are presumed to share the same labels — TODO confirm.
        self.meta_learner.fit(feature_p2, y_p2)

        return self

    def get_feature(self, data, solvers):
        """Build the meta-feature matrix for `data` from the pickled base
        models, applying each model's feature-engineering pipeline first."""
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                test_node = solvers[algo_id].optimizer['fe'].apply(data, node)
                if self.base_model_mask[model_cnt] == 1:
                    with open(os.path.join(self.output_dir, '%s-blending-model%d' % (self.timestamp, model_cnt)),
                              'rb') as f:
                        estimator = pkl.load(f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(test_node.data[0])
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: single column.
                            n_dim = 1
                        # Initialize matrix for phase-2 features.
                        if feature_p2 is None:
                            num_samples = len(data.data[0])
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(test_node.data[0]).reshape(-1, 1)
                        n_dim = 1
                        # Initialize matrix for phase-2 features.
                        if feature_p2 is None:
                            num_samples = len(data.data[0])
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    suc_cnt += 1
                model_cnt += 1
        return feature_p2

    def predict(self, data, solvers):
        """Predict on `data` by feeding the meta-features to the meta-learner."""
        feature_p2 = self.get_feature(data, solvers)
        # Get predictions from meta-learner.
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def get_ens_model_info(self):
        """Summarize the ensemble: member configs/paths plus meta-learner."""
        model_cnt = 0
        ens_info = {}
        ens_config = []
        for algo_id in self.stats["include_algorithms"]:
            model_to_eval = self.stats[algo_id]['model_to_eval']
            for idx, (node, config) in enumerate(model_to_eval):
                if not hasattr(self, 'base_model_mask') or self.base_model_mask[model_cnt] == 1:
                    model_path = os.path.join(self.output_dir,
                                              '%s-blending-model%d' % (self.timestamp, model_cnt))
                    ens_config.append((algo_id, node.config, config, model_path))
                model_cnt += 1
        ens_info['ensemble_method'] = 'blending'
        ens_info['config'] = ens_config
        ens_info['meta_learner'] = self.meta_method
        ens_info['meta_model'] = self.meta_learner
        return ens_info