def fit(self, external_cols=None):
    """Fit every base model, gather its predictions, then fit the ensemble.

    For each entry of ``self.models`` a ``self.base_predictor`` is built
    over ``self.X`` / ``self.y``, fitted, and asked for its predictions.
    The column named by the predictor's ``cv_col`` is renamed to the
    model's name and accumulated into ``self.predictions_cv``.  The
    predictor's ``fit_test()`` is then called and the predictor is stored
    in ``self.predictors`` under the model's name.  Finally
    ``self.fit_ensemble`` is invoked.

    :param external_cols: forwarded unchanged to ``self.fit_ensemble``.
    """
    # Start from a clean slate so repeated calls do not accumulate results.
    self.predictors = {}
    self.predictions_cv = pd.DataFrame()

    for position, estimator in enumerate(self.models):
        name = stringify2(estimator, position)
        if self.verbose:
            logd(name)
        started = u.t()

        predictor = self.base_predictor(
            self.X,
            self.y,
            estimator,
            scoring=self.scoring,
            n_folds=self.n_folds,
            random_state=self.random_state,
            shuffle=self.shuffle,
            n_jobs=self.predictors_n_jobs,
            preprocessor=self.preprocessor,
            verbose=self.verbose,
        )
        predictor.fit()
        cv_frame = predictor.predict()
        renaming = {predictor.cv_col: name}

        if len(self.predictions_cv):
            # Later models contribute only their own prediction column,
            # joined to the accumulated frame on the index.
            # TODO assert index is not duplicate
            addition = cv_frame[[predictor.cv_col]].rename(columns=renaming)
            self.predictions_cv = self.predictions_cv.merge(
                addition, left_index=True, right_index=True
            )
        else:
            # First model: keep the whole frame (every column it carries),
            # only renaming its prediction column.
            self.predictions_cv = cv_frame.rename(columns=renaming)

        predictor.fit_test()
        self.predictors[name] = predictor

        if self.verbose:
            logd("Fit %s in %2.2f seconds" % (name, u.td(started)))

    self.fit_ensemble(external_cols=external_cols)
def __init__(self, x, y, models, ensemble_model, scoring=None, n_folds=3, random_state=SEED, shuffle=False,
             n_jobs=-1, stratified=False, preprocessor=None, verbose=0, ensemble_grid_params=None,
             score_greater_is_better=False):
    """Store the data, the base models and the ensemble configuration.

    Nothing is fitted here; this only validates two arguments and records
    the settings on the instance.

    :param x: stored as ``self.X``.
    :param y: stored as ``self.y``.
    :param models: ``list``, ``tuple`` or ``set`` of base models.
    :param ensemble_model: an ``sklearn.base.BaseEstimator`` instance.
    :param scoring: stored as ``self.scoring``.
    :param n_folds: stored as ``self.n_folds``.
    :param random_state: stored as ``self.random_state``.
    :param shuffle: stored as ``self.shuffle``.
    :param n_jobs: stored as ``self.predictors_n_jobs``.
    :param stratified: when truthy ``self.base_predictor`` is
        ``KStratifiedPred``, otherwise ``KFoldPred``.
    :param preprocessor: stored as ``self.preprocessor``.
    :param verbose: stored as ``self.verbose``.
    :param ensemble_grid_params: stored as ``self.ensemble_grid_params``
        (presumably a parameter grid for the ensemble model -- confirm
        against the code that consumes it).
    :param score_greater_is_better: stored as
        ``self.score_greater_is_better``.
    :raises AssertionError: if ``models`` is not a list/tuple/set or
        ``ensemble_model`` is not a ``BaseEstimator`` instance.
    """
    assert isinstance(models, (list, tuple, set)), type(models)
    # Report the expected class itself; type(BaseEstimator) would only
    # print the metaclass and hide what was actually required.
    assert isinstance(ensemble_model, sklearn.base.BaseEstimator), \
        "%s != %s" % (type(ensemble_model), sklearn.base.BaseEstimator)

    # Data and models.
    self.X = x
    self.y = y
    self.models = models
    self.ensemble_model = ensemble_model

    # Settings kept for the base predictors.
    self.scoring = scoring
    self.n_folds = n_folds
    self.random_state = random_state
    self.shuffle = shuffle
    self.predictors_n_jobs = n_jobs
    self.stratified = stratified
    self.preprocessor = preprocessor
    self.verbose = verbose
    self.base_predictor = KStratifiedPred if self.stratified else KFoldPred
    # TODO n_jobs split ensemble and CV

    # Ensemble settings.
    self.ensemble_scaler = None
    self.ensemble_grid_params = ensemble_grid_params
    self.score_greater_is_better = score_greater_is_better

    self.true_col = "TRUE"
    # Materialise the names as a list: on Python 3 ``map`` returns a
    # one-shot iterator, which would be empty after its first traversal
    # and supports neither ``len`` nor indexing.
    self.cols = ["%s" % stringify2(model, position) for position, model in enumerate(models)]

    # Results, filled in later.
    self.predictions_cv = None
    self.predictions = None
    self.predictors = {}