예제 #1
0
    def fit(self):
        """Fit one model / preprocessor pair for every CV split.

        Seeds NumPy's global RNG with ``self.random_state`` and then fits
        every argument set produced by ``self.yield_cv()``: through a
        ``multiprocessing.Pool`` of ``self.n_jobs_`` workers when
        ``self.n_jobs_`` is not 1, otherwise in-process through
        ``KFoldPredBase.fit_static``. Each result is a
        ``(cv_key, model, preprocessor)`` triple that is stored in
        ``self.models`` and ``self.preprocessors`` under ``cv_key``.

        ``self.fitted`` is False while fitting is in progress and is only set
        to True once every split has been fitted.

        Returns:
            self, so calls can be chained.
        """
        t0 = u.t()
        self.fitted = False
        np.random.seed(self.random_state)
        if self.n_jobs_ != 1:
            pool = multiprocessing.Pool(self.n_jobs_)
            try:
                # Pool.map needs a concrete sequence, so materialise the
                # generator inside the try: a failure while building the
                # arguments still shuts the pool down.
                res = pool.map(_fit, list(self.yield_cv()))
            finally:
                # close() only stops new submissions; join() waits for the
                # workers to exit so no child processes outlive this call.
                pool.close()
                pool.join()
        else:
            res = [KFoldPredBase.fit_static(args) for args in self.yield_cv()]

        for icv, i_model, i_preprocessor in res:
            self.models[icv] = i_model
            self.preprocessors[icv] = i_preprocessor

        self.fitted = True
        if self.verbose:
            logd("Fit ALL CVs in %2.2f seconds" % u.td(t0))
        return self
예제 #2
0
    def fit(self, external_cols=None):
        """Fit every base model, collect its CV predictions, then fit the ensemble.

        For each entry of ``self.models`` a predictor is built with
        ``self.base_predictor``, fitted, and asked for its predictions. The
        prediction column (``predictor.cv_col``) is renamed to the model's
        name. The first non-empty prediction frame is kept whole; later
        frames contribute only their prediction column, merged on the index.
        Every predictor is then fitted via ``fit_test()`` and stored in
        ``self.predictors`` under the model name.

        Args:
            external_cols: forwarded unchanged to ``self.fit_ensemble``.
        """
        self.predictors = {}
        self.predictions_cv = pd.DataFrame()

        for position, estimator in enumerate(self.models):
            name = stringify2(estimator, position)
            if self.verbose:
                logd(name)
            started = u.t()

            predictor = self.base_predictor(
                self.X,
                self.y,
                estimator,
                scoring=self.scoring,
                n_folds=self.n_folds,
                random_state=self.random_state,
                shuffle=self.shuffle,
                n_jobs=self.predictors_n_jobs,
                preprocessor=self.preprocessor,
                verbose=self.verbose,
            )
            predictor.fit()
            cv_frame = predictor.predict()

            # The predictor's CV column is published under the model's name.
            renaming = {predictor.cv_col: name}
            if len(self.predictions_cv) == 0:
                # Nothing collected yet: keep the whole frame, not just the
                # prediction column.
                self.predictions_cv = cv_frame.rename(columns=renaming)
            else:
                only_prediction = cv_frame[[predictor.cv_col]].rename(columns=renaming)
                # TODO: assert the index has no duplicates before merging
                self.predictions_cv = self.predictions_cv.merge(
                    only_prediction, left_index=True, right_index=True
                )

            predictor.fit_test()
            self.predictors[name] = predictor
            if self.verbose:
                logd("Fit %s in %2.2f seconds" % (name, u.td(started)))

        self.fit_ensemble(external_cols=external_cols)
예제 #3
0
 def fit_test(self):
     """Fit ``self.model`` on ``self.X`` / ``self.y``.

     When ``self.preprocessor`` is truthy the features are first passed
     through its ``fit_transform``; otherwise ``self.X`` is used as is.
     ``self.fitted_test`` is False while fitting and True once it succeeds.
     """
     started = u.t()
     self.fitted_test = False

     if self.preprocessor:
         features = self.preprocessor.fit_transform(self.X)
     else:
         features = self.X

     self.model.fit(features, self.y)
     self.fitted_test = True

     if self.verbose:
         logd("Fit Test in %2.2f seconds | %s" % (u.td(started), features.shape))
예제 #4
0
    def fit_ensemble(self, external_cols=None):
        """Fit the ensemble model on the base models' CV predictions.

        The features are the ``self.cols`` columns of ``self.predictions_cv``,
        optionally extended with the columns of ``external_cols`` (each added
        as ``"ADD_<name>"``), and standardised with a freshly fitted
        ``StandardScaler`` kept in ``self.ensemble_scaler``. The target is
        ``self.predictions_cv[self.true_col]``.

        When ``self.ensemble_grid_params`` is truthy the ensemble model is
        replaced by the first element returned by ``u.get_best_model``;
        otherwise ``self.ensemble_model`` is fitted directly.

        Afterwards the ensemble's predictions on the same features are stored
        in ``self.predictions_cv["ENS"]`` and ``self.predictions_cv`` is
        reduced to ``self.cols + ["ENS", self.true_col]``.

        Args:
            external_cols: optional extra features; a ``pd.DataFrame`` or
                anything the ``pd.DataFrame`` constructor accepts.
        """
        t0 = u.t()
        if self.predictions_cv is not None:
            # Work on an explicit copy: columns are added to _x below, and
            # assigning into a frame derived from predictions_cv triggers
            # pandas' SettingWithCopyWarning.
            _x = self.predictions_cv[self.cols].copy()
        else:
            # NOTE(review): this fallback cannot complete as written -- the
            # target lookup below still subscripts predictions_cv.
            _x = pd.DataFrame()

        if external_cols is not None:
            if not isinstance(external_cols, pd.DataFrame):
                external_cols = pd.DataFrame(external_cols)
            for col in external_cols.columns:
                # Column assignment aligns on index labels, not on position.
                _x["ADD_%s" % col] = external_cols[col]

        _y = self.predictions_cv[self.true_col]
        self.ensemble_scaler = StandardScaler()
        x = self.ensemble_scaler.fit_transform(_x)
        if self.ensemble_grid_params:
            scorer = make_scorer(self.scoring, greater_is_better=self.score_greater_is_better)
            self.ensemble_model, _ = \
                u.get_best_model(self.ensemble_model, self.ensemble_grid_params, x, _y,
                                 scoring=scorer, cv=self.n_folds, refit=True)
        else:
            self.ensemble_model.fit(x, _y)

        if self.verbose:
            logd("Fit Ensemble in %2.2f seconds" % u.td(t0))
        # Predictions are made on the same rows the ensemble was fitted on.
        self.predictions_cv["ENS"] = self.ensemble_model.predict(x)
        self.predictions_cv = self.predictions_cv[self.cols + ["ENS", self.true_col]]