Example #1
    def print_training_summary(self, gs):
        print(
            'The best CV score from GridSearchCV (by default averaging across k-fold CV) for '
            + self.output_column + ' is:')
        if self.took_log_of_y:
            print(
                '    Note that this score is calculated using the natural logs of the y values.'
            )
        print(gs.best_score_)
        print('The best params were')

        # Replace the raw 'model' object in what we print with its name: the object itself is redundant with the model name, and is difficult to read quickly in a list since it's a python object.
        if 'model' in gs.best_params_:
            printing_copy = {}
            for k, v in gs.best_params_.items():
                if k != 'model':
                    printing_copy[k] = v
                else:
                    printing_copy[k] = utils_models.get_name_from_model(v)
        else:
            printing_copy = gs.best_params_

        print(printing_copy)

        if self.verbose:
            print('Here are all the hyperparameters that were tried:')
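            # grid_scores_ is the old scikit-learn attribute; it was deprecated in 0.18 and removed in 0.20 in favor of cv_results_.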
            raw_scores = gs.grid_scores_
            sorted_scores = sorted(raw_scores,
                                   key=lambda x: x[1],
                                   reverse=True)
            for score in sorted_scores:
                for k, v in score[0].items():
                    if k == 'model':
                        score[0][k] = utils_models.get_name_from_model(v)
                print(score)
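
(get_name_from_model is used throughout these examples but never shown; a minimal sketch, assuming it simply derives a printable name from the estimator's class:)

    def get_name_from_model(model):
        # Hypothetical helper, not necessarily the library's actual implementation:
        # report the estimator's class name as its display name.
        return type(model).__name__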
Example #2
    def fit(self, X, y):
        self.model_name = get_name_from_model(self.model)

        # if self.model_name[:3] == 'XGB' and scipy.sparse.issparse(X):
        #     ones = [[1] for x in range(X.shape[0])]
        #     # Trying to force XGBoost to play nice with sparse matrices
        #     X_fit = scipy.sparse.hstack((X, ones))

        # else:

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge',
                'LassoLars',
                'OrthogonalMatchingPursuit',
                'ARDRegression',
                'Perceptron',
                'PassiveAggressiveClassifier',
                'SGDClassifier',
                'RidgeClassifier',
                'LogisticRegression',
        ]:
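            # These models are densified before fitting; presumably they (or the solvers used here) did not reliably accept scipy sparse input.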
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

        #     num_cols = X_fit.shape[1]
        #     kwargs = {
        #         'num_cols':num_cols
        #         , 'nb_epoch': 20
        #         , 'batch_size': 10
        #         , 'verbose': 1
        #     }
        #     model_params = self.model.get_params()
        #     del model_params['build_fn']
        #     for k, v in model_params.items():
        #         if k not in kwargs:
        #             kwargs[k] = v
        #     if self.type_of_estimator == 'regressor':
        #         self.model = KerasRegressor(build_fn=make_deep_learning_model, **kwargs)

        try:
            self.model.fit(X_fit, y)
        except TypeError as e:
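            # Some estimators raise TypeError on sparse input; retry with a dense matrix.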
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        return self
Example #3
    def fit(self, X, y):
        self.model_name = get_name_from_model(self.model)

        # if self.model_name[:3] == 'XGB' and scipy.sparse.issparse(X):
        #     ones = [[1] for x in range(X.shape[0])]
        #     # Trying to force XGBoost to play nice with sparse matrices
        #     X_fit = scipy.sparse.hstack((X, ones))

        # else:

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':
                if keras_installed:

                    # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                    num_cols = X_fit.shape[1]

                    model_params = self.model.get_params()
                    del model_params['build_fn']

                    if self.type_of_estimator == 'regressor':
                        self.model = KerasRegressor(
                            build_fn=utils_models.make_deep_learning_model,
                            num_cols=num_cols,
                            **model_params)
                    elif self.type_of_estimator == 'classifier':
                        self.model = KerasClassifier(
                            build_fn=utils_models.make_deep_learning_classifier,
                            num_cols=num_cols,
                            **model_params)
                else:
                    print(
                        'WARNING: We did not detect that Keras was available.')
                    raise TypeError(
                        'A DeepLearning model was requested, but Keras was not available to import'
                    )

        try:
            if self.model_name[:12] == 'DeepLearning':

                print(
                    'Stopping training early if we have not seen an improvement in training loss in 25 epochs'
                )
                from keras.callbacks import EarlyStopping
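                # monitor='loss' watches the training loss itself (no validation split here); patience=25 allows 25 epochs without improvement before stopping.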
                early_stopping = EarlyStopping(monitor='loss',
                                               patience=25,
                                               verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])

            else:
                self.model.fit(X_fit, y)

        except TypeError as e:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt as e:
            pass

        return self
Example #4
    def fit(self, X, y):
        global keras_imported, KerasRegressor, KerasClassifier, EarlyStopping, ModelCheckpoint, TerminateOnNaN, keras_load_model
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':
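                # Keras/TensorFlow are imported lazily, so the heavy import cost is only paid when a DeepLearning model is actually used.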
                if not keras_imported:
                    # Suppress some level of logs
                    os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                    from keras.callbacks import EarlyStopping, ModelCheckpoint, TerminateOnNaN
                    from keras.models import load_model as keras_load_model
                    from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier

                    keras_imported = True

                # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                num_cols = X_fit.shape[1]

                model_params = self.model.get_params()
                del model_params['build_fn']
                # These keys may or may not be present in the params; discard them if they are.
                model_params.pop('feature_learning', None)
                model_params.pop('num_cols', None)

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(
                        build_fn=utils_models.make_deep_learning_model,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(
                        build_fn=utils_models.make_deep_learning_classifier,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)

        if self.model_name[:12] == 'DeepLearning':
            try:

                if self.is_hp_search == True:
                    patience = 5
                    verbose = 0
                else:
                    patience = 25
                    verbose = 2

                X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)
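                # get_X_test returns either the X_test/y_test the user passed in or a random holdout split from the training data.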
                try:
                    X_test = X_test.toarray()
                except AttributeError as e:
                    pass
                if not self.is_hp_search:
                    print(
                        '\nWe will stop training early if we have not seen an improvement in validation loss in {} epochs'
                        .format(patience))
                    print(
                        'To measure validation loss, we will split off a random 10 percent of your training data set'
                    )

                early_stopping = EarlyStopping(monitor='val_loss',
                                               patience=patience,
                                               verbose=verbose)
                terminate_on_nan = TerminateOnNaN()

                now_time = datetime.datetime.now()
                time_string = '{}_{}_{}_{}_{}'.format(
                    now_time.year, now_time.month, now_time.day,
                    now_time.hour, now_time.minute)

                temp_file_name = 'tmp_dl_model_checkpoint_' + time_string + str(
                    random.random()) + '.h5'
                model_checkpoint = ModelCheckpoint(temp_file_name,
                                                   monitor='val_loss',
                                                   save_best_only=True,
                                                   mode='min',
                                                   period=1)

                callbacks = [early_stopping, terminate_on_nan]
                if not self.is_hp_search:
                    callbacks.append(model_checkpoint)

                self.model.fit(X_fit,
                               y,
                               callbacks=callbacks,
                               validation_data=(X_test, y_test),
                               verbose=verbose)

                # TODO: give some kind of logging on how the model did here! best epoch, best accuracy, etc.
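                # EarlyStopping leaves the network at its final epoch's weights, so reload the best checkpoint that ModelCheckpoint saved.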

                if self.is_hp_search is False:
                    self.model = keras_load_model(temp_file_name)

                try:
                    os.remove(temp_file_name)
                except OSError as e:
                    pass
            except KeyboardInterrupt as e:
                print(
                    'Stopping training at this point because we heard a KeyboardInterrupt'
                )
                print(
                    'If the deep learning model is functional at this point, we will output the model in its latest form'
                )
                print(
                    'Note that this feature is an unofficial beta-release feature that is known to fail on occasion'
                )

                if self.is_hp_search is False:
                    self.model = keras_load_model(temp_file_name)
                try:
                    os.remove(temp_file_name)
                except OSError as e:
                    pass

        elif self.model_name[:4] == 'LGBM':
            X_fit = X.toarray()

            X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)

            try:
                X_test = X_test.toarray()
            except AttributeError as e:
                pass

            if self.type_of_estimator == 'regressor':
                eval_metric = 'rmse'
            elif self.type_of_estimator == 'classifier':
                if len(set(y_test)) > 2:
                    eval_metric = 'multi_logloss'
                else:
                    eval_metric = 'binary_logloss'

            verbose = not self.is_hp_search

            if self.X_test is not None:
                eval_name = 'X_test_the_user_passed_in'
            else:
                eval_name = 'random_holdout_set_from_training_data'

            cat_feature_indices = self.get_categorical_feature_indices()
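            # Pass categorical_feature only when categorical columns were found; LightGBM then handles those columns natively.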
            if cat_feature_indices is None:
                self.model.fit(X_fit,
                               y,
                               eval_set=[(X_test, y_test)],
                               early_stopping_rounds=100,
                               eval_metric=eval_metric,
                               eval_names=[eval_name],
                               verbose=verbose)
            else:
                self.model.fit(X_fit,
                               y,
                               eval_set=[(X_test, y_test)],
                               early_stopping_rounds=100,
                               eval_metric=eval_metric,
                               eval_names=[eval_name],
                               categorical_feature=cat_feature_indices,
                               verbose=verbose)

        elif self.model_name[:8] == 'CatBoost':
            X_fit = X_fit.toarray()

            if self.type_of_estimator == 'classifier' and len(
                    pd.Series(y).unique()) > 2:
                # TODO: we might have to modify the format of the y values, converting them all to ints, then back again (sklearn has a useful inverse_transform on some preprocessing classes)
                self.model.set_params(loss_function='MultiClass')

            cat_feature_indices = self.get_categorical_feature_indices()

            self.model.fit(X_fit, y, cat_features=cat_feature_indices)

        elif self.model_name[:16] == 'GradientBoosting':
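            # Scikit-learn releases at or below 0.18.1 apparently could not fit GradientBoosting on sparse input, hence the version gate before densifying.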
            if sklearn_version <= '0.18.1':
                X_fit = X_fit.toarray()

            patience = 20
            best_val_loss = -10000000000
            num_worse_rounds = 0
            best_model = deepcopy(self.model)
            X_fit, y, X_test, y_test = self.get_X_test(X_fit, y)

            # Add a variable number of trees each time, depending how far into the process we are
            if os.environ.get('is_test_suite', False) == 'True':
                num_iters = (list(range(1, 50, 1)) + list(range(50, 100, 2))
                             + list(range(100, 250, 3)))
            else:
                num_iters = (list(range(1, 50, 1)) + list(range(50, 100, 2))
                             + list(range(100, 250, 3)) + list(range(250, 500, 5))
                             + list(range(500, 1000, 10)) + list(range(1000, 2000, 20))
                             + list(range(2000, 10000, 100)))
            # TODO: get n_estimators from the model itself, and reduce this list to only those values that come under the value from the model
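            # With warm_start=True, scikit-learn keeps the already-fitted trees and each refit only adds the newly requested estimators, which makes this manual early-stopping loop cheap.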

            try:
                for num_iter in num_iters:
                    warm_start = True
                    if num_iter == 1:
                        warm_start = False

                    self.model.set_params(n_estimators=num_iter,
                                          warm_start=warm_start)
                    self.model.fit(X_fit, y)

                    if self.training_prediction_intervals == True:
                        val_loss = self.model.score(X_test, y_test)
                    else:
                        try:
                            val_loss = self._scorer.score(self, X_test, y_test)
                        except Exception as e:
                            val_loss = self.model.score(X_test, y_test)

                    if val_loss - self.min_step_improvement > best_val_loss:
                        best_val_loss = val_loss
                        num_worse_rounds = 0
                        best_model = deepcopy(self.model)
                    else:
                        num_worse_rounds += 1
                    print(
                        '[' + str(num_iter) +
                        '] random_holdout_set_from_training_data\'s score is: '
                        + str(round(val_loss, 3)))
                    if num_worse_rounds >= patience:
                        break
            except KeyboardInterrupt:
                print(
                    'Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model'
                )
                pass

            self.model = best_model
            print(
                'The number of estimators that were the best for this training dataset: '
                + str(self.model.get_params()['n_estimators']))
            print('The best score on the holdout set: ' + str(best_val_loss))

        else:
            self.model.fit(X_fit, y)

        if self.X_test is not None:
            del self.X_test
            del self.y_test
        return self
Example #5
    def fit(self, X, y):
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in ['BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit', 'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier', 'SGDClassifier', 'RidgeClassifier', 'LogisticRegression']:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':

                # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                num_cols = X_fit.shape[1]

                model_params = self.model.get_params()
                del model_params['build_fn']

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(build_fn=utils_models.make_deep_learning_model, num_cols=num_cols, feature_learning=self.feature_learning, **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(build_fn=utils_models.make_deep_learning_classifier, num_cols=num_cols, feature_learning=self.feature_learning, **model_params)

        try:
            if self.model_name[:12] == 'DeepLearning':

                print('\nWe will stop training early if we have not seen an improvement in training loss in 25 epochs')
                from keras.callbacks import EarlyStopping
                early_stopping = EarlyStopping(monitor='loss', patience=25, verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])

            elif self.model_name[:16] == 'GradientBoosting':
                if scipy.sparse.issparse(X_fit):
                    X_fit = X_fit.todense()

                patience = 20
                best_val_loss = -10000000000
                num_worse_rounds = 0
                best_model = deepcopy(self.model)
                X_fit, X_test, y, y_test = train_test_split(X_fit, y, test_size=0.15)

                # Add a variable number of trees each time, depending how far into the process we are
                num_iters = list(range(1, 50, 1)) + list(range(50, 100, 2)) + list(range(100, 250, 3)) + list(range(250, 500, 5)) + list(range(500, 1000, 10)) + list(range(1000, 2000, 20)) + list(range(2000, 10000, 100))

                try:
                    for num_iter in num_iters:
                        warm_start = True
                        if num_iter == 1:
                            warm_start = False

                        self.model.set_params(n_estimators=num_iter, warm_start=warm_start)
                        self.model.fit(X_fit, y)

                        try:
                            val_loss = self._scorer.score(self, X_test, y_test)
                        except Exception as e:
                            val_loss = self.model.score(X_test, y_test)

                        if val_loss > best_val_loss:
                            best_val_loss = val_loss
                            num_worse_rounds = 0
                            best_model = deepcopy(self.model)
                        else:
                            num_worse_rounds += 1

                        if num_worse_rounds >= patience:
                            break
                except KeyboardInterrupt:
                    print('Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model')
                    pass

                self.model = best_model
                print('The number of estimators that were the best for this training dataset: ' + str(self.model.get_params()['n_estimators']))
                print('The best score on a random 15 percent holdout set of the training data: ' + str(best_val_loss))

            else:
                self.model.fit(X_fit, y)

        except TypeError as e:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt as e:
            print('Stopping training at this point because we heard a KeyboardInterrupt')
            print('If the model is functional at this point, we will output the model in its latest form')
            print('Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion')
            pass

        return self
Example #6
    def fit(self, X, y):
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':
                if keras_installed:

                    # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                    num_cols = X_fit.shape[1]

                    model_params = self.model.get_params()
                    del model_params['build_fn']

                    if self.type_of_estimator == 'regressor':
                        self.model = KerasRegressor(
                            build_fn=utils_models.make_deep_learning_model,
                            num_cols=num_cols,
                            feature_learning=self.feature_learning,
                            **model_params)
                    elif self.type_of_estimator == 'classifier':
                        self.model = KerasClassifier(
                            build_fn=utils_models.make_deep_learning_classifier,
                            num_cols=num_cols,
                            feature_learning=self.feature_learning,
                            **model_params)
                else:
                    print(
                        'WARNING: We did not detect that Keras was available.')
                    raise TypeError(
                        'A DeepLearning model was requested, but Keras was not available to import'
                    )

        try:
            if self.model_name[:12] == 'DeepLearning':

                print(
                    '\nWe will stop training early if we have not seen an improvement in training loss in 25 epochs'
                )
                from keras.callbacks import EarlyStopping
                early_stopping = EarlyStopping(monitor='loss',
                                               patience=25,
                                               verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])

            else:
                self.model.fit(X_fit, y)

        except TypeError as e:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt as e:
            print(
                'Stopping training at this point because we heard a KeyboardInterrupt'
            )
            print(
                'If the model is functional at this point, we will output the model in its latest form'
            )
            print(
                'Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion'
            )
            pass

        return self
Example #7
    def fit(self, X, y):
        self.model_name = get_name_from_model(self.model)

        X_fit = X

        if self.model_name[:12] == 'DeepLearning' or self.model_name in [
                'BayesianRidge', 'LassoLars', 'OrthogonalMatchingPursuit',
                'ARDRegression', 'Perceptron', 'PassiveAggressiveClassifier',
                'SGDClassifier', 'RidgeClassifier', 'LogisticRegression'
        ]:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()

            if self.model_name[:12] == 'DeepLearning':

                # For Keras, we need to tell it how many input nodes to expect, which is our num_cols
                num_cols = X_fit.shape[1]

                model_params = self.model.get_params()
                del model_params['build_fn']

                if self.type_of_estimator == 'regressor':
                    self.model = KerasRegressor(
                        build_fn=utils_models.make_deep_learning_model,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)
                elif self.type_of_estimator == 'classifier':
                    self.model = KerasClassifier(
                        build_fn=utils_models.make_deep_learning_classifier,
                        num_cols=num_cols,
                        feature_learning=self.feature_learning,
                        **model_params)

        try:
            if self.model_name[:12] == 'DeepLearning':

                print(
                    '\nWe will stop training early if we have not seen an improvement in training loss in 25 epochs'
                )
                from keras.callbacks import EarlyStopping
                early_stopping = EarlyStopping(monitor='loss',
                                               patience=25,
                                               verbose=1)
                self.model.fit(X_fit, y, callbacks=[early_stopping])

            elif self.model_name[:4] == 'LGBM':

                X_fit, X_test, y, y_test = train_test_split(X_fit,
                                                            y,
                                                            test_size=0.15)

                if self.type_of_estimator == 'regressor':
                    eval_metric = 'rmse'
                elif self.type_of_estimator == 'classifier':
                    if len(set(y_test)) > 2:
                        eval_metric = 'multi_logloss'
                    else:
                        eval_metric = 'binary_logloss'

                self.model.fit(
                    X_fit,
                    y,
                    eval_set=[(X_test, y_test)],
                    early_stopping_rounds=50,
                    eval_metric=eval_metric,
                    eval_names=['random_holdout_set_from_training_data'])

            elif self.model_name[:8] == 'CatBoost':
                X_fit = pd.DataFrame(X_fit.todense())

                if self.type_of_estimator == 'classifier' and len(
                        pd.Series(y).unique()) > 2:
                    # TODO: we might have to modify the format of the y values, converting them all to ints, then back again somehow
                    self.model.set_params(loss_function='MultiClass')

                self.model.fit(X_fit, y)
            elif self.model_name[:16] == 'GradientBoosting':
                if scipy.sparse.issparse(X_fit):
                    X_fit = X_fit.todense()

                patience = 20
                best_val_loss = -10000000000
                num_worse_rounds = 0
                best_model = deepcopy(self.model)
                X_fit, X_test, y, y_test = train_test_split(X_fit,
                                                            y,
                                                            test_size=0.15)

                # Add a variable number of trees each time, depending how far into the process we are
                if os.environ.get('is_test_suite', False) == 'True':
                    num_iters = (list(range(1, 50, 1)) + list(range(50, 100, 2))
                                 + list(range(100, 250, 3)))
                else:
                    num_iters = (list(range(1, 50, 1)) + list(range(50, 100, 2))
                                 + list(range(100, 250, 3)) + list(range(250, 500, 5))
                                 + list(range(500, 1000, 10)) + list(range(1000, 2000, 20))
                                 + list(range(2000, 10000, 100)))

                try:
                    for num_iter in num_iters:
                        warm_start = True
                        if num_iter == 1:
                            warm_start = False

                        self.model.set_params(n_estimators=num_iter,
                                              warm_start=warm_start)
                        self.model.fit(X_fit, y)

                        if self.training_prediction_intervals == True:
                            val_loss = self.model.score(X_test, y_test)
                        else:
                            try:
                                val_loss = self._scorer.score(
                                    self, X_test, y_test)
                            except Exception as e:
                                val_loss = self.model.score(X_test, y_test)

                        if val_loss - self.min_step_improvement > best_val_loss:
                            best_val_loss = val_loss
                            num_worse_rounds = 0
                            best_model = deepcopy(self.model)
                        else:
                            num_worse_rounds += 1
                        print(
                            '[' + str(num_iter) +
                            '] random_holdout_set_from_training_data\'s score is: '
                            + str(round(val_loss, 3)))
                        if num_worse_rounds >= patience:
                            break
                except KeyboardInterrupt:
                    print(
                        'Heard KeyboardInterrupt. Stopping training, and using the best checkpointed GradientBoosting model'
                    )
                    pass

                self.model = best_model
                print(
                    'The number of estimators that were the best for this training dataset: '
                    + str(self.model.get_params()['n_estimators']))
                print(
                    'The best score on a random 15 percent holdout set of the training data: '
                    + str(best_val_loss))

            else:
                self.model.fit(X_fit, y)

        except TypeError as e:
            if scipy.sparse.issparse(X_fit):
                X_fit = X_fit.todense()
            self.model.fit(X_fit, y)

        except KeyboardInterrupt as e:
            print(
                'Stopping training at this point because we heard a KeyboardInterrupt'
            )
            print(
                'If the model is functional at this point, we will output the model in its latest form'
            )
            print(
                'Note that not all models can be interrupted and still used, and that this feature generally is an unofficial beta-release feature that is known to fail on occasion'
            )
            pass

        return self
Example #8
    def fit_grid_search(self, X_df, y, gs_params):

        model = gs_params['model']
        # Sometimes we're optimizing just one model, sometimes we're comparing a bunch of non-optimized models.
        if isinstance(model, list):
            model = model[0]
        model_name = utils_models.get_name_from_model(model)

        full_pipeline = self._construct_pipeline(model_name=model_name)
        ppl = full_pipeline.named_steps['final_model']

        if self.verbose:
            grid_search_verbose = 5
        else:
            grid_search_verbose = 0

        gs = GridSearchCV(
            # Fit on the pipeline.
            ppl,
            # Two splits of cross-validation, by default
            cv=self.cv,
            param_grid=gs_params,
            # Train across all cores.
            n_jobs=-1,
            # Be verbose (lots of printing).
            verbose=grid_search_verbose,
            # Print warnings when we fail to fit a given combination of parameters, but do not raise an error.
            # Set the score on this partition to some very negative number, so that we do not choose this estimator.
            error_score=-1000000000,
            scoring=self._scorer.score,
            # Don't allocate memory for all jobs upfront. Instead, only allocate enough memory to handle the current jobs plus an additional 50%
            pre_dispatch='1.5*n_jobs')

        if self.verbose:
            print(
                '\n\n********************************************************************************************'
            )
            if self.optimize_final_model == True:
                print(
                    'About to run GridSearchCV on the pipeline for the model '
                    + model_name + ' to predict ' + self.output_column)
            else:
                print(
                    'About to run GridSearchCV on the pipeline for several models to predict '
                    + self.output_column)
                # Note that we will only report analytics results on the final model that ultimately gets selected, and trained on the entire dataset

        gs.fit(X_df, y)

        if self.write_gs_param_results_to_file:
            utils.write_gs_param_results_to_file(gs, self.gs_param_file_name)

        if self.verbose:
            self.print_training_summary(gs)

        self.trained_final_model = gs.best_estimator_
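        # GridSearchCV refits the best parameter combination on the full training data by default, so best_estimator_ is ready to use directly.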
        if 'model' in gs.best_params_:
            model_name = utils_models.get_name_from_model(gs.best_params_['model'])
            self.print_results(model_name)

        return gs
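
(As a usage note: gs_params is presumably a parameter grid whose optional 'model' key can hold a list of candidate estimators, matching the isinstance check at the top of fit_grid_search. A hypothetical call, where `predictor` and the estimator choices are assumptions, not taken from the source:)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression

    gs_params = {
        # A list here means "compare these models"; a single estimator means "optimize just this one".
        'model': [RandomForestClassifier(), LogisticRegression()],
    }
    gs = predictor.fit_grid_search(X_df, y, gs_params)
    print(gs.best_score_)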