Пример #1
0
    def _fit(self,
             X,
             y,
             regressor,
             val_size=0.3,
             random_state=None,
             scoring=None,
             return_train_score=True,
             **fit_params):
        """Fit ``regressor`` on one train/validation split and score it.

        Args:
            X: Feature data. When explicit ``train_idx``/``val_idx`` are
                supplied it is indexed with ``.loc``, so it must then support
                label-based indexing (e.g. a pandas object).
            y: Target data, split the same way as ``X``.
            regressor: Estimator exposing ``fit(X, y, **params)``; the value
                returned by ``fit`` is stored as the fitted estimator.
            val_size: Passed as ``test_size`` to ``train_test_split`` when no
                explicit indices are given.
            random_state: Seed for ``train_test_split`` and for NumPy's
                global random generator.
            scoring: Forwarded unchanged to ``calc_scores``.
            return_train_score: If truthy, training scores are computed and
                stored as well. The legacy value ``'warn'`` additionally
                emits a ``FutureWarning`` for every training-score key.
            **fit_params: May contain ``train_idx`` and ``val_idx`` (both
                required to take effect) to fix the split. The whole mapping
                is passed through ``appropriate_kwargs`` before reaching
                ``regressor.fit`` (presumably keeping only the keywords that
                ``fit`` accepts -- confirm against that helper).

        Returns:
            tuple: ``(result_dict, scorers)``. ``result_dict`` holds
            ``'estimators'``, ``'indices'`` and one ``'test_<name>'`` (plus
            ``'train_<name>'`` when requested) single-element list per
            scorer name; ``scorers`` is the second value returned by
            ``calc_scores``.
        """
        import warnings

        result_dict = dict()

        if 'train_idx' in fit_params and 'val_idx' in fit_params:
            # Caller fixed the split explicitly: select rows by label.
            train_idx = fit_params['train_idx']
            val_idx = fit_params['val_idx']
            X_train = X.loc[train_idx]
            X_val = X.loc[val_idx]
            y_train = y.loc[train_idx]
            y_val = y.loc[val_idx]
        else:
            # Row positions are split alongside the data so that the chosen
            # partition can be reported back in ``result_dict['indices']``.
            indices = range(np.shape(X)[0])
            X_train, X_val, y_train, y_val, train_idx, val_idx = \
                train_test_split(X, y, indices,
                                 test_size=val_size, random_state=random_state)

        # Normalise to plain Python ints (e.g. instead of NumPy integers).
        train_idx = [int(idx) for idx in train_idx]
        val_idx = [int(idx) for idx in val_idx]

        # NOTE: this reseeds NumPy's *global* generator, which affects every
        # other user of ``np.random`` in the process.
        np.random.seed(random_state)
        regressor_params = appropriate_kwargs(fit_params, regressor.fit)
        regressor = regressor.fit(X_train, y_train, **regressor_params)
        result_dict['estimators'] = [regressor]
        result_dict['indices'] = [[train_idx, val_idx]]

        val_scores, scorers = calc_scores(X=X_val,
                                          y=y_val,
                                          estimator=regressor,
                                          scoring=scoring)

        if return_train_score:
            train_scores, _ = calc_scores(X=X_train,
                                          y=y_train,
                                          estimator=regressor,
                                          scoring=scoring)

        for name in scorers:
            result_dict['test_%s' % name] = [val_scores[name]]
            if not return_train_score:
                continue

            key = 'train_%s' % name
            result_dict[key] = [train_scores[name]]
            if return_train_score == 'warn':
                message = (
                    'You are accessing a training score ({!r}), '
                    'which will not be available by default '
                    'any more in 0.21. If you need training scores, '
                    'please set return_train_score=True').format(key)
                # ``result_dict`` is a plain dict and has no ``add_warning``
                # hook, so the deprecation notice is emitted right away
                # instead of lazily on key access.
                warnings.warn(message, FutureWarning)
        return result_dict, scorers
Пример #2
0
 def calc_score(self, X, y, scoring=None):
     """Score the fitted best model on ``X``/``y``.

     Args:
         X: Data forwarded to ``calc_scores``.
         y: Targets forwarded to ``calc_scores``.
         scoring: Scoring specification; when ``None`` the instance's
             ``self.scoring`` is used instead.

     Returns:
         The first value returned by ``calc_scores`` (the scores).
     """
     check_is_fitted(self)
     best_model = self.best_model_
     # An explicit argument wins; otherwise use the instance-level setting.
     chosen_scoring = self.scoring if scoring is None else scoring
     scores, _unused_scorers = calc_scores(
         X=X, y=y, estimator=best_model, scoring=chosen_scoring)
     return scores
Пример #3
0
def load_model_and_predict(model_file,
                           X,
                           y=None,
                           task="classification",
                           scoring=None,
                           save_prediction_types='dataframe',
                           backend=None,
                           output_path='tmp'):
    """
    Predict on an arbitrary dataset using the trained model and save
    predictions and/or scores.

    Args:
        model_file: str
            Path to the trained model file (loaded with ``joblib.load``).
        X: array-like
            The data to predict on. Can be for example a list, or an array.
        y: array-like, optional (default: None)
            The true targets. When given, they are saved next to the
            predictions and used to compute scores.
        task: str, 'classification' or 'regression' (default: classification)
            Model's task type. ``None`` means "use the loaded model's type".
        scoring: str or callable or a list of them or None (default: None)
            A string (see model evaluation documentation) or
            a scorer callable object / function with signature
            ``scorer(estimator, X, y)``. If None, a default metric list for
            the model's task is used.
        save_prediction_types: str or [str] (default: dataframe)
            Output format(s) for the saved predictions; each must be listed
            in ``backend.valid_predictions_type``.
            The optional parameters are: ["npy", "txt", "dataframe"].
        backend: Backend object (default: None)
            MLBackend object which defined output_path, environment
            configuration, save_predictions, and so on.
            If None, use default MLBackend object.
        output_path: str (default: 'tmp')
            Output path of PredictPipeline, if is None or 'tmp', use the
            default output path: '/tmp/amlearn/task_%pid/output_%timestamp'.

    Returns:
        predictions: np.array
            Raw predictions from the trained model (for classifiers: the
            output of ``predict_proba``, else ``decision_function``, else
            ``predict``).

    Raises:
        TypeError: If the model is neither a RegressorMixin nor a
            ClassifierMixin, or if ``task`` does not match the model type.
        ValueError: If a requested prediction type is not supported by the
            backend.
    """
    if backend is None:
        backend = create_ml_backend(output_path=output_path)

    # NOTE(security): joblib.load unpickles the file and can execute
    # arbitrary code; only load model files from trusted sources.
    model = joblib.load(model_file)

    # The regressor check comes first, so a model that inherits from both
    # mixins is treated as a regressor.
    if isinstance(model, RegressorMixin):
        model_task = 'regression'
        default_scoring = [
            'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'
        ]
    elif isinstance(model, ClassifierMixin):
        model_task = 'classification'
        default_scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
    else:
        raise TypeError('Model must be instance of RegressorMixin or '
                        'ClassifierMixin.')

    if task is not None and task != model_task:
        raise TypeError('Model type of model_file is "{}", '
                        'but the task parameter is not. Please make '
                        'sure these two match.'.format(model_task))
    task = model_task
    if scoring is None:
        scoring = default_scoring

    if task == 'classification':
        if hasattr(model, 'predict_proba'):
            predictions = model.predict_proba(X)
        elif hasattr(model, 'decision_function'):
            predictions = model.decision_function(X)
        else:
            predictions = model.predict(X)

        # Only 2-D outputs (one column per class) have a "column 1" to pick;
        # binary ``decision_function`` / ``predict`` outputs are already 1-D
        # and indexing them with ``[:, 1]`` would raise IndexError.
        saved_predictions = np.asarray(predictions)
        if saved_predictions.ndim == 2:
            column = 1 if saved_predictions.shape[1] > 1 else 0
            saved_predictions = saved_predictions[:, column]
    else:
        predictions = model.predict(X)
        saved_predictions = predictions

    if y is not None:
        targets_and_predictions = np.array(list(zip(y, saved_predictions)))
    else:
        targets_and_predictions = saved_predictions

    if scoring and y is not None:
        scores, _ = calc_scores(X=X, y=y, estimator=model, scoring=scoring)
        # Two-line CSV: a header row of metric names, then their values.
        header = ','.join(['dataset'] + list(scores.keys()))
        values = ','.join(['predict'] + [str(v) for v in scores.values()])
        write_file(os.path.join(backend.output_path, 'scores.txt'),
                   '{}\n{}'.format(header, values))

    # ``list_like()`` presumably returns the types treated as sequences;
    # a single format name is wrapped into a one-element list.
    if not isinstance(save_prediction_types, list_like()):
        save_prediction_types = [save_prediction_types]
    for predict_type in save_prediction_types:
        if predict_type not in backend.valid_predictions_type:
            raise ValueError('predict_type {} is unknown, '
                             'Possible values are {}'.format(
                                 predict_type, backend.valid_predictions_type))
        save_predictions = getattr(
            backend, 'save_predictions_as_{}'.format(predict_type))
        save_predictions(targets_and_predictions, subdir='')

    return predictions