Example #1
    def save_start_time(self, seed):
        """Record the wall-clock start time for the given seed and return
        the path of the time file."""
        start_time = time.time()
        time_file = self._get_start_time_filename(seed)
        write_file(time_file, str(start_time))
        return time_file
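
A minimal sketch of the write_file helper these snippets lean on (the actual amlearn utility may differ; this stand-in is an assumption for illustration only):

import os

def write_file(path, content):
    # Hypothetical stand-in for amlearn's write_file: create the parent
    # directory if needed, then write the text content to the given path.
    dirname = os.path.dirname(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'w') as f:
        f.write(content)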
Example #2
def load_model_and_predict(model_file,
                           X,
                           y=None,
                           task="classification",
                           scoring=None,
                           save_prediction_types='dataframe',
                           backend=None,
                           output_path='tmp'):
    """
    Predict on an arbitrary dataset using the trained model and save predictions
    and/or scores.

    Args:
        model_file: str
            Path to the trained model file.
        X: array-like
            The data to predict on. Can be, for example, a list or an array.
        y: array-like, optional (default: None)
            The target variable to predict in the case of supervised learning.
        task: str, 'classification' or 'regression' (default: classification)
            Model's task type; only 'classification' and 'regression' are
            supported.
        scoring: str or callable or a list of them or None (default: None)
            A string (see model evaluation documentation) or
            a scorer callable object / function with signature
            ``scorer(estimator, X, y)``.
        save_prediction_types: str or [str] (default: dataframe)
            It takes effect when save_prediction is True.
            The supported values are: ["npy", "txt", "dataframe"].
        backend: Backend object (default: None)
            MLBackend object which defines output_path, environment
            configuration, save_predictions, and so on.
            If None, a default MLBackend object is used.
        output_path: str (default: 'tmp')
            Output path of PredictPipeline. If it is None or 'tmp', the
            default path '/tmp/amlearn/task_%pid/output_%timestamp' is used.

    Returns:
        predictions: np.array
            Predictions from the trained model.
    """
    if backend is None:
        backend = create_ml_backend(output_path=output_path)

    model = joblib.load(model_file)
    if isinstance(model, RegressorMixin):
        if task == 'regression' or task is None:
            task = 'regression'
            if scoring is None:
                scoring = [
                    'r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'
                ]
        else:
            raise TypeError('Model type of model_file is "regression", '
                            'but the task parameter is not. Please make '
                            'sure these two match.')
    elif isinstance(model, ClassifierMixin):
        if task == 'classification' or task is None:
            task = 'classification'
            if scoring is None:
                scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall']
        else:
        raise TypeError('Model type of model_file is "classification", '
                            'but the task parameter is not. Please make '
                            'sure these two match.')
    else:
        raise TypeError('Model must be instance of RegressorMixin or '
                        'ClassifierMixin.')

    if task == 'classification':
        if hasattr(model, 'predict_proba'):
            predictions = model.predict_proba(X)
        elif hasattr(model, 'decision_function'):
            predictions = model.decision_function(X)
        else:
            predictions = model.predict(X)

        # Keep the positive-class column when the model returns per-class
        # scores (binary predict_proba); 1-D outputs are used as-is.
        positive_scores = predictions[:, 1] \
            if predictions.ndim == 2 else predictions
        targets_and_predictions = np.array(list(zip(y, positive_scores))) \
            if y is not None else positive_scores

    elif task == 'regression':
        predictions = model.predict(X)
        targets_and_predictions = np.array(list(zip(y, predictions))) \
            if y is not None else predictions
    else:
        raise ValueError('task only support classification or regression')

    if scoring and y is not None:
        scores, _ = calc_scores(X=X, y=y, estimator=model, scoring=scoring)
        write_file(
            os.path.join(backend.output_path, 'scores.txt'), '{}\n{}'.format(
                ','.join(['dataset'] + list(scores.keys())),
                ','.join(['predict'] + list(map(str, scores.values())))))

    if not isinstance(save_prediction_types, list_like()):
        save_prediction_types = [save_prediction_types]
    for predict_type in save_prediction_types:
        if predict_type in backend.valid_predictions_type:
            getattr(backend, 'save_predictions_as_{}'.format(predict_type))\
                (targets_and_predictions, subdir='')
        else:
            raise ValueError('predict_type {} is unknown. '
                             'Possible values are {}'.format(
                                 predict_type, backend.valid_predictions_type))

    return predictions
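
A hedged usage sketch for the helper above; 'model.pkl' and the random feature matrix are placeholders (not files shipped with amlearn), and the call assumes a binary classifier previously saved with joblib:

import numpy as np

X_new = np.random.rand(20, 5)  # placeholder unlabeled data
predictions = load_model_and_predict('model.pkl', X_new,
                                     task='classification',
                                     save_prediction_types='dataframe',
                                     output_path='tmp')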
Example #3
    def _fit_cv(self,
                X,
                y,
                random_state=None,
                scoring=None,
                cv_num=1,
                cv_params=None,
                val_size=0.3,
                save_model=False,
                save_score=True,
                save_prediction=False,
                prediction_types='dataframe',
                save_feature_importances=True,
                save_train_val_idx=False,
                **fit_params):

        # If the user's cv_params contains a 'cv_num' entry, use the larger
        # of the function parameter 'cv_num' and the value in cv_params.
        if not self.imblearn:
            self.backend.logger.info('Start Cross Validation.')
            cv_start_time = time.time()

        if cv_params is None:
            cv_params = {}

        if 'cv_num' in cv_params.keys():
            cv_num = max(cv_num, cv_params['cv_num'])
            cv_params.pop('cv_num')

        if 'scoring' in cv_params.keys():
            cv_params.pop('scoring')

        return_train_score = cv_params.get('return_train_score', True)
        if cv_num > 1:
            np.random.seed(random_state)
            classifier_params = \
                appropriate_kwargs(fit_params, self.classifier.fit)
            results, scorers = \
                cross_validate(estimator=self.classifier, scoring=scoring,
                               fit_params=classifier_params, X=X, y=y,
                               cv=cv_num, **cv_params)
        else:
            results, scorers = self._fit(X,
                                         y,
                                         self.classifier,
                                         val_size=val_size,
                                         return_train_score=return_train_score,
                                         random_state=random_state,
                                         scoring=scoring,
                                         **fit_params)
            cv_num = 1

        # TODO: if scoring contains more than one scorer, score_name can
        #  only be the first of them.
        self.score_name = self.score_name if hasattr(self, 'score_name') \
            else list(scorers.keys())[0]
        self.best_score_, (self.best_model_, self.best_model_tag_) = \
            max(zip(results['test_{}'.format(self.score_name)],
                    zip(results['estimators'],
                        [''] if cv_num == 1 else
                        ["cv_{}".format(i) for i in range(cv_num)])),
                key=lambda x: x[0])

        if not self.imblearn:
            self.backend.logger.info(
                "\tCV classification finish in {:.4f} seconds.".format(
                    time.time() - cv_start_time))

        if save_model or save_score or save_train_val_idx or save_prediction \
                or save_feature_importances:
            imblearn_output_path = \
                os.path.join(self.backend.output_path, self.imblearn_tag)
            create_path(imblearn_output_path)
            if save_score:
                write_file(
                    os.path.join(imblearn_output_path, 'mean_scores.txt'),
                    '{}\n{}\n{}'.format(
                        ','.join(['dataset'] + list(scorers.keys())),
                        ','.join(['test'] + [
                            str(np.mean(results['test_{}'.format(score_name)]))
                            for score_name in scorers.keys()
                        ]), ','.join(['train'] + [
                            str(np.mean(results['train_{}'.format(
                                score_name)]))
                            for score_name in scorers.keys()
                        ]) if return_train_score else -1))

            check_path_while_saving(self.backend.output_path)
            for cv_idx in range(cv_num):
                sub_path = os.path.join(self.imblearn_tag,
                                        "cv_{}".format(cv_idx))
                cv_output_path = \
                    os.path.join(self.backend.output_path, sub_path)
                create_path(cv_output_path)

                if save_score:
                    write_file(
                        os.path.join(cv_output_path, 'scores.txt'),
                        '{}\n{}\n{}'.format(
                            ','.join(['dataset'] + list(scorers.keys())),
                            ','.join(['test'] + [
                                str(results['test_{}'.format(score_name)]
                                    [cv_idx]) for score_name in scorers.keys()
                            ]), ','.join(['train'] + [
                                str(results['train_{}'.format(score_name)]
                                    [cv_idx]) for score_name in scorers.keys()
                            ]) if return_train_score else -1))

                score_model = results['estimators'][cv_idx]
                if save_model:
                    self.backend.save_model(score_model, sub_path)
                if save_feature_importances:
                    self.backend.save_json(
                        self.feature_importances_dict(score_model),
                        sub_path,
                        name='feature_importances')

                if save_train_val_idx:
                    train_idx = results['indices'][cv_idx][0]
                    val_idx = results['indices'][cv_idx][1]
                    write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                               "\n".join(list(map(str, train_idx))))

                    write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                               "\n".join(list(map(str, val_idx))))

                if save_prediction:
                    if 'X_val' in fit_params and 'y_val' in fit_params:
                        test_X = fit_params['X_val']
                        test_y = fit_params['y_val']
                    else:
                        test_X = X[results['indices'][cv_idx][1]] \
                            if isinstance(X, np.ndarray) \
                            else X.iloc[results['indices'][cv_idx][1]]
                        test_y = y[results['indices'][cv_idx][1]] \
                            if isinstance(y, np.ndarray) \
                            else y.iloc[results['indices'][cv_idx][1]]
                    if hasattr(score_model, 'predict_proba'):
                        predictions = score_model.predict_proba(test_X)
                    elif hasattr(score_model, 'decision_function'):
                        predictions = score_model.decision_function(test_X)
                    else:
                        predictions = score_model.predict(test_X)
                    # Keep the positive-class column when the model returns
                    # per-class scores (binary predict_proba); 1-D outputs
                    # are used as-is.
                    positive_scores = predictions[:, 1] \
                        if predictions.ndim == 2 else predictions
                    targets_and_predictions = \
                        np.array(list(zip(test_y, positive_scores)))

                    if not isinstance(prediction_types, list_like()):
                        prediction_types = [prediction_types]
                    for predict_type in prediction_types:
                        if predict_type in self.backend.valid_predictions_type:
                            getattr(
                                self.backend,
                                'save_predictions_as_{}'.format(predict_type))(
                                    targets_and_predictions, sub_path)
                        else:
                            raise ValueError(
                                'predict_type {} is unknown. '
                                'Possible values are {}'.format(
                                    predict_type,
                                    self.backend.valid_predictions_type))
        return results, scorers
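
The best-model selection above leans on max over zipped (score, (estimator, tag)) tuples; here is a minimal self-contained illustration of the same pattern with toy values and no amlearn dependencies:

scores = [0.71, 0.85, 0.78]           # plays the role of results['test_...']
models = ['est_a', 'est_b', 'est_c']  # plays the role of results['estimators']
tags = ['cv_0', 'cv_1', 'cv_2']

# max compares the zipped tuples by their first element (the score), so the
# estimator and tag paired with the highest score come along for free.
best_score, (best_model, best_tag) = max(
    zip(scores, zip(models, tags)), key=lambda x: x[0])
assert (best_score, best_model, best_tag) == (0.85, 'est_b', 'cv_1')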
Example #4
    def _fit_imblearn(self,
                      X,
                      y,
                      random_state=None,
                      scoring=None,
                      imblearn_method=None,
                      imblearn_params=None,
                      cv_num=1,
                      cv_params=None,
                      val_size=0.3,
                      save_model=True,
                      save_score=True,
                      save_prediction=True,
                      prediction_types='dataframe',
                      save_train_val_idx=True,
                      save_feature_importances=True,
                      **fit_params):
        self.backend.logger.info('Start Imblearn.')
        imblearn_start_time = time.time()
        imblearn = ImblearnPreprocessor()
        if imblearn_method is None:
            imblearn_method = 'EasyEnsemble'
        if imblearn_params is None:
            imblearn_params = {"random_state": random_state, "n_subsets": 3}
        if 'random_state' not in imblearn_params:
            imblearn_params['random_state'] = random_state

        if 'train_idx' in fit_params and 'val_idx' in fit_params \
                and cv_num <= 1:
            train_idx = fit_params['train_idx']
            val_idx = fit_params['val_idx']
            X_val = copy(X.loc[val_idx])
            y_val = copy(y.loc[val_idx])
            X = X.loc[train_idx]
            y = y.loc[train_idx]
        else:
            X_val = None
            y_val = None

        X, y = imblearn.fit(X, y, imblearn_method, imblearn_params)
        score_model_list = list()
        # Infer the imblearn n_subsets count from the shape of X.
        if len(X.shape) == 2:
            n_subsets = 1
            X = [X]
            y = [y]
        elif len(X.shape) == 3:
            n_subsets = X.shape[0]
        else:
            raise ValueError(
                'imblearn returned a result of unexpected '
                'shape: {}'.format(X.shape))

        self.backend.logger.info(
            '\tData resampling (imblearn) finished in {:.4f} seconds.'.format(
                time.time() - imblearn_start_time))

        all_results = dict()
        return_train_score = cv_params.get('return_train_score', True) \
            if cv_params is not None else True
        for imblearn_idx in range(n_subsets):
            self.backend.logger.info(
                'Start imblearn_{} classification.'.format(imblearn_idx))
            start_time = time.time()
            X_imb = np.array(copy(X))[imblearn_idx, :, :]
            y_imb = np.array(copy(y))[imblearn_idx, :]
            self.imblearn_tag = 'imblearn_{}'.format(imblearn_idx)
            results, scorers = self._fit_cv(
                X=X_imb,
                y=y_imb,
                random_state=random_state,
                scoring=scoring,
                cv_params=cv_params,
                cv_num=cv_num,
                val_size=val_size,
                save_model=save_model,
                save_score=save_score,
                save_prediction=save_prediction,
                prediction_types=prediction_types,
                save_feature_importances=save_feature_importances,
                save_train_val_idx=save_train_val_idx,
                X_val=X_val,
                y_val=y_val,
                **fit_params)

            for score_name in scorers.keys():
                if 'test_{}'.format(score_name) in all_results.keys():
                    if return_train_score:
                        all_results['train_{}'.format(score_name)] += \
                            results['train_{}'.format(score_name)]
                    all_results['test_{}'.format(score_name)] += \
                        results['test_{}'.format(score_name)]
                else:
                    if return_train_score:
                        all_results['train_{}'.format(score_name)] = \
                            results['train_{}'.format(score_name)]
                    else:
                        all_results['train_{}'.format(score_name)] = [-1]
                    all_results['test_{}'.format(score_name)] = \
                        results['test_{}'.format(score_name)]

            score_model_list.append(
                (self.best_score_, (self.best_model_, "imblearn_{}_{}".format(
                    imblearn_idx, self.best_model_tag_))))
            self.backend.logger.info(
                "\tImblearn_{} classification finish in {:.4f} seconds.".
                format(imblearn_idx,
                       time.time() - start_time))

        if save_score:
            write_file(
                os.path.join(self.backend.output_path, 'mean_scores.txt'),
                '{}\n{}\n{}'.format(
                    ','.join(['dataset'] + list(scorers.keys())),
                    ','.join(['test'] + [
                        str(
                            np.mean(
                                np.array(all_results['test_{}'.format(
                                    score_name)])))
                        for score_name in scorers.keys()
                    ]), ','.join(['train'] + [
                        str(
                            np.mean(
                                np.array(all_results['train_{}'.format(
                                    score_name)])))
                        for score_name in scorers.keys()
                    ])))

        self.best_score_, (self.best_model_, self.best_model_tag_) = \
            max(score_model_list, key=lambda x: x[0])
        self.backend.logger.info('Whole classification finished in {:.4f} '
                                 'seconds.'.format(time.time() -
                                                   imblearn_start_time))

        return self
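
The subset-count logic in _fit_imblearn keys off the rank of the resampled arrays: a 2-D result counts as a single balanced subset, a 3-D result as a stack of subsets (EasyEnsemble-style). A toy numpy illustration of just that branch, with shapes only and no real resampling:

import numpy as np

for X_res in (np.zeros((100, 8)), np.zeros((3, 100, 8))):
    if X_res.ndim == 2:
        n_subsets = 1                 # one balanced dataset
    elif X_res.ndim == 3:
        n_subsets = X_res.shape[0]    # one subset per leading slice
    print(X_res.shape, '->', n_subsets, 'subset(s)')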
Example #5
    def _fit_cv(self, X, y, val_size=0.3, random_state=None, scoring=None,
                cv_num=1, cv_params=None, save_train_val_idx=True,
                save_model=True, save_score=True, save_prediction=True,
                prediction_types='dataframe', save_feature_importances=True,
                **fit_params):

        # If the user's cv_params contains a 'cv_num' entry, use the larger
        # of the function parameter 'cv_num' and the value in cv_params.
        self.backend.logger.info('Start Cross Validation.')
        cv_start_time = time.time()

        if cv_params is None:
            cv_params = {}

        if 'cv_num' in cv_params.keys():
            cv_num = max(cv_num, cv_params['cv_num'])
            cv_params.pop('cv_num')

        if 'scoring' in cv_params.keys():
            cv_params.pop('scoring')

        return_train_score = cv_params.get('return_train_score', True)
        if cv_num > 1:
            # random_state=False explicitly disables seeding here.
            if random_state is not False:
                np.random.seed(random_state)
            results, scorers = \
                cross_validate(estimator=self.regressor, scoring=scoring,
                               fit_params=fit_params, X=X, y=y, cv=cv_num,
                               **cv_params)
        else:
            results, scorers = self._fit(
                X, y, self.regressor, val_size=val_size,
                return_train_score=return_train_score,
                random_state=random_state, scoring=scoring, **fit_params)
            cv_num = 1

        # TODO: if scorers contains more than one entry, score_name can
        #  only be the first of them.
        self.score_name = self.score_name if hasattr(self, 'score_name') \
            else list(scorers.keys())[0]
        self.best_score_, (self.best_model_, self.best_model_tag_) = \
            max(zip(results['test_{}'.format(self.score_name)],
                    zip(results['estimators'],
                        [''] if cv_num == 1 else
                        ["cv_{}".format(i) for i in range(cv_num)])),
                key=lambda x: x[0])

        self.backend.logger.info(
            "\tCV regression finish in {:.4f} seconds.".format(
                time.time() - cv_start_time))
        if save_score:
            write_file(
                os.path.join(self.backend.output_path, 'mean_scores.txt'),
                '{}\n{}\n{}'.format(
                    ','.join(['dataset'] + list(scorers.keys())),
                    ','.join(['test'] +
                             [str(np.mean(results['test_{}'.format(
                                 score_name)]))
                              for score_name in scorers.keys()]),
                    ','.join(['train'] +
                             [str(np.mean(results['train_{}'.format(
                                 score_name)]))
                              for score_name in scorers.keys()])
                    if return_train_score else -1))

        for cv_idx in range(cv_num):
            cv_tag = "cv_{}".format(cv_idx)
            cv_output_path = os.path.join(self.backend.output_path, cv_tag)
            create_path(cv_output_path, merge=True)

            if save_score:
                write_file(os.path.join(cv_output_path, 'scores.txt'),
                           '{}\n{}\n{}'.format(
                               ','.join(['dataset'] + list(scorers.keys())),
                               ','.join(['test'] +
                                        [str(results['test_{}'.format(
                                            score_name)][cv_idx])
                                         for score_name in scorers.keys()]),
                               ','.join(['train'] +
                                        [str(results['train_{}'.format(
                                            score_name)][cv_idx])
                                         for score_name in scorers.keys()])
                               if return_train_score else -1))

            score_model = results['estimators'][cv_idx]
            if save_model:
                self.backend.save_model(score_model, cv_tag)
            if save_feature_importances:
                self.backend.save_json(
                    self.feature_importances_dict(score_model), cv_tag,
                    name='feature_importances')
            if save_train_val_idx:
                train_idx = results['indices'][cv_idx][0]
                val_idx = results['indices'][cv_idx][1]
                write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                           "\n".join(list(map(str, train_idx))))

                write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                           "\n".join(list(map(str, val_idx))))
            if save_prediction:
                predictions = \
                    score_model.predict(X.iloc[results['indices'][cv_idx][1]])
                targets_and_predictions = \
                    np.array(list(zip(y.iloc[results['indices'][cv_idx][1]],
                                      predictions)))

                if not isinstance(prediction_types, list_like()):
                    prediction_types = [prediction_types]
                for predict_type in prediction_types:
                    if predict_type in self.backend.valid_predictions_type:
                        instance = getattr(self.backend,
                                           'save_predictions_as_{}'.format(
                                               predict_type))
                        instance(targets_and_predictions, cv_tag)
                    else:
                        raise ValueError(
                            'predict_type {} is unknown. '
                            'Possible values are {}'.format(
                                predict_type,
                                self.backend.valid_predictions_type))
        return self
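
For reference, the mean_scores.txt written above is a three-line CSV (header, test means, train means). A toy sketch of the layout, with clearly made-up placeholder numbers standing in for real results:

import numpy as np

results = {'test_r2': [0.80, 0.90], 'train_r2': [0.95, 0.97]}  # toy values
scorers = {'r2': None}  # only the keys matter for the header row
print('{}\n{}\n{}'.format(
    ','.join(['dataset'] + list(scorers.keys())),
    ','.join(['test'] + [str(np.mean(results['test_{}'.format(s)]))
                         for s in scorers]),
    ','.join(['train'] + [str(np.mean(results['train_{}'.format(s)]))
                          for s in scorers])))
# Prints three lines: "dataset,r2", then the test mean, then the train mean.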