Example #1
 def save_model(self, model, sub_dir='model', name='model', seed=None):
     model_dir = self._get_dir(sub_dir)
     create_path(model_dir, merge=True)
     model_file = os.path.join(model_dir,
                               '{}.pkl'.format(name) if seed is None
                               else '{}_{}.pkl'.format(name, seed))
     joblib.dump(model, model_file)
Example #2
 def save_predictions_as_pickle(self, predictions, sub_dir='prediction',
                                name='prediction', seed=None):
     output_dir = self._get_prediction_output_dir(sub_dir)
     create_path(output_dir, merge=True)
     predict_file = os.path.join(output_dir,
                                 '{}.pickle'.format(name) if seed is None
                                 else '{}_{}.pickle'.format(name, seed))
     with open(predict_file, 'wb') as wf:
         pickle.dump(predictions.astype(np.float32), wf)
Example #3
 def save_json(self, data, sub_dir='json', name='json_file', seed=None):
     json_dir = self._get_dir(sub_dir)
     create_path(json_dir, merge=True)
     json_file = os.path.join(json_dir,
                               '{}.json'.format(name) if seed is None
                               else '{}_{}.json'.format(name, seed))
     with open(json_file, 'w') as wf:
         json.dump(data, wf)
Example #4
 def save_predictions_as_npy(self, predictions, sub_dir='prediction',
                             name='prediction', seed=None):
     output_dir = self._get_prediction_output_dir(sub_dir)
     create_path(output_dir, merge=True)
     predict_file = os.path.join(output_dir,
                                 '{}.npy'.format(name) if seed is None
                                 else '{}_{}.npy'.format(name, seed))
     np.save(predict_file, predictions)
Example #5
 def save_featurizer_as_dataframe(self, output_df, name='featurizer',
                                  save_type='pickle.gz'):
     featurizer_dir = self._get_featurizer_output_dir()
     create_path(featurizer_dir, merge=True)
     featurizer_file = os.path.join(
         featurizer_dir, '{}.{}'.format(name, save_type))
     if save_type == 'csv':
         output_df.to_csv(featurizer_file)
     elif save_type.startswith('pickle'):
         output_df.to_pickle(featurizer_file)
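All of the save helpers in Examples #1 to #5 follow the same naming convention: the optional seed is appended to the base name before the file extension. A minimal standalone sketch of that convention (the helper name _suffixed_filename is hypothetical and not part of the snippets above):

    import os

    def _suffixed_filename(directory, name, ext, seed=None):
        # '<name>.<ext>' when no seed is given, '<name>_<seed>.<ext>' otherwise,
        # mirroring save_model / save_json / save_predictions_as_*.
        filename = '{}.{}'.format(name, ext) if seed is None \
            else '{}_{}.{}'.format(name, seed, ext)
        return os.path.join(directory, filename)

    # _suffixed_filename('/tmp/out', 'prediction', 'csv', seed=42)
    # -> '/tmp/out/prediction_42.csv'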
Example #6
    def save_predictions_as_txt(self, predictions, sub_dir='prediction',
                                name='prediction', seed=None):
        output_dir = self._get_prediction_output_dir(sub_dir)
        create_path(output_dir, merge=True)
        predict_file = os.path.join(output_dir,
                                    '{}.txt'.format(name) if seed is None
                                    else '{}_{}.txt'.format(name, seed))

        with open(predict_file, 'w') as wf:
            wf.write("\n".join(list(map(str, predictions))))
Example #7
    def save_predictions_as_dataframe(self, predictions, subdir='prediction',
                                      name='prediction', seed=None):
        predict_dir = self._get_prediction_output_dir(subdir)
        create_path(predict_dir, merge=True)
        predict_file = os.path.join(predict_dir,
                                    '{}.csv'.format(name) if seed is None
                                    else '{}_{}.csv'.format(name, seed))

        predict_df = pd.DataFrame(predictions,
                                  columns=['target', 'predict']
                                  if isinstance(predictions[0], np.ndarray)
                                  else ['predict'])
        predict_df.to_csv(predict_file)
Example #8
    def _prepare_paths(self, output_path=None, auto_rename=False):
        timestamp = time.time()
        pid = os.getpid()

        if output_path == 'tmp' or output_path == 'default':
            output_path = \
                '/tmp/amlearn/task_%d/output_%d' % (pid, int(timestamp))
        self.output_path_ = output_path

        if output_path is not None:
            if auto_rename and os.path.exists(self.output_path_):
                self.output_path_ = auto_rename_file(self.output_path_)

            create_path(self.output_path_,
                        overwrite=self.overwrite_path, merge=self.merge_path)
            self.output_path_created_ = True
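A minimal sketch of the 'tmp'/'default' branch in Example #8, assuming only the standard library: the default output directory is keyed by the current process id and a Unix timestamp, so concurrent or repeated runs land in separate folders.

    import os
    import time

    def default_output_path():
        # Same layout as _prepare_paths above: /tmp/amlearn/task_<pid>/output_<timestamp>.
        return '/tmp/amlearn/task_%d/output_%d' % (os.getpid(), int(time.time()))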
Example #9
    def _fit_cv(self,
                X,
                y,
                random_state=None,
                scoring=None,
                cv_num=1,
                cv_params=None,
                val_size=0.3,
                save_model=False,
                save_score=True,
                save_prediction=False,
                prediction_types='dataframe',
                save_feature_importances=True,
                save_train_val_idx=False,
                **fit_params):

        # If the user's cv_params contains a 'cv_num' key, use the larger of
        # the function argument 'cv_num' and cv_params['cv_num'].
        if not self.imblearn:
            self.backend.logger.info('Start Cross Validation.')
            cv_start_time = time.time()

        if cv_params is None:
            cv_params = {}

        if 'cv_num' in cv_params.keys():
            cv_num = max(cv_num, cv_params['cv_num'])
            cv_params.pop('cv_num')

        if 'scoring' in cv_params.keys():
            cv_params.pop('scoring')

        return_train_score = cv_params.get('return_train_score', True)
        if cv_num > 1:
            np.random.seed(random_state)
            classifier_params = \
                appropriate_kwargs(fit_params, self.classifier.fit)
            results, scorers = \
                cross_validate(estimator=self.classifier, scoring=scoring,
                               fit_params=classifier_params, X=X, y=y,
                               cv=cv_num, **cv_params)
        else:
            results, scorers = self._fit(X,
                                         y,
                                         self.classifier,
                                         val_size=val_size,
                                         return_train_score=return_train_score,
                                         random_state=random_state,
                                         scoring=scoring,
                                         **fit_params)
            cv_num = 1

        # TODO: if more than one scoring metric is given, score_name can only be the first of them.
        self.score_name = self.score_name if hasattr(self, 'score_name') \
            else list(scorers.keys())[0]
        self.best_score_, (self.best_model_, self.best_model_tag_) = \
            max(zip(results['test_{}'.format(self.score_name)],
                    zip(results['estimators'],
                        [''] if cv_num == 1 else
                        ["cv_{}".format(i) for i in range(cv_num)])),
                key=lambda x: x[0])

        if not self.imblearn:
            self.backend.logger.info(
                "\tCV classification finish in {:.4f} seconds.".format(
                    time.time() - cv_start_time))

        if save_model or save_score or save_train_val_idx or save_prediction \
                or save_feature_importances:
            imblearn_output_path = \
                os.path.join(self.backend.output_path, self.imblearn_tag)
            create_path(imblearn_output_path)
            if save_score:
                write_file(
                    os.path.join(imblearn_output_path, 'mean_scores.txt'),
                    '{}\n{}\n{}'.format(
                        ','.join(['dataset'] + list(scorers.keys())),
                        ','.join(['test'] + [
                            str(np.mean(results['test_{}'.format(score_name)]))
                            for score_name in scorers.keys()
                        ]), ','.join(['train'] + [
                            str(np.mean(results['train_{}'.format(
                                score_name)]))
                            for score_name in scorers.keys()
                        ]) if return_train_score else -1))

            check_path_while_saving(self.backend.output_path)
            for cv_idx in range(cv_num):
                sub_path = os.path.join(self.imblearn_tag,
                                        "cv_{}".format(cv_idx))
                cv_output_path = \
                    os.path.join(self.backend.output_path, sub_path)
                create_path(cv_output_path)

                if save_score:
                    write_file(
                        os.path.join(cv_output_path, 'scores.txt'),
                        '{}\n{}\n{}'.format(
                            ','.join(['dataset'] + list(scorers.keys())),
                            ','.join(['test'] + [
                                str(results['test_{}'.format(score_name)]
                                    [cv_idx]) for score_name in scorers.keys()
                            ]), ','.join(['train'] + [
                                str(results['train_{}'.format(score_name)]
                                    [cv_idx]) for score_name in scorers.keys()
                            ]) if return_train_score else -1))

                score_model = results['estimators'][cv_idx]
                if save_model:
                    self.backend.save_model(score_model, sub_path)
                if save_feature_importances:
                    self.backend.save_json(
                        self.feature_importances_dict(score_model),
                        sub_path,
                        name='feature_importances')

                if save_train_val_idx:
                    train_idx = results['indices'][cv_idx][0]
                    val_idx = results['indices'][cv_idx][1]
                    write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                               "\n".join(list(map(str, train_idx))))

                    write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                               "\n".join(list(map(str, val_idx))))

                if save_prediction:
                    if 'X_val' in fit_params.keys(
                    ) and 'y_val' in fit_params.keys():
                        test_X = fit_params['X_val']
                        test_y = fit_params['y_val']
                    else:
                        test_X = X[results['indices'][cv_idx][1]] \
                            if isinstance(X, np.ndarray) \
                            else X.iloc[results['indices'][cv_idx][1]]
                        test_y = y[results['indices'][cv_idx][1]] \
                            if isinstance(y, np.ndarray) \
                            else y.iloc[results['indices'][cv_idx][1]]
                    if hasattr(score_model, 'predict_proba'):
                        predictions = score_model.predict_proba(test_X)
                    elif hasattr(score_model, 'decision_function'):
                        predictions = score_model.decision_function(test_X)
                    else:
                        predictions = score_model.predict(test_X)
                    targets_and_predictions = np.array(list(zip(
                        test_y,
                        predictions[:, 1] if predictions.ndim > 1
                        else predictions)))

                    if not isinstance(prediction_types, list_like()):
                        prediction_types = [prediction_types]
                    for predict_type in prediction_types:
                        if predict_type in self.backend.valid_predictions_type:
                            getattr(
                                self.backend,
                                'save_predictions_as_{}'.format(predict_type))(
                                    targets_and_predictions, sub_path)
                        else:
                            raise ValueError(
                                'predict_type {} is unknown, '
                                'Possible values are {}'.format(
                                    predict_type,
                                    self.backend.valid_predictions_type))
        return results, scorers
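The prediction step in Example #9 prefers class probabilities, falls back to decision scores, and finally to plain predictions. A standalone sketch of that fallback order (the function name _best_prediction is hypothetical):

    def _best_prediction(model, X):
        # Order matches Example #9: predict_proba, then decision_function, then predict.
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(X)
        if hasattr(model, 'decision_function'):
            return model.decision_function(X)
        return model.predict(X)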
Example #10
    def _fit_cv(self, X, y, val_size=0.3, random_state=None, scoring=None,
                cv_num=1, cv_params=None, save_train_val_idx=True,
                save_model=True, save_score=True, save_prediction=True,
                prediction_types='dataframe', save_feature_importances=True,
                **fit_params):

        # If the user's cv_params contains a 'cv_num' key, use the larger of
        # the function argument 'cv_num' and cv_params['cv_num'].
        self.backend.logger.info('Start Cross Validation.')
        cv_start_time = time.time()

        if cv_params is None:
            cv_params = {}

        if 'cv_num' in cv_params.keys():
            cv_num = max(cv_num, cv_params['cv_num'])
            cv_params.pop('cv_num')

        if 'scoring' in cv_params.keys():
            cv_params.pop('scoring')

        return_train_score = cv_params.get('return_train_score', True)
        if cv_num > 1:
            if random_state is not False:
                np.random.seed(random_state)
            results, scorers = \
                cross_validate(estimator=self.regressor, scoring=scoring,
                               fit_params=fit_params, X=X, y=y, cv=cv_num,
                               **cv_params)
        else:
            results, scorers = self._fit(
                X, y, self.regressor, val_size=val_size,
                return_train_score=return_train_score,
                random_state=random_state, scoring=scoring, **fit_params)
            cv_num = 1

        # TODO: if more than one scorer is given, score_name can only be the
        #  first of them.
        self.score_name = self.score_name if hasattr(self, 'score_name') \
            else list(scorers.keys())[0]
        self.best_score_, (self.best_model_, self.best_model_tag_) = \
            max(zip(results['test_{}'.format(self.score_name)],
                    zip(results['estimators'],
                        [''] if cv_num == 1 else
                        ["cv_{}".format(i) for i in range(cv_num)])),
                key=lambda x: x[0])

        self.backend.logger.info(
            "\tCV regression finish in {:.4f} seconds.".format(
                time.time() - cv_start_time))
        if save_score:
            write_file(
                os.path.join(self.backend.output_path, 'mean_scores.txt'),
                '{}\n{}\n{}'.format(
                    ','.join(['dataset'] + list(scorers.keys())),
                    ','.join(['test'] +
                             [str(np.mean(results['test_{}'.format(
                                 score_name)]))
                              for score_name in scorers.keys()]),
                    ','.join(['train'] +
                             [str(np.mean(results['train_{}'.format(
                                 score_name)]))
                              for score_name in scorers.keys()])
                    if return_train_score else -1))

        for cv_idx in range(cv_num):
            cv_tag = "cv_{}".format(cv_idx)
            cv_output_path = os.path.join(self.backend.output_path, cv_tag)
            create_path(cv_output_path, merge=True)

            if save_score:
                write_file(os.path.join(cv_output_path, 'scores.txt'),
                           '{}\n{}\n{}'.format(
                               ','.join(['dataset'] + list(scorers.keys())),
                               ','.join(['test'] +
                                        [str(results['test_{}'.format(
                                            score_name)][cv_idx])
                                         for score_name in scorers.keys()]),
                               ','.join(['train'] +
                                        [str(results['train_{}'.format(
                                            score_name)][cv_idx])
                                         for score_name in scorers.keys()])
                               if return_train_score else -1))

            score_model = results['estimators'][cv_idx]
            if save_model:
                self.backend.save_model(score_model, cv_tag)
            if save_feature_importances:
                self.backend.save_json(
                    self.feature_importances_dict(score_model), cv_tag,
                    name='feature_importances')
            if save_train_val_idx:
                train_idx = results['indices'][cv_idx][0]
                val_idx = results['indices'][cv_idx][1]
                write_file(os.path.join(cv_output_path, 'train_idx.txt'),
                           "\n".join(list(map(str, train_idx))))

                write_file(os.path.join(cv_output_path, 'val_idx.txt'),
                           "\n".join(list(map(str, val_idx))))
            if save_prediction:
                predictions = \
                    score_model.predict(X.iloc[results['indices'][cv_idx][1]])
                targets_and_predictions = \
                    np.array(list(zip(y.iloc[results['indices'][cv_idx][1]],
                                      predictions)))

                if not isinstance(prediction_types, list_like()):
                    prediction_types = [prediction_types]
                for predict_type in prediction_types:
                    if predict_type in self.backend.valid_predictions_type:
                        instance = getattr(self.backend,
                                           'save_predictions_as_{}'.format(
                                               predict_type))
                        instance(targets_and_predictions, cv_tag)
                    else:
                        raise ValueError(
                            'predict_type {} is unknown, '
                            'Possible values are {}'.format(
                                predict_type,
                                self.backend.valid_predictions_type))
        return self
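Both _fit_cv variants dispatch prediction saving through getattr on the backend, relying on the save_predictions_as_<type> naming used in Examples #2, #4, #6 and #7. A minimal sketch of that dispatch, assuming backend exposes those methods and a valid_predictions_type collection:

    def save_all_prediction_types(backend, targets_and_predictions, sub_path,
                                  prediction_types='dataframe'):
        # Normalise a single type name into a list, then dispatch each saver.
        if not isinstance(prediction_types, (list, tuple)):
            prediction_types = [prediction_types]
        for predict_type in prediction_types:
            if predict_type not in backend.valid_predictions_type:
                raise ValueError('predict_type {} is unknown, possible values '
                                 'are {}'.format(predict_type,
                                                 backend.valid_predictions_type))
            saver = getattr(backend,
                            'save_predictions_as_{}'.format(predict_type))
            saver(targets_and_predictions, sub_path)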