Example #1
    def _create_eval_feature_cases(params, set_features_to_eval, eval_type):
        if len(set_features_to_eval) == 0:
            raise CatboostError("Provide at least one feature to evaluation")

        # baseline
        test_cases = list()
        baseline_case = ExecutionCase(
            params, ignored_features=list(set_features_to_eval))
        # test
        if (eval_type in (EvalType.All, EvalType.SeqAddAndAll)
                or len(set_features_to_eval) == 1):
            test_cases.append(ExecutionCase(params, ignored_features=[]))
        if eval_type == EvalType.SeqRem:
            for feature_num in set_features_to_eval:
                test_cases.append(
                    ExecutionCase(params, ignored_features=[feature_num]))
        elif eval_type == EvalType.SeqAdd or eval_type == EvalType.SeqAddAndAll:
            for feature_num in set_features_to_eval:
                cur_features = copy(set_features_to_eval)
                cur_features.remove(feature_num)
                test_cases.append(
                    ExecutionCase(params, ignored_features=list(cur_features)))
        elif eval_type != EvalType.All:
            raise AttributeError("Mode {} is not supported".format(eval_type.value))
        return baseline_case, test_cases
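
A simplified sketch of the case-generation rule above, using plain sets in place of ExecutionCase (the helper below is illustrative only and not part of the library):

# Illustrative mirror of _create_eval_feature_cases: returns the baseline
# ignored-feature set and the ignored sets of the test cases for one eval type.
def sketch_eval_feature_cases(features_to_eval, eval_type):
    baseline_ignored = set(features_to_eval)        # the baseline never sees the evaluated features
    if eval_type == 'SeqAdd':
        # each test case "adds back" exactly one evaluated feature
        test_ignored = [baseline_ignored - {f} for f in features_to_eval]
    elif eval_type == 'SeqRem':
        # each test case removes exactly one evaluated feature from the full model
        test_ignored = [{f} for f in features_to_eval]
    else:  # 'All': a single test case with every feature enabled
        test_ignored = [set()]
    return baseline_ignored, test_ignored


print(sketch_eval_feature_cases({3, 7}, 'SeqAdd'))  # e.g. ({3, 7}, [{7}, {3}])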
Example #2
    def close(self):
        if self._file is None:
            raise CatboostError("Trying to close None {}".format(
                self._file_path))

        self._file.close()
        self._file = None
Example #3
 def create_metrics_calcer(self, metrics, thread_count, eval_step=1):
     if not os.path.exists(self._model_path):
         raise CatboostError("Model was deleted. Can't create calcer now")
     model = CatBoost()
     model.load_model(self._model_path)
     return model.create_metric_calcer(metrics,
                                       thread_count=thread_count,
                                       eval_period=eval_step)
Example #4
    def __init__(self, metric_results):
        if len(metric_results) < 1:
            raise CatboostError("Need at least one result")

        self._results = dict()
        self._metrics = dict()
        self._cases = None

        for result in metric_results:
            metric_description = result.get_metric_description()
            if metric_description in self._results:
                raise CatboostError(
                    "Duplicate metric {}".format(metric_description))
            if self._cases is None:
                self._cases = result.get_cases()
            key = metric_description_or_str_to_str(metric_description)
            self._results[key] = result
            self._metrics[key] = metric_description
Example #5
    def eval_features(self,
                      learn_config,
                      objective_function,
                      set_features_to_eval,
                      eval_type=EvalType.All,
                      eval_metrics=None,
                      thread_count=None,
                      eval_step=None):
        """ Evaluate features.
            Args:
            learn_config: dict with params or instance of CatBoost. In second case instance params will be used
            objective_function:
            objective_function: one of CatBoost loss functions
            eval_type: Type of feature evaluate (All, SeqAdd, SeqRem)
            eval_metrics: Additional metrics to calculate
            thread_count: thread_count to use. If not none will override learn_config values
            Returns
            -------
            result : Instance of EvaluationResult class
        """
        if eval_metrics is None:
            eval_metrics = []
        if isinstance(learn_config, CatBoost):
            params = learn_config.get_params()
        else:
            params = dict(learn_config)

        if "loss_function" in params and params[
                "loss_function"] != objective_function:
            raise CatboostError(
                "Loss function in params {} should be equal to feature evaluation objective "
                "function {}".format(params["loss_function"],
                                     objective_function))

        if thread_count is not None:
            params["thread_count"] = thread_count
        else:
            thread_count = 1
            if "thread_count" in learn_config:
                thread_count = params["thread_count"]

        if eval_step is None:
            eval_step = 1

        params["loss_function"] = objective_function
        baseline_case, test_cases = self._create_eval_feature_cases(
            params, set_features_to_eval, eval_type=eval_type)
        if objective_function not in eval_metrics:
            eval_metrics.append(objective_function)

        return self.eval_cases(baseline_case=baseline_case,
                               compare_cases=test_cases,
                               eval_metrics=eval_metrics,
                               thread_count=thread_count,
                               eval_step=eval_step)
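
A minimal usage sketch for eval_features. The import locations, the CatboostEvaluation constructor arguments and the file names below are assumptions and may differ between catboost versions:

# Hedged sketch: import paths and constructor arguments are assumptions.
from catboost.eval.catboost_evaluation import CatboostEvaluation, EvalType

evaluator = CatboostEvaluation('train.tsv',             # hypothetical tab-separated dataset
                               fold_size=5000,          # rows per fold
                               fold_count=4,            # number of folds to learn/evaluate on
                               column_description='train.cd')

result = evaluator.eval_features(learn_config={'iterations': 200, 'learning_rate': 0.1},
                                 objective_function='Logloss',
                                 set_features_to_eval={10, 11},   # indices of the candidate features
                                 eval_type=EvalType.SeqAdd,
                                 eval_metrics=['AUC'],
                                 thread_count=4,
                                 eval_step=10)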
Example #6
 def _change_score_config(self, config):
     if config is not None:
         if isinstance(config, ScoreType):
             if config == ScoreType.Abs:
                 config = ScoreConfig.abs_score()
             elif config == ScoreType.Rel:
                 config = ScoreConfig.rel_score()
             else:
                 raise CatboostError("Unknown scoreType {}".format(config))
         if self._score_config != config:
             self._score_config = config
             self.__clear_comparisons()
Example #7
    def __init__(self, case_results):
        if len(case_results) <= 1:
            raise CatboostError("Need at least 2 case results, got {} ".format(
                len(case_results)))

        self._case_results = dict()
        self._case_comparisons = dict()
        self._cases = [case_result.get_case() for case_result in case_results]

        for case_result in case_results:
            case = case_result.get_case()
            self._case_results[case] = case_result
Example #8
class MetricEvaluationResult:
    """
        Evaluation result for one metric.
        Stores all ExecutionCases with specified metric scores
        Computes human-friendly tables with results and some plots
    """
    def __init__(self, case_results):
        if len(case_results) <= 1:
            raise CatboostError("Need at least 2 case results, got {} ".format(
                len(case_results)))

        self._case_results = dict()
        self._case_comparisons = dict()
        self._cases = [case_result.get_case() for case_result in case_results]

        for case_result in case_results:
            case = case_result.get_case()
            self._case_results[case] = case_result

        self._metric_description = case_results[0].get_metric_description()
        self._baseline_case = case_results[0].get_case()

        self._score_config = ScoreConfig(score_type=ScoreType.Rel,
                                         score_level=0.01,
                                         multiplier=1000)

        for case, case_result in self._case_results.items():
            if case_result.get_metric_description() != self._metric_description:
                raise CatboostError(
                    "Metric names should be equal for all case results")

            if case_result.get_fold_ids() != self.get_fold_ids():
                raise CatboostError(
                    "Case results should be computed on the same folds")

            if case_result.get_eval_step() != self.get_eval_step():
                raise CatboostError(
                    "Eval steps should be equal for different cases")
Example #9
    def _calculate_result_metrics(self,
                                  cases,
                                  metrics,
                                  thread_count=-1,
                                  evaluation_step=1):
        """
        This method calculate metrics and return them.

        Args:
            :param cases: List of the ExecutionCases you want to evaluate
            :param metrics: List of the metrics to be computed
            :param thread_count: Count of thread to use.
            :param: evaluation_step: Step to evaluate metrics
            :return: instance of EvaluationResult
        """
        cases_set = set(cases)
        if len(cases_set) != len(cases):
            raise CatboostError("Found duplicate cases in " + cases)
        current_wd = self.__go_to_working_dir()
        try:
            if self._fold_count <= self._fold_offset:
                error_msg = 'Count of folds (fold_count - offset) needs to be at least one: offset {}, fold_count {}.'
                raise AttributeError(
                    error_msg.format(self._fold_offset, self._fold_count))

            handler = FoldModelsHandler(cases=cases,
                                        metrics=metrics,
                                        eval_step=evaluation_step,
                                        thread_count=thread_count,
                                        remove_models=self._remove_models,
                                        time_split_mode=self._time_split_mode)

            reader = _SimpleStreamingFileReader(
                self._path_to_dataset,
                sep=self._delimiter,
                group_feature_num=self._group_feature_num)
            splitter = _Splitter(reader,
                                 self._column_description,
                                 seed=self._seed,
                                 min_folds_count=self._min_fold_count,
                                 time_split_mode=self._time_split_mode)

            result = handler.proceed(splitter=splitter,
                                     fold_size=self._fold_size,
                                     folds_count=self._fold_count,
                                     fold_offset=self._fold_offset)

            return self._create_evaluation_results(result)
        finally:
            os.chdir(current_wd)
Example #10
    def _add(self, model, learning_curve):
        if model.get_case() != self._case:
            raise CatboostError("Model case should be equal to result case")

        fold_id = model.get_fold_id()

        self._fold_curves[fold_id] = learning_curve
        if self._metric_description.is_max_optimal():
            score = max(learning_curve)
            position = np.argmax(learning_curve)
        else:
            score = min(learning_curve)
            position = np.argmin(learning_curve)

        self._fold_metric.at[fold_id] = score
        self._fold_metric_iteration.at[fold_id] = position
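
The score extraction above reduces to a max/argmax or min/argmin over the learning curve, depending on whether the metric is maximized; a self-contained illustration with synthetic numbers:

import numpy as np

# Metric values computed every eval_step iterations (made-up numbers).
learning_curve = [0.61, 0.68, 0.74, 0.73, 0.72]

is_max_optimal = True            # e.g. AUC; for Logloss or RMSE this would be False
if is_max_optimal:
    score, position = max(learning_curve), int(np.argmax(learning_curve))
else:
    score, position = min(learning_curve), int(np.argmin(learning_curve))

print(score, position)           # 0.74 2 -> best value and the index of the best evaluation point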
Example #11
    def _compute_metrics(self, metrics, grouped_by_case_models, learn_folds,
                         skipped_folds, rest_folds):
        metric_calcers = {}

        for case, case_models in grouped_by_case_models.items():
            metric_calcers[case] = list()
            for case_model in case_models:

                metric_calcer = case_model.create_metrics_calcer(
                    metrics,
                    eval_step=self._eval_step,
                    thread_count=self._thread_count)
                metric_calcers[case].append(metric_calcer)

                if self._metric_descriptions is None:
                    self._init_case_results(
                        metric_calcer.metric_descriptions())
                elif self._metric_descriptions != metric_calcer.metric_descriptions():
                    raise CatboostError(
                        "Error: metric names should be consistent")

        for file_num, fold_file in enumerate(skipped_folds + learn_folds +
                                             rest_folds):
            pool = FoldModelsHandler._create_pool(fold_file,
                                                  self._thread_count)

            for case, case_models in grouped_by_case_models.items():
                calcers = metric_calcers[case]

                for model_num, model in enumerate(case_models):
                    if self._time_split_mode:
                        if file_num <= (model_num + len(skipped_folds)):
                            continue
                    elif file_num == (model_num + len(skipped_folds)):
                        continue
                    calcers[model_num].add(pool)

        for case, case_models in grouped_by_case_models.items():
            calcers = metric_calcers[case]
            case_results = self._case_results[case]
            for calcer, model in zip(calcers, case_models):
                scores = calcer.eval_metrics()
                for metric in self._metric_descriptions:
                    case_results[metric]._add(model, scores.get_result(metric))
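
A small sketch of the skip rule above: in time-split mode a model's metric calcer receives only fold files strictly after position model_num + len(skipped_folds) in the concatenated fold list, while in the standard mode every fold except that one position is used (the numbers below are illustrative):

# Which positions in (skipped_folds + learn_folds + rest_folds) a model is evaluated on.
def folds_used_for_eval(model_num, n_skipped, n_total, time_split_mode):
    used = []
    for file_num in range(n_total):
        if time_split_mode:
            if file_num <= model_num + n_skipped:
                continue                      # only folds after the training prefix are scored
        elif file_num == model_num + n_skipped:
            continue                          # standard mode: skip the model's own learn fold
        used.append(file_num)
    return used


print(folds_used_for_eval(model_num=1, n_skipped=1, n_total=5, time_split_mode=True))   # [3, 4]
print(folds_used_for_eval(model_num=1, n_skipped=1, n_total=5, time_split_mode=False))  # [0, 1, 3, 4]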
Example #12
 def open(self):
     if self._file is None:
         self._file = open(self._file_path, mode='a')
     else:
         raise CatboostError("File already opened {}".format(
             self._file_path))
Example #13
                    case_results[metric]._add(model, scores.get_result(metric))

    @staticmethod
    def _fit_model(pool, case, fold_id, model_path, time_split_mode=False):
        from catboost import CatBoost
        # Learn model
        make_dirs_if_not_exists(FoldModelsHandler.__MODEL_DIR)

        feature_count = pool.num_col()
        if "ignored_features" in case.get_params():
            ignored_features = case.get_params()["ignored_features"]
            if ignored_features and max(ignored_features) >= feature_count:
                raise CatboostError(
                    "Error: input parameter contains feature indices which are not available in the pool: {}\n"
                    "Check the eval_feature set and the ignored_features option".format(ignored_features))
        get_eval_logger().debug('Learn model {} on fold #{}'.format(
            str(case), fold_id))
        cur_time = time.time()
        params = case.get_params()
        if time_split_mode:
            params['has_time'] = True
        instance = CatBoost(params=params)
        instance.fit(pool)
        instance.save_model(fname=model_path)

        get_eval_logger().debug(
            'Operation was done in {} seconds'.format(time.time() - cur_time))
        return FoldModel(case, model_path, fold_id)
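
For reference, a standalone sketch of the fitting step on a tiny synthetic pool; in time-split mode the code above additionally sets has_time=True so that CatBoost uses the given row order instead of a random permutation (the data and file name below are made up):

from catboost import CatBoost, Pool

# Tiny synthetic pool standing in for one fold file.
pool = Pool(data=[[0, 1], [1, 0], [1, 1], [0, 0]], label=[0, 1, 1, 0])

params = {'iterations': 10, 'loss_function': 'Logloss', 'verbose': False,
          'has_time': True}      # what _fit_model adds when time_split_mode is set
model = CatBoost(params=params)
model.fit(pool)
model.save_model(fname='fold_0_model.cbm')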
Example #14
class FoldModelsHandler(object):
    """
    Class that is responsible for learning models and computing their metrics
    """
    """
       All models are placed to the default directory "models".
       """
    __MODEL_DIR = 'models'

    @staticmethod
    def _remove_model_dir():
        try:
            if os.path.exists(FoldModelsHandler.__MODEL_DIR):
                os.rmdir(FoldModelsHandler.__MODEL_DIR)
        except OSError as err:
            get_eval_logger().warning(str(err))

    def __init__(self, metrics, cases, thread_count, eval_step, remove_models):
        """
        Args:
            :param remove_models: Set true if you want models to be removed after applying them.

        """
        self._cases = cases
        self._metrics = metrics

        self._case_results = dict()
        for case in self._cases:
            self._case_results[case] = dict()

        self._thread_count = thread_count
        self._eval_step = eval_step
        self._flag_remove_models = remove_models
        self._metric_descriptions = None

    def _init_case_results(self, metric_descriptions):
        self._metric_descriptions = metric_descriptions
        for case in self._cases:
            case_result = self._case_results[case]
            for metric_description in self._metric_descriptions:
                case_result[metric_description] = CaseEvaluationResult(
                    case, metric_description, eval_step=self._eval_step)

    def _compute_metrics(self, metrics, grouped_by_case_models, learn_folds,
                         skipped_folds, rest_folds):
        metric_calcers = {}

        for case, case_models in grouped_by_case_models.items():
            metric_calcers[case] = list()
            for case_model in case_models:

                metric_calcer = case_model.create_metrics_calcer(
                    metrics,
                    eval_step=self._eval_step,
                    thread_count=self._thread_count)
                metric_calcers[case].append(metric_calcer)

                if self._metric_descriptions is None:
                    self._init_case_results(
                        metric_calcer.metric_descriptions())
                elif self._metric_descriptions != metric_calcer.metric_descriptions():
                    raise CatboostError(
                        "Error: metric names should be consistent")

        for file_num, fold_file in enumerate(learn_folds + skipped_folds +
                                             rest_folds):
            pool = FoldModelsHandler._create_pool(fold_file,
                                                  self._thread_count)

            for case, case_models in grouped_by_case_models.items():
                calcers = metric_calcers[case]

                for model_num, model in enumerate(case_models):
                    if file_num != model_num:
                        calcers[model_num].add(pool)

        for case, case_models in grouped_by_case_models.items():
            calcers = metric_calcers[case]
            case_results = self._case_results[case]
            for calcer, model in zip(calcers, case_models):
                scores = calcer.eval_metrics()
                for metric in self._metric_descriptions:
                    case_results[metric]._add(model, scores.get_result(metric))

    @staticmethod
    def _fit_model(pool, case, fold_id, model_path):
        from catboost import CatBoost
        # Learn model
        make_dirs_if_not_exists(FoldModelsHandler.__MODEL_DIR)

        feature_count = pool.num_col()
        if "ignored_features" in case.get_params():
            ignored_features = case.get_params()["ignored_features"]
            if ignored_features and max(ignored_features) >= feature_count:
                raise CatboostError(
                    "Error: input parameter contains feature indices which are not available in the pool: {}\n"
                    "Check the eval_feature set and the ignored_features option".format(ignored_features))
Example #15
    def delete(self):
        if self._file is not None:
            raise CatboostError("Close file before delete")

        if os.path.exists(self._file_path):
            os.remove(self._file_path)
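
Examples #2, #12 and #15 come from the same small file wrapper; below is a self-contained sketch of its intended open/close/delete life cycle (the class name is made up, and RuntimeError stands in for CatboostError to keep the sketch dependency-free):

import os


class _FileSketch(object):
    """Minimal stand-in for the wrapper used in examples #2, #12 and #15."""

    def __init__(self, file_path):
        self._file_path = file_path
        self._file = None

    def open(self):
        if self._file is not None:
            raise RuntimeError("File already opened {}".format(self._file_path))
        self._file = open(self._file_path, mode='a')

    def close(self):
        if self._file is None:
            raise RuntimeError("Trying to close a file that is not open: {}".format(self._file_path))
        self._file.close()
        self._file = None

    def delete(self):
        if self._file is not None:
            raise RuntimeError("Close file before delete")
        if os.path.exists(self._file_path):
            os.remove(self._file_path)


# A file has to be closed before it can be deleted.
f = _FileSketch('fold_0.tsv')
f.open()
f.close()
f.delete()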
Example #16
    def eval_features(self,
                      learn_config,
                      features_to_eval,
                      loss_function=None,
                      eval_type=EvalType.SeqAdd,
                      eval_metrics=None,
                      thread_count=-1,
                      eval_step=None,
                      label_mode=LabelMode.AddFeature):
        """ Evaluate features.
            Args:
            learn_config: dict with params or instance of CatBoost. In second case instance params will be used
            objective_function:
            objective_function: one of CatBoost loss functions
            eval_type: Type of feature evaluate (All, SeqAdd, SeqRem)
            eval_metrics: Additional metrics to calculate
            thread_count: thread_count to use. If not none will override learn_config values
            Returns
            -------
            result : Instance of EvaluationResult class
        """
        features_to_eval = set(features_to_eval)
        if eval_metrics is None:
            eval_metrics = []
        eval_metrics = eval_metrics if isinstance(eval_metrics, list) else [eval_metrics]
        if isinstance(learn_config, CatBoost):
            params = learn_config.get_params()
        else:
            params = dict(learn_config)

        if loss_function is not None:
            if "loss_function" in params and params["loss_function"] != loss_function:
                raise CatboostError("Loss function in params {} should be equal to feature evaluation objective "
                                    "function {}".format(params["loss_function"], loss_function))
        else:
            if "loss_function" not in params:
                raise CatboostError("Provide loss function in params or as option to eval_features method")

        if thread_count is not None and thread_count != -1:
            params["thread_count"] = thread_count

        if eval_step is None:
            eval_step = 1

        if loss_function is not None:
            params["loss_function"] = loss_function
        else:
            loss_function = params["loss_function"]

        if params["loss_function"] == "PairLogit":
            raise CatboostError("Pair classification is not supported")

        baseline_case, test_cases = self._create_eval_feature_cases(params,
                                                                    features_to_eval,
                                                                    eval_type=eval_type,
                                                                    label_mode=label_mode)
        if loss_function not in eval_metrics:
            eval_metrics.append(loss_function)

        return self.eval_cases(baseline_case=baseline_case,
                               compare_cases=test_cases,
                               eval_metrics=eval_metrics,
                               thread_count=thread_count,
                               eval_step=eval_step)
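
A call sketch for this newer signature, reusing the hypothetical evaluator from the sketch after Example #5; loss_function may be omitted when learn_config already contains it, and a single metric may be passed as a plain string:

# evaluator is the hypothetical CatboostEvaluation instance from the earlier sketch.
result = evaluator.eval_features(learn_config={'iterations': 200, 'learning_rate': 0.1},
                                 features_to_eval=[10, 11],   # candidate feature indices
                                 loss_function='Logloss',     # optional if learn_config already sets it
                                 eval_type=EvalType.SeqAdd,
                                 eval_metrics='AUC',          # a single metric string is wrapped into a list
                                 eval_step=10)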
Example #17
 def _validate_ignored_features(ignored_features, eval_features):
     for eval_feature in eval_features:
         if eval_feature in ignored_features:
             raise CatboostError(
                 "Feature {} is in both the ignored set and the features-to-eval set".format(eval_feature))