Example #1
import csv
import datetime

from skopt import BayesSearchCV
from skopt.callbacks import DeltaXStopper


def tune_parameter(X, y, clf, params):
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    gs = BayesSearchCV(estimator=clf,
                       search_spaces=params,
                       scoring="f1",
                       n_iter=100,
                       optimizer_kwargs={"base_estimator": "GP"},
                       verbose=2,
                       n_jobs=-1,
                       cv=4,
                       refit=True,
                       random_state=1234)
    gs.fit(X, y, callback=DeltaXStopper(0.000001))
    best_params = gs.best_params_
    best_score = gs.best_score_
    print(best_params)
    print(best_score)
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    with open("kuaishou_stats.csv", 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["the best params for svm: "])
        for key, value in best_params.items():
            writer.writerow([key, value])
        writer.writerow(["the best score for svm: ", best_score, str_time])
    return gs
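
A minimal usage sketch for the helper above, assuming synthetic binary data and an SVC with an illustrative skopt search space (the dataset, estimator, and parameter bounds are assumptions, not part of the original source):

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from skopt.space import Categorical, Real

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
svm_space = {
    "C": Real(1e-3, 1e3, prior="log-uniform"),
    "gamma": Real(1e-4, 1e1, prior="log-uniform"),
    "kernel": Categorical(["rbf", "poly"]),
}
search = tune_parameter(X, y, SVC(), svm_space)
print(search.best_estimator_)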
Example #2
def optimize(config, objective_fn, initial_points=None, logger_fn=None):
    best_params = {}
    best_score = 0.0
    metadata = None

    def skopt_logger(result):
        x0 = result.x_iters  # list of input points
        y0 = result.func_vals  # evaluation of input points

        num_iters = len(x0)
        params = config.param_dict_from_values(x0[-1])
        params = _convert_param_types(params)
        score = -1 * y0[-1]

        if logger_fn:
            logger_fn(num_iters, score, params)

    if config.selected_method == 'grid':
        grid_space = config.param_dict_from_values(
            [list(dim.categories) for dim in config.space])
        for i, params in enumerate(list(ParameterGrid(grid_space))):
            # keep the same order as in the configuration to make reading logs easier
            ordered_params = {k: params[k] for k in config.dimension_names()}
            score = -1 * objective_fn(ordered_params)

            if logger_fn:
                logger_fn(i + 1, score, ordered_params)

            if score > best_score:
                best_score = score
                best_params = ordered_params.copy()

    elif config.selected_method == 'bayesian' or config.selected_method == 'random':

        if config.selected_method == 'random':
            config.num_initial_points = config.num_iterations

        def list_based_objective_fn(param_values):
            """Convert params to a dict first."""
            return objective_fn(config.param_dict_from_values(param_values))

        res = gp_minimize(
            func=list_based_objective_fn,
            dimensions=config.space,
            # n_calls is the total number of calls to func, including the initial points
            n_calls=config.num_iterations,
            # random points used to seed the optimization process
            n_initial_points=config.num_initial_points,
            verbose=False,
            callback=[DeltaXStopper(0.001), skopt_logger],
            x0=initial_points)
        best_params = config.param_dict_from_values(res.x)
        best_score = -1 * res.fun
        metadata = res
    else:
        raise ValueError(f"Unsupported method: {config.selected_method}")

    final_params = merge_params([config.default, best_params])
    return (best_score, _convert_param_types(best_params),
            _convert_param_types(final_params), metadata)
Example #3

def get_bayes_scikit_score_cv(X_train, y_train, X_test, y_test, X_val=None,
                              y_val=None, max_evals=25, folds=5, original=None):

    space = get_baesian_space(dictem=True)
    opt_cat = BayesSearchCV(CatBoostClassifier(logging_level='Silent'),
                            space['CAT'], n_iter=max_evals, random_state=0)
    opt_xgb = BayesSearchCV(XGBClassifier(), space['XGB'],
                            n_iter=max_evals, random_state=0)
    opt_lgbm = BayesSearchCV(LGBMClassifier(), space['LGBM'],
                             n_iter=max_evals, random_state=0)
    _ = opt_cat.fit(X_train, y_train,
                    callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    __ = opt_xgb.fit(X_train, y_train,
                     callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])
    ___ = opt_lgbm.fit(X_train, y_train,
                       callback=[DeltaXStopper(0.01), DeltaYStopper(0.01)])

    scores = [opt_cat.score(X_test, y_test), opt_xgb.score(X_test, y_test),
              opt_lgbm.score(X_test, y_test)]
    train_scores = [opt_cat.best_score_, opt_xgb.best_score_, opt_lgbm.best_score_]
    score = max(scores)
    cross_score = max(train_scores)
    neptune.log_metric(f'skopt-{max_evals}-iterations-{folds}-folds', score)
    neptune.log_metric('skopt train holdout score', cross_score)
    return score
Example #4
def test_early_stopping_delta_x(minimizer):
    n_calls = 11
    res = minimizer(bench1,
                    callback=DeltaXStopper(0.1),
                    dimensions=[(-1., 1.)],
                    x0=[[-0.1], [0.1], [-0.9]],
                    n_calls=n_calls,
                    n_random_starts=0, random_state=1)
    assert len(res.x_iters) < n_calls
Example #5
def test_early_stopping_delta_x_empty_result_object(minimizer):
    # check that the callback handles the case of being passed an empty
    # results object, e.g. at the start of the optimization loop
    n_calls = 15
    res = minimizer(bench1,
                    callback=DeltaXStopper(0.1),
                    dimensions=[(-1., 1.)],
                    n_calls=n_calls,
                    n_random_starts=1, random_state=1)
    assert len(res.x_iters) < n_calls
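
Both tests above rely on a pytest minimizer fixture (presumably parametrized over skopt's minimizers) and the bench1 benchmark from skopt's test suite. A standalone sketch of the same behaviour, assuming gp_minimize as the minimizer and a simple quadratic standing in for bench1:

from skopt import gp_minimize
from skopt.callbacks import DeltaXStopper

res = gp_minimize(lambda x: x[0] ** 2,
                  dimensions=[(-1.0, 1.0)],
                  n_calls=15,
                  n_random_starts=1,
                  callback=DeltaXStopper(0.1),
                  random_state=1)
# DeltaXStopper ends the run once the last two evaluated points lie less than
# 0.1 apart, so fewer than n_calls evaluations are typically performed.
print(len(res.x_iters), res.x)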
Example #6

import time

from skopt import BayesSearchCV
from skopt.callbacks import DeltaXStopper


def optimized_bayesian_search_2(X, y, model, parameter, seed, eval_method):
    start_time = time.time()
    search = BayesSearchCV(
                   model,
                   parameter,
                   n_jobs=-1,
                   n_iter=200,
                   scoring=eval_method,
                   cv=5,
                   random_state=seed,
                   verbose=0,
                   optimizer_kwargs={'base_estimator': 'GP'}
    )
    search.fit(X, y, callback=DeltaXStopper(0.0001))
    end_time = time.time()

    return [search.cv_results_, search.best_index_, end_time - start_time]
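
A hypothetical call sketch for this helper, assuming a RandomForestClassifier scored by ROC AUC over an illustrative search space (all of the names and ranges below are assumptions, not from the original snippet):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Integer, Real

X, y = make_classification(n_samples=300, random_state=0)
rf_space = {
    'n_estimators': Integer(50, 300),
    'max_depth': Integer(2, 12),
    'min_samples_leaf': Real(0.01, 0.2),
}
# The helper runs up to 200 Bayesian iterations with 5-fold CV, so even this
# small example can take a few minutes.
cv_results, best_index, elapsed = optimized_bayesian_search_2(
    X, y, RandomForestClassifier(), rf_space, seed=42, eval_method='roc_auc')
print(best_index, elapsed)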
Example #7
# The opening of report_perf is missing from this excerpt; the preamble below is
# a reconstruction assumed from the call further down (fit with optional
# callbacks, then collect the cross-validation results).
def report_perf(optimizer, X, y, title, callbacks=None):
    start = time()
    if callbacks is not None:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    best_score = optimizer.best_score_
    best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_

    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           + u"\u00B1" + " %.3f") % (time() - start,
                                     len(optimizer.cv_results_['params']),
                                     best_score,
                                     best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params


# Converting average precision score into a scorer suitable for model selection
avg_prec = make_scorer(average_precision_score, greater_is_better=True, needs_proba=True)
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=avg_prec,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22,
                    return_train_score=False,
                    )

best_params = report_perf(opt, X, y, 'LightGBM',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60 * 5)])
Example #8
    def _check_parameters(self):
        """Check the validity of the input parameters."""
        if self.mapping is None:
            self.mapping = {str(v): v for v in sorted(self.y.unique())}

        if self.scaled is None:
            self.scaled = check_scaling(self.X)

        # Create model subclasses ================================== >>

        models = []
        for m in self._models:
            if isinstance(m, str):
                acronym = get_acronym(m, must_be_equal=False)

                # Check if packages for non-sklearn models are available
                if acronym in OPTIONAL_PACKAGES:
                    try:
                        importlib.import_module(OPTIONAL_PACKAGES[acronym])
                    except ImportError:
                        raise ValueError(
                            f"Unable to import the {OPTIONAL_PACKAGES[acronym]} "
                            "package. Make sure it is installed.")

                # Check for regression/classification-only models
                if self.goal.startswith("class") and acronym in ONLY_REG:
                    raise ValueError(
                        f"The {acronym} model can't perform classification tasks!"
                    )
                elif self.goal.startswith("reg") and acronym in ONLY_CLASS:
                    raise ValueError(
                        f"The {acronym} model can't perform regression tasks!")

                models.append(MODEL_LIST[acronym](self,
                                                  acronym + m[len(acronym):]))

            elif not isinstance(m, BaseModel):  # Model is custom estimator
                models.append(CustomModel(self, estimator=m))

            else:  # Model is already a model subclass (can happen with reruns)
                models.append(m)

        self._models = CustomDict({m.name: m for m in models})

        # Check validity metric ==================================== >>

        if None in self._metric:
            self._metric = CustomDict(get_default_metric(self.task))

        # Ignore if it's the same metric as previous call
        elif not all([hasattr(m, "name") for m in self._metric]):
            self._metric = self._prepare_metric(
                metric=self._metric,
                greater_is_better=self.greater_is_better,
                needs_proba=self.needs_proba,
                needs_threshold=self.needs_threshold,
            )

        # Check validity sequential parameters ===================== >>

        for param in ["n_calls", "n_initial_points", "bagging"]:
            p = lst(getattr(self, param))
            if len(p) != 1 and len(p) != len(self._models):
                raise ValueError(
                    f"Invalid value for the {param} parameter. Length "
                    "should be equal to the number of models, got len"
                    f"(models)={len(self._models)} and len({param})={len(p)}.")

            for i, model in enumerate(self._models):
                if param in ("n_calls", "bagging") and p[i % len(p)] < 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >=0, got {p[i % len(p)]}.")
                elif param == "n_initial_points" and p[i % len(p)] <= 0:
                    raise ValueError(
                        f"Invalid value for the {param} parameter. "
                        f"Value should be >0, got {p[i % len(p)]}.")

                setattr(model, "_" + param, p[i % len(p)])

        # Prepare bo parameters ===================================== >>

        # Choose a base estimator (GP is chosen as default)
        self._base_estimator = self.bo_params.get("base_estimator", "GP")
        if isinstance(self._base_estimator, str):
            if self._base_estimator.lower() not in ("gp", "et", "rf", "gbrt"):
                raise ValueError(
                    f"Invalid value for the base_estimator parameter, got "
                    f"{self._base_estimator}. Value should be one of: 'GP', "
                    f"'ET', 'RF', 'GBRT'.")

        if self.bo_params.get("callbacks"):
            self._callbacks = lst(self.bo_params["callbacks"])

        if "max_time" in self.bo_params:
            if self.bo_params["max_time"] <= 0:
                raise ValueError(
                    "Invalid value for the max_time parameter. "
                    f"Value should be >0, got {self.bo_params['max_time']}.")
            self._callbacks.append(DeadlineStopper(self.bo_params["max_time"]))

        if "delta_x" in self.bo_params:
            if self.bo_params["delta_x"] < 0:
                raise ValueError(
                    "Invalid value for the delta_x parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_x']}.")
            self._callbacks.append(DeltaXStopper(self.bo_params["delta_x"]))

        if "delta_y" in self.bo_params:
            if self.bo_params["delta_y"] < 0:
                raise ValueError(
                    "Invalid value for the delta_y parameter. "
                    f"Value should be >=0, got {self.bo_params['delta_y']}.")
            self._callbacks.append(
                DeltaYStopper(self.bo_params["delta_y"], n_best=5))

        if self.bo_params.get("plot"):
            self._callbacks.append(PlotCallback(self))

        if "cv" in self.bo_params:
            if self.bo_params["cv"] <= 0:
                raise ValueError(
                    "Invalid value for the max_time parameter. "
                    f"Value should be >=0, got {self.bo_params['cv']}.")
            self._cv = self.bo_params["cv"]

        if "early_stopping" in self.bo_params:
            if self.bo_params["early_stopping"] <= 0:
                raise ValueError(
                    "Invalid value for the early_stopping parameter. "
                    f"Value should be >=0, got {self.bo_params['early_stopping']}."
                )
            self._early_stopping = self.bo_params["early_stopping"]

        # Add custom dimensions to every model subclass
        if self.bo_params.get("dimensions"):
            for name, model in self._models.items():
                # If not dict, the dimensions are for all models
                if not isinstance(self.bo_params["dimensions"], dict):
                    model._dimensions = self.bo_params["dimensions"]
                else:
                    # Dimensions for every specific model
                    for key, value in self.bo_params["dimensions"].items():
                        # Parameters for this model only
                        if key.lower() == name:
                            model._dimensions = value
                            break

        kwargs = [
            "base_estimator",
            "max_time",
            "delta_x",
            "delta_y",
            "early_stopping",
            "cv",
            "callbacks",
            "dimensions",
            "plot",
        ]

        # The remaining bo_params are added as kwargs to the optimizer
        self._bo_kwargs = {
            k: v
            for k, v in self.bo_params.items() if k not in kwargs
        }

        # Prepare est_params ======================================= >>

        if self.est_params:
            for name, model in self._models.items():
                params = {}
                for key, value in self.est_params.items():
                    # Parameters for this model only
                    if key.lower() == name:
                        params.update(value)
                    # Parameters for all models
                    elif key.lower() not in self._models.keys():
                        params.update({key: value})

                for key, value in params.items():
                    if key.endswith("_fit"):
                        model._est_params_fit[key[:-4]] = value
                    else:
                        model._est_params[key] = value
Example #9

def main():

    start_time_main = time.time()

    print_info('Reading config files...', ':')
    run_config = config_file_to_dict(config_path + 'run_params.conf')
    data_config = config_file_to_dict(config_path + 'data_params.conf')
    model_config = config_file_to_dict(config_path + 'model_params.conf')

    if run_mode_user in run_config:
        frac_train_sample = run_config[run_mode_user]['frac_train_sample']
        num_test_samples = run_config[run_mode_user]['num_test_samples']
        num_CV_folds = run_config[run_mode_user]['num_CV_folds']
        do_optimize_params = run_config[run_mode_user]['do_optimize_params']
        n_iter = run_config[run_mode_user]['n_iter']
        print_info('Chosen run mode is {}: {}'.format(
            run_mode_user, run_config[run_mode_user]))
    else:
        raise KeyError('{} is not a valid run mode setting ' \
                       '(use, e.g., "run_params")'.format(
            run_mode_user))

    # collection of performance measures to be applied to the test set(s)
    scoring_funcs = ['accuracy_score', 'precision_score', 'recall_score', \
                     'f1_score']

    final_results_labels = [
        'dataset', 'model', 'model_params', 'num_test_sets', 'num_CV_folds',
        'elapsed_time_train', 'elapsed_time_test'
    ]
    final_results_labels += ['test_{}_1fold'.format(i) for i in scoring_funcs]
    final_results_labels += ['train_{}'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}'.format(i) for i in scoring_funcs]
    final_results_labels += [
        'test_{}_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_max'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_max_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_mean'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_std'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_mean_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_std_bootstrap'.format(i) for i in scoring_funcs
    ]

    final_results = pd.DataFrame(columns=final_results_labels)

    # loop over all sections of the data params config file
    for d_cnt, d in enumerate(data_config):

        print_info(
            'Processing dataset: {} ({} of {})'.format(d, d_cnt + 1,
                                                       len(data_config)), '=',
            50)

        current_data_results = {}

        current_data_params = data_config[d]
        check_data_config_requirements(current_data_params)

        print_info('Loading data...', ':')
        data = load_data(current_data_params)

        print_info('Preparing target vector...', ':')
        X = data.drop(current_data_params['data_target_col'], axis=1)
        y = data[current_data_params['data_target_col']]

        y = parse_target_labels(
            y, current_data_params['data_target_positive_label'],
            current_data_params['data_target_negative_label'])

        del data

        print_info('Dimensions of feature matrix X: {}'.format(X.shape))
        print_info('Dimensions of target vector y:  {}'.format(y.shape))

        print_info('Splitting the data: splitting off the training sample...',
                   ':')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 - frac_train_sample)

        del X, y

        print_info('Preprocessing the data...', ':')
        pm = PreprocessingManager(om.get_session_folder())

        for func in current_data_params['data_preprocessing']:
            X_train = getattr(pm, func)(X_train, False)
            X_test = getattr(pm, func)(X_test, True)

        print_class_counts(y_train, 'training', background=0, signal=1)
        print_class_counts(y_test, 'test', background=0, signal=1)

        # hyperparameter optimization, if required
        if num_CV_folds is None:
            print_info('Optimizing the number of cross-validation folds...',
                       ':')
            num_CV_folds = get_optimal_CV_n_folds(X_train.values,
                                                  y_train.values)

        for mod in model_config:

            print_info('Training model: {}'.format(mod), '-', 50)

            try:
                model_params = model_config[mod]
                model = supported_models[mod](**model_params)
            except KeyError:
                raise KeyError('Model {} not supported. Choose a valid input ' \
                               'from this list: {}'.format(mod, supported_models))

            fitkwargs = {'X': X_train, 'y': y_train}
            if do_optimize_params:
                print_info('Optimizing hyperparameters...', ':')
                model = hyperparameter_search(model, n_iter, num_CV_folds)
                if mod != 'GaussianNB':
                    fitkwargs['callback'] = DeltaXStopper(1e-2)

            start_time_train = time.time()
            print_info('Fitting the model...', ':')
            model.fit(**fitkwargs)
            elapsed_time_train = time.time() - start_time_train

            model_parameters = get_search_results(model)

            # evaluate model on the training sample
            print_info('Evaluating the model on the training sample...', ':')
            for scoring_func in scoring_funcs:
                try:
                    model_scores_train = evaluate_nfold(X_train,
                                                        y_train,
                                                        model,
                                                        1,
                                                        scoring=scoring_func)
                    current_data_results['train_{}'.format(
                        scoring_func)] = model_scores_train[0]
                except ValueError:
                    warnings.warn('ValueError when evaluating with {}. ' \
                                  'Ignoring and continuing...'.format(
                                      scoring_func))

            # evaluate model on the test sample(s)
            print_info('Evaluating the model on the test sample(s)...', ':')

            test_performance_1fold = -1  # must be initialized with a negative number

            for t in range(1, num_test_samples + 1):

                start_time_test = time.time()

                for scoring_func in scoring_funcs:
                    try:
                        model_scores_test = evaluate_nfold(
                            X_test,
                            y_test,
                            model,
                            t,
                            scoring=scoring_func,
                            bootstrapping=False)
                        model_scores_test_bootstrap = evaluate_nfold(
                            X_test,
                            y_test,
                            model,
                            t,
                            scoring=scoring_func,
                            bootstrapping=True)

                        if test_performance_1fold < 0:
                            test_performance_1fold = model_scores_test[0]
                        else:
                            pass

                        current_data_results['test_{}_1fold'.format(
                            scoring_func)] = test_performance_1fold

                        current_data_results['test_{}'.format(
                            scoring_func)] = str(model_scores_test)
                        current_data_results['test_{}_bootstrap'.format(
                            scoring_func)] = str(model_scores_test_bootstrap)

                        current_data_results['test_{}_diff_max'.format(
                            scoring_func
                        )] = max(model_scores_test) - min(model_scores_test)

                        current_data_results[
                            'test_{}_diff_max_bootstrap'.format(
                                scoring_func
                            )] = max(model_scores_test_bootstrap) - min(
                                model_scores_test_bootstrap)

                        scores_mean, scores_std = performance_difference(
                            model_scores_test)
                        current_data_results['test_{}_diff_mean'.format(
                            scoring_func)] = scores_mean
                        current_data_results['test_{}_diff_std'.format(
                            scoring_func)] = scores_std

                        scores_mean_bootstrap, scores_std_bootstrap = performance_difference(
                            model_scores_test_bootstrap)
                        current_data_results[
                            'test_{}_diff_mean_bootstrap'.format(
                                scoring_func)] = scores_mean_bootstrap
                        current_data_results[
                            'test_{}_diff_std_bootstrap'.format(
                                scoring_func)] = scores_std_bootstrap

                    except ValueError:
                        warnings.warn('ValueError when evaluating with {}. ' \
                                      'Ignoring and continuing...'.format(
                                          scoring_func))
                        current_data_results['test_{}_1fold'.format(
                            scoring_func)] = -1
                        #current_data_results['test_{}'.format(
                        #    scoring_func)] = "-1"
                        #current_data_results['test_{}_bootstrap'.format(
                        #    scoring_func)] = "-1"
                        current_data_results['test_{}_diff_mean'.format(
                            scoring_func)] = -1
                        current_data_results['test_{}_diff_std'.format(
                            scoring_func)] = -1
                        current_data_results['test_{}_diff_mean_bootstrap'.
                                             format(scoring_func)] = -1
                        current_data_results['test_{}_diff_std_bootstrap'.
                                             format(scoring_func)] = -1

                print_info('Model score differences (mean, std) for {} ' \
                           'test sample folds: {:.5f}, {:.5f}'.format(
                               t, scores_mean, scores_std))

                model_params_string = ','.join('{}:{}'.format(key, val) \
                                              for key, val in \
                                               sorted(model_parameters.items()))

                current_data_results['dataset'] = str(d)
                current_data_results['model'] = str(mod)
                current_data_results['model_params'] = model_params_string
                current_data_results['num_test_sets'] = t
                current_data_results['num_CV_folds'] = num_CV_folds
                current_data_results['elapsed_time_train'] = elapsed_time_train
                current_data_results['elapsed_time_test'] = (
                    time.time() - start_time_test)

                final_results = pd.concat(
                    [final_results, pd.DataFrame([current_data_results])],
                    ignore_index=True)

        print_info('Creating results plots...', ':')
        scoring_func_plot = 'f1_score'

        train_differences = []

        current_data_plot_nsplits = final_results.query(
            '(dataset=="{}") & (model=="{}")'.format(d, mod))['num_test_sets']

        # explicit conversion to floats is necessary for the np.isfinite method,
        # which is implicitly called during plotting
        current_data_plot_xyvals = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]

        current_data_plot_xyvals_max = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_max_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]

        for mod in model_config:
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std'.format(
                        scoring_func_plot)].values.astype(np.float32))

            current_data_plot_xyvals_max.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max.append(
                np.zeros(current_data_plot_xyvals_max[-1].shape))

            current_data_plot_xyvals_max_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max_bootstrap.append(
                np.zeros(current_data_plot_xyvals_max_bootstrap[-1].shape))

            train_differences.append(
                abs(final_results.query('(dataset=="{}") & '\
                                        '(model=="{}")'.format(
                                            d,mod))['train_{}'.format(
                                                scoring_func_plot)].iloc[0] -
                    final_results.query('(dataset=="{}") & (model=="{}")'.format(
                        d,mod))['test_{}_1fold'.format(scoring_func_plot)].iloc[0])
            )

            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))

        xmax_list = [None]
        for i in range(10, 100, 10):
            if num_test_samples > i:
                xmax_list.append(i)

        for lim in xmax_list:
            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

        print_info('Saving the final results...', ':')
        om.save(final_results, '{}_final-results'.format(d))

        final_results_dict = final_results.to_dict('dict')
        final_results_dict['relation'] = str(d)  # needed for ARFF
        final_results_dict['description'] = u''  # needed for ARFF
        om.save(final_results, '{}_final-results'.format(d), to_arff=True)

        print_info('\n')
        print_info(
            'Everything done. (Elapsed overall time: {} seconds)\n'.format(
                time.time() - start_time_main))
Example #10
        def _optimize_skopt() -> Union[pd.Series, Tuple[pd.Series, pd.Series], Tuple[pd.Series, pd.Series, dict]]:
            nonlocal max_tries
            max_tries = get_max_tries(max_tries, _grid_size)
            dimensions = construct_dimensions(kwargs)
            memoized_run = lru_cache()(lambda tup: self.run(**dict(tup)))

            INVALID = 1e300
            progress = tqdm(dimensions, total=max_tries, desc='Backtest.optimize', leave=False)

            @use_named_args(dimensions=dimensions)
            def objective_function(**params):
                progress.update(1)

                # Check constraints
                # TODO: Adjust after https://github.com/scikit-optimize/scikit-optimize/pull/971
                if not constraint(AttrDict(params)):
                    return INVALID
                res = memoized_run(tuple(params.items()))
                value = -maximize(res)
                if np.isnan(value):
                    return INVALID
                return value

            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', 'The objective has been evaluated at this point before.')

                res = forest_minimize(
                    func=objective_function,
                    dimensions=dimensions,
                    n_calls=max_tries,
                    base_estimator=ExtraTreesRegressor(n_estimators=20, min_samples_leaf=2),
                    acq_func='LCB',
                    kappa=3,
                    n_jobs=-1,
                    n_initial_points=min(max_tries, 20 + 3 * len(kwargs)),
                    initial_point_generator='lhs',  # 'sobol' requires n_initial_points ~ 2**N
                    callback=DeltaXStopper(9e-7),
                    random_state=random_state,
                )

            stats = self.run(**dict(zip(kwargs.keys(), res.x)))
            output = [stats]

            if return_heatmap:
                heatmap = pd.Series(
                    dict(zip(map(tuple, res.x_iters), -res.func_vals)),
                    name=maximize_key,
                )
                heatmap.index.names = kwargs.keys()
                heatmap = heatmap[heatmap != -INVALID]
                heatmap.sort_index(inplace=True)
                output.append(heatmap)

            if return_optimization:
                valid = res.func_vals != INVALID
                res.x_iters = list(compress(res.x_iters, valid))
                res.func_vals = res.func_vals[valid]
                output.append(res)
            progress.clear()
            progress.close()
            return stats if len(output) == 1 else tuple(output)
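
This nested helper looks like the skopt branch of backtesting.py's Backtest.optimize. Assuming that is the case, a typical call that exercises it is sketched below; the SmaCross strategy and parameter ranges are illustrative only:

from backtesting import Backtest, Strategy
from backtesting.lib import crossover
from backtesting.test import GOOG, SMA


class SmaCross(Strategy):
    n1 = 10
    n2 = 20

    def init(self):
        self.sma1 = self.I(SMA, self.data.Close, self.n1)
        self.sma2 = self.I(SMA, self.data.Close, self.n2)

    def next(self):
        if crossover(self.sma1, self.sma2):
            self.buy()
        elif crossover(self.sma2, self.sma1):
            self.sell()


bt = Backtest(GOOG, SmaCross, commission=.002)
stats, heatmap = bt.optimize(
    n1=range(5, 30, 5),
    n2=range(10, 70, 5),
    maximize='Equity Final [$]',
    method='skopt',  # routes the search through the _optimize_skopt path above
    max_tries=200,
    constraint=lambda p: p.n1 < p.n2,
    return_heatmap=True)
print(stats['Equity Final [$]'])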