Example #1
def test_early_stop_no_progress_loss():
    trials = generate_trials_to_calculate([{'x': -100}])
    fmin(fn=lambda x: x,
         space=hp.uniform("x", -5, 5),
         algo=rand.suggest,
         max_evals=500,
         trials=trials,
         early_stop_fn=no_progress_loss(10))

    assert len(trials) == 10
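For context, no_progress_loss(n) builds a callable that satisfies hyperopt's early_stop_fn contract: fmin calls it with the Trials object plus whatever state it returned on the previous call, and it must return a (stop, state) pair. The sketch below is an illustrative re-implementation of that idea, not hyperopt's actual code; the name stop_after_n_without_improvement is made up.

from hyperopt import Trials, fmin, hp, rand

def stop_after_n_without_improvement(n):
    # Illustrative sketch of an early_stop_fn factory, assuming the
    # (trials, *state) -> (stop, state) contract that no_progress_loss follows.
    def stop_fn(trials, best_loss=None, no_progress_count=0):
        new_loss = trials.trials[-1]['result']['loss']
        if best_loss is None or new_loss < best_loss:
            # improvement: remember the new best loss and reset the counter
            return False, [new_loss, 0]
        # no improvement: stop once the counter reaches n
        return no_progress_count + 1 >= n, [best_loss, no_progress_count + 1]
    return stop_fn

trials = Trials()
fmin(fn=lambda x: x,
     space=hp.uniform('x', -5, 5),
     algo=rand.suggest,
     max_evals=500,
     trials=trials,
     early_stop_fn=stop_after_n_without_improvement(10))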
Example #2
def _find_optimal_model(train_ds, val_ds, test_ds, data_props, examples):

    search_space = {
        'backlooking_period': hp.choice('backlooking_period', [1, 2, 3, 4]),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform would cast max_depth to float instead of int,
        # so use hp.choice over an integer range instead.
        'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'mae',
        # Increase this number if you have more cores; otherwise remove it
        # and it will default to the maximum.
        'nthread': None,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 0,
        'seed': 42
    }

    search_space['train'] = train_ds
    search_space['val'] = val_ds
    search_space['test'] = test_ds
    search_space['iter_step'] = data_props['iter_step']

    trials = Trials()
    best = fmin(_optimize_obj,
                search_space,
                algo=tpe.suggest,
                trials=trials,
                early_stop_fn=no_progress_loss(iteration_stop_count=25,
                                               percent_increase=0.025),
                max_evals=100)

    best_result = trials.best_trial['result']['results']
    best_params = trials.best_trial['result']['params']

    best_result = pd.DataFrame(best_result)
    best_result = _reformat_DF(best_result, data_props['iter_step'])
    best_params = pd.Series(best_params, name=data_props['iter_step'])

    return best_result, best_params
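As the max_depth comment above notes, hp.quniform samples floats, which trips up parameters that XGBoost expects as integers. Besides hp.choice over an integer range, a common alternative is wrapping the expression with scope.int from hyperopt.pyll; a minimal sketch, assuming that API is available in the installed hyperopt version:

from hyperopt import hp
from hyperopt.pyll import scope

# Integer-valued search dimensions without going through hp.choice indices.
int_dims = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 13, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 1)),
}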
Example #3
    def _early_stop_fn() -> Any:
        no_progress_loss_fn = no_progress_loss(
            int(_get_option_value(*_opt_no_progress_loss)))
        timeout = int(_get_option_value(*_opt_timeout))
        if timeout <= 0:
            return no_progress_loss_fn

        # Set base time for budget mechanism
        start_time = time.time()

        def timeout_fn(trials,
                       best_loss=None,
                       iteration_no_progress=0):  # type: ignore
            no_progress_loss, meta = no_progress_loss_fn(
                trials, best_loss, iteration_no_progress)
            to = time.time() - start_time > timeout
            return no_progress_loss or to, meta

        return timeout_fn
Example #4
 def start_opt(self):
     best = fmin(
         # function to optimize
         fn=partial(self.objective, pipeline=self.pipeline.get_model(),
                    X_train=self.data.get_Xy()['X'], y_train=self.data.get_Xy()['y'],
                    metric=self.metric),
         # hyperparameter search space
         space=self.params.get_opt_space(),
         # search algorithm
         algo=tpe.suggest,
         # number of iterations
         # (you can also specify a time budget for the search)
         max_evals=250,
         # where to store the search history
         trials=self.trials,
         # random state
         rstate=np.random.RandomState(1),
         # early stop
         early_stop_fn=no_progress_loss(**self.early_stop),
         # progressbar
         show_progressbar=True
     )
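Here self.early_stop is unpacked directly into no_progress_loss, so it is presumably a dict of that function's keyword arguments. A hypothetical example of such a config (the values below are assumptions, not taken from the original class):

# Hypothetical contents of self.early_stop; both keys are keyword
# arguments of hyperopt's no_progress_loss.
early_stop = {
    'iteration_stop_count': 20,   # stop after 20 trials without improvement
    'percent_increase': 0.0,      # relative improvement needed to reset the counter
}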
Example #5
def _add_conf(func, trial):
    return dict(early_stop_fn=no_progress_loss(50))
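This helper just returns extra keyword arguments for fmin, presumably merged into the call by the surrounding framework. An illustrative sketch of how the returned conf dict could be consumed (the toy objective and search space are assumptions):

from hyperopt import Trials, fmin, hp, tpe

conf = _add_conf(func=None, trial=None)   # both arguments are ignored above
best = fmin(fn=lambda x: x ** 2,          # toy objective for illustration
            space=hp.uniform('x', -10, 10),
            algo=tpe.suggest,
            max_evals=200,
            trials=Trials(),
            **conf)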
Example #6
def main_run_linear_models(train_ds,
                           val_ds,
                           test_ds,
                           data_props,
                           max_backlooking=None,
                           layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200,
                           NN_max_depth=3,
                           MAX_EPOCHS=800,
                           patience=25,
                           model_name='linear',
                           examples=None,
                           return_permutation_importances=True,
                           redo_serach_best_model=False):
    mlflow.set_experiment(model_name)
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = layer_type == 'dense'

    def _extract_just_important_data_props(data_props):
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        if args != ():
            # if called with positional arguments, expect the first one to be a dict of all kwargs
            kwargs = args[0]
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not a dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save the model if its loss is within 15% of the best loss so far
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            save_model = current_loss <= best_loss * 1.15
        except Exception:
            # no completed trial yet, so always save the first model
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######

    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########

    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])
    print('Best model is:', best_model_param)

    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']

        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual feature importance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except Exception:
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']

                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out