# Example #1
def _create_experiment(predefined_kwargs, maker=None, need_test=False, user_kwargs=None):
    """Build a regression experiment on the Boston dataset with deliberately noisy columns.

    Adds a constant column, an id-like column, and a 'Drifted' column whose
    distribution differs between train and test so drift detection has
    something to find.

    Args:
        predefined_kwargs: dict of experiment kwargs; NOTE: mutated in place
            (defaults and test data are merged into it).
        maker: optional factory ``maker(train_data, **kwargs)``; when None a
            default ``make_experiment(PlainModel, ...)`` wrapper is used and a
            decision-tree-regressor-only search space is injected.
        need_test: when True, attach the (target-less) test frame as test_data.
        user_kwargs: optional dict of overrides applied last, so they win over
            both predefined and default kwargs.

    Returns:
        The experiment object produced by ``maker``.
    """
    df = dsutils.load_boston()
    df['Constant'] = [0 for i in range(df.shape[0])]
    df['Id'] = [i for i in range(df.shape[0])]
    target = 'target'
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=1234)
    df_test.pop(target)
    # Different scales between train and test make this column detectably drifted.
    df_train['Drifted'] = np.random.random(df_train.shape[0])
    df_test['Drifted'] = np.random.random(df_test.shape[0]) * 100

    def maker_(*args, **kwargs):
        # Pin the random seed unless the caller chose one explicitly.
        if 'random_state' not in kwargs.keys():
            kwargs['random_state'] = 1234
        return make_experiment(PlainModel, *args, **kwargs)

    default_kwargs = dict(
        log_level='info',
    )
    predefined_kwargs.update(default_kwargs)
    if maker is None:
        maker = maker_
        predefined_kwargs['search_space'] = PlainSearchSpace(enable_lr=False,
                                                             enable_nn=False, enable_dt=False, enable_dtr=True)
    if need_test:
        predefined_kwargs['test_data'] = df_test

    # BUG FIX: user_kwargs defaults to None; dict.update(None) raises TypeError,
    # so only merge when the caller actually supplied overrides.
    if user_kwargs:
        predefined_kwargs.update(user_kwargs)

    return maker(df_train, target=target, task=const.TASK_REGRESSION, **predefined_kwargs)
# Example #2
    def setup_class(cls):
        """Prepare class-level dask DataFrame fixtures when dask is available."""
        # Guard clause: nothing to prepare without a dask installation.
        if not is_dask_installed:
            return

        import dask.dataframe as dd
        setup_dask(cls)

        # Each toy dataset becomes a single-partition dask frame.
        def as_dask(frame):
            return dd.from_pandas(frame, npartitions=1)

        cls.boston = as_dask(dsutils.load_boston())
        cls.blood = as_dask(dsutils.load_blood())
        cls.bike_sharing = as_dask(dsutils.load_Bike_Sharing())
# Example #3
    def experiment_with_boston(self,
                               init_kwargs,
                               run_kwargs,
                               row_count=3000,
                               with_dask=False):
        """Build base and compete experiments on Boston data and check their data characters.

        Args:
            init_kwargs: extra keyword args merged into both experiment constructors
                (eval/test splits are added here and can be overridden by the caller).
            run_kwargs: accepted for signature compatibility; not used in this check.
            row_count: optional cap on the number of pandas rows (ignored for dask).
            with_dask: when True, use the class-level dask fixture instead of pandas.
        """
        if with_dask:
            X = self.boston
            y = X.pop('target')
        else:
            X = dsutils.load_boston()
            if row_count is not None:
                X = X.head(row_count)
            X['target'] = LabelEncoder().fit_transform(X['target'])
            y = X.pop('target')
            y = y.astype('float64')

        hyper_model = create_plain_model(with_encoder=True)

        tb = get_tool_box(X, y)
        X_train, X_test, y_train, y_test = \
            tb.train_test_split(X, y, test_size=0.3, random_state=9527)
        X_train, X_eval, y_train, y_eval = \
            tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

        # Caller-supplied kwargs win over the split defaults.
        init_kwargs = {
            'X_eval': X_eval,
            'y_eval': y_eval,
            'X_test': X_test,
            **init_kwargs
        }

        compete_experiment = CompeteExperiment(hyper_model, X_train, y_train,
                                               **init_kwargs)
        base_experiment = Experiment(hyper_model, X_train, y_train,
                                     **init_kwargs)

        mydict_compete = compete_experiment.get_data_character()
        mydict_base = base_experiment.get_data_character()

        assert mydict_base
        assert mydict_compete
        assert mydict_base['experimentType'] == 'base'
        assert mydict_compete['experimentType'] == 'compete'
        assert mydict_base['target']['taskType'] == 'regression'
        assert mydict_base['target']['freq'] is None
        assert mydict_base['target']['unique']
        assert mydict_base['target']['mean'] is not None
        assert mydict_base['target']['max'] is not None
        assert mydict_base['target']['min'] is not None
        assert mydict_base['target']['stdev'] is not None
        # BUG FIX: was `is 'float'` — identity comparison with a string literal is
        # implementation-dependent and raises SyntaxWarning; equality is intended.
        assert mydict_base['target']['dataType'] == 'float'
        assert len(mydict_base['targetDistribution']) <= 10
        assert mydict_base['datasetShape']['X_train']
        assert mydict_base['datasetShape']['y_train']
        assert mydict_base['datasetShape']['X_eval']
        assert mydict_base['datasetShape']['y_eval']
        assert mydict_base['datasetShape']['X_test']
        assert mydict_compete['featureDistribution']
# Example #4
def test_regression_task_report():
    """Run a small regression experiment with an Excel report and verify the
    report/evaluate callbacks captured the expected metadata."""
    data = dsutils.load_boston()
    n_rows = data.shape[0]
    data['Constant'] = [0 for _ in range(n_rows)]
    data['Id'] = [i for i in range(n_rows)]

    target = 'target'

    train_df, eval_df = train_test_split(data, test_size=0.2)

    # Give the eval split a drastically different 'Drifted' scale so drift
    # detection has a feature to remove.
    train_df['Drifted'] = np.random.random(train_df.shape[0])
    eval_df['Drifted'] = np.random.random(eval_df.shape[0]) * 100

    report_path = common_util.get_temp_file_path(prefix="report_excel_",
                                                 suffix=".xlsx")
    print(report_path)

    experiment = make_experiment(
        PlainModel,
        train_df,
        target=target,
        eval_data=eval_df.copy(),
        test_data=eval_df.copy(),
        drift_detection_threshold=0.4,
        drift_detection_min_features=3,
        drift_detection_remove_size=0.5,
        search_space=PlainSearchSpace(enable_lr=False,
                                      enable_nn=False,
                                      enable_dt=False,
                                      enable_dtr=True),
        report_render='excel',
        report_render_options={'file_path': report_path})

    estimator = experiment.run(max_trials=3)
    assert estimator is not None

    # Locate the report and evaluate callbacks among the experiment's callbacks.
    mlr_callback = None
    mle_callback = None
    for cb in experiment.callbacks:
        if isinstance(cb, MLReportCallback):
            mlr_callback = cb
        if isinstance(cb, MLEvaluateCallback):
            mle_callback = cb

    assert mlr_callback is not None
    _experiment_meta: ExperimentMeta = mlr_callback.experiment_meta_

    assert len(_experiment_meta.resource_usage) > 0
    assert len(_experiment_meta.steps) == 5
    assert os.path.exists(report_path)

    assert mle_callback is not None
    assert _experiment_meta.evaluation_metric is not None
    assert len(_experiment_meta.prediction_stats) == 1
    assert len(_experiment_meta.datasets) == 3
def main():
    """Run a minimal regression experiment (dtr-only search space) and print the estimator."""
    frame = dsutils.load_boston()
    train_part, eval_part = train_test_split(frame, test_size=0.2)

    # Restrict the search space to decision-tree regressors only.
    space = PlainSearchSpace(enable_lr=False,
                             enable_nn=False,
                             enable_dt=False,
                             enable_dtr=True)

    exp = make_experiment(PlainModel,
                          train_part,
                          target='target',
                          search_space=space,
                          report_render='excel')
    fitted = exp.run(max_trials=3)
    print(fitted)