示例#1
0
    def test_drift_detector_lightgbm(self):
        """DriftDetector end-to-end: fit, predict_proba and train_test_split."""
        df = load_bank()
        y = df.pop('y')
        X_train, X_test = train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)
        dd = DriftDetector()
        dd.fit(X_train, X_test)

        # Bank data has 17 feature columns once 'y' is removed.
        assert len(dd.feature_names_) == 17
        assert len(dd.feature_importances_) == 17
        assert dd.auc_
        assert len(dd.estimator_) == 5

        # One drift probability per input row.
        proba = dd.predict_proba(df)
        assert proba.shape[0] == df.shape[0]

        # Reload and let the detector produce a 20% test split.
        df = load_bank()
        y = df.pop('y')
        p = int(df.shape[0] * 0.2)
        X_train, X_test, y_train, y_test = dd.train_test_split(df.copy(), y, test_size=0.2)
        assert X_train.shape == (df.shape[0] - p, df.shape[1])
        assert y_train.shape == (df.shape[0] - p,)
        assert X_test.shape == (p, df.shape[1])
        assert y_test.shape == (p,)

        # The split must be a pure partition of the rows: per-row hashes of
        # the recombined parts match the original frame exactly.
        df['y'] = y
        X_train['y'] = y_train
        X_test['y'] = y_test
        df_split = pd.concat([X_train, X_test])
        df_hash = hash_pandas_object(df).sort_values()
        splitted_hash = hash_pandas_object(df_split).sort_values()
        assert (df_hash == splitted_hash).all()
示例#2
0
    def test_drift_detector_split(self):
        """Dask variant of the drift-detector fit/predict/split round trip."""
        df = dd.from_pandas(load_bank(), npartitions=2)
        y = df.pop('y')
        X_train, X_test = DaskToolBox.train_test_split(df.copy(), train_size=0.7, shuffle=True, random_state=9527)
        ddr = dd_selector().get_detector()
        ddr.fit(X_train, X_test)

        # Bank data has 17 feature columns once 'y' is removed.
        assert len(ddr.feature_names_) == 17
        assert len(ddr.feature_importances_) == 17
        assert ddr.auc_
        assert len(ddr.estimator_) == 5

        # One drift probability per input row (computed lazily by dask).
        proba = ddr.predict_proba(df)
        assert proba.compute().shape[0] == len(df)

        # Reload and let the detector produce a 20% test split.
        df = dd.from_pandas(load_bank(), npartitions=2)
        y = df.pop('y')
        p = int(len(df) * 0.2)
        X_train, X_test, y_train, y_test = ddr.train_test_split(df.copy(), y, test_size=0.2, remain_for_train=0.)

        # Materialize the dask collections before the shape checks.
        # NOTE(review): `y` is NOT included in this compute() call, so below it
        # is still a dask series when assigned into the now-pandas `df` —
        # confirm this is intentional.
        df, X_train, X_test, y_train, y_test = DaskToolBox.compute(df, X_train, X_test, y_train, y_test)
        assert X_train.shape == (df.shape[0] - p, df.shape[1])
        assert y_train.shape == (df.shape[0] - p,)
        assert X_test.shape == (p, df.shape[1])
        assert y_test.shape == (p,)

        # The split must be a pure partition of the rows: per-row hashes of
        # the recombined parts match the original frame exactly.
        df['y'] = y
        X_train['y'] = y_train
        X_test['y'] = y_test
        df_split = pd.concat([X_train, X_test])
        df_hash = hash_pandas_object(df).sort_values()
        splitted_hash = hash_pandas_object(df_split).sort_values()
        assert (df_hash == splitted_hash).all()
示例#3
0
def _create_bankdata_experiment(predefined_kwargs, maker=None, need_test=False, user_kwargs=None):
    """Build an experiment over a 2000-row sample of the bank dataset.

    Parameters
    ----------
    predefined_kwargs : dict
        Base keyword arguments for the experiment maker; mutated in place.
    maker : callable, optional
        Experiment factory; defaults to a PlainModel-based ``make_experiment``
        with a logistic-regression-only search space.
    need_test : bool
        When True, pass the held-out split as ``test_data``.
    user_kwargs : dict, optional
        Extra keyword arguments overriding the predefined ones.
    """
    target = 'y'
    df = dsutils.load_bank().head(2000)
    df[target] = LabelEncoder().fit_transform(df[target])
    df_train, df_test = train_test_split(df, test_size=0.3, random_state=9527)

    def maker_(*args, **kwargs):
        # Default factory: an experiment around the plain (test) model.
        return make_experiment(PlainModel, *args, **kwargs)

    default_kwargs = dict(log_level='info')

    predefined_kwargs.update(default_kwargs)

    if maker is None:
        maker = maker_
        predefined_kwargs['search_space'] = PlainSearchSpace(enable_lr=True,
                                                             enable_nn=False, enable_dt=False, enable_dtr=False)
        predefined_kwargs['hyper_model_options'] = {'transformer': MultiLabelEncoder}

    if need_test:
        predefined_kwargs['test_data'] = df_test

    # Bug fix: the original called `predefined_kwargs.update(user_kwargs)`
    # unconditionally, raising TypeError when `user_kwargs` is None (the
    # declared default). Only apply overrides when some were actually given.
    if user_kwargs:
        predefined_kwargs.update(user_kwargs)

    return maker(df_train, target=target, task=const.TASK_BINARY, **predefined_kwargs)
示例#4
0
 def test_shift_score(self):
     """Covariate-shift score on a cudf frame: 'id' must look highly drifted."""
     frame = cudf.from_pandas(load_bank().head(1000))
     detector = dd_selector()
     shift_scores = detector._covariate_shift_score(frame[:700], frame[700:])
     print('_covariate_shift_score', shift_scores)
     assert shift_scores['id'] >= 0.95
示例#5
0
    def test_datetime_encoder(self):
        """DatetimeEncoder expands a datetime column into calendar-part columns."""
        def is_holiday(x):
            # Extra derived feature: 1 when the date is May 1st-3rd, else 0.
            holidays = {'0501', '0502', '0503'}
            return x.apply(lambda t: int(t.strftime('%m%d') in holidays))

        # Map the bank dataset's month abbreviations to month numbers.
        months = {'oct': 10, 'may': 5, 'apr': 4, 'jun': 6, 'feb': 2, 'aug': 8, 'jan': 1, 'jul': 7, 'nov': 11,
                  'sep': 9, 'mar': 3, 'dec': 12}

        df = dsutils.load_bank().sample(n=1000, random_state=9527)
        df['year'] = 2000
        df['month'] = df['month'].apply(lambda s: months[s])
        # Build a real datetime column from the year/month/day parts.
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

        encoder = skex.DatetimeEncoder()
        X = encoder.fit_transform(df)
        columns = X.columns.to_list()
        # The source column is consumed; month/day parts appear; hour/minute
        # (constant zero for pure dates) must be dropped by default.
        assert 'date' not in columns
        assert all([c in columns for c in ['date_month', 'date_day']])
        assert all([c not in columns for c in ['date_hour', 'date_minute']])

        # With a custom extra feature, 'timestamp' added to the includes and
        # constant-dropping disabled, both derived columns must appear.
        encoder = skex.DatetimeEncoder(include=skex.DatetimeEncoder.default_include + ['timestamp'],
                                       extra=[('holiday', is_holiday)], drop_constants=False)
        X = encoder.fit_transform(df)
        columns = X.columns.to_list()
        assert 'date' not in columns
        assert all([c in columns for c in ['date_holiday', 'date_timestamp']])
示例#6
0
    def test_feature_selection(self):
        """Feature selection on cudf data, with and without shift-variable removal."""
        data = cudf.from_pandas(load_bank())
        data.pop('y')
        split_at = int(data.shape[0] * 0.8)
        train_part = data[:split_at]
        test_part = data[split_at:]

        # With shift-variable removal disabled the selector trims to min_features.
        sel = dd_selector(remove_shift_variable=False,
                          auc_threshold=0.55,
                          min_features=15,
                          remove_size=0.2)
        remain_features, history, scores = sel.select(train_part,
                                                      test_part,
                                                      copy_data=True)
        assert len(remain_features) == 15

        # With shift-variable removal enabled one extra feature survives.
        sel = dd_selector(remove_shift_variable=True,
                          auc_threshold=0.55,
                          min_features=15,
                          remove_size=0.2)
        remain_features, history, scores = sel.select(train_part,
                                                      test_part,
                                                      copy_data=True)
        assert len(remain_features) == 16
示例#7
0
def test_collinear():
    """Hierarchical clustering over Spearman correlations keeps one feature per cluster."""
    data = dsutils.load_bank().head(10000)
    data.pop('y')
    data.drop(['id'], axis=1, inplace=True)

    # Ward linkage computed from the Spearman correlation matrix.
    linkage = hierarchy.ward(spearmanr(data).correlation)
    cluster_ids = hierarchy.fcluster(linkage, 1, criterion='distance')

    # Group column indices by cluster and keep the first member of each.
    clusters = defaultdict(list)
    for col_idx, cid in enumerate(cluster_ids):
        clusters[cid].append(col_idx)
    selected_features = [data.columns[members[0]] for members in clusters.values()]

    assert selected_features == [
        'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
        'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
        'poutcome'
    ]
示例#8
0
 def load_data():
     """Return a 3000-row, label-encoded sample of the bank data without 'id'."""
     set_random_state(9527)
     frame = MultiLabelEncoder().fit_transform(dsutils.load_bank().head(3000))
     frame.drop(['id'], axis=1, inplace=True)
     return frame
示例#9
0
def test_cache_dask():
    """Cached dask encoder: first fit populates the cache, second fit reuses it."""
    clear()  # start from an empty cache

    cache_counter = CachedDaskMultiLabelEncoder.cache_counter
    df = dd.from_pandas(dsutils.load_bank(), npartitions=2)

    # Reference encoding produced without caching.
    t = dex.SafeOrdinalEncoder()
    X = t.fit_transform(df.copy())

    cache_counter.reset()
    t1 = CachedDaskMultiLabelEncoder()
    X1 = t1.fit_transform(df.copy())
    t2 = CachedDaskMultiLabelEncoder()
    X2 = t2.fit_transform(df.copy())

    # All three encoders must produce identical data.
    hasher = dex.DaskToolBox.data_hasher()
    assert hasher(X) == hasher(X1) == hasher(X2)
    # Two cached calls: each one either stored into or applied from the cache,
    # and together they account for exactly the two cache entries.
    assert cache_counter.enter_counter.value == 2
    assert cache_counter.apply_counter.value <= 2
    assert cache_counter.store_counter.value <= 2
    assert cache_counter.apply_counter.value + cache_counter.store_counter.value == 2

    # Same invariants hold for the as-array variant.
    cache_counter.reset()
    t3 = CachedDaskMultiLabelEncoder()
    X3 = t3.fit_transform_as_array(df.copy())
    t4 = CachedDaskMultiLabelEncoder()
    X4 = t4.fit_transform_as_array(df.copy())

    assert hasher(X3) == hasher(X4)
    assert cache_counter.enter_counter.value == 2
    assert cache_counter.apply_counter.value <= 2
    assert cache_counter.store_counter.value <= 2
    assert cache_counter.apply_counter.value + cache_counter.store_counter.value == 2
示例#10
0
    def setup_class(cls):
        """Prepare shared fixtures: label-encoded bank data, movielens, work dir."""
        from sklearn.preprocessing import LabelEncoder
        bank = dsutils.load_bank()
        bank['y'] = LabelEncoder().fit_transform(bank['y'])
        cls.bank_data = bank
        cls.movie_lens = dsutils.load_movielens()

        os.makedirs(cls.work_dir)
示例#11
0
 def test_shift_score(self):
     """Covariate-shift score on a dask frame: 'id' must be near-perfectly drifted."""
     frame = dd.from_pandas(load_bank().head(1000), npartitions=2)
     detector = dd_selector()
     train_part = DaskToolBox.select_df(frame, np.arange(700))
     test_part = DaskToolBox.select_df(frame, np.arange(700, 1000))
     shift_scores = detector._covariate_shift_score(train_part, test_part)
     assert shift_scores['id'] > 0.99
示例#12
0
    def setup_class(cls):
        """Fixtures: label-encoded bank data as both pandas and cudf frames."""
        from sklearn.preprocessing import LabelEncoder
        pdf = dsutils.load_bank()
        pdf['y'] = LabelEncoder().fit_transform(pdf['y'])
        pdf['education'] = LabelEncoder().fit_transform(pdf['education'])

        cls.df = pdf
        cls.cf = cudf.from_pandas(pdf)
示例#13
0
    def test_dataframe_fs(self):
        """Round-trip a DataFrame through parquet on the custom filesystem."""
        target_path = f'/{type(self).__name__}/test_df_fs.parquet'
        original = dsutils.load_bank()
        p.store(original, target_path, filesystem=fs)
        assert fs.exists(target_path)

        # Load it back and compare against the original frame.
        restored = p.load(target_path, filesystem=fs)
        assert self.is_same_df(original, restored)
示例#14
0
 def test_shufflesplit(self):
     """One stratified shuffle split yields a 700/300 row partition."""
     frame = load_bank().head(1000)
     labels = frame.pop('y')
     splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3)
     indices = [(tr, te) for tr, te in splitter.split(frame, labels)]
     assert len(indices) == 1
     assert len(indices[0][0]) == 700
     assert len(indices[0][1]) == 300
示例#15
0
    def test_series(self):
        """Store a Series as parquet and restore it with name and values intact."""
        target_path = f'{test_output_dir}/{type(self).__name__}/test_series.parquet'
        frame = dsutils.load_bank()
        p.store(frame['age'], target_path)
        assert path.exists(target_path)

        restored = p.load(target_path)
        assert isinstance(restored, pd.Series)
        assert restored.name == 'age'
        assert len(restored) == len(frame)
        assert all(restored == frame['age'])
示例#16
0
    def test_ndarray(self):
        """Store an ndarray as parquet and restore it with shape and content intact."""
        target_path = f'{test_output_dir}/{type(self).__name__}/test_ndarray.parquet'
        frame = dsutils.load_bank()
        p.store(frame.values, target_path)
        assert path.exists(target_path)

        restored = p.load(target_path)
        assert isinstance(restored, np.ndarray)
        assert restored.shape == frame.shape

        # Rebuild a frame from the raw values and spot-check one column.
        rebuilt = pd.DataFrame(restored, columns=frame.columns)
        assert all(rebuilt['y'] == frame['y'])
示例#17
0
 def test_feature_tools_transformer(self):
     """Arithmetic featuretools primitives fit/transform end to end."""
     frame = dsutils.load_bank()
     frame.drop(['id'], axis=1, inplace=True)
     frame.pop('y')
     train_part, test_part = train_test_split(frame.head(100),
                                              test_size=0.2,
                                              random_state=42)
     transformer = FeatureGenerationTransformer(
         task='binary', trans_primitives=['add_numeric', 'divide_numeric'])
     transformer.fit(train_part)
     transformed = transformer.transform(train_part)
     assert transformed is not None
示例#18
0
    def setup_class(cls):
        """Shared fixtures: preprocessed bank data (pandas + cudf) and a work dir."""
        df = dsutils.load_bank()
        # Run the toolbox's general preprocessor so the data is model-ready.
        df = get_tool_box(df).general_preprocessor(df).fit_transform(df)
        cls.bank_data = df
        cls.bank_data_cudf = cudf.from_pandas(df)
        #
        # cls.boston_data = dsutils.load_blood()
        # cls.boston_data_cudf = cudf.from_pandas(cls.boston_data)
        #
        # cls.movie_lens = dsutils.load_movielens()

        os.makedirs(cls.work_dir)
示例#19
0
def experiment_with_bank_data(init_kwargs,
                              run_kwargs,
                              row_count=3000,
                              with_dask=False):
    """Run a CompeteExperiment on (a sample of) the bank dataset and score it.

    Parameters
    ----------
    init_kwargs : dict
        Overriding keyword arguments for CompeteExperiment.
    run_kwargs : dict
        Overriding keyword arguments for experiment.run().
    row_count : int or None
        Number of rows to keep; None uses the whole dataset.
    with_dask : bool
        When True, run on a dask dataframe (cluster is set up here).
    """
    hyper_model = create_plain_model(with_encoder=True, with_dask=with_dask)
    X = dsutils.load_bank()
    if row_count is not None:
        X = X.head(row_count)
    X['y'] = LabelEncoder().fit_transform(X['y'])

    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)

    y = X.pop('y')

    tb = get_tool_box(X, y)
    scorer = tb.metrics.metric_to_scoring(hyper_model.reward_metric)

    # Split off a test set, then carve an eval set out of the remaining train data.
    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    # Caller-supplied kwargs win over these defaults.
    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        'scorer': scorer,
        'ensemble_size': 0,
        'drift_detection': False,
        **init_kwargs
    }
    run_kwargs = {'max_trials': 3, **run_kwargs}
    experiment = CompeteExperiment(hyper_model, X_train, y_train,
                                   **init_kwargs)
    estimator = experiment.run(**run_kwargs)

    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)

    if with_dask:
        # Bring dask results back into local memory before scoring.
        preds, proba = tb.to_local(preds, proba)

    score = tb.metrics.calc_score(
        y_test,
        preds,
        proba,
        metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('evaluate score:', score)
    assert score
示例#20
0
    def test_dataframe(self):
        """Store a DataFrame as parquet; verify via pandas and via our own loader."""
        target_path = f'{test_output_dir}/{type(self).__name__}/test_df.parquet'
        original = dsutils.load_bank()
        p.store(original, target_path)
        assert path.exists(target_path)

        # Cross-check: plain pandas can read what we wrote.
        assert self.is_same_df(original, pd.read_parquet(target_path))

        # And our utility round-trips it too.
        assert self.is_same_df(original, p.load(target_path))
示例#21
0
    def test_drift_detector_fit_randomforest(self):
        """DriftDetector also works with a custom RandomForest estimator."""
        frame = load_bank().head(10000)
        frame.pop('y')
        train_part, test_part = train_test_split(frame, train_size=0.7, shuffle=True, random_state=9527)

        detector = DriftDetector(
            estimator=RandomForestClassifier(min_samples_leaf=20, min_impurity_decrease=0.01))

        detector.fit(train_part, test_part)

        # 17 feature columns once 'y' is removed; five fitted estimators kept.
        assert len(detector.feature_names_) == 17
        assert len(detector.feature_importances_) == 17
        assert detector.auc_
        assert len(detector.estimator_) == 5
示例#22
0
 def test_in_dataframe_mapper(self):
     """FeatureGenerationTransformer composes with DataFrameMapper."""
     frame = dsutils.load_bank()
     frame.drop(['id'], axis=1, inplace=True)
     train_part, test_part = train_test_split(frame.head(100),
                                              test_size=0.2,
                                              random_state=42)
     transformer = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=column_object_category_bool(train_part))
     mapper = DataFrameMapper(features=[(train_part.columns.to_list(), transformer)],
                              input_df=True,
                              df_out=True)
     transformed = mapper.fit_transform(train_part)
     assert transformed.shape == (80, 62)
示例#23
0
def test_experiment_with_data_adaption():
    """A memory limit of half the frame's size triggers the data_adaption step."""
    data = MultiLabelEncoder().fit_transform(dsutils.load_bank())
    half_mem = int(data.memory_usage().sum()) // 2
    experiment = make_experiment(
        PlainModel,
        data,
        target='y',
        search_space=PlainSearchSpace(),
        data_adaption_memory_limit=half_mem,
        log_level='info',
    )
    estimator = experiment.run(max_trials=3)
    assert estimator is not None
    # Adaption must be the first pipeline step when the limit is exceeded.
    assert estimator.steps[0][0] == 'data_adaption'
示例#24
0
    def setup_class(cls):
        """Fixtures: encoded bank, blood and movielens data (pandas + cudf), work dir."""
        from sklearn.preprocessing import LabelEncoder
        bank = dsutils.load_bank()
        # 'y' serves as the binary target, 'education' as the multiclass one.
        bank['y'] = LabelEncoder().fit_transform(bank['y'])
        bank['education'] = LabelEncoder().fit_transform(bank['education'])
        cls.bank_data = bank
        cls.bank_data_cudf = cudf.from_pandas(bank)

        cls.boston_data = dsutils.load_blood()
        cls.boston_data_cudf = cudf.from_pandas(cls.boston_data)

        cls.movie_lens = dsutils.load_movielens()

        os.makedirs(cls.work_dir)
示例#25
0
 def test_pipeline(self):
     """Feature generation followed by general preprocessing inside a Pipeline."""
     frame = dsutils.load_bank()
     frame.drop(['id'], axis=1, inplace=True)
     train_part, test_part = train_test_split(frame.head(100),
                                              test_size=0.2,
                                              random_state=42)
     transformer = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=column_object_category_bool(train_part))
     pipe = Pipeline(steps=[('feature_gen', transformer),
                            ('processor', general_preprocessor())])
     transformed = pipe.fit_transform(train_part)
     print(transformed.columns)
     assert transformed.shape == (80, 62)
    def test_pipeline(self):
        """Same generation + preprocessing pipeline over a dask frame via the toolbox."""
        frame = dsutils.load_bank()
        frame.drop(['id'], axis=1, inplace=True)
        ddf = dd.from_pandas(frame.head(100), npartitions=2)
        tb = get_tool_box(ddf)

        transformer = tb.transformers['FeatureGenerationTransformer'](
            task='binary',
            trans_primitives=['cross_categorical'],
            categories_cols=tb.column_selector.column_object_category_bool(ddf))
        pipe = Pipeline(steps=[('feature_gen', transformer),
                               ('processor', tb.general_preprocessor(ddf))])
        result = pipe.fit_transform(ddf).compute()
        assert result.shape[1] == 62
示例#27
0
 def test_feature_generation_with_selection(self):
     """Feature selection requires `y`: fitting without it must raise AssertionError."""
     df = dsutils.load_bank().head(1000)
     df.drop(['id'], axis=1, inplace=True)
     y = df.pop('y')
     ftt = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=[
             'add_numeric', 'divide_numeric', 'cross_categorical'
         ],
         categories_cols=column_object_category_bool(df),
         feature_selection_args={'ratio_select_cols': 0.2})
     # Bug fix: the message check used to sit INSIDE the `with` block after the
     # raising call, so it never executed — and it compared the exception
     # object itself (not its message) to a string. Check the message after
     # the block, via str(err.value).
     with pytest.raises(AssertionError) as err:
         ftt.fit(df)
     assert str(err.value) == '`y` must be provided for feature selection.'
     ftt.fit(df, y)
     x_t = ftt.transform(df)
     assert x_t.shape[1] == 35
示例#28
0
 def test_feature_tools_categorical_cross(self):
     """Every unordered pair of categorical columns yields a cross feature."""
     df = dsutils.load_bank()
     df.drop(['id'], axis=1, inplace=True)
     X_train, X_test = train_test_split(df.head(100),
                                        test_size=0.2,
                                        random_state=42)
     cat_cols = column_object_category_bool(X_train)
     ftt = FeatureGenerationTransformer(
         task='binary',
         trans_primitives=['cross_categorical'],
         categories_cols=cat_cols)
     ftt.fit(X_train)
     x_t = ftt.transform(X_train)
     columns = set(x_t.columns.to_list())
     # A generated column may name the column pair in either order,
     # so accept both orderings for each unordered pair.
     for i_left in range(len(cat_cols) - 1):
         for i_right in range(i_left + 1, len(cat_cols)):
             assert f'CROSS_CATEGORICAL_{cat_cols[i_left]}__{cat_cols[i_right]}' in columns \
                    or f'CROSS_CATEGORICAL_{cat_cols[i_right]}__{cat_cols[i_left]}' in columns
示例#29
0
    def test_feature_selection(self):
        """FeatureSelectionTransformer honours n_max_cols and reserved_cols."""
        df = self.bank_data.copy()
        y = df.pop('y')
        reserved_cols = ['age', 'poutcome', 'id']
        fse = skex.FeatureSelectionTransformer('classification', 10000, 10000, 10, n_max_cols=8,
                                               reserved_cols=reserved_cols)
        fse.fit(df, y)
        # 10 features are scored; the kept set (selected plus reserved
        # columns) ends up at 11 — presumably one reserved column is also
        # selected; the reserved ones must all survive.
        assert len(fse.scores_.items()) == 10
        assert len(fse.columns_) == 11
        assert len(set(reserved_cols) - set(fse.columns_)) == 0

        # transform() keeps exactly the selected columns, in order.
        x_t = fse.transform(df)
        assert x_t.columns.to_list() == fse.columns_

        # Regression task against 'age': all 17 features scored, 10 kept.
        df = dsutils.load_bank()
        y = df.pop('age')
        fse = skex.FeatureSelectionTransformer('regression', 10000, 10000, -1)
        fse.fit(df, y)
        assert len(fse.scores_.items()) == 17
        assert len(fse.columns_) == 10
示例#30
0
    def test_feature_selection(self):
        """Generated features are filtered by ratio while original columns are kept."""
        df = dsutils.load_bank().head(1000)
        df.drop(['id'], axis=1, inplace=True)
        y = df.pop('y')
        ftt = FeatureGenerationTransformer(
            task='binary',
            trans_primitives=[
                'add_numeric', 'divide_numeric', 'cross_categorical'
            ],
            categories_cols=column_object_category_bool(df))
        ftt.fit(df)
        x_t = ftt.transform(df)

        # Select 20% of the generated columns but always keep the originals.
        fst = FeatureSelectionTransformer('binary',
                                          ratio_select_cols=0.2,
                                          reserved_cols=ftt.original_cols)
        fst.fit(x_t, y)
        assert len(fst.scores_.items()) == 99
        assert len(fst.columns_) == 35
        x_t2 = fst.transform(x_t)
        assert x_t2.shape[1] == 35