Example #1
    def predict_proba(self, X, *, ignore_transformer=False, **kwargs):
        eval_set = kwargs.pop('eval_set', None)  # ignore

        if not ignore_transformer and self.transformer is not None:
            logger.info('transform data')
            X = self.transformer.transform(X)

        tb_original = get_tool_box(X)
        X, = tb_original.to_local(X)

        tb = get_tool_box(X)
        if self.cv_models_:
            proba_sum = None
            for n, est in enumerate(self.cv_models_):
                logger.info(f'predict_proba estimator {n}')
                proba = est.predict_proba(X, **kwargs)
                if self.task == const.TASK_BINARY:
                    proba = tb.fix_binary_predict_proba_result(proba)
                if proba_sum is None:
                    proba_sum = proba
                else:
                    proba_sum += proba
            proba = proba_sum / len(self.cv_models_)
        else:
            logger.info('predict_proba')
            proba = self.model.predict_proba(X, **kwargs)
            if self.task == const.TASK_BINARY:
                proba = tb.fix_binary_predict_proba_result(proba)

        proba, = tb_original.from_local(proba)
        return proba
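A minimal NumPy sketch (independent of the class above, values hypothetical) of the cross-validation averaging that predict_proba performs: per-fold probability matrices are summed element-wise and divided by the number of folds.

import numpy as np

# hypothetical per-fold outputs for 3 samples and 2 classes
fold_probas = [
    np.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]]),
    np.array([[0.7, 0.3], [0.4, 0.6], [0.6, 0.4]]),
]
proba = sum(fold_probas) / len(fold_probas)  # element-wise mean across folds
assert np.allclose(proba.sum(axis=1), 1.0)   # rows still sum to 1 after averaging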
Example #2
    def test_get_tool_box(self):
        tb = get_tool_box(dd.DataFrame)
        assert tb is DaskToolBox

        ddf = dd.from_pandas(pd.DataFrame(
            dict(x1=['a', 'b', 'c'], x2=[1, 2, 3])),
                             npartitions=1)
        tb = get_tool_box(ddf)
        assert tb is DaskToolBox
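get_tool_box dispatches on either a data type or a concrete instance; the same lookup works for pandas. A minimal sketch, assuming hypernets is installed:

from hypernets.tabular import get_tool_box
import pandas as pd

tb = get_tool_box(pd.DataFrame)                      # dispatch on the type
tb2 = get_tool_box(pd.DataFrame(dict(x=[1, 2, 3])))  # dispatch on an instance
assert tb is tb2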
Example #3
def check_dataframe(df1,
                    df2,
                    *,
                    shape=True,
                    columns=True,
                    dtypes=True,
                    values=True,
                    delta=1e-5):
    from hypernets.tabular import get_tool_box

    if not isinstance(df1, pd.DataFrame):
        df1, = get_tool_box(df1).to_local(df1)
        df1 = pd.DataFrame(df1)
    if not isinstance(df2, pd.DataFrame):
        df2, = get_tool_box(df2).to_local(df2)
        df2 = pd.DataFrame(df2)

    if shape:
        assert df1.shape == df2.shape, 'The same dataframe shape is required.'

    if columns:
        assert all(
            df1.columns == df2.columns), 'The same column names are required.'

    if dtypes:
        assert df1.dtypes.tolist() == df2.dtypes.tolist(), \
            'The same column dtypes are required.'

    if values:
        if not columns:
            df2.columns = df1.columns

        float_cols = df1.select_dtypes(['float32', 'float64']).columns.tolist()
        if float_cols:
            df1_float = df1[float_cols]
            df2_float = df2[float_cols]
            value_diff = (df1_float - df2_float).abs().max().max()
            assert value_diff < delta

            df1_nofloat = df1[[
                c for c in df1.columns.tolist() if c not in float_cols
            ]]
            df2_nofloat = df2[[
                c for c in df2.columns.tolist() if c not in float_cols
            ]]
        else:
            df1_nofloat = df1
            df2_nofloat = df2

        if df1_nofloat.shape[1] > 0:
            assert (df1_nofloat == df2_nofloat).all().all(), \
                'All values should be equal.'

    return True
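A hedged usage sketch of check_dataframe: float columns may differ within delta, while any structural difference trips the corresponding assertion.

import pandas as pd

df_a = pd.DataFrame(dict(x=[1.0, 2.0], y=['a', 'b']))
df_b = pd.DataFrame(dict(x=[1.0 + 1e-7, 2.0], y=['a', 'b']))
assert check_dataframe(df_a, df_b)     # float diff is below the default delta
# check_dataframe(df_a, df_b.head(1))  # would fail the shape assertion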
Example #4
    def test_basic(self):
        hasher = get_tool_box(pd.DataFrame).data_hasher()
        df1 = pd.read_csv(io.StringIO(csv_str))
        hash1 = hasher(df1)

        df2 = pd.read_csv(io.StringIO(csv_str))
        hash2 = hasher(df2)
        assert hash1 == hash2

        df3 = df1.head(5)
        hash3 = hasher(df3)
        assert hash1 != hash3

        df4 = pd.concat([df1, df1.head(1)], axis=0)
        hash4 = hasher(df4)
        assert hash1 != hash4

        df5 = copy.deepcopy(df1)
        df5['x1_int_nanchar'] = ['1.0', '2.2', '\\N', '4.', '5', '6']
        hash5 = hasher(df5)
        assert hash1 == hash5

        df6 = copy.deepcopy(df1)
        df6['x1_int_nanchar'] = ['2.0', '2.2', '\\N', '4.', '5', '6']
        hash6 = hasher(df6)
        assert hash1 != hash6
Example #5
    def _get_estimator(self, space_sample):
        from hypernets.tabular import get_tool_box
        import dask.dataframe as dd

        estimator = super()._get_estimator(space_sample)

        return get_tool_box(dd.DataFrame).wrap_local_estimator(estimator)
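The same wrapping can be applied to any sklearn-style estimator; a sketch, assuming the Dask toolbox is available and wrap_local_estimator adapts fit/predict to Dask collections as in the method above:

from hypernets.tabular import get_tool_box
from sklearn.tree import DecisionTreeClassifier
import dask.dataframe as dd

est = DecisionTreeClassifier()
est = get_tool_box(dd.DataFrame).wrap_local_estimator(est)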
Example #6
    def predict(self, X, **kwargs):
        eval_set = kwargs.pop('eval_set', None)  # ignore

        if self.transformer is not None:
            logger.info('transform local')
            X = self.transformer.transform(X)

        logger.info('bring X,y to local')
        tb_original = get_tool_box(X)
        X, = tb_original.to_local(X)

        if self.cv_models_:
            if self.task == const.TASK_REGRESSION:
                pred_sum = None
                for n, est in enumerate(self.cv_models_):
                    logger.info(f'predict estimator {n}')
                    pred = est.predict(X, **kwargs)
                    if pred_sum is None:
                        pred_sum = pred
                    else:
                        pred_sum += pred
                preds = pred_sum / len(self.cv_models_)
            else:
                logger.info('predict_proba')
                proba = self.predict_proba(X, ignore_transformer=True, **kwargs)

                logger.info('proba2predict')
                preds = self.proba2predict(proba)
                preds = np.array(self.classes_).take(preds, axis=0)
        else:
            logger.info('predict')
            preds = self.model.predict(X, **kwargs)

        preds, = tb_original.from_local(preds)
        return preds
Example #7
    def test_concat_df(self):
        df = cudf.DataFrame(
            dict(
                x1=['a', 'b', 'c'],
                x2=[1, 2, 3],
                x3=[4., 5, 6],
            ))
        tb = get_tool_box(cudf.DataFrame)

        # DataFrame + DataFrame
        df1 = tb.concat_df([df, df], axis=0)
        df2 = cudf.concat([df, df], axis=0)
        assert (df1 == df2).all().all()

        # DataFrame + ndarray
        df_num = df[['x2', 'x3']]
        df1 = tb.concat_df([df_num, df_num.values], axis=0)
        df2 = cudf.concat([df_num, df_num], axis=0)
        assert isinstance(df1, cudf.DataFrame)
        assert (df1 == df2).all().all()

        # Series + ndarray
        s = df['x2']
        df1 = tb.concat_df([s, s.values], axis=0)
        df2 = cudf.concat([s, s], axis=0)
        assert isinstance(df1, cudf.Series)
        assert (df1 == df2).all()
Example #8
    def test_transform(self):
        df_train = dsutils.load_adult()
        df_train = dd.from_pandas(df_train, npartitions=2)
        y = df_train.pop(14)  # .values
        X = df_train
        X_train, X_test, y_train, y_test = get_tool_box(X, y).train_test_split(
            X, y, test_size=0.2, random_state=42)
        conf = deeptable.ModelConfig(auto_discrete=True,
                                     auto_imputation=True,
                                     auto_encode_label=True,
                                     auto_categorize=True,
                                     apply_gbm_features=False)
        processor = DefaultDaskPreprocessor(conf, compute_to_local=True)
        X1, y1 = processor.fit_transform(X_train, y_train)
        X2, y2 = processor.transform(X_test, y_test)
        assert len(
            set(X1.columns.tolist()) - set([
                'x_1', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_13',
                'x_0_cat', 'x_4_cat', 'x_10_cat', 'x_11_cat', 'x_12_cat',
                'x_2', 'x_0', 'x_4', 'x_10', 'x_11', 'x_12', 'x_2_discrete',
                'x_0_discrete', 'x_4_discrete', 'x_10_discrete',
                'x_11_discrete', 'x_12_discrete'
            ])) == 0
        assert len(set(X1.columns) - set(X2.columns)) == 0
        assert X1.shape == (len(X_train), 25)
        assert X2.shape == (len(X_test), 25)
        assert y1.sum() == 6297
        assert y2.sum() == 1544
Example #9
    def test_var_categorical_feature(self):
        X = self.df.copy()
        y = X.pop('rating').values.astype('float32')

        conf = deeptable.ModelConfig(nets=['dnn_nets'],
                                     task=consts.TASK_REGRESSION,
                                     categorical_columns=[
                                         "movie_id", "user_id", "gender",
                                         "occupation", "zip", "title", "age"
                                     ],
                                     metrics=['mse'],
                                     fixed_embedding_dim=True,
                                     embeddings_output_dim=4,
                                     apply_gbm_features=False,
                                     apply_class_weight=True,
                                     earlystopping_patience=5,
                                     var_len_categorical_columns=[
                                         ('genres', "|", "max")
                                     ])

        dt = deeptable.DeepTable(config=conf)

        X_train, X_validation, y_train, y_validation = get_tool_box(
            X).train_test_split(X, y, test_size=0.2)

        model, history = dt.fit(X_train,
                                y_train,
                                validation_data=(X_validation, y_validation),
                                epochs=10,
                                batch_size=32)

        assert 'genres' in model.model.input_names
Example #10
def train(X_train,
          y_train,
          X_eval,
          y_eval,
          task=None,
          reward_metric=None,
          optimize_direction='max',
          **kwargs):
    from hypernets.core.callbacks import SummaryCallback
    from hypernets.searchers import make_searcher

    if task is None:
        task, _ = get_tool_box(y_train).infer_task_type(y_train)
    if reward_metric is None:
        reward_metric = 'rmse' if task == const.TASK_REGRESSION else 'accuracy'

    search_space = PlainSearchSpace()
    searcher = make_searcher('mcts',
                             search_space,
                             optimize_direction=optimize_direction)
    callbacks = [SummaryCallback()]
    hm = PlainModel(searcher=searcher,
                    task=task,
                    reward_metric=reward_metric,
                    callbacks=callbacks)
    hm.search(X_train, y_train, X_eval, y_eval, **kwargs)
    best = hm.get_best_trial()
    model = hm.final_train(best.space_sample, X_train, y_train)
    return hm, model
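A hedged usage sketch of train, reusing the bank dataset and the label-encoding and split patterns seen in the other examples (the dsutils import path is an assumption):

from hypernets.tabular import get_tool_box
from hypernets.tabular.datasets import dsutils
from sklearn.preprocessing import LabelEncoder

X = dsutils.load_bank().head(1000)
X['y'] = LabelEncoder().fit_transform(X['y'])
y = X.pop('y')
X_train, X_eval, y_train, y_eval = \
    get_tool_box(X, y).train_test_split(X, y, test_size=0.3, random_state=42)
hm, model = train(X_train, y_train, X_eval, y_eval, max_trials=3)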
Example #11
    def _fix_softmax_proba(self, n_rows, proba):
        # proba shape should be (n, 1) if output layer is softmax
        if proba is None:
            return None
        else:
            # assert proba.shape == (n_rows, 1)
            # return np.insert(proba, 0, values=(1 - proba).reshape(1, -1), axis=1)
            return get_tool_box(proba).fix_binary_predict_proba_result(proba)
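The commented-out lines show the intended behavior: a single-column softmax output p is expanded to the two-column form [1 - p, p]. A plain-NumPy sketch of that fix:

import numpy as np

proba = np.array([[0.8], [0.3], [0.6]])  # (n, 1) positive-class scores
fixed = np.hstack([1 - proba, proba])    # (n, 2); each row sums to 1
# fix_binary_predict_proba_result(proba) is expected to return an
# equivalent two-column result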
Example #12
    def transformers(self):
        import dask.dataframe as dd
        tfs = get_tool_box(dd.DataFrame).transformers
        r = DefaultDaskPreprocessor.Dummy()
        for k, v in tfs.items():
            setattr(r, k, v)

        return r
Example #13
    def experiment_with_boston(self,
                               init_kwargs,
                               run_kwargs,
                               row_count=3000,
                               with_dask=False):
        if with_dask:
            X = self.boston
            y = X.pop('target')
        else:
            X = dsutils.load_boston()
            if row_count is not None:
                X = X.head(row_count)
            X['target'] = LabelEncoder().fit_transform(X['target'])
            y = X.pop('target')
            y = y.astype('float64')

        hyper_model = create_plain_model(with_encoder=True)

        tb = get_tool_box(X, y)
        X_train, X_test, y_train, y_test = \
            tb.train_test_split(X, y, test_size=0.3, random_state=9527)
        X_train, X_eval, y_train, y_eval = \
            tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

        init_kwargs = {
            'X_eval': X_eval,
            'y_eval': y_eval,
            'X_test': X_test,
            **init_kwargs
        }

        compete_experiment = CompeteExperiment(hyper_model, X_train, y_train,
                                               **init_kwargs)
        base_experiment = Experiment(hyper_model, X_train, y_train,
                                     **init_kwargs)

        mydict_compete = compete_experiment.get_data_character()
        mydict_base = base_experiment.get_data_character()

        assert mydict_base
        assert mydict_compete
        assert mydict_base['experimentType'] == 'base'
        assert mydict_compete['experimentType'] == 'compete'
        assert mydict_base['target']['taskType'] == 'regression'
        assert mydict_base['target']['freq'] is None
        assert mydict_base['target']['unique']
        assert mydict_base['target']['mean'] is not None
        assert mydict_base['target']['max'] is not None
        assert mydict_base['target']['min'] is not None
        assert mydict_base['target']['stdev'] is not None
        assert mydict_base['target']['dataType'] == 'float'
        assert len(mydict_base['targetDistribution']) <= 10
        assert mydict_base['datasetShape']['X_train']
        assert mydict_base['datasetShape']['y_train']
        assert mydict_base['datasetShape']['X_eval']
        assert mydict_base['datasetShape']['y_eval']
        assert mydict_base['datasetShape']['X_test']
        assert mydict_compete['featureDistribution']
Example #14
def run(distribute_strategy=None, batch_size=32, epochs=5):
    # loading data
    df = dsutils.load_bank_by_dask()
    df_train, df_test = get_tool_box(df).train_test_split(df,
                                                          test_size=0.2,
                                                          random_state=42)

    y = df_train.pop('y')
    y_test = df_test.pop('y')
    df_train, y, df_test, y_test = dask.persist(df_train, y, df_test, y_test)

    # training
    config = deeptable.ModelConfig(
        nets=deepnets.DeepFM,
        earlystopping_patience=5,
        distribute_strategy=distribute_strategy,
    )
    dt = deeptable.DeepTable(config=config)
    model, history = dt.fit(df_train, y, batch_size=batch_size, epochs=epochs)

    # save
    model_path = 'model_by_dask'
    dt.save(model_path)
    print(f'saved to {model_path}')

    # evaluation
    model_path = 'model_by_dask'
    dt2 = deeptable.DeepTable.load(model_path)
    result = dt2.evaluate(df_test, y_test, batch_size=512, verbose=0)
    print('score:', result)

    # scoring
    preds = dt2.predict(
        df_test,
        batch_size=512,
    )
    proba = dt2.predict_proba(
        df_test,
        batch_size=512,
    )
    print(
        get_tool_box(y_test).metrics.calc_score(y_test,
                                                preds,
                                                proba,
                                                metrics=['accuracy', 'auc']))
Example #15
def experiment_with_movie_lens(init_kwargs,
                               run_kwargs,
                               row_count=None,
                               with_dask=False):
    hyper_model = create_plain_model(reward_metric='f1',
                                     with_encoder=True,
                                     with_dask=with_dask)

    X = dsutils.load_movielens()
    # X['genres'] = X['genres'].apply(lambda s: s.replace('|', ' '))
    X['timestamp'] = X['timestamp'].apply(datetime.fromtimestamp)
    if row_count is not None:
        X = X.head(row_count)

    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)

    y = X.pop('rating')

    tb = get_tool_box(X, y)

    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        'ensemble_size': 0,
        'drift_detection': False,
        **init_kwargs
    }
    run_kwargs = {'max_trials': 3, **run_kwargs}
    experiment = CompeteExperiment(hyper_model, X_train, y_train,
                                   **init_kwargs)
    estimator = experiment.run(**run_kwargs)

    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)

    if with_dask:
        preds, proba = tb.to_local(preds, proba)

    score = tb.metrics.calc_score(
        y_test,
        preds,
        proba,
        metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'],
        task=experiment.task)
    print('evaluate score:', score)
    assert score
Example #16
    def setup_class(self):
        self.X, self.y = self.load_data()

        conf = deeptable.ModelConfig(task=consts.TASK_REGRESSION, metrics=[r2_c, 'RootMeanSquaredError'],
                                     apply_gbm_features=False)
        self.dt = deeptable.DeepTable(config=conf)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            get_tool_box(self.X).train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        self.model, self.history = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=100)
Example #17
    def test_detect_estimator_lightgbm(self):
        tb = get_tool_box(cudf.DataFrame)
        detector = tb.estimator_detector(
            'lightgbm.LGBMClassifier',
            'binary',
            init_kwargs={'device': 'GPU'},
        )
        r = detector()
        # lightgbm does not support cudf.DataFrame, so 'fitted_with_cuml' is absent
        assert r == {'installed', 'initialized', 'fitted'}
Example #18
def _get_tool_box_for_cache(*args, **kwargs):
    dtypes = []
    for a in args:
        stype = str(type(a))
        if stype.find('DataFrame') >= 0 or stype.find(
                'array') >= 0 or stype.find('Array') >= 0:
            dtypes.append(type(a))
    if len(dtypes) == 0:
        dtypes.append(pd.DataFrame)

    return get_tool_box(*dtypes)
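A quick sketch of the fallback: with no DataFrame- or array-like argument, the pandas toolbox is used (assumes get_tool_box is in scope alongside the function above):

import numpy as np
import pandas as pd

tb = _get_tool_box_for_cache('key', 42)     # no frames/arrays -> pandas toolbox
assert tb is get_tool_box(pd.DataFrame)
tb2 = _get_tool_box_for_cache(np.zeros(3))  # dispatches on the ndarray type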
Example #19
    def test_datetime_derivation(self):
        df = pd.DataFrame(data={"x1": [datetime.now()]})
        tb = get_tool_box(df)
        ftt = tb.transformers['FeatureGenerationTransformer'](
            task='binary', trans_primitives=["year", "month", "week"])
        ftt.fit(df)

        x_t = ftt.transform(df)
        assert "YEAR__x1__" in x_t
        assert "MONTH__x1__" in x_t
        assert "WEEK__x1__" in x_t
Example #20
def experiment_with_bank_data(init_kwargs,
                              run_kwargs,
                              row_count=3000,
                              with_dask=False):
    hyper_model = create_plain_model(with_encoder=True, with_dask=with_dask)
    X = dsutils.load_bank()
    if row_count is not None:
        X = X.head(row_count)
    X['y'] = LabelEncoder().fit_transform(X['y'])

    if with_dask:
        setup_dask(None)
        X = dd.from_pandas(X, npartitions=1)

    y = X.pop('y')

    tb = get_tool_box(X, y)
    scorer = tb.metrics.metric_to_scoring(hyper_model.reward_metric)

    X_train, X_test, y_train, y_test = \
        tb.train_test_split(X, y, test_size=0.3, random_state=9527)
    X_train, X_eval, y_train, y_eval = \
        tb.train_test_split(X_train, y_train, test_size=0.3, random_state=9527)

    init_kwargs = {
        'X_eval': X_eval,
        'y_eval': y_eval,
        'X_test': X_test,
        'scorer': scorer,
        'ensemble_size': 0,
        'drift_detection': False,
        **init_kwargs
    }
    run_kwargs = {'max_trials': 3, **run_kwargs}
    experiment = CompeteExperiment(hyper_model, X_train, y_train,
                                   **init_kwargs)
    estimator = experiment.run(**run_kwargs)

    assert estimator

    preds = estimator.predict(X_test)
    proba = estimator.predict_proba(X_test)

    if with_dask:
        preds, proba = tb.to_local(preds, proba)

    score = tb.metrics.calc_score(
        y_test,
        preds,
        proba,
        metrics=['auc', 'accuracy', 'f1', 'recall', 'precision'])
    print('evaluate score:', score)
    assert score
Example #21
    def setup_class(cls):
        df = dsutils.load_bank()
        df = get_tool_box(df).general_preprocessor(df).fit_transform(df)
        cls.bank_data = df
        cls.bank_data_cudf = cudf.from_pandas(df)
        #
        # cls.boston_data = dsutils.load_blood()
        # cls.boston_data_cudf = cudf.from_pandas(cls.boston_data)
        #
        # cls.movie_lens = dsutils.load_movielens()

        os.makedirs(cls.work_dir)
Example #22
    def experiment_start(self, exp):
        self.exp = exp
        self.steps = OrderedDict()
        self.running = True

        display_markdown('### Input Data', raw=True)

        X_train, y_train, X_test, X_eval, y_eval = \
            exp.X_train, exp.y_train, exp.X_test, exp.X_eval, exp.y_eval
        tb = get_tool_box(X_train, y_train, X_test, X_eval, y_eval)
        display_data = (tb.get_shape(X_train), tb.get_shape(y_train),
                        tb.get_shape(X_eval, allow_none=True),
                        tb.get_shape(y_eval, allow_none=True),
                        tb.get_shape(X_test, allow_none=True),
                        exp.task if exp.task == const.TASK_REGRESSION else
                        f'{exp.task}({tb.to_local(y_train.nunique())[0]})')
        display(pd.DataFrame([display_data],
                             columns=[
                                 'X_train.shape',
                                 'y_train.shape',
                                 'X_eval.shape',
                                 'y_eval.shape',
                                 'X_test.shape',
                                 'Task',
                             ]),
                display_id='output_input')

        try:
            import seaborn as sns
            import matplotlib.pyplot as plt
            from sklearn.preprocessing import LabelEncoder
            if exp.task == const.TASK_REGRESSION:
                # Draw Plot
                plt.figure(figsize=(8, 4), dpi=80)
                sns.kdeplot(y_train.dropna(),
                            shade=True,
                            color="g",
                            label="Proba",
                            alpha=.7,
                            bw_adjust=0.01)
            else:
                le = LabelEncoder()
                y = le.fit_transform(y_train.dropna())
                # Draw Plot
                plt.figure(figsize=(8, 4), dpi=80)
                sns.distplot(y, kde=False, color="g", label="y")
            # Decoration
            plt.title('Distribution of y', fontsize=22)
            plt.legend()
            plt.show()
        except Exception:
            pass
Example #23
    def transform(self, X, y=None):
        # 1. check is fitted and values
        assert self.feature_defs_ is not None, 'Please fit it first.'

        # 2. fix input
        X, y = self._fix_input(X, y, for_fit=False)

        # 3. transform
        es = ft.EntitySet(id='es_hypernets_transform')
        feature_type_dict = self._get_feature_types(X)
        make_index = self.ft_index not in X.columns.to_list()

        if _base.FT_V0:
            es.entity_from_dataframe(entity_id='e_hypernets_ft',
                                     dataframe=X,
                                     variable_types=feature_type_dict,
                                     make_index=make_index,
                                     index=self.ft_index)
        else:
            if make_index:
                tb = get_tool_box(X)
                X = tb.reset_index(X)
                X[self.ft_index] = X.index
            es.add_dataframe(dataframe=X,
                             dataframe_name='e_hypernets_ft',
                             index=self.ft_index,
                             make_index=False,
                             logical_types=feature_type_dict)

        Xt = ft.calculate_feature_matrix(self.feature_defs_,
                                         entityset=es,
                                         n_jobs=1,
                                         verbose=False)
        if make_index:
            X.pop(self.ft_index)
            if self.ft_index in Xt.columns.to_list():
                Xt.pop(self.ft_index)

        if self.categorical_as_object:
            cat_cols = column_category(Xt)
            if cat_cols:
                Xt[cat_cols] = Xt[cat_cols].astype('object')
        if self.bool_as_int:
            bool_cols = column_bool(Xt)
            if bool_cols:
                Xt[bool_cols] = Xt[bool_cols].astype('int')
        Xt = Xt.replace([np.inf, -np.inf], np.nan)

        if self.fix_feature_names:
            Xt = self._fix_transformed_feature_names(Xt)

        return Xt
Example #24
    def evaluate(self, X, y, metrics=None, **kwargs):
        if metrics is None:
            metrics = ['rmse'] if self.task == const.TASK_REGRESSION else ['accuracy']

        if self.task == const.TASK_REGRESSION:
            proba = None
            preds = self.predict(X, **kwargs)
        else:
            proba = self.predict_proba(X, **kwargs)
            preds = self.proba2predict(proba, proba_threshold=kwargs.get('proba_threshold', 0.5))

        scores = get_tool_box(y).metrics.calc_score(y, preds, proba, metrics, self.task)
        return scores
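metrics.calc_score also works on plain arrays outside the estimator; a hedged sketch with NumPy inputs:

import numpy as np
from hypernets.tabular import get_tool_box

y_true = np.array([0, 1, 1, 0])
proba = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])
preds = proba.argmax(axis=1)
print(get_tool_box(y_true).metrics.calc_score(
    y_true, preds, proba, metrics=['accuracy', 'auc']))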
Example #25
    def setup_class(self):
        setup_dask(self)

        print("Loading datasets...")
        data = dd.from_pandas(dsutils.load_glass_uci(), npartitions=2)
        self.y = data.pop(10).values
        self.X = data

        conf = deeptable.ModelConfig(metrics=['AUC'], apply_gbm_features=False, )
        self.dt = deeptable.DeepTable(config=conf)
        self.X_train, self.X_test, self.y_train, self.y_test = \
            [t.persist() for t in get_tool_box(data).train_test_split(self.X, self.y, test_size=0.2, random_state=42)]
        self.model, self.history = self.dt.fit(self.X_train, self.y_train, batch_size=32, epochs=3)
Example #26
    def test_detect_estimator_xgboost(self):
        pytest.importorskip('xgboost')

        tb = get_tool_box(cudf.DataFrame)
        detector = tb.estimator_detector(
            'xgboost.XGBClassifier',
            'binary',
            init_kwargs={
                'tree_method': 'gpu_hist',
                'use_label_encoder': False
            },
        )
        r = detector()
        assert r == {'installed', 'initialized', 'fitted', 'fitted_with_cuml'}
Example #27
    def test_feature_tools_transformer(self):
        df = dsutils.load_bank()
        df.drop(['id'], axis=1, inplace=True)
        y = df.pop('y')
        ddf = dd.from_pandas(df.head(100), npartitions=2)

        tb = get_tool_box(ddf)
        X_train, X_test = tb.train_test_split(ddf,
                                              test_size=0.2,
                                              random_state=42)
        ftt = tb.transformers['FeatureGenerationTransformer'](
            task='binary', trans_primitives=['add_numeric', 'divide_numeric'])
        ftt.fit(X_train)
        x_t = ftt.transform(X_train)
        assert x_t is not None
Example #28
def proba2predict(proba, *, task=None, threshold=0.5, classes=None):
    assert len(proba.shape) <= 2

    if len(proba.shape) == 0:  # scalar, nothing to convert
        return proba

    from hypernets.tabular import get_tool_box

    def is_one_dim(x):
        return len(x.shape) == 1 or (len(x.shape) == 2 and x.shape[1] == 1)

    if logger.is_info_enabled():
        logger.info(
            f'proba2predict with task={task}, classes={classes}, threshold={threshold}'
        )

    if task == const.TASK_BINARY and is_one_dim(proba):
        proba = get_tool_box(proba).fix_binary_predict_proba_result(proba)

    if task == const.TASK_REGRESSION or is_one_dim(proba):  # regression
        return proba

    if proba.shape[-1] > 2:  # multiclass
        pred = proba.argmax(axis=-1)
    else:  # binary
        pred = (proba[:, -1] > threshold).astype(np.int32)

    if classes is not None:
        # if dex.is_dask_object(pred):
        #     pred = dex.da.take(np.array(classes), pred, axis=0)
        # else:
        #     pred = np.take(np.array(classes), pred, axis=0)
        tb = get_tool_box(pred)
        pred = tb.take_array(np.array(classes), pred, axis=0)

    return pred
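A usage sketch of proba2predict (assuming the function above is importable from this module):

import numpy as np

proba = np.array([[0.9, 0.1], [0.2, 0.8]])
print(proba2predict(proba))                         # binary threshold -> [0 1]
print(proba2predict(proba, classes=['no', 'yes']))  # mapped -> ['no' 'yes']
print(proba2predict(np.array([[0.1, 0.7, 0.2]])))   # multiclass argmax -> [1]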
Example #29
    def test_latlong(self):
        df = pd.DataFrame()
        df['latitude'] = [51.52, 9.93, 37.38]
        df['longitude'] = [-0.17, 76.25, -122.08]
        df['latlong'] = df[['latitude', 'longitude']].apply(tuple, axis=1)
        df['latitude2'] = [51.22, 9.22, 37.22]
        df['longitude2'] = [-0.22, 76.22, -122.22]
        df['latlong2'] = df[['latitude2', 'longitude2']].apply(tuple, axis=1)
        df = dd.from_pandas(df, npartitions=1)
        tb = get_tool_box(df)
        ftt = tb.transformers['FeatureGenerationTransformer'](
            latlong_cols=['latlong', 'latlong2'])
        x_t = ftt.fit_transform(df)
        print(x_t.head(3))
        assert 'GEOHASH__latlong__' in x_t.columns.to_list()
Example #30
    def test_category_datetime_text(self):
        df = dsutils.load_movielens()
        df['genres'] = df['genres'].apply(lambda s: s.replace('|', ' '))
        df['timestamp'] = df['timestamp'].apply(datetime.fromtimestamp)
        ddf = dd.from_pandas(df, npartitions=2)
        tb = get_tool_box(ddf)
        ftt = tb.transformers['FeatureGenerationTransformer'](
            task='binary',
            text_cols=['title'],
            categories_cols=['gender', 'genres'])
        x_t = ftt.fit_transform(ddf)
        xt_columns = x_t.columns.to_list()
        assert 'CROSS_CATEGORICAL_gender__genres' in xt_columns
        assert 'TFIDF__title____0__' in xt_columns
        assert 'DAY__timestamp__' in xt_columns