예제 #1
0
    def test_general_preprocessor(self):
        """Fit the general preprocessor on the fixture frame and check its output.

        The transformed result must pass ``CumlToolBox.is_cuml_object`` and its
        column dtypes must all be one of float64, int64 or uint8.
        """
        features = self.cf.copy()
        target = features.pop('y')
        pipeline = CumlToolBox.general_preprocessor(self.cf)
        transformed = pipeline.fit_transform(features, target)
        assert CumlToolBox.is_cuml_object(transformed)

        # Collect the distinct dtype names of the output columns and make sure
        # none of them falls outside the allowed set.
        allowed = {'float64', 'int64', 'uint8'}
        found = {str(dt) for dt in transformed.dtypes.to_dict().values()}
        assert found.issubset(allowed)
예제 #2
0
 def test_general_preprocessor(self):
     """Run the general preprocessor through ``fit_reload_transform``.

     The preprocessor is created from the head of the bank data converted to
     a cudf frame; every column is selected and ``dtypes=False`` is passed
     as the only check option.
     """
     sample_cf = cudf.from_pandas(self.bank_data.head())
     preprocessor = CumlToolBox.general_preprocessor(sample_cf)
     options = {'dtypes': False}
     self.fit_reload_transform(
         preprocessor,
         column_selector=CumlToolBox.column_selector.column_all,
         check_options=options,
     )
예제 #3
0
    def test_general_estimator(self):
        """Fit and query the general estimator for several ``estimator`` values.

        For each of ``None``, 'xgb', 'rf' and 'gbm' the fitted estimator must
        report exactly two classes, and the results of ``predict`` and
        ``predict_proba`` must pass ``CumlToolBox.is_cuml_object``.
        """
        features = self.cf.copy()
        target = features.pop('y')
        pipeline = CumlToolBox.general_preprocessor(self.cf)
        transformed = pipeline.fit_transform(features, target)

        for kind in (None, 'xgb', 'rf', 'gbm'):
            model = CumlToolBox.general_estimator(transformed, target, estimator=kind)
            model.fit(transformed, target)
            assert len(model.classes_) == 2

            # Check the hard predictions first, then the probabilities.
            for method_name in ('predict', 'predict_proba'):
                result = getattr(model, method_name)(transformed)
                assert CumlToolBox.is_cuml_object(result)
예제 #4
0
 def test_binary_collinearity_detection(self):
     """Run an experiment on the cudf bank data with collinearity detection enabled."""
     transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
     train_data = self.bank_data_cudf.copy()
     model_options = {'transformer': transformer}
     # log_level='info' can be added to the call below when debugging.
     run_experiment(
         train_data,
         hyper_model_options=model_options,
         collinearity_detection=True,
         random_state=335,
     )
예제 #5
0
 def test_binary_cv(self):
     """Run an experiment on the cudf bank data with a 5-member ensemble and 5 trials.

     ``cv`` is not passed, so ``run_experiment`` applies its own default
     (an explicit ``cv=False`` was previously disabled here).
     """
     transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
     train_data = self.bank_data_cudf.copy()
     model_options = {'transformer': transformer}
     # log_level='info' can be added to the call below when debugging.
     run_experiment(
         train_data,
         hyper_model_options=model_options,
         ensemble_size=5,
         max_trials=5,
         random_state=335,
     )
예제 #6
0
 def test_multiclass(self):
     """Run a cross-validated experiment on the cudf bank data targeting 'education'."""
     transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
     train_data = self.bank_data_cudf.copy()
     model_options = {'transformer': transformer}
     # log_level='info' can be added to the call below when debugging.
     run_experiment(
         train_data,
         target='education',
         hyper_model_options=model_options,
         cv=True,
         ensemble_size=5,
         max_trials=5,
         random_state=335,
     )
예제 #7
0
 def test_adapt_to_cuml(self):
     """Run an experiment on ``self.bank_data`` with ``data_adaption_target='cuml'``.

     The preprocessor is built from the cudf copy of the bank data, while the
     experiment itself receives a copy of ``self.bank_data``; cross-validation
     is off and ``check_as_local`` is disabled.
     """
     transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
     train_data = self.bank_data.copy()
     model_options = {'transformer': transformer}
     run_experiment(
         train_data,
         check_as_local=False,
         hyper_model_options=model_options,
         data_adaption_target='cuml',
         cv=False,
         ensemble_size=5,
         max_trials=5,
         log_level='info',
         random_state=335,
     )
예제 #8
0
    def test_drift_detector_split(self):
        """Exercise the drift detector's ``fit``, ``predict_proba`` and ``train_test_split``.

        First the detector is fitted on a shuffled 70% split of the bank data
        (without the 'y' column) and its fitted attributes are checked. Then
        its own ``train_test_split`` is used with ``test_size=0.2`` and the
        test verifies the resulting shapes and that train + test together
        contain the same rows as the full frame.
        """
        # --- Part 1: fit the detector and inspect its fitted attributes. ---
        bank_cf = cudf.from_pandas(load_bank())
        bank_cf.pop('y')  # drop the target so only the remaining columns are used
        X_train, X_test = CumlToolBox.train_test_split(
            bank_cf.copy(), train_size=0.7, shuffle=True, random_state=9527)
        detector = dd_selector().get_detector()
        detector.fit(X_train, X_test)

        n_features = 17
        assert len(detector.feature_names_) == n_features
        assert len(detector.feature_importances_) == n_features
        assert detector.auc_
        assert len(detector.estimator_) == 5

        scores = detector.predict_proba(bank_cf)
        assert scores.shape[0] == bank_cf.shape[0]

        # --- Part 2: split with the detector and check the partition sizes. ---
        bank_cf = cudf.from_pandas(load_bank())
        target = bank_cf.pop('y')
        n_rows, n_cols = bank_cf.shape
        n_test = int(n_rows * 0.2)
        X_train, X_test, y_train, y_test = detector.train_test_split(
            bank_cf.copy(), target, test_size=0.2)
        assert X_train.shape == (n_rows - n_test, n_cols)
        assert y_train.shape == (n_rows - n_test, )
        assert X_test.shape == (n_test, n_cols)
        assert y_test.shape == (n_test, )

        # Re-attach the targets, convert everything to local objects and
        # compare sorted per-row hashes of the full frame against train + test.
        bank_cf['y'] = target
        X_train['y'] = y_train
        X_test['y'] = y_test
        full_local, train_local, test_local = CumlToolBox.to_local(bank_cf, X_train, X_test)
        recombined = pd.concat([train_local, test_local])
        expected_hashes = hash_pandas_object(full_local).sort_values()
        actual_hashes = hash_pandas_object(recombined).sort_values()
        assert (expected_hashes == actual_hashes).all()
예제 #9
0
    def fit_reload_transform(self,
                             tf,
                             *,
                             df=None,
                             target=None,
                             column_selector=None,
                             dtype=None,
                             check_options=None):
        """Fit a transformer on cudf data, pickle round-trip it and compare outputs.

        The transformer is fitted on a cudf copy of ``df``, dumped to
        ``self.work_dir`` with pickle and loaded back. The reloaded transformer
        then transforms the cudf frame, is converted with ``as_local()`` and the
        local transformer transforms the original (pandas) frame. Both results
        are handed to ``check_dataframe``.

        Parameters
        ----------
        tf :
            Transformer exposing ``fit_transform``, ``transform`` and ``as_local``.
        df : optional
            Input frame (converted with ``cudf.from_pandas``). When omitted, a
            copy of ``self.bank_data`` is used and ``target`` is forced to 'y'.
            A frame supplied by the caller is never modified.
        target : optional
            Name of the column to remove from ``df`` and pass as ``y`` to
            ``fit_transform``. ``None`` means no target is passed.
        column_selector : callable, optional
            Called with ``df``; its result is used to index the columns of ``df``.
        dtype : optional
            Passed to ``df.astype`` before conversion to cudf.
        check_options : dict, optional
            Extra keyword arguments forwarded to ``check_dataframe``.
        """
        if df is None:
            df = self.bank_data.copy()
            target = 'y'
        elif target is not None:
            # ``pop`` below removes the column in place; work on a copy so the
            # caller's frame keeps its target column and can be reused.
            df = df.copy()

        if target is not None:
            y = df.pop(target)
            y_cf = cudf.from_pandas(y)
        else:
            y_cf = None

        if column_selector:
            columns = column_selector(df)
            df = df[columns]
        if dtype is not None:
            df = df.astype(dtype)
        cf = cudf.from_pandas(df)

        # Fit on a copy so ``cf`` itself is what the reloaded transformer sees.
        tf.fit_transform(cf.copy(), y_cf)
        file_path = f'{self.work_dir}/fitted_{type(tf).__name__}.pkl'
        with open(file_path, 'wb') as f:
            pickle.dump(tf, f, protocol=pickle.HIGHEST_PROTOCOL)

        # The file was written just above by this method, so loading it back
        # with pickle does not involve untrusted input.
        with open(file_path, 'rb') as f:
            tf_loaded = pickle.load(f)
            assert type(tf_loaded) is type(tf)

        # transform cudf.DataFrame
        t = tf_loaded.transform(cf)
        assert t is not None
        assert CumlToolBox.is_cuml_object(t)

        # convert to local transformer
        assert hasattr(tf_loaded, 'as_local')
        tf_local = tf_loaded.as_local()
        t2 = tf_local.transform(df.copy())
        assert isinstance(t2, (pd.DataFrame, np.ndarray))

        if check_options is None:
            check_options = {}
        check_dataframe(t, t2, **check_options)
예제 #10
0
    def test_load_data(self):
        """Load ``blood.csv`` from the ``dsutils`` package directory as a cudf frame."""
        # The CSV sits next to the dsutils module file.
        data_dir, _ = path.split(dsutils.__file__)
        data_file = f'{data_dir}/blood.csv'

        loaded = CumlToolBox.load_data(data_file, reset_index=True)
        assert isinstance(loaded, cudf.DataFrame)