def test_general_preprocessor(self):
    """Fit the general preprocessor on the cuDF fixture and check its output.

    The transformed result must be a cuml object and every column dtype
    must be one of float64, int64 or uint8.
    """
    features = self.cf.copy()
    target = features.pop('y')

    # The preprocessor is built from the full fixture, target column included.
    pipeline = CumlToolBox.general_preprocessor(self.cf)
    transformed = pipeline.fit_transform(features, target)
    assert CumlToolBox.is_cuml_object(transformed)

    # Compare dtypes by their string names.
    allowed = {'float64', 'int64', 'uint8'}
    found = {str(dtype) for dtype in transformed.dtypes.to_dict().values()}
    assert found.issubset(allowed)
def test_general_preprocessor(self):
    """Run the general preprocessor through ``fit_reload_transform``.

    The preprocessor is created from the head of the bank data converted
    to cuDF. The ``column_all`` selector is used, and ``dtypes=False`` is
    passed along as a check option.
    """
    sample = cudf.from_pandas(self.bank_data.head())
    preprocessor = CumlToolBox.general_preprocessor(sample)
    options = {'dtypes': False}
    self.fit_reload_transform(
        preprocessor,
        column_selector=CumlToolBox.column_selector.column_all,
        check_options=options,
    )
def test_general_estimator(self):
    """Fit each general estimator variant on preprocessed cuDF data.

    For the default estimator (``None``) and for 'xgb', 'rf' and 'gbm',
    the fitted estimator must expose two classes and return cuml objects
    from both ``predict`` and ``predict_proba``.
    """
    features = self.cf.copy()
    target = features.pop('y')
    pipeline = CumlToolBox.general_preprocessor(self.cf)
    transformed = pipeline.fit_transform(features, target)

    estimator_names = (None, 'xgb', 'rf', 'gbm')
    for name in estimator_names:
        model = CumlToolBox.general_estimator(transformed, target, estimator=name)
        model.fit(transformed, target)
        assert len(model.classes_) == 2

        labels = model.predict(transformed)
        assert CumlToolBox.is_cuml_object(labels)

        probabilities = model.predict_proba(transformed)
        assert CumlToolBox.is_cuml_object(probabilities)
def test_binary_collinearity_detection(self):
    """Run an experiment on the cuDF bank data with collinearity detection on."""
    transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
    # Add log_level='info' to these options when debugging.
    experiment_options = {
        'hyper_model_options': {'transformer': transformer},
        'collinearity_detection': True,
        'random_state': 335,
    }
    run_experiment(self.bank_data_cudf.copy(), **experiment_options)
def test_binary_cv(self):
    """Run an experiment on the cuDF bank data without overriding ``cv``.

    ``cv`` is deliberately not passed, so the experiment runs with whatever
    default ``run_experiment`` applies.
    """
    transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
    # Optional overrides for debugging: cv=False, log_level='info'.
    experiment_options = {
        'hyper_model_options': {'transformer': transformer},
        'ensemble_size': 5,
        'max_trials': 5,
        'random_state': 335,
    }
    run_experiment(self.bank_data_cudf.copy(), **experiment_options)
def test_multiclass(self):
    """Run a cross-validated experiment with 'education' as the target column."""
    transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
    # Add log_level='info' to these options when debugging.
    experiment_options = {
        'target': 'education',
        'hyper_model_options': {'transformer': transformer},
        'cv': True,
        'ensemble_size': 5,
        'max_trials': 5,
        'random_state': 335,
    }
    run_experiment(self.bank_data_cudf.copy(), **experiment_options)
def test_adapt_to_cuml(self):
    """Run an experiment on ``self.bank_data`` with data adaption set to 'cuml'.

    The preprocessor is still built from the cuDF fixture, while the
    experiment input is a copy of ``self.bank_data`` (presumably the
    pandas version of the same data -- confirm against the fixture setup).
    """
    transformer = CumlToolBox.general_preprocessor(self.bank_data_cudf)
    experiment_options = {
        'check_as_local': False,
        'hyper_model_options': {'transformer': transformer},
        'data_adaption_target': 'cuml',
        'cv': False,
        'ensemble_size': 5,
        'max_trials': 5,
        'log_level': 'info',
        'random_state': 335,
    }
    run_experiment(self.bank_data.copy(), **experiment_options)
def test_drift_detector_split(self):
    """Check the drift detector's fitted attributes and its train/test split.

    First half: fit the detector on a 70/30 split of the bank features and
    verify the fitted attributes and the shape of ``predict_proba``.
    Second half: split features and target with the fitted detector, check
    the resulting shapes, and confirm that the recombined parts hash to the
    same rows as the full frame.
    """
    # --- fitting ---------------------------------------------------------
    bank = cudf.from_pandas(load_bank())
    bank.pop('y')  # only the features are used for fitting
    part_a, part_b = CumlToolBox.train_test_split(
        bank.copy(), train_size=0.7, shuffle=True, random_state=9527)

    detector = dd_selector().get_detector()
    detector.fit(part_a, part_b)

    assert len(detector.feature_names_) == 17
    assert len(detector.feature_importances_) == 17
    assert detector.auc_
    assert len(detector.estimator_) == 5

    scores = detector.predict_proba(bank)
    assert scores.shape[0] == bank.shape[0]

    # --- splitting -------------------------------------------------------
    bank = cudf.from_pandas(load_bank())
    target = bank.pop('y')
    n_test = int(bank.shape[0] * 0.2)

    X_train, X_test, y_train, y_test = detector.train_test_split(
        bank.copy(), target, test_size=0.2)

    n_rows, n_cols = bank.shape
    n_train = n_rows - n_test
    assert X_train.shape == (n_train, n_cols)
    assert y_train.shape == (n_train,)
    assert X_test.shape == (n_test, n_cols)
    assert y_test.shape == (n_test,)

    # Re-attach the targets and move everything to local (pandas) objects
    # so the rows can be hashed and compared.
    bank['y'] = target
    X_train['y'] = y_train
    X_test['y'] = y_test
    bank, X_train, X_test = CumlToolBox.to_local(bank, X_train, X_test)

    recombined = pd.concat([X_train, X_test])
    full_hashes = hash_pandas_object(bank).sort_values()
    split_hashes = hash_pandas_object(recombined).sort_values()
    assert (full_hashes == split_hashes).all()
def fit_reload_transform(self, tf, *, df=None, target=None, column_selector=None, dtype=None,
                         check_options=None):
    """Fit ``tf`` on cuDF data, pickle round-trip it, and compare cuml vs. local output.

    Steps:
      1. Prepare a pandas frame (optionally split off the target, select
         columns, cast dtype) and convert it to cuDF.
      2. ``fit_transform`` the transformer on the cuDF data.
      3. Pickle the fitted transformer under ``self.work_dir`` and load it back.
      4. Transform the cuDF data with the reloaded transformer.
      5. Convert the reloaded transformer with ``as_local()``, transform the
         pandas data, and compare both results with ``check_dataframe``.

    Args:
        tf: Transformer under test. It must provide ``fit_transform`` and
            ``transform``, and the reloaded instance must provide ``as_local``.
        df: Input pandas DataFrame. When ``None``, a copy of
            ``self.bank_data`` is used and ``target`` is forced to ``'y'``.
            A caller-supplied frame is not modified.
        target: Name of the target column to split off ``df``, or ``None``
            to fit without a target.
        column_selector: Optional callable that receives the feature frame
            and returns the columns to keep.
        dtype: Optional dtype passed to ``DataFrame.astype`` on the features.
        check_options: Optional keyword arguments forwarded to
            ``check_dataframe``.
    """
    if df is None:
        df = self.bank_data.copy()
        target = 'y'

    if target is not None:
        # Split off the target without DataFrame.pop: pop would remove the
        # column from a caller-supplied frame in place.
        y = df[target]
        df = df.drop(columns=[target])
        y_cf = cudf.from_pandas(y)
    else:
        y_cf = None

    if column_selector:
        columns = column_selector(df)
        df = df[columns]
    if dtype is not None:
        df = df.astype(dtype)
    cf = cudf.from_pandas(df)

    # Fit on a copy so the frame used for transform below stays untouched.
    tf.fit_transform(cf.copy(), y_cf)

    # Pickle round trip of the fitted transformer.
    file_path = f'{self.work_dir}/fitted_{type(tf).__name__}.pkl'
    with open(file_path, 'wb') as f:
        pickle.dump(tf, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(file_path, 'rb') as f:
        tf_loaded = pickle.load(f)
    assert type(tf_loaded) is type(tf)

    # Transform the cudf.DataFrame with the reloaded transformer.
    t = tf_loaded.transform(cf)
    assert t is not None
    assert CumlToolBox.is_cuml_object(t)

    # Convert to a local transformer and transform the pandas data.
    assert hasattr(tf_loaded, 'as_local')
    tf_local = tf_loaded.as_local()
    t2 = tf_local.transform(df.copy())
    assert isinstance(t2, (pd.DataFrame, np.ndarray))

    if check_options is None:
        check_options = {}
    check_dataframe(t, t2, **check_options)
def test_load_data(self):
    """Load ``blood.csv`` from the dsutils module directory and expect a cuDF frame."""
    data_dir, _ = path.split(dsutils.__file__)
    csv_file = f'{data_dir}/blood.csv'
    loaded = CumlToolBox.load_data(csv_file, reset_index=True)
    assert isinstance(loaded, cudf.DataFrame)