def split_data(self, dataset, y_label, train_size=.8, random_state=0, shuffle=True):
    """
    Split ``dataset`` into train/test subsets and separate features from the
    label column, casting features to float32 and labels to int32.

    Parameters
    ----------
    dataset : dataframe (dask_cudf in the multi-GPU path)
    y_label : name of the label column
    train_size : float — NOTE(review): currently unused; the split ratio is
        whatever ``train_test_split``'s default is — confirm intent
    random_state : seed forwarded to ``train_test_split``
    shuffle : bool — NOTE(review): unused; see inline comment below

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    with PerfTimer(self, 'split_timer'):
        train, test = train_test_split(
            dataset,
            random_state=random_state
        )  # unable to shuffle -- no dask_cudf sampler implemented
        # features -> float32, labels -> int32
        X_train, y_train = train.drop(
            y_label, axis=1).astype('float32'), train[y_label].astype('int32')
        X_test, y_test = test.drop(
            y_label, axis=1).astype('float32'), test[y_label].astype('int32')
        if 'multi-GPU' in self.compute_type:
            # pin partitions in worker memory so downstream fit calls
            # do not recompute the split
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client,
                    [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])
    return X_train, X_test, y_train, y_test
def create_cuml_distributed(X_train, y_train):
    """
    Spin up a LocalCUDACluster, distribute the training data across its
    workers, and fit a distributed cuML random forest on it.

    Parameters
    ----------
    X_train : host-side feature array/frame (converted via pandas -> cudf)
    y_train : host-side label vector

    Returns
    -------
    The fitted distributed cuML random forest model.
    """
    start_time = datetime.now()
    print('init dask cluster')

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    try:
        workers = client.has_what().keys()
        n_workers = len(workers)

        # move data onto the GPU, then partition one chunk per worker
        X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        y_train_cudf = cudf.Series(y_train)
        X_train_dask = dask_cudf.from_cudf(X_train_cudf, npartitions=n_workers)
        y_train_dask = dask_cudf.from_cudf(y_train_cudf, npartitions=n_workers)

        # cache the partitions in worker memory before training
        X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
            client, [X_train_dask, y_train_dask], workers=workers)
        print('cuml distributed initialized', datetime.now() - start_time)

        model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
        # BUG FIX: fit on the persisted distributed frames; the original
        # passed the raw host-side X_train/y_train, ignoring the data that
        # was just distributed and persisted across the workers
        model.fit(X_train_ddask, y_train_ddask)

        wait(model.rfs)
        print('cuml distributed finished', datetime.now() - start_time)
    finally:
        # always release the cluster, even if fit raises
        client.close()
        cluster.close()
    return model
def split_dataset(self, dataset, random_state):
    """
    Split dataset into train and test data subsets,
    currently using CV-fold index for randomness.
    Plan to refactor with dask_ml KFold
    """
    hpo_log.info('> train-test split')
    target = self.hpo_config.label_column

    train, test = train_test_split(dataset, random_state=random_state)

    # separate features from the label for each subset
    X_train, y_train = train.drop(target, axis=1), train[target]
    X_test, y_test = test.drop(target, axis=1), test[target]

    # force execution by pinning the partitions onto the dask workers
    persisted = persist_across_workers(
        self.client,
        [X_train, y_train, X_test, y_test],
        workers=self.client.has_what().keys())
    X_train, y_train, X_test, y_test = persisted

    # block until the persisted partitions are materialized
    wait(persisted)

    dtype = self.hpo_config.dataset_dtype
    return (X_train.astype(dtype),
            X_test.astype(dtype),
            y_train.astype(dtype),
            y_test.astype(dtype))
def test_rf_regression_dask_fil(partitions_per_worker, cluster):
    """
    Fit a multi-GPU cuML random forest regressor on synthetic data
    distributed as dask_cudf partitions, predict via FIL, and require
    an r2 score of at least 0.67.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)
    try:
        X, y = make_regression(n_samples=10000, n_features=20,
                               n_informative=10, random_state=123)
        X = X.astype(np.float32)
        y = y.astype(np.float32)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=1000)
        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }
        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = \
            dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
        # flatten labels to a 1-D vector before building the cudf Series
        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = \
            dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)
        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = \
            dask_cudf.from_cudf(X_cudf_test, npartitions=n_partitions)

        # cache training partitions in worker memory before fitting
        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)
        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # NOTE(review): predictions are passed as the first argument;
        # sklearn's r2_score expects y_true first — confirm which
        # r2_score implementation is imported here
        acc_score = r2_score(cu_rf_mg_predict, y_test)
        assert acc_score >= 0.67
    finally:
        c.close()
def _prep_training_data(c, X_train, partitions_per_worker):
    """Convert host data into a dask_cudf frame persisted across workers."""
    worker_ids = c.has_what().keys()
    gdf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    ddf = dask_cudf.from_cudf(
        gdf, npartitions=len(worker_ids) * partitions_per_worker)
    # pin the partitions in worker memory before returning
    persisted = dask_utils.persist_across_workers(c, [ddf],
                                                  workers=worker_ids)
    return persisted[0]
def ETL(self, columns=None, label_column=None, random_seed=0):
    """ Perfom ETL given a set of target dataset to prepare for model training.
         1. Ingest parquet compressed dataset
         2. Drop samples with missing data [ predominantly cancelled flights ]
         3. Split dataset into train and test subsets

        Returns (X_train, X_test, y_train, y_test) with features cast to
        float32 and labels cast to int32.
        NOTE(review): if self.compute_type matches neither 'single' nor
        'multi', the return statement raises UnboundLocalError.
    """
    with PerfTimer('ETL'):
        if 'single' in self.compute_type:
            if 'CPU' in self.compute_type:
                # single-node CPU: pandas + sklearn split
                from sklearn.model_selection import train_test_split
                dataset = pandas.read_parquet(self.target_files,
                                              columns=columns,
                                              engine='pyarrow')
                dataset = self.handle_missing_data(dataset)
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.loc[:, dataset.columns != label_column],
                    dataset[label_column], random_state=random_seed)
            elif 'GPU' in self.compute_type:
                # single-node GPU: cudf + cuml split (takes the label name)
                from cuml.preprocessing.model_selection import train_test_split
                dataset = cudf.read_parquet(self.target_files,
                                            columns=columns)
                dataset = self.handle_missing_data(dataset)
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset, label_column, random_state=random_seed)
        elif 'multi' in self.compute_type:
            # multi-node: dask(_cudf) ingestion + dask_ml split
            from dask_ml.model_selection import train_test_split
            if 'CPU' in self.compute_type:
                dataset = self.dask_cpu_parquet_ingest(self.target_files,
                                                       columns=columns)
            elif 'GPU' in self.compute_type:
                dataset = self.dask_gpu_parquet_ingest(self.target_files,
                                                       columns=columns)
            dataset = self.handle_missing_data(dataset)
            # split [ always runs, regardless of whether dataset is cached ]
            train, test = train_test_split(dataset,
                                           random_state=random_seed)
            # build X [ features ], y [ labels ] for the train and test subsets
            y_train = train[label_column]
            X_train = train.drop(label_column, axis=1)
            y_test = test[label_column]
            X_test = test.drop(label_column, axis=1)
            if 'GPU' in self.compute_type:
                # pin partitions in worker memory; NOTE(review): the
                # multi-CPU path does not persist, and no wait() is
                # issued here — confirm this is intentional
                X_train, y_train, X_test, y_test = persist_across_workers(
                    self.client,
                    [X_train, y_train, X_test, y_test],
                    workers=self.client.has_what().keys())
        return X_train.astype('float32'), X_test.astype('float32'),\
            y_train.astype('int32'), y_test.astype('int32')
def _prep_training_data(c, X_train, y_train, partitions_per_worker):
    """Distribute features and labels as persisted dask_cudf collections."""
    worker_ids = c.has_what().keys()
    n_parts = partitions_per_worker * len(worker_ids)

    # host -> GPU conversion; labels flattened to a 1-D series
    features = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    label_values = np.array(pd.DataFrame(y_train).values)[:, 0]
    labels = cudf.Series(label_values)

    X_df = dask_cudf.from_cudf(features, npartitions=n_parts)
    y_df = dask_cudf.from_cudf(labels, npartitions=n_parts)

    # pin the partitions in worker memory before handing them back
    X_df, y_df = dask_utils.persist_across_workers(
        c, [X_df, y_df], workers=worker_ids)
    return X_df, y_df
def distribute(X, y):
    """Scatter X/y to the GPU workers as persisted dask_cudf collections.

    Relies on the module-level ``c`` (client), ``workers`` and
    ``n_partitions`` bindings.
    """
    # First convert to cudf (with real data, you would likely load in
    # cuDF format to start)
    gpu_X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
    gpu_y = cudf.Series(y)

    # Partition with Dask: each worker will train on 1/n_partitions
    # fraction of the data
    dask_X = dask_cudf.from_cudf(gpu_X, npartitions=n_partitions)
    dask_y = dask_cudf.from_cudf(gpu_y, npartitions=n_partitions)

    # Persist to cache the data in active memory
    dask_X, dask_y = dask_utils.persist_across_workers(
        c, [dask_X, dask_y], workers=workers)
    return dask_X, dask_y
def fit_random_forest(self, X_train, y_train, model_params):
    """
    Trains a RandomForest model on X_train and y_train with model_params.
    Depending on compute_type, estimators from appropriate packages are used.
    CPU - sklearn
    Single-GPU - cuml
    multi_gpu - cuml.dask

    Returns
    -------
    (trained_model, fit_duration_seconds)

    Raises
    ------
    Whatever ``rf_model.fit`` raises (re-raised after logging).
    """
    if "CPU" in self.compute_type:
        rf_model = sklearn.ensemble.RandomForestClassifier(
            n_estimators=model_params["n_estimators"],
            max_depth=model_params["max_depth"],
            max_features=model_params["max_features"],
            n_jobs=int(self.n_workers),
            verbose=self.verbose_estimator,
        )
    elif "GPU" in self.compute_type:
        if "single" in self.compute_type:
            rf_model = cuml.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                n_bins=model_params["n_bins"],
                max_features=model_params["max_features"],
                verbose=self.verbose_estimator,
            )
        elif "multi" in self.compute_type:
            self.log_to_file("\n\tFitting multi-GPU daskRF")
            # dask RF requires persisted (NaN-free) input
            X_train, y_train = dask_utils.persist_across_workers(
                self.client,
                [X_train.fillna(0.0), y_train.fillna(0.0)],
                workers=self.workers,
            )
            rf_model = cuml.dask.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                n_bins=model_params["n_bins"],
                max_features=model_params["max_features"],
                verbose=self.verbose_estimator,
            )
    with PerfTimer() as train_timer:
        try:
            trained_model = rf_model.fit(X_train, y_train)
        except Exception as error:
            self.log_to_file("\n\n! Error during fit " + str(error))
            # BUG FIX: previously execution fell through to the return,
            # raising an opaque NameError on the unbound `trained_model`;
            # log the failure and propagate the real exception instead
            raise
    return trained_model, train_timer.duration
def test_rf_regression(n_workers, partitions_per_worker):
    """
    Fit a multi-GPU cuML random forest regressor on synthetic data and
    require an r2 score of at least 0.70.

    Skips when fewer GPUs than requested workers are available.
    """
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    c = Client(cluster)
    # BUG FIX: wrap the body in try/finally so the client and cluster are
    # released even when an assertion or fit error occurs (matches the
    # cleanup style of the sibling dask tests)
    try:
        X, y = make_regression(n_samples=40000, n_features=20,
                               n_informative=10, random_state=123)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=1000)
        cu_rf_params = {
            'n_estimators': 25,
            'max_depth': 13,
        }
        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = \
            dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
        # flatten labels to a 1-D vector before building the cudf Series
        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = \
            dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

        # cache training partitions in worker memory before fitting
        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)
        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test)
        acc_score = r2_score(cu_rf_mg_predict, y_test)
        print(str(acc_score))
        assert acc_score >= 0.70
    finally:
        c.close()
        cluster.close()
def persist_training_inputs(self, X_train, y_train, X_test, y_test):
    """
    In the case of dask multi-CPU and dask multi-GPU Random Forest,
    we need the dataset to be computed/persisted prior to a fit call.
    In the case of XGBoost this step is performed by the DMatrix creation.

    Returns
    -------
    (X_train, y_train, X_test, y_test) — persisted collections on the
    multi-CPU / multi-GPU paths, the inputs unchanged otherwise.
    NOTE(review): the multi-CPU branch persists only the training data,
    not X_test/y_test — confirm whether that asymmetry is intentional.
    """
    if 'multi-CPU' in self.compute_type:
        X_train = X_train.persist()
        y_train = y_train.persist()
    elif 'multi-GPU' in self.compute_type:
        from cuml.dask.common.utils import persist_across_workers
        # BUG FIX: four collections are persisted, so four targets are
        # required; the original unpacked into only (X_train, y_train),
        # which raises "too many values to unpack" at runtime
        X_train, y_train, X_test, y_test = persist_across_workers(
            self.client,
            [X_train, y_train, X_test, y_test],
            workers=self.client.has_what().keys())
        wait([X_train, y_train, X_test, y_test])
    return X_train, y_train, X_test, y_test
def _prep_training_data(c, X_train, partitions_per_worker,
                        reverse_order=False):
    """Build a persisted dask_cudf frame from host data, optionally
    assigning partitions to the workers in reverse order."""
    worker_ids = c.has_what().keys()
    if reverse_order:
        worker_ids = list(worker_ids)[::-1]

    part_count = partitions_per_worker * len(worker_ids)
    gdf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
    ddf = dask_cudf.from_cudf(gdf, npartitions=part_count)

    # pin the partitions in worker memory before returning
    ddf, = dask_utils.persist_across_workers(c, [ddf],
                                             workers=list(worker_ids))
    return ddf
def test_rf_regression_dask_cpu(partitions_per_worker, client):
    """Fit a multi-GPU RF regressor, predict on the CPU path, and require
    an r2 score of at least 0.67."""
    n_workers = len(client.scheduler_info()['workers'])

    # synthetic regression problem scaled to the worker count
    X, y = make_regression(n_samples=n_workers * 2000,
                           n_features=20,
                           n_informative=10,
                           random_state=123)
    X = X.astype(np.float32)
    y = y.astype(np.float32)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=n_workers * 400, random_state=123)

    workers = client.has_what().keys()
    n_partitions = partitions_per_worker * len(workers)

    # distribute the training set as persisted dask_cudf collections
    X_train_df = dask_cudf.from_cudf(
        cudf.DataFrame.from_pandas(pd.DataFrame(X_train)),
        npartitions=n_partitions)
    y_train_df = dask_cudf.from_cudf(
        cudf.Series(y_train), npartitions=n_partitions)
    X_train_df, y_train_df = dask_utils.persist_across_workers(
        client, [X_train_df, y_train_df], workers=workers)

    params = {
        'n_estimators': 50,
        'max_depth': 16,
        'n_bins': 16,
    }
    cuml_mod = cuRFR_mg(**params)
    cuml_mod.fit(X_train_df, y_train_df)

    predictions = cuml_mod.predict(X_test, predict_model='CPU')
    assert r2_score(predictions, y_test) >= 0.67
def ETL ( self, cached_dataset = None, random_seed = 0,
          columns = airline_columns, label_column = airline_label_column ):
    """
    run ETL [ ingest -> rebalance -> drop missing -> split -> persist ]
    after the first run the dataset is cached, so only split is re-run (re-shuffled)

    Returns (X_train, X_test, y_train, y_test, dataset) on the single and
    multi paths; returns None when self.compute_type matches neither
    'single' nor 'multi'.
    """
    with PerfTimer( 'ETL' ):
        if 'single' in self.compute_type:
            if 'CPU' in self.compute_type:
                # single-node CPU: pandas + sklearn split
                from sklearn.model_selection import train_test_split
                dataset = pandas.read_parquet( self.target_files,
                                               columns = columns,
                                               engine='pyarrow' )
                dataset = dataset.dropna()
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset.loc[:, dataset.columns != label_column],
                    dataset[label_column], random_state = random_seed )
            elif 'GPU' in self.compute_type:
                # single-node GPU: cudf + cuml split (takes the label name)
                from cuml.preprocessing.model_selection import train_test_split
                dataset = cudf.read_parquet( self.target_files,
                                             columns = columns )
                dataset = dataset.dropna()
                X_train, X_test, y_train, y_test = train_test_split(
                    dataset, label_column, random_state = random_seed )
            return X_train, X_test, y_train, y_test, dataset
        elif 'multi' in self.compute_type:
            from dask_ml.model_selection import train_test_split
            if cached_dataset is None:
                # first run: ingest from parquet
                if 'CPU' in self.compute_type:
                    dataset = dask.dataframe.read_parquet( self.target_files,
                                                           columns = columns,
                                                           engine='pyarrow')
                elif 'GPU' in self.compute_type:
                    dataset = dask_cudf.read_parquet( self.target_files,
                                                      columns = columns )
                # drop missing values [ ~2.5% -- predominantly cancelled flights ]
                dataset = dataset.dropna()
                # repartition [ inplace ], rebalance ratio of workers & data partitions
                # NOTE(review): initial_npartitions is recorded but never used
                initial_npartitions = dataset.npartitions
                dataset = dataset.repartition( npartitions = self.n_workers )
            else:
                print( f"using cache [ skiping ingestion, dropna, and repartition ]")
                # sanity-check the cached object matches the compute path
                if 'multi-CPU' in self.compute_type:
                    assert( type(cached_dataset) == dask.dataframe.core.DataFrame )
                if 'multi-GPU' in self.compute_type:
                    assert( type(cached_dataset) == dask_cudf.core.DataFrame )
                dataset = cached_dataset
            # split [ always runs, regardless of whether dataset is cached ]
            train, test = train_test_split( dataset,
                                            random_state = random_seed )
            # build X [ features ], y [ labels ] for the train and test subsets
            y_train = train[label_column].astype('int32')
            X_train = train.drop(label_column, axis = 1).astype('float32')
            y_test = test[label_column].astype('int32')
            X_test = test.drop(label_column, axis = 1).astype('float32')
            # force computation / persist
            if 'multi-CPU' in self.compute_type:
                X_train = X_train.persist(); X_test = X_test.persist()
                y_train = y_train.persist(); y_test = y_test.persist()
            elif 'multi-GPU' in self.compute_type:
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = \
                    persist_across_workers( self.client,
                                            [ X_train, X_test,
                                              y_train, y_test ],
                                            workers = workers )
                wait( [X_train, X_test, y_train, y_test] );
            # return [ CPU/GPU ] dask dataframes
            return X_train, X_test, y_train, y_test, dataset
    return None