def create_cuml_distributed(X_train, y_train):
    """Fit a distributed cuML random forest on a local Dask CUDA cluster.

    A ``LocalCUDACluster`` is started, the inputs are converted to
    ``dask_cudf`` collections with one partition per worker, persisted
    across the workers, and a 500-tree forest is fit on them.  Progress and
    elapsed time are printed.

    Parameters
    ----------
    X_train : array-like
        Training features; anything ``pandas.DataFrame`` accepts.
    y_train : array-like
        Training labels; anything ``cudf.Series`` accepts.

    Returns
    -------
    The fitted ``distributed_cuml_Rf`` model.
    """
    start_time = datetime.now()
    print('init dask cluster')

    cluster = LocalCUDACluster(threads_per_worker=1)
    client = None
    try:
        client = Client(cluster)
        workers = client.has_what().keys()
        n_workers = len(workers)

        X_train_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        y_train_cudf = cudf.Series(y_train)

        X_train_dask = dask_cudf.from_cudf(X_train_cudf, npartitions=n_workers)
        y_train_dask = dask_cudf.from_cudf(y_train_cudf, npartitions=n_workers)

        X_train_ddask, y_train_ddask = dask_utils.persist_across_workers(
            client, [X_train_dask, y_train_dask], workers=workers)
        print('cuml distributed initialized', datetime.now() - start_time)

        model = distributed_cuml_Rf(n_estimators=500, n_streams=64)
        # Fit on the persisted distributed collections; the original passed
        # the raw host inputs and never used the data it had just persisted.
        model.fit(X_train_ddask, y_train_ddask)

        wait(model.rfs)
        print('cuml distributed finished', datetime.now() - start_time)
        return model
    finally:
        # Always release the client and the cluster, even when fitting fails.
        if client is not None:
            client.close()
        cluster.close()
def test_rf_regression_dask_fil(partitions_per_worker, cluster):
    """Fit a multi-GPU RF regressor and check its R^2 on distributed test data.

    ``partitions_per_worker`` scales the number of dask_cudf partitions by
    the number of workers reported by the client; ``cluster`` is the Dask
    cluster the client connects to.
    """
    # Use CUDA_VISIBLE_DEVICES to control the number of workers
    c = Client(cluster)
    try:
        X, y = make_regression(n_samples=10000, n_features=20,
                               n_informative=10, random_state=123)

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=1000)

        cu_rf_params = {
            'n_estimators': 50,
            'max_depth': 16,
            'n_bins': 16,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        # Flatten the labels to a 1-D series before distributing them.
        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

        X_cudf_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
        X_test_df = dask_cudf.from_cudf(X_cudf_test,
                                        npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)

        cu_rf_mg_predict = cu_rf_mg.predict(X_test_df).compute()
        cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict))

        # r2_score is not symmetric: ground truth first, predictions second.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        assert acc_score >= 0.67

    finally:
        c.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):
    """Fit distributed KMeans on well-separated blobs and expect a perfect
    adjusted Rand score, with optional delayed prediction."""
    client = None

    try:
        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        # client stays None when Client(cluster) itself raised; closing it
        # unconditionally would mask that error with an AttributeError.
        if client is not None:
            client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):
    """Cluster tightly packed blobs with distributed KMeans and require the
    predicted labels to match the generated ones exactly (ARI == 1.0)."""
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        blobs, truth = make_blobs(nrows, ncols, nclusters, n_parts,
                                  cluster_std=0.01, verbose=True,
                                  random_state=10)
        wait(blobs)

        kmeans = cumlKMeans(verbose=1, init="k-means||",
                            n_clusters=nclusters, random_state=10)
        kmeans.fit(blobs)
        predicted = kmeans.predict(blobs)

        worker_count = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        fewer_parts_than_workers = (n_parts is not None
                                    and n_parts < worker_count)
        expected_parts = n_parts if fewer_parts_than_workers else worker_count
        assert predicted.npartitions == expected_parts

        from sklearn.metrics import adjusted_rand_score

        predicted_host = predicted.compute().to_pandas().values

        assert predicted_host.shape[0] == nrows
        assert np.max(predicted_host) == nclusters - 1
        assert np.min(predicted_host) == 0

        truth_host = truth.compute().to_pandas().values
        flat_truth = truth_host.reshape(truth_host.shape[0])

        score = adjusted_rand_score(flat_truth, predicted_host)

        assert 1.0 == score

    finally:
        client.close()
def test_rf_regression(n_workers, partitions_per_worker):
    """Fit a multi-GPU RF regressor on its own local CUDA cluster and check
    the R^2 score on a held-out split.

    The test is skipped when fewer than ``n_workers`` GPUs are available.
    """
    if dask_cuda.utils.get_n_gpus() < n_workers:
        pytest.skip("too few GPUs")

    cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers)
    c = None
    try:
        c = Client(cluster)

        X, y = make_regression(n_samples=40000, n_features=20,
                               n_informative=10, random_state=123)

        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=1000)

        cu_rf_params = {
            'n_estimators': 25,
            'max_depth': 13,
        }

        workers = c.has_what().keys()
        n_partitions = partitions_per_worker * len(workers)

        X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))
        X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)

        # Flatten the labels to a 1-D series before distributing them.
        y_cudf = np.array(pd.DataFrame(y_train).values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)
        y_train_df = dask_cudf.from_cudf(y_cudf, npartitions=n_partitions)

        X_train_df, y_train_df = dask_utils.persist_across_workers(
            c, [X_train_df, y_train_df], workers=workers)

        cu_rf_mg = cuRFR_mg(**cu_rf_params)
        cu_rf_mg.fit(X_train_df, y_train_df)
        cu_rf_mg_predict = cu_rf_mg.predict(X_test)

        # r2_score is not symmetric: ground truth first, predictions second.
        acc_score = r2_score(y_test, cu_rf_mg_predict)

        print(str(acc_score))

        assert acc_score >= 0.70
    finally:
        # Tear the cluster down even when an assertion or the fit fails, so
        # GPU workers are not leaked into subsequent tests.
        if c is not None:
            c.close()
        cluster.close()
def test_ols(cluster):
    """Fit distributed OLS on data from ``load_data`` and require a
    near-zero mean squared error on the training set."""
    client = Client(cluster)

    try:
        import dask_cudf

        import cudf
        import numpy as np

        from cuml.dask.linear_model import LinearRegression as cumlOLS_dask

        nrows = 2**8
        ncols = 399

        X, y = load_data(nrows, ncols)

        X_cudf = cudf.DataFrame.from_pandas(X)
        # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
        # same 2-D ndarray on old and new pandas alike.
        y_cudf = np.array(y.values)
        y_cudf = y_cudf[:, 0]
        y_cudf = cudf.Series(y_cudf)

        workers = client.has_what().keys()

        X_df = dask_cudf.from_cudf(X_cudf, npartitions=len(workers)).persist()
        y_df = dask_cudf.from_cudf(y_cudf, npartitions=len(workers)).persist()

        lr = cumlOLS_dask()

        lr.fit(X_df, y_df)

        ret = lr.predict(X_df)

        error_cuml = mean_squared_error(y, ret.compute().to_array())

        assert(error_cuml < 1e-6)

    finally:
        client.close()
        # NOTE(review): this also closes the cluster passed in by the caller;
        # confirm it is not shared with other tests.
        cluster.close()
class RapidsCloudML(object):
    """Ingest, split, train and score workflow for XGBoost / RandomForest.

    The branch taken in every step is chosen by substring checks on
    ``compute_type`` ("CPU"/"GPU", "single"/"multi"), ``data_type`` and
    ``model_type``.
    """

    def __init__(
        self,
        cloud_type="Azure",
        model_type="RandomForest",
        data_type="Parquet",
        compute_type="single-GPU",
        verbose_estimator=False,
        CSP_paths=default_azureml_paths,
    ):
        """
        Store the configuration and, for "multi" compute types, start a
        ``LocalCUDACluster`` and connect a Dask client to it.

        Parameters
        ----------
        cloud_type : string
            Stored and logged only.
        model_type : string
            'XGBoost' or 'RandomForest'.
        data_type : string
            'ORC', 'CSV' or 'Parquet'.
        compute_type : string
            Must contain "CPU" or "GPU" and "single" or "multi".
        verbose_estimator : bool
            Forwarded as ``verbose`` to the random forest estimators.
        CSP_paths : dict
            Paths; the keys 'hyperparams' and 'output' are read by this class.
        """
        self.CSP_paths = CSP_paths
        self.cloud_type = cloud_type
        self.model_type = model_type
        self.data_type = data_type
        self.compute_type = compute_type
        self.verbose_estimator = verbose_estimator

        # Defined for every compute type so later attribute reads do not
        # raise AttributeError in single-node mode.
        self.client = None
        self.workers = None
        self.n_workers = None

        self.log_to_file(
            f"\n> RapidsCloudML\n\tCompute, Data , Model, Cloud types {self.compute_type, self.data_type, self.model_type, self.cloud_type}"
        )

        # Setting up client for multi-GPU option
        if "multi" in self.compute_type:
            self.log_to_file("\n\tMulti-GPU selected")
            # This will use all GPUs on the local host by default
            cluster = LocalCUDACluster(threads_per_worker=1)
            self.client = Client(cluster)

            # Query the client for all connected workers
            self.workers = self.client.has_what().keys()
            self.n_workers = len(self.workers)
            self.log_to_file(f"\n\tClient information {self.client}")

    def load_hyperparams(self, model_name="XGBoost"):
        """
        Build the model parameters for ``self.model_type``.

        Starts from built-in defaults and overrides them with the JSON file
        at ``self.CSP_paths['hyperparams']`` when it can be read.  If the
        file is missing or invalid the error is logged and the defaults are
        returned.

        Parameters
        ----------
        model_name : string
            Unused; kept for backward compatibility.  The defaults are
            selected by ``self.model_type``.

        Returns
        -------
        model_params : dict
            Loaded model parameters.  Empty defaults are used when
            ``self.model_type`` is neither 'XGBoost' nor 'RandomForest'.
        """
        self.log_to_file("\n> Loading Hyperparameters")

        # Default parameters of the models
        model_params = {}
        if self.model_type == "XGBoost":
            # https://xgboost.readthedocs.io/en/latest/parameter.html
            model_params = {
                "max_depth": 6,
                "num_boost_round": 100,
                "learning_rate": 0.3,
                "gamma": 0.0,
                "lambda": 1.0,
                "alpha": 0.0,
                "objective": "binary:logistic",
                "random_state": 0,
            }
        elif self.model_type == "RandomForest":
            # https://docs.rapids.ai/api/cuml/stable/ -> cuml.ensemble.RandomForestClassifier
            model_params = {
                "n_estimators": 10,
                "max_depth": 10,
                "n_bins": 16,
                "max_features": 1.0,
                "seed": 0,
            }

        try:
            with open(self.CSP_paths["hyperparams"], "r") as file_handle:
                hyperparameters = json.load(file_handle)
            for key, value in hyperparameters.items():
                model_params[key] = value
            pprint.pprint(model_params)
        except Exception as error:
            # Best effort: a missing or malformed config falls back to the
            # defaults instead of returning None.
            self.log_to_file(str(error))
        return model_params

    def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBinary"):
        """
        Load the dataset from ``filename`` with the reader matching
        ``compute_type`` and ``data_type``.

        All columns are cast to float32.  When ``y_label`` is not a column it
        is generated as ``ArrDelay > 10``; it is then cast to int32 and nulls
        are filled with 0.0.

        Parameters
        ----------
        filename : string
            The path of the dataset to be loaded.
        col_labels : list of strings
            The input columns that we are interested in.  Used as column
            names for CSV and as column selection for multi-node Parquet.
        y_label : string
            The column to perform the prediction task in.

        Returns
        -------
        dataset : dataframe (Pandas, cudf or dask-cudf)
            Ingested dataset in the format of a dataframe
        col_labels : list of strings
            The input columns selected
        y_label : string
            The generated y_label name for binary classification
        duration : float
            The time it took to execute the function
        """
        target_filename = filename
        self.log_to_file(f"\n> Loading dataset from {target_filename}")

        with PerfTimer() as ingestion_timer:
            if "CPU" in self.compute_type:
                # CPU Reading options
                self.log_to_file(f"\n\tCPU read")

                if self.data_type == "ORC":
                    with open(target_filename, mode="rb") as file:
                        dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
                elif self.data_type == "CSV":
                    dataset = pd.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":
                    if "single" in self.compute_type:
                        dataset = pd.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file(f"\n\tReading using dask dataframe")
                        # The original passed an undefined name `columns`
                        # here; the column selection is `col_labels`.
                        dataset = dask.dataframe.read_parquet(target_filename,
                                                              columns=col_labels)

            elif "GPU" in self.compute_type:
                # GPU Reading Option

                self.log_to_file(f"\n\tGPU read")
                if self.data_type == "ORC":
                    dataset = cudf.read_orc(target_filename)

                elif self.data_type == "CSV":
                    dataset = cudf.read_csv(target_filename, names=col_labels)

                elif self.data_type == "Parquet":

                    if "single" in self.compute_type:
                        dataset = cudf.read_parquet(target_filename)

                    elif "multi" in self.compute_type:
                        self.log_to_file(f"\n\tReading using dask_cudf")
                        dataset = dask_cudf.read_parquet(target_filename,
                                                         columns=col_labels)

        # cast all columns to float32
        for col in dataset.columns:
            dataset[col] = dataset[col].astype(
                np.float32)  # needed for random forest

        # Adding y_label column if it is not present
        if y_label not in dataset.columns:
            dataset[y_label] = 1.0 * (dataset["ArrDelay"] > 10)

        dataset[y_label] = dataset[y_label].astype(
            np.int32)  # Needed for cuml RF

        dataset = dataset.fillna(
            0.0)  # Filling the null values. Needed for dask-cudf

        self.log_to_file(
            f"\n\tIngestion completed in {ingestion_timer.duration}")
        self.log_to_file(
            f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}")
        return dataset, col_labels, y_label, ingestion_timer.duration

    def split_data(self, dataset, y_label, train_size=0.8, random_state=0, shuffle=True):
        """
        Split ``dataset`` into train and test sets.

        CPU compute uses sklearn, with the y_label column filtered manually
        in the split call.  Single GPU uses cuml and multi GPU uses dask;
        both receive the label column name and split it internally.

        Parameters
        ----------
        dataset : dataframe
            The dataframe on which we wish to perform the split
        y_label : string
            The name of the column (not the series itself)
        train_size : float
            The size for the split. Takes values between 0 to 1.
        random_state : int
            Useful for running reproducible splits.
        shuffle : binary
            Specifies if the data must be shuffled before splitting.
            Ignored (always False) in the multi-GPU branch.

        Returns
        -------
        X_train : dataframe
            The data to be used for training. Has same type as input dataset.
        X_test : dataframe
            The data to be used for testing. Has same type as input dataset.
        y_train : dataframe
            The label to be used for training. Has same type as input dataset.
        y_test : dataframe
            The label to be used for testing. Has same type as input dataset.
        duration : float
            The time it took to perform the split
        """
        self.log_to_file("\n> Splitting train and test data")

        with PerfTimer() as split_timer:
            if "CPU" in self.compute_type:
                X_train, X_test, y_train, y_test = sklearn_train_test_split(
                    dataset.loc[:, dataset.columns != y_label],
                    dataset[y_label],
                    train_size=train_size,
                    shuffle=shuffle,
                    random_state=random_state,
                )

            elif "GPU" in self.compute_type:
                if "single" in self.compute_type:
                    X_train, X_test, y_train, y_test = cuml_train_test_split(
                        X=dataset,
                        y=y_label,
                        train_size=train_size,
                        shuffle=shuffle,
                        random_state=random_state,
                    )
                elif "multi" in self.compute_type:
                    X_train, X_test, y_train, y_test = dask_train_test_split(
                        dataset,
                        y_label,
                        train_size=train_size,
                        shuffle=False,  # shuffle not available for dask_cudf yet
                        random_state=random_state,
                    )

        self.log_to_file(
            f"\n\tX_train shape and type{X_train.shape} {type(X_train)}")
        self.log_to_file(f"\n\tSplit completed in {split_timer.duration}")
        return X_train, X_test, y_train, y_test, split_timer.duration

    def train_model(self, X_train, y_train, model_params):
        """
        Train a model with ``model_params`` by calling ``fit_xgboost`` or
        ``fit_random_forest`` depending on ``model_type``.

        Exceptions raised during training are logged, not propagated.

        Parameters
        ----------
        X_train : dataframe
            The data for training
        y_train : dataframe
            The label to be used for training.
        model_params : dict
            The model params to use for this training

        Returns
        -------
        trained_model : object
            The trained XGBoost or RandomForest model, or None when training
            failed or ``model_type`` is not recognised.
        training_time : float
            The time it took to train the model (0 on failure)
        """
        self.log_to_file(
            f"\n> Training {self.model_type} estimator w/ hyper-params")
        # Pre-bind the results so a logged training failure does not turn
        # into an UnboundLocalError at the return statement.
        trained_model = None
        training_time = 0

        try:
            if self.model_type == "XGBoost":
                trained_model, training_time = self.fit_xgboost(
                    X_train, y_train, model_params)
            elif self.model_type == "RandomForest":
                trained_model, training_time = self.fit_random_forest(
                    X_train, y_train, model_params)
        except Exception as error:
            self.log_to_file("\n\n!error during model training: " + str(error))

        self.log_to_file(f"\n\tFinished training in {training_time:.4f} s")
        return trained_model, training_time

    def fit_xgboost(self, X_train, y_train, model_params):
        """
        Train an XGBoost model on X_train and y_train with model_params.

        ``model_params`` is updated in place with ``tree_method``
        ("gpu_hist" for GPU compute types, "hist" otherwise).

        Parameters and returned objects are the same as for ``train_model``.
        """
        if "GPU" in self.compute_type:
            model_params.update({"tree_method": "gpu_hist"})
        else:
            model_params.update({"tree_method": "hist"})

        with PerfTimer() as train_timer:
            if "single" in self.compute_type:
                train_DMatrix = xgboost.DMatrix(data=X_train, label=y_train)
                trained_model = xgboost.train(
                    dtrain=train_DMatrix,
                    params=model_params,
                    num_boost_round=model_params["num_boost_round"],
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tTraining multi-GPU XGBoost")
                train_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                         data=X_train,
                                                         label=y_train)
                trained_model = xgboost.dask.train(
                    self.client,
                    dtrain=train_DMatrix,
                    params=model_params,
                    num_boost_round=model_params["num_boost_round"],
                )
        return trained_model, train_timer.duration

    def fit_random_forest(self, X_train, y_train, model_params):
        """
        Train a RandomForest model on X_train and y_train with model_params.

        Depending on compute_type, estimators from appropriate packages are
        used:

            CPU - sklearn
            Single-GPU - cuml
            multi_gpu - cuml.dask

        Errors raised by ``fit`` are logged and the returned model is None.

        Parameters and returned objects are the same as for ``train_model``.
        """
        if "CPU" in self.compute_type:
            # n_workers is only known when a Dask cluster was started; fall
            # back to -1 (all processors in sklearn) otherwise.
            n_jobs = int(self.n_workers) if self.n_workers else -1
            rf_model = sklearn.ensemble.RandomForestClassifier(
                n_estimators=model_params["n_estimators"],
                max_depth=model_params["max_depth"],
                max_features=model_params["max_features"],
                n_jobs=n_jobs,
                verbose=self.verbose_estimator,
            )
        elif "GPU" in self.compute_type:
            if "single" in self.compute_type:
                rf_model = cuml.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )
            elif "multi" in self.compute_type:
                self.log_to_file("\n\tFitting multi-GPU daskRF")
                X_train, y_train = dask_utils.persist_across_workers(
                    self.client,
                    [X_train.fillna(0.0),
                     y_train.fillna(0.0)],
                    workers=self.workers,
                )
                rf_model = cuml.dask.ensemble.RandomForestClassifier(
                    n_estimators=model_params["n_estimators"],
                    max_depth=model_params["max_depth"],
                    n_bins=model_params["n_bins"],
                    max_features=model_params["max_features"],
                    verbose=self.verbose_estimator,
                )

        # Pre-bind so a logged fit failure returns None instead of raising
        # UnboundLocalError.
        trained_model = None
        with PerfTimer() as train_timer:
            try:
                trained_model = rf_model.fit(X_train, y_train)
            except Exception as error:
                self.log_to_file("\n\n! Error during fit " + str(error))
        return trained_model, train_timer.duration

    def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
        """
        Evaluate the model accuracy on the inference set.

        For XGBoost a DMatrix is generated, predictions are thresholded and
        compared with ``y_test``.  For single-GPU Random Forest the model's
        ``score`` is used; multi-GPU Random Forest predicts and then computes
        the accuracy score.  Exceptions are logged, not propagated.

        Parameters
        ----------
        trained_model : object
            The trained XGBoost or RandomForest model
        X_test : dataframe
            The data for testing
        y_test : dataframe
            The label to be used for testing.
        threshold : float
            XGBoost predictions above this value count as class 1.

        Returns
        -------
        test_accuracy : float
            The accuracy achieved on test set, or None on failure
        duration : float
            The time it took to evaluate the model
        """
        self.log_to_file(f"\n> Inferencing on test set")
        test_accuracy = None
        with PerfTimer() as inference_timer:
            try:
                if self.model_type == "XGBoost":
                    if "multi" in self.compute_type:
                        test_DMatrix = xgboost.dask.DaskDMatrix(self.client,
                                                                data=X_test,
                                                                label=y_test)
                        xgb_pred = xgboost.dask.predict(
                            self.client, trained_model,
                            test_DMatrix).compute()
                        xgb_pred = (xgb_pred > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test.compute(),
                                                       xgb_pred)
                    elif "single" in self.compute_type:
                        test_DMatrix = xgboost.DMatrix(data=X_test,
                                                       label=y_test)
                        xgb_pred = trained_model.predict(test_DMatrix)
                        xgb_pred = (xgb_pred > threshold) * 1.0
                        test_accuracy = accuracy_score(y_test, xgb_pred)

                elif self.model_type == "RandomForest":
                    if "multi" in self.compute_type:
                        cuml_pred = trained_model.predict(X_test).compute()
                        self.log_to_file("\n\tPrediction complete")
                        test_accuracy = accuracy_score(y_test.compute(),
                                                       cuml_pred,
                                                       convert_dtype=True)
                    elif "single" in self.compute_type:
                        test_accuracy = trained_model.score(
                            X_test, y_test.astype("int32"))

            except Exception as error:
                self.log_to_file("\n\n!error during inference: " + str(error))

        self.log_to_file(
            f"\n\tFinished inference in {inference_timer.duration:.4f} s")
        self.log_to_file(f"\n\tTest-accuracy: {test_accuracy}")
        return test_accuracy, inference_timer.duration

    def set_up_logging(self):
        """
        Configure logging to write INFO records to ``log.txt`` under
        ``self.CSP_paths['output']``.
        """
        logging_path = self.CSP_paths["output"] + "/log.txt"
        logging.basicConfig(filename=logging_path, level=logging.INFO)

    def log_to_file(self, text):
        """
        Log ``text`` at INFO level and print it.
        """
        logging.info(text)
        print(text)
d_train = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/train-1m-intenc.csv") d_test = pd.read_csv("https://raw.githubusercontent.com/szilard/benchm-ml--data/master/int_enc/test-1m-intenc.csv") dx_train = dd.from_pandas(d_train, npartitions=16) dx_test = dd.from_pandas(d_test, npartitions=1) X_train = dx_train.iloc[:, :-1].to_dask_array(lengths=True) y_train = dx_train.iloc[:,-1:].to_dask_array(lengths=True) X_test = dx_test.iloc[:, :-1].to_dask_array(lengths=True) y_test = dx_test.iloc[:,-1:].to_dask_array(lengths=True) X_train.persist() y_train.persist() client.has_what() dxgb_train = xgb.dask.DaskDMatrix(client, X_train, y_train) dxgb_test = xgb.dask.DaskDMatrix(client, X_test) param = {'objective':'binary:logistic', 'tree_method':'hist', 'max_depth':10, 'eta':0.1} %time md = xgb.dask.train(client, param, dxgb_train, num_boost_round = 100) y_pred = xgb.dask.predict(client, md, dxgb_test) y_pred_loc = y_pred.compute() y_test_loc = y_test.compute() print(metrics.roc_auc_score(y_test_loc, y_pred_loc))
def run(self, client: DaskClient):
    """
    Drive the adaptive algorithm until a reset is requested.

    Every iteration collects new answers, scatters ``self`` to the cluster
    and submits ``post_queries``, ``process_answers`` and ``get_queries`` to
    three shuffled workers.  Per-iteration timings are logged and
    periodically summarised into the ``alg-perf-<ident>`` Redis key.

    Parameters
    ----------
    client : DaskClient
        A client to Dask.

    Returns
    -------
    bool
        ``True`` after the loop stops because the ``"reset"`` key is set.

    Raises
    ------
    Exception
        The latest error, once the last five failed iterations are
        consecutive.  Earlier failures are only logged.

    Notes
    -----
    The Redis client is obtained from ``self.redis_client()``.  The loop
    indexes the first three entries of the shuffled worker list.
    """
    rj = self.redis_client()
    answers: List = []
    logger.info(f"Staring {self.ident}")

    def submit(fn: str, *args, allow_other_workers=True, **kwargs):
        # Only tasks pinned to a worker get the allow_other_workers flag.
        if "workers" in kwargs:
            kwargs["allow_other_workers"] = allow_other_workers
        method = getattr(type(self), fn)
        return client.submit(method, *args, **kwargs)

    update = False
    queries = np.array([])
    scores = np.array([])
    n_model_updates = 0
    rj.jsonset(f"alg-perf-{self.ident}", root, [])
    save_deadline = 0.0  # right away
    data: List[Dict[str, Any]] = []
    error_raised: List[int] = []

    for k in itertools.count():
        try:
            loop_start = time()
            datum = {"iteration": k, "ident": self.ident, "time": time()}

            answers = self.get_answers(rj, clear=True)
            datum["num_answers"] = len(answers)

            self_future = client.scatter(self)
            dispatch_start = time()

            have_work = len(queries) and len(scores)
            if have_work:
                queries_f = client.scatter(queries)
                scores_f = client.scatter(scores)
            else:
                queries_f = scores_f = []

            datum["cleared_queries"] = bool(update)
            if update:
                clear_start = time()
                self.clear_queries(rj)
                datum["time_clearing"] = time() - clear_start

            done = distributed.Event(name="pa_finished")
            done.clear()

            workers = list(client.has_what())
            random.shuffle(workers)
            post_worker, model_worker, search_worker = (
                workers[0], workers[1], workers[2])

            f_post = submit(
                "post_queries",
                self_future,
                queries_f,
                scores_f,
                done=done,
                workers=post_worker,
            )
            f_model = submit(
                "process_answers", self_future, answers, workers=model_worker,
            )
            f_search = submit(
                "get_queries", self_future, stop=done, workers=search_worker,
            )

            time_model = 0.0
            time_post = 0.0
            time_search = 0.0

            def on_model_done(_):
                nonlocal time_model
                # Signals the other two tasks that the model step finished.
                done.set()
                time_model += time() - dispatch_start

            def on_post_done(_):
                nonlocal time_post
                time_post += time() - dispatch_start

            def on_search_done(_):
                nonlocal time_search
                time_search += time() - dispatch_start

            f_model.add_done_callback(on_model_done)
            f_post.add_done_callback(on_post_done)
            f_search.add_done_callback(on_search_done)

            # Future.result raises errors automatically
            posted = f_post.result()
            new_self, update = f_model.result()
            queries, scores, search_meta = f_search.result()

            datum.update({
                "n_queries_posted": posted,
                "n_queries_scored": len(queries),
                "n_queries_in_db": rj.zcard(f"alg-{self.ident}-queries"),
                "model_updated": update,
                "n_model_updates": n_model_updates,
                "time_posting_queries": time_post,
                "time_model_update": time_model,
                "time_search": time_search,
                "time": time(),
                **search_meta,
            })

            if update:
                tick = time()
                self.__dict__.update(new_self.__dict__)
                datum["time_update"] = time() - tick
                n_model_updates += 1

            if time() > save_deadline + 1e-3:
                save_deadline = time() + 60
                tick = time()
                self.save()
                datum["time_save"] = time() - tick

            datum["time_loop"] = time() - loop_start

            data.append(datum)
            logger.info(datum)

            posting_deadline = data[0]["time"] + 2 * 60
            if time() >= posting_deadline or k == 10 or k == 20:
                flush_logger(logger)
                to_post = {}
                for field in data[-1].keys():
                    observed = [d.get(field, None) for d in data]
                    observed = [v for v in observed if v]
                    if not observed:
                        continue

                    first = observed[0]
                    if isinstance(first, (int, np.integer)):
                        cast = int
                    elif isinstance(first, (float, np.floating)):
                        cast = float
                    else:
                        continue

                    stats = {
                        f"{field}_median": np.median(observed),
                        f"{field}_mean": np.mean(observed),
                        f"{field}_min": np.min(observed),
                        f"{field}_max": np.max(observed),
                    }
                    if field == "time":
                        stats = {"time": stats["time_median"]}
                    to_post.update(
                        {name: cast(v) for name, v in stats.items()})

                try:
                    rj.jsonarrappend(f"alg-perf-{self.ident}", root, to_post)
                except ResponseError as resp_err:
                    missing_key = (
                        "could not perform this operation on a key that doesn't exist"
                        in str(resp_err))
                    if not missing_key:
                        raise resp_err
                    # I think this happens when the frontend deletes
                    # the database when /reset is triggered
                data = []

            if "reset" in rj.keys() and rj.jsonget("reset", root):
                logger.warning(f"Resetting {self.ident}")
                self.reset(client, rj, futures=[f_model, f_post, f_search])
                break

        except Exception as exc:
            logger.exception(exc)
            flush_logger(logger)
            error_raised.append(k)

            window = 5
            recent = error_raised[-window:]
            # Give up once the last `window` failures hit back-to-back
            # iterations.
            if np.diff(recent).tolist() == [1] * (window - 1):
                logger.exception(exc)
                flush_logger(logger)
                raise exc
    return True
X_dask, y_dask = \ dask_utils.persist_across_workers(c, [X_dask, y_dask], workers=workers) return X_dask, y_dask if __name__ == "__main__": ## using dask to setup cluster # This will use all GPUs on the local host by default # set this to use on node disk for caching cluster = LocalCUDACluster(threads_per_worker=1) c = Client(cluster) # Query the client for all connected workers workers = c.has_what().keys() n_workers = len(workers) n_streams = 8 # Performance optimization ## setting parameters # Data parameters train_size = 100000 test_size = 1000 n_samples = train_size + test_size n_features = 20 # Random Forest building parameters max_depth = 12 n_bins = 16 n_trees = 1000
class RapidsCloudML(object):
    """Dask-based load, split, train and score workflow.

    ``compute_type`` selects a multi-GPU (``LocalCUDACluster``) or multi-CPU
    (``LocalCluster``) backend; ``model_type`` selects XGBoost or
    RandomForest.
    """

    def __init__(self, model_type='RandomForest',
                 compute_type='multi-GPU',
                 CSP_paths=default_sagemaker_paths):
        """Store the configuration and start the matching Dask cluster.

        For compute types other than 'multi-GPU' and 'multi-CPU' no cluster
        is created and ``cluster`` / ``client`` are set to None.
        """
        self.CSP_paths = CSP_paths
        self.model_type = model_type
        self.compute_type = compute_type

        # CPU or GPU cluster
        if 'multi-GPU' in self.compute_type:
            self.n_workers = cupy.cuda.runtime.getDeviceCount()
            self.cluster = LocalCUDACluster(n_workers=self.n_workers)
            self.client = Client(self.cluster)
            print(f'dask multi-GPU cluster with {self.n_workers} workers ')

        elif 'multi-CPU' in self.compute_type:
            self.n_workers = os.cpu_count()
            self.cluster = LocalCluster(n_workers=self.n_workers,
                                        threads_per_worker=1)
            self.client = Client(self.cluster)
            print(f'dask multi-CPU cluster with {self.n_workers} workers')

        else:
            self.cluster = None
            self.client = None

    def load_data(self, filename='*.parquet', columns=None):
        """Read parquet files under ``CSP_paths['train_data']``.

        Rows with nulls are dropped and the result is repartitioned to four
        partitions per worker.

        Parameters
        ----------
        filename : string
            File name or glob appended to the training-data directory.
        columns : list of strings
            Columns to read; forwarded to ``read_parquet``.

        Returns
        -------
        dataset : dask or dask_cudf dataframe
        """
        target_filename = self.CSP_paths['train_data'] + '/' + filename
        self.log(f'\n> loading dataset from {target_filename}...\n')

        with PerfTimer(self, 'ingestion_timer'):
            if 'multi-CPU' in self.compute_type:
                dataset = dask.dataframe.read_parquet(target_filename,
                                                      columns=columns)
            elif 'multi-GPU' in self.compute_type:
                dataset = dask_cudf.read_parquet(target_filename,
                                                 columns=columns)

            dataset = dataset.dropna()
            dataset = dataset.repartition(npartitions=self.n_workers * 4)

        print(f'dataset len : {len(dataset)}')
        return dataset

    def split_data(self, dataset, y_label, train_size=.8,
                   random_state=0, shuffle=True):
        """Split ``dataset`` into float32 features and int32 labels.

        For multi-GPU compute the four results are also persisted across
        the workers and waited on.

        Parameters
        ----------
        dataset : dataframe
            Data including the label column.
        y_label : string
            Name of the label column.
        train_size : float
            Fraction of rows used for training.
        random_state : int
            Seed forwarded to ``train_test_split``.
        shuffle : bool
            Accepted for interface compatibility but not forwarded.

        Returns
        -------
        X_train, X_test, y_train, y_test
        """
        with PerfTimer(self, 'split_timer'):
            # train_size was previously accepted but never forwarded, so the
            # splitter's own default ratio was silently used instead.
            train, test = train_test_split(
                dataset, train_size=train_size, random_state=random_state
            )  # unable to shuffle -- no dask_cudf sampler implemented

            X_train = train.drop(y_label, axis=1).astype('float32')
            y_train = train[y_label].astype('int32')
            X_test = test.drop(y_label, axis=1).astype('float32')
            y_test = test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])

        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, y_train, model_params):
        """Train the configured model.

        Parameters
        ----------
        X_train, y_train : dask collections
            Training features and labels.
        model_params : dict
            Hyper-parameters.  XGBoost requires 'num_boost_round';
            RandomForest reads 'n_estimators', 'max_depth' and
            'max_features'.  The dict is not modified.

        Returns
        -------
        The XGBoost booster, the fitted RandomForest estimator, or None when
        ``model_type`` matches neither.
        """
        with PerfTimer(self, 'train_timer'):
            if 'XGBoost' in self.model_type:
                dtrain = xgboost.dask.DaskDMatrix(self.client,
                                                  X_train, y_train)

                # Work on a copy: popping from the caller's dict made a
                # second call with the same params raise KeyError.
                xgb_params = dict(model_params)
                # avoids warning messages
                boosting_rounds = xgb_params.pop('num_boost_round')

                trained_model = xgboost.dask.train(
                    self.client, xgb_params, dtrain,
                    num_boost_round=boosting_rounds)

                return trained_model['booster']

            elif 'RandomForest' in self.model_type:
                if 'GPU' in self.compute_type:
                    from cuml.dask.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_bins=32)
                else:
                    from sklearn.ensemble import RandomForestClassifier
                    rf_model = RandomForestClassifier(
                        n_estimators=model_params['n_estimators'],
                        max_depth=model_params['max_depth'],
                        max_features=model_params['max_features'],
                        n_jobs=-1)

                trained_model = rf_model.fit(X_train, y_train)
                return trained_model

        # Reached only for an unrecognised model_type.
        print(len(X_train))
        return None

    def evaluate_test_perf(self, trained_model, X_test, y_test,
                           threshold=0.5):
        """Score ``trained_model`` on the test split.

        XGBoost predictions are thresholded into 0/1 labels at
        ``threshold``.  Accuracy is computed with the cuml metric for GPU
        compute types and the sklearn metric for CPU compute types.

        Returns
        -------
        test_accuracy : float
        """
        with PerfTimer(self, 'score_timer'):
            if 'XGBoost' in self.model_type:
                dtest = xgboost.dask.DaskDMatrix(self.client, X_test, y_test)
                predictions = xgboost.dask.predict(self.client,
                                                   trained_model,
                                                   dtest).compute()
                predictions = np.where(
                    predictions >= threshold, 1,
                    0)  # threshold returned probabilities into 0/1 labels

            elif 'RandomForest' in self.model_type:
                predictions = trained_model.predict(X_test)
                if 'multi-CPU' not in self.compute_type:
                    predictions = predictions.compute()

            if 'multi' in self.compute_type:
                y_test = y_test.compute()

            if 'GPU' in self.compute_type:
                test_accuracy = cuml_accuracy_score(y_test, predictions)
            elif 'CPU' in self.compute_type:
                test_accuracy = sklearn_accuracy_score(y_test, predictions)

        # accumulate internal list
        return test_accuracy

    # emit score so sagemaker can parse it (using string REGEX)
    def emit_score(self, test_accuracy):
        """Log the accuracy in the ``test-accuracy: <value>;`` format."""
        self.log(f'\n\t test-accuracy: {test_accuracy}; \n')

    def save_best_model(self, global_best_model=None):
        """Placeholder; currently does nothing."""
        pass

    def set_up_logging(self):
        """Send INFO logs to ``log.txt`` under ``CSP_paths['output']``."""
        logging_path = self.CSP_paths['output'] + '/log.txt'
        logging.basicConfig(filename=logging_path, level=logging.INFO)

    def log(self, text):
        """Log ``text`` at INFO level and print it."""
        logging.info(text)
        print(text)
def test_end_to_end():
    """Compare distributed cuML nearest neighbours against scikit-learn on
    random data generated directly on each worker."""
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = None

    try:
        client = Client(cluster)

        # NOTE: The LocalCUDACluster needs to be started before any imports
        # that could potentially create a CUDA context.

        import dask_cudf

        import cudf
        import numpy as np

        from dask_cuml.neighbors import NearestNeighbors as cumlKNN

        def create_df(f, m, n):
            # Random m x n frame whose index starts at f * m, so the frames
            # created for different f values have disjoint indices.
            X = np.random.rand(m, n)
            ret = cudf.DataFrame(
                [(i, X[:, i].astype(np.float32)) for i in range(n)],
                index=cudf.dataframe.RangeIndex(f * m, f * m + m, 1))
            return ret

        def get_meta(df):
            # Empty frame with the same columns, used as dask metadata.
            ret = df.iloc[:0]
            return ret

        # Per gpu/worker
        train_m = 500
        train_n = 25

        search_m = 10
        search_k = 15

        workers = client.has_what().keys()

        # Create dfs on each worker (gpu)
        dfs = [
            client.submit(create_df, n, train_m, train_n, workers=[worker])
            for worker, n in list(zip(workers, list(range(len(workers)))))
        ]

        # Wait for completion
        wait(dfs)

        meta = client.submit(get_meta, dfs[0]).result()

        X_df = dask_cudf.from_delayed(dfs, meta=meta)
        X_pd = X_df.compute().to_pandas()

        cumlNN = cumlKNN()
        cumlNN.fit(X_df)

        sklNN = NearestNeighbors(metric="sqeuclidean")
        sklNN.fit(X_pd)

        # NOTE(review): the two slices use different end points
        # (search_m - 1 vs search_m); presumably the dask slice is
        # label-based and inclusive while the pandas one is positional —
        # confirm.
        cuml_D, cuml_I = cumlNN.kneighbors(X_df[0:search_m - 1], search_k)
        sk_D, sk_I = sklNN.kneighbors(X_pd[0:search_m], search_k)

        cuml_I_nd = np.array(cuml_I.compute().as_gpu_matrix(),
                             dtype=sk_I.dtype)
        cuml_D_nd = np.array(cuml_D.compute().as_gpu_matrix(),
                             dtype=sk_D.dtype)

        print(str(cuml_D_nd.dtype))
        print(str(sk_D.dtype))

        assert np.array_equal(cuml_I_nd, sk_I)
        assert np.allclose(cuml_D_nd, sk_D, atol=1e-5)
    finally:
        # Release the client and the cluster even when an assertion fails;
        # previously the client was never closed and the cluster only on
        # success.
        if client is not None:
            client.close()
        cluster.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, cluster):
    """Fit distributed KMeans on blobs supplied as either dask_cudf
    dataframes or dask arrays and expect a perfect adjusted Rand score."""
    client = None

    try:
        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)
        cumlLabels = cumlModel.predict(X_train, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            parts_len = n_parts
        else:
            parts_len = n_workers

        if input_type == "dataframe":
            assert cumlLabels.npartitions == parts_len
            cumlPred = cp.array(cumlLabels.compute().to_pandas().values)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            assert len(cumlLabels.chunks[0]) == parts_len
            cumlPred = cp.array(cumlLabels.compute())
            labels = cp.squeeze(y_train.compute())

        assert cumlPred.shape[0] == nrows
        assert cp.max(cumlPred) == nclusters - 1
        assert cp.min(cumlPred) == 0

        score = adjusted_rand_score(labels, cumlPred)

        print(str(score))

        assert 1.0 == score

    finally:
        # client stays None when Client(cluster) itself raised; closing it
        # unconditionally would mask that error with an AttributeError.
        if client is not None:
            client.close()