def _query_models(self, n_neighbors, comms, nn_models, index_futures,
                  query_futures):
    """
    Submit one kneighbors task per worker and collect the result futures.

    Parameters
    ----------
    n_neighbors : forwarded unchanged to every worker task
    comms : object exposing ``worker_addresses`` and ``worker_info``;
        one task is submitted per address
    nn_models : indexable by worker address; the entry for a worker is
        passed to that worker's task
    index_futures : index partitions, in the form accepted by
        ``workers_to_parts`` and ``parts_to_ranks``
    query_futures : query partitions, in the same form

    Returns
    -------
    nn_fit : dict
        Maps each worker's rank to the future of its kneighbors task
    out_d_futures, out_i_futures
        Outputs of ``flatten_grouped_results`` using ``_func_get_d`` and
        ``_func_get_i`` respectively (presumably distances and indices)
    """
    worker_info = comms.worker_info(comms.worker_addresses)

    index_worker_to_parts = workers_to_parts(index_futures)
    query_worker_to_parts = workers_to_parts(query_futures)

    # Build inputs and outputs
    idx_parts_to_ranks, idx_M = parts_to_ranks(
        self.client, worker_info, index_futures)
    query_parts_to_ranks, query_M = parts_to_ranks(
        self.client, worker_info, query_futures)

    # Invoke kneighbors on Dask workers to perform distributed query
    key = uuid1()
    nn_fit = {}
    for idx, worker in enumerate(comms.worker_addresses):
        rank = worker_info[worker]["r"]

        # A worker may hold no index and/or no query partitions; it still
        # receives a task, with an empty partition list for that role.
        if worker in index_worker_to_parts:
            local_index_parts = index_worker_to_parts[worker]
        else:
            local_index_parts = []

        if worker in query_worker_to_parts:
            local_query_parts = query_worker_to_parts[worker]
        else:
            local_query_parts = []

        nn_fit[rank] = self.client.submit(
            NearestNeighbors._func_kneighbors,
            nn_models[worker],
            local_index_parts,
            idx_M,
            self.n_cols,
            idx_parts_to_ranks,
            local_query_parts,
            query_M,
            query_parts_to_ranks,
            rank,
            n_neighbors,
            key="%s-%s" % (key, idx),
            workers=[worker])

    pending = list(nn_fit.values())
    wait(pending)
    raise_exception_from_futures(pending)

    # Gather resulting partitions and return dask_cudfs
    out_d_futures = flatten_grouped_results(
        self.client, query_parts_to_ranks, nn_fit, getter_func=_func_get_d)
    out_i_futures = flatten_grouped_results(
        self.client, query_parts_to_ranks, nn_fit, getter_func=_func_get_i)

    return nn_fit, out_d_futures, out_i_futures
def _parallel_func(self, X, func):
    """
    Run ``func`` with the local model on each worker's partitions of X.

    One task is submitted per worker; it is pinned to that worker and
    receives ``self.local_model`` together with the partitions of X that
    the worker holds.

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Input data whose partitions are processed
    func : callable
        Invoked on the workers as ``func(local_model, partitions)``

    Returns
    -------
    result : the value of ``to_dask_cudf`` applied to the per-worker
        futures (a dask_cudf.Dataframe according to the helper's name)
    """
    task_key = uuid1()

    partition_futures = self.client.sync(extract_ddf_partitions, X)
    parts_by_worker = workers_to_parts(partition_futures)

    submitted = []
    for idx, (worker, parts) in enumerate(parts_by_worker.items()):
        submitted.append(
            self.client.submit(func,
                               self.local_model,
                               parts,
                               workers=[worker],
                               key="%s-%s" % (task_key, idx)))

    self.raise_exception_from_futures(submitted)

    return to_dask_cudf(submitted)
def score(self, X):
    """
    Score X with the fitted model by summing the per-worker scores.

    One ``KMeans._func_score`` task is submitted per worker holding
    partitions of X, using ``self.local_model``.

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Data to score

    Returns
    -------
    score : the sum of the values returned by the per-worker tasks
    """
    key = uuid1()
    gpu_futures = self.client.sync(extract_ddf_partitions, X)
    worker_to_parts = workers_to_parts(gpu_futures)

    # Submit every task before blocking on any result; calling .result()
    # inside the submitting loop would force the workers to run one after
    # another instead of concurrently.
    score_futures = [
        self.client.submit(KMeans._func_score,
                           self.local_model,
                           parts,
                           workers=[worker],
                           key="%s-%s" % (key, idx))
        for idx, (worker, parts) in enumerate(worker_to_parts.items())
    ]
    scores = [future.result() for future in score_futures]

    return np.sum(np.array(scores))
def _inverse_transform(self, X):
    """
    Apply the inverse PCA transform to X on the workers.

    Reads ``self.rnks`` (worker -> rank) and ``self.pca_models``
    (sequence of ``(worker, model future)`` pairs), so both must already
    be populated on this instance.

    Parameters
    ----------
    X : dask cuDF input

    Returns
    -------
    result : the value of ``to_dask_cudf`` applied to one future per
        input partition, in the order of the input partitions
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)
    worker_to_parts = workers_to_parts(gpu_futures)

    key = uuid1()
    # Submit all size queries first and only then block on the results,
    # so the queries run concurrently rather than one at a time.
    size_futures = [
        self.client.submit(PCA._func_get_size,
                           wf[1],
                           workers=[wf[0]],
                           key="%s-%s" % (key, idx))
        for idx, wf in enumerate(gpu_futures)
    ]
    # One (rank, size) pair per input partition, in input order.
    partsToRanks = [
        (self.rnks[wf[0]], size_future.result())
        for wf, size_future in zip(gpu_futures, size_futures)
    ]

    N = X.shape[1]
    M = sum(size for _, size in partsToRanks)

    key = uuid1()
    pca_inverse_transform = {}
    for idx, (worker, model) in enumerate(self.pca_models):
        rank = self.rnks[worker]
        pca_inverse_transform[rank] = self.client.submit(
            PCA._func_inverse_transform,
            model,
            worker_to_parts[worker],
            M, N,
            partsToRanks,
            rank,
            key="%s-%s" % (key, idx),
            workers=[worker])

    transform_futures = list(pca_inverse_transform.values())
    wait(transform_futures)
    raise_exception_from_futures(transform_futures)

    # Each rank's task yields several outputs; walk the partitions in
    # input order and pick the next unused local index for their rank.
    out_futures = []
    next_local_idx = {}
    for rank, _size in partsToRanks:
        local_idx = next_local_idx.get(rank, 0)
        out_futures.append(
            self.client.submit(PCA._func_get_idx,
                               pca_inverse_transform[rank],
                               local_idx))
        next_local_idx[rank] = local_idx + 1

    return to_dask_cudf(out_futures)
def fit(self, X):
    """
    Fit a multi-node multi-GPU KMeans model

    A communicator is created across the workers that hold partitions of
    X, one ``KMeans._func_fit`` task is run per worker, and the model
    returned by the first task is stored as ``self.local_model``.

    Parameters
    ----------
    X : dask_cudf.Dataframe

    Returns
    -------
    self: KMeans model

    Raises
    ------
    Whatever ``self.raise_exception_from_futures`` raises when a worker
    task fails; the communicator is destroyed before it propagates.
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)
    worker_to_parts = workers_to_parts(gpu_futures)
    workers = [worker for worker, _ in worker_to_parts.items()]

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    try:
        key = uuid1()
        kmeans_fit = [
            self.client.submit(KMeans._func_fit,
                               comms.sessionId,
                               parts,
                               **self.kwargs,
                               workers=[worker],
                               key="%s-%s" % (key, idx))
            for idx, (worker, parts) in enumerate(worker_to_parts.items())
        ]

        wait(kmeans_fit)
        self.raise_exception_from_futures(kmeans_fit)
    finally:
        # Tear the communicator down even when a worker task failed, so
        # an error during fitting does not leak it.
        comms.destroy()

    self.local_model = kmeans_fit[0].result()
    self.cluster_centers_ = self.local_model.cluster_centers_

    return self
def fit(self, X, y, convert_dtype=False):
    """
    Train the distributed Random Forest classifier on X and y.

    IMPORTANT: every Dask worker used by the forest (self.workers) must
    hold at least one partition of X, and y must live on the same
    workers; otherwise a ValueError is raised.

    If a worker has multiple data partitions, they will be concatenated
    before fitting, which will lead to additional memory usage. To
    minimize memory consumption, ensure that each worker has exactly one
    partition.

    When persisting data, cuml.dask.common.utils.persist_across_workers
    can be used to place the partitions::

        X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
        y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
        X_dask_cudf, y_dask_cudf = persist_across_workers(
            dask_client, [X_dask_cudf, y_dask_cudf])

    (this is equivalent to calling `persist` on the client with the data
    and the workers).

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, n_features).
        Features of training examples.
    y : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, 1)
        Labels of training examples.
        **y must be partitioned the same way as X**
    convert_dtype : bool, optional (default = False)
        When set to True, the fit method will, when necessary, convert
        y to be the same data type as X if they differ. This will
        increase memory used for the method.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the workers holding X or y differ from self.workers
    """
    c = default_client()

    self.num_classes = len(y.unique())

    X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
    y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

    X_partition_workers = list(X_futures.keys())
    y_partition_workers = list(y_futures.keys())

    expected_workers = set(self.workers)
    partitions_match = (set(X_partition_workers) == expected_workers
                        and set(y_partition_workers) == expected_workers)
    if not partitions_match:
        raise ValueError(
            """ X is not partitioned on the same workers expected by RF\n X workers: %s\n y workers: %s\n RF workers: %s """
            % (str(X_partition_workers),
               str(y_partition_workers),
               str(self.workers)))

    # One training task per worker, pinned to the worker that owns the
    # partitions; random.random() is drawn once per task, in order.
    futures = [
        c.submit(
            RandomForestClassifier._fit,
            self.rfs[w],
            xc,
            y_futures[w],
            convert_dtype,
            random.random(),
            workers=[w],
        )
        for w, xc in X_futures.items()
    ]

    wait(futures)
    raise_exception_from_futures(futures)

    return self
def fit(self, X, y):
    """
    Train the distributed Random Forest regression model on X and y.

    IMPORTANT: every Dask worker used by the forest (self.workers) must
    hold at least one partition of X, and y must live on the same
    workers; otherwise a ValueError is raised.

    When persisting data, cuml.dask.common.utils.persist_across_workers
    can be used to place the partitions::

        X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
        y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
        X_dask_cudf, y_dask_cudf = persist_across_workers(
            dask_client, [X_dask_cudf, y_dask_cudf])

    (this is equivalent to calling `persist` on the client with the data
    and the workers).

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, n_features).
        Features of training examples.
    y : dask_cudf.Dataframe
        Dense matrix (floats or doubles) of shape (n_samples, 1)
        Labels of training examples.
        y must be partitioned the same way as X

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the workers holding X or y differ from self.workers
    """
    c = default_client()

    X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
    y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

    X_partition_workers = list(X_futures.keys())
    y_partition_workers = list(y_futures.keys())

    expected_workers = set(self.workers)
    partitions_match = (set(X_partition_workers) == expected_workers
                        and set(y_partition_workers) == expected_workers)
    if not partitions_match:
        raise ValueError(
            """ X is not partitioned on the same workers expected by RF\n X workers: %s\n y workers: %s\n RF workers: %s """
            % (str(X_partition_workers),
               str(y_partition_workers),
               str(self.workers)))

    # One training task per worker, pinned to the worker that owns the
    # partitions; random.random() is drawn once per task, in order.
    futures = [
        c.submit(
            RandomForestRegressor._fit,
            self.rfs[w],
            xc,
            y_futures[w],
            random.random(),
            workers=[w],
        )
        for w, xc in X_futures.items()
    ]

    wait(futures)
    raise_exception_from_futures(futures)

    return self
def fit(self, X, _transform=False):
    """
    Fit the model with X.

    A communicator is created for the workers holding X, one model is
    created per worker, and ``PCA._func_fit`` is run on each of them.
    The fitted attributes are then copied from the first worker's model
    onto this instance, and ``self.rnks`` / ``self.pca_models`` are
    stored on the instance.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool, optional (default = False)
        Forwarded to every ``PCA._func_fit`` task. When True, the
        per-partition outputs of those tasks are also collected and
        returned.

    Returns
    -------
    The value of ``to_dask_cudf`` applied to one future per input
    partition when ``_transform`` is True; nothing otherwise.

    Raises
    ------
    Whatever ``raise_exception_from_futures`` raises when a worker task
    fails; the communicator is destroyed before it propagates.
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)
    worker_to_parts = workers_to_parts(gpu_futures)

    # NOTE(review): one entry per element of gpu_futures, so a worker that
    # appears several times there is listed several times here - confirm
    # CommsContext.init tolerates duplicate addresses.
    workers = [wf[0] for wf in gpu_futures]

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    try:
        worker_info = comms.worker_info(comms.worker_addresses)

        self.rnks = {w: worker_info[w]["r"] for w in workers}

        key = uuid1()
        # Submit all size queries first and only then block on the
        # results, so the queries run concurrently.
        size_futures = [
            self.client.submit(PCA._func_get_size,
                               wf[1],
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx))
            for idx, wf in enumerate(gpu_futures)
        ]
        # One (rank, size) pair per input partition, in input order.
        partsToRanks = [
            (worker_info[wf[0]]["r"], size_future.result())
            for wf, size_future in zip(gpu_futures, size_futures)
        ]

        N = X.shape[1]
        M = sum(size for _, size in partsToRanks)

        key = uuid1()
        self.pca_models = [
            (worker, self.client.submit(PCA._func_create_model,
                                        comms.sessionId,
                                        parts,
                                        **self.kwargs,
                                        workers=[worker],
                                        key="%s-%s" % (key, idx)))
            for idx, (worker, parts) in enumerate(worker_to_parts.items())
        ]

        key = uuid1()
        pca_fit = {}
        for idx, (worker, model) in enumerate(self.pca_models):
            rank = worker_info[worker]["r"]
            pca_fit[rank] = self.client.submit(
                PCA._func_fit,
                model,
                M, N,
                partsToRanks,
                rank,
                _transform,
                key="%s-%s" % (key, idx),
                workers=[worker])

        fit_futures = list(pca_fit.values())
        wait(fit_futures)
        raise_exception_from_futures(fit_futures)
    finally:
        # Tear the communicator down even when a worker task failed, so
        # an error during fitting does not leak it.
        comms.destroy()

    self.local_model = self.client.submit(PCA._func_get_first,
                                          self.pca_models[0][1]).result()

    self.components_ = self.local_model.components_
    self.explained_variance_ = self.local_model.explained_variance_
    self.explained_variance_ratio_ = \
        self.local_model.explained_variance_ratio_
    self.singular_values_ = self.local_model.singular_values_
    # NOTE(review): unlike the attributes above this name has no trailing
    # underscore; kept as-is because callers may read it - confirm intent.
    self.noise_variance = self.local_model.noise_variance_

    if _transform:
        # Each rank's fit task yields several outputs; walk the
        # partitions in input order and pick the next unused local index
        # for their rank.
        out_futures = []
        next_local_idx = {}
        for rank, _size in partsToRanks:
            local_idx = next_local_idx.get(rank, 0)
            out_futures.append(
                self.client.submit(PCA._func_get_idx,
                                   pca_fit[rank],
                                   local_idx))
            next_local_idx[rank] = local_idx + 1

        return to_dask_cudf(out_futures)