def fit(self, X):
    """
    Fit a multi-node multi-GPU KMeans model

    Parameters
    ----------
    X : Dask cuDF DataFrame or CuPy backed Dask Array
        Training data to cluster.

    Returns
    -------
    self
        The estimator, after its internal model has been set from the
        first submitted fit future.
    """
    data = DistributedDataHandler.create(X, client=self.client)
    self.datatype = data.datatype

    comms = Comms(comms_p2p=False)
    comms.init(workers=data.workers)

    try:
        # One fit task per entry of worker_to_parts, each restricted to
        # the worker that entry is keyed by.
        kmeans_fit = [
            self.client.submit(
                KMeans._func_fit,
                comms.sessionId,
                parts,
                self.datatype,
                **self.kwargs,
                workers=[worker],
                pure=False
            )
            for worker, parts in data.worker_to_parts.items()
        ]

        wait_and_raise_from_futures(kmeans_fit)
    finally:
        # Always release the communicator, including when a fit task
        # raised; otherwise the session would be left initialized.
        comms.destroy()

    self._set_internal_model(kmeans_fit[0])

    return self
def _fit(self, model_func, data):
    """
    Create one model per worker and fit them on the distributed data.

    Parameters
    ----------
    model_func : callable
        Submitted once per worker with the comms session id, the
        detected datatype and ``self.kwargs``; its future is the
        per-worker model.
    data : sequence
        Inputs passed to ``DistributedDataHandler.create``. The number
        of columns is read from ``data[0].shape[1]``.

    Returns
    -------
    dict
        Maps each worker's rank to the future of the model created on
        that worker.
    """
    n_cols = data[0].shape[1]

    data = DistributedDataHandler.create(data=data, client=self.client)
    self.datatype = data.datatype

    comms = Comms(comms_p2p=False)
    comms.init(workers=data.workers)

    try:
        data.calculate_parts_to_sizes(comms)
        self.ranks = data.ranks

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(
            self.client, worker_info, data.gpu_futures
        )

        # rank -> future of the model built on that worker
        lin_models = {
            data.worker_info[worker]["rank"]: self.client.submit(
                model_func,
                comms.sessionId,
                self.datatype,
                **self.kwargs,
                pure=False,
                workers=[worker]
            )
            for worker, _parts in data.worker_to_parts.items()
        }

        # worker -> future of the fit call run against its local parts
        lin_fit = {
            worker: self.client.submit(
                _func_fit,
                lin_models[data.worker_info[worker]["rank"]],
                parts,
                data.total_rows,
                n_cols,
                parts_to_sizes,
                data.worker_info[worker]["rank"],
                pure=False,
                workers=[worker]
            )
            for worker, parts in data.worker_to_parts.items()
        }

        wait_and_raise_from_futures(list(lin_fit.values()))
    finally:
        # Always release the communicator, including when setup or a
        # fit task raised; otherwise the session would be left open.
        comms.destroy()

    return lin_models
def fit(self, X, sample_weight=None):
    """
    Fit a multi-node multi-GPU KMeans model

    Parameters
    ----------
    X : Dask cuDF DataFrame or CuPy backed Dask Array
        Training data to cluster.

    sample_weight : Dask cuDF DataFrame or CuPy backed Dask Array, \
            shape = (n_samples,), default=None
        The weights for each observation in X. If None, all
        observations are assigned equal weight.
        Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
        ndarray, cuda array interface compliant array like CuPy

    Returns
    -------
    self
        The estimator, after its internal model has been set from the
        first submitted fit future.
    """
    sample_weight = self._check_normalize_sample_weight(sample_weight)

    # X alone, or an (X, sample_weight) tuple when weights are supplied.
    inputs = X if sample_weight is None else (X, sample_weight)

    data = DistributedDataHandler.create(inputs, client=self.client)
    self.datatype = data.datatype

    # This needs to happen on the scheduler
    comms = Comms(comms_p2p=False, client=self.client)
    comms.init(workers=data.workers)

    try:
        # One fit task per entry of worker_to_parts, each restricted to
        # the worker that entry is keyed by.
        kmeans_fit = [
            self.client.submit(
                KMeans._func_fit,
                comms.sessionId,
                parts,
                self.datatype,
                data.multiple,
                **self.kwargs,
                workers=[worker],
                pure=False
            )
            for worker, parts in data.worker_to_parts.items()
        ]

        wait_and_raise_from_futures(kmeans_fit)
    finally:
        # Always release the communicator, including when a fit task
        # raised; otherwise the session would be left initialized.
        comms.destroy()

    self._set_internal_model(kmeans_fit[0])

    return self
def fit(self, X, out_dtype="int32"):
    """
    Fit a multi-node multi-GPU DBSCAN model

    Parameters
    ----------
    X : array-like (device or host)
        Dense matrix containing floats or doubles.
        Acceptable formats: CUDA array interface compliant objects like
        CuPy, cuDF DataFrame/Series, NumPy ndarray and Pandas
        DataFrame/Series.
    out_dtype: dtype
        Determines the precision of the output labels array.
        default: "int32". Valid values are
        { "int32", np.int32, "int64", np.int64}.

    Returns
    -------
    self
        The estimator, after its internal model has been set from the
        first submitted fit future.

    Raises
    ------
    ValueError
        If ``out_dtype`` is not one of the accepted values.
    """
    if out_dtype not in ["int32", np.int32, "int64", np.int64]:
        raise ValueError("Invalid value for out_dtype. "
                         "Valid values are {'int32', 'int64', "
                         "np.int32, np.int64}")

    # The same scattered input is handed to every fit task below.
    data = self.client.scatter(X, broadcast=True)

    comms = Comms(comms_p2p=True)
    comms.init()

    try:
        dbscan_fit = [
            self.client.submit(
                DBSCAN._func_fit(out_dtype),
                comms.sessionId,
                data,
                self.verbose,
                **self.kwargs,
                workers=[worker],
                pure=False
            )
            for worker in comms.worker_addresses
        ]

        wait_and_raise_from_futures(dbscan_fit)
    finally:
        # Always release the communicator, including when a fit task
        # raised; otherwise the session would be left initialized.
        comms.destroy()

    self._set_internal_model(dbscan_fit[0])

    return self
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool, default=False
        Forwarded to the per-worker fit call. When true, the fit
        results are flattened and returned through ``to_output``
        instead of returning ``self``.

    Returns
    -------
    self, or the value of ``to_output`` when ``_transform`` is true.
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    # Point-to-point comms are requested only for the "tsqr" solver.
    use_p2p = self.kwargs.get("svd_solver") == "tsqr"
    comms = Comms(comms_p2p=bool(use_p2p))
    comms.init(workers=data.workers)

    try:
        data.calculate_parts_to_sizes(comms)

        worker_info = comms.worker_info(comms.worker_addresses)
        parts_to_sizes, _ = parts_to_ranks(
            self.client, worker_info, data.gpu_futures
        )

        total_rows = data.total_rows

        # rank -> future of the model created on that worker
        models = {
            data.worker_info[worker]["rank"]: self.client.submit(
                self._create_model,
                comms.sessionId,
                self._model_func,
                self.datatype,
                **self.kwargs,
                pure=False,
                workers=[worker]
            )
            for worker, _parts in data.worker_to_parts.items()
        }

        # worker -> future of the fit call run against its local parts
        pca_fit = {
            worker: self.client.submit(
                DecompositionSyncFitMixin._func_fit,
                models[data.worker_info[worker]["rank"]],
                parts,
                total_rows,
                n_cols,
                parts_to_sizes,
                data.worker_info[worker]["rank"],
                _transform,
                pure=False,
                workers=[worker]
            )
            for worker, parts in data.worker_to_parts.items()
        }

        fit_futures = list(pca_fit.values())
        wait(fit_futures)
        raise_exception_from_futures(fit_futures)
    finally:
        # Always release the communicator, including when setup or a
        # fit task raised; otherwise the session would be left open.
        comms.destroy()

    self._set_internal_model(list(models.values())[0])

    if _transform:
        out_futures = flatten_grouped_results(
            self.client, data.gpu_futures, pca_fit
        )
        return to_output(out_futures, self.datatype)

    return self