def predict_proba(self, X, convert_dtype=True):
    """
    Provide class probabilities for a query set by performing a
    distributed k-nearest-neighbors search against the stored index.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array
    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will automatically
        convert the data to the right formats.

    Returns
    -------
    probabilities : tuple of Dask futures or Dask CuPy Arrays
        One probability array per output column (one per entry of
        ``self.n_unique``).
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsClassifier._build_comms(self.data_handler,
                                              query_handler,
                                              self.streams_per_handle)
    worker_info = comms.worker_info(comms.worker_addresses)

    # Build inputs and outputs: per-worker partition sizes and the
    # (rank -> partitions) mappings needed by the distributed query.
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    # Each Dask worker creates a single model.
    key = uuid1()
    models = {
        worker: self.client.submit(self._func_create_model,
                                   comms.sessionId,
                                   **self.kwargs,
                                   workers=[worker],
                                   key="%s-%s" % (key, idx))
        for idx, worker in enumerate(comms.worker_addresses)
    }

    # Invoke knn_classify on Dask workers to perform distributed query.
    # Workers holding no partition of a dataset pass an empty list.
    key = uuid1()
    knn_prob_res = {
        worker_info[worker]["rank"]: self.client.submit(
            self._func_predict,
            models[worker],
            self.data_handler.worker_to_parts[worker]
            if worker in self.data_handler.workers else [],
            data_parts_to_ranks,
            data_nrows,
            query_handler.worker_to_parts[worker]
            if worker in query_handler.workers else [],
            query_parts_to_ranks,
            query_nrows,
            self.uniq_labels,
            self.n_unique,
            X.shape[1],
            worker_info[worker]["rank"],
            convert_dtype,
            True,  # NOTE(review): positional flag to _func_predict,
                   # presumably "return probabilities" — confirm there
            key="%s-%s" % (key, idx),
            workers=[worker])
        for idx, worker in enumerate(comms.worker_addresses)
    }

    wait_and_raise_from_futures(list(knn_prob_res.values()))

    n_outputs = len(self.n_unique)

    # Gather resulting partitions and return one array per output.
    outputs = []
    for o in range(n_outputs):
        futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_prob_res,
                                          getter_func=_custom_getter(o))
        outputs.append(to_output(futures, self.datatype))

    comms.destroy()

    return tuple(outputs)
def kneighbors(self, X=None, n_neighbors=None, return_distance=True,
               _return_futures=False):
    """
    Query the distributed nearest neighbors index

    Parameters
    ----------
    X : dask_cudf.Dataframe
        Vectors to query. If not provided, neighbors of each indexed
        point are returned.
    n_neighbors : int
        Number of neighbors to query for each row in X. If not
        provided, the n_neighbors on the model are used.
    return_distance : boolean (default=True)
        If false, only indices are returned

    Returns
    -------
    ret : tuple (dask_cudf.DataFrame, dask_cudf.DataFrame)
        First dask-cuDF DataFrame contains distances, second contains
        the indices.
    """
    n_neighbors = self.get_neighbors(n_neighbors)

    # When X is omitted, query the index against itself.
    query_handler = self.X_handler if X is None else \
        DistributedDataHandler.create(data=X, client=self.client)

    if query_handler is None:
        raise ValueError("Model needs to be trained using fit() "
                         "before calling kneighbors()")

    # Create communicator clique
    comms = NearestNeighbors._build_comms(self.X_handler, query_handler,
                                          self.streams_per_handle)

    # Initialize models on workers
    nn_models = self._create_models(comms)

    # Perform model query
    nn_fit, out_d_futures, out_i_futures = \
        self._query_models(n_neighbors, comms, nn_models,
                           self.X_handler, query_handler)

    comms.destroy()

    if _return_futures:
        # BUGFIX: the if-branch must be parenthesized — without the
        # parentheses the tuple comma binds tighter than the conditional
        # expression, so with return_distance=True callers received
        # (nn_fit, (nn_fit, out_d_futures, out_i_futures)) instead of
        # the intended 3-tuple.
        ret = (nn_fit, out_i_futures) if not return_distance else \
            (nn_fit, out_d_futures, out_i_futures)
    else:
        ret = to_output(out_i_futures, self.datatype) \
            if not return_distance else (to_output(out_d_futures,
                                                   self.datatype),
                                         to_output(out_i_futures,
                                                   self.datatype))

    return ret
def predict(self, X, convert_dtype=True):
    """
    Predict outputs for a query from previously stored index
    and outputs.
    The process is done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query data.
        Acceptable formats: dask cuDF, dask CuPy/NumPy/Numba Array
    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will automatically
        convert the data to the right formats.

    Returns
    -------
    predictions : Dask futures or Dask CuPy Arrays
    """
    query_handler = \
        DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = query_handler.datatype

    comms = KNeighborsRegressor._build_comms(self.data_handler,
                                             query_handler,
                                             self.streams_per_handle)
    worker_info = comms.worker_info(comms.worker_addresses)

    # Build inputs and outputs: per-worker partition sizes and the
    # (rank -> partitions) mappings needed by the distributed query.
    self.data_handler.calculate_parts_to_sizes(comms=comms)
    query_handler.calculate_parts_to_sizes(comms=comms)

    data_parts_to_ranks, data_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       self.data_handler.gpu_futures)

    query_parts_to_ranks, query_nrows = \
        parts_to_ranks(self.client,
                       worker_info,
                       query_handler.gpu_futures)

    # Each Dask worker creates a single model.
    key = uuid1()
    models = {
        worker: self.client.submit(self._func_create_model,
                                   comms.sessionId,
                                   **self.kwargs,
                                   workers=[worker],
                                   key="%s-%s" % (key, idx))
        for idx, worker in enumerate(comms.worker_addresses)
    }

    # Invoke knn_regress on Dask workers to perform distributed query.
    # Workers holding no partition of a dataset pass an empty list.
    key = uuid1()
    knn_reg_res = {
        worker_info[worker]["rank"]: self.client.submit(
            self._func_predict,
            models[worker],
            self.data_handler.worker_to_parts[worker]
            if worker in self.data_handler.workers else [],
            data_parts_to_ranks,
            data_nrows,
            query_handler.worker_to_parts[worker]
            if worker in query_handler.workers else [],
            query_parts_to_ranks,
            query_nrows,
            X.shape[1],
            self.n_outputs,
            worker_info[worker]["rank"],
            convert_dtype,
            key="%s-%s" % (key, idx),
            workers=[worker])
        for idx, worker in enumerate(comms.worker_addresses)
    }

    wait_and_raise_from_futures(list(knn_reg_res.values()))

    # Gather resulting partitions and return result.
    out_futures = flatten_grouped_results(self.client,
                                          query_parts_to_ranks,
                                          knn_reg_res)

    comms.destroy()

    return to_output(out_futures, self.datatype).squeeze()
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool, optional (default = False)
        When True, also return the transformed input partitions.

    Returns
    -------
    self, or the transformed output when ``_transform`` is True.
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    # The tsqr solver requires point-to-point communication between
    # workers; the other solvers only need collectives.
    if "svd_solver" in self.kwargs \
            and self.kwargs["svd_solver"] == "tsqr":
        comms = CommsContext(comms_p2p=True)
    else:
        comms = CommsContext(comms_p2p=False)

    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    total_rows = data.total_rows

    # One model per worker, keyed by the worker's rank.
    models = {
        data.worker_info[worker]["rank"]: self.client.submit(
            self._create_model,
            comms.sessionId,
            self._model_func,
            self.datatype,
            **self.kwargs,
            pure=False,
            workers=[worker])
        for worker in data.worker_to_parts
    }

    # Run the synchronized fit on every worker over its local parts.
    pca_fit = {
        worker: self.client.submit(
            DecompositionSyncFitMixin._func_fit,
            models[data.worker_info[worker]["rank"]],
            parts,
            total_rows,
            n_cols,
            parts_to_sizes,
            data.worker_info[worker]["rank"],
            _transform,
            pure=False,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    }

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    # All models hold the same fitted state after the sync fit;
    # keep an arbitrary one as the local model.
    self._set_internal_model(list(models.values())[0])

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    return self
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool, optional (default = False)
        When True, also return the transformed input partitions.

    Returns
    -------
    self, or the transformed output when ``_transform`` is True.
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    total_rows = data.total_rows

    # One model per worker, keyed by the worker's rank.
    models = {
        data.worker_info[worker]["rank"]: self.client.submit(
            self._create_model,
            comms.sessionId,
            self._model_func,
            self.datatype,
            **self.kwargs,
            pure=False,
            workers=[worker])
        for worker in data.worker_to_parts
    }

    # Run the synchronized fit on every worker over its local parts.
    pca_fit = {
        worker: self.client.submit(
            DecompositionSyncFitMixin._func_fit,
            models[data.worker_info[worker]["rank"]],
            parts,
            total_rows,
            n_cols,
            data.parts_to_sizes[data.worker_info[worker]["rank"]],
            data.worker_info[worker]["rank"],
            _transform,
            pure=False,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    }

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    # All models hold the same fitted state after the sync fit; pull
    # one back to the client and mirror its attributes locally.
    self.local_model = list(models.values())[0].result()

    self.components_ = self.local_model.components_
    self.explained_variance_ = self.local_model.explained_variance_
    self.explained_variance_ratio_ = \
        self.local_model.explained_variance_ratio_
    self.singular_values_ = self.local_model.singular_values_

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    # Dead duplicated "return self" removed.
    return self