def _fit_with_colocality(self, X, y):
    """
    Fit a distributed Ridge model on X/y partitions that are already
    co-located (each X chunk lives on the same worker as its y chunk).

    Parameters
    ----------
    X : dask cuDF of features
    y : dask cuDF of labels, partitioned alongside X

    Side effects: sets self.rnks, self.linear_models, self.local_model,
    self.coef_ and self.intercept_.
    """
    # Map each worker to the futures of its co-located (X, y) partitions.
    input_futures = self.client.sync(extract_colocated_ddf_partitions,
                                     X, y, self.client)
    workers = list(input_futures.keys())

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    n_cols = X.shape[1]
    n_rows = 0

    self.rnks = dict()
    partsToSizes = dict()
    key = uuid1()
    for w, futures in input_futures.items():
        self.rnks[w] = worker_info[w]["r"]
        # Resolve each partition's row count on the worker that owns it.
        parts = [self.client.submit(
            Ridge._func_get_size_cl,
            future,
            workers=[w],
            key="%s-%s" % (key, idx)).result()
            for idx, future in enumerate(futures)]
        partsToSizes[worker_info[w]["r"]] = parts
        n_rows += sum(parts)

    key = uuid1()
    self.linear_models = [
        (w, self.client.submit(
            Ridge._func_create_model,
            comms.sessionId,
            **self.kwargs,
            workers=[w],
            key="%s-%s" % (key, idx)))
        for idx, w in enumerate(workers)]

    key = uuid1()
    linear_fit = {
        worker_info[wf[0]]["r"]: self.client.submit(
            Ridge._func_fit_colocated,
            wf[1],
            input_futures[wf[0]],
            n_rows, n_cols,
            partsToSizes,
            worker_info[wf[0]]["r"],
            key="%s-%s" % (key, idx),
            workers=[wf[0]])
        for idx, wf in enumerate(self.linear_models)}

    wait(list(linear_fit.values()))
    raise_exception_from_futures(list(linear_fit.values()))

    comms.destroy()

    # Coefficients are replicated across workers after the collective fit;
    # read the fitted model back from the first worker.
    self.local_model = self.linear_models[0][1].result()
    self.coef_ = self.local_model.coef_
    self.intercept_ = self.local_model.intercept_
def fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool
        If True, also transform the input partitions and return them
        as a dask cuDF; otherwise returns None.

    Side effects: sets self.rnks, self.tsvd_models, self.local_model and
    the fitted attributes (components_, explained_variance_, etc.).
    """
    gpu_futures = self.client.sync(extract_ddf_partitions, X)

    # Assign a rank to each worker in order of first appearance and
    # group the partition futures by the worker that owns them.
    self.rnks = dict()
    rnk_counter = 0
    worker_to_parts = OrderedDict()
    for w, p in gpu_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        if w not in self.rnks:
            self.rnks[w] = rnk_counter
            rnk_counter += 1
        worker_to_parts[w].append(p)

    workers = [w for w, _ in gpu_futures]

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    # (rank, n_rows) for every partition, in original partition order.
    key = uuid1()
    partsToRanks = [
        (worker_info[wf[0]]["r"],
         self.client.submit(
             TruncatedSVD._func_get_size,
             wf[1],
             workers=[wf[0]],
             key="%s-%s" % (key, idx)).result())
        for idx, wf in enumerate(gpu_futures)]

    N = X.shape[1]
    M = sum(size for _, size in partsToRanks)

    key = uuid1()
    self.tsvd_models = [
        (wf[0], self.client.submit(
            TruncatedSVD._func_create_model,
            comms.sessionId,
            wf[1],
            **self.kwargs,
            workers=[wf[0]],
            key="%s-%s" % (key, idx)))
        for idx, wf in enumerate(worker_to_parts.items())]

    key = uuid1()
    tsvd_fit = {
        worker_info[wf[0]]["r"]: self.client.submit(
            TruncatedSVD._func_fit,
            wf[1],
            M, N,
            partsToRanks,
            worker_info[wf[0]]["r"],
            _transform,
            key="%s-%s" % (key, idx),
            workers=[wf[0]])
        for idx, wf in enumerate(self.tsvd_models)}

    wait(list(tsvd_fit.values()))
    raise_exception_from_futures(list(tsvd_fit.values()))

    comms.destroy()

    self.local_model = self.client.submit(
        TruncatedSVD._func_get_first,
        self.tsvd_models[0][1]).result()

    self.components_ = self.local_model.components_
    self.explained_variance_ = self.local_model.explained_variance_
    self.explained_variance_ratio_ = \
        self.local_model.explained_variance_ratio_
    self.singular_values_ = self.local_model.singular_values_

    out_futures = []
    if _transform:
        # Each rank's fit future holds a list of transformed partitions;
        # pull them out one index at a time, in original partition order.
        completed_part_map = {}
        for rank, _ in partsToRanks:
            if rank not in completed_part_map:
                completed_part_map[rank] = 0
            f = tsvd_fit[rank]
            out_futures.append(
                self.client.submit(TruncatedSVD._func_get_idx,
                                   f,
                                   completed_part_map[rank]))
            completed_part_map[rank] += 1

        return to_dask_cudf(out_futures)
def _fit(self, X, _transform=False):
    """
    Fit the model with X.

    Parameters
    ----------
    X : dask cuDF input
    _transform : bool
        If True, also return the transformed input partitions
        (converted via to_output); otherwise returns self.

    Side effects: sets self.datatype and installs the fitted internal
    model via self._set_internal_model.
    """
    n_cols = X.shape[1]

    data = DistributedDataHandler.create(data=X, client=self.client)
    self.datatype = data.datatype

    # The TSQR solver needs point-to-point comms; the other solvers
    # only need collectives.
    if "svd_solver" in self.kwargs \
            and self.kwargs["svd_solver"] == "tsqr":
        comms = CommsContext(comms_p2p=True)
    else:
        comms = CommsContext(comms_p2p=False)

    comms.init(workers=data.workers)

    data.calculate_parts_to_sizes(comms)

    worker_info = comms.worker_info(comms.worker_addresses)
    parts_to_sizes, _ = parts_to_ranks(self.client,
                                       worker_info,
                                       data.gpu_futures)

    total_rows = data.total_rows

    # One model per rank, created on the worker that owns that rank.
    models = {
        data.worker_info[wf[0]]["rank"]: self.client.submit(
            self._create_model,
            comms.sessionId,
            self._model_func,
            self.datatype,
            **self.kwargs,
            pure=False,
            workers=[wf[0]])
        for wf in data.worker_to_parts.items()}

    # Kick off the synchronized fit on every worker's local partitions.
    pca_fit = {
        wf[0]: self.client.submit(
            DecompositionSyncFitMixin._func_fit,
            models[data.worker_info[wf[0]]["rank"]],
            wf[1],
            total_rows,
            n_cols,
            parts_to_sizes,
            data.worker_info[wf[0]]["rank"],
            _transform,
            pure=False,
            workers=[wf[0]])
        for wf in data.worker_to_parts.items()}

    wait(list(pca_fit.values()))
    raise_exception_from_futures(list(pca_fit.values()))

    comms.destroy()

    self._set_internal_model(list(models.values())[0])

    if _transform:
        out_futures = flatten_grouped_results(self.client,
                                              data.gpu_futures,
                                              pca_fit)
        return to_output(out_futures, self.datatype)

    return self
def _fit(self, X, y):
    """
    Fit a distributed linear (ridge) regression model.

    Parameters
    ----------
    X : dask cuDF of features
    y : dask cuDF of labels; must be partitioned on the same set of
        workers as X

    Raises
    ------
    ValueError
        If X and y partitions do not live on the same workers.

    Side effects: sets self.rnks, self.linear_models, self.local_model,
    self.coef_ and self.intercept_.
    """
    X_futures = self.client.sync(extract_ddf_partitions, X)
    y_futures = self.client.sync(extract_ddf_partitions, y)

    X_partition_workers = [w for w, xc in X_futures]
    y_partition_workers = [w for w, xc in y_futures]

    if set(X_partition_workers) != set(y_partition_workers):
        # Original message was garbled ("expected \n_cols Linear
        # Regression"); rewritten to state the actual requirement.
        raise ValueError("X and y are not partitioned on the same "
                         "workers, which is required for multi-GPU "
                         "linear regression.")

    # Assign a rank to each worker in order of first appearance and
    # group the X partition futures by the worker that owns them.
    self.rnks = dict()
    rnk_counter = 0
    worker_to_parts = OrderedDict()
    for w, p in X_futures:
        if w not in worker_to_parts:
            worker_to_parts[w] = []
        if w not in self.rnks:
            self.rnks[w] = rnk_counter
            rnk_counter += 1
        worker_to_parts[w].append(p)

    worker_to_parts_y = OrderedDict()
    for w, p in y_futures:
        if w not in worker_to_parts_y:
            worker_to_parts_y[w] = []
        worker_to_parts_y[w].append(p)

    workers = [w for w, _ in X_futures]

    comms = CommsContext(comms_p2p=False)
    comms.init(workers=workers)

    worker_info = comms.worker_info(comms.worker_addresses)

    # (rank, n_rows) for every X partition, in original partition order.
    key = uuid1()
    partsToSizes = [
        (worker_info[wf[0]]["r"],
         self.client.submit(
             Ridge._func_get_size,
             wf[1],
             workers=[wf[0]],
             key="%s-%s" % (key, idx)).result())
        for idx, wf in enumerate(X_futures)]

    n_cols = X.shape[1]
    n_rows = sum(size for _, size in partsToSizes)

    key = uuid1()
    self.linear_models = [
        (wf[0], self.client.submit(
            Ridge._func_create_model,
            comms.sessionId,
            **self.kwargs,
            workers=[wf[0]],
            key="%s-%s" % (key, idx)))
        for idx, wf in enumerate(worker_to_parts.items())]

    key = uuid1()
    linear_fit = {
        worker_info[wf[0]]["r"]: self.client.submit(
            Ridge._func_fit,
            wf[1],
            worker_to_parts[wf[0]],
            worker_to_parts_y[wf[0]],
            n_rows, n_cols,
            partsToSizes,
            worker_info[wf[0]]["r"],
            key="%s-%s" % (key, idx),
            workers=[wf[0]])
        for idx, wf in enumerate(self.linear_models)}

    wait(list(linear_fit.values()))
    raise_exception_from_futures(list(linear_fit.values()))

    comms.destroy()

    # Coefficients are replicated across workers after the collective fit;
    # read the fitted model back from the first worker.
    self.local_model = self.linear_models[0][1].result()
    self.coef_ = self.local_model.coef_
    self.intercept_ = self.local_model.intercept_