def predict(self, X):
    client = default_client()
    class_probs = predict(client, self._Booster, X)
    if class_probs.ndim > 1:
        cidx = da.argmax(class_probs, axis=1)
    else:
        cidx = (class_probs > 0).astype(np.int64)
    return cidx
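# A standalone sketch (not from the original source) of the class-index
# selection used in predict above, shown with NumPy for illustration:
# 2-D outputs are per-class probabilities reduced with argmax, while 1-D
# outputs are binary margins thresholded at zero. Values are made up.
import numpy as np

probs = np.array([[0.2, 0.8], [0.9, 0.1]])
print(np.argmax(probs, axis=1))          # [1 0]

margins = np.array([-0.3, 1.2])
print((margins > 0).astype(np.int64))    # [0 1]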
def predict_proba(self, data, ntree_limit=None):
    client = default_client()
    if ntree_limit is not None:
        raise NotImplementedError(
            "'ntree_limit' is not currently supported."
        )
    class_probs = predict(client, self._Booster, data)
    return class_probs
def predict(self, X, client=None, **kwargs):
    if client is None:
        client = default_client()
    return predict(client, self.to_local(), X,
                   dtype=self.classes_.dtype, **kwargs)
def close(self, running=True):
    from dask.distributed import default_client
    try:
        client = default_client()
        client.close()
    except ValueError:
        pass
def client(self):
    from dask.distributed import Client, default_client
    try:
        return default_client()
    except ValueError:
        if self._cluster:
            return Client(self._cluster)
        return Client()
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using
    the Louvain method on multiple GPUs.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv',
    ...                          chunksize=chunksize, delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes. Disable the Graph check for now and assume inputs
    # are symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    if (input_graph.local_data is not None and
            input_graph.local_data['by'] == 'src'):
        data = input_graph.local_data['data']
    else:
        data = get_local_data(input_graph, by='src',
                              load_balance=load_balance)

    result = dict([(data.worker_info[wf[0]]["rank"],
                    client.submit(
                        call_louvain,
                        Comms.get_session_id(),
                        wf[1],
                        data.local_data,
                        max_iter,
                        resolution,
                        workers=[wf[0]]))
                   for idx, wf in enumerate(data.worker_to_parts.items())])
    wait(result)

    (parts, modularity_score) = result[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called
        # at this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
def predict_model_on_cpu(self, X, convert_dtype=True):
    """
    Predicts the labels for X.

    Parameters
    ----------
    X : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_features)
        Distributed dense matrix (floats or doubles) of shape
        (n_samples, n_features).
    convert_dtype : bool, optional (default = True)
        When set to True, the predict method will, when necessary, convert
        the input to the data type which was used to train the model. This
        will increase memory used for the method.

    Returns
    -------
    y : Dask cuDF dataframe or CuPy backed Dask Array (n_rows, 1)
    """
    c = default_client()
    workers = self.workers

    X_Scattered = c.scatter(X)

    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            c.submit(
                RandomForestClassifier._predict_model_on_cpu,
                self.rfs[w],
                X_Scattered,
                convert_dtype,
                workers=[w],
            )
        )

    rslts = self.client.gather(futures, errors="raise")

    # Each worker returns a flat, sample-major list of per-estimator
    # labels; indexes[d] tracks the read offset into worker d's results.
    indexes = np.zeros(len(futures), dtype=np.int32)
    pred = list()

    for i in range(len(X)):
        classes = dict()
        max_class = -1
        max_val = 0

        for d in range(len(rslts)):
            for j in range(self.n_estimators_per_worker[d]):
                sub_ind = indexes[d] + j
                cls = rslts[d][sub_ind]
                if cls not in classes.keys():
                    classes[cls] = 1
                else:
                    classes[cls] = classes[cls] + 1

                if classes[cls] > max_val:
                    max_val = classes[cls]
                    max_class = cls

            indexes[d] = indexes[d] + self.n_estimators_per_worker[d]

        pred.append(max_class)
    return pred
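# A minimal, self-contained sketch of the majority vote performed in
# predict_model_on_cpu above: every estimator on every worker casts one
# class label per sample, and the most frequent label wins. The votes
# below are made-up data.
from collections import Counter

votes_for_one_sample = [0, 1, 1, 2, 1]   # labels from five estimators
majority_label = Counter(votes_for_one_sample).most_common(1)[0][0]
print(majority_label)                     # 1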
def close(self):
    from dask.distributed import default_client
    try:
        client = default_client()
        client.close()
    except ValueError:
        pass

    if self._cluster:
        self._cluster.close()
def predict_proba(self, X, client=None, **kwargs):
    if client is None:
        client = default_client()
    return predict(client, self.to_local(), X,
                   proba=True, dtype=self.classes_[0].dtype, **kwargs)
def test_parquet_concat_within_workers(client_connection):
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")
    for x in range(10):
        if not os.path.exists("test_files_parquet/df" + str(x)):
            df = utils.random_edgelist(
                e=100, ef=16,
                dtypes={"src": np.int32, "dst": np.int32},
                seed=x)
            df.to_parquet("test_files_parquet/df" + str(x), index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    t1 = time.time()
    ddf = dask_cudf.read_parquet("test_files_parquet/*",
                                 dtype=["int32", "int32"])
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t1 = time.time() - t1
    print("*** Read Time: ", t1, "s")
    print(ddf)

    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    t2 = time.time()
    ddf.drop_duplicates(inplace=True)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t2 = time.time() - t2
    print("*** Drop duplicate time: ", t2, "s")

    assert t2 < t1

    print("Repartition... ")
    t3 = time.time()
    # Notice that ideally we would use:
    #     ddf = ddf.repartition(npartitions=n_gpu)
    # However this is slower than reading and requires more memory.
    # Using custom concat instead.
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t3 = time.time() - t3
    print("*** repartition Time: ", t3, "s")
    print(ddf)

    assert t3 < t1
def __init__(self, client=None, **kwargs):
    """
    Initializes the linear regression class.
    """
    self.client = default_client() if client is None else client
    self.kwargs = kwargs
    self.coef_ = None
    self.intercept_ = None
    self._model_fit = False
    self._consec_call = 0
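# A runnable sketch of the client-resolution idiom used in the constructor
# above (and repeated throughout these snippets): an explicitly passed
# client wins, otherwise dask.distributed's session default is used. The
# in-process local cluster here is only for illustration.
from dask.distributed import Client, default_client

client = Client(processes=False)   # registers itself as the session default
assert default_client() is client
client.close()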
def fit(self, X, y, **fit_params):
    """Find the best parameters for a particular model.

    Parameters
    ----------
    X, y : array-like
    **fit_params
        Additional partial fit keyword arguments for the estimator.
    """
    return default_client().sync(self._fit, X, y, **fit_params)
def persist_distributed_data(dask_df, client):
    client = default_client() if client is None else client
    worker_addresses = Comms.get_workers()
    _keys = dask_df.__dask_keys__()
    worker_dict = {}
    for i, key in enumerate(_keys):
        worker_dict[str(key)] = tuple([worker_addresses[i]])
    persisted = client.persist(dask_df, workers=worker_dict)
    parts = futures_of(persisted)
    return parts
def setup_dask(cls):
    try:
        from dask.distributed import default_client
        client = default_client()
    except (ImportError, ValueError):
        # No dask.distributed available, or no client started yet.
        client = _startup_dask(2.0)
    print('Dask Client:', client)
    if cls is not None:
        setattr(cls, 'dask_client_', client)
def to_dask_cudf(dask_arr, client=None):
    client = default_client() if client is None else client
    elms = [_to_cudf(dp) for dp in dask_arr.to_delayed().flatten()]
    dfs = client.compute(elms)
    meta = client.submit(_get_meta, dfs[0])
    meta_local = meta.result()
    return dd.from_delayed(dfs, meta=meta_local)
def fit(self, ddf):
    """
    Fits a single-node multi-GPU kNN model using the
    single-process-multi-GPU technique.

    :param ddf: dask_cudf.DataFrame
        Distributed dataframe containing the training data
    """
    client = default_client()

    # Keep the futures around so the GPU memory doesn't get
    # deallocated on the workers.
    gpu_futures, cols = client.sync(self._get_mg_info, ddf)
    self.gpu_futures = gpu_futures

    host_dict = self._build_host_dict(gpu_futures, client).items()
    if len(host_dict) > 1:
        raise Exception("Dask cluster appears to span hosts. Current "
                        "multi-GPU version is limited to single host")

    # Choose a random worker on each unique host to run dask-cuml's
    # kNN.fit() function on all the cuDFs living on that host.
    self.master_host = [(host, random.sample(ports, 1)[0])
                        for host, ports in host_dict][0]

    host, port = self.master_host

    gpu_futures_for_host = list(
        filter(lambda d: d[0][0] == host, gpu_futures))

    exec_node = (host, port)

    # build ipc handles
    gpu_data_excl_worker = list(
        filter(lambda d: d[0] != exec_node, gpu_futures_for_host))
    gpu_data_incl_worker = list(
        filter(lambda d: d[0] == exec_node, gpu_futures_for_host))

    ipc_handles = [
        client.submit(get_ipc_handle, future, workers=[worker])
        for worker, future in gpu_data_excl_worker
    ]

    raw_arrays = [future for worker, future in gpu_data_incl_worker]

    f = (exec_node,
         client.submit(_fit_on_worker,
                       (ipc_handles, raw_arrays),
                       {"D": cols,
                        "should_downcast": self.should_downcast},
                       workers=[exec_node]))

    wait(f)

    # The model on each unique host is held for future queries
    self.model = f
def __init__(self, n_clusters=8, max_iter=300, tol=1e-4,
             verbose=0, random_state=1,
             precompute_distances='auto',
             init='scalable-k-means++', n_init=1, algorithm='auto',
             client=None):
    """
    Constructor for distributed KMeans model

    Parameters
    ----------
    n_clusters : int (default = 8)
        The number of centroids or clusters you want.
    max_iter : int (default = 300)
        The more iterations of EM, the more accurate, but slower.
    tol : float (default = 1e-4)
        Stopping criterion when centroid means do not change much.
    verbose : boolean (default = 0)
        If True, prints diagnostic information.
    random_state : int (default = 1)
        If you want results to be the same when you restart Python,
        select a state.
    precompute_distances : 'auto' (default = 'auto')
        Not supported yet.
    init : {'scalable-k-means++', 'k-means||', 'random' or an ndarray}
           (default = 'scalable-k-means++')
        'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
        kmeans++ initialization.
        'random': Choose 'n_clusters' observations (rows) at random from
        data for the initial centroids. If an ndarray is passed, it should
        be of shape (n_clusters, n_features) and gives the initial centers.
    n_init : int (default = 1)
        Number of times initialization is run. More is slower, but can be
        better.
    algorithm : "auto"
        Currently uses full EM, but will support others later.
    client : dask.distributed.Client (optional)
        Dask client to use. If None, the default client is used.
    """
    self.client = default_client() if client is None else client
    self.max_iter = max_iter
    self.tol = tol
    self.random_state = random_state
    self.precompute_distances = precompute_distances
    self.n_init = n_init
    self.algorithm = algorithm
    self.n_clusters = n_clusters
    self.init = init
    self.verbose = verbose
def to_dask_cudf(futures):
    """
    Convert a list of futures containing cudf DataFrames into a
    dask.DataFrame

    :param futures: list[cudf.DataFrame]
        list of futures containing dataframes
    :return: dask.DataFrame
        a dask.DataFrame
    """
    c = default_client()

    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
def update(self, batch, who=None, metadata=None):
    try:
        client = default_client()
        result = [
            client.submit(self.func, x, *self.args, **self.kwargs)
            for x in batch
        ]
    except Exception as e:
        logger.exception(e)
        raise
    else:
        return self._emit(result, metadata=metadata)
def to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf DataFrames into a
    dask.DataFrame

    :param futures: list[cudf.DataFrame]
        list of futures containing dataframes
    :param client: dask.distributed.Client
        Optional client to use
    :return: dask.DataFrame
        a dask.DataFrame
    """
    c = default_client() if client is None else client

    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    meta = c.submit(get_meta, dfs[0]).result()
    return dd.from_delayed(dfs, meta=meta)
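# A minimal usage sketch for to_dask_cudf above. It assumes workers with
# cudf installed (a GPU environment); the tiny dataframe built here is
# purely illustrative.
import cudf
from dask.distributed import Client

client = Client()
futures = [client.submit(cudf.DataFrame, {"a": [1, 2, 3]})]
ddf = to_dask_cudf(futures, client=client)
print(ddf.head())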
def _fit(self, model_factory, X, y=None, sample_weight=None, client=None,
         **kwargs):
    """Docstring is inherited from the LGBMModel."""
    if client is None:
        client = default_client()

    params = self.get_params(True)
    model = _train(client, X, y, params, model_factory, sample_weight,
                   **kwargs)

    self.set_params(**model.get_params())
    self._copy_extra_params(model, self)

    return self
def predict(self, X):
    """
    Predicts the regressor outputs for X.

    Parameters
    ----------
    X : Dense matrix (floats or doubles) of shape
        (n_samples, n_features).

    Returns
    -------
    y : NumPy dense vector (float) of shape (n_samples, 1)
    """
    c = default_client()
    workers = self.workers

    if not isinstance(X, np.ndarray):
        raise ValueError("Predict inputs must be numpy arrays")

    X_Scattered = c.scatter(X)

    futures = list()
    for n, w in enumerate(workers):
        futures.append(
            c.submit(
                RandomForestRegressor._predict,
                self.rfs[w],
                X_Scattered,
                random.random(),
                workers=[w],
            )
        )

    wait(futures)
    raise_exception_from_futures(futures)

    rslts = list()
    for d in range(len(futures)):
        rslts.append(futures[d].result())

    # Average each sample's predictions across workers.
    pred = list()
    for i in range(len(X)):
        pred_per_worker = 0.0
        for d in range(len(rslts)):
            pred_per_worker = pred_per_worker + rslts[d][i]
        pred.append(pred_per_worker / len(rslts))

    return pred
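# A standalone sketch (made-up values) of the cross-worker averaging done
# in predict above: each worker returns one prediction per sample and the
# final output is the element-wise mean across workers.
import numpy as np

worker_preds = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
print(np.mean(worker_preds, axis=0))   # [2. 3.]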
def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
    if client is None:
        client = default_client()

    model_factory = lightgbm.LGBMRegressor
    params = self.get_params(True)
    model = train(client, X, y, params, model_factory, sample_weight,
                  **kwargs)

    self.set_params(**model.get_params())
    self._copy_extra_params(model, self)

    return self
def fit(self, X, y=None, **fit_params):
    """Find the best parameters for a particular model.

    Parameters
    ----------
    X, y : array-like
    **fit_params
        Additional partial fit keyword arguments for the estimator.
    """
    client = default_client()
    if not client.asynchronous:
        return client.sync(self._fit, X, y, **fit_params)
    return self._fit(X, y, **fit_params)
def __init__(self, client=None, **kwargs):
    """
    Constructor for distributed TruncatedSVD model
    """
    self.client = default_client() if client is None else client
    self.kwargs = kwargs

    # Define attributes to make sure they are
    # available even on an untrained object.
    self.local_model = None
    self.components_ = None
    self.explained_variance_ = None
    self.explained_variance_ratio_ = None
    self.singular_values_ = None
def __init__(self, client=None, streams_per_handle=0, verbose=False,
             **kwargs):
    raise NotImplementedError("Multi-GPU KNN is not available in RAPIDS "
                              "0.11, it will be enabled in the next "
                              "release. Legacy version is available in "
                              "0.10.")

    self.client = default_client() if client is None else client
    self.model_args = kwargs
    self.X = None
    self.Y = None
    self.n_cols = 0
    self.streams_per_handle = streams_per_handle
    self.verbose = verbose
def load_balance_func(ddf_, by, client=None):
    # Load balances the sorted dask_cudf DataFrame.
    # Input is a dask_cudf dataframe ddf_ which is sorted by
    # the column name passed as the 'by' argument.
    client = default_client() if client is None else client

    parts = persist_distributed_data(ddf_, client)
    wait(parts)

    who_has = client.who_has(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    gpu_futures = [(first(who_has[key]), part.key[1], part)
                   for key, part in key_to_part]
    worker_to_data = create_dict(gpu_futures)

    # Calculate cumulative sum in each dataframe partition
    cumsum_parts = [
        client.submit(get_cumsum, wf[1][0][0], by,
                      workers=[wf[0]]).result()
        for idx, wf in enumerate(worker_to_data.items())
    ]

    num_rows = []
    for cumsum in cumsum_parts:
        num_rows.append(cumsum.iloc[-1])

    # Calculate current partition divisions
    divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
    divisions[-1] = divisions[-1] - 1
    divisions = tuple(divisions)

    # Set global index from 0 to len(dask_cudf_dataframe) so that global
    # indexing of divisions can be used for repartitioning.
    futures = [
        client.submit(set_global_index, wf[1][0][0],
                      divisions[wf[1][0][1]], workers=[wf[0]])
        for idx, wf in enumerate(worker_to_data.items())
    ]
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)
    ddf.divisions = divisions

    # Repartition the data
    ddf = repartition(ddf, cumsum_parts)

    return ddf
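# A self-contained sketch of the division computation above: per-partition
# row counts are turned into cumulative global-index divisions, and the
# final division is made inclusive. The row counts are made up.
num_rows = [3, 5, 2]
divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
divisions[-1] = divisions[-1] - 1
print(tuple(divisions))   # (0, 3, 8, 9)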
def _to_dask_cudf(futures, client=None):
    """
    Convert a list of futures containing cudf DataFrames into a
    dask.DataFrame

    :param futures: list[cudf.DataFrame]
        list of futures containing dataframes
    :param client: dask.distributed.Client
        Optional client to use
    :return: dask.DataFrame
        a dask.DataFrame
    """
    c = default_client() if client is None else client

    # Convert a list of futures containing dfs back into a dask_cudf
    dfs = [d for d in futures if d.type != type(None)]  # NOQA
    if logger.should_log_for(logger.level_debug):
        logger.debug("to_dask_cudf dfs=%s" % str(dfs))
    meta_future = c.submit(_get_meta, dfs[0], pure=False)
    meta = meta_future.result()
    return dd.from_delayed(dfs, meta=meta)
def __init__(self, client=None, **kwargs):
    """
    Constructor for distributed KMeans model

    Parameters
    ----------
    handle : cuml.Handle
        If it is None, a new one is created just for this class.
    n_clusters : int (default = 8)
        The number of centroids or clusters you want.
    max_iter : int (default = 300)
        The more iterations of EM, the more accurate, but slower.
    tol : float (default = 1e-4)
        Stopping criterion when centroid means do not change much.
    verbose : boolean (default = 0)
        If True, prints diagnostic information.
    random_state : int (default = 1)
        If you want results to be the same when you restart Python,
        select a state.
    init : {'scalable-k-means++', 'k-means||', 'random' or an ndarray}
           (default = 'scalable-k-means++')
        'scalable-k-means++' or 'k-means||': Uses fast and stable scalable
        kmeans++ initialization.
        'random': Choose 'n_clusters' observations (rows) at random from
        data for the initial centroids. If an ndarray is passed, it should
        be of shape (n_clusters, n_features) and gives the initial centers.
    oversampling_factor : int (default = 2)
        The amount of points to sample in scalable k-means++ initialization
        for potential centroids. Increasing this value can lead to better
        initial centroids at the cost of memory. The total number of
        centroids sampled in scalable k-means++ is
        oversampling_factor * n_clusters * 8.
    max_samples_per_batch : int (default = 32768)
        The number of data samples to use for batches of the pairwise
        distance computation. This computation is done throughout both fit
        and predict. The default should suit most cases. The total number
        of elements in the batched pairwise distance computation is
        max_samples_per_batch * n_clusters. It might become necessary to
        lower this number when n_clusters becomes prohibitively large.

    Attributes
    ----------
    cluster_centers_ : array
        The coordinates of the final clusters. This represents the "mean"
        of each data cluster.
    """
    self.client = default_client() if client is None else client
    self.kwargs = kwargs
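# A hypothetical usage sketch for the constructor above. The class name
# `KMeans` and the dask_cudf input `ddf` are assumptions (neither appears
# in this snippet), so the calls are left commented out.
# km = KMeans(n_clusters=8, max_iter=300)
# km.fit(ddf)
# print(km.cluster_centers_)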
def fit(self, X, y=None, sample_weight=None, client=None, **kwargs):
    if client is None:
        client = default_client()

    model_factory = lightgbm.LGBMRegressor
    params = self.get_params(True)
    model = train(client, X, y, params, model_factory, sample_weight,
                  **kwargs)

    self.set_params(**model.get_params())
    self._Booster = model._Booster
    self._n_features = model._n_features
    self._evals_result = model._evals_result
    self._best_iteration = model._best_iteration
    self._best_score = model._best_score

    return self