def renumber(input_graph):
    client = default_client()

    ddf = input_graph.edgelist.edgelist_df
    num_edges = len(ddf)

    is_mnmg = isinstance(ddf, dask_cudf.DataFrame)
    num_verts = input_graph.number_of_vertices()

    if is_mnmg:
        data = get_distributed_data(ddf)
        result = [
            client.submit(call_renumber,
                          Comms.get_session_id(),
                          wf[1],
                          num_verts,
                          num_edges,
                          is_mnmg,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
        wait(result)
        ddf = dask_cudf.from_delayed(result)
    else:
        call_renumber(Comms.get_session_id(), ddf, num_verts, num_edges,
                      is_mnmg)
    return ddf
def __delayed_call_noports(self, inputs):
    def get_pout(df_out):
        '''Used for delayed unpacking.'''
        if isinstance(df_out, cudf.DataFrame):
            # Needed for the same reason as __make_copy: to prevent column
            # additions to the input dataframes. In Python dataframes are
            # passed by reference and are mutable.
            # Handles the case when dask_cudf.DataFrames are the source
            # frames, which appear as cudf.DataFrame inside a dask-delayed
            # function.
            return df_out.copy(deep=False)
        return df_out

    # Handle the dask dataframe automatically via the to_delayed interface.
    # TODO: currently assumes only the first input is a dask_cudf DataFrame.
    i_df = inputs[0]
    rest = inputs[1:]
    if isinstance(i_df, dask_cudf.DataFrame):
        output_df_dly_list = []
        for input_dly in i_df.to_delayed():
            inputs_ = [input_dly] + rest
            output_df_dly = dask.delayed(self.decorate_process())(inputs_)
            output_df_dly_per = output_df_dly.persist()
            df_out = dask.delayed(get_pout)(output_df_dly_per)
            output_df_dly_list.append(df_out.persist())

        output_df = dask_cudf.from_delayed(output_df_dly_list)
    else:
        output_df = self.decorate_process()(inputs)

    return output_df
def to_ddf(self, columns=None, shuffle=False, seed=None):
    """ Convert `Dataset` object to `dask_cudf.DataFrame`

    Parameters
    ----------
    columns : str or list(str); default None
        Columns to include in output `DataFrame`. If not specified,
        the output will contain all known columns in the Dataset.
    shuffle : bool; default False
        Whether to shuffle the order of partitions in the output
        `dask_cudf.DataFrame`.  Note that this does not shuffle
        the rows within each partition. This is because the data
        is not actually loaded into memory for this operation.
    seed : int; Optional
        The random seed to use if `shuffle=True`.  If nothing
        is specified, the current system time will be used by the
        `random` std library.
    """
    # Use DatasetEngine to create ddf
    ddf = self.engine.to_ddf(columns=columns)

    # Shuffle the partitions of ddf (optional)
    if shuffle and ddf.npartitions > 1:
        parts = ddf.to_delayed()
        random.seed(seed)
        random.shuffle(parts)
        ddf = dask_cudf.from_delayed(parts)

    # Special dtype conversion (optional)
    if self.dtypes:
        _meta = _set_dtypes(ddf._meta, self.dtypes)
        return ddf.map_partitions(_set_dtypes, self.dtypes, meta=_meta)
    return ddf
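# The partition shuffle in `to_ddf` above is cheap because only the task
# graph is reordered; no rows move between partitions. A minimal sketch of
# the same to_delayed/from_delayed round trip, assuming cudf and dask_cudf
# are installed and a GPU is available (the column name "x" is illustrative):
import random

import cudf
import dask_cudf
import numpy as np

ddf = dask_cudf.from_cudf(cudf.DataFrame({"x": np.arange(100)}),
                          npartitions=4)
parts = ddf.to_delayed()  # one dask.delayed object per partition
random.seed(42)
random.shuffle(parts)     # reorders whole partitions, not rows
shuffled = dask_cudf.from_delayed(parts)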
def test_dataframe_from_delayed():
    delays = [load_data(10 * i, i) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.DataFrame)

    expected = gd.concat([d.compute() for d in delays])
    assert_frame_equal(res.to_pandas(), expected.to_pandas())
def test_series_from_delayed():
    delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.Series)

    expected = gd.concat([d.compute() for d in delays])
    np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas())
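# Both tests above exercise the same core pattern: build each partition
# lazily, then stitch the delayed pieces into one collection. A minimal,
# self-contained sketch of that pattern, assuming a GPU; this `load_data`
# is a stand-in for the tests' helper of the same name:
import cudf
import dask
import dask_cudf
import numpy as np

@dask.delayed
def load_data(offset, n):
    # One cudf partition per delayed call.
    return cudf.DataFrame({"x": np.arange(offset, offset + n)})

delays = [load_data(10 * i, i) for i in range(1, 3)]
ddf = dask_cudf.from_delayed(delays)  # meta inferred from the first part
print(ddf.compute())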
def apply(self, function):
    """Transform each group using a python function.
    """
    @delayed
    def apply_to_group(grp):
        return grp.apply(function)

    grouped = [apply_to_group(g) for g in self._grouped]
    return from_delayed(grouped).reset_index()
def kneighbors(self, X, k=None):
    """
    Queries the multi-GPU knn model given a dask_cudf query frame.

    1. Create 2 new Dask dataframes to hold output
       (1 chunk each per chunk of X), co-locate pieces w/ X.
    2. Get IPC handles for each dataframe. Use IPCThread to hold onto
       them while calling query.

    :param X: A dask_cudf of vectors for calculating the kneighbors
    :param k: The number of nearest neighbors to query for each input
        vector.
    :return: dists and indices of the k-nearest neighbors to the input
        vectors
    """
    if k is None:
        k = self.n_neighbors

    client = default_client()
    dfs = client.sync(self._kneighbors, X, k).value
    dfs = [d for d in dfs if d.type != type(None)]  # NOQA

    local_divs = [client.submit(get_idx, f).result() for f in dfs]
    indices = [client.submit(get_I, f) for f in dfs]
    dists = [client.submit(get_D, f) for f in dfs]

    dfs_divs = list(zip(local_divs, indices, dists))

    # Sort delayed dfs by their starting index
    dfs_divs.sort(key=lambda x: x[0][0])

    I_meta = client.submit(get_I_meta, dfs[0]).result()
    D_meta = client.submit(get_D_meta, dfs[0]).result()

    I_ddf = dask_cudf.from_delayed(indices, meta=I_meta)
    D_ddf = dask_cudf.from_delayed(dists, meta=D_meta)

    return D_ddf, I_ddf
def apply_grouped(self, *args, **kwargs):
    """Transform each group using a GPU function.

    Calls ``cudf.Groupby.apply_grouped`` concurrently
    """
    @delayed
    def apply_to_group(grp):
        return grp.apply_grouped(*args, **kwargs)

    grouped = [apply_to_group(g) for g in self._grouped]
    return from_delayed(grouped).reset_index()
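# `apply` and `apply_grouped` above both fan one delayed task out per group
# and reassemble the results with from_delayed. A condensed sketch of that
# fan-out, assuming a GPU; the column names and `summarize` are
# illustrative:
import cudf
import dask
import dask_cudf

df = cudf.DataFrame({"k": [0, 0, 1, 1], "v": [1.0, 2.0, 3.0, 4.0]})

@dask.delayed
def summarize(grp):
    # Per-group reduction, executed lazily.
    return cudf.DataFrame({"k": [int(grp["k"].iloc[0])],
                           "v_sum": [float(grp["v"].sum())]})

groups = [df[df["k"] == k] for k in df["k"].unique().values_host]
result = dask_cudf.from_delayed([summarize(g) for g in groups])
print(result.compute())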
def build_dask_df(nrows, ncols):
    workers = client.has_what().keys()

    # Create dfs on each worker (gpu)
    dfs = [
        client.submit(create_df, n, nrows, ncols, workers=[worker])
        for worker, n in list(zip(workers, list(range(len(workers)))))
    ]

    # Wait for completion
    wait(dfs)

    meta = client.submit(get_meta, dfs[0]).result()
    return dask_cudf.from_delayed(dfs, meta=meta)
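# `build_dask_df` passes an explicit `meta` so from_delayed does not have to
# compute a partition just to infer the schema. A cluster-free sketch of the
# same idea, assuming a GPU; `make_part` and the column names are
# illustrative:
import cudf
import dask
import dask_cudf

@dask.delayed
def make_part(i):
    return cudf.DataFrame({"a": [i], "b": [float(i)]})

meta = cudf.DataFrame({"a": cudf.Series(dtype="int64"),
                       "b": cudf.Series(dtype="float64")})
ddf = dask_cudf.from_delayed([make_part(i) for i in range(4)], meta=meta)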
def load_balance_func(ddf_, by, client=None):
    # Load balances the sorted dask_cudf DataFrame.
    # The input dask_cudf DataFrame ddf_ is sorted by the column name
    # passed as the 'by' argument.
    client = default_client() if client is None else client

    parts = persist_distributed_data(ddf_, client)
    wait(parts)

    who_has = client.who_has(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    gpu_futures = [(first(who_has[key]), part.key[1], part)
                   for key, part in key_to_part]
    worker_to_data = create_dict(gpu_futures)

    # Calculate cumulative sum in each dataframe partition
    cumsum_parts = [
        client.submit(get_cumsum, wf[1][0][0], by, workers=[wf[0]]).result()
        for idx, wf in enumerate(worker_to_data.items())
    ]

    num_rows = []
    for cumsum in cumsum_parts:
        num_rows.append(cumsum.iloc[-1])

    # Calculate current partition divisions
    divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
    divisions[-1] = divisions[-1] - 1
    divisions = tuple(divisions)

    # Set global index from 0 to len(dask_cudf_dataframe) so that global
    # indexing of divisions can be used for repartitioning.
    futures = [
        client.submit(set_global_index,
                      wf[1][0][0],
                      divisions[wf[1][0][1]],
                      workers=[wf[0]])
        for idx, wf in enumerate(worker_to_data.items())
    ]
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)
    ddf.divisions = divisions

    # Repartition the data
    ddf = repartition(ddf, cumsum_parts)

    return ddf
def __call__(self, inputs_data):
    if self.load:
        if isinstance(self.load, bool):
            output_df = self.load_cache()
        else:
            output_df = self.load
    else:
        if self._using_ports():
            # nodes with ports take a dictionary as inputs
            inputs = {iport: self.__make_copy(data_input)
                      for iport, data_input in inputs_data.items()}
        else:
            # nodes without ports take a list as inputs
            inputs = [self.__make_copy(inputs_data[ient['to_port']])
                      for ient in self.inputs]
        if not self.delayed_process:
            output_df = self.decorate_process()(inputs)
        else:
            if self._using_ports():
                use_delayed = self.__check_dly_processing_prereq(inputs)
                if use_delayed:
                    output_df = self.__delayed_call(inputs)
                else:
                    output_df = self.decorate_process()(inputs)
            else:
                # Handle the dask dataframe automatically via the
                # to_delayed interface.
                # TODO: currently assumes only the first input is a
                #       dask_cudf DataFrame.
                i_df = inputs[0]
                rest = inputs[1:]
                if isinstance(i_df, dask_cudf.DataFrame):
                    d_fun = dask.delayed(self.decorate_process())
                    output_df = dask_cudf.from_delayed([
                        d_fun([item] + rest)
                        for item in i_df.to_delayed()])
                else:
                    output_df = self.decorate_process()(inputs)

    if self.uid != OUTPUT_ID and output_df is None:
        raise Exception("None output")
    else:
        self.__valide(output_df, self.output_columns)

    if self.save:
        self.save_cache(output_df)

    return output_df
def concat_within_workers(client, ddf):
    """
    Concats all partitions within workers without transfers
    """
    df_delayed = get_delayed_dict(ddf)

    result = []
    for worker, tasks in client.has_what().items():
        worker_task_list = []

        for task in list(tasks):
            if task in df_delayed:
                worker_task_list.append(df_delayed[task])
        concat_tasks = delayed(concat_dfs)(worker_task_list)
        result.append(client.persist(collections=concat_tasks,
                                     workers=worker))

    return dask_cudf.from_delayed(result)
def _mg_rmat(scale,
             num_edges,
             a, b, c,
             seed,
             clip_and_flip,
             scramble_vertex_ids,
             create_using=cugraph.DiGraph):
    """
    Calls RMAT on multiple GPUs and uses the resulting Dask cuDF DataFrame
    to initialize and return a cugraph Graph object specified with
    create_using. If create_using is None, returns the Dask DataFrame
    edgelist as-is.

    seed is used as the initial seed for the first worker used (worker 0),
    then each subsequent worker will receive seed+<worker num> as the seed
    value.
    """
    client = default_client()
    worker_list = list(client.scheduler_info()['workers'].keys())
    num_workers = len(worker_list)
    num_edges_list = _calc_num_edges_per_worker(num_workers, num_edges)
    futures = []
    for (i, worker_num_edges) in enumerate(num_edges_list):
        unique_worker_seed = seed + i
        future = client.submit(_call_rmat,
                               Comms.get_session_id(),
                               scale,
                               worker_num_edges,
                               a, b, c,
                               unique_worker_seed,
                               clip_and_flip,
                               scramble_vertex_ids,
                               workers=worker_list[i])
        futures.append(future)

    ddf = dask_cudf.from_delayed(futures)

    if create_using is None:
        return ddf

    G = create_using()
    G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")

    return G
def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_cudf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
def process(self, inputs):
    df = inputs[self.INPUT_PORT_NAME]
    # df = df.drop('datetime', axis=1)
    output = {}
    if self.outport_connected(self.OUTPUT_PORT_NAME):
        offset = self.conf.get('offset', 0)
        out_df = self._process(df, offset)
        output.update({self.OUTPUT_PORT_NAME: out_df})
    if self.outport_connected(self.OUTPUT_DASK_PORT):
        partitions = self.conf['partitions']
        out_dfs = [
            dask.delayed(self._process)(df, i)
            for i in range(partitions)
        ]
        meta = self.meta_setup().outports[self.OUTPUT_DASK_PORT]
        meta['date'] = 'datetime64[ns]'
        dask_df = dask_cudf.from_delayed(out_dfs, meta=meta)
        output.update({self.OUTPUT_DASK_PORT: dask_df})
    return output
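# `process` above builds its meta by mutating a schema mapping before
# handing it to from_delayed. A sketch of the related approach of deriving
# an empty cudf frame from a dict of dtypes and using that as meta; the
# column names, dtypes, and `build_partition` are illustrative:
import cudf
import dask
import dask_cudf

schema = {"offset": "int64", "value": "float64"}
meta = cudf.DataFrame({c: cudf.Series(dtype=t) for c, t in schema.items()})

@dask.delayed
def build_partition(i):
    return cudf.DataFrame({"offset": [i], "value": [float(i)]})

ddf = dask_cudf.from_delayed([build_partition(i) for i in range(3)],
                             meta=meta)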
def test_frame_extra_columns_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_cudf(df, npartitions=5)

    df["z"] = np.arange(nelem)
    ddf2 = dgd.from_cudf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"z")
def weakly_connected_components(input_graph):
    """
    Generate the Weakly Connected Components and attach a component label
    to each vertex.

    Parameters
    ----------
    input_graph : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
        Graph or matrix object, which should contain the connectivity
        information
    """

    client = default_client()

    input_graph.compute_renumber_edge_list()

    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    result = [client.submit(call_wcc,
                            Comms.get_session_id(),
                            wf[1],
                            src_col_name,
                            dst_col_name,
                            num_verts,
                            num_edges,
                            vertex_partition_offsets,
                            input_graph.aggregate_segment_offsets,
                            workers=[wf[0]])
              for idx, wf in enumerate(data.worker_to_parts.items())]
    wait(result)

    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1["bad"] = np.arange(nelem)
    df1["bad"] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2["bad"] = np.arange(nelem)
    df2["bad"] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"same type")
def __call__(self, inputs):
    # validate inputs
    Class = type(self)
    cache = Class.cache_dir
    inputs = [self.__make_copy(i) for i in inputs]
    if not isinstance(self.load, bool) or self.load:
        if isinstance(self.load, bool):
            output_df = self.load_cache(cache + '/' + self.uid + '.hdf5')
        else:
            output_df = self.load
    else:
        if not self.delayed_process:
            output_df = self.process(inputs)
        else:
            # Handle the dask dataframe automatically via the to_delayed
            # interface.
            # TODO: currently assumes only the first input is a dask_cudf
            #       DataFrame.
            i_df = inputs[0]
            rest = inputs[1:]
            if isinstance(i_df, dask_cudf.DataFrame):
                d_fun = dask.delayed(self.process)
                output_df = dask_cudf.from_delayed([
                    d_fun([item] + rest)
                    for item in i_df.to_delayed()])
            else:
                output_df = self.process(inputs)

    if self.uid != 'unique_output' and output_df is None:
        raise Exception("None output")
    elif (isinstance(output_df, cudf.DataFrame)
          or isinstance(output_df, dask_cudf.DataFrame)) \
            and len(output_df) == 0:
        raise Exception("empty output")
    elif not self.__valide(output_df, self.output_columns):
        raise Exception("not valid output")

    if self.save:
        os.makedirs(cache, exist_ok=True)
        output_df.to_hdf(cache + '/' + self.uid + '.hdf5', key=self.uid)
    return output_df
def fit(self, X, y):
    """
    Fits a multi-gpu linear regression model such that the resulting
    coefficients are also distributed across the GPUs.

    :param X: a dask_cudf of features, distributed one worker per GPU
    :param y: a dask_cudf of labels, distributed one worker per GPU
    """
    client = default_client()

    self.dtype = X[X.columns[0]].compute().dtype

    coef, intercept, locations = client.sync(self._do_fit, X, y,
                                             self.dtype)

    self.intercept = intercept
    self._locations = locations
    self._model_fit = True
    self._ncols = X.shape[1]

    self.coef_ = dask_cudf.from_delayed(coef)
def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))
def predict(self, X):
    """
    Predict values for the multi-gpu linear regression model by making
    calls to the predict function with dask_cudf objects.

    :param X: a dask_cudf with data distributed one worker per GPU
    :return: a dask_cudf containing outputs of the linear regression
    """
    if self._model_fit:
        client = default_client()
        ret = client.sync(self._do_predict, X, self.coef_,
                          self._locations, self.intercept,
                          self.dtype)
        ret = dask_cudf.from_delayed(ret)
        return ret
    else:
        raise ValueError('Model coefficients have not been fit. You need '
                         'to run the fit() method first.')
def sssp(graph, source):
    """
    Compute the distance and predecessors for shortest paths from the
    specified source to all the vertices in the graph. The distances
    column will store the distance from the source to each vertex. The
    predecessors column will store each vertex's predecessor in the
    shortest path. Vertices that are unreachable will have a distance of
    infinity denoted by the maximum value of the data type and the
    predecessor set as -1. The source vertex's predecessor is also set to
    -1. The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity
        information as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Specify source vertex

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id
        df['distance'] gives the path distance from the starting vertex
        df['predecessor'] gives the vertex id it was reached from in the
        traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... init a Dask cluster; see
    >>> # https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    ddf = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        source = graph.lookup_internal_vertex_id(
            cudf.Series([source])).compute()
        source = source.iloc[0]

    result = [
        client.submit(call_sssp,
                      Comms.get_session_id(),
                      wf[1],
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      graph.aggregate_segment_offsets,
                      source,
                      workers=[wf[0]])
        for idx, wf in enumerate(data.worker_to_parts.items())
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
def main(client):
    import dask_cudf
    import cudf

    item_df = read_tables()

    """
    Filter and Join web_clickstreams and item table.
    SELECT wcs_user_sk,
      (wcs_click_date_sk*24L*60L*60L + wcs_click_time_sk) AS tstamp_inSec,
      i_category_id
    FROM web_clickstreams wcs, item i
    WHERE wcs.wcs_item_sk = i.i_item_sk
    AND i.i_category_id IS NOT NULL
    AND wcs.wcs_user_sk IS NOT NULL
    """
    f_item_df = item_df[item_df["i_category_id"].notnull()].reset_index(
        drop=True)

    # We don't fuse the filtration task with the read task yet: fusing
    # causes more memory pressure, since we read the whole table at once
    # (and spill it) before doing the filtration.
    ### Below PR has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141
    web_clickstream_flist = glob.glob(cli_args["data_dir"] +
                                      "web_clickstreams/*.parquet")
    task_ls = [
        delayed(pre_repartition_task)(fn, f_item_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    ### Repartition so that the clicks for each user end up in the same
    ### partition
    merged_df = merged_df.repartition(columns=["wcs_user_sk"])

    ### Main Query
    ### sessionize logic.
    distinct_session_df = merged_df.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "i_category_id"],
        time_out=q30_session_timeout_inSec,
    )
    del merged_df

    ### create pairs out of item category id's.
    pair_df = distinct_session_df.map_partitions(
        get_pairs,
        pair_col="i_category_id",
        output_col_1="category_id_1",
        output_col_2="category_id_2",
    )
    del distinct_session_df

    ### apply groupby on "category_id_1", "category_id_2"
    grouped_df = (pair_df.groupby(["category_id_1", "category_id_2"])
                  .size(split_every=2).reset_index())
    grouped_df.columns = ["category_id_1", "category_id_2", "cnt"]
    result_df = grouped_df.repartition(npartitions=1).persist()

    ### sort records in desc order and reset index.
    ### Below only has 40 rows, so leaving it as a cudf frame should be fine
    result_df = result_df.map_partitions(
        lambda x: x.sort_values("cnt", ascending=False))
    result_df = result_df.reset_index(drop=True).head(q30_limit)
    return result_df
def pagerank(edge_list, alpha=0.85, max_iter=30):
    """
    Find the PageRank values for each vertex in a graph using multiple
    GPUs. cuGraph computes an approximation of the Pagerank using the
    power method. The input edge list should be provided in dask-cudf
    dataframe with one partition per GPU.

    Parameters
    ----------
    edge_list : dask_cudf.DataFrame
        Contains the connectivity information as an edge list.
        Source 'src' and destination 'dst' columns must be of type 'int32'.
        Edge weights are not used for this algorithm.
        Indices must be in the range [0, V-1], where V is the global number
        of vertices.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random
        vertex. Alpha should be greater than 0.0 and strictly lower than
        1.0.
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        Dask GPU DataFrame containing two columns of size V: the vertex
        identifiers and the corresponding PageRank values.

    Examples
    --------
    >>> import dask_cugraph.pagerank as dcg
    >>> chunksize = dcg.get_chunksize('edge_list.csv')
    >>> ddf_edge_list = dask_cudf.read_csv('edge_list.csv',
    ...                                    chunksize=chunksize,
    ...                                    delimiter='\t',
    ...                                    names=['src', 'dst'],
    ...                                    dtype=['int32', 'int32'])
    >>> pr = dcg.pagerank(ddf_edge_list, alpha=0.85, max_iter=50)
    """

    client = default_client()

    gpu_futures = _get_mg_info(edge_list)
    # npartitions = len(gpu_futures)

    host_dict = _build_host_dict(gpu_futures, client).items()
    if len(host_dict) > 1:
        raise Exception("Dask cluster appears to span hosts. Current "
                        "multi-GPU version is limited to a single host")

    master_host = [(host, random.sample(ports, 1)[0])
                   for host, ports in host_dict][0]

    host, port = master_host
    gpu_futures_for_host = list(filter(lambda d: d[0][0] == host,
                                       gpu_futures))
    exec_node = (host, port)

    # build ipc handles
    gpu_data_excl_worker = list(
        filter(lambda d: d[0] != exec_node, gpu_futures_for_host))
    gpu_data_incl_worker = list(
        filter(lambda d: d[0] == exec_node, gpu_futures_for_host))

    ipc_handles = [
        client.submit(get_ipc_handle, future, workers=[worker])
        for worker, future in gpu_data_excl_worker
    ]
    raw_arrays = [future for worker, future in gpu_data_incl_worker]

    pr = [
        client.submit(_mg_pagerank,
                      (ipc_handles, raw_arrays, alpha, max_iter),
                      workers=[exec_node])
    ]

    c = cudf.DataFrame({
        'vertex': cudf.Series(dtype='int32'),
        'pagerank': cudf.Series(dtype='float32')
    })
    ddf = dc.from_delayed(pr, meta=c)
    return ddf
def __delayed_call(self, inputs):
    '''Delayed processing called when self.delayed_process is set. To
    handle delayed processing automatically, prerequisites are checked
    via call to: :meth:`__check_dly_processing_prereq`

    Additionally all input dask_cudf dataframes have to be partitioned
    the same i.e. equal number of partitions.
    '''
    def get_pout(out_dict, port):
        '''Get the output in out_dict at key port. Used for delayed
        unpacking.'''
        df_out = out_dict.get(port, cudf.DataFrame())
        if isinstance(df_out, cudf.DataFrame):
            # Needed for the same reason as __make_copy: to prevent column
            # additions to the input dataframes. In Python dataframes are
            # passed by reference and are mutable.
            # Handles the case when dask_cudf.DataFrames are the source
            # frames, which appear as cudf.DataFrame inside a dask-delayed
            # function.
            return df_out.copy(deep=False)
        return df_out

    inputs_dly = {}
    # A dask_cudf object will return a list of dask delayed objects using
    # the to_delayed() API. The logic below assumes (otherwise errors)
    # that all inputs are dask_cudf objects and are distributed in the
    # same manner. Ex. inputs_dly:
    #     inputs_dly = {
    #         p0: {
    #             iport0: ddf_dly_i0_p0,
    #             iport1: ddf_dly_i1_p0,
    #             ... for all iports
    #         },
    #         p1: {
    #             iport0: ddf_dly_i0_p1,
    #             iport1: ddf_dly_i1_p1,
    #             ... for all iports
    #         },
    #         ... for all partitions
    #     }
    # i_x - iport
    # p_x - partition index
    npartitions = None
    for iport, dcudf in inputs.items():
        ddf_dly_list = dcudf.to_delayed()
        npartitions_ = len(ddf_dly_list)
        if npartitions is None:
            npartitions = npartitions_
        if npartitions != npartitions_:
            raise Exception(
                'Error DASK_CUDF PARTITIONS MISMATCH: Node "{}" input "{}"'
                ' has {} npartitions and other inputs have {} partitions'
                .format(self.uid, iport, npartitions_, npartitions))
        for idly, dly in enumerate(ddf_dly_list):
            inputs_dly.setdefault(idly, {}).update({
                # iport: dly.persist()  # DON'T PERSIST HERE
                iport: dly
            })

    outputs_dly = {}
    # Formulate a list of delayed objects for each output port to be able
    # to call from_delayed to synthesize a dask_cudf object.
    # Ex. outputs_dly:
    #     outputs_dly = {
    #         o0: [ddf_dly_o0_p0, ddf_dly_o0_p1, ... _pN],
    #         o1: [ddf_dly_o1_p0, ddf_dly_o1_p1, ... _pN],
    #         ... for all output ports
    #     }
    # o_x - output port
    # p_x - delayed partition
    # VERY IMPORTANT TO USE PERSIST:
    # https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.persist
    # Otherwise process will run several times.
    for inputs_ in inputs_dly.values():
        output_df_dly = dask.delayed(self.decorate_process())(inputs_)
        output_df_dly_per = output_df_dly.persist()
        for oport in self._get_output_ports():
            oport_out = dask.delayed(get_pout)(output_df_dly_per, oport)
            outputs_dly.setdefault(oport, []).append(oport_out.persist())

    output_df = {}
    # A dask_cudf object is synthesized from a list of delayed objects.
    # Per outputs_dly above, use the dask_cudf.from_delayed API.
    for oport in self._get_output_ports():
        output_df[oport] = dask_cudf.from_delayed(outputs_dly[oport])

    return output_df
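# A condensed sketch of the per-port unpacking performed by __delayed_call:
# one delayed call per partition returns a dict, and each port is pulled out
# with its own delayed getter before being reassembled into a separate
# dask_cudf frame per port. Assumes a GPU; `compute_ports`, `get_port`, and
# the port names are illustrative:
import cudf
import dask
import dask_cudf
import numpy as np

def compute_ports(df):
    # Stand-in for the node's process(): one dict of outputs per partition.
    return {"evens": df[df["x"] % 2 == 0], "odds": df[df["x"] % 2 == 1]}

def get_port(out, port):
    # Shallow copy, mirroring get_pout above.
    return out[port].copy(deep=False)

ddf = dask_cudf.from_cudf(cudf.DataFrame({"x": np.arange(8)}), npartitions=2)
outs = [dask.delayed(compute_ports)(p).persist() for p in ddf.to_delayed()]
ports = {
    port: dask_cudf.from_delayed(
        [dask.delayed(get_port)(o, port) for o in outs])
    for port in ("evens", "odds")
}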
def renumber_and_segment(df, src_col_names, dst_col_names,
                         preserve_order=False, store_transposed=False):
    if isinstance(src_col_names, list):
        renumber_type = 'legacy'
    elif not (df[src_col_names].dtype == np.int32 or
              df[src_col_names].dtype == np.int64):
        renumber_type = 'legacy'
    else:
        renumber_type = 'experimental'

    renumber_map = NumberMap()
    if not isinstance(src_col_names, list):
        src_col_names = [src_col_names]
        dst_col_names = [dst_col_names]

    # Assign the new src and dst column names to be used in the renumbered
    # dataframe to return (renumbered_src_col_name and
    # renumbered_dst_col_name)
    renumber_map.set_renumbered_col_names(src_col_names, dst_col_names,
                                          df.columns)

    id_type = df[src_col_names[0]].dtype
    if isinstance(df, cudf.DataFrame):
        renumber_map.implementation = NumberMap.SingleGPU(
            df, src_col_names, dst_col_names, renumber_map.id_type,
            store_transposed)
    elif isinstance(df, dask_cudf.DataFrame):
        renumber_map.implementation = NumberMap.MultiGPU(
            df, src_col_names, dst_col_names, renumber_map.id_type,
            store_transposed)
    else:
        raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame")

    if renumber_type == 'legacy':
        indirection_map = renumber_map.implementation.\
            indirection_map(df, src_col_names, dst_col_names)
        df = renumber_map.add_internal_vertex_id(
            df, renumber_map.renumbered_src_col_name, src_col_names,
            drop=True, preserve_order=preserve_order)
        df = renumber_map.add_internal_vertex_id(
            df, renumber_map.renumbered_dst_col_name, dst_col_names,
            drop=True, preserve_order=preserve_order)
    else:
        df = df.rename(
            columns={
                src_col_names[0]: renumber_map.renumbered_src_col_name,
                dst_col_names[0]: renumber_map.renumbered_dst_col_name
            })

    num_edges = len(df)

    is_mnmg = isinstance(df, dask_cudf.DataFrame)

    if is_mnmg:
        client = default_client()
        data = get_distributed_data(df)
        result = [(client.submit(call_renumber,
                                 Comms.get_session_id(),
                                 wf[1],
                                 renumber_map.renumbered_src_col_name,
                                 renumber_map.renumbered_dst_col_name,
                                 num_edges,
                                 is_mnmg,
                                 store_transposed,
                                 workers=[wf[0]]), wf[0])
                  for idx, wf in enumerate(data.worker_to_parts.items())]
        wait(result)

        def get_renumber_map(id_type, data):
            return data[0].astype(id_type)

        def get_segment_offsets(data):
            return data[1]

        def get_renumbered_df(id_type, data):
            data[2][renumber_map.renumbered_src_col_name] = \
                data[2][renumber_map.renumbered_src_col_name]\
                .astype(id_type)
            data[2][renumber_map.renumbered_dst_col_name] = \
                data[2][renumber_map.renumbered_dst_col_name]\
                .astype(id_type)
            return data[2]

        renumbering_map = dask_cudf.from_delayed([
            client.submit(get_renumber_map, id_type, data, workers=[wf])
            for (data, wf) in result
        ])

        list_of_segment_offsets = client.gather([
            client.submit(get_segment_offsets, data, workers=[wf])
            for (data, wf) in result
        ])
        aggregate_segment_offsets = []
        for segment_offsets in list_of_segment_offsets:
            aggregate_segment_offsets.extend(segment_offsets)

        renumbered_df = dask_cudf.from_delayed([
            client.submit(get_renumbered_df, id_type, data, workers=[wf])
            for (data, wf) in result
        ])

        if renumber_type == 'legacy':
            renumber_map.implementation.ddf = indirection_map.merge(
                renumbering_map,
                right_on='original_ids', left_on='global_id',
                how='right').\
                drop(columns=['global_id', 'original_ids'])\
                .rename(columns={'new_ids': 'global_id'})
        else:
            renumber_map.implementation.ddf = renumbering_map.rename(
                columns={'original_ids': '0', 'new_ids': 'global_id'})
        renumber_map.implementation.numbered = True
        return renumbered_df, renumber_map, aggregate_segment_offsets

    else:
        renumbering_map, segment_offsets, renumbered_df = \
            c_renumber.renumber(df,
                                renumber_map.renumbered_src_col_name,
                                renumber_map.renumbered_dst_col_name,
                                num_edges,
                                0,
                                Comms.get_default_handle(),
                                is_mnmg,
                                store_transposed)
        if renumber_type == 'legacy':
            renumber_map.implementation.df = indirection_map.\
                merge(renumbering_map,
                      right_on='original_ids', left_on='id').\
                drop(columns=['id', 'original_ids'])\
                .rename(columns={'new_ids': 'id'}, copy=False)
        else:
            renumber_map.implementation.df = renumbering_map.rename(
                columns={'original_ids': '0', 'new_ids': 'id'}, copy=False)

        renumber_map.implementation.numbered = True
        return renumbered_df, renumber_map, segment_offsets
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic
    # type; see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas()
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))

    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")
    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(
        web_sales_df, left_on=["d_date_sk"], right_on=["ws_sold_date_sk"],
        how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands
    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [
        no_q08_review_sales_amount]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can
    # filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # We don't fuse the filtration task with the read task yet: fusing
    # causes more memory pressure, since we read the whole table at once
    # (and spill it) before doing the filtration.
    ### Below PR has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141
    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0],
                                      wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(
        reduction_function, item_df_filtered.to_delayed()[0], meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(
        split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
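# The two queries above share a per-file delayed-read pattern: one task per
# parquet file (filtering is kept separate from the read to limit memory
# pressure), stitched together with an explicit one-row meta frame. A
# minimal sketch, assuming a GPU; the glob pattern, column name, and
# `read_and_filter` are illustrative:
import glob

import cudf
import dask
import dask_cudf
import numpy as np

def read_and_filter(fn):
    # Read only the needed column, then drop nulls in a separate task.
    df = cudf.read_parquet(fn, columns=["wcs_user_sk"])
    return df.dropna().reset_index(drop=True)

files = glob.glob("web_clickstreams/*.parquet")
meta = cudf.DataFrame({"wcs_user_sk": np.ones(1, dtype=np.int64)})
tasks = [dask.delayed(read_and_filter)(fn) for fn in files]
ddf = dask_cudf.from_delayed(tasks, meta=meta)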
def bfs(graph, start, return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph. The input graph must contain edge list as dask-cudf dataframe
    with one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity
        information as dask cudf edge list dataframe (edge weights are not
        used for this algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id
        df['distance'] gives the path distance from the starting vertex
        (Only if return_distances is True)
        df['predecessor'] gives the vertex it was reached from in the
        traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        start = graph.lookup_internal_vertex_id(
            cudf.Series([start], dtype='int32')).compute()
        start = start.iloc[0]

    result = [client.submit(
        call_bfs,
        Comms.get_session_id(),
        wf[1],
        num_verts,
        num_edges,
        vertex_partition_offsets,
        start,
        return_distances,
        workers=[wf[0]])
        for idx, wf in enumerate(data.worker_to_parts.items())]
    wait(result)

    ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf