def test_rearrange(shuffle, get):
    df = pd.DataFrame({'x': np.random.random(10)})
    ddf = dd.from_pandas(df, npartitions=4)
    ddf2 = ddf.assign(y=ddf.x % 4)

    result = rearrange_by_column(ddf2, 'y', max_branch=32, shuffle=shuffle)
    assert result.npartitions == ddf.npartitions
    assert set(ddf.dask).issubset(result.dask)

    # Every value in exactly one partition
    a = result.compute(get=get)
    parts = get(result.dask, result._keys())
    for i in a.y.drop_duplicates():
        assert sum(i in part.y for part in parts) == 1
def test_rearrange(shuffle, scheduler):
    df = pd.DataFrame({"x": np.random.random(10)})
    ddf = dd.from_pandas(df, npartitions=4)
    ddf2 = ddf.assign(_partitions=ddf.x % 4)

    result = rearrange_by_column(ddf2, "_partitions", max_branch=32, shuffle=shuffle)
    assert result.npartitions == ddf.npartitions
    assert set(ddf.dask).issubset(result.dask)

    # Every value in exactly one partition
    a = result.compute(scheduler=scheduler)
    get = dask.base.get_scheduler(scheduler=scheduler)
    parts = get(result.dask, result.__dask_keys__())
    for i in a._partitions.drop_duplicates():
        assert sum(i in set(part._partitions) for part in parts) == 1
def sort_values(
    df,
    by,
    max_branch=None,
    divisions=None,
    set_divisions=False,
    ignore_index=False,
):
    """Sort by the given list/tuple of column names."""
    npartitions = df.npartitions
    if isinstance(by, tuple):
        by = list(by)
    elif not isinstance(by, list):
        by = [by]

    # Step 1 - Calculate new divisions (if necessary)
    if divisions is None:
        divisions = quantile_divisions(df, by, npartitions)

    # Step 2 - Perform repartitioning shuffle
    meta = df._meta._constructor_sliced([0])
    if not isinstance(divisions, (gd.Series, gd.DataFrame)):
        dtype = df[by[0]].dtype
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    partitions = df[by].map_partitions(
        _set_partitions_pre, divisions=divisions, meta=meta
    )

    df2 = df.assign(_partitions=partitions)
    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle="tasks",
        ignore_index=ignore_index,
    ).drop(columns=["_partitions"])
    df3.divisions = (None,) * (df3.npartitions + 1)

    # Step 3 - Return final sorted df
    df4 = df3.map_partitions(M.sort_values, by)
    if not isinstance(divisions, gd.DataFrame) and set_divisions:
        # Can't have multi-column divisions elsewhere in dask (yet)
        df4.divisions = methods.tolist(divisions)

    return df4
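# A minimal usage sketch for the sort_values above. It assumes a running
# dask-cuda cluster and that sort_values is importable from this module;
# the column names and data are illustrative only, not from the source.
import cudf
import dask_cudf

gdf = cudf.DataFrame({"a": [3, 1, 2, 5, 4], "b": range(5)})
dgdf = dask_cudf.from_cudf(gdf, npartitions=2)

# Shuffle-sort by column "a": each output partition holds a contiguous
# range of "a" values and is sorted internally.
sorted_dgdf = sort_values(dgdf, by="a", set_divisions=True)
print(sorted_dgdf.compute())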
def test_rearrange_disk_cleanup_with_exception():
    # ensure temporary files are cleaned up when there's an internal exception.
    with mock.patch("dask.dataframe.shuffle.shuffle_group_3", new=mock_shuffle_group_3):
        df = pd.DataFrame({"x": np.random.random(10)})
        ddf = dd.from_pandas(df, npartitions=4)
        ddf2 = ddf.assign(_partitions=ddf.x % 4)

        tmpdir = tempfile.mkdtemp()
        with dask.config.set(temporary_directory=str(tmpdir)):
            with pytest.raises(ValueError, match="Mock exception!"):
                result = rearrange_by_column(
                    ddf2, "_partitions", max_branch=32, shuffle="disk"
                )
                result.compute(scheduler="processes")

    assert len(os.listdir(tmpdir)) == 0
def shuffle(dg, transposed=False, prows=None, pcols=None):
    """
    Shuffles the renumbered input distributed graph edgelist into
    ngpu partitions. The number of processes/gpus P = prows*pcols.
    The 2D partitioning divides the matrix into P*pcols rectangular
    partitions as per vertex partitioning performed in renumbering,
    and then shuffles these partitions into P gpus.
    """
    ddf = dg.edgelist.edgelist_df
    ngpus = get_n_workers()
    if prows is None and pcols is None:
        prows, pcols = get_2D_div(ngpus)
    else:
        if prows is not None and pcols is not None:
            if ngpus != prows * pcols:
                raise Exception(
                    "prows*pcols should be equal to the number of processes"
                )
        elif prows is not None:
            if ngpus % prows != 0:
                raise Exception(
                    "prows must be a factor of the number of processes"
                )
            pcols = int(ngpus / prows)
        elif pcols is not None:
            if ngpus % pcols != 0:
                raise Exception(
                    "pcols must be a factor of the number of processes"
                )
            prows = int(ngpus / pcols)

    renumber_vertex_count = dg.renumber_map.implementation.ddf.map_partitions(
        len
    ).compute()
    renumber_vertex_cumsum = renumber_vertex_count.cumsum()

    src_dtype = ddf['src'].dtype
    dst_dtype = ddf['dst'].dtype

    vertex_row_partitions = cudf.Series([0], dtype=src_dtype)
    vertex_row_partitions = vertex_row_partitions.append(
        cudf.Series(renumber_vertex_cumsum, dtype=src_dtype)
    )
    num_verts = vertex_row_partitions.iloc[-1]

    vertex_col_partitions = []
    for i in range(pcols + 1):
        vertex_col_partitions.append(vertex_row_partitions.iloc[i * prows])
    vertex_col_partitions = cudf.Series(vertex_col_partitions, dtype=dst_dtype)

    meta = ddf._meta._constructor_sliced([0])
    partitions = ddf.map_partitions(
        _set_partitions_pre,
        vertex_row_partitions=vertex_row_partitions,
        vertex_col_partitions=vertex_col_partitions,
        prows=prows,
        pcols=pcols,
        transposed=transposed,
        meta=meta,
    )
    ddf2 = ddf.assign(_partitions=partitions)
    ddf3 = rearrange_by_column(
        ddf2,
        "_partitions",
        max_branch=None,
        npartitions=ngpus,
        shuffle="tasks",
        ignore_index=True,
    ).drop(columns=["_partitions"])
    return ddf3, num_verts, vertex_row_partitions
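# Hedged sketch of how a get_2D_div-style helper could pick a near-square
# prows x pcols grid for ngpus workers. The real helper is not shown in this
# section, so this implementation is an assumption for illustration only.
import math

def get_2D_div_sketch(ngpus):
    # The largest factor of ngpus that is <= sqrt(ngpus) becomes prows,
    # so the grid is as close to square as the factorization allows.
    prows = 1
    for f in range(1, int(math.isqrt(ngpus)) + 1):
        if ngpus % f == 0:
            prows = f
    return prows, ngpus // prows

assert get_2D_div_sketch(8) == (2, 4)
assert get_2D_div_sketch(16) == (4, 4)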
def rearrange_by_hash(df, columns, npartitions, max_branch=None, ignore_index=True):
    if npartitions and npartitions != df.npartitions:
        # Use main-line dask for new npartitions
        meta = df._meta._constructor_sliced([0])
        partitions = df[columns].map_partitions(
            set_partitions_hash, columns, npartitions, meta=meta
        )
        # Note: Dask will use a shallow copy for assign
        df2 = df.assign(_partitions=partitions)
        return rearrange_by_column(
            df2,
            "_partitions",
            shuffle="tasks",
            max_branch=max_branch,
            npartitions=npartitions,
            ignore_index=ignore_index,
        )

    n = df.npartitions
    if max_branch is False:
        stages = 1
    else:
        max_branch = max_branch or 32
        stages = int(math.ceil(math.log(n) / math.log(max_branch)))

    if stages > 1:
        k = int(math.ceil(n ** (1 / stages)))
    else:
        k = n

    if isinstance(columns, str):
        columns = [columns]
    elif isinstance(columns, tuple):
        columns = list(columns)

    groups = []
    splits = []
    combines = []

    inputs = [
        tuple(digit(i, j, k) for j in range(stages)) for i in range(k**stages)
    ]

    token = tokenize(df, columns, max_branch)

    start = {
        ("shuffle-combine-" + token, 0, inp): (df._name, i)
        if i < df.npartitions
        else df._meta
        for i, inp in enumerate(inputs)
    }

    for stage in range(1, stages + 1):
        group = {  # Convert partition into dict of dataframe pieces
            ("shuffle-group-" + token, stage, inp): (
                _shuffle_group,
                ("shuffle-combine-" + token, stage - 1, inp),
                columns,
                stage - 1,
                k,
                n,
                ignore_index,
            )
            for inp in inputs
        }

        split = {  # Get out each individual dataframe piece from the dicts
            ("shuffle-split-" + token, stage, i, inp): (
                getitem,
                ("shuffle-group-" + token, stage, inp),
                i,
            )
            for i in range(k)
            for inp in inputs
        }

        combine = {  # concatenate those pieces together, with their friends
            ("shuffle-combine-" + token, stage, inp): (
                _concat,
                [
                    (
                        "shuffle-split-" + token,
                        stage,
                        inp[stage - 1],
                        insert(inp, stage - 1, j),
                    )
                    for j in range(k)
                ],
                ignore_index,
            )
            for inp in inputs
        }
        groups.append(group)
        splits.append(split)
        combines.append(combine)

    end = {
        ("shuffle-" + token, i): ("shuffle-combine-" + token, stages, inp)
        for i, inp in enumerate(inputs)
    }

    dsk = toolz.merge(start, end, *(groups + splits + combines))
    graph = HighLevelGraph.from_collections("shuffle-" + token, dsk, dependencies=[df])
    df2 = df.__class__(graph, "shuffle-" + token, df, df.divisions)
    df2.divisions = (None,) * (df.npartitions + 1)

    return df2
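# Worked example of the multi-stage shuffle parameters computed above: with n
# input partitions and a branching limit of max_branch, the shuffle runs
# `stages` rounds, each splitting every partition k ways. The numbers below
# are illustrative, not from the source.
import math

n, max_branch = 1000, 32
stages = int(math.ceil(math.log(n) / math.log(max_branch)))  # -> 2
k = int(math.ceil(n ** (1 / stages)))                        # -> 32
print(stages, k)  # 2 32: two rounds of 32-way splits reach 32**2 >= 1000 outputs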
def pagerank(
    input_graph,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-5,
    nstart=None,
):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the PageRank using the power method.
    The input graph must contain the edge list as a dask_cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as a dask_cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.

    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85. Thus, 1.0 - alpha is the
        probability to "teleport" to a random vertex. Alpha should be greater
        than 0.0 and strictly lower than 1.0.

    personalization : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the personalization information.
        Currently not supported.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned. If this
        value is lower than or equal to 0, cuGraph will use the default value,
        which is 30.

    tol : float, optional (default=1.0e-5)
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value. The lower the tolerance, the better the
        approximation. If this value is 0.0f, cuGraph will use the default
        value, which is 1.0e-5. Setting too small a tolerance can lead to
        non-convergence due to numerical roundoff. Usually values between
        0.01 and 0.00001 are acceptable.

    nstart : not supported
        initial guess for pagerank

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> # see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.pagerank(dg)
    """
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    if personalization is not None:
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex"
            )

        # Function to assign partition id to personalization dataframe
        def _set_partitions_pre(s, divisions):
            partitions = divisions.searchsorted(s, side="right") - 1
            partitions[
                divisions.tail(1).searchsorted(s, side="right").astype("bool")
            ] = len(divisions) - 2
            return partitions

        # Assign partition id column as per vertex_partition_offsets
        df = personalization
        by = ["vertex"]
        meta = df._meta._constructor_sliced([0])
        divisions = vertex_partition_offsets
        partitions = df[by].map_partitions(
            _set_partitions_pre, divisions=divisions, meta=meta
        )

        df2 = df.assign(_partitions=partitions)

        # Shuffle personalization values according to the partition id
        df3 = rearrange_by_column(
            df2,
            "_partitions",
            max_branch=None,
            npartitions=len(divisions) - 1,
            shuffle="tasks",
            ignore_index=False,
        ).drop(columns=["_partitions"])

        p_data = get_distributed_data(df3)

        result = [
            client.submit(
                call_pagerank,
                Comms.get_session_id(),
                wf[1],
                src_col_name,
                dst_col_name,
                num_verts,
                num_edges,
                vertex_partition_offsets,
                input_graph.aggregate_segment_offsets,
                alpha,
                max_iter,
                tol,
                p_data.worker_to_parts[wf[0]][0],
                nstart,
                workers=[wf[0]],
            )
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    else:
        result = [
            client.submit(
                call_pagerank,
                Comms.get_session_id(),
                wf[1],
                src_col_name,
                dst_col_name,
                num_verts,
                num_edges,
                vertex_partition_offsets,
                input_graph.aggregate_segment_offsets,
                alpha,
                max_iter,
                tol,
                personalization,
                nstart,
                workers=[wf[0]],
            )
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
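# Small numeric illustration of the searchsorted partition assignment used for
# the personalization dataframe above, reproduced with NumPy for readability;
# the real code operates on cudf Series, and the sample values are assumptions.
import numpy as np

divisions = np.array([0, 4, 8, 12])      # vertex_partition_offsets for 3 parts
vertices = np.array([0, 3, 4, 11, 12])   # sample personalization vertices

parts = divisions.searchsorted(vertices, side="right") - 1
# A vertex equal to (or past) the last offset is clamped into the final partition.
parts[vertices >= divisions[-1]] = len(divisions) - 2
print(parts)  # [0 0 1 2 2]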
def shuffle(dg, transposed=False):
    """
    Shuffles the renumbered input distributed graph edgelist into
    ngpu partitions. The number of processes/gpus P = prows*pcols.
    The 2D partitioning divides the matrix into P*pcols rectangular
    partitions as per vertex partitioning performed in renumbering,
    and then shuffles these partitions into P gpus.

    Parameters
    ----------
    transposed : bool, optional (default=False)
    """
    ddf = dg.edgelist.edgelist_df
    ngpus = Comms.get_n_workers()
    prows, pcols, partition_type = Comms.get_2D_partition()

    renumber_vertex_count = dg.renumber_map.implementation.ddf.map_partitions(
        len
    ).compute()
    renumber_vertex_cumsum = renumber_vertex_count.cumsum()

    if transposed:
        row_dtype = ddf['dst'].dtype
        col_dtype = ddf['src'].dtype
    else:
        row_dtype = ddf['src'].dtype
        col_dtype = ddf['dst'].dtype

    vertex_partition_offsets = cudf.Series([0], dtype=row_dtype)
    vertex_partition_offsets = vertex_partition_offsets.append(
        cudf.Series(renumber_vertex_cumsum, dtype=row_dtype)
    )
    num_verts = vertex_partition_offsets.iloc[-1]

    if partition_type == 1:
        vertex_row_partitions = []
        for i in range(prows + 1):
            vertex_row_partitions.append(vertex_partition_offsets.iloc[i * pcols])
        vertex_row_partitions = cudf.Series(vertex_row_partitions, dtype=row_dtype)
    else:
        vertex_row_partitions = vertex_partition_offsets

    vertex_col_partitions = []
    for i in range(pcols + 1):
        vertex_col_partitions.append(vertex_partition_offsets.iloc[i * prows])
    vertex_col_partitions = cudf.Series(vertex_col_partitions, dtype=col_dtype)

    meta = ddf._meta._constructor_sliced([0])
    partitions = ddf.map_partitions(
        _set_partitions_pre,
        vertex_row_partitions=vertex_row_partitions,
        vertex_col_partitions=vertex_col_partitions,
        prows=prows,
        pcols=pcols,
        transposed=transposed,
        partition_type=partition_type,
        meta=meta,
    )
    ddf2 = ddf.assign(_partitions=partitions)
    ddf3 = rearrange_by_column(
        ddf2,
        "_partitions",
        max_branch=None,
        npartitions=ngpus,
        shuffle="tasks",
        ignore_index=True,
    ).drop(columns=["_partitions"])

    partition_row_size = pcols
    partition_col_size = prows

    return (
        ddf3,
        num_verts,
        partition_row_size,
        partition_col_size,
        vertex_partition_offsets,
    )
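# Hedged sketch of the 2D partition-id computation that a helper like the
# _set_partitions_pre used above performs for edges. The actual cuGraph helper
# is not shown in this section, so the function name, signature, and numbers
# below are assumptions for illustration; cells are numbered row-major.
import numpy as np

def edge_partition_ids(src, dst, row_offsets, col_offsets, pcols):
    # Locate each endpoint's block with searchsorted, then flatten the
    # (row_block, col_block) grid coordinate into a single gpu id.
    row = np.searchsorted(row_offsets, src, side="right") - 1
    col = np.searchsorted(col_offsets, dst, side="right") - 1
    return row * pcols + col

row_offsets = np.array([0, 4, 8])   # prows = 2 row blocks
col_offsets = np.array([0, 8])      # pcols = 1 column block
src = np.array([1, 5, 7])
dst = np.array([2, 3, 6])
print(edge_partition_ids(src, dst, row_offsets, col_offsets, pcols=1))  # [0 1 1]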
def sort_values(
    df,
    by,
    max_branch=None,
    divisions=None,
    set_divisions=False,
    ignore_index=False,
    ascending=True,
    na_position="last",
    sort_function=None,
    sort_function_kwargs=None,
):
    """Sort by the given list/tuple of column names."""
    if not isinstance(ascending, bool):
        raise ValueError("ascending must be either True or False")
    if na_position not in ("first", "last"):
        raise ValueError("na_position must be either 'first' or 'last'")

    npartitions = df.npartitions
    if isinstance(by, tuple):
        by = list(by)
    elif not isinstance(by, list):
        by = [by]

    # parse custom sort function / kwargs if provided
    sort_kwargs = {
        "by": by,
        "ascending": ascending,
        "na_position": na_position,
    }
    if sort_function is None:
        sort_function = M.sort_values
    if sort_function_kwargs is not None:
        sort_kwargs.update(sort_function_kwargs)

    # handle single partition case
    if npartitions == 1:
        return df.map_partitions(sort_function, **sort_kwargs)

    # Step 1 - Calculate new divisions (if necessary)
    if divisions is None:
        divisions = quantile_divisions(df, by, npartitions)

    # Step 2 - Perform repartitioning shuffle
    meta = df._meta._constructor_sliced([0])
    if not isinstance(divisions, (gd.Series, gd.DataFrame)):
        dtype = df[by[0]].dtype
        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)

    partitions = df[by].map_partitions(
        _set_partitions_pre,
        divisions=divisions,
        ascending=ascending,
        na_position=na_position,
        meta=meta,
    )

    df2 = df.assign(_partitions=partitions)
    df3 = rearrange_by_column(
        df2,
        "_partitions",
        max_branch=max_branch,
        npartitions=len(divisions) - 1,
        shuffle="tasks",
        ignore_index=ignore_index,
    ).drop(columns=["_partitions"])
    df3.divisions = (None,) * (df3.npartitions + 1)

    # Step 3 - Return final sorted df
    df4 = df3.map_partitions(sort_function, **sort_kwargs)
    if not isinstance(divisions, gd.DataFrame) and set_divisions:
        # Can't have multi-column divisions elsewhere in dask (yet)
        df4.divisions = methods.tolist(divisions)

    return df4
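# A possible usage sketch for the extended sort_values above, showing the
# custom sort_function hook. The data and the choice of
# cudf.DataFrame.sort_values as the custom function are illustrative
# assumptions (it matches what the default M.sort_values dispatches to).
import cudf
import dask_cudf

gdf = cudf.DataFrame({"a": [3, None, 2, 5, 4], "b": range(5)})
dgdf = dask_cudf.from_cudf(gdf, npartitions=2)

result = sort_values(
    dgdf,
    by="a",
    ascending=False,
    na_position="first",
    sort_function=cudf.DataFrame.sort_values,  # called per partition with sort_kwargs
)
print(result.compute())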