def bfs(graph, start, return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.

    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the starting vertex
        (Only if return_distances is True)

        df['predecessor'] gives the vertex it was reached from in the
        traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """
    client = default_client()

    # Renumber and shuffle the edge list so each worker holds the
    # partitions it will run BFS on.
    graph.compute_renumber_edge_list(transposed=False)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # Translate the caller's start vertex into its internal (renumbered) id.
    if graph.renumbered:
        start = graph.lookup_internal_vertex_id(
            cudf.Series([start], dtype='int32')).compute()
        start = start.iloc[0]

    # Launch one BFS task per worker over that worker's partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(client.submit(call_bfs,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     start,
                                     return_distances,
                                     workers=[worker]))
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)

    # Map internal ids back to the caller's original vertex ids; vertices
    # with no predecessor are filled with -1.
    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
def katz_centrality(input_graph, alpha=None, beta=None, max_iter=100,
                    tol=1.0e-5, nstart=None, normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph G.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information. The graph can
        contain either directed (DiGraph) or undirected edges (Graph).
    alpha : float
        Attenuation factor defaulted to None. If alpha is not specified then
        it is internally calculated as 1/(degree_max) where degree_max is the
        maximum out degree.

        NOTE : The maximum acceptable value of alpha for convergence
        alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue
        of the graph. Since lambda_max is always lesser than or equal to
        degree_max for a graph, alpha_max will always be greater than or
        equal to (1/degree_max). Therefore, setting alpha to (1/degree_max)
        will guarantee that it will never exceed alpha_max thus in turn
        fulfilling the requirement for convergence.
    beta : None
        A weight scalar - currently Not Supported
    max_iter : int
        The maximum number of iterations before an answer is returned. This
        can be used to limit the execution time and do an early exit before
        the solver reaches the convergence tolerance. If this value is lower
        or equal to 0 cuGraph will use the default value, which is 100.
    tol : float
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value. The lower the tolerance the better the
        approximation. If this value is 0.0f, cuGraph will use the default
        value which is 1.0e-6. Setting too small a tolerance can lead to
        non-convergence due to numerical roundoff. Usually values between
        1e-2 and 1e-6 are acceptable.
    nstart : dask_cudf.Dataframe
        GPU Dataframe containing the initial guess for katz centrality.
        Currently Not Supported - any value passed is ignored.

        nstart['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        nstart['values'] : dask_cudf.Series
            Contains the katz centrality values of vertices
    normalized : bool
        If True normalize the resulting katz centrality values

    Returns
    -------
    katz_centrality : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the vertex
        identifiers and the corresponding katz centrality values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['katz_centrality'] : dask_cudf.Series
            Contains the katz centrality of vertices

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.katz_centrality(dg)
    >>> Comms.destroy()
    """
    # FIXME: an initial guess is not yet supported by the multi-GPU
    # implementation; the user-supplied nstart is deliberately overridden so
    # the lower layer never receives one.
    nstart = None

    client = default_client()

    # Renumber and shuffle (transposed for centrality) so each worker holds
    # its partitions of the edge list.
    input_graph.compute_renumber_edge_list(transposed=True)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # One katz task per worker over that worker's partitions.
    result = [
        client.submit(call_katz_centrality,
                      Comms.get_session_id(),
                      wf[1],
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      alpha,
                      beta,
                      max_iter,
                      tol,
                      nstart,
                      normalized,
                      workers=[wf[0]])
        for idx, wf in enumerate(data.worker_to_parts.items())
    ]
    wait(result)

    ddf = dask_cudf.from_delayed(result)

    # Map internal ids back to the caller's original vertex ids.
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
def sssp(graph, source):
    """
    Compute the distance and predecessors for shortest paths from the
    specified source to all the vertices in the graph.

    The distances column will store the distance from the source to each
    vertex. The predecessors column will store each vertex's predecessor in
    the shortest path. Vertices that are unreachable will have a distance of
    infinity denoted by the maximum value of the data type and the
    predecessor set as -1. The source vertex's predecessor is also set to -1.

    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Specify source vertex

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the starting vertex

        df['predecessor'] gives the vertex id it was reached from in the
        traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    >>> Comms.destroy()
    """
    client = default_client()

    # Renumber and shuffle so each worker holds the partitions it will
    # run SSSP on.
    graph.compute_renumber_edge_list(transposed=False)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # Translate the caller's source vertex into its internal (renumbered) id.
    if graph.renumbered:
        internal_id = graph.lookup_internal_vertex_id(
            cudf.Series([source], dtype='int32')).compute()
        source = internal_id.iloc[0]

    # Launch one SSSP task per worker over that worker's partitions.
    futures = [client.submit(call_sssp,
                             Comms.get_session_id(),
                             parts,
                             num_verts,
                             num_edges,
                             vertex_partition_offsets,
                             source,
                             workers=[worker])
               for worker, parts in data.worker_to_parts.items()]
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)

    # Map internal ids back to the caller's original vertex ids; vertices
    # with no predecessor are filled with -1.
    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    if is_cuda_version_less_than((10, 2)):
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes. Disable the Graph check for now and assume inputs
    # are symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True

    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # Launch one Louvain task per worker over that worker's partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(client.submit(call_louvain,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     sorted_by_degree,
                                     max_iter,
                                     resolution,
                                     workers=[worker]))
    wait(futures)

    # Each future holds a (DataFrame, mod_score) tuple; unpack with separate
    # client.submit calls applying op.getitem.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not
    # be optimal.
    df_futures = [client.submit(op.getitem, f, 0) for f in futures]
    mod_score_futures = [client.submit(op.getitem, f, 1) for f in futures]

    ddf = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        ddf = input_graph.unrenumber(ddf, "vertex")

    return (ddf, mod_score)
def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100,
             tol=1.0e-5, nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
    tol : float
        Set the tolerance of the approximation; this parameter should be a
        small magnitude value. The lower the tolerance the better the
        approximation. If this value is 0.0f, cuGraph will use the default
        value which is 1.0E-5. Setting too small a tolerance can lead to
        non-convergence due to numerical roundoff. Usually values between
        0.01 and 0.00001 are acceptable.
    nstart : not supported
        initial guess for pagerank (currently ignored)

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    # Local import avoids a circular import at module load time.
    from cugraph.structure.graph import null_check

    # An initial guess is not supported by the MG implementation; any
    # user-supplied nstart is deliberately discarded.
    nstart = None

    client = default_client()

    # Renumber and shuffle (transposed for pagerank) so each worker holds
    # its partitions of the edge list.
    input_graph.compute_renumber_edge_list(transposed=True)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # When personalization is given, validate it, renumber its vertex ids,
    # and distribute it so each worker receives its own chunk.
    if personalization is None:
        pers_chunks = None
    else:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)
        pers_chunks = p_data.worker_to_parts

    # Launch one pagerank task per worker over that worker's partitions,
    # passing that worker's personalization chunk (or None).
    futures = []
    for worker, parts in data.worker_to_parts.items():
        pers_arg = None if pers_chunks is None else pers_chunks[worker][0]
        futures.append(client.submit(call_pagerank,
                                     Comms.get_session_id(),
                                     parts,
                                     num_verts,
                                     num_edges,
                                     vertex_partition_offsets,
                                     alpha,
                                     max_iter,
                                     tol,
                                     pers_arg,
                                     nstart,
                                     workers=[worker]))
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)

    # Map internal ids back to the caller's original vertex ids.
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/
    # structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes. Disable the Graph check for now and assume inputs
    # are symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True

    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # One Louvain task per worker, keyed by worker rank so the result from
    # rank 0 can be retrieved below.
    result = {data.worker_info[worker]["rank"]:
              client.submit(call_louvain,
                            Comms.get_session_id(),
                            wparts,
                            num_verts,
                            num_edges,
                            partition_row_size,
                            partition_col_size,
                            vertex_partition_offsets,
                            sorted_by_degree,
                            max_iter,
                            resolution,
                            workers=[worker])
              for worker, wparts in data.worker_to_parts.items()}
    wait(result)

    # Every rank computes the same (parts, score) pair; take rank 0's.
    (parts, modularity_score) = result[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score