def compute_local_data(self, by, load_balance=True):
    """
    Compute and cache the per-worker local data of a distributed graph.

    The actual work is delegated to ``get_local_data``; its result is stored
    on the graph as ``self.local_data`` together with the column it was
    partitioned by, so that later algorithm calls can reuse it.

    Parameters
    ----------
    by : str
        Column by which the data is sorted and partitioned. It should be the
        source column name for generating CSR format and the destination
        column name for generating CSC format.
    load_balance : bool, optional, default=True
        Set as True to perform load balancing after global sorting of the
        dask-cudf DataFrame, so that the data is uniformly distributed among
        multiple GPUs. Forwarded unchanged to ``get_local_data``.

    Raises
    ------
    Exception
        If the graph is not distributed (``self.distributed`` is falsy).
    """
    if not self.distributed:
        raise Exception('Graph should be a distributed graph')

    # Built before assignment so that a failure in get_local_data leaves any
    # previously cached local_data untouched.
    self.local_data = {
        'data': get_local_data(self, by, load_balance),
        'by': by,
    }
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph graph
        Distributed graph populated from a dask-cudf edge list. The graph
        type is not validated here (see the FIXME in the body); inputs are
        assumed to be symmetric.
    max_iter : int, optional, default=100
        Forwarded unchanged to the per-worker Louvain call.
        NOTE(review): presumably the iteration cap -- confirm against
        call_louvain.
    resolution : float, optional, default=1.0
        Forwarded unchanged to the per-worker Louvain call.
    load_balance : bool, optional, default=True
        Passed to ``get_local_data`` when the graph has no cached local data
        partitioned by 'src'. Ignored when cached data is reused.

    Returns
    -------
    parts
        First element of the result produced by the rank-0 worker. When the
        graph is renumbered it is passed through
        ``input_graph.unrenumber(parts, "vertex")``; note that no
        ``.compute()`` is invoked on that value here.
    modularity_score
        Second element of the result produced by the rank-0 worker.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: the Graph class cannot be imported at module level because of a
    # circular import: cugraph->louvain wrapper->cugraph/structure->
    # cugraph/dask->dask/louvain->cugraph/structure.
    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes, so the "input graph must be undirected" check is
    # disabled for now and inputs are assumed to be symmetric DiGraphs.

    client = default_client()

    # Reuse cached local data only if it was partitioned by source vertex.
    cached = input_graph.local_data
    if cached is None or cached['by'] != 'src':
        data = get_local_data(input_graph,
                              by='src',
                              load_balance=load_balance)
    else:
        data = cached['data']

    # One task per worker, pinned to that worker and keyed by its rank.
    futures = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures[rank] = client.submit(call_louvain,
                                      Comms.get_session_id(),
                                      worker_parts,
                                      data.local_data,
                                      max_iter,
                                      resolution,
                                      workers=[worker])
    wait(futures)

    # Only the result held by rank 0 is returned to the caller.
    parts, modularity_score = futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None,
             load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices

        Both columns are passed through ``null_check``. When the graph is
        renumbered, the 'vertex' column is translated to internal vertex ids
        before dispatch.
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
    tol : float
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. Whatever is passed is overwritten with
        None before the computation is dispatched.
    load_balance : bool
        Set as True to perform load_balancing after global sorting of
        dask-cudf DataFrame. This ensures that the data is uniformly
        distributed among multiple GPUs to avoid over-loading. Only used when
        the graph has no cached local data partitioned by 'dst'.

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    # An initial guess is not supported: the argument is discarded.
    nstart = None

    client = default_client()

    # Reuse cached local data only if it was partitioned by destination.
    cached = input_graph.local_data
    if cached is None or cached['by'] != 'dst':
        data = get_local_data(input_graph,
                              by='dst',
                              load_balance=load_balance)
    else:
        data = cached['data']

    if personalization is not None:
        for column in ("vertex", "values"):
            null_check(personalization[column])
        # Identity check against True is kept exactly as in the original.
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    # One task per worker, pinned to that worker and keyed by its rank.
    futures = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures[rank] = client.submit(call_pagerank,
                                      Comms.get_session_id(),
                                      worker_parts,
                                      data.local_data,
                                      alpha,
                                      max_iter,
                                      tol,
                                      personalization,
                                      nstart,
                                      workers=[worker])
    wait(futures)

    # Only the result held by rank 0 is returned to the caller.
    if not input_graph.renumbered:
        return futures[0].result()

    scores = futures[0].result()
    return input_graph.unrenumber(scores, 'vertex').compute()
def bfs(graph, start, return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
        When the graph is renumbered, the value is translated to the
        internal vertex id before the search is dispatched.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex (Only if return_distances is True)

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal. For renumbered graphs, missing
        predecessors are reported as -1.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf)
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """
    client = default_client()

    # Reuse cached local data only if it was partitioned by source vertex.
    cached = graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(graph, by='src')

    if graph.renumbered:
        # Map the caller-facing vertex id to the internal id.
        start = graph.lookup_internal_vertex_id(
            cudf.Series([start])).compute()
        start = start.iloc[0]

    # One task per worker, pinned to that worker and keyed by its rank.
    result = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        result[rank] = client.submit(call_bfs,
                                     Comms.get_session_id(),
                                     worker_parts,
                                     data.local_data,
                                     start,
                                     return_distances,
                                     workers=[worker])
    wait(result)

    # Only the result held by rank 0 is returned to the caller.
    df = result[0].result()

    if graph.renumbered:
        df = graph.unrenumber(df, 'vertex').compute()
        df = graph.unrenumber(df, 'predecessor').compute()
        # Assign the filled column back explicitly: an in-place fillna on the
        # Series returned by df["predecessor"] is only visible in df if that
        # Series shares storage with the frame, which is not guaranteed.
        df["predecessor"] = df["predecessor"].fillna(-1)

    return df