def to_undirected(self, G):
    """
    Fill ``G`` with an undirected version of this graph.

    The renumbering flag and the renumber map are always copied onto ``G``.
    If this graph is already undirected, its edge list, adjacency list and
    transposed adjacency list objects are assigned to ``G`` as they are.
    Otherwise the ``src``/``dst`` (and, when present, ``weights``) columns
    of the edge list are passed through ``symmetrize`` and a new edge list
    is stored on ``G``.

    Parameters
    ----------
    G : graph object
        Destination graph. It is modified in place; nothing is returned.
    """
    G.properties.renumbered = self.properties.renumbered
    G.renumber_map = self.renumber_map

    if self.properties.directed is False:
        # Already undirected: hand over the existing representations.
        for attr in ("edgelist", "adjlist", "transposedadjlist"):
            setattr(G, attr, getattr(self, attr))
        return

    frame = self.edgelist.edgelist_df
    weights = None
    if self.edgelist.weights:
        src, dst, weights = symmetrize(
            frame["src"], frame["dst"], frame["weights"])
    else:
        src, dst = symmetrize(frame["src"], frame["dst"])

    G.edgelist = simpleGraphImpl.EdgeList(src, dst, weights)
def to_undirected(self):
    """
    Return an undirected version of the graph.

    A ``Graph`` instance is returned unchanged (the same object, not a
    copy). A ``DiGraph`` instance is converted: its edge list is
    symmetrized and stored in a newly created ``Graph`` that also carries
    over the renumbering flag and renumber map. For any other type the
    method returns ``None``.

    Returns
    -------
    G : Graph
        An undirected graph with the same nodes, and each directed edge
        (u,v,weights) replaced by an undirected edge (u,v,weights).

    Raises
    ------
    Exception
        If the graph is distributed.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> DiG = cugraph.DiGraph()
    >>> DiG.from_cudf_edgelist(M, '0', '1')
    >>> G = DiG.to_undirected()
    """
    if self.distributed:
        raise Exception("Not supported for distributed graph")

    # Exact type checks on purpose: subclasses are not treated as matches.
    kind = type(self)
    if kind is Graph:
        return self
    if kind is not DiGraph:
        return None

    undirected = Graph()
    frame = self.edgelist.edgelist_df
    undirected.renumbered = self.renumbered
    undirected.renumber_map = self.renumber_map

    weights = None
    if self.edgelist.weights:
        src, dst, weights = symmetrize(
            frame["src"], frame["dst"], frame["weights"]
        )
    else:
        src, dst = symmetrize(frame["src"], frame["dst"])

    undirected.edgelist = Graph.EdgeList(src, dst, weights)
    return undirected
def from_cudf_edgelist(self, input_df, source='source',
                       destination='destination', edge_attr=None,
                       renumber=True):
    """
    Build this graph's edge list from a cudf DataFrame.

    It is an error to call this method on a Graph that already holds an
    edge list or an adjacency list. The source, destination and
    (optional) weight columns are looked up in ``input_df`` by name.
    Unless the graph is flagged as symmetrized or multi, the columns are
    passed through ``symmetrize`` before the edge list is stored.

    Parameters
    ----------
    input_df : cudf.DataFrame
        Edge list of size E (E: number of edges). It must contain the
        columns named by ``source`` and ``destination`` and, for weighted
        graphs, the column(s) named by ``edge_attr``.
        Source and destination indices must be in the range [0, V)
        (V: number of vertices) unless ``renumber`` is True.
    source : str or list of str
        Name of the source column. A list of names (multi column ids) is
        only accepted together with ``renumber=True``.
    destination : str or list of str
        Name of the destination column. A list of names (multi column
        ids) is only accepted together with ``renumber=True``.
    edge_attr : str or list of str, optional
        Name of the weights column. For a multi graph this must be a list
        of column names.
    renumber : bool
        If source and destination indices are not in range 0 to V where V
        is number of vertices, renumber argument should be True.

    Raises
    ------
    Exception
        If the graph already has values, if ``edge_attr`` is not a list
        on a multi graph, or if multi column ids are given with
        ``renumber=False``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2',
                             renumber=False)
    """
    if not (self.edgelist is None and self.adjlist is None):
        raise Exception('Graph already has values')

    # Edge values: a dict of columns for multi graphs, one column otherwise.
    if self.multi:
        if type(edge_attr) is not list:
            raise Exception('edge_attr should be a list of column names')
        value_col = {name: input_df[name] for name in edge_attr}
    elif edge_attr is None:
        value_col = None
    else:
        value_col = input_df[edge_attr]

    multi_column_ids = type(source) is list and type(destination) is list
    renumber_map = None
    if not renumber:
        if multi_column_ids:
            raise Exception('set renumber to True for multi column ids')
        source_col = input_df[source]
        dest_col = input_df[destination]
    else:
        if multi_column_ids:
            source_col, dest_col, renumber_map = multi_rnb(
                input_df, source, destination)
        else:
            source_col, dest_col, renumber_map = rnb(
                input_df[source], input_df[destination])
        self.renumbered = True

    if not (self.symmetrized or self.multi):
        if value_col is None:
            source_col, dest_col = symmetrize(source_col, dest_col)
        else:
            source_col, dest_col, value_col = symmetrize(
                source_col, dest_col, value_col)

    self.edgelist = Graph.EdgeList(source_col, dest_col, value_col,
                                   renumber_map)
def from_cudf_edgelist(
    self,
    input_df,
    source="source",
    destination="destination",
    edge_attr=None,
    renumber=True,
):
    """
    Initialize a graph from the edge list. It is an error to call this
    method on an initialized Graph object. The source argument is the
    source column name and the destination argument is the destination
    column name; both are looked up in input_df.

    By default, renumbering is enabled to map the source and destination
    vertices into an index in the range [0, V) where V is the number
    of vertices. If the input vertices are a single column of integers
    in the range [0, V), renumbering can be disabled and the original
    external vertex ids will be used.

    If weights are present, edge_attr argument is the weights column name.

    Parameters
    ----------
    input_df : cudf.DataFrame or dask_cudf.DataFrame
        Edge list of size E (E: number of edges). It must contain the
        columns named by ``source`` and ``destination`` and, for weighted
        graphs, the column(s) named by ``edge_attr``.
        If a dask_cudf.DataFrame is passed it will be reinterpreted as
        a cudf.DataFrame (it is computed and its index is reset). For the
        distributed path please use from_dask_cudf_edgelist.
    source : str or list of str
        Name of the source column. A list of names (multi column ids) is
        only accepted together with ``renumber=True``.
    destination : str or list of str
        Name of the destination column. A list of names (multi column
        ids) is only accepted together with ``renumber=True``.
    edge_attr : str or list of str, optional
        Name of the weights column. For a multi graph this must be a list
        of column names.
    renumber : bool
        If source and destination indices are not in range 0 to V where V
        is number of vertices, renumber argument should be True.

    Raises
    ------
    Exception
        If the graph already has values, if ``input_df`` is neither a
        cudf nor a dask_cudf DataFrame, if the edge list exceeds the
        single-GPU size limit, if multi column ids are given with
        ``renumber=False``, or if ``edge_attr`` is not a list on a multi
        graph.

    Examples
    --------
    >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    >>>                    dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(df, source='0', destination='1',
                             edge_attr='2', renumber=False)
    """
    if self.edgelist is not None or self.adjlist is not None:
        raise Exception("Graph already has values")

    # Edge-count ceiling for a single-GPU edge list (just under 2**31;
    # presumably an int32 indexing bound - confirm before changing).
    max_single_gpu_edges = 2147483100

    # Consolidation
    # NOTE: the messages below use adjacent string literals rather than a
    # backslash continuation inside the literal, which would embed the
    # continuation line's indentation in the message text.
    if isinstance(input_df, cudf.DataFrame):
        if len(input_df[source]) > max_single_gpu_edges:
            raise Exception("cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        elist = input_df
    elif isinstance(input_df, dask_cudf.DataFrame):
        if len(input_df[source]) > max_single_gpu_edges:
            raise Exception("dask_cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        # Gather the distributed frame into a single cudf.DataFrame.
        elist = input_df.compute().reset_index(drop=True)
    else:
        raise Exception("input should be a cudf.DataFrame or "
                        "a dask_cudf dataFrame")

    renumber_map = None
    if renumber:
        # FIXME: Should SG do lazy evaluation like MG?
        elist, renumber_map = NumberMap.renumber(
            elist, source, destination, store_transposed=False
        )
        # After renumbering the vertex columns are named 'src' and 'dst'.
        source = "src"
        destination = "dst"
        self.renumbered = True
    elif type(source) is list and type(destination) is list:
        raise Exception("set renumber to True for multi column ids")
    # Holds the map when renumbering was done, None otherwise.
    self.renumber_map = renumber_map

    source_col = elist[source]
    dest_col = elist[destination]

    # Edge values: a dict of columns for multi graphs, one column otherwise.
    if self.multi:
        if type(edge_attr) is not list:
            raise Exception("edge_attr should be a list of column names")
        value_col = {col_name: elist[col_name] for col_name in edge_attr}
    elif edge_attr is not None:
        value_col = elist[edge_attr]
    else:
        value_col = None

    if not self.symmetrized and not self.multi:
        if value_col is not None:
            source_col, dest_col, value_col = symmetrize(
                source_col, dest_col, value_col
            )
        else:
            source_col, dest_col = symmetrize(source_col, dest_col)

    self.edgelist = Graph.EdgeList(source_col, dest_col, value_col)

    if self.batch_enabled:
        self._replicate_edgelist()
def get_traversed_cost(df, source, source_col, dest_col, value_col):
    """
    Take the DataFrame result from a BFS or SSSP function call and sums
    the given weights along the path to the starting vertex.
    The source_col, dest_col identifiers need to match with the vertex and
    predecessor columns of df.

    Input Parameters
    ----------
    df : cudf.DataFrame
        The dataframe containing the results of a BFS or SSSP call. It
        must have 'vertex', 'distance' and 'predecessor' columns.
    source : int
        Index of the source vertex.
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        Weight should be a floating type.

    Returns
    ---------
    df : cudf.DataFrame
        DataFrame containing two columns 'vertex' and 'info'.
        Unreachable vertices will have value the max value of the weight
        type.

    Raises
    ------
    ValueError
        If df lacks a 'vertex', 'distance' or 'predecessor' column.
    """
    # Required result columns, checked in this order, with their messages.
    required_columns = (
        ('vertex', "DataFrame does not appear to be a BFS or "
                   "SSP result - 'vertex' column missing"),
        ('distance', "DataFrame does not appear to be a BFS or "
                     "SSP result - 'distance' column missing"),
        ('predecessor', "DataFrame does not appear to be a BFS or "
                        "SSP result - 'predecessor' column missing"),
    )
    for column, message in required_columns:
        if column not in df.columns:
            raise ValueError(message)

    sym_src, sym_dst, sym_weights = symmetrize(
        source_col, dest_col, value_col)

    edge_weights = cudf.DataFrame()
    edge_weights['source'] = sym_src
    edge_weights['destination'] = sym_dst
    edge_weights['weights'] = sym_weights

    # Attach to every (vertex, predecessor) pair the weight of that edge.
    merged = df.merge(edge_weights,
                      left_on=['vertex', 'predecessor'],
                      right_on=['source', 'destination'],
                      how="left")

    # Rows without a matching edge get the max float of the weight dtype;
    # the source vertex itself gets weight 0.
    unmatched_weight = np.finfo(sym_weights.dtype).max
    merged[['weights']] = merged[['weights']].fillna(unmatched_weight)
    is_source = merged['vertex'] == source
    merged.loc[is_source, 'weights'] = 0

    # Renumber
    renumbered_gdf, renumber_map = NumberMap.renumber(
        merged, ["vertex"], ["predecessor"], preserve_order=True)
    renumbered_gdf = renumbered_gdf.rename(
        columns={'src': 'vertex', 'dst': 'predecessor'})

    # Internal id of -1 (presumably the "no predecessor" marker - confirm).
    stop_vertex = renumber_map.to_internal_vertex_id(
        cudf.Series(-1)).values[0]

    out_df = path_retrieval_wrapper.get_traversed_cost(
        renumbered_gdf, stop_vertex)

    # Unrenumber
    unrenumbered = renumber_map.unrenumber(
        renumbered_gdf, 'vertex', preserve_order=True)
    out_df['vertex'] = unrenumbered["vertex"]
    return out_df
def __from_edgelist(
    self,
    input_df,
    source="source",
    destination="destination",
    edge_attr=None,
    renumber=True,
):
    """
    Build this graph's edge list from a cudf or dask_cudf DataFrame.

    Steps, in order: check that the source/destination column names exist
    in ``input_df``; consolidate the input into a single cudf.DataFrame;
    optionally renumber the vertices; pass the columns through
    ``symmetrize``; store the resulting edge list on ``self.edgelist``.

    Parameters
    ----------
    input_df : cudf.DataFrame or dask_cudf.DataFrame
        Edge list. A dask_cudf.DataFrame is computed and its index reset.
    source : str or list of str
        Source column name(s). Lists on both ``source`` and
        ``destination`` require ``renumber=True``.
    destination : str or list of str
        Destination column name(s).
    edge_attr : str or list of str, optional
        Edge value column name(s). When given, ``self.weighted`` is set
        to True.
    renumber : bool
        Whether to renumber vertices through ``NumberMap.renumber``.

    Raises
    ------
    Exception
        If the column names are not found in the input, if the input has
        an unsupported type, if the edge list exceeds the single-GPU size
        limit, or if multi column ids are given with ``renumber=False``.
    """
    # Verify column names present in input DataFrame
    src_names = source if isinstance(source, list) else [source]
    dst_names = destination if isinstance(destination, list) \
        else [destination]
    names_present = all(
        set(names).issubset(set(input_df.columns))
        for names in (src_names, dst_names)
    )
    if not names_present:
        # FIXME: Raise concrete Exceptions
        raise Exception("source column names and/or destination column "
                        "names not found in input. Recheck the source and "
                        "destination parameters")

    # FIXME: check if the consolidated graph fits on the
    # device before gathering all the edge lists

    # Consolidation: classify the input, size-check it, then gather it.
    if isinstance(input_df, cudf.DataFrame):
        oversize_message = ("cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        needs_gather = False
    elif isinstance(input_df, dask_cudf.DataFrame):
        oversize_message = ("dask_cudf dataFrame edge list is too big "
                            "to fit in a single GPU")
        needs_gather = True
    else:
        raise Exception("input should be a cudf.DataFrame or "
                        "a dask_cudf dataFrame")

    if len(input_df[source]) > 2147483100:
        raise Exception(oversize_message)

    if needs_gather:
        elist = input_df.compute().reset_index(drop=True)
    else:
        elist = input_df

    # Renumbering
    self.renumber_map = None
    if not renumber:
        if type(source) is list and type(destination) is list:
            raise Exception("set renumber to True for multi column ids")
    else:
        # FIXME: Should SG do lazy evaluation like MG?
        elist, renumber_map = NumberMap.renumber(elist,
                                                 source,
                                                 destination,
                                                 store_transposed=False)
        # After renumbering the vertex columns are named "src" and "dst".
        source, destination = "src", "dst"
        self.properties.renumbered = True
        self.renumber_map = renumber_map

    # Populate graph edgelist
    source_col = elist[source]
    dest_col = elist[destination]

    value_col = None
    if edge_attr is not None:
        self.weighted = True
        value_col = elist[edge_attr]

    # TODO: Update Symmetrize to work on Graph and/or DataFrame
    symmetrize_options = {
        "multi": self.properties.multi_edge,
        "symmetrize": not self.properties.directed,
    }
    if value_col is None:
        source_col, dest_col = symmetrize(
            source_col, dest_col, **symmetrize_options)
    else:
        source_col, dest_col, value_col = symmetrize(
            source_col, dest_col, value_col, **symmetrize_options)
        if isinstance(value_col, cudf.DataFrame):
            # Several value columns: store them as a name -> column dict.
            values_frame = value_col
            value_col = {name: values_frame[name]
                         for name in values_frame.columns}

    self.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col,
                                             value_col)

    if self.batch_enabled:
        self._replicate_edgelist()