def read_graph(graph_file, dataset_name, directed=False, weighted=False):
    """
    Reads the input network in networkx.

    The input is first loaded as an EPGM graph; if that fails, it is read as an
    edgelist file with integer node ids instead. If the resulting graph is not
    connected, only its largest connected component is returned (weakly
    connected component for directed graphs).

    :param graph_file: The directory where graph in EPGM format is stored, or the
        path of an edgelist file
    :param dataset_name: The name of the graph selected out of all the graph heads
        in EPGM file; if None, the label of the 1st graph head is used
    :param directed: If True, keep the graph directed; otherwise return an
        undirected graph
    :param weighted: If True, read a float "weight" column from the edgelist
        (not implemented for EPGM input); otherwise every edge gets weight 1
    :return: The graph in networkx format
    :raises NotImplementedError: if weighted is True and the input loads as an
        EPGM graph
    """
    try:  # assume args.input points to an EPGM graph
        G_epgm = EPGM(graph_file)
        graphs = G_epgm.G["graphs"]
        if dataset_name is None:
            # if dataset_name is not given, use the name of the 1st graph head
            dataset_name = graphs[0]["meta"]["label"]
            print(
                "WARNING: dataset name not specified, using dataset '{}' in the 1st graph head"
                .format(dataset_name))

        # If several graph heads share the label, the last one wins.
        graph_id = None
        for graph_head in graphs:
            if graph_head["meta"]["label"] == dataset_name:
                graph_id = graph_head["id"]

        g = G_epgm.to_nx(graph_id, directed)
        if weighted:
            raise NotImplementedError(
                "Weighted graphs are not supported for EPGM input")

        # This is the correct way to set the edge weight in a MultiGraph.
        edge_weights = {e: 1 for e in g.edges(keys=True)}
        nx.set_edge_attributes(g, name="weight", values=edge_weights)
    except NotImplementedError:
        # Surface the unsupported weighted-EPGM case instead of retrying the
        # EPGM directory as an edgelist, which only produces a confusing error.
        raise
    except Exception:  # otherwise, assume arg.input points to an edgelist file
        if weighted:
            g = nx.read_edgelist(
                graph_file,
                nodetype=int,
                data=(("weight", float), ),
                create_using=nx.DiGraph(),
            )
        else:
            g = nx.read_edgelist(graph_file,
                                 nodetype=int,
                                 create_using=nx.DiGraph())
            for u, v in g.edges():
                g[u][v]["weight"] = 1

        if not directed:
            g = g.to_undirected()

    # nx.is_connected / nx.connected_components are not implemented for
    # directed graphs, so fall back to weak connectivity in that case.
    if g.is_directed():
        is_connected = nx.is_weakly_connected
        connected_components = nx.weakly_connected_components
    else:
        is_connected = nx.is_connected
        connected_components = nx.connected_components

    if not is_connected(g):
        print("Graph is not connected")
        # take the largest connected component as the data; pick the largest
        # node set first so that only one subgraph has to be copied
        largest_component = max(connected_components(g), key=len)
        g = g.subgraph(largest_component).copy()
        print("Largest subgraph statistics: {} nodes, {} edges".format(
            g.number_of_nodes(), g.number_of_edges()))

    print("Graph statistics: {} nodes, {} edges".format(
        g.number_of_nodes(), g.number_of_edges()))
    return g
def test_load_epgm(self):
    """Check that the EPGM graph loaded from self.input_dir has the expected cora contents."""
    epgm = EPGM(self.input_dir)
    print(self.input_dir)

    # all three top-level EPGM sections must be present
    for section in ("graphs", "vertices", "edges"):
        assert section in epgm.G.keys()

    # there must be at least one graph head
    graph_heads = epgm.G["graphs"]
    assert len(graph_heads) > 0

    # cora nodes should have a subject attribute
    first_graph_id = graph_heads[0]["id"]
    attribute_names = epgm.node_attributes(first_graph_id, self.node_type)
    assert self.target_attribute in attribute_names

    # cora should have 2708 vertices
    expected_node_count = 2708
    vertices = epgm.G["vertices"]
    assert len(vertices) == expected_node_count

    # every vertex carries a 'data' entry, and the subject attribute
    # takes exactly 7 distinct values across cora nodes
    assert sum(1 for vertex in vertices
               if "data" in vertex) == expected_node_count
    subject_values = [
        vertex["data"][self.target_attribute] for vertex in vertices
    ]
    assert len(np.unique(subject_values)) == 7
def test_load_epgm(self):
    """Check that the EPGM graph loaded from self.input_dir has the expected contents."""
    epgm = EPGM(self.input_dir)

    # all three top-level EPGM sections must be present
    for section in ("graphs", "vertices", "edges"):
        assert section in epgm.G.keys()

    # there must be at least one graph head
    graph_heads = epgm.G["graphs"]
    assert len(graph_heads) > 0

    # nodes of type self.node_type should have the self.target_attribute attribute
    first_graph_id = graph_heads[0]["id"]
    attribute_names = epgm.node_attributes(first_graph_id, self.node_type)
    assert self.target_attribute in attribute_names

    # graph should have 260 vertices
    expected_node_count = 260
    vertices = epgm.G["vertices"]
    assert len(vertices) == expected_node_count

    # first make sure that all nodes have a 'data' key
    assert sum(1 for vertex in vertices
               if "data" in vertex) == expected_node_count

    # the target attribute should take 3 unique values among the nodes
    # that actually define it (nodes without it are skipped)
    all_labels = (vertex["data"].get(self.target_attribute)
                  for vertex in vertices)
    present_labels = [label for label in all_labels if label is not None]
    assert len(np.unique(present_labels)) == 3
def load_data(path, dataset_name=None, node_type=None, target_attribute=None):
    """
    Loads the node data

    If path is a directory it is loaded as an EPGM graph and the node data is
    extracted with get_nodes(); otherwise path is read as a space-delimited,
    header-less file whose rows are sorted by the first column (compared as
    strings, since every column is read as str).

    :param path: Input filename or directory where graph in EPGM format is stored
    :param dataset_name: For EPGM format, the label of the graph head to use;
        if None, the label of the 1st graph head is used
    :param node_type: For HINs, the node type to consider; if None, the graph
        must have exactly one node type, which is then used
    :param target_attribute: For EPGM format, the target node attribute; if None,
        nodes of node_type must have exactly one attribute, which is then used
    :return: N x 2 numpy arrays where the first column is the node id and the
        second column is the node label.
    :raises ValueError: if node_type or target_attribute has to be inferred but
        no graph head is labelled dataset_name
    :raises Exception: if node_type or target_attribute has to be inferred and
        the graph does not provide exactly one candidate
    """
    if os.path.isdir(path):
        g_epgm = EPGM(path)
        graphs = g_epgm.G["graphs"]

        # Default to the 1st graph head when no dataset name is given;
        # previously the default dataset_name=None could never match a label.
        if dataset_name is None and graphs:
            dataset_name = graphs[0]["meta"]["label"]

        # If several graph heads share the label, the last one wins.
        g_id = None
        for graph_head in graphs:
            if graph_head["meta"]["label"] == dataset_name:
                g_id = graph_head["id"]

        # The graph id is only needed to infer node_type / target_attribute,
        # so only complain about an unknown dataset when it is actually used.
        if g_id is None and (node_type is None or target_attribute is None):
            raise ValueError(
                "No graph with label '{}' found in EPGM data at '{}'".format(
                    dataset_name, path))

        g_vertices = g_epgm.G["vertices"]  # retrieve all graph vertices

        if node_type is None:
            node_type = g_epgm.node_types(g_id)
            if len(node_type) == 1:
                node_type = node_type[0]
            else:
                raise Exception(
                    "Multiple node types detected in graph {}: {}.".format(
                        g_id, node_type
                    )
                )

        if target_attribute is None:
            target_attribute = g_epgm.node_attributes(g_id, node_type)
            if len(target_attribute) == 1:
                target_attribute = target_attribute[0]
            else:
                raise Exception(
                    "Multiple node attributes detected for nodes of type {} in graph {}: {}.".format(
                        node_type, g_id, target_attribute
                    )
                )

        y = np.array(
            get_nodes(
                g_vertices, node_type=node_type, target_attribute=target_attribute
            )
        )
    else:
        y_df = pd.read_csv(path, delimiter=" ", header=None, dtype=str)
        y = y_df.sort_values(by=[0]).values

    return y
def test_node_types(self):
    """Exercise EPGM.node_types() on a graph with a single node type."""
    epgm = EPGM(self.input_dir)
    first_graph_id = epgm.G["graphs"][0]["id"]

    # cora has a single 'paper' node type:
    found_types = epgm.node_types(first_graph_id)
    assert len(found_types) == 1
    assert self.node_type in found_types

    # an unknown graph id must raise
    with pytest.raises(Exception):
        epgm.node_types("invalid_graph_id")
def test_node_types(self):
    """Exercise EPGM.node_types() on a graph with several node types."""
    epgm = EPGM(self.input_dir)
    first_graph_id = epgm.G["graphs"][0]["id"]

    # dataset has multiple node types:
    found_types = epgm.node_types(first_graph_id)
    assert len(found_types) == 3
    for expected_type in ("person", "paper", "venue"):
        assert expected_type in found_types

    # an unknown graph id must raise
    with pytest.raises(Exception):
        epgm.node_types("invalid_graph_id")
def from_epgm(epgm_location, dataset_name=None, directed=False):
    """
    Imports a graph stored in EPGM format to a NetworkX object

    Args:
        epgm_location (str): The directory containing the EPGM data
        dataset_name (str), optional: The name of the dataset to import; if not
            given, the label of the 1st graph head is used and a
            RuntimeWarning is issued
        directed (bool): If True, load as a directed graph, otherwise
            load as an undirected graph

    Returns:
        A NetworkX graph containing the data for the EPGM-stored graph.

    Raises:
        ValueError: If no graph head in the EPGM data is labelled dataset_name.
    """
    G_epgm = EPGM(epgm_location)
    graphs = G_epgm.G["graphs"]

    # if dataset_name is not given, use the name of the 1st graph head
    if not dataset_name:
        dataset_name = graphs[0]["meta"]["label"]
        warnings.warn(
            "dataset name not specified, using dataset '{}' in the 1st graph head".format(
                dataset_name
            ),
            RuntimeWarning,
            stacklevel=2,
        )

    # Select graph using dataset_name; if several graph heads share the
    # label, the last one wins.
    graph_id = None
    found = False
    for graph_head in graphs:
        if graph_head["meta"]["label"] == dataset_name:
            graph_id = graph_head["id"]
            found = True

    # Fail with a clear message rather than an UnboundLocalError further down.
    if not found:
        raise ValueError(
            "No graph with label '{}' found in EPGM data at '{}'".format(
                dataset_name, epgm_location
            )
        )

    # Convert the selected EPGM graph to a NetworkX graph
    Gnx = G_epgm.to_nx(graph_id, directed=directed)

    print(
        "Graph statistics: {} nodes, {} edges".format(
            Gnx.number_of_nodes(), Gnx.number_of_edges()
        )
    )
    return Gnx
def test_node_attributes(self):
    """Exercise EPGM.node_attributes() on the cora graph."""
    epgm = EPGM(self.input_dir)
    first_graph_id = epgm.G["graphs"][0]["id"]

    # cora has 1433 unique node attributes, including 'subject'
    attribute_names = epgm.node_attributes(first_graph_id, self.node_type)
    assert self.target_attribute in attribute_names

    # after the predictions cora has 1434 attributes, including subject and subject_PREDICTED
    if self.epgm_input:
        expected_count = 1433
        failure_message = "There should be 1433 unique node attributes; found {}"
    else:
        expected_count = 1434
        failure_message = "There should be 1434 unique node attributes; found {}"
    assert len(attribute_names) == expected_count, failure_message.format(
        len(attribute_names))

    # passing a non-existent node type should return an empty array of node attributes:
    unknown_type_attributes = epgm.node_attributes(first_graph_id, "person")
    assert len(unknown_type_attributes) == 0

    # if node_type is not supplied, a TypeError should be raised:
    with pytest.raises(TypeError):
        epgm.node_attributes(first_graph_id)
def test_node_attributes(self):
    """Exercise EPGM.node_attributes() on a graph whose nodes have one attribute."""
    epgm = EPGM(self.input_dir)
    first_graph_id = epgm.G["graphs"][0]["id"]

    # dataset has 1 unique 'user' node attribute, 'elite'
    attribute_names = epgm.node_attributes(first_graph_id, self.node_type)
    assert self.target_attribute in attribute_names

    failure_message = "There should be 1 unique node attribute; found {}"
    assert len(attribute_names) == 1, failure_message.format(
        len(attribute_names))

    # passing a non-existent node type should return an empty array of node attributes:
    unknown_type_attributes = epgm.node_attributes(first_graph_id, "business")
    assert len(unknown_type_attributes) == 0

    # if node_type is not supplied, a TypeError should be raised:
    with pytest.raises(TypeError):
        epgm.node_attributes(first_graph_id)
def read_graph(graph_file, dataset_name, is_directed=False, is_weighted=False):
    """
    Reads the input network in networkx.

    Three input kinds are handled, in this order: a file whose extension is
    'gpickle' (returned immediately, without printing statistics), an EPGM
    graph directory, and - if EPGM loading fails - an edgelist file with
    integer node ids.

    Args:
        graph_file: The directory where graph in EPGM format is stored, or the
            path of a gpickle or edgelist file.
        dataset_name: The name of the graph selected out of all the graph heads
            in EPGM file. If None, the label of the 1st graph head is used.
        is_directed: If True, keep the graph directed; otherwise return an
            undirected graph.
        is_weighted: If True, read a float "weight" column from the edgelist
            (not implemented for EPGM input, ignored for gpickle input);
            otherwise every edge gets weight 1.

    Returns:
        The graph in networkx format

    Raises:
        NotImplementedError: If is_weighted is True and the input loads as an
            EPGM graph.
    """
    if graph_file.split(".")[-1] == "gpickle":
        # NOTE(review): gpickle files are unpickled on load, which can execute
        # arbitrary code - only use this with trusted files.
        # NOTE(review): nx.read_gpickle is not available in every NetworkX
        # release - confirm against the pinned version.
        g = nx.read_gpickle(graph_file)
        for u, v in g.edges():
            g[u][v]["weight"] = 1
        if not is_directed:
            g = g.to_undirected()
        return g

    try:  # assume args.input points to an EPGM graph
        G_epgm = EPGM(graph_file)
        graphs = G_epgm.G["graphs"]
        if dataset_name is None:
            # if dataset_name is not given, use the name of the 1st graph head
            dataset_name = graphs[0]["meta"]["label"]
            print(
                "WARNING: dataset name not specified, using dataset '{}' in the 1st graph head"
                .format(dataset_name))

        # If several graph heads share the label, the last one wins.
        graph_id = None
        for graph_head in graphs:
            if graph_head["meta"]["label"] == dataset_name:
                graph_id = graph_head["id"]

        g = G_epgm.to_nx(graph_id, is_directed)
        if is_weighted:
            raise NotImplementedError(
                "Weighted graphs are not supported for EPGM input")

        # This is the correct way to set the edge weight in a MultiGraph.
        edge_weights = {e: 1 for e in g.edges(keys=True)}
        nx.set_edge_attributes(g, name="weight", values=edge_weights)
    except NotImplementedError:
        # Surface the unsupported weighted-EPGM case instead of retrying the
        # EPGM directory as an edgelist, which only produces a confusing error.
        raise
    except Exception:  # otherwise, assume arg.input points to an edgelist file
        if is_weighted:
            g = nx.read_edgelist(
                graph_file,
                nodetype=int,
                data=(("weight", float), ),
                create_using=nx.DiGraph(),
            )
        else:
            g = nx.read_edgelist(graph_file,
                                 nodetype=int,
                                 create_using=nx.DiGraph())
            for u, v in g.edges():
                g[u][v]["weight"] = 1

        if not is_directed:
            g = g.to_undirected()

    print("Graph statistics: {} nodes, {} edges".format(
        g.number_of_nodes(), g.number_of_edges()))
    return g