def SplitGraphObj(graphObj):
    """Derive test/train graphs and labelled edge sets for link prediction.

    Two successive ``EdgeSplitter`` passes are run, each with ``p=0.1`` and
    ``method="global"``:

    1. on ``graphObj``, yielding the test graph and the test examples;
    2. on that test graph (with ``graphObj`` supplied as the second
       ``EdgeSplitter`` argument), yielding the train graph and a second
       set of examples.

    The second set of examples is then divided 75% / 25% with scikit-learn's
    ``train_test_split`` into a training part and a model-selection part.

    Args:
        graphObj: Graph to split; handed directly to ``EdgeSplitter``.

    Returns:
        The 8-tuple ``(test_graph, train_graph, edgelist_test,
        edgelist_train, edgelist_model_selection, labels_test, labels_train,
        labels_model_selection)``.
    """
    sample_fraction = 0.1

    # First pass: sample positive/negative links from the full graph; the
    # returned graph is the input graph with the sampled links removed.
    held_out_graph, held_out_edges, held_out_labels = EdgeSplitter(
        graphObj
    ).train_test_split(p=sample_fraction, method="global")

    # Second pass: sample again, this time from the already reduced graph.
    fit_graph, fit_edges, fit_labels = EdgeSplitter(
        held_out_graph, graphObj
    ).train_test_split(p=sample_fraction, method="global")

    # 75% of the second sample trains the classifier, 25% selects the model.
    partition = train_test_split(
        fit_edges, fit_labels, train_size=0.75, test_size=0.25
    )
    fit_train_edges, selection_edges, fit_train_labels, selection_labels = partition

    return (
        held_out_graph,
        fit_graph,
        held_out_edges,
        fit_train_edges,
        selection_edges,
        held_out_labels,
        fit_train_labels,
        selection_labels,
    )
def initialize(self, **hyper_params):
    """Create the link splits, the data flows and the compiled GraphSAGE model.

    Builds a StellarGraph from ``self.nodes`` / ``self.edges``, carves out a
    test split and then a train split with ``EdgeSplitter`` (both seeded with
    42), and stores the results on the instance as ``self.graph_test``,
    ``self.graph_train``, ``self.train_flow``, ``self.test_flow`` and
    ``self.model``.

    Args:
        **hyper_params: Optional overrides. Recognised keys and defaults:
            ``batch_size`` (20), ``num_samples`` ([20, 10]),
            ``layer_sizes`` ([10, 10]), ``bias`` (True), ``dropout`` (0.1),
            ``lr`` (1e-2). Unknown keys are ignored.

    Returns:
        tuple: ``(number of training examples, number of test examples)``.
    """
    # Every hyper-parameter falls back to its default when not supplied.
    # (Previously a supplied value left the local unbound, and the
    # ``layer_sizes`` / ``num_samples`` key checks were swapped.)
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split, taken from the graph already reduced by the test split
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    # NOTE(review): the test generator samples neighbourhoods from
    # graph_train rather than graph_test - confirm this is intended.
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # return number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
def inference(embeddings, node_id=111180, top_k=10):
    """Score sampled links with the saved model and rank those touching a node.

    The graph is built with ``form_graph`` from the CSV files under
    ``data/``, a balanced link sample is drawn with ``EdgeSplitter``
    (``p=0.5``), and the Keras model stored in ``data/w_rev1.h5`` predicts a
    score for every sampled link. Links with ``node_id`` as either endpoint
    are ranked by descending score.

    Args:
        embeddings: Passed through to ``form_graph`` as its fourth argument.
        node_id: Node whose incident sampled links are ranked. Defaults to
            111180, the value that was previously hard-coded.
        top_k: Maximum number of results to return. Defaults to 10.

    Returns:
        list: For each of the ``top_k`` best-scoring matching links, the
        first endpoint of that link, best first.
    """
    import keras as keras
    from stellargraph.layer import MeanAggregator, LinkEmbedding

    G = form_graph("data/cutted_edges.csv", "data/cutted_features.csv",
                   "data/cutted_edges_to.csv", embeddings)
    edge_splitter_full = EdgeSplitter(G)

    model = keras.models.load_model(
        'data/w_rev1.h5',
        custom_objects={'MeanAggregator': MeanAggregator,
                        'LinkEmbedding': LinkEmbedding},
    )

    # The reduced graph returned by the splitter is not needed here; the
    # generator below samples neighbourhoods from the full graph G.
    _, edge_ids_full, edge_labels_full = edge_splitter_full.train_test_split(
        p=0.5, method="global", keep_connected=True
    )

    batch_size = 20
    num_samples = [20, 10]

    generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
    hold_out_gen = generator.flow(edge_ids_full, edge_labels_full)
    hold_out_predictions_pr = model.predict(hold_out_gen)

    # Pair each matching link's score with the link's first endpoint.
    # NOTE(review): when node_id is itself the first endpoint, the reported
    # ID is node_id rather than its neighbour - confirm this is intended.
    scored = [
        [hold_out_predictions_pr[idx][0], edge[0]]
        for idx, edge in enumerate(edge_ids_full)
        if edge[0] == node_id or edge[1] == node_id
    ]

    best = sorted(scored, reverse=True)[:top_k]
    return [endpoint for _, endpoint in best]
def split_graph(g, p_test=0.1, p_train=0.1):
    """Produce test and train link samples restricted to 'image2word' edges.

    Args:
        g: Full graph, handed to ``EdgeSplitter``.
        p_test: Fraction passed as ``p`` for the test split.
        p_train: Fraction passed as ``p`` for the train split.

    Returns:
        tuple: ``(edges_train, edges_test, labels_train, labels_test,
        g_train, g_test)``.
    """
    link_type = 'image2word'

    # TEST: sample links from the full graph; the returned graph has the
    # sampled links removed.
    reduced_for_test, test_pairs, test_targets = EdgeSplitter(g).train_test_split(
        p=p_test, method="global", edge_label=link_type
    )

    # TRAIN: sample a second time from the reduced graph so the train
    # sample does not overlap the test sample.
    reduced_for_train, train_pairs, train_targets = EdgeSplitter(
        reduced_for_test, g
    ).train_test_split(p=p_train, method="global", edge_label=link_type)

    return (
        train_pairs,
        test_pairs,
        train_targets,
        test_targets,
        reduced_for_train,
        reduced_for_test,
    )
def initialize(self, **hyper_params):
    """Create the training link split, its data flow and the GraphSAGE model.

    Builds ``self.graph`` from ``self.nodes`` / ``self.edges``, samples a
    training split with ``EdgeSplitter`` and stores ``self.train_flow`` and
    the compiled ``self.model`` on the instance.

    Args:
        **hyper_params: Optional overrides. Recognised keys and defaults:
            ``batch_size`` (20), ``num_samples`` ([20, 10]),
            ``layer_sizes`` ([20, 20]), ``bias`` (True), ``dropout`` (0.3),
            ``lr`` (1e-3), ``train_split`` (0.2). Unknown keys are ignored.

    Returns:
        The initial model weights, as returned by ``self.model.get_weights()``.
    """
    # Every hyper-parameter falls back to its default when not supplied.
    # (Previously a supplied value left the local unbound, and the
    # ``layer_sizes`` / ``num_samples`` key checks were swapped.)
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    # NOTE(review): a "relu" output is unbounded above while the loss below
    # is binary cross-entropy - confirm "sigmoid" was not intended.
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Evaluate a serialized model on a random balanced subset of the graph's links.

    The links used for evaluation may include links the model was trained
    on. To avoid that, give the edge splitter the same seed that was used
    for link splitting in train().

    Args:
        G: NetworkX graph file
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )

    # Recover the per-layer neighbour sample counts from the ratios between
    # consecutive model input shapes.
    input_shapes = model.input_shape
    num_samples = []
    for ii in range(1, len(input_shapes) - 1, 2):
        num_samples.append(int(input_shapes[ii + 1][1] / input_shapes[ii][1]))

    # Sample a fraction p=0.1 of the positive links plus the same number of
    # negative links; the returned graph has the sampled links removed.
    splitter = EdgeSplitter(G)
    reduced_graph, link_ids, link_labels = splitter.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # Convert to a StellarGraph object (undirected, as required by GraphSAGE)
    reduced_graph = sg.StellarGraph(reduced_graph, node_features="feature")

    # Generator feeding (source, target) sampled subgraphs to the model
    link_generator = GraphSAGELinkGenerator(
        reduced_graph, batch_size, num_samples, name="test"
    )
    link_flow = link_generator.flow(link_ids, link_labels)

    # Evaluate and print metrics
    scores = model.evaluate_generator(link_flow)
    print("\nTest Set Evaluation:")
    for name, val in zip(model.metrics_names, scores):
        print("\t{}: {:0.4f}".format(name, val))
def splitSampleGraph():
    """Write five independent sampled graphs and their train/val/test edge CSVs.

    For each run 1..5 the post-COVID graph is split with ``EdgeSplitter``
    (``p=0.5``, ``method="global"``). The reduced graph is written as GML and
    the sampled edges are divided 65/35 into (train+validation)/test, then
    77/23 into train/validation. Each part is saved as a CSV with columns
    ``node1``, ``node2`` and ``labels``.
    """

    def _dump(path_prefix, message, run, pairs, targets):
        # Save one edge set as CSV, then report half its size.
        pairs = np.array(pairs)
        frame = pd.DataFrame({'node1': pairs[:, 0],
                              'node2': pairs[:, 1],
                              'labels': targets})
        frame.to_csv(path_prefix + str(run) + '.csv')
        print(message + str(len(targets)/2.0))

    print('Graph post 2020')
    graph = nx.read_gml('../../graphs/graph_postCOVID_final_netx.gml.gz')

    for i in range(1, 6):
        print('Current run ' + str(i))

        # Sample a fraction p of the positive links and as many negative
        # links; the returned graph has the sampled links removed.
        reduced_graph, sampled_edges, sample_labels = EdgeSplitter(
            graph
        ).train_test_split(p=0.5, method="global")
        nx.write_gml(reduced_graph, '../../graphs/graph_sampled_' + str(i) + '.gml.gz')
        del reduced_graph

        # Split 1: test versus train+validation
        comp_edges, test_edges, comp_labels, test_labels = train_test_split(
            sampled_edges, sample_labels, train_size=0.65, test_size=0.35
        )
        # Split 2: train versus validation
        train_edges, val_edges, train_labels, val_labels = train_test_split(
            comp_edges, comp_labels, train_size=0.77, test_size=0.23
        )

        # Save the sampled training, validation and test sets, in that order
        _dump('../../graphs/graph_train_edges_sampled_',
              'Number of training samples (positive) ',
              i, train_edges, train_labels)
        _dump('../../graphs/graph_val_edges_sampled_',
              'Number of validation samples (positive) ',
              i, val_edges, val_labels)
        _dump('../../graphs/graph_test_edges_sampled_',
              'Number of test samples (positive) ',
              i, test_edges, test_labels)
def create_samples(g):
    """Sample labelled links from ``g`` and strip the leading sampled edges from it.

    A StellarGraph built from ``g`` is split with ``EdgeSplitter`` (``p=0.1``,
    ``method="global"``, ``keep_connected=True``). The first
    ``int(number_of_edges(g) * 0.1)`` sampled pairs are treated as the
    positive sample and removed from ``g`` in place.

    Args:
        g: NetworkX graph whose nodes carry a ``"feature"`` attribute.
            It is mutated by this call.

    Returns:
        tuple: ``(g, df)`` where ``df`` has columns ``id_1``, ``id_2`` and
        ``link`` for every sampled pair.
    """
    stellar = sg.StellarGraph.from_networkx(g, node_features="feature")
    _, sampled_pairs, sampled_labels = EdgeSplitter(stellar).train_test_split(
        p=0.1, method="global", keep_connected=True
    )

    sources = [pair[0] for pair in sampled_pairs]
    targets = [pair[1] for pair in sampled_pairs]
    link_flags = list(sampled_labels)

    # The edge count is taken before any removal below.
    positive_count = int(nx.number_of_edges(g) * 0.1)
    for pair in sampled_pairs[:positive_count]:
        g.remove_edge(pair[0], pair[1])

    frame = pd.DataFrame({'id_1': sources, 'id_2': targets, 'link': link_flags})
    return g, frame
# Build the master graph from the edge list in `edgelist_df`.
G = StellarGraph(edges = edgelist_df)
print("\n", G.info())
print("Created master graph from data")

# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links,
# and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
(
    G_test,  # To compute node embeddings with more edges than G_train
    examples_test,
    labels_test
) = edge_splitter_test.train_test_split(p=0.1, method="global")
#print(G_test.info())
print("Created test Graph from master graph")

# Do the same process to compute a training subset from within the test graph
edge_splitter_train = EdgeSplitter(G_test, G)
(
    G_train,  # To compute node embeddings
    examples,
    labels
) = edge_splitter_train.train_test_split(p=0.1, method="global")
print("Created train Graph from test graph")
print("Graph is not connected") # take the largest connected component as the data g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx)) g_nx = max(g_nx_ccs, key=len) print("Largest subgraph statistics: {} nodes, {} edges".format( g_nx.number_of_nodes(), g_nx.number_of_edges())) # From the original graph, extract E_test and G_test edge_splitter_test = EdgeSplitter(g_nx) if args.hin: g_test, edge_data_ids_test, edge_data_labels_test = edge_splitter_test.train_test_split( p=p, edge_label=args.edge_type, edge_attribute_label=args.edge_attribute_label, edge_attribute_threshold=args.edge_attribute_threshold, attribute_is_datetime=args.attribute_is_datetime, method=args.sampling_method, probs=sampling_probs, ) else: g_test, edge_data_ids_test, edge_data_labels_test = edge_splitter_test.train_test_split( p=p, method=args.sampling_method, probs=sampling_probs) if args.show_histograms: if args.sampling_method == "local": bins = np.arange(1, len(sampling_probs) + 2) else: bins = np.arange( 1, np.max(edge_splitter_test.negative_edge_node_distances) + 2)
def iteration(ind):
    """Build one train/validation/test link dataset, seeded with ``ind``.

    Reads ``G``, ``A``, ``probs``, ``task`` and ``label_dim`` from the
    enclosing scope. Three chained ``EdgeSplitter`` passes (test, then
    validation, then train) are all seeded with ``ind``; every sampled id set
    is passed through ``undirected_label2directed_label``.

    Args:
        ind: Used as the splitter seed and as the key of the result.

    Returns:
        dict: ``{ind: {...}}`` with entries ``'graph'``, ``'undirected'``,
        ``'train'``, ``'validate'`` and ``'test'``; the last three each hold
        ``'pairs'`` and ``'label'``.
    """
    datasets = {}

    # Test split, sampled from the full graph.
    edge_splitter_test = EdgeSplitter(G)
    G_test, ids_test, _ = edge_splitter_test.train_test_split(
        p=float(probs[0]), method="global", keep_connected=True, seed=ind)
    ids_test, labels_test = undirected_label2directed_label(
        A, ids_test, task)

    # Validation split, sampled from the graph left after the test split.
    edge_splitter_val = EdgeSplitter(G_test)
    G_val, ids_val, _ = edge_splitter_val.train_test_split(
        p=float(probs[1]), method="global", keep_connected=True, seed=ind)
    ids_val, labels_val = undirected_label2directed_label(A, ids_val, task)

    # Training pairs: p=0.99 of what remains; the reduced graph is discarded.
    edge_splitter_train = EdgeSplitter(G_val)
    _, ids_train, _ = edge_splitter_train.train_test_split(
        p=0.99, method="global", keep_connected=False, seed=ind)
    ids_train, labels_train = undirected_label2directed_label(
        A, ids_train, task)

    # observation after removing edges for training/validation/testing
    edges = [e for e in G_val.edges]
    # convert back to directed graph
    oberved_edges = np.zeros((len(edges), 2), dtype=np.int32)
    undirected_edges = np.zeros((2 * len(G.edges), 2), dtype=np.int32)
    for i, e in enumerate(edges):
        # Orient each observed edge by the non-zero entry of A; if both
        # directions are non-zero the second assignment wins.
        if A[e[0], e[1]] > 0:
            oberved_edges[i, 0] = int(e[0])
            oberved_edges[i, 1] = int(e[1])
        if A[e[1], e[0]] > 0:
            oberved_edges[i, 0] = int(e[1])
            oberved_edges[i, 1] = int(e[0])

    for i, e in enumerate(G.edges):
        if A[e[0], e[1]] > 0 or A[e[1], e[0]] > 0:
            undirected_edges[i, :] = [int(e[1]), e[0]]
            # NOTE(review): the offset is len(edges) (edges of G_val), not
            # len(G.edges), so the two halves can overlap - confirm intent.
            undirected_edges[i + len(edges), :] = [int(e[0]), e[1]]

    if label_dim == 2:
        # Keep only the pairs whose label is below 2.
        ids_train = ids_train[labels_train < 2]
        labels_train = labels_train[labels_train < 2]
        ids_test = ids_test[labels_test < 2]
        labels_test = labels_test[labels_test < 2]
        ids_val = ids_val[labels_val < 2]
        labels_val = labels_val[labels_val < 2]

    ############################################
    # training data
    ############################################
    datasets[ind] = {}
    datasets[ind]['graph'] = torch.from_numpy(oberved_edges.T).long()
    datasets[ind]['undirected'] = undirected_edges
    datasets[ind]['train'] = {}
    datasets[ind]['train']['pairs'] = ids_train
    datasets[ind]['train']['label'] = labels_train
    ############################################
    # validation data
    ############################################
    datasets[ind]['validate'] = {}
    datasets[ind]['validate']['pairs'] = ids_val
    datasets[ind]['validate']['label'] = labels_val
    ############################################
    # test data
    ############################################
    datasets[ind]['test'] = {}
    datasets[ind]['test']['pairs'] = ids_test
    datasets[ind]['test']['label'] = labels_test
    return datasets
edgelist = utils.load_data(filename_edges, weighted=wt) G = StellarGraph(edges=edgelist) print("\n", G.info()) print("Created master graph from data") # Define an edge splitter on the original graph: edge_splitter_test = EdgeSplitter(G) # Randomly sample a fraction p=0.1 of all positive links, # and same number of negative links, from graph, and obtain the # reduced graph graph_test with the sampled links removed: ( G_test, # To compute node embeddings with mode edges than G_train examples_test, labels_test) = edge_splitter_test.train_test_split(p=0.1, method="global") #print(G_test.info()) print("Created test Graph from master graph") # Do the same process to compute a training subset from within the test graph edge_splitter_train = EdgeSplitter(G_test, G) ( G_train, # To compute node embeddings examples, labels) = edge_splitter_train.train_test_split(p=0.1, method="global") print("Created train Graph from test graph") ( examples_train, # For training classifiers. They dont exist in G_train examples_model_selection, # For choosing the best classifier
# # We begin with the full graph and use the `EdgeSplitter` class to produce: # # * Test Graph # * Test set of positive/negative link examples # # The Test Graph is the reduced graph we obtain from removing the test set of links from the full graph. # In[6]: # Define an edge splitter on the original graph: edge_splitter_test = EdgeSplitter(graph) # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the # reduced graph graph_test with the sampled links removed: graph_test, examples_test, labels_test = edge_splitter_test.train_test_split( p=0.1, method="global") print(graph_test.info()) # ### Train Graph # # This time, we use the `EdgeSplitter` on the Test Graph, and perform a train/test split on the examples to produce: # # * Train Graph # * Training set of link examples # * Set of link examples for model selection # # In[7]: # Do the same process to compute a training subset from within the test graph
# reduced) as some of the links will be removed during each split and used as the positive samples for # training/testing the link prediction classifier. # From the original graph G, extract a randomly sampled subset of test edges (true and false citation links) and the # reduced graph G_test with the positive test edges removed: # In[4]: # Define an edge splitter on the original graph G: edge_splitter_test = EdgeSplitter(G) # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the # reduced graph G_test with the sampled links removed: G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split( p=0.1, method="global", keep_connected=True, seed=42 ) # The reduced graph G_test, together with the test ground truth set of links (edge_ids_test, edge_labels_test), # will be used for testing the model. # # Now, repeat this procedure to obtain validation data that we are going to use for early stopping in order to # prevent overfitting. From the reduced graph G_test, extract a randomly sampled subset of validation edges (true and # false citation links) and the reduced graph G_val with the positive validation edges removed. # In[5]: # Define an edge splitter on the reduced graph G_test: edge_splitter_val = EdgeSplitter(G_test)
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Fit a GraphSAGE link-prediction model on graph G and save it to disk.

    Args:
        G: NetworkX graph file
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """

    def _report(heading, names, values):
        # Print a heading followed by one tab-indented "name: value" line per metric.
        print(heading)
        for name, val in zip(names, values):
            print("\t{}: {:0.4f}".format(name, val))

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # Both splits sample p=0.1 of the positive links (and as many negative
    # links) with identical options.
    split_options = dict(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From the original graph, extract the test links and the reduced graph.
    test_splitter = EdgeSplitter(G)
    G_test, test_ids, test_labels = test_splitter.train_test_split(**split_options)

    # From the reduced graph, extract the training links and a further
    # reduced graph.
    train_splitter = EdgeSplitter(G_test, G)
    G_train, train_ids, train_labels = train_splitter.train_test_split(**split_options)

    # Convert both graphs to StellarGraph objects (undirected, as required
    # by GraphSAGE) for ML.
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # One generator per graph feeds link data from sampled subgraphs to the
    # model: one for training, one for testing.
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(train_ids, train_labels, shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(test_ids, test_labels)

    # GraphSAGE model and its input/output tensors
    graphsage = GraphSAGE(
        layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout
    )
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set
    untrained_train = model.evaluate_generator(train_flow)
    untrained_test = model.evaluate_generator(test_flow)
    _report("\nTrain Set Metrics of the initial (untrained) model:",
            model.metrics_names, untrained_train)
    _report("\nTest Set Metrics of the initial (untrained) model:",
            model.metrics_names, untrained_test)

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    trained_train = model.evaluate_generator(train_flow)
    trained_test = model.evaluate_generator(test_flow)
    _report("\nTrain Set Metrics of the trained model:",
            model.metrics_names, trained_train)
    _report("\nTest Set Metrics of the trained model:",
            model.metrics_names, trained_test)

    # Save the trained model; the file name encodes the hyper-parameters
    sample_tag = "_".join([str(x) for x in num_samples])
    layer_tag = "_".join([str(x) for x in layer_size])
    save_str = "_n{}_l{}_d{}_r{}".format(sample_tag, layer_tag, dropout, learning_rate)
    model.save("graphsage_link_pred" + save_str + ".h5")
path_node_partition = "./data/4_attributes_0"
path_edge_partition = "./data/4_0"

# Constructing the graph
# Node file: tab-separated, no header. Columns labelled 0..1433 are kept and
# column 0 becomes the index.
nodes = pd.read_csv(path_node_partition, sep='\t', lineterminator='\n', header=None).loc[:, 0:1433]
nodes.set_index(0, inplace=True)
# Edge file: whitespace-separated, no header. The separator is a raw string
# so that "\s" is not treated as an (invalid) string escape sequence.
edges = pd.read_csv(path_edge_partition, sep=r'\s+', lineterminator='\n', header=None)
edges.columns = ["source", "target"]
G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True
)

# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

# Model defining - Keras functional API + Stellargraph layers
# # Reduce initial Graph # edge_splitter_test = EdgeSplitter(G) # G,_,_ = edge_splitter_test.train_test_split(p=0.8, # method="global", # keep_connected=True) # print("\n", G.info()) # Define an edge splitter on the original graph G: edge_splitter_test = EdgeSplitter(G) # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the # reduced graph G_test with the sampled links removed: (G_test, edge_ids_test, edge_labels_test) = edge_splitter_test.train_test_split(p=0.1, method="global", keep_connected=True) # Define an edge splitter on the reduced graph G_test: edge_splitter_train = EdgeSplitter(G_test) # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the # reduced graph G_train with the sampled links removed: (G_train, edge_ids_train, edge_labels_train) = edge_splitter_train.train_test_split(p=0.1, method="global", keep_connected=True) print(G_train.info()) print(G_test.info())
def graph_links_from_csv(nodes_filepath,
                         links_filepath,
                         sample_sizes=[10, 20],
                         train_pct=0.1, val_pct=0.1, sep=',',
                         holdout_pct=None,
                         holdout_for_inductive=False,
                         missing_label_value=None,
                         random_state=None,
                         verbose=1):
    """
    Loads graph data from CSV files.
    Returns generators for links in graph for use with GraphSAGE model.
    Args:
        nodes_filepath(str): file path to training CSV containing node attributes
        links_filepath(str): file path to training CSV describing links among nodes
        sample_sizes(list of int): Number of nodes to sample at each neighborhood level.
                                   The default list is never mutated here.
        train_pct(float): Proportion of edges to use for training.
                          Default is 0.1.
                          Note that train_pct is applied after val_pct is applied.
        val_pct(float): Proportion of edges to use for validation
        sep (str): delimiter for CSVs. Default is comma.
        holdout_pct: accepted but not used by this function
        holdout_for_inductive: accepted but not used by this function
        missing_label_value: accepted but not used by this function
        random_state (int): random seed for train/test split; passed as
                            ``seed`` to both EdgeSplitter splits
        verbose (boolean): verbosity; when falsy the graph summary is not printed
    Return:
        tuple of EdgeSequenceWrapper objects for train and validation sets and LinkPreprocessor
    """

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.data import EdgeSplitter
    except Exception as err:
        # Any failure while importing is reported with the install hint;
        # the original error stays attached as the cause.
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    #----------------------------------------------------------------
    # read graph structure
    #----------------------------------------------------------------
    # delimiter=None makes networkx split on any whitespace
    nx_sep = None if sep in [' ', '\t'] else sep
    G = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)
    if verbose:
        print(nx.info(G))

    #----------------------------------------------------------------
    # read node attributes
    #----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = len(node_attr.columns.values) - 1  # subtract ID and treat all other columns as features
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    node_data = pd.read_csv(nodes_filepath, header=None, names=feature_names, sep=sep)
    node_data.index = node_data.index.map(str)

    # keep only the rows whose ID is a node of the graph
    df = node_data[node_data.index.isin(list(G.nodes()))]
    for col in feature_names:
        # columns whose first value is a string are one-hot encoded
        if not isinstance(node_data[col].values[0], str):
            continue
        df = pd.concat([df, df[col].astype('str').str.get_dummies().add_prefix(col+'_')], axis=1, sort=False)
        df = df.drop([col], axis=1)
    feature_names = df.columns.values
    node_data = df
    node_features = node_data[feature_names].values

    # `G.nodes[...]` replaces the `G.node[...]` alias removed in networkx 2.4
    for nid, f in zip(node_data.index, node_features):
        G.nodes[nid][sg.globalvar.TYPE_ATTR_NAME] = "node"
        G.nodes[nid]["feature"] = f

    #----------------------------------------------------------------
    # train/validation sets
    #----------------------------------------------------------------
    edge_splitter_test = EdgeSplitter(G)
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=val_pct, method="global", keep_connected=True, seed=random_state)
    edge_splitter_train = EdgeSplitter(G_test)
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_pct, method="global", keep_connected=True, seed=random_state)

    epp = LinkPreprocessor(G, sample_sizes=sample_sizes)
    trn = epp.preprocess_train(G_train, edge_ids_train, edge_labels_train)
    val = epp.preprocess_valid(G_test, edge_ids_test, edge_labels_test)

    return (trn, val, epp)
def main():
    """Run the full pipeline: load graph, embed, evaluate by link prediction, save results.

    An embedding is created first on the train graph, to pick the best
    classifier/binary operator on the model-selection examples, and then on
    the test graph, to score that choice on the held-out test examples.
    """
    args = parse_args()

    def _embed(target_graph):
        # Create an embedding for `target_graph`, log the elapsed time and
        # return the embedding loaded from args.output.
        logger.info('\nEmbedding algorithm started.')
        began = time.time()
        embedding.create_embedding(args, target_graph)
        elapsed = time.time() - began
        logger.info(f'\nEmbedding algorithm finished in {elapsed:.2f} seconds.')
        return utils.load_embedding(args.output)

    graph = utils.load_graph(args.weighted, args.directed, args.input)
    utils.print_graph_info(graph, "original graph")

    graph.remove_nodes_from(list(nx.isolates(graph)))
    utils.print_graph_info(graph, "graph without isolates")

    # Test split from the full graph, then train split from the reduced one.
    graph_test, X_test_edges, y_test = EdgeSplitter(graph).train_test_split(
        p=args.test_percentage, method="global")
    graph_train, X_edges, y = EdgeSplitter(graph_test, graph).train_test_split(
        p=args.train_percentage, method="global")

    # 75% of the train examples fit the classifiers, 25% select among them.
    (X_train_edges,
     X_model_selection_edges,
     y_train,
     y_model_selection) = train_test_split(X_edges, y, train_size=0.75, test_size=0.25)

    embeddings = _embed(graph_train)

    logger.info('\nEmbedding evaluation started.')
    eval_began = time.time()
    results = evaluation.evaluate(args.classifier, embeddings,
                                  X_train_edges, y_train,
                                  X_model_selection_edges, y_model_selection)
    eval_elapsed = time.time() - eval_began
    logger.info(f'Embedding evaluation finished in {eval_elapsed:.2f} seconds.')

    best_result = max(results, key=lambda result: result["roc_auc"])
    operator_name = best_result['binary_operator'].__name__
    logger.info(
        f"\nBest roc_auc_score on train set using '{operator_name}': {best_result['roc_auc']}."
    )

    embedding_test = _embed(graph_test)

    scores = evaluation.evaluate_model(
        best_result["classifier"], embedding_test,
        best_result["binary_operator"], X_test_edges, y_test)
    roc_auc, average_precision, accuracy, f1 = scores

    logger.info(f"Scores on test set using '{operator_name}'.")
    logger.info(f"roc_auc_score: {roc_auc}")
    logger.info(f"average_precision_score: {average_precision}")
    logger.info(f"accuracy_score: {accuracy}")
    logger.info(f"f1_score on test set using: {f1}\n")

    if args.results:
        evaluation.save_evaluation_results(
            args.dataset, args.method, args.classifier,
            (roc_auc, average_precision, accuracy, f1), args.results)