def inference(embeddings, node_id=111180, top_k=10):
    """Rank the sampled links touching ``node_id`` with the saved link model.

    The graph is built by ``form_graph`` from the three CSV files under
    ``data/`` plus ``embeddings``; the Keras model is loaded from
    ``data/w_rev1.h5``.  ``EdgeSplitter`` samples a set of links (``p=0.5``,
    ``method="global"``), every sampled link is scored by the model, and the
    links that have ``node_id`` as one endpoint are ranked by score.

    Args:
        embeddings: Forwarded unchanged to ``form_graph``; its expected
            format is defined there and is not visible from this function.
        node_id: Node whose links are ranked.  Defaults to the id that was
            previously hard-coded.
        top_k: Maximum number of ids to return.  Defaults to the previously
            hard-coded 10.

    Returns:
        list: Up to ``top_k`` node ids, ordered by descending predicted
        score.  Each id is the endpoint *opposite* ``node_id`` on the scored
        link.  The list is empty when no sampled link touches ``node_id``.
    """
    G = form_graph(
        "data/cutted_edges.csv",
        "data/cutted_features.csv",
        "data/cutted_edges_to.csv",
        embeddings,
    )
    edge_splitter_full = EdgeSplitter(G)

    # Imported inside the function, as the original did, so importing this
    # module does not pull in Keras.
    import keras
    from stellargraph.layer import MeanAggregator, LinkEmbedding

    model = keras.models.load_model(
        "data/w_rev1.h5",
        custom_objects={
            "MeanAggregator": MeanAggregator,
            "LinkEmbedding": LinkEmbedding,
        },
    )

    # Only the sampled links and their labels are needed; the reduced graph
    # returned first is not used (the generator below runs on the full graph).
    _, edge_ids_full, edge_labels_full = edge_splitter_full.train_test_split(
        p=0.5, method="global", keep_connected=True
    )

    batch_size = 20
    num_samples = [20, 10]
    generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
    hold_out_gen = generator.flow(edge_ids_full, edge_labels_full)
    hold_out_predictions_pr = model.predict(hold_out_gen)

    # Collect (score, other endpoint) for every sampled link incident to
    # node_id.  The original always reported column 0 of the link, which is
    # node_id itself whenever node_id is the source of the link.
    scored = []
    for edge, prediction in zip(edge_ids_full, hold_out_predictions_pr):
        source, target = edge[0], edge[1]
        if source == node_id:
            scored.append((prediction[0], target))
        elif target == node_id:
            scored.append((prediction[0], source))

    # Highest score first; ties fall back to the larger node id, exactly as
    # the original list-of-lists sort did.
    scored.sort(reverse=True)
    return [other for _, other in scored[:top_k]]
def initialize(self, **hyper_params):
    """Build the unsupervised GraphSAGE model and its data flows.

    Recognised keys in ``hyper_params`` (each falls back to the default
    shown when omitted):

        batch_size (16), num_samples ([25, 10]), layer_sizes ([256, 256]),
        bias (True), dropout (0.0), lr (1e-3), num_walks (1), length (5)

    Previously a default was assigned only when the key was missing, so any
    hyper-parameter that *was* supplied left its local variable unbound and
    the method failed with ``UnboundLocalError``; in addition the
    ``layer_sizes`` / ``num_samples`` checks were crossed.  Supplied values
    are now honoured.

    Side effects: sets ``self.graph``, ``self.nodes``, ``self.train_flow``,
    ``self.model``, ``self.embedding_model`` and ``self.node_gen``, and
    deletes ``self.nodes_df`` and ``self.edges_df``.

    Returns:
        The initial weights of the compiled link-prediction model
        (``self.model.get_weights()``).
    """
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())

    # The source frames are dropped once the graph has been built from them.
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_gen,
        bias=bias,
        dropout=dropout,
        normalize="l2",
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Every second input tensor and the first output tensor feed a separate
    # embedding model -- presumably the "source node" half of the link model;
    # confirm against the GraphSAGE link-model input layout.
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(
        self.graph, batch_size, num_samples
    ).flow(self.nodes)

    return self.model.get_weights()
def initialize(self, **hyper_params):
    """Build the supervised GraphSAGE link-prediction model and train flow.

    Recognised keys in ``hyper_params`` (each falls back to the default
    shown when omitted):

        batch_size (20), num_samples ([20, 10]), layer_sizes ([20, 20]),
        bias (True), dropout (0.3), lr (1e-3), train_split (0.2)

    Previously a default was assigned only when the key was missing, so any
    hyper-parameter that *was* supplied left its local variable unbound and
    the method failed with ``UnboundLocalError``; in addition the
    ``layer_sizes`` / ``num_samples`` checks were crossed.  Supplied values
    are now honoured.

    Side effects: sets ``self.graph``, ``self.train_flow`` and
    ``self.model``.

    Returns:
        The initial weights of the compiled model
        (``self.model.get_weights()``).
    """
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
def preprocess_train(self, G, edge_ids, edge_labels, mode='train'):
    """
    ```
    Wrap labelled links in a GraphSAGE link sequence.
    Args:
      G (networkx graph): networkx graph
      edge_ids(list): list of tuples representing edge ids
      edge_labels(list): edge labels (1 or 0 to indicate whether it is a
                         true edge in the original graph or not)
      mode(str): the sequence is shuffled only when mode is 'train'
    Returns:
      LinkSequenceWrapper around the generated link sequence
    Raises:
      Exception: with SG_ERRMSG when stellargraph cannot be imported or its
                 version is older than 0.8
    ```
    """
    # stellargraph is imported here so the dependency is only required when
    # this method is actually called.
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGELinkGenerator
    except Exception as err:
        # Was a bare ``except:``, which also converted KeyboardInterrupt and
        # SystemExit into this message and discarded the underlying cause.
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    G_sg = sg.StellarGraph(G, node_features="feature")
    shuffle = mode == 'train'
    link_seq = GraphSAGELinkGenerator(
        G_sg, U.DEFAULT_BS, self.sample_sizes
    ).flow(edge_ids, edge_labels, shuffle=shuffle)

    from .sg_wrappers import LinkSequenceWrapper
    return LinkSequenceWrapper(link_seq)
def _dispatch_generator(graph, model_name, params, generator_type="node"):
    """Return the data generator that matches ``model_name``.

    Only the entries of ``params`` required by the selected model are read.
    ``generator_type`` matters for ``attri2vec`` and the GraphSAGE models
    only: ``"node"`` selects the node generator, any other value selects
    the link generator.

    Raises:
        ValueError: if ``model_name`` is not one of the handled names.
    """
    if model_name == "watchyourstep":
        return AdjacencyPowerGenerator(graph, num_powers=params["num_powers"])

    if model_name in ("complex", "distmult"):
        return KGTripleGenerator(graph, params["batch_size"])

    if model_name == "attri2vec":
        generator_cls = (
            Attri2VecNodeGenerator
            if generator_type == "node"
            else Attri2VecLinkGenerator
        )
        return generator_cls(graph, params["batch_size"])

    if model_name in ("graphsage", "graphsage_dgi"):
        generator_cls = (
            GraphSAGENodeGenerator
            if generator_type == "node"
            else GraphSAGELinkGenerator
        )
        return generator_cls(graph, params["batch_size"], params["num_samples"])

    if model_name in ("gcn_dgi", "gat_dgi"):
        return FullBatchNodeGenerator(graph, sparse=False)

    if model_name in ("cluster_gcn_dgi", "cluster_gat_dgi"):
        return ClusterNodeGenerator(
            graph, clusters=params["clusters"], q=params["clusters_q"]
        )

    raise ValueError(f"Unknown model name '{model_name}'")
def create_graphSAGE_model(graph, link_prediction=False):
    """Build a small GraphSAGE model on ``graph`` with fixed toy inputs.

    With ``link_prediction=True`` a link generator is fed three hard-coded
    node pairs and the Keras head is an inner-product link classifier;
    otherwise a node generator is fed nodes 1 and 2 with one-hot labels and
    the head is a two-unit softmax layer.

    Returns:
        tuple: ``(base_model, keras_model, generator, train_gen)``.
    """
    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph, batch_size=2, num_samples=[2, 2])
        link_ids = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(link_ids, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(
        layer_sizes=[8, 8], generator=train_gen, bias=True, dropout=0.5
    )

    if not link_prediction:
        node_inputs, node_output = base_model.node_model()
        node_prediction = layers.Dense(units=2, activation="softmax")(node_output)
        node_classifier = Model(inputs=node_inputs, outputs=node_prediction)
        return base_model, node_classifier, generator, train_gen

    # One node model for each end of the link.
    src_inputs, src_output = base_model.node_model()
    dst_inputs, dst_output = base_model.node_model()

    # Link inputs alternate: source tensor, destination tensor, source, ...
    link_inputs = []
    for src_tensor, dst_tensor in zip(src_inputs, dst_inputs):
        link_inputs.extend((src_tensor, dst_tensor))

    link_prediction_out = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )([src_output, dst_output])
    link_classifier = Model(inputs=link_inputs, outputs=link_prediction_out)
    return base_model, link_classifier, generator, train_gen
def create_train_gen(G):
    """Return a GraphSAGE link flow fed by an unsupervised sampler on ``G``.

    The sampler is started from every node of ``G``; walk length, walks per
    node, batch size and neighbour sample sizes are all read from ``config``.
    """
    # This generates random walk samples from the graph
    walk_sampler = UnsupervisedSampler(
        G,
        nodes=list(G.nodes()),
        length=config.WALK_LENGTH,
        number_of_walks=config.NUM_WALKS,
    )
    link_generator = GraphSAGELinkGenerator(
        G, config.BATCH_SIZE, config.NUM_SAMPLES
    )
    return link_generator.flow(walk_sampler)
def initialize(self, **hyper_params):
    """Build train/test link flows and the GraphSAGE link-prediction model.

    Recognised keys in ``hyper_params`` (each falls back to the default
    shown when omitted):

        batch_size (20), num_samples ([20, 10]), layer_sizes ([10, 10]),
        bias (True), dropout (0.1), lr (1e-2)

    Previously a default was assigned only when the key was missing, so any
    hyper-parameter that *was* supplied left its local variable unbound and
    the method failed with ``UnboundLocalError``; in addition the
    ``layer_sizes`` / ``num_samples`` checks were crossed.  Supplied values
    are now honoured.

    Side effects: sets ``self.graph_test``, ``self.graph_train``,
    ``self.train_flow``, ``self.test_flow`` and ``self.model``.

    Returns:
        tuple: ``(number of training examples, number of test examples)``.
    """
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split (taken from the graph that already has the test links removed)
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    # NOTE(review): the test generator samples neighbourhoods from
    # graph_train rather than graph_test -- kept as in the original; confirm
    # this is intended.
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # return number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Load a saved Keras model and evaluate it on freshly sampled links of G.

    A fraction p=0.1 of the links is sampled by an EdgeSplitter using the
    sampling method and probabilities taken from the module-level ``args``.
    Because the sample is drawn independently of training, it may contain
    links the model was trained on unless the splitter is seeded the same
    way as in train().

    Args:
        G: Graph handed to EdgeSplitter (described as a NetworkX graph by
            the original documentation)
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )

    # Recover the neighbour sample sizes from the model's input shapes: the
    # ratio between the second dimensions of consecutive inputs, stepping
    # through the odd indices.
    num_samples = []
    for idx in range(1, len(model.input_shape) - 1, 2):
        ratio = model.input_shape[idx + 1][1] / model.input_shape[idx][1]
        num_samples.append(int(ratio))

    # Sample the evaluation links; the first return value is the graph with
    # those links removed.
    splitter = EdgeSplitter(G)
    reduced_graph, edge_ids_test, edge_labels_test = splitter.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # Wrap the reduced graph as a StellarGraph using the "feature" node attribute
    G_test = sg.StellarGraph(reduced_graph, node_features="feature")

    # Generator feeds data from (source, target) sampled subgraphs to the model
    link_generator = GraphSAGELinkGenerator(
        G_test, batch_size, num_samples, name="test"
    )
    test_gen = link_generator.flow(edge_ids_test, edge_labels_test)

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTest Set Evaluation:")
    for metric_name, metric_value in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(metric_name, metric_value))
# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Full graph built from the node and edge tables
G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split: sample 20% of the links as labelled training examples
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2,
    method="global",
    keep_connected=True,
)

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=train_gen,
    bias=True,
    dropout=0.3,
)
x_inp, x_out = graphsage.in_out_tensors()

# Link classifier head applied to the GraphSAGE outputs
prediction = link_classification(
    output_dim=1,
    output_act="relu",
    edge_embedding_method="ip",
)(x_out)
# GraphSAGE model: # In[12]: num_samples = [20, 10] # ### Create the generators for training # For training we create a generator on the `G_train` graph. The `shuffle=True` argument is given to the `flow` method to improve training. # In[13]: generator = GraphSAGELinkGenerator(G_train, batch_size, num_samples) # In[14]: train_gen = generator.flow(edge_ids_train, edge_labels_train, shuffle=True) # At test time we use the `G_test` graph and don't specify the `shuffle` argument (it defaults to `False`). # In[15]: test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples).flow( edge_ids_test, edge_labels_test
g_test = g.copy() edgelist = [(start, end) for start, end in zip(link_ids_train[:,0], link_ids_train[:,1]) ] g_test.remove_edges_from(edgelist) G = StellarGraph.from_networkx(g, node_features="feature") g_train = StellarGraph.from_networkx(g_train, node_features="feature") g_test = StellarGraph.from_networkx(g_test, node_features="feature") print(g_train.info()) print(g_test.info()) ## batch_size = 40 num_samples = [15, 10, 5] train_gen = GraphSAGELinkGenerator(g_train, batch_size, num_samples) train_flow = train_gen.flow(link_ids_train, link_labels_train, shuffle=True) test_gen = GraphSAGELinkGenerator(g_test, batch_size, num_samples) test_flow = test_gen.flow(link_ids_test, link_labels_test) traintest_gen = GraphSAGELinkGenerator(G, batch_size, num_samples) traintest_flow = traintest_gen.flow(link_ids, link_labels) ## =================== model -============================== graphsage_model = GraphSAGE(layer_sizes=[64, 32, 16], generator= train_gen, activations=["relu","relu","linear"], bias=True, aggregator = MaxPoolingAggregator, dropout=0.0) x_inp, x_out = graphsage_model.in_out_tensors() def custom_layer(x):
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Split the links of G, train a GraphSAGE link predictor, report metrics
    before and after training, and save the trained model to an ``.h5`` file
    whose name encodes the hyper-parameters.

    The link sampling method, sampling probabilities and edge embedding
    method are read from the module-level ``args``.

    Args:
        G: Graph handed to EdgeSplitter (described as a NetworkX graph by
            the original documentation)
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch used by the train and test generators
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # First split: hold out p=0.1 of the links for testing; the returned
    # graph has those links removed.
    test_splitter = EdgeSplitter(G)
    reduced_test, edge_ids_test, edge_labels_test = test_splitter.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # Second split: from the test-reduced graph (with G passed as the second
    # splitter argument) hold out p=0.1 of the links for training.
    train_splitter = EdgeSplitter(reduced_test, G)
    reduced_train, edge_ids_train, edge_labels_train = train_splitter.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # Wrap both reduced graphs as StellarGraph objects using the "feature" attribute
    G_train = sg.StellarGraph(reduced_train, node_features="feature")
    G_test = sg.StellarGraph(reduced_test, node_features="feature")

    # Two mappers feed link data from sampled subgraphs to the model:
    # one for training and one for testing
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model and its input/output tensors for link prediction
    graphsage = GraphSAGE(
        layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout
    )
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    def report(header, values):
        # Print one "name: value" line per compiled metric under a header.
        print(header)
        for metric_name, metric_value in zip(model.metrics_names, values):
            print("\t{}: {:0.4f}".format(metric_name, metric_value))

    # Evaluate the initial (untrained) model on the train and test set
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)
    report("\nTrain Set Metrics of the initial (untrained) model:", init_train_metrics)
    report("\nTest Set Metrics of the initial (untrained) model:", init_test_metrics)

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)
    report("\nTrain Set Metrics of the trained model:", train_metrics)
    report("\nTest Set Metrics of the trained model:", test_metrics)

    # Save the trained model
    samples_tag = "_".join(map(str, num_samples))
    layers_tag = "_".join(map(str, layer_size))
    save_str = "_n{}_l{}_d{}_r{}".format(
        samples_tag, layers_tag, dropout, learning_rate
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
# Load the Cora dataset and show its description
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

# Sampler and generator settings
number_of_walks = 1
length = 5
batch_size = 50
epochs = 4
num_samples = [10, 5]

# Unsupervised sampler started from every node of the graph
nodes = list(G.nodes())
unsupervised_samples = UnsupervisedSampler(
    G,
    nodes=nodes,
    length=length,
    number_of_walks=number_of_walks,
)

# Link generator fed by the sampler
generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)

# +
# feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
import os
from PIL import Image
import pathlib
import csv
## Train the embedding # model G = sg.StellarGraph(Gnx, node_features=node_features) Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features) # The graph G # together with the unsupervised sampler will be used to generate samples. actual_nodes_train = list(Gtrain.nodes()) if testtype == 'nodes': assert set(nodes_train).issuperset(actual_nodes_train) unsupervised_samples = UnsupervisedSampler(Gtrain, nodes=actual_nodes_train, length=length_of_walks, number_of_walks=number_of_walks) train_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(unsupervised_samples) # Build the model assert len(layer_sizes) == len(num_samples) graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=0.0, normalize="l2") x_inp, x_out = graphsage.build(flatten_output=False) prediction = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method='ip')(x_out) model = keras.Model(inputs=x_inp, outputs=prediction) model.compile( optimizer=keras.optimizers.Adam(lr=1e-3),