def graph_link_predictor(name, train_data, preproc, layer_sizes=None, verbose=1):
    """
    Build and return a neural link prediction model.

    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported).
                      The value is not consulted by this function; a
                      GraphSAGE model is always built.
        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc(LinkPreprocessor): a LinkPreprocessor instance
        layer_sizes (list): hidden sizes passed to GraphSAGE; must have the
                            same length as preproc.sample_sizes.
                            Defaults to [20, 20].
        verbose (boolean): verbosity of output (not consulted by this function)
    Return:
        model (Model): A compiled Keras Model instance
    Raises:
        Exception: if train_data is not a LinkSequenceWrapper, or if
                   stellargraph cannot be imported or is older than 0.8
        ValueError: if len(layer_sizes) != len(preproc.sample_sizes)
    """
    from .sg_wrappers import LinkSequenceWrapper

    # A fresh list per call avoids sharing one mutable default between calls.
    if layer_sizes is None:
        layer_sizes = [20, 20]

    # check argument
    if not isinstance(train_data, LinkSequenceWrapper):
        err = """ train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object """
        raise Exception(err)
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError(
            'number of layer_sizes must match len(preproc.sample_sizes)')

    # NOTE(review): the result is unused; the call is retained in case it
    # validates train_data -- confirm and drop if it does not.
    U.nclasses_from_data(train_data)

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except Exception as e:
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not converted into an installation error.
        raise Exception(SG_ERRMSG) from e
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_data,
                          bias=True, dropout=0.3)
    x_inp, x_out = graphsage.build()
    # NOTE(review): a relu output is unbounded above while binary
    # cross-entropy expects values in [0, 1]; consider "sigmoid".
    prediction = link_classification(output_dim=1, output_act="relu",
                                     edge_embedding_method='ip')(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT, loss='binary_crossentropy',
                  metrics=["accuracy"])
    return model
def initialize(self, **hyper_params):
    """Build the unsupervised GraphSAGE link model and its embedding model.

    Each hyper-parameter is taken from ``hyper_params`` when present and
    otherwise falls back to its default. (Previously a supplied value was
    never bound to a local, so passing any key raised UnboundLocalError, and
    the ``layer_sizes``/``num_samples`` guards were swapped.)

    Args:
        **hyper_params: optional overrides:
            batch_size (default 16), num_samples (default [25, 10]),
            layer_sizes (default [256, 256]), bias (default True),
            dropout (default 0.0), lr (default 1e-3),
            num_walks (default 1), length (default 5).

    Side effects:
        Builds ``self.graph`` from ``self.nodes_df`` / ``self.edges_df`` and
        then deletes those two attributes; sets ``self.nodes``,
        ``self.train_flow``, ``self.model``, ``self.embedding_model`` and
        ``self.node_gen``.

    Returns:
        The result of ``self.model.get_weights()``.
    """
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())

    # The data frames are no longer referenced once the graph exists.
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_gen,
        bias=bias,
        dropout=dropout,
        normalize="l2",
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Every second input tensor and the first output tensor are reused to
    # form a single-node model (presumably the source-node half of each
    # link pair -- TODO confirm against the generator's input ordering).
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(
        self.graph, batch_size, num_samples
    ).flow(self.nodes)

    return self.model.get_weights()
def initialize(self, **hyper_params):
    """Split the graph into train/test link sets and build the GraphSAGE model.

    Each hyper-parameter is taken from ``hyper_params`` when present and
    otherwise falls back to its default. (Previously a supplied value was
    never bound to a local, so passing any key raised UnboundLocalError, and
    the ``layer_sizes``/``num_samples`` guards were swapped.)

    Args:
        **hyper_params: optional overrides:
            batch_size (default 20), num_samples (default [20, 10]),
            layer_sizes (default [10, 10]), bias (default True),
            dropout (default 0.1), lr (default 1e-2).

    Side effects:
        Sets ``self.graph_test``, ``self.graph_train``, ``self.train_flow``,
        ``self.test_flow`` and ``self.model``.

    Returns:
        Tuple ``(n_train, n_test)``: the first-axis lengths of the sampled
        training and testing edge-id arrays.
    """
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split (taken from the already reduced test graph)
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    # NOTE(review): the test generator samples neighbourhoods from
    # graph_train rather than graph_test -- confirm this is intentional.
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # return number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
def initialize(self, **hyper_params):
    """Sample training links from the graph and build the GraphSAGE model.

    Each hyper-parameter is taken from ``hyper_params`` when present and
    otherwise falls back to its default. (Previously a supplied value was
    never bound to a local, so passing any key raised UnboundLocalError, and
    the ``layer_sizes``/``num_samples`` guards were swapped.)

    Args:
        **hyper_params: optional overrides:
            batch_size (default 20), num_samples (default [20, 10]),
            layer_sizes (default [20, 20]), bias (default True),
            dropout (default 0.3), lr (default 1e-3),
            train_split (default 0.2; passed as ``p`` to the edge splitter).

    Side effects:
        Sets ``self.graph``, ``self.train_flow`` and ``self.model``.

    Returns:
        The result of ``self.model.get_weights()``.
    """
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()

    # NOTE(review): a relu output is unbounded above while binary
    # cross-entropy expects values in [0, 1]; consider "sigmoid".
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
def _fit_inductive_embedder(self, train_graph):
    """Fit inductive embedder (predictive model and embeddings).

    For the Deep Graph Infomax model names ("cluster_gcn_dgi",
    "cluster_gat_dgi") the work is delegated to ``_fit_deep_graph_infomax``.
    Otherwise a link-prediction model is trained on random-walk samples and
    a node-embedding model sharing its tensors is returned.

    Reads ``self.model_name`` and ``self.params`` (keys "length",
    "number_of_walks" and "epochs" are used directly here; the dispatch
    helpers receive the whole dict).

    Args:
        train_graph: graph passed to the sampler and generator; must provide
            ``nodes()``.

    Returns:
        The result of ``_fit_deep_graph_infomax`` for DGI model names,
        otherwise a ``Model`` from the source-side inputs to ``x_out[0]``.

    Raises:
        ValueError: if ``self.model_name`` is not one of the DGI names,
            "attri2vec" or "graphsage". Raised before training so an
            unsupported name does not waste a full fit and then fail on an
            unbound local.
    """
    if self.model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
        return _fit_deep_graph_infomax(train_graph, self.params, self.model_name)

    # Only these two names have an input-selection rule below.
    if self.model_name not in ("attri2vec", "graphsage"):
        raise ValueError(
            "Unsupported inductive embedder: {!r}".format(self.model_name))

    unsupervised_samples = UnsupervisedSampler(
        train_graph,
        nodes=train_graph.nodes(),
        length=self.params["length"],
        number_of_walks=self.params["number_of_walks"],
    )

    generator = _dispatch_generator(
        train_graph, self.model_name, self.params, generator_type="edge")
    layer_sizes = _dispatch_layer_sizes(self.model_name, self.params)
    embedding_layer = _dispatch_inductive_layer(
        layer_sizes, generator, self.model_name, self.params)

    x_inp, x_out = embedding_layer.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=1e-3),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    train_generator = generator.flow(unsupervised_samples)
    model.fit(train_generator, epochs=self.params["epochs"], shuffle=True, verbose=0)

    # Select the inputs that feed x_out[0]: attri2vec uses the first input
    # tensor, graphsage every second input tensor.
    if self.model_name == "attri2vec":
        x_inp_src = x_inp[0]
    else:  # "graphsage" (guaranteed by the check above)
        x_inp_src = x_inp[0::2]

    x_out_src = x_out[0]
    embedding_model = Model(inputs=x_inp_src, outputs=x_out_src)
    return embedding_model
def create_graphSAGE_model(graph, link_prediction=False):
    """Build a small GraphSAGE fixture for node or link prediction.

    Args:
        graph: graph handed to the StellarGraph generator.
        link_prediction: when true, build a link generator and a
            link-classification head; otherwise a node generator and a
            2-unit softmax head.

    Returns:
        Tuple ``(base_model, keras_model, generator, train_gen)``.
    """
    sampling = dict(batch_size=2, num_samples=[2, 2])

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph, **sampling)
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph, **sampling)
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(
        layer_sizes=[8, 8], generator=train_gen, bias=True, dropout=0.5
    )

    if link_prediction:
        # Expose input and output sockets of graphsage, once for the source
        # nodes and once for the destination nodes.
        src_inputs, src_output = base_model.node_model()
        dst_inputs, dst_output = base_model.node_model()

        # Link inputs alternate (source, destination) tensors.
        x_inp = []
        for src_tensor, dst_tensor in zip(src_inputs, dst_inputs):
            x_inp.extend((src_tensor, dst_tensor))

        link_head = link_classification(
            output_dim=1, output_act="relu", edge_embedding_method="ip"
        )
        prediction = link_head([src_output, dst_output])
    else:
        x_inp, node_output = base_model.node_model()
        prediction = layers.Dense(units=2, activation="softmax")(node_output)

    keras_model = Model(inputs=x_inp, outputs=prediction)
    return base_model, keras_model, generator, train_gen
def create_model(graph_sage):
    """Wrap a GraphSAGE layer stack in a compiled binary link classifier.

    Args:
        graph_sage: object whose ``build(flatten_output=False)`` returns the
            ``(inputs, outputs)`` tensors of the embedding network.

    Returns:
        Tuple ``(x_inp, x_out, model)``: the input tensors, the embedding
        output tensors, and the compiled Keras model.
    """
    x_inp, x_out = graph_sage.build(flatten_output=False)

    # Link head configured with the inner-product ("ip") edge embedding and a
    # sigmoid activation on a single output.
    edge_scorer = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method="ip",
    )
    link_score = edge_scorer(x_out)

    model = keras.Model(inputs=x_inp, outputs=link_score)
    adam = keras.optimizers.Adam(lr=1e-3)
    model.compile(
        optimizer=adam,
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )
    return x_inp, x_out, model
# Train iterators: the link generator samples neighbourhoods from G_train and
# the flow pairs the training edge ids with their labels.
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(
    edge_ids_train,
    edge_labels_train,
    shuffle=True,
)

# Model defining - Keras functional API + Stellargraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=train_gen,
    bias=True,
    dropout=0.3,
)
x_inp, x_out = graphsage.in_out_tensors()

link_head = link_classification(
    output_dim=1,
    output_act="relu",
    edge_embedding_method="ip",
)
prediction = link_head(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)

# Set weights
# NOTE(review): allow_pickle=True unpickles objects from path_weights; only
# load weight files from a trusted source.
weights = np.load(path_weights, allow_pickle=True)
model.set_weights(weights)

print("Training started")
def get_hinsage_model(generator, train_gen, test_gen, num_samples=None,
                      hinsage_layer_sizes=None, bias=True, dropout=0.0,
                      lr=1e-2, edge_embedding_method='concat',
                      output_act='sigmoid'):
    """Build and compile a HinSAGE binary link-classification model.

    Args:
        generator: generator passed to ``HinSAGE``.
        train_gen: accepted for interface compatibility; not read here.
        test_gen: accepted for interface compatibility; not read here.
        num_samples: per-layer sample counts; only its length is checked
            here. Defaults to [8, 4].
        hinsage_layer_sizes: hidden sizes of the HinSAGE layers; must have
            the same length as ``num_samples``. Defaults to [32, 32].
        bias: forwarded to ``HinSAGE``.
        dropout: forwarded to ``HinSAGE``.
        lr: learning rate for the Adam optimizer.
        edge_embedding_method: forwarded to ``link_classification``.
        output_act: output activation forwarded to ``link_classification``.
            (Previously ignored: 'sigmoid' was hard-coded.)

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss; binary
        accuracy, precision and recall metrics).

    Raises:
        ValueError: if ``hinsage_layer_sizes`` and ``num_samples`` differ in
            length.
    """
    # Fresh lists per call avoid sharing mutable defaults between calls.
    if num_samples is None:
        num_samples = [8, 4]
    if hinsage_layer_sizes is None:
        hinsage_layer_sizes = [32, 32]

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if len(hinsage_layer_sizes) != len(num_samples):
        raise ValueError(
            "hinsage_layer_sizes and num_samples must have the same length "
            "(got {} and {})".format(len(hinsage_layer_sizes), len(num_samples)))

    hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes, generator=generator,
                      bias=bias, dropout=dropout)

    # Expose input and output sockets of hinsage:
    x_inp, x_out = hinsage.in_out_tensors()

    # Final estimator layer
    score_prediction = link_classification(
        output_dim=1,
        output_act=output_act,
        edge_embedding_method=edge_embedding_method,
    )(x_out)

    model = Model(inputs=x_inp, outputs=score_prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=lr),
        loss=losses.binary_crossentropy,
        metrics=[
            metrics.binary_accuracy,
            metrics.Precision(),
            metrics.Recall(),
        ],
    )
    return model
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train a GraphSAGE link-prediction model on graph G and save it to disk.

    The edge sampling method/probabilities and the edge embedding method are
    read from the module-level ``args`` object.

    Args:
        G: graph handed to EdgeSplitter (described upstream as a NetworkX graph)
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Batch size given to the link generators
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
    """
    metric_row = "\t{}: {:0.4f}"

    def report(header, values):
        # Print one header line followed by one "name: value" row per metric.
        print(header)
        for metric_name, metric_value in zip(model.metrics_names, values):
            print(metric_row.format(metric_name, metric_value))

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, sample p=0.1 of the positive links (and as many
    # negative links) as E_test; G_test is G with those links removed.
    edge_splitter_test = EdgeSplitter(G)
    split_options = dict(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        **split_options
    )

    # Repeat on G_test to obtain E_train and the further reduced G_train.
    edge_splitter_train = EdgeSplitter(G_test, G)
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        **split_options
    )

    # G_train / edge_ids_train / edge_labels_train are used for training,
    # G_test / edge_ids_test / edge_labels_test for testing.
    # Convert both graphs to StellarGraph objects for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mappers feed link data from sampled subgraphs to the GraphSAGE model;
    # one is needed for training and one for testing.
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model and its input/output tensors
    graphsage = GraphSAGE(
        layer_sizes=layer_size, generator=train_gen, bias=True, dropout=dropout
    )
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    link_head = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )
    prediction = link_head(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)
    report("\nTrain Set Metrics of the initial (untrained) model:", init_train_metrics)
    report("\nTest Set Metrics of the initial (untrained) model:", init_test_metrics)

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)
    report("\nTrain Set Metrics of the trained model:", train_metrics)
    report("\nTest Set Metrics of the trained model:", test_metrics)

    # Save the trained model; the file name encodes the main hyper-parameters.
    samples_tag = "_".join(map(str, num_samples))
    layers_tag = "_".join(map(str, layer_size))
    save_str = "_n{}_l{}_d{}_r{}".format(
        samples_tag, layers_tag, dropout, learning_rate
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
nodes=actual_nodes_train, length=length_of_walks, number_of_walks=number_of_walks) train_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(unsupervised_samples) # Build the model assert len(layer_sizes) == len(num_samples) graphsage = GraphSAGE(layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=0.0, normalize="l2") x_inp, x_out = graphsage.build(flatten_output=False) prediction = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method='ip')(x_out) model = keras.Model(inputs=x_inp, outputs=prediction) model.compile( optimizer=keras.optimizers.Adam(lr=1e-3), loss=keras.losses.binary_crossentropy, metrics=[keras.metrics.binary_accuracy], ) # Train the model history = model.fit_generator( train_gen, epochs=nepochs, verbose=verbose, use_multiprocessing=False, workers=nworkers,
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; these are expected to be
    # locally produced cache files only.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path``, overwriting it."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def _read_space_separated(path, columns):
    """Read a file of space-separated rows into a DataFrame with ``columns``."""
    with open(path, "r") as f:
        # Each csv record's first field holds the whole space-separated row.
        rows = [record[0].split(" ") for record in csv.reader(f)]
    return pd.DataFrame(rows, columns=columns)


def main():
    """Train an Attri2Vec link model on a sampled subgraph and save it.

    Reads ``training.txt`` / ``testing.txt`` and the cached pickles under
    ``pickles/`` (building the corpus, id and split caches when absent),
    prompts on stdin for the number of walks, walk length, epochs,
    multiprocessing flag and worker count, trains the model, saves it as
    ``model_walks{walks}len{length}e{epochs}.h5`` and returns it.

    Returns:
        The trained Keras model.

    Raises:
        FileNotFoundError: if the stemmed-corpus pickle or the LDA-matrix
            pickle is missing. (Previously this surfaced later as a
            NameError on the undefined variable.)
    """
    # in order of training examples
    training = _read_space_separated(r"training.txt", ['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    # in order of testing examples
    testing = _read_space_separated(r"testing.txt", ['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))

    # uncomment lines for reduced corpus with stopword removal.
    # In future integrate stemmer here, multi-language
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        corpus = _load_pickle(corpus_path)
        ids = _load_pickle(ids_path)
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY), position=0, leave=True):
            with open(NODE_INFO_DIRECTORY + filename, 'r',
                      encoding='UTF-8', errors='ignore') as f:
                doc_string = []
                for line in f:
                    doc_string.extend(
                        token.strip()
                        for token in line.lower().strip().split(" ")
                        if token != ""
                    )
            corpus.append(' '.join(doc_string))
            # Drop the last four characters of the file name (the extension).
            ids.append(filename[:-4])
        _dump_pickle(corpus, corpus_path)
        _dump_pickle(ids, ids_path)

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if not os.path.exists(stemmed_corpus_path):
        print('Stemmed corpus unavailable')
        raise FileNotFoundError(stemmed_corpus_path)
    stemmed_corpus = _load_pickle(stemmed_corpus_path)

    # in order of alphabetical text information i.e. 0, 1, 10, 100
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    # A random 5% of the training rows is kept; the indices are cached so the
    # same sample is reused on later runs.
    train_graph_split_path = 'pickles/train_graph_split.PICKLE'
    if os.path.exists(train_graph_split_path):
        keep_indices = _load_pickle(train_graph_split_path)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        _dump_pickle(keep_indices, train_graph_split_path)

    data_train_val = training.iloc[keep_indices]

    # Full graph: every training pair labelled as linked.
    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if not os.path.exists(lda_path):
        raise FileNotFoundError(lda_path)
    lda = _load_pickle(lda_path)

    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)
    all_node_features = node_data[feature_names]
    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)
    print(G_all.info())
    # NOTE(review): result is discarded; looks like a leftover sanity check.
    G_all.get_feature_for_nodes(['0'])

    # Subgraph: linked pairs among the sampled 5% of training rows.
    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })
    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)
    subgraph_node_ids = sorted(list(G_sub_nx.nodes))
    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)
    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)
    print(G_sub.info())

    # Train attri2vec on the subgraph
    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))
    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)
    batch_size = 50
    epochs = int(input('Enter number of epochs: '))
    generator = Attri2VecLinkGenerator(G_sub, batch_size)
    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec,
    # for node pair inputs:
    x_inp, x_out = attri2vec.build()
    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # The flow is created before the remaining prompts, matching the original
    # evaluation order of the fit_generator arguments.
    train_flow = generator.flow(unsupervised_samples)
    use_multiprocessing = bool(int(input('Multiprocessing? 1/0: ')))
    workers = int(input('Number of workers: '))
    history = model.fit_generator(
        train_flow,
        epochs=epochs,
        verbose=1,
        use_multiprocessing=use_multiprocessing,
        workers=workers,
        shuffle=True,
    )
    print(history)
    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))
    return model