import datetime
import os
import pickle

import keras
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg
from keras import layers, losses, optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.models import Model
from sklearn import feature_extraction, model_selection
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from stellargraph.layer import GAT
from stellargraph.mapper import FullBatchNodeGenerator


def create_GAT_model(graph):
    generator = FullBatchNodeGenerator(graph, sparse=False, method=None)
    train_gen = generator.flow([0, 1], np.array([[1, 0], [0, 1]]))
    gat = GAT(
        layer_sizes=[2, 2],
        generator=generator,
        bias=False,
        in_dropout=0,
        attn_dropout=0,
        activations=["elu", "softmax"],
        normalize=None,
        saliency_map_support=True,
    )
    # Override the layer initialisers so all weights start at one
    # (deterministic output, e.g. for saliency-map checks).
    for layer in gat._layers:
        layer._initializer = "ones"
    x_inp, x_out = gat.node_model()
    keras_model = Model(inputs=x_inp, outputs=x_out)
    return gat, keras_model, generator, train_gen
def create_GAT_model(graph):
    # Alternative builder: a deeper GAT with dropout. Note that this second
    # definition shadows the one above if both are kept in the same module.
    generator = FullBatchNodeGenerator(graph, sparse=False)
    train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))
    base_model = GAT(
        layer_sizes=[8, 8, 2],
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "elu", "softmax"],
        normalize=None,
    )
    x_inp, x_out = base_model.node_model()
    keras_model = Model(inputs=x_inp, outputs=x_out)
    return base_model, keras_model, generator, train_gen
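# Usage sketch for the builders above, under assumed data: the toy graph,
# constant features, and training settings here are illustrative only and
# are not part of the original script.
def _demo_create_GAT_model():
    # A tiny 3-node graph with 2-dimensional constant node features.
    g = nx.Graph()
    g.add_edges_from([(0, 1), (1, 2)])
    features = pd.DataFrame(np.ones((3, 2)), index=[0, 1, 2], columns=["f0", "f1"])
    graph = sg.StellarGraph(g, node_features=features)

    base_model, keras_model, generator, train_gen = create_GAT_model(graph)
    keras_model.compile(
        optimizer=optimizers.Adam(lr=0.01),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    # train_gen yields the whole graph in a single batch, so each epoch is one step.
    keras_model.fit_generator(train_gen, epochs=5, verbose=0)
    return keras_model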
def infer_attributes_gat(Gnx, savepred=True, plot=False):
    """Train a GAT to infer the 'data' (node type) attribute of a directed
    NetworkX graph from simple structural features; returns a DataFrame of
    predicted vs. true labels."""
    # Define node data
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size",
    ]
    node_type = [v for k, v in nx.get_node_attributes(Gnx, "data").items()]
    d = {"node_type": node_type}
    if "in_degree" in feature_names:
        indeg = [v for k, v in Gnx.in_degree]
        indeg = np.divide(indeg, max(indeg))
        indeg[indeg >= 0.5] = 1
        indeg[indeg < 0.5] = 0
        d["in_degree"] = indeg
    if "out_degree" in feature_names:
        outdeg = [v for k, v in Gnx.out_degree]
        outdeg = np.divide(outdeg, max(outdeg))
        outdeg[outdeg >= 0.5] = 1
        outdeg[outdeg < 0.5] = 0
        d["out_degree"] = outdeg
    if "in_degree_centrality" in feature_names:
        indeg_cent = [v for k, v in nx.algorithms.in_degree_centrality(Gnx).items()]
        indeg_cent = np.divide(indeg_cent, max(indeg_cent))
        indeg_cent[indeg_cent >= 0.5] = 1
        indeg_cent[indeg_cent < 0.5] = 0
        d["in_degree_centrality"] = indeg_cent
    if "out_degree_centrality" in feature_names:
        outdeg_cent = [v for k, v in nx.algorithms.out_degree_centrality(Gnx).items()]
        outdeg_cent = np.divide(outdeg_cent, max(outdeg_cent))
        outdeg_cent[outdeg_cent >= 0.5] = 1
        outdeg_cent[outdeg_cent < 0.5] = 0
        d["out_degree_centrality"] = outdeg_cent
    if "closeness_centrality" in feature_names:
        close_cent = [v for k, v in nx.algorithms.closeness_centrality(Gnx).items()]
        close_cent = np.divide(close_cent, max(close_cent))
        close_cent[close_cent >= 0.5] = 1
        close_cent[close_cent < 0.5] = 0
        d["closeness_centrality"] = close_cent
    if "betweenness_centrality" in feature_names:
        between_cent = [v for k, v in nx.algorithms.betweenness_centrality(Gnx).items()]
        between_cent = np.divide(between_cent, max(between_cent))
        between_cent[between_cent >= 0.5] = 1
        between_cent[between_cent < 0.5] = 0
        d["betweenness_centrality"] = between_cent
    if "clustering_coefficient" in feature_names:
        clustering_co = [v for k, v in nx.algorithms.clustering(Gnx).items()]
        clustering_co = np.divide(clustering_co, max(clustering_co))
        clustering_co[clustering_co >= 0.5] = 1
        clustering_co[clustering_co < 0.5] = 0
        d["clustering_coefficient"] = clustering_co
    if "square_clustering" in feature_names:
        sq_clustering = [v for k, v in nx.algorithms.square_clustering(Gnx).items()]
        sq_clustering = np.divide(sq_clustering, max(sq_clustering))
        sq_clustering[sq_clustering >= 0.5] = 1
        sq_clustering[sq_clustering < 0.5] = 0
        d["square_clustering"] = sq_clustering
    if "core_number" in feature_names:
        core_number = [v for k, v in nx.algorithms.core_number(Gnx).items()]
        core_number = np.divide(core_number, max(core_number))
        core_number[core_number >= 0.5] = 1
        core_number[core_number < 0.5] = 0
        d["core_number"] = core_number
    if "pagerank" in feature_names:
        pagerank = [v for k, v in nx.algorithms.pagerank(Gnx).items()]
        pagerank = np.divide(pagerank, max(pagerank))
        pagerank[pagerank >= 0.5] = 1
        pagerank[pagerank < 0.5] = 0
        d["pagerank"] = pagerank
    if "constraint" in feature_names:
        constraint = [v for k, v in nx.algorithms.constraint(Gnx).items()]
        constraint = np.divide(constraint, max(constraint))
        constraint[np.isnan(constraint)] = 0
        constraint[constraint >= 0.5] = 1
        constraint[constraint < 0.5] = 0
        d["constraint"] = constraint
    if "effective_size" in feature_names:
        effective_size = [v for k, v in nx.algorithms.effective_size(Gnx).items()]
        effective_size = np.divide(effective_size, max(effective_size))
        effective_size[np.isnan(effective_size)] = 0
        effective_size[effective_size >= 0.5] = 1
        effective_size[effective_size < 0.5] = 0
        d["effective_size"] = effective_size

    # Index the feature table by the graph's nodes (the order in which the
    # feature lists above were built).
    node_data = pd.DataFrame(data=d, index=list(Gnx.nodes()))
    node_data = shuffle(node_data)

    # Split the data
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx))
    )
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx))
    )

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["node_type"]].to_dict("records")
    )
    val_targets = target_encoding.transform(val_data[["node_type"]].to_dict("records"))
    test_targets = target_encoding.transform(test_data[["node_type"]].to_dict("records"))
    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_data.index, train_targets)
    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for node prediction,
    # via the GAT.node_model() method:
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    val_gen = generator.flow(val_data.index, val_targets)
    if not os.path.isdir(".temp/logs"):
        os.makedirs(".temp/logs")
    if not os.path.isdir(".temp/output"):
        os.makedirs(".temp/output")
    es_callback = EarlyStopping(
        monitor="val_weighted_acc",
        patience=100,  # number of epochs to wait before early stopping when there is no further improvement
    )
    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )
    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)
    node_predictions = target_encoding.inverse_transform(all_predictions)
    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data["node_type"]})
    print(df.head())
    if savepred:
        df.to_excel(
            ".temp/output/output"
            + str(datetime.datetime.now()).replace(":", "-")
            + ".xlsx"
        )

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print(
            "Embedding layer: {}, output shape {}".format(
                emb_layer.name, emb_layer.output_shape
            )
        )
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)

        X = emb
        y = np.argmax(
            target_encoding.transform(
                node_data.reindex(G.nodes())[["node_type"]].to_dict("records")
            ),
            axis=1,
        )
        if X.shape[1] > 2:
            transform = TSNE  # or PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X), index=list(G.nodes()))
            emb_transformed["label"] = y
        else:
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            emb_transformed = emb_transformed.rename(columns={"0": 0, "1": 1})
            emb_transformed["label"] = y

        def plot_emb(transform, emb_transformed):
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(
                emb_transformed[0],
                emb_transformed[1],
                c=emb_transformed["label"].astype("category"),
                cmap="jet",
                alpha=0.7,
            )
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                "{} visualization of GAT embeddings for the fighter graph".format(
                    transform.__name__
                )
            )

        # Plot the training history
        def remove_prefix(text, prefix):
            return text[text.startswith(prefix) and len(prefix):]

        def plot_history(history):
            metrics = sorted(
                set([remove_prefix(m, "val_") for m in list(history.history.keys())])
            )
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history["val_" + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel("epoch")
                plt.legend(["train", "validation"], loc="best")

        plot_history(history)
        plot_emb(transform, emb_transformed)
        plt.show()
    return df
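# Hedged usage sketch for infer_attributes_gat: the random directed graph and
# the synthetic 'data' labels below are assumptions for illustration only.
def _demo_infer_attributes_gat():
    g = nx.gnp_random_graph(100, 0.05, seed=42, directed=True)
    for n in g.nodes():
        # The function reads the node label from the 'data' attribute.
        g.nodes[n]["data"] = "A" if n % 2 == 0 else "B"
    # Note: this trains for up to 2000 epochs, subject to early stopping.
    return infer_attributes_gat(g, savepred=False, plot=False)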
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of the number of hidden units in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        es_patience: Early-stopping patience, in epochs
        dropout: The dropout rate (0 -> 1)
        target_name: Name of the target (label) column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    feature_names = ["w_{}".format(ii) for ii in range(1433)]  # CORA's 1433 binary word features
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split the test set into validation and test sets
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=1000, random_state=523214
    )

    # Create the generators that feed data from the graph to the model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    print(model.summary())

    # Train model: callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc", patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    # `args` is expected to be defined at module level (see the entry point
    # sketched below).
    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")

        # Get the training data
        [X, A], y_train, node_mask_train = train_gen.__getitem__(0)
        N = A.shape[0]
        # A = sparse.csr_matrix(A + np.eye(A.shape[0]))  # Add self-loops

        # Get the validation data
        [_, _], y_val, node_mask_val = val_gen.__getitem__(0)

        history = model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load the best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on the validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(
            x=[X, A], y=y_val, sample_weight=node_mask_val, batch_size=N
        )
    else:
        val_metrics = model.evaluate_generator(val_gen)
    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on the test set and print metrics
    if args.interface == "fit":
        [_, _], y_test, node_mask_test = generator.flow(
            test_nodes, test_targets
        ).__getitem__(0)
        test_metrics = model.evaluate(
            x=[X, A], y=y_test, sample_weight=node_mask_test, batch_size=N
        )
    else:
        test_metrics = model.evaluate_generator(generator.flow(test_nodes, test_targets))
    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes.
    # Note that `predict` and `predict_generator` operate differently to the
    # `GraphSAGE` or `HinSAGE` models: even if given fewer than the complete
    # set of nodes, they still return predictions for all nodes, in a fixed
    # order defined by the order of nodes in X and A (which is defined by the
    # order of G.nodes()).
    if args.interface == "fit":
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=list(G.nodes())
    )
    accuracy = np.mean(
        [
            "subject=" + gt_subject == p
            for gt_subject, p in zip(
                node_data["subject"][list(G.nodes())], node_predictions.idxmax(axis=1)
            )
        ]
    )
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout, learning_rate
    )
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
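# train() reads a module-level `args` (args.interface), which this excerpt does
# not define. Below is a plausible entry-point sketch; the flag names, CORA
# file layout, and hyperparameter values are assumptions modelled on the
# comments and save paths in train(), not confirmed by the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train a GAT model on the CORA dataset")
    parser.add_argument(
        "--interface",
        choices=["fit", "fit_generator"],
        default="fit_generator",
        help="Keras training interface to use",
    )
    parser.add_argument(
        "--data-dir",
        default="data/cora",
        help="Directory containing cora.cites and cora.content (assumed layout)",
    )
    args = parser.parse_args()

    # Load the citation edgelist: one "target source" pair per line.
    edgelist = pd.read_csv(
        os.path.join(args.data_dir, "cora.cites"),
        sep="\t",
        header=None,
        names=["target", "source"],
    )

    # Load node features (1433 binary word indicators) and the 'subject' label,
    # indexed by paper id.
    column_names = ["id"] + ["w_{}".format(ii) for ii in range(1433)] + ["subject"]
    node_data = pd.read_csv(
        os.path.join(args.data_dir, "cora.content"),
        sep="\t",
        header=None,
        names=column_names,
    ).set_index("id")

    train(edgelist, node_data, attn_heads=8, layer_sizes=[8, 8], num_epochs=20, dropout=0.5)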