def gcn_pipeline(G, node_subjects, layer_sizes=None, activations=None):
    """Train, validate and evaluate a GCN node-classification model on ``G``.

    Args:
        G: StellarGraph object to learn on.
        node_subjects: node labels; split into train/val/test by ``training_split``.
        layer_sizes: hidden-layer sizes for the GCN (default ``[16, 16]``).
        activations: per-layer activation names (default ``["relu", "relu"]``).
    """
    # Fix: the original used mutable default arguments ([16, 16] and
    # ["relu", "relu"]), which are shared across calls; use None sentinels.
    if layer_sizes is None:
        layer_sizes = [16, 16]
    if activations is None:
        activations = ["relu", "relu"]

    # Train and test split
    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    # GCN training generator (full-batch, GCN-normalized adjacency)
    generator = FullBatchNodeGenerator(G, method="gcn")
    train_gen = generator.flow(
        train_subjects.index,
        train_subjects.values,
    )
    gcn = GCN(layer_sizes=layer_sizes,
              activations=activations,
              generator=generator,
              dropout=0.5)
    model = build_model(gcn, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    es_callback = EarlyStopping(monitor="val_acc", patience=50,
                                restore_best_weights=True)
    history = model.fit(
        train_gen,
        epochs=200,
        validation_data=val_gen,
        verbose=0,
        shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
        callbacks=[es_callback],
    )
    plot_results(history)
    test_metrics(generator, model, test_subjects)
def preprocessing(self, g, train_node, file_emb_output="./emb/100_900_nede2vec.emb"):
    """Load node2vec embeddings, attach them to ``g`` as node features, split the
    labelled nodes into train/val/test sets and build full-batch GCN flows.

    NOTE(review): this method name shadows the module used below via
    ``preprocessing.LabelBinarizer`` (presumably ``sklearn.preprocessing``);
    it still resolves because the method name lives on the class, not in the
    module globals, but renaming one of the two would be clearer.

    Args:
        g: networkx graph to convert to a StellarGraph.
        train_node: frame/dict with a ``'values'`` entry holding node labels.
        file_emb_output: path to a word2vec-format embedding file.

    Returns:
        Tuple of (G, train_gen, train_targets, val_gen, val_targets,
        test_targets, test_gen, all_gen, generator).
    """
    node_subjects = train_node['values']
    node_subjects = node_subjects.astype(str)
    print(Counter(node_subjects))
    #file_emb_output = "./emb/100_900_nede2vec.emb"
    model = KeyedVectors.load_word2vec_format(file_emb_output)
    # NOTE(review): ``.wv`` / ``index2word`` are the gensim<4 API — confirm the
    # pinned gensim version; gensim 4 renamed these.
    node_ids = model.wv.index2word
    node_embeddings = (
        model.wv.vectors
    )  # num
    print("Embedding load success.")
    # Re-index the embedding rows by integer node id so they align with the graph.
    reinex_node_embedding = pd.DataFrame(node_embeddings, index=map(int, node_ids))
    g_feature_attr = g.copy()
    G = StellarGraph.from_networkx(
        g_feature_attr,
        node_features=reinex_node_embedding,
        node_type_default="n",
        edge_type_default="e"
    )
    print(G.info())
    # 160 stratified training nodes, then 20 validation nodes; the rest is test.
    train_subjects, test_subjects = model_selection.train_test_split(
        node_subjects, train_size=160, test_size=None, stratify=node_subjects
    )
    val_subjects, test_subjects = model_selection.train_test_split(
        test_subjects, train_size=20, test_size=None, stratify=test_subjects
    )
    train_subjects.value_counts().to_frame()  # NOTE(review): result is discarded
    # One-hot encode the string labels.
    target_encoding = preprocessing.LabelBinarizer()
    # target_encoding = preprocessing.OneHotEncoder()
    train_targets = target_encoding.fit_transform(train_subjects)
    val_targets = target_encoding.transform(val_subjects)
    test_targets = target_encoding.transform(test_subjects)
    # Full-batch flows for each split plus one covering every labelled node.
    generator = FullBatchNodeGenerator(G, method="gcn")
    train_gen = generator.flow(train_subjects.index, train_targets)
    val_gen = generator.flow(val_subjects.index, val_targets)
    test_gen = generator.flow(test_subjects.index, test_targets)
    all_nodes = node_subjects.index
    all_gen = generator.flow(all_nodes)
    return G, train_gen, train_targets, val_gen, val_targets, test_targets, test_gen, all_gen, generator
def test_GCN_apply_dense():
    """Dense-mode GCN: predictions from raw tensors must match the generator flow.

    Fixes: use ``in_out_tensors()`` instead of the deprecated ``build()`` and
    ``model.predict`` instead of the removed ``predict_generator`` (consistent
    with the other tests in this file); drop the unused ``n_nodes`` local.
    """
    G, features = create_graph_features()
    adj = nx.to_numpy_array(G)[None, :, :]
    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index")
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=False, method="none")
    gcnModel = GCN([2], generator, activations=["relu"], dropout=0.5)
    x_in, x_out = gcnModel.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predictions from explicitly-built input tensors
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, adj])
    assert preds_1.shape == (1, 2, 2)

    # Check predictions when node ids flow through the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)
    assert preds_1 == pytest.approx(preds_2)
def test_APPNP_apply_propagate_model_sparse():
    """APPNP.propagate_model with sparse inputs: direct tensors vs generator flow."""
    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)

    coo = adj.tocoo()
    edge_indices = np.expand_dims(np.hstack((coo.row[:, None], coo.col[:, None])), 0)
    edge_weights = np.expand_dims(coo.data, 0)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    appnp = APPNP([2], generator=generator, activations=["relu"], dropout=0.5)
    dense_head = keras.Sequential()
    dense_head.add(Dense(2))
    x_in, x_out = appnp.propagate_model(dense_head)
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Direct prediction from explicit tensors
    targets = np.array([[0, 1]], dtype="int32")
    preds_direct = model.predict(
        [features[None, :, :], targets, edge_indices, edge_weights])
    assert preds_direct.shape == (1, 2, 2)

    # Prediction via the generator flow must agree
    preds_flow = model.predict(generator.flow(["a", "b"]))
    assert preds_flow.shape == (1, 2, 2)
    assert preds_direct == pytest.approx(preds_flow)
def test_dgi(model_type, sparse):
    """Smoke-test Deep Graph Infomax training and embedding for a base model type."""
    if sparse and model_type is PPNP:
        pytest.skip("PPNP doesn't support sparse=True")

    graph = example_graph_random()
    emb_dim = 16
    generator = FullBatchNodeGenerator(graph, sparse=sparse)
    corrupted = CorruptedGenerator(generator)
    train_flow = corrupted.flow(graph.nodes())

    encoder = model_type(
        generator=generator, activations=["relu"], layer_sizes=[emb_dim]
    )
    infomax = DeepGraphInfomax(encoder)

    model = tf.keras.Model(*infomax.in_out_tensors())
    model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model.fit(train_flow)

    # The embedding model must produce one emb_dim-vector per node.
    emb_model = tf.keras.Model(*infomax.embedding_model())
    embeddings = emb_model.predict(generator.flow(graph.nodes()))
    assert embeddings.shape == (len(graph.nodes()), emb_dim)
def test_GCN_apply_sparse():
    """Sparse-mode GCN: direct-tensor predictions must equal generator-flow ones."""
    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)

    coo = adj.tocoo()
    edge_indices = np.expand_dims(
        np.hstack((coo.row[:, None], coo.col[:, None])).astype(np.int64), 0
    )
    edge_weights = np.expand_dims(coo.data, 0)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    gcn = GCN(layer_sizes=[2], activations=["relu"], generator=generator, dropout=0.5)
    x_in, x_out = gcn.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Direct prediction from explicit tensors
    target_indices = np.array([[0, 1]], dtype="int32")
    preds_direct = model.predict(
        [features[None, :, :], target_indices, edge_indices, edge_weights])
    assert preds_direct.shape == (1, 2, 2)

    # Prediction via the generator flow must agree
    preds_flow = model.predict(generator.flow(["a", "b"]))
    assert preds_flow.shape == (1, 2, 2)
    assert preds_direct == pytest.approx(preds_flow)
def test_APPNP_apply_propagate_model_dense():
    """APPNP.propagate_model, dense mode: direct tensors vs generator flow.

    Fixes: use ``model.predict`` instead of the removed ``predict_generator``
    (consistent with the other APPNP tests here); drop the unused ``n_nodes``.
    """
    G, features = create_graph_features()
    adj = nx.to_scipy_sparse_matrix(G)
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = np.array(adj.todense()[None, :, :])
    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index")
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=False, method="gcn")
    appnpnModel = APPNP([2], generator=generator, activations=["relu"], dropout=0.5)
    fully_connected_model = keras.Sequential()
    fully_connected_model.add(Dense(2))
    x_in, x_out = appnpnModel.propagate_model(fully_connected_model)
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predictions from explicitly-built input tensors
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, adj])
    assert preds_1.shape == (1, 2, 2)

    # Check predictions when node ids flow through the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)
    assert preds_1 == pytest.approx(preds_2)
def test_gat_build_no_norm(self):
    """With normalize=None and all-ones initializers the GAT output is analytic."""
    G = example_graph(feature_size=self.F_in)
    gen = FullBatchNodeGenerator(G, sparse=self.sparse, method=self.method)
    gat = GAT(
        layer_sizes=self.layer_sizes,
        activations=self.activations,
        attn_heads=self.attn_heads,
        generator=gen,
        bias=True,
        normalize=None,
        kernel_initializer="ones",
        attn_kernel_initializer="ones",
    )
    inputs, outputs = gat.in_out_tensors()
    model = keras.Model(inputs=inputs, outputs=outputs)

    predictions = model.predict(gen.flow(G.nodes()))
    # With unit weights every node aggregates the same constant signal,
    # so the expected output is a filled constant matrix.
    expected = np.ones((G.number_of_nodes(), self.layer_sizes[-1])) * (
        self.F_in
        * self.layer_sizes[0]
        * self.attn_heads
        * np.max(G.node_features(G.nodes()))
    )
    assert np.allclose(expected, predictions[0])
def test_gat_serialize(self):
    """A GAT model must survive a to_json / model_from_json round-trip."""
    G = example_graph(feature_size=self.F_in)
    gen = FullBatchNodeGenerator(G, sparse=self.sparse, method=self.method)
    gat = GAT(
        layer_sizes=self.layer_sizes,
        activations=self.activations,
        attn_heads=self.attn_heads,
        generator=gen,
        bias=True,
        normalize="l2",
    )
    inputs, outputs = gat.in_out_tensors()
    model = keras.Model(inputs=inputs, outputs=outputs)
    flow = gen.flow(G.nodes())

    # Serialize, then rebuild with the custom layer registered.
    config_json = model.to_json()
    unit_weights = [np.ones_like(w) for w in model.get_weights()]
    restored = keras.models.model_from_json(
        config_json, custom_objects={"GraphAttention": GraphAttention})
    restored.set_weights(unit_weights)

    # With all-ones weights and l2 normalization every row is 1/N.
    actual = restored.predict(flow)
    expected = np.ones((G.number_of_nodes(), self.layer_sizes[-1])) * (
        1.0 / G.number_of_nodes()
    )
    assert np.allclose(expected, actual[0])
def test_APPNP_apply_dense():
    """APPNP dense predictions from raw tensors equal those from the generator."""
    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)
    dense_adj = np.array(adj.todense()[None, :, :])

    generator = FullBatchNodeGenerator(G, sparse=False, method="gcn")
    appnp = APPNP([2], generator=generator, activations=["relu"], dropout=0.5)
    x_in, x_out = appnp.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Direct prediction from explicit tensors
    targets = np.array([[0, 1]], dtype="int32")
    preds_direct = model.predict([features[None, :, :], targets, dense_adj])
    assert preds_direct.shape == (1, 2, 2)

    # Prediction via the generator flow must agree
    preds_flow = model.predict(generator.flow(["a", "b"]))
    assert preds_flow.shape == (1, 2, 2)
    assert preds_direct == pytest.approx(preds_flow)
def test_generator_flow_targets_as_list(self):
    """Targets given as a plain Python list are accepted by generator.flow()."""
    generator = FullBatchNodeGenerator(self.G)
    ids = list(self.G.nodes())[:3]
    seq = generator.flow(ids, [1] * len(ids))
    _, y = seq[0]
    assert y.shape == (1, 3)
    assert np.sum(y) == 3
def create_GCN_model_sparse(graph):
    """Build a two-layer sparse GCN plus its Keras model, generator and train flow."""
    generator = FullBatchNodeGenerator(graph, sparse=True, method="gcn")
    train_gen = generator.flow([0, 1], np.array([[1, 0], [0, 1]]))

    gcn = GCN(
        layer_sizes=[2, 2],
        activations=["elu", "elu"],
        generator=generator,
        dropout=0.3,
        kernel_regularizer=regularizers.l2(5e-4),
    )
    # Deterministic unit weights so downstream checks are reproducible.
    for layer in gcn._layers:
        layer._initializer = "ones"

    x_inp, x_out = gcn.build()
    keras_model = Model(inputs=x_inp, outputs=x_out)
    return gcn, keras_model, generator, train_gen
def create_GCN_model(graph):
    """Build a default (dense) two-layer GCN and the corresponding Keras model."""
    generator = FullBatchNodeGenerator(graph)
    train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))
    base_model = GCN(
        layer_sizes=[8, 2],
        generator=generator,
        bias=True,
        dropout=0.5,
        activations=["elu", "softmax"],
    )
    inputs, outputs = base_model.build()
    keras_model = Model(inputs=inputs, outputs=outputs)
    return base_model, keras_model, generator, train_gen
def create_GAT_model(graph):
    """Two-layer dense GAT with saliency-map support and deterministic weights."""
    generator = FullBatchNodeGenerator(graph, sparse=False, method=None)
    train_gen = generator.flow([0, 1], np.array([[1, 0], [0, 1]]))
    gat = GAT(
        layer_sizes=[2, 2],
        generator=generator,
        bias=False,
        in_dropout=0,
        attn_dropout=0,
        activations=["elu", "softmax"],
        normalize=None,
        saliency_map_support=True,
    )
    # Unit initializers keep the saliency tests deterministic.
    for layer in gat._layers:
        layer._initializer = "ones"
    inputs, outputs = gat.build()
    keras_model = Model(inputs=inputs, outputs=outputs)
    return gat, keras_model, generator, train_gen
def generator_flow(
    self,
    G,
    node_ids,
    node_targets,
    sparse=False,
    method="none",
    k=1,
    teleport_probability=0.1,
):
    """Build a FullBatchNodeGenerator flow for ``node_ids`` and sanity-check its
    first batch.

    Returns:
        (A_dense, tind, y): the densified adjacency matrix, the target-node
        index tensor, and the target array from the first (only) batch.
    """
    generator = FullBatchNodeGenerator(
        G,
        sparse=sparse,
        method=method,
        k=k,
        teleport_probability=teleport_probability,
    )
    n_nodes = G.number_of_nodes()
    gen = generator.flow(node_ids, node_targets)
    if sparse:
        # Sparse mode yields COO indices/values; rebuild the dense matrix
        # so both branches can be checked the same way.
        [X, tind, A_ind, A_val], y = gen[0]
        A_sparse = sps.coo_matrix(
            (A_val[0], (A_ind[0, :, 0], A_ind[0, :, 1])), shape=(n_nodes, n_nodes))
        A_dense = A_sparse.toarray()
    else:
        [X, tind, A], y = gen[0]
        A_dense = A[0]

    assert np.allclose(X, gen.features)  # X should be equal to gen.features
    assert tind.shape[1] == len(node_ids)

    if node_targets is not None:
        assert np.allclose(y, node_targets)

    # Check that the diagonals are one
    if method == "self_loops":
        assert np.allclose(A_dense.diagonal(), 1)

    return A_dense, tind, y
def test_dgi_stateful():
    """Two Keras models built from one DeepGraphInfomax share state: the
    embedding model must give identical results before and after training
    either of them.

    Fix: the embedding-comparison snippet was copy-pasted four times; extract
    a local helper so each checkpoint is a single assertion.
    """
    G = example_graph_random()
    emb_dim = 16

    generator = FullBatchNodeGenerator(G)
    corrupted_generator = CorruptedGenerator(generator)
    gen = corrupted_generator.flow(G.nodes())

    infomax = DeepGraphInfomax(
        GCN(generator=generator, activations=["relu"], layer_sizes=[emb_dim])
    )
    model_1 = tf.keras.Model(*infomax.in_out_tensors())
    model_2 = tf.keras.Model(*infomax.in_out_tensors())

    def embeddings():
        # Build a fresh embedding model off the shared infomax weights and
        # predict for every node.
        return tf.keras.Model(*infomax.embedding_model()).predict(
            generator.flow(G.nodes())
        )

    # check embeddings are equal before training
    assert np.array_equal(embeddings(), embeddings())

    model_1.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model_1.fit(gen)

    # check embeddings are still equal after training one model
    assert np.array_equal(embeddings(), embeddings())

    model_2.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model_2.fit(gen)

    # check embeddings are still equal after training both models
    assert np.array_equal(embeddings(), embeddings())
def create_GAT_model(graph):
    """Three-layer dense GAT (8-8-2, softmax head) and its Keras model."""
    generator = FullBatchNodeGenerator(graph, sparse=False)
    train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))
    base_model = GAT(
        layer_sizes=[8, 8, 2],
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "elu", "softmax"],
        normalize=None,
    )
    inputs, outputs = base_model.build()
    keras_model = Model(inputs=inputs, outputs=outputs)
    return base_model, keras_model, generator, train_gen
def test_APPNP_apply_sparse():
    """Sparse-mode APPNP: direct-tensor predictions must equal generator-flow ones.

    Fixes: use ``in_out_tensors()`` instead of the deprecated ``build()`` and
    ``model.predict`` instead of the removed ``predict_generator`` (consistent
    with the other APPNP tests in this file).
    """
    G, features = create_graph_features()
    adj = nx.to_scipy_sparse_matrix(G)
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = adj.tocoo()
    A_indices = np.expand_dims(np.hstack((adj.row[:, None], adj.col[:, None])), 0)
    A_values = np.expand_dims(adj.data, 0)
    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index")
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    appnpnModel = APPNP([2], generator=generator, activations=["relu"], dropout=0.5)
    x_in, x_out = appnpnModel.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predictions from explicitly-built input tensors
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict(
        [features[None, :, :], out_indices, A_indices, A_values])
    assert preds_1.shape == (1, 2, 2)

    # Check predictions when node ids flow through the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)
    assert preds_1 == pytest.approx(preds_2)
cora_dataset = sg.datasets.Cora( ) # the 'cora' dataset is built-in into stellargraph datasets module # it returns a Stellargraph object, and the node subjects (classes) # The features (word occurencess) are already built-in into the "stellar_g" object of type Stellargraph stellar_g, node_classes = cora_dataset.load(directed=True) train_dataset, test_dataset = split_data(node_classes) train_targets, test_targets, target_encoding = encode_classes( train_dataset, test_dataset) ############################################################### # creating GCN model gcn_generator = FullBatchNodeGenerator(stellar_g, method="gcn", sparse=False) train_gcn_gen = gcn_generator.flow(train_dataset.index, train_targets) gcn = GCN(layer_sizes=[16, 16], activations=['relu', 'relu'], generator=gcn_generator, dropout=0.5) # 2 GCN layers gcn_inp, gcn_out = gcn.in_out_tensors() # for the KERAS model # creating KERAS model with the GCN model layers gcn_dense_layer = layers.Dense(units=train_targets.shape[1], activation="softmax")(gcn_out) keras_gcn = Model(inputs=gcn_inp, outputs=gcn_dense_layer) # 2 GCN, 1 Dense keras_gcn.compile( optimizer="adam", loss=losses.categorical_crossentropy, metrics=["accuracy"],
def infer_attributes_gat(Gnx, savepred=True, plot=False):
    """Infer node attributes on a directed graph with a GAT classifier.

    Builds binarized structural features (degrees, clustering, core number, ...),
    trains a two-layer GAT to predict the ``data`` node attribute, evaluates on a
    held-out split, and optionally saves/plots the predictions.

    Args:
        Gnx: networkx DiGraph whose nodes carry a 'data' attribute (the target).
        savepred: when True, write the prediction table to an .xlsx file.
        plot: when True, plot training history and a 2-D embedding projection.

    Returns:
        DataFrame with 'Predicted' and 'True' labels per node.
    """
    # Define node data: each enabled feature is thresholded at 0.5 into {0, 1}.
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size"
    ]
    node_type = [v for k, v in nx.get_node_attributes(Gnx, 'data').items()]
    d = {"node_type": node_type}
    if "in_degree" in feature_names:
        indeg = [v for k, v in Gnx.in_degree]
        indeg = np.divide(indeg, max(indeg))
        indeg[indeg >= 0.5] = 1
        indeg[indeg < 0.5] = 0
        d["in_degree"] = indeg
    if "out_degree" in feature_names:
        outdeg = [v for k, v in Gnx.out_degree]
        outdeg = np.divide(outdeg, max(outdeg))
        outdeg[outdeg >= 0.5] = 1
        outdeg[outdeg < 0.5] = 0
        d["out_degree"] = outdeg
    if "in_degree_centrality" in feature_names:
        indeg_cent = [
            v for k, v in nx.algorithms.in_degree_centrality(Gnx).items()
        ]
        indeg_cent = np.divide(indeg_cent, max(indeg_cent))
        indeg_cent[indeg_cent >= 0.5] = 1
        indeg_cent[indeg_cent < 0.5] = 0
        d["in_degree_centrality"] = indeg_cent
    if "out_degree_centrality" in feature_names:
        outdeg_cent = [
            v for k, v in nx.algorithms.out_degree_centrality(Gnx).items()
        ]
        outdeg_cent = np.divide(outdeg_cent, max(outdeg_cent))
        outdeg_cent[outdeg_cent >= 0.5] = 1
        outdeg_cent[outdeg_cent < 0.5] = 0
        d["out_degree_centrality"] = outdeg_cent
    if "closeness_centrality" in feature_names:
        close_cent = [
            v for k, v in nx.algorithms.closeness_centrality(Gnx).items()
        ]
        close_cent = np.divide(close_cent, max(close_cent))
        close_cent[close_cent >= 0.5] = 1
        close_cent[close_cent < 0.5] = 0
        d["closeness_centrality"] = close_cent
    if "betweenness_centrality" in feature_names:
        between_cent = [
            v for k, v in nx.algorithms.betweenness_centrality(Gnx).items()
        ]
        between_cent = np.divide(between_cent, max(between_cent))
        between_cent[between_cent >= 0.5] = 1
        between_cent[between_cent < 0.5] = 0
        d["betweenness_centrality"] = between_cent
    if "clustering_coefficient" in feature_names:
        clustering_co = [v for k, v in nx.algorithms.clustering(Gnx).items()]
        clustering_co = np.divide(clustering_co, max(clustering_co))
        clustering_co[clustering_co >= 0.5] = 1
        clustering_co[clustering_co < 0.5] = 0
        d["clustering_coefficient"] = clustering_co
    if "square_clustering" in feature_names:
        sq_clustering = [
            v for k, v in nx.algorithms.square_clustering(Gnx).items()
        ]
        sq_clustering = np.divide(sq_clustering, max(sq_clustering))
        sq_clustering[sq_clustering >= 0.5] = 1
        sq_clustering[sq_clustering < 0.5] = 0
        d["square_clustering"] = sq_clustering
    if "core_number" in feature_names:
        core_number = [v for k, v in nx.algorithms.core_number(Gnx).items()]
        core_number = np.divide(core_number, max(core_number))
        core_number[core_number >= 0.5] = 1
        core_number[core_number < 0.5] = 0
        d["core_number"] = core_number
    if "pagerank" in feature_names:
        pagerank = [v for k, v in nx.algorithms.pagerank(Gnx).items()]
        pagerank = np.divide(pagerank, max(pagerank))
        pagerank[pagerank >= 0.5] = 1
        pagerank[pagerank < 0.5] = 0
        d["pagerank"] = pagerank
    if "constraint" in feature_names:
        constraint = [v for k, v in nx.algorithms.constraint(Gnx).items()]
        constraint = np.divide(constraint, max(constraint))
        constraint[np.isnan(constraint)] = 0
        constraint[constraint >= 0.5] = 1
        constraint[constraint < 0.5] = 0
        d["constraint"] = constraint
    if "effective_size" in feature_names:
        effective_size = [
            v for k, v in nx.algorithms.effective_size(Gnx).items()
        ]
        effective_size = np.divide(effective_size, max(effective_size))
        effective_size[np.isnan(effective_size)] = 0
        effective_size[effective_size >= 0.5] = 1
        effective_size[effective_size < 0.5] = 0
        d["effective_size"] = effective_size
    # NOTE(review): ``nodes`` is not defined in this function — presumably a
    # module-level variable or a bug (``Gnx.nodes()`` looks intended); verify.
    node_data = pd.DataFrame(data=d, index=nodes)
    node_data = shuffle(node_data)

    # Split the data: 80% train, then 15% of the graph for validation.
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx)))
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx)))

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["node_type"]].to_dict('records'))
    val_targets = target_encoding.transform(val_data[["node_type"
                                                      ]].to_dict('records'))
    test_targets = target_encoding.transform(test_data[["node_type"
                                                        ]].to_dict('records'))
    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_data.index, train_targets)
    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for node prediction,
    # via GAT.node_model() method:
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    val_gen = generator.flow(val_data.index, val_targets)
    if not os.path.isdir(".temp/logs"):
        os.makedirs(".temp/logs")
    if not os.path.isdir(".temp/output"):
        os.makedirs(".temp/output")
    es_callback = EarlyStopping(
        monitor="val_weighted_acc",
        patience=100  # patience is the number of epochs to wait before early stopping in case of no further improvement
    )
    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )
    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)
    node_predictions = target_encoding.inverse_transform(all_predictions)
    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data['node_type']})
    print(df.head)
    if savepred:
        df.to_excel(".temp/output/output" +
                    str(datetime.datetime.now()).replace(':', '-') + ".xlsx")

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print("Embedding layer: {}, output shape {}".format(
            emb_layer.name, emb_layer.output_shape))
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)
        X = emb
        y = np.argmax(target_encoding.transform(
            node_data.reindex(G.nodes())[["node_type"]].to_dict('records')),
                      axis=1)
        # NOTE(review): if X.shape[1] <= 2 the name ``transform`` is never bound,
        # so plot_emb(transform, ...) below would raise NameError — verify.
        if X.shape[1] > 2:
            transform = TSNE  #PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X),
                                           index=list(G.nodes()))
            emb_transformed['label'] = y
        else:
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            emb_transformed = emb_transformed.rename(columns={'0': 0, '1': 1})

        def plot_emb(transform, emb_transformed):
            # Scatter the 2-D projection, coloured by true label.
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(emb_transformed[0],
                       emb_transformed[1],
                       c=emb_transformed['label'].astype("category"),
                       cmap="jet",
                       alpha=0.7)
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                '{} visualization of GAT embeddings for the fighter graph'.
                format(transform.__name__))

        # Plot the training history
        def remove_prefix(text, prefix):
            # Strip ``prefix`` from ``text`` when present.
            return text[text.startswith(prefix) and len(prefix):]

        def plot_history(history):
            # One figure per metric, train vs validation curves.
            metrics = sorted(
                set([
                    remove_prefix(m, "val_") for m in list(history.history.keys())
                ]))
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history['val_' + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel('epoch')
                plt.legend(['train', 'validation'], loc='best')

        plot_history(history)
        plot_emb(transform, emb_transformed)
        plt.show()
    return df
def test_generator_flow_targets_not_iterator(self):
    """A scalar (non-iterable) target must make generator.flow() raise TypeError."""
    generator = FullBatchNodeGenerator(self.G)
    ids = list(self.G.nodes())[:3]
    with pytest.raises(TypeError):
        generator.flow(ids, 1)
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it,
    and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        es_patience: Early-stopping patience in epochs
        dropout: The dropout (0->1)
        target_name: Column of node_data to use as the classification target

    NOTE(review): relies on module-level names not defined here: ``feature_names``
    and ``args`` (``args.interface``) — confirm they exist where this runs.
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model
    # will use as input. The CORA dataset contains attributes 'w_x' that correspond
    # to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )
    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=1000,
        random_state=523214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    print(model.summary())

    # Train model
    # Callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc", patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        [X, A], y_train, node_mask_train = train_gen.__getitem__(0)
        N = A.shape[0]
        # A = sparse.csr_matrix(A + np.eye(A.shape[0]))  # Add self-loops

        # Get the validation data
        [_, _], y_val, node_mask_val = val_gen.__getitem__(0)

        history = model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(x=[X, A],
                                     y=y_val,
                                     sample_weight=node_mask_val,
                                     batch_size=N)
    else:
        val_metrics = model.evaluate_generator(val_gen)
    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if args.interface == "fit":
        [_, _], y_test, node_mask_test = generator.flow(
            test_nodes, test_targets).__getitem__(0)
        test_metrics = model.evaluate(x=[X, A],
                                      y=y_test,
                                      sample_weight=node_mask_test,
                                      batch_size=N)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets))
    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    # Note that the `predict` or `predict_generator` function now operates
    # differently to the `GraphSAGE` or `HinSAGE` models in that if you give it
    # less than the complete set of nodes, it will still return all predictions
    # and in a fixed order defined by the order of nodes in X and A (which is
    # defined by the order of G.nodes()).
    if args.interface == "fit":
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=list(G.nodes()))
    accuracy = np.mean([
        "subject=" + gt_subject == p
        for gt_subject, p in zip(node_data["subject"][list(G.nodes())],
                                 node_predictions.idxmax(axis=1))
    ])
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout,
        learning_rate)
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
train_subjects.value_counts().to_frame() # ============================================================================= # # encode labels # ============================================================================= target_encoding = preprocessing.LabelBinarizer() train_targets = target_encoding.fit_transform(train_subjects) val_targets = target_encoding.transform(val_subjects) test_targets = target_encoding.transform(test_subjects) # ============================================================================= # # Init generator # ============================================================================= generator = FullBatchNodeGenerator(G, method="gcn") train_gen = generator.flow(train_subjects.index, train_targets) # ============================================================================= # # Create model # ============================================================================= gcn = GCN(layer_sizes=[16, 16], activations=["relu", "relu"], generator=generator, dropout=0.5) x_inp, x_out = gcn.in_out_tensors() predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out) model = Model(inputs=x_inp, outputs=predictions) model.compile( optimizer=optimizers.Adam(lr=0.01),
# Train a GCN encoder with Deep Graph Infomax (unsupervised) and export the
# resulting node embeddings to CSV.
fullbatch_generator = FullBatchNodeGenerator(G, sparse=False)
gcn_model = GCN(layer_sizes=[2], activations=["relu"], generator=fullbatch_generator)
corrupted_generator = CorruptedGenerator(fullbatch_generator)
gen = corrupted_generator.flow(G.nodes())
infomax = DeepGraphInfomax(gcn_model, corrupted_generator)
x_in, x_out = infomax.in_out_tensors()
model = Model(inputs=x_in, outputs=x_out)
model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer=Adam(lr=1e-3))
epochs = 100
es = EarlyStopping(monitor="loss", min_delta=0, patience=20)
history = model.fit(gen, epochs=epochs, verbose=0, callbacks=[es])
plot_history(history)
x_emb_in, x_emb_out = gcn_model.in_out_tensors()
# for full batch models, squeeze out the batch dim (which is 1)
x_out = tf.squeeze(x_emb_out, axis=0)
emb_model = Model(inputs=x_emb_in, outputs=x_out)
all_embeddings = emb_model.predict(fullbatch_generator.flow(G.nodes()))
test = pd.DataFrame(all_embeddings, index=G.nodes())
# NOTE(review): hard-coded absolute user path — consider parameterizing.
test.to_csv("/home/jonno/setse_1_data/test_embs.csv"
            )
# NOTE(review): fragment — ``x_inp`` and ``predictions`` are used below but never
# defined in this chunk (a ``gcn.in_out_tensors()`` + Dense head step appears to be
# missing), and the trailing ``model.fit(`` call is truncated.
G, node_subjects = dataset.load()
# 140 stratified training nodes, 500 validation, remainder test.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, test_size=None, stratify=node_subjects)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None, stratify=test_subjects)
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)
generator = FullBatchNodeGenerator(G, method="gcn")
train_gen = generator.flow(train_subjects.index, train_targets)
gcn = GCN(layer_sizes=[16, 8, 8],
          activations=["relu", "relu", "relu"],
          generator=generator,
          dropout=0.5)
model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)
history = model.fit(
    train_gen,
# Build a (node, label) frame from the id maps, split 70/30, embed both splits
# with the trained embedding model, then score a logistic-regression probe on the
# embeddings against a random-choice baseline.
node_series = []
label_series = []
for key in node_content_id_map.keys():
    node_series.append(node_content_id_map[key])
    label_series.append(label_dict[key])
node_label_df = (pd.DataFrame({
    'node': node_series,
    'label': label_series
}).sort_values(['node']))
train_subjects, test_subjects = model_selection.train_test_split(
    node_label_df, train_size=0.7, test_size=None, stratify=None)
test_gen = fullbatch_generator.flow(test_subjects.index)
train_gen = fullbatch_generator.flow(train_subjects.index)
test_embeddings = emb_model.predict(test_gen)
train_embeddings = emb_model.predict(train_gen)
lr = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=2000)
lr.fit(train_embeddings, train_subjects['label'])
y_pred = lr.predict(test_embeddings)
test_acc = (y_pred == test_subjects['label']).mean()
test_acc  # NOTE(review): bare expression — only meaningful as a notebook cell output
# random prediction baseline: sample labels from the training distribution
random_preds = list(train_subjects['label'].values)
random_test_preds = random.choices(random_preds, k=len(test_subjects))
# NOTE(review): fragment — the model definition continues past the end of this chunk.
print('------------------------\nTRAIN:', train_subjects.value_counts().to_frame())
print('------------------------\nTEST:', test_subjects.value_counts().to_frame())
print('------------------------\nVALIDATION:', val_subjects.value_counts().to_frame())
# MAIN CODE ============================================================================================================
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)
all_targets = target_encoding.transform(graph_labels)
generator = FullBatchNodeGenerator(graph_stellar, method="gcn")
train_gen = generator.flow(train_subjects.index, train_targets)
val_gen = generator.flow(val_subjects.index, val_targets)
test_gen = generator.flow(test_subjects.index, test_targets)
all_gen = generator.flow(graph_labels.index, all_targets)
es_callback = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
auc = tf.keras.metrics.AUC()
# Build the GCN on CPU explicitly.
with tf.device('/CPU:0'):
    gcn = GCN(
        layer_sizes=[2*node_feature_count, 2*node_feature_count],
        activations=['relu', 'relu'],
        generator=generator
    )
    x_inp, x_out = gcn.in_out_tensors()