Example #1
def gcn_pipeline(G,
                 node_subjects,
                 layer_sizes=[16, 16],
                 activations=["relu", "relu"]):
    # Train/validation/test split
    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    # GCN training generator
    generator = FullBatchNodeGenerator(G, method="gcn")
    train_gen = generator.flow(
        train_subjects.index,
        train_subjects.values,
    )
    gcn = GCN(layer_sizes=layer_sizes,
              activations=activations,
              generator=generator,
              dropout=0.5)
    model = build_model(gcn, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    es_callback = EarlyStopping(monitor="val_acc",
                                patience=50,
                                restore_best_weights=True)
    history = model.fit(
        train_gen,
        epochs=200,
        validation_data=val_gen,
        verbose=0,
        shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
        callbacks=[es_callback],
    )

    plot_results(history)
    test_metrics(generator, model, test_subjects)
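# `training_split`, `build_model`, `plot_results` and `test_metrics` are helpers assumed by the
# pipeline above but not shown in this example. A minimal sketch of `build_model`, assuming it
# attaches a softmax classification head to the GCN and compiles with an "acc" metric (to match
# the "val_acc" EarlyStopping monitor above); the optimizer and learning rate are illustrative:
from tensorflow import keras

def build_model(gcn, n_classes):
    # Expose the GCN's input placeholders and output tensor, then add the classifier head
    x_inp, x_out = gcn.in_out_tensors()
    predictions = keras.layers.Dense(units=n_classes, activation="softmax")(x_out)
    model = keras.Model(inputs=x_inp, outputs=predictions)
    model.compile(optimizer=keras.optimizers.Adam(lr=0.01),
                  loss="categorical_crossentropy",
                  metrics=["acc"])
    return model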
Example #2
def preprocessing(self, g, train_node, file_emb_output="./emb/100_900_nede2vec.emb"):

        node_subjects = train_node['values']

        node_subjects = node_subjects.astype(str)
        print(Counter(node_subjects))

        #file_emb_output = "./emb/100_900_nede2vec.emb"
        model = KeyedVectors.load_word2vec_format(file_emb_output)
        node_ids = model.index2word  # KeyedVectors has no .wv attribute; access the vocabulary directly
        node_embeddings = model.vectors  # numpy array of shape (number of nodes, embedding dimensionality)
        print("Embedding load success.")

        reindexed_node_embedding = pd.DataFrame(node_embeddings, index=map(int, node_ids))
        g_feature_attr = g.copy()

        G = StellarGraph.from_networkx(
            g_feature_attr, node_features=reindexed_node_embedding, node_type_default="n", edge_type_default="e"
        )
        print(G.info())

        train_subjects, test_subjects = model_selection.train_test_split(
            node_subjects, train_size=160, test_size=None, stratify=node_subjects
        )
        val_subjects, test_subjects = model_selection.train_test_split(
            test_subjects, train_size=20, test_size=None, stratify=test_subjects
        )

        train_subjects.value_counts().to_frame()

        target_encoding = preprocessing.LabelBinarizer()
        # target_encoding = preprocessing.OneHotEncoder()

        train_targets = target_encoding.fit_transform(train_subjects)
        val_targets = target_encoding.transform(val_subjects)
        test_targets = target_encoding.transform(test_subjects)

        generator = FullBatchNodeGenerator(G, method="gcn")
        train_gen = generator.flow(train_subjects.index, train_targets)
        val_gen = generator.flow(val_subjects.index, val_targets)
        test_gen = generator.flow(test_subjects.index, test_targets)

        all_nodes = node_subjects.index
        all_gen = generator.flow(all_nodes)

        return G, train_gen, train_targets, val_gen, val_targets, test_targets, test_gen, all_gen, generator
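# The word2vec-format embedding file read by `preprocessing` above would typically be produced
# beforehand by a node2vec run. A minimal sketch using StellarGraph's BiasedRandomWalk and
# gensim 3.x (walk and model parameters are illustrative):
from gensim.models import Word2Vec
from stellargraph.data import BiasedRandomWalk

def build_node2vec_embeddings(G, file_emb_output, dimensions=100):
    # Biased (node2vec-style) random walks over the graph, then skip-gram on the walk corpus
    walks = BiasedRandomWalk(G).run(nodes=list(G.nodes()), length=100, n=10, p=0.5, q=2.0)
    str_walks = [[str(n) for n in walk] for walk in walks]
    w2v = Word2Vec(str_walks, size=dimensions, window=5, min_count=0, sg=1)
    # Saved in the format expected by KeyedVectors.load_word2vec_format above
    w2v.wv.save_word2vec_format(file_emb_output)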
Example #3
def test_GCN_apply_dense():
    G, features = create_graph_features()
    adj = nx.to_numpy_array(G)[None, :, :]
    n_nodes = features.shape[0]

    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index"
    )
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=False, method="none")
    gcnModel = GCN([2], generator, activations=["relu"], dropout=0.5)

    x_in, x_out = gcnModel.build()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, adj])
    assert preds_1.shape == (1, 2, 2)

    # Check predict_generator with the generator
    preds_2 = model.predict_generator(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #4
def test_APPNP_apply_propagate_model_sparse():

    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = adj.tocoo()
    A_indices = np.expand_dims(np.hstack((adj.row[:, None], adj.col[:, None])), 0)
    A_values = np.expand_dims(adj.data, 0)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    appnpModel = APPNP([2], generator=generator, activations=["relu"], dropout=0.5)

    fully_connected_model = keras.Sequential()
    fully_connected_model.add(Dense(2))

    x_in, x_out = appnpModel.propagate_model(fully_connected_model)
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, A_indices, A_values])
    assert preds_1.shape == (1, 2, 2)

    # Check predict with the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #5
def test_dgi(model_type, sparse):

    if sparse and model_type is PPNP:
        pytest.skip("PPNP doesn't support sparse=True")

    G = example_graph_random()
    emb_dim = 16

    generator = FullBatchNodeGenerator(G, sparse=sparse)
    corrupted_generator = CorruptedGenerator(generator)
    gen = corrupted_generator.flow(G.nodes())

    base_model = model_type(
        generator=generator, activations=["relu"], layer_sizes=[emb_dim]
    )
    infomax = DeepGraphInfomax(base_model)

    model = tf.keras.Model(*infomax.in_out_tensors())
    model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model.fit(gen)

    emb_model = tf.keras.Model(*infomax.embedding_model())
    embeddings = emb_model.predict(generator.flow(G.nodes()))

    assert embeddings.shape == (len(G.nodes()), emb_dim)
Example #6
def test_GCN_apply_sparse():

    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = adj.tocoo()
    A_indices = np.expand_dims(
        np.hstack((adj.row[:, None], adj.col[:, None])).astype(np.int64), 0)
    A_values = np.expand_dims(adj.data, 0)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    gcnModel = GCN(layer_sizes=[2],
                   activations=["relu"],
                   generator=generator,
                   dropout=0.5)

    x_in, x_out = gcnModel.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict(
        [features[None, :, :], out_indices, A_indices, A_values])
    assert preds_1.shape == (1, 2, 2)

    # Check predict with the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #7
def test_APPNP_apply_propagate_model_dense():
    G, features = create_graph_features()
    adj = nx.to_scipy_sparse_matrix(G)
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = np.array(adj.todense()[None, :, :])
    n_nodes = features.shape[0]

    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index"
    )
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=False, method="gcn")
    appnpModel = APPNP([2],
                       generator=generator,
                       activations=["relu"],
                       dropout=0.5)

    fully_connected_model = keras.Sequential()
    fully_connected_model.add(Dense(2))

    x_in, x_out = appnpModel.propagate_model(fully_connected_model)
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, adj])
    assert preds_1.shape == (1, 2, 2)

    # Check predict_generator with the generator
    preds_2 = model.predict_generator(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #8
    def test_gat_build_no_norm(self):
        G = example_graph(feature_size=self.F_in)
        gen = FullBatchNodeGenerator(G, sparse=self.sparse, method=self.method)
        gat = GAT(
            layer_sizes=self.layer_sizes,
            activations=self.activations,
            attn_heads=self.attn_heads,
            generator=gen,
            bias=True,
            normalize=None,
            kernel_initializer="ones",
            attn_kernel_initializer="ones",
        )

        x_in, x_out = gat.in_out_tensors()

        model = keras.Model(inputs=x_in, outputs=x_out)

        ng = gen.flow(G.nodes())
        actual = model.predict(ng)

        expected = np.ones((G.number_of_nodes(), self.layer_sizes[-1])) * (
            self.F_in * self.layer_sizes[0] * self.attn_heads *
            np.max(G.node_features(G.nodes())))
        assert np.allclose(expected, actual[0])
Example #9
    def test_gat_serialize(self):
        G = example_graph(feature_size=self.F_in)
        gen = FullBatchNodeGenerator(G, sparse=self.sparse, method=self.method)
        gat = GAT(
            layer_sizes=self.layer_sizes,
            activations=self.activations,
            attn_heads=self.attn_heads,
            generator=gen,
            bias=True,
            normalize="l2",
        )

        x_in, x_out = gat.in_out_tensors()
        model = keras.Model(inputs=x_in, outputs=x_out)

        ng = gen.flow(G.nodes())

        # Save model
        model_json = model.to_json()

        # Set all weights to one
        model_weights = [np.ones_like(w) for w in model.get_weights()]

        # Load model from json & set all weights
        model2 = keras.models.model_from_json(
            model_json, custom_objects={"GraphAttention": GraphAttention})
        model2.set_weights(model_weights)

        # Test deserialized model
        actual = model2.predict(ng)
        expected = np.ones(
            (G.number_of_nodes(),
             self.layer_sizes[-1])) * (1.0 / G.number_of_nodes())
        assert np.allclose(expected, actual[0])
Example #10
def test_APPNP_apply_dense():
    G, features = create_graph_features()
    adj = G.to_adjacency_matrix()
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = np.array(adj.todense()[None, :, :])

    generator = FullBatchNodeGenerator(G, sparse=False, method="gcn")
    appnpModel = APPNP([2],
                       generator=generator,
                       activations=["relu"],
                       dropout=0.5)

    x_in, x_out = appnpModel.in_out_tensors()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict([features[None, :, :], out_indices, adj])
    assert preds_1.shape == (1, 2, 2)

    # Check predict with the generator
    preds_2 = model.predict(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #11
    def test_generator_flow_targets_as_list(self):
        generator = FullBatchNodeGenerator(self.G)
        node_ids = list(self.G.nodes())[:3]
        node_targets = [1] * len(node_ids)
        gen = generator.flow(node_ids, node_targets)

        inputs, y = gen[0]
        assert y.shape == (1, 3)
        assert np.sum(y) == 3
Example #12
def create_GCN_model_sparse(graph):
    generator = FullBatchNodeGenerator(graph, sparse=True, method="gcn")
    train_gen = generator.flow([0, 1], np.array([[1, 0], [0, 1]]))

    layer_sizes = [2, 2]
    gcn = GCN(
        layer_sizes=layer_sizes,
        activations=["elu", "elu"],
        generator=generator,
        dropout=0.3,
        kernel_regularizer=regularizers.l2(5e-4),
    )

    for layer in gcn._layers:
        layer._initializer = "ones"
    x_inp, x_out = gcn.build()
    keras_model = Model(inputs=x_inp, outputs=x_out)
    return gcn, keras_model, generator, train_gen
Example #13
def create_GCN_model(graph):

    generator = FullBatchNodeGenerator(graph)
    train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GCN(
        layer_sizes=[8, 2],
        generator=generator,
        bias=True,
        dropout=0.5,
        activations=["elu", "softmax"],
    )

    x_inp, x_out = base_model.build()

    keras_model = Model(inputs=x_inp, outputs=x_out)

    return base_model, keras_model, generator, train_gen
Example #14
def create_GAT_model(graph):
    generator = FullBatchNodeGenerator(graph, sparse=False, method=None)
    train_gen = generator.flow([0, 1], np.array([[1, 0], [0, 1]]))

    gat = GAT(
        layer_sizes=[2, 2],
        generator=generator,
        bias=False,
        in_dropout=0,
        attn_dropout=0,
        activations=["elu", "softmax"],
        normalize=None,
        saliency_map_support=True,
    )
    for layer in gat._layers:
        layer._initializer = "ones"
    x_inp, x_out = gat.build()
    keras_model = Model(inputs=x_inp, outputs=x_out)
    return gat, keras_model, generator, train_gen
Example #15
    def generator_flow(
        self,
        G,
        node_ids,
        node_targets,
        sparse=False,
        method="none",
        k=1,
        teleport_probability=0.1,
    ):
        generator = FullBatchNodeGenerator(
            G,
            sparse=sparse,
            method=method,
            k=k,
            teleport_probability=teleport_probability,
        )
        n_nodes = G.number_of_nodes()

        gen = generator.flow(node_ids, node_targets)
        if sparse:
            [X, tind, A_ind, A_val], y = gen[0]
            A_sparse = sps.coo_matrix(
                (A_val[0], (A_ind[0, :, 0], A_ind[0, :, 1])),
                shape=(n_nodes, n_nodes))
            A_dense = A_sparse.toarray()

        else:
            [X, tind, A], y = gen[0]
            A_dense = A[0]

        assert np.allclose(X, gen.features)  # X should be equal to gen.features
        assert tind.shape[1] == len(node_ids)

        if node_targets is not None:
            assert np.allclose(y, node_targets)

        # Check that the diagonals are one
        if method == "self_loops":
            assert np.allclose(A_dense.diagonal(), 1)

        return A_dense, tind, y
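    # Usage sketch for the helper above (illustrative; assumes a StellarGraph `G` with
    # nodes "a" and "b" and one-hot targets):
    #   A_dense, tind, y = self.generator_flow(
    #       G, ["a", "b"], np.array([[[1, 0], [0, 1]]]), sparse=True, method="gcn")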
Example #16
def test_dgi_stateful():
    G = example_graph_random()
    emb_dim = 16

    generator = FullBatchNodeGenerator(G)
    corrupted_generator = CorruptedGenerator(generator)
    gen = corrupted_generator.flow(G.nodes())

    infomax = DeepGraphInfomax(
        GCN(generator=generator, activations=["relu"], layer_sizes=[emb_dim])
    )

    model_1 = tf.keras.Model(*infomax.in_out_tensors())
    model_2 = tf.keras.Model(*infomax.in_out_tensors())

    # check embeddings are equal before training
    embeddings_1 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )
    embeddings_2 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )

    assert np.array_equal(embeddings_1, embeddings_2)

    model_1.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model_1.fit(gen)

    # check embeddings are still equal after training one model
    embeddings_1 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )
    embeddings_2 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )

    assert np.array_equal(embeddings_1, embeddings_2)

    model_2.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer="Adam")
    model_2.fit(gen)

    # check embeddings are still equal after training both models
    embeddings_1 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )
    embeddings_2 = tf.keras.Model(*infomax.embedding_model()).predict(
        generator.flow(G.nodes())
    )

    assert np.array_equal(embeddings_1, embeddings_2)
Example #17
def create_GAT_model(graph):

    generator = FullBatchNodeGenerator(graph, sparse=False)
    train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GAT(
        layer_sizes=[8, 8, 2],
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "elu", "softmax"],
        normalize=None,
    )

    x_inp, x_out = base_model.build()

    keras_model = Model(inputs=x_inp, outputs=x_out)

    return base_model, keras_model, generator, train_gen
Example #18
def test_APPNP_apply_sparse():

    G, features = create_graph_features()
    adj = nx.to_scipy_sparse_matrix(G)
    features, adj = GCN_Aadj_feats_op(features, adj)
    adj = adj.tocoo()
    A_indices = np.expand_dims(np.hstack((adj.row[:, None], adj.col[:, None])),
                               0)
    A_values = np.expand_dims(adj.data, 0)

    nodes = G.nodes()
    node_features = pd.DataFrame.from_dict(
        {n: f for n, f in zip(nodes, features)}, orient="index"
    )
    G = StellarGraph(G, node_features=node_features)

    generator = FullBatchNodeGenerator(G, sparse=True, method="gcn")
    appnpModel = APPNP([2],
                       generator=generator,
                       activations=["relu"],
                       dropout=0.5)

    x_in, x_out = appnpModel.build()
    model = keras.Model(inputs=x_in, outputs=x_out)

    # Check predict with explicit NumPy inputs
    out_indices = np.array([[0, 1]], dtype="int32")
    preds_1 = model.predict(
        [features[None, :, :], out_indices, A_indices, A_values])
    assert preds_1.shape == (1, 2, 2)

    # Check predict_generator with the generator
    preds_2 = model.predict_generator(generator.flow(["a", "b"]))
    assert preds_2.shape == (1, 2, 2)

    assert preds_1 == pytest.approx(preds_2)
Example #19
    cora_dataset = sg.datasets.Cora()  # the 'cora' dataset is built into the stellargraph datasets module
    # it returns a StellarGraph object and the node subjects (classes)
    # The features (word occurrences) are already built into the "stellar_g" object of type StellarGraph
    stellar_g, node_classes = cora_dataset.load(directed=True)
    train_dataset, test_dataset = split_data(node_classes)
    train_targets, test_targets, target_encoding = encode_classes(
        train_dataset, test_dataset)

    ###############################################################

    # creating GCN model
    gcn_generator = FullBatchNodeGenerator(stellar_g,
                                           method="gcn",
                                           sparse=False)
    train_gcn_gen = gcn_generator.flow(train_dataset.index, train_targets)
    gcn = GCN(layer_sizes=[16, 16],
              activations=['relu', 'relu'],
              generator=gcn_generator,
              dropout=0.5)  # 2 GCN layers
    gcn_inp, gcn_out = gcn.in_out_tensors()  # input/output tensors for the Keras model

    # creating the Keras model with the GCN model layers
    gcn_dense_layer = layers.Dense(units=train_targets.shape[1],
                                   activation="softmax")(gcn_out)
    keras_gcn = Model(inputs=gcn_inp,
                      outputs=gcn_dense_layer)  # 2 GCN, 1 Dense
    keras_gcn.compile(
        optimizer="adam",
        loss=losses.categorical_crossentropy,
        metrics=["accuracy"],
Example #20
def infer_attributes_gat(Gnx, savepred=True, plot=False):
    # Define node data
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size"
    ]
    node_type = [v for k, v in nx.get_node_attributes(Gnx, 'data').items()]
    d = {"node_type": node_type}
    if "in_degree" in feature_names:
        indeg = [v for k, v in Gnx.in_degree]
        indeg = np.divide(indeg, max(indeg))
        indeg[indeg >= 0.5] = 1
        indeg[indeg < 0.5] = 0
        d["in_degree"] = indeg
    if "out_degree" in feature_names:
        outdeg = [v for k, v in Gnx.out_degree]
        outdeg = np.divide(outdeg, max(outdeg))
        outdeg[outdeg >= 0.5] = 1
        outdeg[outdeg < 0.5] = 0
        d["out_degree"] = outdeg
    if "in_degree_centrality" in feature_names:
        indeg_cent = [
            v for k, v in nx.algorithms.in_degree_centrality(Gnx).items()
        ]
        indeg_cent = np.divide(indeg_cent, max(indeg_cent))
        indeg_cent[indeg_cent >= 0.5] = 1
        indeg_cent[indeg_cent < 0.5] = 0
        d["in_degree_centrality"] = indeg_cent
    if "out_degree_centrality" in feature_names:
        outdeg_cent = [
            v for k, v in nx.algorithms.out_degree_centrality(Gnx).items()
        ]
        outdeg_cent = np.divide(outdeg_cent, max(outdeg_cent))
        outdeg_cent[outdeg_cent >= 0.5] = 1
        outdeg_cent[outdeg_cent < 0.5] = 0
        d["out_degree_centrality"] = outdeg_cent
    if "closeness_centrality" in feature_names:
        close_cent = [
            v for k, v in nx.algorithms.closeness_centrality(Gnx).items()
        ]
        close_cent = np.divide(close_cent, max(close_cent))
        close_cent[close_cent >= 0.5] = 1
        close_cent[close_cent < 0.5] = 0
        d["closeness_centrality"] = close_cent
    if "betweenness_centrality" in feature_names:
        between_cent = [
            v for k, v in nx.algorithms.betweenness_centrality(Gnx).items()
        ]
        between_cent = np.divide(between_cent, max(between_cent))
        between_cent[between_cent >= 0.5] = 1
        between_cent[between_cent < 0.5] = 0
        d["betweenness_centrality"] = between_cent
    if "clustering_coefficient" in feature_names:
        clustering_co = [v for k, v in nx.algorithms.clustering(Gnx).items()]
        clustering_co = np.divide(clustering_co, max(clustering_co))
        clustering_co[clustering_co >= 0.5] = 1
        clustering_co[clustering_co < 0.5] = 0
        d["clustering_coefficient"] = clustering_co
    if "square_clustering" in feature_names:
        sq_clustering = [
            v for k, v in nx.algorithms.square_clustering(Gnx).items()
        ]
        sq_clustering = np.divide(sq_clustering, max(sq_clustering))
        sq_clustering[sq_clustering >= 0.5] = 1
        sq_clustering[sq_clustering < 0.5] = 0
        d["square_clustering"] = sq_clustering
    if "core_number" in feature_names:
        core_number = [v for k, v in nx.algorithms.core_number(Gnx).items()]
        core_number = np.divide(core_number, max(core_number))
        core_number[core_number >= 0.5] = 1
        core_number[core_number < 0.5] = 0
        d["core_number"] = core_number
    if "pagerank" in feature_names:
        pagerank = [v for k, v in nx.algorithms.pagerank(Gnx).items()]
        pagerank = np.divide(pagerank, max(pagerank))
        pagerank[pagerank >= 0.5] = 1
        pagerank[pagerank < 0.5] = 0
        d["pagerank"] = pagerank
    if "constraint" in feature_names:
        constraint = [v for k, v in nx.algorithms.constraint(Gnx).items()]
        constraint = np.divide(constraint, max(constraint))
        constraint[np.isnan(constraint)] = 0
        constraint[constraint >= 0.5] = 1
        constraint[constraint < 0.5] = 0
        d["constraint"] = constraint
    if "effective_size" in feature_names:
        effective_size = [
            v for k, v in nx.algorithms.effective_size(Gnx).items()
        ]
        effective_size = np.divide(effective_size, max(effective_size))
        effective_size[np.isnan(effective_size)] = 0
        effective_size[effective_size >= 0.5] = 1
        effective_size[effective_size < 0.5] = 0
        d["effective_size"] = effective_size
    node_data = pd.DataFrame(data=d, index=list(Gnx.nodes()))
    node_data = shuffle(node_data)

    # Split the data
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx)))
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx)))

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)

    train_targets = target_encoding.fit_transform(train_data[["node_type"]].to_dict('records'))
    val_targets = target_encoding.transform(val_data[["node_type"]].to_dict('records'))
    test_targets = target_encoding.transform(test_data[["node_type"]].to_dict('records'))

    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())

    generator = FullBatchNodeGenerator(G)

    train_gen = generator.flow(train_data.index, train_targets)

    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for node prediction, via GAT.node_model() method:
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )

    val_gen = generator.flow(val_data.index, val_targets)

    if not os.path.isdir(".temp/logs"):
        os.makedirs(".temp/logs")
    if not os.path.isdir(".temp/output"):
        os.makedirs(".temp/output")

    es_callback = EarlyStopping(
        monitor="val_weighted_acc",
        patience=100,  # the number of epochs to wait before early stopping in case of no further improvement
    )

    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,  # this should be False, since shuffling data means shuffling the whole graph
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)

    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)

    node_predictions = target_encoding.inverse_transform(all_predictions)

    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data['node_type']})
    print(df.head())

    if savepred:
        df.to_excel(".temp/output/output" +
                    str(datetime.datetime.now()).replace(':', '-') + ".xlsx")

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print("Embedding layer: {}, output shape {}".format(
            emb_layer.name, emb_layer.output_shape))
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)

        X = emb
        y = np.argmax(target_encoding.transform(
            node_data.reindex(G.nodes())[["node_type"]].to_dict('records')),
                      axis=1)

        if X.shape[1] > 2:
            transform = TSNE  # or PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X),
                                           index=list(G.nodes()))
            emb_transformed['label'] = y
        else:
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            emb_transformed = emb_transformed.rename(columns={'0': 0, '1': 1})

        def plot_emb(transform, emb_transformed):
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(emb_transformed[0],
                       emb_transformed[1],
                       c=emb_transformed['label'].astype("category").cat.codes,
                       cmap="jet",
                       alpha=0.7)
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                '{} visualization of GAT embeddings for the fighter graph'.
                format(transform.__name__))

        # Plot the training history
        def remove_prefix(text, prefix):
            return text[len(prefix):] if text.startswith(prefix) else text

        def plot_history(history):
            metrics = sorted(
                set([
                    remove_prefix(m, "val_")
                    for m in list(history.history.keys())
                ]))
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history['val_' + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel('epoch')
                plt.legend(['train', 'validation'], loc='best')

        plot_history(history)
        plot_emb(transform, emb_transformed)
        plt.show()

    return df
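# Usage sketch (illustrative): `infer_attributes_gat` expects a directed NetworkX graph whose
# nodes carry a 'data' attribute holding the node type to be inferred:
#   df = infer_attributes_gat(Gnx, savepred=True, plot=True)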
Example #21
    def test_generator_flow_targets_not_iterator(self):
        generator = FullBatchNodeGenerator(self.G)
        node_ids = list(self.G.nodes())[:3]
        node_targets = 1
        with pytest.raises(TypeError):
            generator.flow(node_ids, node_targets)
Example #22
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        es_patience: Early-stopping patience, in epochs
        dropout: The dropout rate (0 to 1)
        target_name: Name of the target column in node_data
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=1000,
        random_state=523214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Attach the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1],
                         activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    print(model.summary())

    # Train model
    # Callbacks
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc",
                                patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    if args.interface == "fit":
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        [X, A], y_train, node_mask_train = train_gen.__getitem__(0)
        N = A.shape[0]
        # A = sparse.csr_matrix(A + np.eye(A.shape[0]))  # Add self-loops

        # Get the validation data
        [_, _], y_val, node_mask_val = val_gen.__getitem__(0)

        history = model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            shuffle=False,  # must be False, since shuffling data means shuffling the whole graph
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if args.interface == "fit":
        val_metrics = model.evaluate(x=[X, A],
                                     y=y_val,
                                     sample_weight=node_mask_val,
                                     batch_size=N)
    else:
        val_metrics = model.evaluate_generator(val_gen)

    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if args.interface == "fit":
        [_, _], y_test, node_mask_test = generator.flow(
            test_nodes, test_targets).__getitem__(0)
        test_metrics = model.evaluate(x=[X, A],
                                      y=y_test,
                                      sample_weight=node_mask_test,
                                      batch_size=N)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets))

    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    # Note that `predict` and `predict_generator` now operate differently from the `GraphSAGE` or `HinSAGE` models:
    # even if given less than the complete set of nodes, they still return predictions for all nodes, in a fixed
    # order defined by the order of nodes in X and A (which is defined by the order of G.nodes()).
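    # For instance (illustrative), a flow over only the test nodes still yields one
    # prediction per node of G:
    #   preds = model.predict_generator(generator.flow(test_nodes))
    #   assert preds.shape[1] == G.number_of_nodes()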
    if args.interface == "fit":
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions),
        index=list(G.nodes()))
    accuracy = np.mean([
        "subject=" + gt_subject == p
        for gt_subject, p in zip(node_data["subject"][list(G.nodes())],
                                 node_predictions.idxmax(axis=1))
    ])
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout,
        learning_rate)
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
Example #23
train_subjects.value_counts().to_frame()

# =============================================================================
# # encode labels
# =============================================================================
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

# =============================================================================
# # Init generator
# =============================================================================
generator = FullBatchNodeGenerator(G, method="gcn")

train_gen = generator.flow(train_subjects.index, train_targets)

# =============================================================================
# # Create model
# =============================================================================
gcn = GCN(layer_sizes=[16, 16],
          activations=["relu", "relu"],
          generator=generator,
          dropout=0.5)
x_inp, x_out = gcn.in_out_tensors()
predictions = layers.Dense(units=train_targets.shape[1],
                           activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
)
Example #24
fullbatch_generator = FullBatchNodeGenerator(G, sparse=False)
gcn_model = GCN(layer_sizes=[2], activations=["relu"], generator=fullbatch_generator)

corrupted_generator = CorruptedGenerator(fullbatch_generator)
gen = corrupted_generator.flow(G.nodes())

infomax = DeepGraphInfomax(gcn_model, corrupted_generator)
x_in, x_out = infomax.in_out_tensors()

model = Model(inputs=x_in, outputs=x_out)
model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits, optimizer=Adam(lr=1e-3))

epochs = 100

es = EarlyStopping(monitor="loss", min_delta=0, patience=20)
history = model.fit(gen, epochs=epochs, verbose=0, callbacks=[es])
plot_history(history)

x_emb_in, x_emb_out = gcn_model.in_out_tensors()

# for full batch models, squeeze out the batch dim (which is 1)
x_out = tf.squeeze(x_emb_out, axis=0)
emb_model = Model(inputs=x_emb_in, outputs=x_out)

all_embeddings = emb_model.predict(fullbatch_generator.flow(G.nodes()))

test = pd.DataFrame(all_embeddings, index=G.nodes())


test.to_csv("/home/jonno/setse_1_data/test_embs.csv")
Example #25
G, node_subjects = dataset.load()

train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=140, test_size=None, stratify=node_subjects)
val_subjects, test_subjects = model_selection.train_test_split(
    test_subjects, train_size=500, test_size=None, stratify=test_subjects)

target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)

generator = FullBatchNodeGenerator(G, method="gcn")

train_gen = generator.flow(train_subjects.index, train_targets)

gcn = GCN(layer_sizes=[16, 8, 8],
          activations=["relu", "relu", "relu"],
          generator=generator,
          dropout=0.5)

x_inp, x_out = gcn.in_out_tensors()
predictions = layers.Dense(units=train_targets.shape[1],
                           activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    optimizer=optimizers.Adam(lr=0.01),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

history = model.fit(
    train_gen,
)
Example #26
node_series = []
label_series = []
for key in node_content_id_map.keys():
    node_series.append(node_content_id_map[key])
    label_series.append(label_dict[key])

node_label_df = (pd.DataFrame({
    'node': node_series,
    'label': label_series
}).sort_values(['node']))

train_subjects, test_subjects = model_selection.train_test_split(
    node_label_df, train_size=0.7, test_size=None, stratify=None)

test_gen = fullbatch_generator.flow(test_subjects.index)
train_gen = fullbatch_generator.flow(train_subjects.index)

test_embeddings = emb_model.predict(test_gen)
train_embeddings = emb_model.predict(train_gen)

lr = LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=2000)
lr.fit(train_embeddings, train_subjects['label'])

y_pred = lr.predict(test_embeddings)
test_acc = (y_pred == test_subjects['label']).mean()
test_acc

# random prediction
random_preds = list(train_subjects['label'].values)
random_test_preds = random.choices(random_preds, k=len(test_subjects))
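# To compare the embedding classifier against this random baseline (a minimal sketch):
random_test_acc = (np.array(random_test_preds) == test_subjects['label'].values).mean()
print("embedding accuracy:", test_acc, "random baseline:", random_test_acc)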
Example #27
print('------------------------\nTRAIN:', train_subjects.value_counts().to_frame())
print('------------------------\nTEST:', test_subjects.value_counts().to_frame())
print('------------------------\nVALIDATION:', val_subjects.value_counts().to_frame())


# MAIN CODE ============================================================================================================
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
val_targets = target_encoding.transform(val_subjects)
test_targets = target_encoding.transform(test_subjects)
all_targets = target_encoding.transform(graph_labels)

generator = FullBatchNodeGenerator(graph_stellar, method="gcn")

train_gen = generator.flow(train_subjects.index, train_targets)
val_gen = generator.flow(val_subjects.index, val_targets)
test_gen = generator.flow(test_subjects.index, test_targets)
all_gen = generator.flow(graph_labels.index, all_targets)

es_callback = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
auc = tf.keras.metrics.AUC()

with tf.device('/CPU:0'):
    gcn = GCN(
        layer_sizes=[2*node_feature_count, 2*node_feature_count],
        activations=['relu', 'relu'],
        generator=generator
    )

    x_inp, x_out = gcn.in_out_tensors()
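    # The snippet ends here; a typical continuation, mirroring Example #23 (the dense head,
    # loss and metric choices are illustrative):
    #   predictions = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)
    #   model = Model(inputs=x_inp, outputs=predictions)
    #   model.compile(optimizer=optimizers.Adam(lr=0.01),
    #                 loss=losses.categorical_crossentropy,
    #                 metrics=["acc", auc])
    #   model.fit(train_gen, epochs=200, validation_data=val_gen,
    #             shuffle=False, callbacks=[es_callback])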