예제 #1
0
def create_GAT_model(graph):
    """Build a two-layer GAT with saliency-map support over ``graph``.

    A dense full-batch node generator is created for ``graph`` and a
    training flow is derived from it for node ids ``0`` and ``1`` with
    one-hot targets. Every layer of the GAT has its ``_initializer``
    overridden with ``"ones"`` before the Keras tensors are built.

    Args:
        graph: Graph object accepted by ``FullBatchNodeGenerator``.

    Returns:
        Tuple ``(gat, keras_model, generator, train_gen)``.
    """
    node_generator = FullBatchNodeGenerator(graph, sparse=False, method=None)
    one_hot_targets = np.array([[1, 0], [0, 1]])
    training_flow = node_generator.flow([0, 1], one_hot_targets)

    gat_settings = {
        "layer_sizes": [2, 2],
        "generator": node_generator,
        "bias": False,
        "in_dropout": 0,
        "attn_dropout": 0,
        "activations": ["elu", "softmax"],
        "normalize": None,
        "saliency_map_support": True,
    }
    gat_network = GAT(**gat_settings)

    # Override each layer's initializer before node_model() builds the graph.
    for gat_layer in gat_network._layers:
        gat_layer._initializer = "ones"

    inputs, outputs = gat_network.node_model()
    wrapped_model = Model(inputs=inputs, outputs=outputs)
    return gat_network, wrapped_model, node_generator, training_flow
예제 #2
0
def create_GAT_model(graph):
    """Build a three-layer GAT (sizes 8, 8, 2) over ``graph``.

    A dense full-batch node generator is created for ``graph`` and a
    training flow is derived from it for node ids ``1`` and ``2`` with
    one-hot targets.

    Args:
        graph: Graph object accepted by ``FullBatchNodeGenerator``.

    Returns:
        Tuple ``(base_model, keras_model, generator, train_gen)``.
    """
    node_generator = FullBatchNodeGenerator(graph, sparse=False)
    one_hot_targets = np.array([[1, 0], [0, 1]])
    training_flow = node_generator.flow([1, 2], one_hot_targets)

    gat_settings = {
        "layer_sizes": [8, 8, 2],
        "generator": node_generator,
        "bias": True,
        "in_dropout": 0.5,
        "attn_dropout": 0.5,
        "activations": ["elu", "elu", "softmax"],
        "normalize": None,
    }
    gat_network = GAT(**gat_settings)

    inputs, outputs = gat_network.node_model()
    wrapped_model = Model(inputs=inputs, outputs=outputs)
    return gat_network, wrapped_model, node_generator, training_flow
예제 #3
0
def _binarize_feature(values):
    """Scale ``values`` by their maximum and threshold the result at 0.5.

    NaN entries are treated as 0, and an input whose maximum is 0 is left
    unscaled instead of being divided by zero.

    Args:
        values: Iterable of numbers, one per node.

    Returns:
        Float array holding 1.0 where ``value / max >= 0.5`` and 0.0
        elsewhere.
    """
    scaled = np.array(list(values), dtype=float)
    if scaled.size == 0:
        return scaled
    scaled[np.isnan(scaled)] = 0
    peak = scaled.max()
    if peak != 0:
        scaled = scaled / peak
    return (scaled >= 0.5).astype(float)


def infer_attributes_gat(Gnx, savepred=True, plot=False):
    """Train a GAT node classifier on ``Gnx`` and predict every node's type.

    Structural node features are computed with networkx, binarized, and
    used as inputs; the node attribute ``'data'`` is the classification
    target. The best weights (by ``val_weighted_acc``) are checkpointed to
    ``.temp/logs/best_model.h5`` and reloaded before evaluation.

    Args:
        Gnx: networkx graph. ``in_degree``/``out_degree`` are read, so a
            directed graph is required; every node must carry a ``'data'``
            attribute.
        savepred: When true, write the prediction table to an ``.xlsx``
            file under ``.temp/output``.
        plot: When true, plot the training history and a 2-D view of the
            node embeddings.

    Returns:
        ``pandas.DataFrame`` indexed by node with columns ``"Predicted"``
        and ``"True"``.

    Raises:
        ValueError: If any node of ``Gnx`` lacks the ``'data'`` attribute.
    """
    # Define node data. Commented-out names are alternative features that
    # can be switched on.
    feature_names = [
        "in_degree",
        "out_degree",
        # "in_degree_centrality",
        # "out_degree_centrality",
        # "closeness_centrality",
        # "betweenness_centrality",
        "clustering_coefficient",
        # "square_clustering",
        "core_number",
        # "pagerank",
        # "constraint",
        # "effective_size"
    ]

    nodes = list(Gnx.nodes())
    node_labels = nx.get_node_attributes(Gnx, 'data')
    unlabeled = [n for n in nodes if n not in node_labels]
    if unlabeled:
        raise ValueError(
            "{} node(s) have no 'data' attribute, e.g. {!r}".format(
                len(unlabeled), unlabeled[0]))
    d = {"node_type": [node_labels[n] for n in nodes]}

    # Each entry maps a feature name to a callable returning {node: value}.
    # The callables are only invoked for features that are enabled above.
    feature_sources = [
        ("in_degree", lambda: dict(Gnx.in_degree)),
        ("out_degree", lambda: dict(Gnx.out_degree)),
        ("in_degree_centrality",
         lambda: nx.algorithms.in_degree_centrality(Gnx)),
        ("out_degree_centrality",
         lambda: nx.algorithms.out_degree_centrality(Gnx)),
        ("closeness_centrality",
         lambda: nx.algorithms.closeness_centrality(Gnx)),
        ("betweenness_centrality",
         lambda: nx.algorithms.betweenness_centrality(Gnx)),
        ("clustering_coefficient", lambda: nx.algorithms.clustering(Gnx)),
        ("square_clustering", lambda: nx.algorithms.square_clustering(Gnx)),
        ("core_number", lambda: nx.algorithms.core_number(Gnx)),
        ("pagerank", lambda: nx.algorithms.pagerank(Gnx)),
        ("constraint", lambda: nx.algorithms.constraint(Gnx)),
        ("effective_size", lambda: nx.algorithms.effective_size(Gnx)),
    ]
    for name, compute in feature_sources:
        if name in feature_names:
            per_node = compute()
            # Look values up by node so every column lines up with `nodes`.
            d[name] = _binarize_feature(per_node[n] for n in nodes)

    node_data = pd.DataFrame(data=d, index=nodes)
    node_data = shuffle(node_data)

    # Split the data: 80% train, then 15% (of the whole graph) validation,
    # and whatever remains is the test set.
    train_data, test_data = model_selection.train_test_split(
        node_data, train_size=int(0.80 * len(Gnx)))
    val_data, test_data = model_selection.train_test_split(
        test_data, train_size=int(0.15 * len(Gnx)))

    # Convert to numeric arrays
    target_encoding = feature_extraction.DictVectorizer(sparse=False)

    train_targets = target_encoding.fit_transform(
        train_data[["node_type"]].to_dict('records'))
    val_targets = target_encoding.transform(
        val_data[["node_type"]].to_dict('records'))
    test_targets = target_encoding.transform(
        test_data[["node_type"]].to_dict('records'))

    node_features = node_data[feature_names]

    # Create the GAT model in Keras
    G = sg.StellarDiGraph(Gnx, node_features=node_features)
    print(G.info())

    generator = FullBatchNodeGenerator(G)

    train_gen = generator.flow(train_data.index, train_targets)

    gat = GAT(
        layer_sizes=[8, train_targets.shape[1]],
        attn_heads=8,
        generator=generator,
        bias=True,
        in_dropout=0.5,
        attn_dropout=0.5,
        activations=["elu", "softmax"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for node
    # prediction, via the GAT.node_model() method.
    x_inp, predictions = gat.node_model()

    # Train the model
    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )

    val_gen = generator.flow(val_data.index, val_targets)

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(".temp/logs", exist_ok=True)
    os.makedirs(".temp/output", exist_ok=True)

    # patience is the number of epochs to wait before early stopping in
    # case of no further improvement.
    es_callback = EarlyStopping(monitor="val_weighted_acc", patience=100)

    mc_callback = ModelCheckpoint(
        ".temp/logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    history = model.fit_generator(
        train_gen,
        epochs=2000,
        validation_data=val_gen,
        verbose=2,
        # This should be False, since shuffling data means shuffling the
        # whole graph.
        shuffle=False,
        callbacks=[es_callback, mc_callback],
    )

    # Reload the saved weights
    model.load_weights(".temp/logs/best_model.h5")

    # Evaluate the best model on the test set
    test_gen = generator.flow(test_data.index, test_targets)

    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Make predictions with the model
    all_nodes = node_data.index
    all_gen = generator.flow(all_nodes)
    all_predictions = model.predict_generator(all_gen)

    node_predictions = target_encoding.inverse_transform(all_predictions)

    # NOTE(review): this assumes predictions come back in G.nodes() order
    # even though all_gen was built from the shuffled node_data.index —
    # confirm against the installed StellarGraph version.
    results = pd.DataFrame(node_predictions, index=G.nodes()).idxmax(axis=1)
    df = pd.DataFrame({"Predicted": results, "True": node_data['node_type']})
    print(df.head())

    if savepred:
        df.to_excel(".temp/output/output" +
                    str(datetime.datetime.now()).replace(':', '-') + ".xlsx")

    if plot:
        # Node embeddings
        emb_layer = model.layers[3]
        print("Embedding layer: {}, output shape {}".format(
            emb_layer.name, emb_layer.output_shape))
        embedding_model = Model(inputs=x_inp, outputs=emb_layer.output)
        emb = embedding_model.predict_generator(all_gen)

        X = emb
        y = np.argmax(target_encoding.transform(
            node_data.reindex(G.nodes())[["node_type"]].to_dict('records')),
                      axis=1)

        if X.shape[1] > 2:
            transform = TSNE  # PCA
            trans = transform(n_components=2)
            emb_transformed = pd.DataFrame(trans.fit_transform(X),
                                           index=list(G.nodes()))
            projection_name = transform.__name__
        else:
            # Already at most 2-D: plot the embedding columns directly.
            emb_transformed = pd.DataFrame(X, index=list(G.nodes()))
            projection_name = "Raw"
        # The label column is needed by plot_emb in both branches.
        emb_transformed['label'] = y

        def plot_emb(projection_name, emb_transformed):
            """Scatter-plot the 2-D embedding coloured by class label."""
            fig, ax = plt.subplots(figsize=(7, 7))
            ax.scatter(emb_transformed[0],
                       emb_transformed[1],
                       c=emb_transformed['label'].astype("category"),
                       cmap="jet",
                       alpha=0.7)
            ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
            plt.title(
                '{} visualization of GAT embeddings for the fighter graph'.
                format(projection_name))

        # Plot the training history
        def remove_prefix(text, prefix):
            """Return ``text`` without a leading ``prefix``, if present."""
            # startswith() is False (== 0) when absent, so the slice is a
            # no-op; otherwise it skips len(prefix) characters.
            return text[text.startswith(prefix) and len(prefix):]

        def plot_history(history):
            """Plot train and validation curves for every recorded metric."""
            metrics = sorted(
                {remove_prefix(m, "val_") for m in history.history})
            for m in metrics:
                # summarize history for metric m
                plt.figure()
                plt.plot(history.history[m])
                plt.plot(history.history['val_' + m])
                plt.title(m)
                plt.ylabel(m)
                plt.xlabel('epoch')
                plt.legend(['train', 'validation'], loc='best')

        plot_history(history)
        plot_emb(projection_name, emb_transformed)
        plt.show()

    return df
예제 #4
0
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph G with given parameters, evaluate it, and save the model.

    This function reads two names that are not parameters and are expected
    to be defined at module level: ``feature_names`` (columns of
    ``node_data`` used as node features) and ``args`` (``args.interface``
    selects ``model.fit`` when equal to ``"fit"``, otherwise
    ``model.fit_generator``).

    Side effects: writes the best weights to ``logs/best_model.h5``, the
    final model to ``cora_gat_model<suffix>.h5`` and the pickled target
    encoding to ``cora_gat_encoding<suffix>.pkl``.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        es_patience: Number of epochs without improvement before early stopping
        dropout: The dropout (0->1)
        target_name: Column of ``node_data`` holding the target class
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    # NOTE(review): `feature_names` is not a parameter; it must exist at module level.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist)

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=1000,
        random_state=523214)

    # Create full-batch generators that feed data from the graph to the GAT model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model. One "elu" activation per entry of layer_sizes, so the
    # function works for any number of layers (two layers gives the
    # original ["elu", "elu"]).
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu"] * len(layer_sizes),
        normalize=None,
    )
    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1],
                         activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    # summary() prints by itself; wrapping it in print() would add "None".
    model.summary()

    # Train model
    # Callbacks
    os.makedirs("logs", exist_ok=True)
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc",
                                patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )

    # NOTE(review): `args` is not a parameter; it must exist at module level.
    use_fit_interface = args.interface == "fit"

    if use_fit_interface:
        print("\nUsing model.fit() to train the model\n")
        # Get the training data
        [X, A], y_train, node_mask_train = train_gen[0]
        N = A.shape[0]

        # Get the validation data; features and adjacency are the same
        # full-batch X and A, only targets and mask differ.
        [_, _], y_val, node_mask_val = val_gen[0]

        history = model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            # must be False, since shuffling data means shuffling the whole graph
            shuffle=False,
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=[es_callback, tb_callback, mc_callback],
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        history = model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=[es_callback, tb_callback, mc_callback],
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if use_fit_interface:
        val_metrics = model.evaluate(x=[X, A],
                                     y=y_val,
                                     sample_weight=node_mask_val,
                                     batch_size=N)
    else:
        val_metrics = model.evaluate_generator(val_gen)

    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    if use_fit_interface:
        [_, _], y_test, node_mask_test = generator.flow(
            test_nodes, test_targets)[0]
        test_metrics = model.evaluate(x=[X, A],
                                      y=y_test,
                                      sample_weight=node_mask_test,
                                      batch_size=N)
    else:
        test_metrics = model.evaluate_generator(
            generator.flow(test_nodes, test_targets))

    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    # Note that the `predict` or `predict_generator` function now operates differently to the `GraphSAGE` or `HinSAGE` models
    # in that if you give it less than the complete set of nodes, it will still return all predictions and in a fixed order
    # defined by the order of nodes in X and A (which is defined by the order of G.nodes()).
    if use_fit_interface:
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions),
        index=list(G.nodes()))
    # The encoded column names are compared as "<target_name>=<value>", so
    # the prefix and the ground-truth column must follow target_name rather
    # than a hard-coded "subject".
    true_labels = node_data[target_name][list(G.nodes())]
    predicted_labels = node_predictions.idxmax(axis=1)
    accuracy = np.mean([
        (target_name + "=" + truth) == guess
        for truth, guess in zip(true_labels, predicted_labels)
    ])
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout,
        learning_rate)
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)