Example #1
def inference(embeddings):
    G = form_graph("data/cutted_edges.csv", "data/cutted_features.csv", "data/cutted_edges_to.csv", embeddings)
    edge_splitter_full = EdgeSplitter(G)

    import keras
    from stellargraph.layer import MeanAggregator, LinkEmbedding
    model = keras.models.load_model('data/w_rev1.h5',
                                    custom_objects={'MeanAggregator': MeanAggregator, 'LinkEmbedding': LinkEmbedding})

    G_full, edge_ids_full, edge_labels_full = edge_splitter_full.train_test_split(
        p=0.5, method="global", keep_connected=True
    )

    batch_size = 20
    num_samples = [20, 10]

    generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
    hold_out_gen = generator.flow(edge_ids_full, edge_labels_full)

    hold_out_predictions_pr = model.predict(hold_out_gen)

    ID = 111180
    # Collect the indices of all held-out edges incident to the node of interest
    EDGE_results = []
    for i in range(len(edge_ids_full)):
        if edge_ids_full[i][0] == ID or edge_ids_full[i][1] == ID:
            EDGE_results.append(i)
    # Pair each predicted link probability with the source node id of its edge
    predictions = [[hold_out_predictions_pr[idx][0], edge_ids_full[idx][0]] for idx in EDGE_results]

    a = sorted(predictions, reverse=True)[0:10]
    sorted_ids = [a[i][1] for i in range(len(a))]

    return sorted_ids
Example #2
    def initialize(self, **hyper_params):

        if (not "batch_size" in hyper_params.keys()):
            batch_size = 20
        if (not "layer_sizes" in hyper_params.keys()):
            num_samples = [20, 10]
        if (not "num_samples" in hyper_params.keys()):
            layer_sizes = [20, 20]
        if (not "bias" in hyper_params.keys()):
            bias = True
        if (not "dropout" in hyper_params.keys()):
            dropout = 0.3
        if (not "lr" in hyper_params.keys()):
            lr = 1e-3
        if (not "train_split" in hyper_params.keys()):
            train_split = 0.2

        self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph)
        graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=train_split, method="global", keep_connected=True)

        # Train iterators
        train_gen = GraphSAGELinkGenerator(graph_train, batch_size,
                                           num_samples)
        self.train_flow = train_gen.flow(edge_ids_train,
                                         edge_labels_train,
                                         shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(layer_sizes=layer_sizes,
                              generator=train_gen,
                              bias=bias,
                              dropout=dropout)

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=["acc"],
        )

        return self.model.get_weights()
Example #3
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Load the serialized model and evaluate on a random balanced subset of all links in the graph.
    Note that the set of links the model is evaluated on may contain links from the model's training set.
    To avoid this, set the seed of the edge splitter to the same seed as was used for link splitting in train().

    Args:
        G: NetworkX graph
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator})

    # Infer the per-layer neighbour sample sizes from the ratios of successive model input shapes
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(1,
                        len(model.input_shape) - 1, 2)
    ]

    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # Convert G_test to StellarGraph object (undirected, as required by GraphSAGE):
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Generator feeds data from (source, target) sampled subgraphs to GraphSAGE model
    test_gen = GraphSAGELinkGenerator(
        G_test,
        batch_size,
        num_samples,
        name="test",
    ).flow(edge_ids_test, edge_labels_test)

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTest Set Evaluation:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
Example #4
def splitSampleGraph():

    print('Graph post 2020')
    graph = nx.read_gml('../../graphs/graph_postCOVID_final_netx.gml.gz')
    
    for i in range(1, 6):
        print('Current run ' + str(i))

        # Define an edge splitter on the original graph:
        edge_splitter_ = EdgeSplitter(graph)

        # Randomly sample a fraction p of all positive links, and the same number of negative links, from graph,
        # and obtain the reduced graph graph_ with the sampled links removed:
        graph_, sampled_edges, sample_labels = edge_splitter_.train_test_split(p=0.5, method="global")

        nx.write_gml(graph_, '../../graphs/graph_sampled_' + str(i) + '.gml.gz')

        del graph_

        # Now, split the sampled edges into training-test-validation sets for performing link prediction

        # Split operation 1 - obtain test versus train+validation
        (sampled_comp, sampled_test, labels_comp, labels_test,) = train_test_split(sampled_edges, sample_labels, train_size=0.65, test_size=0.35)

        # Split operation 2 - divide the comp block into training and validation sets
        (sampled_training, sampled_validation, labels_training, labels_validation,) = train_test_split(sampled_comp, labels_comp, train_size=0.77, test_size=0.23)

        # Save the sampled training validation test sets
        df_train = pd.DataFrame({'node1': np.array(sampled_training)[:, 0], 'node2': np.array(sampled_training)[:, 1], 'labels': labels_training})
        df_train.to_csv('../../graphs/graph_train_edges_sampled_' + str(i) + '.csv')
        del df_train

        print('Number of training samples (positive) ' + str(len(labels_training)/2.0))

        df_val = pd.DataFrame({'node1': np.array(sampled_validation)[:, 0], 'node2': np.array(sampled_validation)[:, 1], 'labels': labels_validation})
        df_val.to_csv('../../graphs/graph_val_edges_sampled_' + str(i) + '.csv')
        del df_val

        print('Number of validation samples (positive) ' + str(len(labels_validation)/2.0))

        df_test = pd.DataFrame({'node1': np.array(sampled_test)[:, 0], 'node2': np.array(sampled_test)[:, 1], 'labels': labels_test})
        df_test.to_csv('../../graphs/graph_test_edges_sampled_' + str(i) + '.csv')
        del df_test

        print('Number of test samples (positive) ' + str(len(labels_test)/2.0))
def SplitGraphObj(graphObj):
    # Create a splitter on the original graph, then build the test graph and the test set.
    # The test graph is a reduced version of the original graph obtained by removing the links in the test set.
    # Here we sample p=0.1, i.e. ~10% of the positive links plus an equal number of negative links, to create the test set.

    test_splitobj = EdgeSplitter(graphObj)
    test_graph, edgelist_test, labels_test = test_splitobj.train_test_split(
        p=0.1, method="global")

    # Create the train set and train graph from the reduced test graph.
    # train_test_split returns the reduced graph (sampled positive edges removed),
    # an N*2 edge list of the sampled positive and negative edges,
    # and labels that are 1 for positive and 0 for negative edges, respectively.

    train_splitobj = EdgeSplitter(test_graph, graphObj)
    train_graph, edgelist, labels = train_splitobj.train_test_split(
        p=0.1, method="global"
    )

    # Use sklearn's train_test_split to split the edge list and labels.
    # Here we split 75% / 25% to produce the train set and the model selection set.

    (   edgelist_train,
        edgelist_model_selection,
        labels_train,
        labels_model_selection,
    ) = train_test_split(edgelist, labels, train_size=0.75, test_size=0.25)

    return test_graph, train_graph, edgelist_test, edgelist_train, edgelist_model_selection, labels_test, labels_train, labels_model_selection
Example #6
    def initialize(self, **hyper_params):

        batch_size = hyper_params.get("batch_size", 20)
        num_samples = hyper_params.get("num_samples", [20, 10])
        layer_sizes = hyper_params.get("layer_sizes", [10, 10])
        bias = hyper_params.get("bias", True)
        dropout = hyper_params.get("dropout", 0.1)
        lr = hyper_params.get("lr", 1e-2)

        graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

        # Test split
        edge_splitter_test = EdgeSplitter(graph)
        self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train split
        edge_splitter_train = EdgeSplitter(self.graph_test)
        self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
            p=0.1, method="global", keep_connected=True, seed=42
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

        # Test iterators
        test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
        self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

        # Model defining - Keras functional API + Stellargraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.Recall(), keras.metrics.AUC(), keras.metrics.Precision()],
        )

        # return number of training and testing examples
        return edge_ids_train.shape[0], edge_ids_test.shape[0]
def create_samples(g):
    G = sg.StellarGraph.from_networkx(g, node_features="feature")
    edge_splitter_test = EdgeSplitter(G)
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True)

    id_1, id_2 = [], []
    for edge in edge_ids_test:
        id_1.append(edge[0])
        id_2.append(edge[1])
    link = list(edge_labels_test)

    num = int(nx.number_of_edges(g) * 0.1)
    positive_sample = edge_ids_test[:num]
    # negative_sample = edge_ids_test[num:]

    for edge in positive_sample:
        g.remove_edge(edge[0], edge[1])

    df = pd.DataFrame({'id_1': id_1, 'id_2': id_2, 'link': link})
    # df.to_csv('nodes.csv', encoding='utf-8')

    return g, df
def split_graph(g, p_test=0.1, p_train=0.1):
    # TEST
    edge_splitter_test = EdgeSplitter(g)

    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the
    # reduced graph graph_test with the sampled links removed:
    g_test, edges_test, labels_test = edge_splitter_test.train_test_split(
        p=p_test, method="global", edge_label='image2word')

    # TRAIN
    edge_splitter_train = EdgeSplitter(g_test, g)

    # Sampling for the second time for train to eliminate overlap with test:
    g_train, edges_train, labels_train = edge_splitter_train.train_test_split(
        p=p_train, method="global", edge_label='image2word')

    return edges_train, edges_test, labels_train, labels_test, g_train, g_test
Example #9
path_weights = "./weights/weights.npy"
path_node_partition = "./data/4_attributes_0"
path_edge_partition = "./data/4_0"

# Constructing the graph
nodes = pd.read_csv(path_node_partition, sep='\t', lineterminator='\n', header=None).loc[:, 0:1433]
nodes.set_index(0, inplace=True)

edges = pd.read_csv(path_edge_partition, sep=r'\s+', lineterminator='\n', header=None)
edges.columns = ["source", "target"]

G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True
)

# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

# The train and test graphs are produced with the `EdgeSplitter` class from
# `stellargraph.data`. We will use the train graph for training the model (a binary classifier that, given two nodes,
# predicts whether a link between these two nodes should exist or not) and the test graph for evaluating the model's
# performance on hold out data.
#
# Each of these graphs will have the same number of nodes as the input graph, but the number of links will differ (be
# reduced) as some of the links will be removed during each split and used as the positive samples for
# training/testing the link prediction classifier.

# From the original graph G, extract a randomly sampled subset of test edges (true and false citation links) and the
# reduced graph G_test with the positive test edges removed:

# In[4]:


# Define an edge splitter on the original graph G:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
# reduced graph G_test with the sampled links removed:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=42
)


# The reduced graph G_test, together with the test ground truth set of links (edge_ids_test, edge_labels_test),
# will be used for testing the model.
#
# Now, repeat this procedure to obtain validation data that we are going to use for early stopping in order to
# prevent overfitting. From the reduced graph G_test, extract a randomly sampled subset of validation edges (true and
# false citation links) and the reduced graph G_val with the positive validation edges removed.
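The snippet ends before that validation split; a minimal sketch of the step it describes, reusing the seed of the test split above (the names G_val, edge_ids_val and edge_labels_val are assumptions):

# Sketch of the validation split described above (assumed continuation):
edge_splitter_val = EdgeSplitter(G_test)
G_val, edge_ids_val, edge_labels_val = edge_splitter_val.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=42
)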
Example #11
from sklearn.model_selection import train_test_split

# import data and create initial graph G
# filename_edges = "Datasets/Cora/cora_cites.csv"
# wt = False
filename_edges = "Datasets/soc-sign-bitcoinotc/soc-sign-bitcoinotc-temporal.csv"
wt=True

edgelist_df = utils.load_data(filename_edges, weighted = wt)

G = StellarGraph(edges=edgelist_df)
print("\n", G.info())
print("Created master graph from data")

# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, 
# and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
(
 G_test,         # To compute node embeddings with more edges than G_train
 examples_test,  
 labels_test
 ) = edge_splitter_test.train_test_split(p=0.1, 
                                         method="global")

#print(G_test.info())
print("Created test Graph from master graph")

# Do the same process to compute a training subset from within the test graph
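The snippet ends here; a minimal sketch of the training split it refers to, following the same pattern as the test split above (the names G_train, examples_train and labels_train are assumptions):

# Sketch of the training split described above (assumed continuation):
edge_splitter_train = EdgeSplitter(G_test, G)
(
 G_train,        # Reduced graph used to compute training node embeddings
 examples_train,
 labels_train
 ) = edge_splitter_train.train_test_split(p=0.1,
                                          method="global")

print("Created train Graph from test graph")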
Example #12
        g_nx = g_nx.subgraph(nodes[0:subgraph_num_nodes])

    # Check if graph is connected; if not, then select the largest subgraph to continue
    if nx.is_connected(g_nx):
        print("Graph is connected")
    else:
        print("Graph is not connected")
        # take the largest connected component as the data
        g_nx_ccs = (g_nx.subgraph(c).copy()
                    for c in nx.connected_components(g_nx))
        g_nx = max(g_nx_ccs, key=len)
        print("Largest subgraph statistics: {} nodes, {} edges".format(
            g_nx.number_of_nodes(), g_nx.number_of_edges()))

    # From the original graph, extract E_test and G_test
    edge_splitter_test = EdgeSplitter(g_nx)
    if args.hin:
        g_test, edge_data_ids_test, edge_data_labels_test = edge_splitter_test.train_test_split(
            p=p,
            edge_label=args.edge_type,
            edge_attribute_label=args.edge_attribute_label,
            edge_attribute_threshold=args.edge_attribute_threshold,
            attribute_is_datetime=args.attribute_is_datetime,
            method=args.sampling_method,
            probs=sampling_probs,
        )
    else:
        g_test, edge_data_ids_test, edge_data_labels_test = edge_splitter_test.train_test_split(
            p=p, method=args.sampling_method, probs=sampling_probs)
    if args.show_histograms:
        if args.sampling_method == "local":
Example #13
                                                features=ft)

G = StellarGraph(node_features_df, edgelist_df)
print("Created master graph from data")
print("\n", G.info())

# # Reduce initial Graph
# edge_splitter_test = EdgeSplitter(G)

# G,_,_ = edge_splitter_test.train_test_split(p=0.8,
#                                             method="global",
#                                             keep_connected=True)
# print("\n", G.info())

# Define an edge splitter on the original graph G:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
# reduced graph G_test with the sampled links removed:
(G_test, edge_ids_test,
 edge_labels_test) = edge_splitter_test.train_test_split(p=0.1,
                                                         method="global",
                                                         keep_connected=True)

# Define an edge splitter on the reduced graph G_test:
edge_splitter_train = EdgeSplitter(G_test)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
# reduced graph G_train with the sampled links removed:
(G_train, edge_ids_train,
 edge_labels_train) = edge_splitter_train.train_test_split(p=0.1,
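The call is cut off at this point; restated in full as a sketch, mirroring the test split above (an assumed continuation, not the original code):

# Assumed continuation of the truncated call, mirroring the test split above:
(G_train, edge_ids_train,
 edge_labels_train) = edge_splitter_train.train_test_split(p=0.1,
                                                           method="global",
                                                           keep_connected=True)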
Example #14
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph
        layer_size: A list of the number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for training and evaluation
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 -> 1)
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
Example #15
# * For choosing the best classifier, a **Model Selection Test Set** (`examples_model_selection`) of positive and negative edges that weren't used for computing node embeddings or training the classifier
# * For the final evaluation, a **Test Graph** (`graph_test`) to compute test node embeddings with more edges than the Train Graph, and a **Test Set** (`examples_test`) of positive and negative edges used neither for computing the test node embeddings nor for classifier training or model selection

# ###  Test Graph
#
# We begin with the full graph and use the `EdgeSplitter` class to produce:
#
# * Test Graph
# * Test set of positive/negative link examples
#
# The Test Graph is the reduced graph we obtain from removing the test set of links from the full graph.

# In[6]:

# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(graph)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from graph, and obtain the
# reduced graph graph_test with the sampled links removed:
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global")

print(graph_test.info())

# ### Train Graph
#
# This time, we use the `EdgeSplitter` on the Test Graph, and perform a train/test split on the examples (as sketched below) to produce:
#
# * Train Graph
# * Training set of link examples
# * Set of link examples for model selection
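A minimal sketch of that step, mirroring the Test Graph split above (the names graph_train, examples, labels, examples_train and examples_model_selection are assumptions):

# Sketch: split the Test Graph again to obtain the Train Graph and the training examples.
edge_splitter_train = EdgeSplitter(graph_test, graph)
graph_train, examples, labels = edge_splitter_train.train_test_split(
    p=0.1, method="global")

# Further split the sampled examples into a training set and a model selection set.
from sklearn.model_selection import train_test_split

(
    examples_train,
    examples_model_selection,
    labels_train,
    labels_model_selection,
) = train_test_split(examples, labels, train_size=0.75, test_size=0.25)

print(graph_train.info())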
Example #16
    def iteration(ind):
        datasets = {}
        edge_splitter_test = EdgeSplitter(G)
        G_test, ids_test, _ = edge_splitter_test.train_test_split(
            p=float(probs[0]), method="global", keep_connected=True, seed=ind)
        ids_test, labels_test = undirected_label2directed_label(
            A, ids_test, task)

        edge_splitter_val = EdgeSplitter(G_test)
        G_val, ids_val, _ = edge_splitter_val.train_test_split(
            p=float(probs[1]), method="global", keep_connected=True, seed=ind)
        ids_val, labels_val = undirected_label2directed_label(A, ids_val, task)

        edge_splitter_train = EdgeSplitter(G_val)
        _, ids_train, _ = edge_splitter_train.train_test_split(
            p=0.99, method="global", keep_connected=False, seed=ind)
        ids_train, labels_train = undirected_label2directed_label(
            A, ids_train, task)

        # observation after removing edges for training/validation/testing
        edges = [e for e in G_val.edges]
        # convert back to directed graph
        observed_edges = np.zeros((len(edges), 2), dtype=np.int32)
        undirected_edges = np.zeros((2 * len(G.edges), 2), dtype=np.int32)

        for i, e in enumerate(edges):
            if A[e[0], e[1]] > 0:
                observed_edges[i, 0] = int(e[0])
                observed_edges[i, 1] = int(e[1])
            if A[e[1], e[0]] > 0:
                observed_edges[i, 0] = int(e[1])
                observed_edges[i, 1] = int(e[0])

        for i, e in enumerate(G.edges):
            if A[e[0], e[1]] > 0 or A[e[1], e[0]] > 0:
                undirected_edges[i, :] = [int(e[1]), e[0]]
                undirected_edges[i + len(edges), :] = [int(e[0]), e[1]]
        if label_dim == 2:
            ids_train = ids_train[labels_train < 2]
            labels_train = labels_train[labels_train < 2]
            ids_test = ids_test[labels_test < 2]
            labels_test = labels_test[labels_test < 2]
            ids_val = ids_val[labels_val < 2]
            labels_val = labels_val[labels_val < 2]
        ############################################
        # training data
        ############################################
        datasets[ind] = {}
        datasets[ind]['graph'] = torch.from_numpy(observed_edges.T).long()
        datasets[ind]['undirected'] = undirected_edges

        datasets[ind]['train'] = {}
        datasets[ind]['train']['pairs'] = ids_train
        datasets[ind]['train']['label'] = labels_train
        ############################################
        # validation data
        ############################################
        datasets[ind]['validate'] = {}
        datasets[ind]['validate']['pairs'] = ids_val
        datasets[ind]['validate']['label'] = labels_val
        ############################################
        # test data
        ############################################
        datasets[ind]['test'] = {}
        datasets[ind]['test']['pairs'] = ids_test
        datasets[ind]['test']['label'] = labels_test
        return datasets
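A hypothetical look at the structure iteration() returns (the index 0 and the closure variables G, A, probs, task and label_dim are assumed to be defined by the enclosing function):

# Hypothetical usage of the returned dictionary:
datasets = iteration(0)
train_pairs = datasets[0]['train']['pairs']    # (source, target) node pairs for training
train_labels = datasets[0]['train']['label']   # directed link labels for those pairs
graph_tensor = datasets[0]['graph']            # torch LongTensor of observed edges, shape (2, E)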
Example #17
def graph_links_from_csv(nodes_filepath, 
                         links_filepath,
                         sample_sizes=[10, 20],
                         train_pct=0.1, val_pct=0.1, sep=',', 
                         holdout_pct=None, 
                         holdout_for_inductive=False,
                         missing_label_value=None,
                         random_state=None,
                         verbose=1):
    """
    Loads graph data from CSV files. 
    Returns generators for links in graph for use with GraphSAGE model.
    Args:
        nodes_filepath(str): file path to training CSV containing node attributes
        links_filepath(str): file path to training CSV describing links among nodes
        sample_sizes(list): Number of nodes to sample at each neighborhood level.
        train_pct(float): Proportion of edges to use for training.
                          Default is 0.1.
                          Note that train_pct is applied after val_pct is applied.
        val_pct(float): Proportion of edges to use for validation
        sep (str):  delimiter for CSVs. Default is comma.
        random_state (int):  random seed for train/test split
        verbose (boolean): verbosity
    Return:
        tuple of EdgeSequenceWrapper objects for train and validation sets and LinkPreprocessor
    """

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.data import EdgeSplitter
    except:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)


    #----------------------------------------------------------------
    # read graph structure
    #----------------------------------------------------------------
    nx_sep = None if sep in [' ', '\t'] else sep
    G = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)
    print(nx.info(G))




    #----------------------------------------------------------------
    # read node attributes
    #----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = len(node_attr.columns.values) - 1 # subtract ID and treat all other columns as features
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    node_data = pd.read_csv(nodes_filepath, header=None, names=feature_names, sep=sep)
    node_data.index = node_data.index.map(str)
    df = node_data[node_data.index.isin(list(G.nodes()))]
    for col in feature_names:
        if not isinstance(node_data[col].values[0], str): continue
        df = pd.concat([df, df[col].astype('str').str.get_dummies().add_prefix(col+'_')], axis=1, sort=False)
        df = df.drop([col], axis=1)
    feature_names = df.columns.values
    node_data = df
    node_features = node_data[feature_names].values
    for nid, f in zip(node_data.index, node_features):
        G.node[nid][sg.globalvar.TYPE_ATTR_NAME] = "node"  
        G.node[nid]["feature"] = f


    #----------------------------------------------------------------
    # train/validation sets
    #----------------------------------------------------------------
    edge_splitter_test = EdgeSplitter(G)
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(p=val_pct, method="global", keep_connected=True)
    edge_splitter_train = EdgeSplitter(G_test)
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(p=train_pct, method="global", keep_connected=True)
    epp = LinkPreprocessor(G, sample_sizes=sample_sizes)
    trn = epp.preprocess_train(G_train, edge_ids_train, edge_labels_train)
    val = epp.preprocess_valid(G_test, edge_ids_test, edge_labels_test)
    
    return (trn, val, epp)
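A hypothetical call, with the CSV paths chosen only for illustration:

# Hypothetical invocation; 'nodes.csv' and 'links.csv' are placeholder paths.
trn, val, preproc = graph_links_from_csv('nodes.csv',
                                         'links.csv',
                                         sample_sizes=[10, 20],
                                         train_pct=0.1,
                                         val_pct=0.1)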
Example #18
def main():
    """Load the graph, create the embeddings, evaluate them with link prediction and save the results."""

    args = parse_args()

    graph = utils.load_graph(args.weighted, args.directed, args.input)
    utils.print_graph_info(graph, "original graph")

    graph.remove_nodes_from(list(nx.isolates(graph)))
    utils.print_graph_info(graph, "graph without isolates")

    edge_splitter_test = EdgeSplitter(graph)

    graph_test, X_test_edges, y_test = edge_splitter_test.train_test_split(
        p=args.test_percentage, method="global")

    edge_splitter_train = EdgeSplitter(graph_test, graph)
    graph_train, X_edges, y = edge_splitter_train.train_test_split(
        p=args.train_percentage, method="global")
    X_train_edges, X_model_selection_edges, y_train, y_model_selection = train_test_split(
        X_edges, y, train_size=0.75, test_size=0.25)

    logger.info(f'\nEmbedding algorithm started.')
    start = time.time()

    embedding.create_embedding(args, graph_train)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embeddings = utils.load_embedding(args.output)

    logger.info(f'\nEmbedding evaluation started.')
    start = time.time()
    results = evaluation.evaluate(args.classifier, embeddings, X_train_edges,
                                  y_train, X_model_selection_edges,
                                  y_model_selection)

    time_diff = time.time() - start
    logger.info(f'Embedding evaluation finished in {time_diff:.2f} seconds.')

    best_result = max(results, key=lambda result: result["roc_auc"])

    logger.info(
        f"\nBest roc_auc_score on train set using '{best_result['binary_operator'].__name__}': {best_result['roc_auc']}."
    )

    logger.info(f'\nEmbedding algorithm started.')
    start = time.time()

    embedding.create_embedding(args, graph_test)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embedding_test = utils.load_embedding(args.output)

    roc_auc, average_precision, accuracy, f1 = evaluation.evaluate_model(
        best_result["classifier"], embedding_test,
        best_result["binary_operator"], X_test_edges, y_test)

    logger.info(
        f"Scores on test set using '{best_result['binary_operator'].__name__}'."
    )
    logger.info(f"roc_auc_score: {roc_auc}")
    logger.info(f"average_precision_score: {average_precision}")
    logger.info(f"accuracy_score: {accuracy}")
    logger.info(f"f1_score on test set using: {f1}\n")

    if args.results:
        evaluation.save_evaluation_results(
            args.dataset, args.method, args.classifier,
            (roc_auc, average_precision, accuracy, f1), args.results)