Example #1
    def initialize(self, **hyper_params):

        if(not "batch_size" in hyper_params.keys()):
            batch_size = 16
        if(not "layer_sizes" in hyper_params.keys()):
            num_samples = [25, 10]
        if(not "num_samples" in hyper_params.keys()):
            layer_sizes = [256, 256]
        if(not "bias" in hyper_params.keys()):
            bias = True
        if(not "dropout" in hyper_params.keys()):
            dropout = 0.0
        if(not "lr" in hyper_params.keys()):
            lr = 1e-3
        if(not "num_walks" in hyper_params.keys()):
            num_walks = 1
        if(not "length" in hyper_params.keys()):
            length = 5

        self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
        self.nodes = list(self.graph.nodes())

        del self.nodes_df
        del self.edges_df

        unsupervised_samples = UnsupervisedSampler(
            self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
        )

        # Train iterators
        train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
        self.train_flow = train_gen.flow(unsupervised_samples)

        # Model definition: Keras functional API + StellarGraph layers
        graphsage = GraphSAGE(
            layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout, normalize="l2"
        )

        x_inp, x_out = graphsage.in_out_tensors()

        prediction = link_classification(
            output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
        )(x_out)

        self.model = keras.Model(inputs=x_inp, outputs=prediction)

        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=lr),
            loss=keras.losses.binary_crossentropy,
            metrics=[keras.metrics.binary_accuracy],
        )

        # Every second input tensor corresponds to the source node of each
        # sampled pair; those inputs alone define the node-embedding model
        x_inp_src = x_inp[0::2]
        x_out_src = x_out[0]
        self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

        self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

        return self.model.get_weights()
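# Hypothetical usage sketch (not part of the original class): train on the
# unsupervised link-prediction task, then read one embedding per node.
# `wrapper` and `epochs=5` are assumptions for illustration only.
wrapper.initialize(batch_size=32, lr=1e-3)
wrapper.model.fit(wrapper.train_flow, epochs=5, verbose=1)
node_embeddings = wrapper.embedding_model.predict(wrapper.node_gen)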
Example #2
def create_train_gen(G):
    # This generates random walk samples from the graph
    unsupervised_samples = UnsupervisedSampler(
        G,
        nodes=list(G.nodes()),
        length=config.WALK_LENGTH,
        number_of_walks=config.NUM_WALKS,
    )

    return GraphSAGELinkGenerator(G, config.BATCH_SIZE, config.NUM_SAMPLES).flow(
        unsupervised_samples
    )
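# Hypothetical usage (not in the original snippet): feed the returned flow
# into a GraphSAGE link-prediction model. config.LAYER_SIZES and config.EPOCHS
# are assumed names, following the style of the other config.* constants.
train_flow = create_train_gen(G)

generator = GraphSAGELinkGenerator(G, config.BATCH_SIZE, config.NUM_SAMPLES)
graphsage = GraphSAGE(
    layer_sizes=config.LAYER_SIZES, generator=generator, bias=True, dropout=0.0, normalize="l2"
)
x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
model.fit(train_flow, epochs=config.EPOCHS, verbose=1)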
Example #3
    def _fit_inductive_embedder(self, train_graph):
        """Fit inductive embedder (predictive model and embeddings)."""
        if self.model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
            return _fit_deep_graph_infomax(train_graph, self.params,
                                           self.model_name)

        unsupervised_samples = UnsupervisedSampler(
            train_graph,
            nodes=train_graph.nodes(),
            length=self.params["length"],
            number_of_walks=self.params["number_of_walks"])

        generator = _dispatch_generator(train_graph,
                                        self.model_name,
                                        self.params,
                                        generator_type="edge")
        layer_sizes = _dispatch_layer_sizes(self.model_name, self.params)
        embedding_layer = _dispatch_inductive_layer(layer_sizes, generator,
                                                    self.model_name,
                                                    self.params)

        x_inp, x_out = embedding_layer.in_out_tensors()

        prediction = link_classification(output_dim=1,
                                         output_act="sigmoid",
                                         edge_embedding_method="ip")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        model.compile(
            optimizer=optimizers.Adam(learning_rate=1e-3),
            loss=losses.binary_crossentropy,
            metrics=[metrics.binary_accuracy],
        )
        train_generator = generator.flow(unsupervised_samples)

        model.fit(train_generator,
                  epochs=self.params["epochs"],
                  shuffle=True,
                  verbose=0)

        if self.model_name == "attri2vec":
            x_inp_src = x_inp[0]
        elif self.model_name == "graphsage":
            x_inp_src = x_inp[0::2]

        x_out_src = x_out[0]

        embedding_model = Model(inputs=x_inp_src, outputs=x_out_src)
        return embedding_model
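# Hedged usage sketch (not in the original): applying the embedding model
# returned above to every node, assuming the "graphsage" branch and
# stellargraph's GraphSAGENodeGenerator; `embedder`, the batch size, and the
# num_samples values are assumptions for illustration.
from stellargraph.mapper import GraphSAGENodeGenerator

embedding_model = embedder._fit_inductive_embedder(train_graph)
node_gen = GraphSAGENodeGenerator(
    train_graph, batch_size=50, num_samples=[10, 5]
).flow(train_graph.nodes())
node_embeddings = embedding_model.predict(node_gen)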
Example #4

dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

nodes = list(G.nodes())
number_of_walks = 1
length = 5
unsupervised_samples = UnsupervisedSampler(
    G, nodes=nodes, length=length, number_of_walks=number_of_walks
)

batch_size = 50
epochs = 4
num_samples = [10, 5]

generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)
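# Hedged continuation (the snippet breaks off before the model is built): the
# standard unsupervised-GraphSAGE setup seen in the other examples; the layer
# sizes here are an assumption and must match len(num_samples).
layer_sizes = [50, 50]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.0, normalize="l2"
)
x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
model.fit(train_gen, epochs=epochs, verbose=1)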



Example #5
        if n not in Gtrain_nx.nodes():
            Gtrain_nx.add_node(n)
    nx.set_node_attributes(Gtrain_nx, "paper", "label")

## Train the embedding model
G = sg.StellarGraph(Gnx, node_features=node_features)
Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features)

# The graph G together with the unsupervised sampler will be used to generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
link_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples)
train_gen = link_gen.flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=link_gen,
                      bias=bias,
                      dropout=0.0,
                      normalize="l2")
x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method='ip')(x_out)
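# Hedged continuation (not in the original snippet): compile and fit the
# link-prediction model on the sampled walks; the epoch count is an assumption.
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
model.fit(train_gen, epochs=5, verbose=1)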
Example #6
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split

from tensorflow import keras

# parameter specification
number_of_walks = 1
length = 5
batch_size = 500
epochs = 5
num_samples = [10, 10]
layer_sizes = [40, 30]
learning_rate = 5e-2

unsupervisedSamples = UnsupervisedSampler(Gs,
                                          nodes=list(Gs.nodes()),
                                          length=length,
                                          number_of_walks=number_of_walks)

generator = GraphSAGELinkGenerator(Gs, batch_size, num_samples)
train_gen = generator.flow(unsupervisedSamples)

assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=generator,
                      bias=True,
                      dropout=0.0,
                      normalize="l2")

x_inp, x_out = graphsage.in_out_tensors()
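# Hedged completion (not in the original snippet): attach the link classifier
# and train, reusing the learning_rate and epochs defined above.
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method="ip")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
model.fit(train_gen, epochs=epochs, verbose=1)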
Example #7
def main():
    with open(r"training.txt", "r") as f:
        reader = csv.reader(f)
        training = list(reader)
    # in order of training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        reader = csv.reader(f)
        testing = list(reader)
    # in order of testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))
    # Uncomment the relevant lines below for a reduced corpus with stopword
    # removal. In the future, integrate a stemmer here (multi-language).
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        # `with` closes the file automatically; no explicit close() needed
        with open(corpus_path, 'rb') as f:
            corpus = pickle.load(f)
        with open(ids_path, 'rb') as f:
            ids = pickle.load(f)
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY),
                             position=0,
                             leave=True):
            with open(NODE_INFO_DIRECTORY + filename,
                      'r',
                      encoding='UTF-8',
                      errors='ignore') as f:
                doc_string = []
                for line in f:
                    [
                        doc_string.append(token.strip())
                        for token in line.lower().strip().split(" ")
                        if token != ""
                    ]
                corpus.append(' '.join(doc_string))
                ids.append(filename[:-4])
        with open(corpus_path, 'wb') as f:
            pickle.dump(corpus, f)
        with open(ids_path, 'wb') as f:
            pickle.dump(ids, f)

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if os.path.exists(stemmed_corpus_path):
        with open(stemmed_corpus_path, 'rb') as f:
            stemmed_corpus = pickle.load(f)
    else:
        # Fall back to the raw corpus so node_info below can still be built
        print('Stemmed corpus unavailable; falling back to raw corpus')
        stemmed_corpus = corpus

    # in order of alphabetical text information i.e. 0, 1, 10, 100
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    train_graph_split_path = 'pickles/train_graph_split.PICKLE'

    if os.path.exists(train_graph_split_path):
        with open(train_graph_split_path, 'rb') as f:
            keep_indices = pickle.load(f)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        with open(train_graph_split_path, 'wb') as f:
            pickle.dump(keep_indices, f)

    data_train_val = training.iloc[keep_indices]

    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if os.path.exists(lda_path):
        with open(lda_path, 'rb') as f:
            lda = pickle.load(f)
        f.close()

    lda.shape

    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)

    all_node_features = node_data[feature_names]

    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)

    print(G_all.info())

    print(G_all.get_feature_for_nodes(['0']))

    ## Get DBLP Subgraph
    ### with papers published before a threshold year

    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)

    subgraph_node_ids = sorted(list(G_sub_nx.nodes))

    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)

    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)

    print(G_sub.info())

    ## Train attri2vec on the DBLP Subgraph

    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))

    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)

    batch_size = 50
    epochs = int(input('Enter number of epochs: '))

    generator = Attri2VecLinkGenerator(G_sub, batch_size)

    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec, for node pair inputs:
    x_inp, x_out = attri2vec.in_out_tensors()

    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    history = model.fit(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=bool(int(input('Multiprocessing? 1/0: '))),
        workers=int(input('Number of workers: ')),
        shuffle=True,
    )
    print(history.history)
    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))
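    # Hedged addition (not in the original): expose the node-embedding half of
    # the trained model, assuming stellargraph's Attri2VecNodeGenerator.
    from stellargraph.mapper import Attri2VecNodeGenerator

    x_inp_src = x_inp[0]  # attri2vec takes a single source-node input tensor
    x_out_src = x_out[0]
    embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    node_gen = Attri2VecNodeGenerator(G_sub, batch_size).flow(subgraph_node_ids)
    node_embeddings = embedding_model.predict(node_gen)
    print("Node embeddings shape: {}".format(node_embeddings.shape))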
    return model