def initialize(self, **hyper_params):
    """Build the graph, the unsupervised GraphSAGE models and their data flows.

    Creates a StellarGraph from ``self.nodes_df`` / ``self.edges_df`` (both
    attributes are deleted afterwards), then sets ``self.graph``,
    ``self.nodes``, ``self.train_flow``, ``self.model``,
    ``self.embedding_model`` and ``self.node_gen``.

    Keyword Args:
        batch_size: Batch size for both generators. Default ``16``.
        num_samples: Neighbour samples per GraphSAGE layer. Default ``[25, 10]``.
        layer_sizes: Hidden size of each GraphSAGE layer. Default ``[256, 256]``.
        bias: Whether the GraphSAGE layers use a bias term. Default ``True``.
        dropout: Dropout rate of the GraphSAGE layers. Default ``0.0``.
        lr: Learning rate handed to the Adam optimizer. Default ``1e-3``.
        num_walks: Walks per node for the unsupervised sampler. Default ``1``.
        length: Walk length for the unsupervised sampler. Default ``5``.

    Returns:
        The initial weights of the link-prediction model
        (``self.model.get_weights()``).
    """
    # Read every hyper-parameter with its default. Previously a value was only
    # bound when the key was *missing*, so any supplied key ended in a
    # NameError, and the "layer_sizes"/"num_samples" checks were crossed.
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())

    # The raw frames are not needed once the graph has been built.
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_gen,
        bias=bias,
        dropout=dropout,
        normalize="l2",
    )

    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # The embedding model reuses every second input tensor and the first
    # output tensor of the link model.
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(
        self.graph, batch_size, num_samples
    ).flow(self.nodes)

    return self.model.get_weights()
def create_train_gen(G):
    """Return the GraphSAGE link-generator flow over random-walk samples of ``G``.

    Walk length, walks per node, batch size and neighbour sample sizes are
    all read from ``config``.
    """
    # Random-walk sampler covering every node of the graph.
    walk_sampler = UnsupervisedSampler(
        G,
        nodes=list(G.nodes()),
        length=config.WALK_LENGTH,
        number_of_walks=config.NUM_WALKS,
    )
    link_generator = GraphSAGELinkGenerator(
        G, config.BATCH_SIZE, config.NUM_SAMPLES
    )
    return link_generator.flow(walk_sampler)
def _fit_inductive_embedder(self, train_graph):
    """Fit inductive embedder (predictive model and embeddings).

    The Deep Graph Infomax variants (``cluster_gcn_dgi``, ``cluster_gat_dgi``)
    are delegated to ``_fit_deep_graph_infomax``. Otherwise a link-prediction
    model is trained on samples from ``UnsupervisedSampler`` and a second
    Keras model exposing the source-node embedding is returned.

    Args:
        train_graph: Graph to sample walks from and train on.

    Returns:
        The result of ``_fit_deep_graph_infomax`` for the DGI variants,
        otherwise the Keras embedding model.

    Raises:
        ValueError: If ``self.model_name`` is neither a DGI variant,
            ``"attri2vec"`` nor ``"graphsage"``.
    """
    if self.model_name in ["cluster_gcn_dgi", "cluster_gat_dgi"]:
        return _fit_deep_graph_infomax(train_graph, self.params, self.model_name)

    unsupervised_samples = UnsupervisedSampler(
        train_graph,
        nodes=train_graph.nodes(),
        length=self.params["length"],
        number_of_walks=self.params["number_of_walks"],
    )

    generator = _dispatch_generator(
        train_graph, self.model_name, self.params, generator_type="edge"
    )
    layer_sizes = _dispatch_layer_sizes(self.model_name, self.params)
    embedding_layer = _dispatch_inductive_layer(
        layer_sizes, generator, self.model_name, self.params
    )

    x_inp, x_out = embedding_layer.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=1e-3),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    train_generator = generator.flow(unsupervised_samples)
    model.fit(
        train_generator, epochs=self.params["epochs"], shuffle=True, verbose=0
    )

    # Pick the input tensor(s) that feed the source node of each pair.
    if self.model_name == "attri2vec":
        x_inp_src = x_inp[0]
    elif self.model_name == "graphsage":
        x_inp_src = x_inp[0::2]
    else:
        # Previously this fell through and failed with an UnboundLocalError
        # on ``x_inp_src``; report the real problem instead.
        raise ValueError(
            "Unsupported inductive model name: {!r}".format(self.model_name)
        )

    x_out_src = x_out[0]
    embedding_model = Model(inputs=x_inp_src, outputs=x_out_src)

    return embedding_model
# -

# Character 3-gram TF-IDF matrix over the values of `features`, wrapped in a graph.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 3))
X = vectorizer.fit_transform([*features.values()])
G = StellarGraph(X.toarray())

# Load the Cora dataset; note that this rebinds G to the dataset's graph.
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

# Random-walk and batching settings.
number_of_walks, length = 1, 5
batch_size, epochs = 50, 4
num_samples = [10, 5]

# Sample walks from every node and feed them to the link generator.
nodes = list(G.nodes())
unsupervised_samples = UnsupervisedSampler(
    G,
    nodes=nodes,
    length=length,
    number_of_walks=number_of_walks,
)
generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)

# +
# feature extraction and preprocessing data
import librosa
import pandas as pd
if n not in Gtrain_nx.nodes():
    Gtrain_nx.add_node(n)
nx.set_node_attributes(Gtrain_nx, "paper", "label")

## Train the embedding model
G = sg.StellarGraph(Gnx, node_features=node_features)
Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features)

# The training graph together with the unsupervised sampler will be used to
# generate samples.
actual_nodes_train = list(Gtrain.nodes())
if testtype == 'nodes':
    assert set(nodes_train).issuperset(actual_nodes_train)

unsupervised_samples = UnsupervisedSampler(
    Gtrain,
    nodes=actual_nodes_train,
    length=length_of_walks,
    number_of_walks=number_of_walks,
)
train_gen = GraphSAGELinkGenerator(
    Gtrain, batch_size, num_samples
).flow(unsupervised_samples)

# Build the model: one layer size is required per neighbour-sampling hop.
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=train_gen,
    bias=bias,
    dropout=0.0,
    normalize="l2",
)
x_inp, x_out = graphsage.build(flatten_output=False)

# Score each node pair with a sigmoid over the inner product ('ip') of its embeddings.
link_scorer = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method='ip'
)
prediction = link_scorer(x_out)
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split
from tensorflow import keras

# parameter specification
number_of_walks = 1
length = 5
batch_size = 500
epochs = 5
num_samples = [10, 10]
layer_sizes = [40, 30]
learning_rate = 5e-2

# One layer size is required per neighbour-sampling hop.
assert len(layer_sizes) == len(num_samples)

# NOTE(review): node ids come from G while the walks run on Gs - confirm
# that both graphs hold the same node ids.
unsupervisedSamples = UnsupervisedSampler(
    Gs,
    nodes=G.nodes(),
    length=length,
    number_of_walks=number_of_walks,
)
generator = GraphSAGELinkGenerator(Gs, batch_size, num_samples)
train_gen = generator.flow(unsupervisedSamples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
    normalize="l2",
)
x_inp, x_out = graphsage.build()
def _read_space_separated_rows(path, columns):
    """Read ``path`` and return its rows, split on single spaces, as a DataFrame."""
    with open(path, "r") as f:
        rows = [record[0].split(" ") for record in csv.reader(f)]
    return pd.DataFrame(rows, columns=columns)


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load executes arbitrary code; only use trusted cache files.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Write ``obj`` to the pickle file at ``path``, replacing any old content."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def main():
    """Train an attri2vec link-prediction model on a sample of the training pairs.

    Steps, in order:

    1. Read ``training.txt`` / ``testing.txt`` (space-separated node pairs).
    2. Load the per-node text corpus and ids from the pickle cache, or build
       them from ``node_information/text/`` and cache them.
    3. Load the stemmed corpus and the LDA feature matrix from their pickles.
    4. Build a graph from all positive training links and a sub-graph from a
       cached 5% random sample of the training pairs.
    5. Interactively ask for the walk/epoch/worker settings, train attri2vec
       on the sub-graph and save the model to an ``.h5`` file.

    Returns:
        The trained Keras link-prediction model.

    Raises:
        FileNotFoundError: If the stemmed-corpus pickle or the LDA-matrix
            pickle is missing (both are required and are not built here).
    """
    # in order of training examples
    training = _read_space_separated_rows(
        r"training.txt", ['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    # in order of testing examples
    testing = _read_space_separated_rows(r"testing.txt", ['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))

    # uncomment lines for reduced corpus with stopword removal.
    # In future integrate stemmer here, multi-language
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        corpus = _load_pickle(corpus_path)
        ids = _load_pickle(ids_path)
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY),
                             position=0,
                             leave=True):
            with open(NODE_INFO_DIRECTORY + filename,
                      'r',
                      encoding='UTF-8',
                      errors='ignore') as f:
                doc_string = []
                for line in f:
                    # Lower-case, split on single spaces, drop empty tokens.
                    doc_string.extend(
                        token.strip()
                        for token in line.lower().strip().split(" ")
                        if token != "")
                corpus.append(' '.join(doc_string))
                # Drop the last four characters of the file name to get the id.
                ids.append(filename[:-4])
        _dump_pickle(corpus, corpus_path)
        _dump_pickle(ids, ids_path)

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if not os.path.exists(stemmed_corpus_path):
        print('Stemmed corpus unavailable')
        # Previously execution continued and failed later with an
        # UnboundLocalError on ``stemmed_corpus``.
        raise FileNotFoundError(
            "Required stemmed corpus not found: {}".format(stemmed_corpus_path))
    stemmed_corpus = _load_pickle(stemmed_corpus_path)

    # Rows follow the order of ``ids`` (original note: alphabetical text
    # information, i.e. 0, 1, 10, 100).
    # NOTE(review): os.listdir does not guarantee any ordering - confirm the
    # cached ids really are in the order the LDA matrix expects.
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    # Reuse the cached 5% sample of training rows so runs are comparable.
    train_graph_split_path = 'pickles/train_graph_split.PICKLE'
    if os.path.exists(train_graph_split_path):
        keep_indices = _load_pickle(train_graph_split_path)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        _dump_pickle(keep_indices, train_graph_split_path)

    data_train_val = training.iloc[keep_indices]

    # Edge list of every positive ('1') training pair.
    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if not os.path.exists(lda_path):
        # Previously ``lda`` was simply left unbound in this case.
        raise FileNotFoundError(
            "Required LDA matrix not found: {}".format(lda_path))
    lda = _load_pickle(lda_path)

    # Ten feature columns, w_0 .. w_9, indexed by the row number as a string.
    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)

    all_node_features = node_data[feature_names]

    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)

    print(G_all.info())

    # Result is unused; kept from the original as a feature lookup for node '0'.
    G_all.get_feature_for_nodes(['0'])

    ## Build the training sub-graph
    ### from the positive pairs of the sampled training rows
    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)

    subgraph_node_ids = sorted(list(G_sub_nx.nodes))

    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)

    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)

    print(G_sub.info())

    ## Train attri2vec on the sub-graph
    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))

    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)

    batch_size = 50
    epochs = int(input('Enter number of epochs: '))

    generator = Attri2VecLinkGenerator(G_sub, batch_size)

    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec, for node pair inputs:
    x_inp, x_out = attri2vec.build()

    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # The two prompts below are asked in this order, as before.
    history = model.fit_generator(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=bool(int(input('Multiprocessing? 1/0: '))),
        workers=int(input('Number of workers: ')),
        shuffle=True,
    )

    print(history)

    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))

    return model