def initialize(self, **hyper_params):
    # Read each hyperparameter if supplied, otherwise fall back to a default.
    # (The original guarded assignments never read the supplied values and had
    # the layer_sizes/num_samples defaults swapped.)
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())
    del self.nodes_df
    del self.edges_df

    # Random-walk pairs for the unsupervised link-prediction pretext task
    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model definition - Keras functional API + StellarGraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_gen,
        bias=bias,
        dropout=dropout,
        normalize="l2",
    )
    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Embedding model: the even-indexed inputs feed the source-node branch
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)
    self.node_gen = GraphSAGENodeGenerator(self.graph, batch_size, num_samples).flow(self.nodes)

    return self.model.get_weights()
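# Usage sketch for the unsupervised initialize() above. Assumes an instance
# `obj` of the surrounding class with nodes_df/edges_df already populated; the
# epoch count is illustrative. After fitting on the link-prediction pretext
# task, per-node embeddings come from the exposed embedding_model.
obj.initialize(batch_size=16, num_walks=1, length=5)
obj.model.fit(obj.train_flow, epochs=5, verbose=1)
node_embeddings = obj.embedding_model.predict(obj.node_gen)  # shape: (num_nodes, layer_sizes[-1])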
def initialize(self, **hyper_params):
    # Read each hyperparameter if supplied, otherwise fall back to a default
    # (fixes the swapped layer_sizes/num_samples defaults in the original)
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model definition - Keras functional API + StellarGraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # Return the number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
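# Usage sketch (assumes an instance `obj` whose nodes/edges DataFrames are
# already set; the epoch count is illustrative). initialize() returns the
# train/test sizes, then the stored flows drive fitting and evaluation:
n_train, n_test = obj.initialize(batch_size=20, lr=1e-2)
obj.model.fit(obj.train_flow, epochs=10, verbose=1)
metrics = obj.model.evaluate(obj.test_flow)
for name, value in zip(obj.model.metrics_names, metrics):
    print(f"{name}: {value:.4f}")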
def initialize(self, **hyper_params):
    # Read each hyperparameter if supplied, otherwise fall back to a default
    # (fixes the swapped layer_sizes/num_samples defaults in the original)
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Model definition - Keras functional API + StellarGraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )
    x_inp, x_out = graphsage.in_out_tensors()
    prediction = link_classification(
        output_dim=1, output_act="relu", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
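# Sketch of one plausible way to consume the weights returned by initialize()
# (hypothetical `worker` instance): a coordinator could broadcast them to other
# replicas via set_weights() before or between rounds of local fitting.
initial_weights = worker.initialize(train_split=0.2)
worker.model.set_weights(initial_weights)  # e.g. overwrite with aggregated weights later
worker.model.fit(worker.train_flow, epochs=5)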
def create_graphSAGE_model(graph, link_prediction=False):
    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph, batch_size=2, num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = GraphSAGE(layer_sizes=[8, 8], generator=generator, bias=True, dropout=0.5)

    if link_prediction:
        # Expose input and output sockets of graphsage, for source and destination nodes:
        x_inp, x_out = base_model.in_out_tensors()
        prediction = link_classification(
            output_dim=1, output_act="relu", edge_embedding_method="ip"
        )(x_out)
        keras_model = Model(inputs=x_inp, outputs=prediction)
    else:
        x_inp, x_out = base_model.in_out_tensors()
        prediction = layers.Dense(units=2, activation="softmax")(x_out)
        keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
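# Minimal sketch of calling create_graphSAGE_model on a toy graph. The feature
# columns and values are assumptions; node ids 1-3 match the hard-coded
# edge_ids_train inside the function.
import pandas as pd
import stellargraph as sg

toy_nodes = pd.DataFrame({"f0": [0.1, 0.2, 0.3], "f1": [1.0, 0.5, 0.2]}, index=[1, 2, 3])
toy_edges = pd.DataFrame({"source": [1, 2], "target": [2, 3]})
toy_graph = sg.StellarGraph(nodes=toy_nodes, edges=toy_edges)

base_model, keras_model, generator, train_gen = create_graphSAGE_model(
    toy_graph, link_prediction=True
)
keras_model.compile(optimizer="adam", loss="binary_crossentropy")
keras_model.fit(train_gen, epochs=2)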
def run_model(self):
    graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph()
    (train_targets, valid_targets, test_targets,
     train_labels, valid_labels, test_labels) = self.get_train_valid_test(label_series_sampled)

    batch_size = self.hyperparams["batch_size"]
    num_samples = self.hyperparams["num_samples"]

    generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)
    train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=self.hyperparams["layer_sizes"],
        generator=generator,
        bias=self.hyperparams["bias"],
        dropout=self.hyperparams["dropout"],
    )
    x_inp, x_out = graphsage_model.in_out_tensors()
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=self.hyperparams["lr"]),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )

    valid_gen = generator.flow(valid_labels.index, valid_targets)
    history = model.fit(
        train_gen,
        epochs=self.hyperparams["n_epochs"],
        validation_data=valid_gen,
        verbose=self.hyperparams["verbose"],
        shuffle=True,
        use_multiprocessing=True,
    )
    sg.utils.plot_history(history)

    test_gen = generator.flow(test_labels.index, test_targets)
    test_metrics = model.evaluate(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
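# run_model() above reads everything from self.hyperparams. A sketch of the
# dictionary it expects, inferred from the keys it accesses (the values shown
# are illustrative, not from the original source):
hyperparams = {
    "batch_size": 50,
    "num_samples": [10, 5],   # neighbours sampled per GraphSAGE hop
    "layer_sizes": [32, 32],  # one entry per hop in num_samples
    "bias": True,
    "dropout": 0.3,
    "lr": 5e-3,
    "n_epochs": 20,
    "verbose": 1,
}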
def get_dropout(input_tensor, p=0.1, mc=False):
    # With mc=True, dropout stays active at inference time (Monte Carlo dropout)
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)


graphsage_model = GraphSAGE(
    layer_sizes=[64, 32, 16],
    generator=generator,
    activations=["relu", "relu", "linear"],
    bias=True,
    aggregator=MaxPoolingAggregator,
    dropout=0.1,
)

x_inp, x_out = graphsage_model.in_out_tensors()
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = layers.Dense(units=10, activation="relu")(x_out)
x_out = get_dropout(x_out, p=0.1, mc=True)  # pass a real boolean, not the truthy string 'mc'
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.summary()

# Alternative losses tried previously (noderankloss is a custom loss defined elsewhere):
# model.compile(optimizer=optimizers.Adam(), loss=noderankloss(), metrics=["acc"])
# model.compile(optimizer=optimizers.Adam(), loss="mean_squared_error", metrics=["acc"])
model.compile(
    optimizer=optimizers.Adam(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=["acc"],
)
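# Because get_dropout(..., mc=True) keeps dropout active at predict time, an
# uncertainty estimate can be obtained from repeated stochastic forward passes.
# A sketch, assuming a fitted `model` and a node generator flow `test_gen`:
import numpy as np

mc_preds = np.stack([model.predict(test_gen) for _ in range(30)])  # (30, n_nodes, n_classes)
mean_pred = mc_preds.mean(axis=0)   # averaged class probabilities
uncertainty = mc_preds.std(axis=0)  # spread across passes as a confidence proxy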
epochs = 20
batch_size = 20  # assumed value; the original snippet uses batch_size without defining it
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

# Model definition - Keras functional API + StellarGraph layers
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3
)
x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)

# Set weights
weights = np.load(path_weights, allow_pickle=True)
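# The loaded array is presumably a per-layer weight list saved earlier, e.g. with
# np.save(path_weights, np.array(model.get_weights(), dtype=object)). A sketch of
# restoring it, assuming the architecture above matches the saved one:
model.set_weights(weights)
print(model.evaluate(train_flow))  # sanity check on the restored model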
def train(G_list, nodes_subjects_list, run_num=1, start_month_id=220, end_month_id=264):
    # Pre-define lists to record results; each outer-loop list holds the
    # corresponding inner-loop lists
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []

    # Create output folders for models, histories, figures, and test results
    for folder in ['model', 'history', 'figure', 'figure_distribution', 'test_result']:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # Outer loop: repeat the whole experiment run_num times to reduce variance
    # Inner loop: iterate over the training months
    for j in range(run_num):
        num_samples = [40]

        # Lists recording the results of the inner loop
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []

        # i == 0 corresponds to month id 220
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()

            # Train on one month, validate on the next, test on the one after
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2

            # Training generator built from the train_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(
                list(nodes_subjects_list[train_idx].index),
                nodes_subjects_list[train_idx].values,
                shuffle=False)

            # Build the GraphSAGE model
            graphsage_model = GraphSAGE(
                layer_sizes=[1],
                generator=generator,
                bias=True,
                aggregator=sg.layer.MeanAggregator,
                normalize=None)

            # Expose the input/output tensors used to build the Keras model
            x_inp, x_out = graphsage_model.in_out_tensors()
            # prediction = layers.Dense(units=1)(x_out)

            # Validation generator built from the val_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(
                list(nodes_subjects_list[val_idx].index),
                nodes_subjects_list[val_idx].values)

            # Test generator built from the test_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)

            # Build the model from the input/output tensors
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(
                monitor='val_loss',
                min_delta=1e-3,
                patience=10,
                verbose=2,
                mode='auto',
                restore_best_weights=True)
            # pearson_r is a custom correlation metric assumed to be defined elsewhere
            model.compile(
                optimizer=optimizers.Adam(learning_rate=0.05),
                loss=losses.mean_squared_error,
                metrics=[pearson_r])
            history = model.fit(
                train_gen,
                epochs=500,
                validation_data=val_gen,
                verbose=0,
                shuffle=False,
                callbacks=[monitor])

            # Evaluate on the test month and persist the metrics as JSON
            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            json.dump(
                test_result_dict,
                open('test_result/' + str(train_idx + 220) + "_" +
                     str(test_idx + 220) + '.json', 'w'))

            test_preds = model.predict(test_gen)

            end = time.time()

            # Record the results for this month
            graph_history_list.append(history)       # training history
            model_list.append(model)                 # trained model
            train_gen_list.append(train_gen)         # kept for computing intermediate-layer outputs later
            time_list.append(end - start)            # wall-clock time
            model_weight_list.append(model.weights)  # model weights
            test_result.append(test_metrics[1])

            # # Save the model
            # model.save('model/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.h5')
            # # Save the training history
            # json.dump(history.history,
            #           open('history/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.json', 'w'))
            # # Save the training-history figure
            # sg.utils.plot_history(history)
            # plt.title(str(train_idx + 220) + '->' + str(val_idx + 220))
            # plt.savefig('figure/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.png')
            # plt.show()

            # Save the distribution of the test predictions vs. the originals
            plt.figure(figsize=(5, 10))
            plt.subplot(211)
            plt.hist(test_preds, bins=500)
            plt.title("Distribution of Prediction of " + str(test_idx + 220))
            plt.subplot(212)
            plt.hist(nodes_subjects_list[test_idx].values, bins=500)
            plt.title("Distribution of Origin of " + str(test_idx + 220))
            plt.xlabel("ic=" + str(test_metrics[1]))
            plt.savefig('figure_distribution/distribution-' + str(train_idx + 220) +
                        "_" + str(test_idx + 220) + '.png', dpi=300)
            plt.show()

            print(str(i + 220) + "'s " + str(j + 1) + " run has finished")
            print()

        # Append the inner-loop results to the outer-loop lists
        graph_history_list_list.append(graph_history_list)
        model_list_list.append(model_list)
        train_gen_list_list.append(train_gen_list)
        time_list_list.append(time_list)
        model_weight_list_list.append(model_weight_list)

    return (graph_history_list_list, model_list_list, train_gen_list_list,
            time_list_list, model_weight_list_list, test_result)