def graphsage_pipeline(G, node_subjects, layer_sizes=[32, 32]):
    train_subjects, val_subjects, test_subjects = training_split(node_subjects)

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    train_gen = generator.flow(train_subjects.index, train_subjects.values, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=generator,
        bias=True,
        dropout=0.5,
    )

    model = build_model(graphsage_model, train_subjects.values.shape[1])

    val_gen = generator.flow(val_subjects.index, val_subjects.values)
    es_callback = EarlyStopping(monitor="val_acc", patience=50, restore_best_weights=True)

    history = model.fit(
        train_gen,
        epochs=200,
        validation_data=val_gen,
        verbose=0,
        shuffle=False,
        callbacks=[es_callback],
    )

    plot_results(history)
    test_metrics(generator, model, test_subjects)
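# The pipeline above relies on helpers (training_split, build_model,
# plot_results, test_metrics) that are not shown. A minimal sketch of what
# build_model likely does, following the standard StellarGraph pattern used
# by the other snippets in this collection (the helper name and body are
# assumptions, not the original code):
def build_model(graphsage_model, n_classes):
    from tensorflow.keras import Model, layers, losses, optimizers

    # Expose the model's input/output tensors and attach a softmax classifier head
    x_inp, x_out = graphsage_model.in_out_tensors()
    prediction = layers.Dense(units=n_classes, activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )
    return model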
def run_model(self):
    graph_sampled, label_series_sampled = self.prepare_data_for_stellargraph()
    (train_targets, valid_targets, test_targets,
     train_labels, valid_labels, test_labels) = self.get_train_valid_test(label_series_sampled)

    batch_size = self.hyperparams["batch_size"]
    num_samples = self.hyperparams["num_samples"]
    generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)
    train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=self.hyperparams["layer_sizes"],
        generator=generator,
        bias=self.hyperparams["bias"],
        dropout=self.hyperparams["dropout"],
    )

    x_inp, x_out = graphsage_model.in_out_tensors()
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=self.hyperparams["lr"]),
        loss=losses.categorical_crossentropy,
        metrics=["acc"],
    )

    valid_gen = generator.flow(valid_labels.index, valid_targets)
    history = model.fit(
        train_gen,
        epochs=self.hyperparams["n_epochs"],
        validation_data=valid_gen,
        verbose=self.hyperparams["verbose"],
        shuffle=True,
        use_multiprocessing=True,
    )
    sg.utils.plot_history(history)

    test_gen = generator.flow(test_labels.index, test_targets)
    test_metrics = model.evaluate(test_gen)
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
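# run_model reads all of its settings from self.hyperparams. A plausible
# example dict, with values drawn from the other snippets in this collection
# (illustrative, not from the source):
hyperparams = {
    "batch_size": 50,
    "num_samples": [10, 5],
    "layer_sizes": [32, 32],
    "bias": True,
    "dropout": 0.5,
    "lr": 0.005,
    "n_epochs": 200,
    "verbose": 2,
}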
def preprocess_valid(self, node_ids):
    """
    Preprocess validation nodes (transductive inference).
    node_ids (list): list of node IDs that the generator will yield
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')
    if self.y_encoding is None:
        raise Exception('Unset parameters. Are you sure you called preprocess_train first?')

    # subset df for validation nodes
    df_val = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target using the encoder fitted in preprocess_train
    val_targets = self.y_encoding.transform(df_val[["target"]].to_dict('records'))

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # return generator
    if self.G_sg is None:
        self.G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    generator = GraphSAGENodeGenerator(self.G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
    val_gen = generator.flow(df_val.index, val_targets, shuffle=False)
    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(val_gen)
def preprocess_train(self, node_ids):
    """
    Preprocess the training set.
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')

    # subset df for training nodes
    df_tr = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target
    self.y_encoding = sklearn.feature_extraction.DictVectorizer(sparse=False)
    train_targets = self.y_encoding.fit_transform(df_tr[["target"]].to_dict('records'))

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # return generator
    G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    self.G_sg = G_sg
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
    train_gen = generator.flow(df_tr.index, train_targets, shuffle=True)
    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(train_gen)
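# For reference, DictVectorizer one-hot-encodes the string-valued 'target'
# column; a tiny standalone example of its behaviour:
from sklearn.feature_extraction import DictVectorizer

enc = DictVectorizer(sparse=False)
enc.fit_transform([{"target": "a"}, {"target": "b"}])
# -> array([[1., 0.], [0., 1.]]), with feature names ['target=a', 'target=b']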
def test(edgelist, node_data, model_file, batch_size, target_name="subject"):
    """
    Load the serialized model and evaluate it on all nodes in the graph.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        model_file: Location of the Keras model to load
        batch_size: Size of batch for inference
        target_name: Name of the target column in node_data
    """
    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist
    Gnx = nx.from_pandas_edgelist(edgelist)

    # We must also load the target encoding to convert model predictions
    encoder_file = model_file.replace(
        "cora_example_model", "cora_example_encoding"
    ).replace(".h5", ".pkl")
    with open(encoder_file, "rb") as f:
        target_encoding = pickle.load(f)[0]

    # Encode targets with the pre-trained encoder
    node_targets = target_encoding.transform(
        node_data[[target_name]].to_dict("records")
    )
    node_ids = node_data.index

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_features=node_features)

    # Load Keras model
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )
    print("Loaded model:")
    model.summary()

    # Infer the per-hop sample counts from the model's input shapes
    # TODO: Can we move this to the library?
    num_samples = [
        int(model.input_shape[ii + 1][1] / model.input_shape[ii][1])
        for ii in range(len(model.input_shape) - 1)
    ]

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=42)
    all_gen = generator.flow(node_ids, node_targets)

    # Evaluate and print metrics
    all_metrics = model.evaluate_generator(all_gen)
    print("\nAll-node Evaluation:")
    for name, val in zip(model.metrics_names, all_metrics):
        print("\t{}: {:0.4f}".format(name, val))
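# Why the num_samples inference in test() works: a two-layer GraphSAGE model
# built with num_samples=[10, 5] takes three input tensors of shapes
#   (None, 1, F), (None, 10, F), (None, 50, F)
# (head nodes, 1-hop samples, 2-hop samples), so dividing each input's
# second dimension by the previous one recovers [10, 5].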
def preprocess_test(self, df_te, G_te):
    """
    Preprocess for inductive inference.
    df_te (DataFrame): pandas DataFrame containing new node attributes
    G_te (Graph): a networkx Graph containing new nodes
    """
    try:
        import networkx as nx
    except ImportError:
        raise ImportError("Please install networkx: pip install networkx")
    if self.y_encoding is None:
        raise Exception(
            "Unset parameters. Are you sure you called preprocess_train first?"
        )

    # get aggregated df
    # df_agg = pd.concat([df_te, self.df]).drop_duplicates(keep='last')
    df_agg = pd.concat([df_te, self.df])
    # df_te = pd.concat([self.df, df_agg]).drop_duplicates(keep=False)

    # get aggregated graph
    is_subset = set(self.G.nodes()) <= set(G_te.nodes())
    if not is_subset:
        raise ValueError("Nodes in self.G must be a subset of G_te")
    G_agg = nx.compose(self.G, G_te)

    # one-hot-encode target (use dummy targets if none are provided)
    if "target" in df_te.columns:
        test_targets = self.y_encoding.transform(
            df_te[["target"]].to_dict("records"))
    else:
        test_targets = [-1] * df_te.shape[0]

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # return generator
    G_sg = sg.StellarGraph(G_agg, node_features=df_agg[self.feature_names])
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
    test_gen = generator.flow(df_te.index, test_targets, shuffle=False)
    from .sg_wrappers import NodeSequenceWrapper
    return NodeSequenceWrapper(test_gen)
def test_graphsage_constructor():
    gs = GraphSAGE(
        layer_sizes=[4], n_samples=[2], input_dim=2, normalize="l2", multiplicity=1
    )
    assert gs.dims == [2, 4]
    assert gs.n_samples == [2]
    assert gs.max_hops == 1
    assert gs.bias
    assert len(gs._aggs) == 1

    # Check incorrect normalization flag
    with pytest.raises(ValueError):
        GraphSAGE(
            layer_sizes=[4],
            n_samples=[2],
            input_dim=2,
            normalize=lambda x: x,
            multiplicity=1,
        )

    with pytest.raises(ValueError):
        GraphSAGE(
            layer_sizes=[4],
            n_samples=[2],
            input_dim=2,
            normalize="unknown",
            multiplicity=1,
        )

    # Check requirement for generator or n_samples
    with pytest.raises(KeyError):
        GraphSAGE(layer_sizes=[4])

    # Construction from generator
    G = example_graph(feature_size=3)
    gen = GraphSAGENodeGenerator(G, batch_size=2, num_samples=[2, 2])
    gs = GraphSAGE(layer_sizes=[4, 8], generator=gen, bias=True)

    # GraphSAGE should no longer accept a Sequence
    t_gen = gen.flow([1, 2])
    with pytest.raises(TypeError):
        GraphSAGE(layer_sizes=[4, 8], generator=t_gen, bias=True)

    assert gs.dims == [3, 4, 8]
    assert gs.n_samples == [2, 2]
    assert gs.max_hops == 2
    assert gs.bias
    assert len(gs._aggs) == 2
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters,
    evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of the number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0->1)
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split the test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes, test_targets, train_size=500, test_size=None, random_state=5214
    )

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )

    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(
        train_gen, epochs=num_epochs, validation_data=val_gen, verbose=2, shuffle=False
    )

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p
        for gt_subject, p in zip(node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from
    #       x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
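# Usage sketch tying this train() to the test() function earlier in this
# collection: save_str above produces the model file name that test() then
# maps to the matching encoder file (argument values are illustrative):
# train(edgelist, node_data, layer_size=[32, 32], num_samples=[10, 5])
# test(edgelist, node_data,
#      "cora_example_model_n10_5_l32_32_d0.0_r0.005.h5", batch_size=100)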
G = StellarGraph.from_networkx(graph, node_features="feature")
print(G.node_types)
G.check_graph_for_ml()

nodes = [node for node in graph.nodes]
shuffle(nodes)
train_ids = nodes[:5000]
test_ids = nodes[5000:]
train_labels = [graph.nodes[id]["_class"] for id in train_ids]
test_labels = [graph.nodes[id]["_class"] for id in test_ids]
all_labels = train_labels + test_labels
train_labels = np.array(train_labels).reshape(len(train_ids), 1)
test_labels = np.array(test_labels).reshape(len(test_ids), 1)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10, 10])
train_data_gen = generator.flow(train_ids, train_labels)
test_data_gen = generator.flow(test_ids, test_labels)
all_gen = generator.flow(list(nodes), all_labels)
print("Node Gen done!")

base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
x_in, x_out = base_model.build()
prediction = layers.Dense(units=2, activation="softmax")(x_out)
print("model building done")

model = Model(inputs=x_in, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

tensorboard = callbacks.TensorBoard(
    log_dir="logs", embeddings_freq=1, update_freq=1, histogram_freq=1
)
tboard = model.fit(
    train_data_gen,
    epochs=4,
    validation_data=test_data_gen,
    verbose=True,
    shuffle=False,
    callbacks=[tensorboard],
)
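# Note: categorical_crossentropy with the 2-unit softmax above expects one-hot
# targets of shape (n, 2), whereas the label arrays are raw class values of
# shape (n, 1). A minimal fix, assuming integer class labels 0/1:
# from tensorflow.keras.utils import to_categorical
# train_labels = to_categorical(np.ravel(train_labels), num_classes=2)
# test_labels = to_categorical(np.ravel(test_labels), num_classes=2)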
# Create a GraphSAGENodeGenerator object, which feeds data from the graph
# to a model. It requires batch_size and the number of neighbours to sample
# at each of a chosen number of layers.
batch_size = 50
num_samples = [10, 5]  # two layers

# Create a data generator for our graph, specified by which type
# of model (GraphSAGE) and the learning task (Node) ...
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
print(train_subjects.index)

# Create an iterator for our training data. This takes the indices of the
# nodes in the graph to be used for training, as well as their respective
# one-hot encoded label vectors.
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)

# Specify the graph-learning model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
    aggregator=MeanAggregator,
)

# Extract the input and output tensors of the model. Set the predictions
# of the model to be a softmax layer taking the output tensor as its input.
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
        # return K.mean(tf.linalg.diag_part(tempmatrix))
        return temploss

    return loss


indices = bf.expandy(batch_size, 2)

#%% #################################### Model Evaluation ###################################################
#%% ##########################################################################################################

filepath = r'.\models\EGR_Graphsage\pl13000_customloss.h5'
model1 = models.load_model(filepath, custom_objects={
    "MeanAggregator": MeanAggregator,
    'loss': noderankloss(indices)
})

all_nodes = targetdf.index.values
all_mapper = generator.flow(all_nodes)
y_pred = model1.predict(all_mapper)

# Kendall tau rank correlation between true and predicted centrality
ktau, p_value = stats.kendalltau(targetdf['btw'], y_pred)
print(ktau)

# top-k performance
vs.compute_topkperf(targetdf['btw'], y_pred, 0.9)
    stratify=labels_sampled,
    random_state=42,
)

# Turn labels into one-hot encodings
target_encoding = preprocessing.LabelBinarizer()
train_targets = target_encoding.fit_transform(train_labels)
val_targets = target_encoding.transform(val_labels)

# Create a node generator for the undirected graph
batch_size = 50
num_samples = [10, 10]
generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)

# Create an iterator for the training data
train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

# Build the GraphSAGE model
graphsage_model = GraphSAGE(
    layer_sizes=[32, 32],
    generator=generator,
    bias=True,
    dropout=0.5,
)

x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
def train_model(Gnx, train_data, test_data, all_features):
    output_results = {}
    from collections import Counter

    # TODO: save size of dataset, train_data, and test_data
    # save the count of each subject in the blocks
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    # node_features = train_data[feature_names]
    # print(feature_names)

    G = sg.StellarGraph(Gnx, node_features=all_features)
    # TODO: save graph info
    print(G.info())
    print("writing graph.dot")
    # write_dot(Gnx, "graph.dot")
    output_results['graph_info'] = G.info()

    print("building the graph generator...")
    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    # generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    class_weights = class_weight.compute_class_weight(
        'balanced',
        np.unique(train_data["subject"].to_list()),
        train_data["subject"].to_list())
    # Keras expects class_weight as a dict mapping class index to weight
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(
        test_data[["subject"]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        # graphsage_model = HinSAGE(
        # layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,  # train_gen,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    # x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )

    print("training the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  # , "program": test_data['program']})
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))

    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    # pred_program = np.unique(df['program'].values)

    # save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })

    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
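# A possible follow-up using the handles returned above: build an embedding
# model from the same input tensors to inspect the learned representations
# (a sketch, not from the source):
# from tensorflow.keras import Model
# embedding_model = Model(inputs=x_inp, outputs=x_out)
# node_embeddings = embedding_model.predict(generator.flow(test_data.index))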
def train(G_list, nodes_subjects_list, run_num=1, start_month_id=220, end_month_id=264):
    # Define lists up front to record results; the outer-loop lists
    # hold one inner-loop list per run
    graph_history_list_list = []
    model_list_list = []
    train_gen_list_list = []
    time_list_list = []
    model_weight_list_list = []

    # Number of repeated runs
    run_num = run_num
    # Months to train on (end_month_id has an upper bound)
    start_month_id = start_month_id
    end_month_id = end_month_id

    # Create folders for models, histories, figures, and test results
    if not os.path.exists('model'):
        os.makedirs('model')
    if not os.path.exists('history'):
        os.makedirs('history')
    if not os.path.exists('figure'):
        os.makedirs('figure')
    if not os.path.exists('figure_distribution'):
        os.makedirs('figure_distribution')
    if not os.path.exists('test_result'):
        os.makedirs('test_result')

    # The outer loop repeats training run_num times to reduce variance;
    # the inner loop iterates over the training months
    for j in range(run_num):
        num_samples = [40]

        # Lists recording the inner loop's data
        graph_history_list = []
        model_list = []
        train_gen_list = []
        time_list = []
        model_weight_list = []
        test_result = []

        # i == 0 corresponds to month 220
        for i in range(start_month_id - 220, end_month_id - 220):
            start = time.time()

            # Train on one month, validate on the next, test on the one after
            train_idx = i
            val_idx = i + 1
            test_idx = i + 2

            # Build the training generator from the train_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[train_idx],
                batch_size=len(nodes_subjects_list[train_idx]),
                num_samples=num_samples,
                seed=100)
            train_gen = generator.flow(
                list(nodes_subjects_list[train_idx].index),
                nodes_subjects_list[train_idx].values,
                shuffle=False)

            # Build the GraphSAGE model
            graphsage_model = GraphSAGE(
                layer_sizes=[1],
                generator=generator,
                bias=True,
                aggregator=sg.layer.MeanAggregator,
                normalize=None)

            # Extract the input/output tensors to build a Keras model
            x_inp, x_out = graphsage_model.in_out_tensors()
            # prediction = layers.Dense(units=1)(x_out)

            # Build the validation generator from the val_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[val_idx],
                batch_size=len(nodes_subjects_list[val_idx]),
                num_samples=num_samples,
                seed=100)
            val_gen = generator.flow(
                list(nodes_subjects_list[val_idx].index),
                nodes_subjects_list[val_idx].values)

            # Build the test generator from the test_idx month
            generator = GraphSAGENodeGenerator(
                G=G_list[test_idx],
                batch_size=len(nodes_subjects_list[test_idx]),
                num_samples=num_samples,
                seed=100)
            test_gen = generator.flow(
                list(nodes_subjects_list[test_idx].index),
                nodes_subjects_list[test_idx].values)

            # Build the model from the input/output tensors
            model = Model(inputs=x_inp, outputs=x_out)
            monitor = EarlyStopping(
                monitor='val_loss',
                min_delta=1e-3,
                patience=10,
                verbose=2,
                mode='auto',
                restore_best_weights=True)
            model.compile(
                optimizer=optimizers.Adam(lr=0.05),
                loss=losses.mean_squared_error,
                metrics=[pearson_r])
            history = model.fit(
                train_gen,
                epochs=500,
                validation_data=val_gen,
                verbose=0,
                shuffle=False,
                callbacks=[monitor])

            test_metrics = model.evaluate(test_gen)
            test_result_dict = {}
            print("\n" + str(train_idx + 220) + "'s Test Set: " +
                  str(test_idx + 220) + "'s Metrics:")
            for name, val in zip(model.metrics_names, test_metrics):
                print("\t{}: {:0.4f}".format(name, val))
                test_result_dict[name] = val
            json.dump(
                test_result_dict,
                open('test_result/' + str(train_idx + 220) + "_" +
                     str(test_idx + 220) + '.json', 'w'))

            test_preds = model.predict(test_gen)
            end = time.time()

            # Record results for this month
            graph_history_list.append(history)       # training history
            model_list.append(model)                 # the model itself
            train_gen_list.append(train_gen)         # kept for later intermediate-layer outputs
            time_list.append(end - start)            # runtime
            model_weight_list.append(model.weights)  # model weights
            test_result.append(test_metrics[1])

            # # Save the model
            # model.save('model/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.h5')
            # # Save the training history
            # json.dump(history.history,
            #           open('history/' + str(train_idx + 220) +
"_" + str(val_idx + 220) + '.json', 'w')) # # 存训练过程图片figure # sg.utils.plot_history(history) # plt.title(str(train_idx + 220) + '->' + str(val_idx + 220)) # plt.savefig('figure/' + str(train_idx + 220) + "_" + str(val_idx + 220) + '.png') # plt.show() # 存test的prediction的distribution plt.figure(figsize=(5, 10)) plt.subplot(211) plt.hist(test_preds, bins=500) plt.title("Distribution of Prediction of " + str(test_idx + 220)) plt.subplot(212) plt.hist(nodes_subjects_list[test_idx].values, bins=500) plt.title("Distribution of Origin of " + str(test_idx + 220)) plt.xlabel("ic=" + str(test_metrics[1])) plt.savefig('figure_distribution/distribution-' + str(train_idx + 220) + "_" + str(test_idx + 220) + '.png', dpi=300) plt.show() print(str(i + 220) + "'s " + str(j + 1) + " run has finished") print() # 将小循环的数据保存 graph_history_list_list.append(graph_history_list) model_list_list.append(model_list) train_gen_list_list.append(train_gen_list) time_list_list.append(time_list) model_weight_list_list.append(model_weight_list) return graph_history_list_list, model_list_list, train_gen_list_list, time_list_list, model_weight_list_list, test_result
## ########################################### build graph #################################################
#%% ##########################################################################################################

G = StellarGraph.from_networkx(g, node_features="feature")
print(G.info())

#%% #################################### GraphSAGE model loading ############################################
#%% ##########################################################################################################

batch_size = 70
num_samples = [15, 10, 5, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

targets = np.array(targetdf['btw'])
test_gen = generator.flow(targetdf.index, targets)

indices = bf.expandy(batch_size, 2)


def noderankloss(index):
    def loss(y_true, y_pred):
        # tf.print(tf.gather(y_true, tf.constant(index[:, 0])))
        yt = tf.math.sigmoid(
            tf.gather(y_true, tf.constant(index[:, 0])) -
            tf.gather(y_true, tf.constant(index[:, 1])))
        yp = tf.math.sigmoid(
            tf.gather(y_pred, tf.constant(index[:, 0])) -
            tf.gather(y_pred, tf.constant(index[:, 1])))
        # tf.print(tf.shape(yt))
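# The loss above is truncated here (its tail, `return temploss` / `return
# loss`, appears in the evaluation snippet earlier in this collection). A
# minimal self-contained reading of the pairwise ranking idea, assuming
# binary cross-entropy between the two sigmoid-transformed pairwise
# differences (`index` is an (n_pairs, 2) array of row indices; bf.expandy
# is an unshown helper that generates those pairs):
import tensorflow as tf

def pairwise_rank_loss(index):
    def loss(y_true, y_pred):
        yt = tf.math.sigmoid(
            tf.gather(y_true, index[:, 0]) - tf.gather(y_true, index[:, 1]))
        yp = tf.math.sigmoid(
            tf.gather(y_pred, index[:, 0]) - tf.gather(y_pred, index[:, 1]))
        # Penalize pairs whose predicted ordering differs from the true ordering
        return tf.reduce_mean(tf.keras.losses.binary_crossentropy(yt, yp))
    return loss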
# train_targets = target_encoding.fit_transform(temp_train_subjects).toarray()
# test_targets = target_encoding.transform(temp_test_subjects).toarray()
train_targets = np.array(train_subjects)
test_targets = np.array(test_subjects)

## #################################### GraphSAGE model building ############################################
#%% ##########################################################################################################

batch_size = 40
# number of neighbours to sample at each hop
num_samples = [15, 10, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
train_gen = generator.flow(
    train_subjects.index, train_targets,
    shuffle=True)  # train_subjects.index selects the training nodes
test_gen = generator.flow(test_subjects.index, test_targets)

# GraphSAGE options, for reference:
#   aggregator: e.g. MaxPoolingAggregator
#   layer_sizes (list): hidden feature dimensions for each layer
#   activations (list): activations applied to each layer's output


def get_dropout(input_tensor, p=0.1, mc=False):
    # With mc=True, dropout stays active at inference time (Monte Carlo dropout)
    if mc:
        return Dropout(p)(input_tensor, training=True)
    else:
        return Dropout(p)(input_tensor)


graphsage_model = GraphSAGE(layer_sizes=[64, 32, 16],
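# Usage sketch for get_dropout above: with mc=True, dropout remains active at
# prediction time, so repeated forward passes give a Monte Carlo estimate of
# predictive uncertainty (names here are illustrative):
# preds = np.stack([model.predict(test_gen) for _ in range(20)])
# mean_pred, uncertainty = preds.mean(axis=0), preds.std(axis=0)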
def _train_model(self, gnx, train_data, test_data, all_features, target_feature_name):
    subject_groups_train = Counter(train_data[target_feature_name])
    subject_groups_test = Counter(test_data[target_feature_name])

    graph = sg.StellarGraph(gnx, node_features=all_features)

    output_results = {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'subject_groups_train': subject_groups_train,
        'subject_groups_test': subject_groups_test,
        'graph_info': graph.info()
    }

    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[[target_feature_name]].to_dict('records'))

    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data[target_feature_name].to_list()),
        y=train_data[target_feature_name].to_list())
    class_weights = dict(enumerate(class_weights))

    test_targets = target_encoding.transform(
        test_data[[target_feature_name]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graph_sage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,  # train_gen,
        bias=True,
        dropout=0.5,
    )
    print('building model...')

    x_inp, x_out = graph_sage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print('compiling model...')
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=['acc', metrics.categorical_accuracy],
    )

    print('training the model...')
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit(
        train_gen,
        epochs=self.num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate(test_gen)
    print('Test Set Metrics:')
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        'Predicted': results,
        'True': test_data[target_feature_name]
    })
    clean_result_labels = df['Predicted'].map(
        lambda x: x.replace('subject=', ''))

    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })

    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    output_results['history'] = {
        'epochs': history.epoch,
        'training_log': history.history,
        'training_params': history.params
    }

    return generator, model, x_inp, x_out, history, target_encoding, output_results