import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from tqdm import tqdm

# Project-local helpers assumed to be importable from the surrounding code
# base: sgpr_args (config parser), tab_printer, SGTrainer, load_paires.


def main():
    args = sgpr_args()
    if len(sys.argv) > 1:
        args.load(sys.argv[1])
    else:
        args.load(os.path.abspath('./config/config.yml'))
    tab_printer(args)

    trainer = SGTrainer(args, False)
    trainer.model.eval()

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    for sequence in tqdm(args.sequences):
        print("sequence: ", sequence)
        gt_db = []
        pred_db = []
        graph_pairs = load_paires(
            os.path.join(args.pair_list_dir, sequence + ".txt"),
            args.graph_pairs_dir)
        batches = [
            graph_pairs[graph:graph + args.batch_size]
            for graph in range(0, len(graph_pairs), args.batch_size)
        ]
        for batch in tqdm(batches):
            pred, gt = trainer.eval_batch_pair(batch)
            pred_db.extend(pred)
            gt_db.extend(gt)
        assert len(pred_db) == len(gt_db)
        assert np.sum(gt_db) > 0  # gt_db should contain positive samples

        # calc metrics
        pred_db = np.array(pred_db)
        gt_db = np.array(gt_db)

        # save results
        gt_db_path = os.path.join(args.output_path, sequence + "_gt_db.npy")
        pred_db_path = os.path.join(args.output_path, sequence + "_DL_db.npy")
        np.save(gt_db_path, gt_db)
        np.save(pred_db_path, pred_db)

        # ROC
        fpr, tpr, roc_thresholds = metrics.roc_curve(gt_db, pred_db)
        roc_auc = metrics.auc(fpr, tpr)
        print("fpr: ", fpr)
        print("tpr: ", tpr)
        print("thresholds: ", roc_thresholds)
        print("roc_auc: ", roc_auc)

        # plot ROC curve
        plt.figure(0)
        lw = 2
        plt.plot(fpr, tpr, color='darkorange', lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('DL ROC Curve')
        plt.legend(loc="lower right")
        roc_out = os.path.join(args.output_path,
                               sequence + "_DL_roc_curve.png")
        plt.savefig(roc_out)

        # P-R
        precision, recall, pr_thresholds = metrics.precision_recall_curve(
            gt_db, pred_db)

        # plot P-R curve
        plt.figure(1)
        lw = 2
        plt.plot(recall, precision, color='darkorange', lw=lw,
                 label='P-R curve')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('DL Precision-Recall Curve')
        plt.legend(loc="lower right")
        pr_out = os.path.join(args.output_path, sequence + "_DL_pr_curve.png")
        plt.savefig(pr_out)
        if args.show:
            plt.show()

        # calc F1 score over the P-R curve and keep the maximum
        F1_score = 2 * precision * recall / (precision + recall)
        F1_score = np.nan_to_num(F1_score)
        F1_max_score = np.max(F1_score)
        f1_out = os.path.join(args.output_path, sequence + "_DL_F1_max.txt")
        print('F1 max score', F1_max_score)
        with open(f1_out, "w") as out:
            out.write(str(F1_max_score))
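# The excerpt above defines main() but no entry point. A minimal guard and
# example invocation might look like the following; the script file name in
# the usage note is an assumption, and the config path mirrors the default
# that main() falls back to.
if __name__ == "__main__":
    main()

# Example usage (shell), assuming this file is named eval_batch.py:
#   python eval_batch.py ./config/config.yml
#   python eval_batch.py                      # falls back to ./config/config.yml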
from grarep import GraRep
from parser import parameter_parser
from utils import read_graph, tab_printer


def learn_model(args):
    """
    Method to read the graph, create adjacency matrix powers, and learn the embedding.
    :param args: Arguments object.
    """
    A, nodes = read_graph(args.edge_path)
    model = GraRep(A, nodes, args)
    model.optimize()
    model.save_embedding()


if __name__ == "__main__":
    args = parameter_parser()
    tab_printer(args)
    learn_model(args)
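# A hedged follow-up sketch, not part of the script above: loading the
# embedding written by model.save_embedding(). It assumes the output is a CSV
# with one row per node (node id in the first column, embedding values in the
# remaining columns) and that the path matches the parser's output argument;
# both are assumptions, not guarantees from the code above.
import numpy as np
import pandas as pd

embedding_df = pd.read_csv("./output/embedding.csv")  # path is an assumption
node_ids = embedding_df.iloc[:, 0].to_numpy()
vectors = embedding_df.iloc[:, 1:].to_numpy(dtype=np.float32)
print(vectors.shape)  # (num_nodes, embedding_size)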
# Assumes the usual module-level imports of this training script:
# time, numpy as np, tensorflow (v1) as tf, FLAGS, and the project-local
# models, utils, and partition_utils modules.


def main(unused_argv):
  """Main function for running experiments."""
  # Load data
  utils.tab_printer(FLAGS.flag_values_dict())
  (full_adj, feats, y_train, y_val, y_test, train_mask, val_mask, test_mask,
   train_data, val_data, test_data,
   num_data) = utils.load_ne_data_transductive_sparse(
       FLAGS.data_prefix, FLAGS.dataset, FLAGS.precalc,
       list(map(float, FLAGS.split)))

  # Partition graph and do preprocessing
  if FLAGS.bsize > 1:  # combine multiple clusters per batch
    _, parts = partition_utils.partition_graph(full_adj, np.arange(num_data),
                                               FLAGS.num_clusters)
    parts = [np.array(pt) for pt in parts]
  else:
    (parts, features_batches, support_batches, y_train_batches,
     train_mask_batches) = utils.preprocess(full_adj, feats, y_train,
                                            train_mask, np.arange(num_data),
                                            FLAGS.num_clusters,
                                            FLAGS.diag_lambda,
                                            sparse_input=True)

  # Validation and test sets are preprocessed together here; an earlier
  # variant called utils.preprocess separately for each split.
  (_, val_features_batches, test_features_batches, val_support_batches,
   y_val_batches, y_test_batches, val_mask_batches,
   test_mask_batches) = utils.preprocess_val_test(full_adj, feats, y_val,
                                                  val_mask, y_test, test_mask,
                                                  np.arange(num_data),
                                                  FLAGS.num_clusters_val,
                                                  FLAGS.diag_lambda)

  idx_parts = list(range(len(parts)))

  # Define placeholders
  placeholders = {
      'support': tf.sparse_placeholder(tf.float32),
      'features': tf.sparse_placeholder(tf.float32),
      'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
      'labels_mask': tf.placeholder(tf.int32),
      'dropout': tf.placeholder_with_default(0., shape=()),
      'fm_dropout': tf.placeholder_with_default(0., shape=()),
      # GAT attention dropout
      'gat_dropout': tf.placeholder_with_default(0., shape=()),
      # helper variable for sparse dropout
      'num_features_nonzero': tf.placeholder(tf.int32)
  }

  # Create model
  if FLAGS.model == 'gcn':
    model = models.GCN(placeholders,
                       input_dim=feats.shape[1],
                       logging=True,
                       multilabel=FLAGS.multilabel,
                       norm=FLAGS.layernorm,
                       precalc=FLAGS.precalc,
                       num_layers=FLAGS.num_layers,
                       residual=False,
                       sparse_inputs=True)
  elif FLAGS.model == 'gcn_nfm':
    model = models.GCN_NFM(placeholders,
                           input_dim=feats.shape[1],
                           logging=True,
                           multilabel=FLAGS.multilabel,
                           norm=FLAGS.layernorm,
                           precalc=FLAGS.precalc,
                           num_layers=FLAGS.num_layers,
                           residual=False,
                           sparse_inputs=True)
  elif FLAGS.model == 'gat_nfm':
    gat_layers = list(map(int, FLAGS.gat_layers))
    model = models.GAT_NFM(placeholders,
                           input_dim=feats.shape[1],
                           logging=True,
                           multilabel=FLAGS.multilabel,
                           norm=FLAGS.layernorm,
                           precalc=FLAGS.precalc,
                           num_layers=FLAGS.num_layers,
                           residual=False,
                           sparse_inputs=True,
                           gat_layers=gat_layers)
  else:
    raise ValueError(str(FLAGS.model))

  # Initialize session
  sess = tf.Session()

  # Init variables
  sess.run(tf.global_variables_initializer())
  saver = tf.train.Saver()
  cost_val = []
  acc_val = []
  total_training_time = 0.0

  # Train model
  for epoch in range(FLAGS.epochs):
    t = time.time()
    np.random.shuffle(idx_parts)
    if FLAGS.bsize > 1:
      (features_batches, support_batches, y_train_batches,
       train_mask_batches) = utils.preprocess_multicluster(
           full_adj, parts, feats, y_train, train_mask, FLAGS.num_clusters,
           FLAGS.bsize, FLAGS.diag_lambda, True)
      for pid in range(len(features_batches)):
        # Use preprocessed batch data
        features_b = features_batches[pid]
        support_b = support_batches[pid]
        y_train_b = y_train_batches[pid]
        train_mask_b = train_mask_batches[pid]
        # Construct feed dictionary
        feed_dict = utils.construct_feed_dict(features_b, support_b,
                                              y_train_b, train_mask_b,
                                              placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        feed_dict.update({placeholders['fm_dropout']: FLAGS.fm_dropout})
        feed_dict.update({placeholders['gat_dropout']: FLAGS.gat_dropout})
        # Training step
        outs = sess.run([model.opt_op, model.loss, model.accuracy],
                        feed_dict=feed_dict)
    else:
      np.random.shuffle(idx_parts)
      for pid in idx_parts:
        # Use preprocessed batch data
        features_b = features_batches[pid]
        support_b = support_batches[pid]
        y_train_b = y_train_batches[pid]
        train_mask_b = train_mask_batches[pid]
        # Construct feed dictionary
        feed_dict = utils.construct_feed_dict(features_b, support_b,
                                              y_train_b, train_mask_b,
                                              placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        feed_dict.update({placeholders['fm_dropout']: FLAGS.fm_dropout})
        feed_dict.update({placeholders['gat_dropout']: FLAGS.gat_dropout})
        # Training step
        outs = sess.run([model.opt_op, model.loss, model.accuracy],
                        feed_dict=feed_dict)

    total_training_time += time.time() - t
    print_str = 'Epoch: %04d ' % (epoch + 1) + \
        'training time: {:.5f} '.format(total_training_time) + \
        'train_acc= {:.5f} '.format(outs[2])

    # Validation
    # TODO: merge validation into the training procedure
    if FLAGS.validation:
      cost, acc, micro, macro = evaluate(sess, model, val_features_batches,
                                         val_support_batches, y_val_batches,
                                         val_mask_batches, val_data,
                                         placeholders)
      cost_val.append(cost)
      acc_val.append(acc)
      print_str += 'val_acc= {:.5f} '.format(acc) + \
          'mi F1= {:.5f} ma F1= {:.5f} '.format(micro, macro)

    print(print_str)

    # Early stopping on the validation loss. Stopping on validation accuracy
    # instead was also tried and gave lower performance.
    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(
        cost_val[-(FLAGS.early_stopping + 1):-1]):
      tf.logging.info('Early stopping...')
      break

  tf.logging.info('Optimization Finished!')

  # Save model
  saver.save(sess, FLAGS.save_name)

  # Load model (using CPU for inference)
  with tf.device('/cpu:0'):
    sess_cpu = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))
    sess_cpu.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess_cpu, FLAGS.save_name)
    # Testing (validation and test share the same partitioned supports)
    test_cost, test_acc, micro, macro = evaluate(
        sess_cpu, model, test_features_batches, val_support_batches,
        y_test_batches, test_mask_batches, test_data, placeholders)
    print_str = 'Test set results: ' + 'cost= {:.5f} '.format(test_cost) + \
        'accuracy= {:.5f} '.format(test_acc) + \
        'mi F1= {:.5f} ma F1= {:.5f}'.format(micro, macro)
    tf.logging.info(print_str)
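# The training and test code above calls an evaluate() helper that is not part
# of this excerpt. The sketch below shows one way it could be implemented,
# assuming the model exposes loss, accuracy, and outputs tensors, that labels
# are single-label one-hot rows, and that the module-level imports of the
# training script (numpy as np, utils) are available; the real helper in the
# code base may differ.
from sklearn import metrics as sk_metrics


def evaluate(sess, model, features_batches, support_batches, y_batches,
             mask_batches, global_data, placeholders):
  """Batched evaluation over the partitioned validation/test clusters.

  global_data (val_data / test_data in the callers above) is unused in this
  sketch and kept only to match the call signature.
  """
  total_loss, total_acc, num_batches = 0.0, 0.0, 0
  all_preds, all_labels = [], []
  for i in range(len(features_batches)):
    feed_dict = utils.construct_feed_dict(features_batches[i],
                                          support_batches[i], y_batches[i],
                                          mask_batches[i], placeholders)
    loss, acc, outputs = sess.run([model.loss, model.accuracy, model.outputs],
                                  feed_dict=feed_dict)
    total_loss += loss
    total_acc += acc
    num_batches += 1
    # Keep only the masked (labelled) nodes of this cluster for F1.
    mask = np.asarray(mask_batches[i], dtype=bool)
    all_preds.append(np.argmax(outputs[mask], axis=1))
    all_labels.append(np.argmax(np.asarray(y_batches[i])[mask], axis=1))
  preds = np.concatenate(all_preds)
  labels = np.concatenate(all_labels)
  micro = sk_metrics.f1_score(labels, preds, average='micro')
  macro = sk_metrics.f1_score(labels, preds, average='macro')
  return total_loss / num_batches, total_acc / num_batches, micro, macro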