Example #1
def main():
    args = sgpr_args()
    # Load the config given on the command line, falling back to the default config file.
    if len(sys.argv) > 1:
        args.load(sys.argv[1])
    else:
        args.load(os.path.abspath('./config/config.yml'))
    tab_printer(args)
    trainer = SGTrainer(args, False)
    trainer.model.eval()
    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)
    for sequence in tqdm(args.sequences):
        print("sequence: ", sequence)
        gt_db = []
        pred_db = []
        graph_pairs = load_paires(
            os.path.join(args.pair_list_dir, sequence + ".txt"),
            args.graph_pairs_dir)
        batches = [
            graph_pairs[graph:graph + args.batch_size]
            for graph in range(0, len(graph_pairs), args.batch_size)
        ]
        for batch in tqdm(batches):
            pred, gt = trainer.eval_batch_pair(batch)
            pred_db.extend(pred)
            gt_db.extend(gt)
        assert len(pred_db) == len(gt_db)
        assert np.sum(gt_db) > 0  # gt_db should have positive samples
        # calc metrics
        pred_db = np.array(pred_db)
        gt_db = np.array(gt_db)
        # save results
        gt_db_path = os.path.join(args.output_path, sequence + "_gt_db.npy")
        pred_db_path = os.path.join(args.output_path, sequence + "_DL_db.npy")
        np.save(gt_db_path, gt_db)
        np.save(pred_db_path, pred_db)
        # ROC curve
        fpr, tpr, roc_thresholds = metrics.roc_curve(gt_db, pred_db)
        roc_auc = metrics.auc(fpr, tpr)
        print("fpr: ", fpr)
        print("tpr: ", tpr)
        print("thresholds: ", roc_thresholds)
        print("roc_auc: ", roc_auc)

        # plot ROC Curve
        plt.figure(0)
        lw = 2
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('DL ROC Curve')
        plt.legend(loc="lower right")
        roc_out = os.path.join(args.output_path,
                               sequence + "_DL_roc_curve.png")
        plt.savefig(roc_out)

        # Precision-Recall curve
        precision, recall, pr_thresholds = metrics.precision_recall_curve(
            gt_db, pred_db)
        # plot p-r curve
        plt.figure(1)
        lw = 2
        plt.plot(recall,
                 precision,
                 color='darkorange',
                 lw=lw,
                 label='P-R curve')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('DL Precision-Recall Curve')
        plt.legend(loc="lower right")
        pr_out = os.path.join(args.output_path, sequence + "_DL_pr_curve.png")
        plt.savefig(pr_out)
        if args.show:
            plt.show()
        # Close both figures so curves do not accumulate across sequences.
        plt.close('all')
        # calc F1-score
        F1_score = 2 * precision * recall / (precision + recall)
        F1_score = np.nan_to_num(F1_score)
        F1_max_score = np.max(F1_score)
        f1_out = os.path.join(args.output_path, sequence + "_DL_F1_max.txt")
        print('F1 max score', F1_max_score)
        with open(f1_out, "w") as out:
            out.write(str(F1_max_score))
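The last block of Example #1 derives the reported F1 score by sweeping every threshold on the precision-recall curve and keeping the best harmonic mean. A minimal, self-contained sketch of that metric logic, with made-up labels and scores purely for illustration:

import numpy as np
from sklearn import metrics

# Hypothetical ground-truth labels and predicted similarity scores.
gt_db = np.array([0, 0, 1, 1, 0, 1])
pred_db = np.array([0.10, 0.40, 0.35, 0.80, 0.20, 0.70])

precision, recall, thresholds = metrics.precision_recall_curve(gt_db, pred_db)
f1 = 2 * precision * recall / (precision + recall)
f1 = np.nan_to_num(f1)  # 0/0 points (precision == recall == 0) become 0
best = np.argmax(f1)
print("F1 max: %.3f" % f1[best])
print("best threshold: %.2f" % thresholds[min(best, len(thresholds) - 1)])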
Example #2
from grarep import GraRep
from parser import parameter_parser
from utils import read_graph, tab_printer


def learn_model(args):
    """
    Method to create adjacency matrix powers, read features, and learn embedding.
    :param args: Arguments object.
    """
    A, nodes = read_graph(args.edge_path)
    model = GraRep(A, nodes, args)
    model.optimize()
    model.save_embedding()


if __name__ == "__main__":
    args = parameter_parser()
    tab_printer(args)
    learn_model(args)
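Both examples above (and Example #3 below) call tab_printer only to pretty-print the parsed arguments before training starts; the actual implementation lives in each repository's utils module and is not shown on this page. A plausible minimal version, assuming a Texttable-based layout (an assumption for illustration, not the repositories' real code):

from texttable import Texttable

def tab_printer_sketch(args):
    """Print parsed arguments as a two-column table (hypothetical helper)."""
    # Accept either an argparse Namespace or a plain dict (as in Example #3).
    params = args if isinstance(args, dict) else vars(args)
    table = Texttable()
    table.add_rows([["Parameter", "Value"]] +
                   [[key.replace("_", " ").capitalize(), str(params[key])]
                    for key in sorted(params)])
    print(table.draw())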
Example #3
def main(unused_argv):
    """Main function for running experiments."""
    # Load data
    utils.tab_printer(FLAGS.flag_values_dict())
    (full_adj, feats, y_train, y_val, y_test, train_mask, val_mask, test_mask,
     train_data, val_data, test_data,
     num_data) = utils.load_ne_data_transductive_sparse(
         FLAGS.data_prefix, FLAGS.dataset, FLAGS.precalc,
         list(map(float, FLAGS.split)))

    # Partition graph and do preprocessing
    if FLAGS.bsize > 1:  # combine multiple clusters into each training batch
        _, parts = partition_utils.partition_graph(full_adj,
                                                   np.arange(num_data),
                                                   FLAGS.num_clusters)

        parts = [np.array(pt) for pt in parts]
    else:
        (parts, features_batches, support_batches, y_train_batches,
         train_mask_batches) = utils.preprocess(full_adj,
                                                feats,
                                                y_train,
                                                train_mask,
                                                np.arange(num_data),
                                                FLAGS.num_clusters,
                                                FLAGS.diag_lambda,
                                                sparse_input=True)
    # Preprocess the validation and test sets at the same time.
    (_, val_features_batches, test_features_batches, val_support_batches,
     y_val_batches, y_test_batches,
     val_mask_batches, test_mask_batches) = utils.preprocess_val_test(
         full_adj, feats, y_val, val_mask, y_test, test_mask,
         np.arange(num_data), FLAGS.num_clusters_val, FLAGS.diag_lambda)

    # (An earlier version preprocessed the validation and test sets separately
    #  with utils.preprocess; utils.preprocess_val_test now handles both at once.)
    idx_parts = list(range(len(parts)))

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        # 'features':
        #     tf.placeholder(tf.float32),
        'features': tf.sparse_placeholder(tf.float32),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'labels_mask': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'fm_dropout': tf.placeholder_with_default(0., shape=()),
        'gat_dropout': tf.placeholder_with_default(0.,
                                                   shape=()),  # gat attn drop
        'num_features_nonzero':
        tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    if FLAGS.model == 'gcn':
        model = models.GCN(placeholders,
                           input_dim=feats.shape[1],
                           logging=True,
                           multilabel=FLAGS.multilabel,
                           norm=FLAGS.layernorm,
                           precalc=FLAGS.precalc,
                           num_layers=FLAGS.num_layers,
                           residual=False,
                           sparse_inputs=True)
    elif FLAGS.model == 'gcn_nfm':
        model = models.GCN_NFM(placeholders,
                               input_dim=feats.shape[1],
                               logging=True,
                               multilabel=FLAGS.multilabel,
                               norm=FLAGS.layernorm,
                               precalc=FLAGS.precalc,
                               num_layers=FLAGS.num_layers,
                               residual=False,
                               sparse_inputs=True)
    elif FLAGS.model == 'gat_nfm':
        gat_layers = list(map(int, FLAGS.gat_layers))
        model = models.GAT_NFM(placeholders,
                               input_dim=feats.shape[1],
                               logging=True,
                               multilabel=FLAGS.multilabel,
                               norm=FLAGS.layernorm,
                               precalc=FLAGS.precalc,
                               num_layers=FLAGS.num_layers,
                               residual=False,
                               sparse_inputs=True,
                               gat_layers=gat_layers)
    else:
        raise ValueError(str(FLAGS.model))

    # Initialize session
    sess = tf.Session()

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    cost_val = []
    acc_val = []
    total_training_time = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()
        np.random.shuffle(idx_parts)
        if FLAGS.bsize > 1:
            (features_batches, support_batches, y_train_batches,
             train_mask_batches) = utils.preprocess_multicluster(
                 full_adj, parts, feats, y_train, train_mask,
                 FLAGS.num_clusters, FLAGS.bsize, FLAGS.diag_lambda, True)
            for pid in range(len(features_batches)):
                # Use preprocessed batch data
                features_b = features_batches[pid]
                support_b = support_batches[pid]
                y_train_b = y_train_batches[pid]
                train_mask_b = train_mask_batches[pid]
                # Construct feed dictionary
                feed_dict = utils.construct_feed_dict(features_b, support_b,
                                                      y_train_b, train_mask_b,
                                                      placeholders)
                feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                feed_dict.update(
                    {placeholders['fm_dropout']: FLAGS.fm_dropout})
                feed_dict.update(
                    {placeholders['gat_dropout']: FLAGS.gat_dropout})
                # Training step
                outs = sess.run([model.opt_op, model.loss, model.accuracy],
                                feed_dict=feed_dict)
        else:
            np.random.shuffle(idx_parts)
            for pid in idx_parts:
                # Use preprocessed batch data
                features_b = features_batches[pid]
                support_b = support_batches[pid]
                y_train_b = y_train_batches[pid]
                train_mask_b = train_mask_batches[pid]
                # Construct feed dictionary
                feed_dict = utils.construct_feed_dict(features_b, support_b,
                                                      y_train_b, train_mask_b,
                                                      placeholders)
                feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                feed_dict.update(
                    {placeholders['fm_dropout']: FLAGS.fm_dropout})
                feed_dict.update(
                    {placeholders['gat_dropout']: FLAGS.gat_dropout})
                # Training step
                outs = sess.run([model.opt_op, model.loss, model.accuracy],
                                feed_dict=feed_dict)

        total_training_time += time.time() - t
        print_str = 'Epoch: %04d ' % (
            epoch + 1) + 'training time: {:.5f} '.format(
                total_training_time) + 'train_acc= {:.5f} '.format(outs[2])

        # Validation
        # TODO: merge validation into the training procedure
        if FLAGS.validation:
            cost, acc, micro, macro = evaluate(sess, model,
                                               val_features_batches,
                                               val_support_batches,
                                               y_val_batches, val_mask_batches,
                                               val_data, placeholders)
            cost_val.append(cost)
            acc_val.append(acc)
            print_str += 'val_acc= {:.5f} '.format(
                acc) + 'mi F1= {:.5f} ma F1= {:.5f} '.format(micro, macro)

        # tf.logging.info(print_str)
        print(print_str)

        if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(
                cost_val[-(FLAGS.early_stopping + 1):-1]):
            tf.logging.info('Early stopping...')
            break

        # Early stopping on validation accuracy (gave lower performance than loss-based stopping):
        # if epoch > FLAGS.early_stopping and acc_val[-1] < np.mean(
        #     acc_val[-(FLAGS.early_stopping + 1):-1]):
        #   tf.logging.info('Early stopping...')
        #   break

    tf.logging.info('Optimization Finished!')

    # Save model
    saver.save(sess, FLAGS.save_name)

    # Load model (using CPU for inference)
    with tf.device('/cpu:0'):
        sess_cpu = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))
        sess_cpu.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess_cpu, FLAGS.save_name)
        # Testing
        test_cost, test_acc, micro, macro = evaluate(
            sess_cpu, model, test_features_batches, val_support_batches,
            y_test_batches, test_mask_batches, test_data, placeholders)
        print_str = 'Test set results: ' + 'cost= {:.5f} '.format(
            test_cost) + 'accuracy= {:.5f} '.format(
                test_acc) + 'mi F1= {:.5f} ma F1= {:.5f}'.format(micro, macro)
        tf.logging.info(print_str)
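The early-stopping rule in Example #3 stops training once the newest validation loss rises above the mean of the previous FLAGS.early_stopping losses. Factored out as a standalone helper with hypothetical names (a sketch of the same check, not code from the repository):

import numpy as np

def should_stop(cost_val, patience):
    """Return True once the newest validation loss exceeds the mean of the previous `patience` losses."""
    if len(cost_val) <= patience + 1:
        return False
    return cost_val[-1] > np.mean(cost_val[-(patience + 1):-1])

# e.g. should_stop([0.9, 0.8, 0.7, 0.78], patience=2) -> True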