Exemplo n.º 1
0
def evaluate_htne(project_dir="/nfs/zty/Graph/4-htne/emb"):
    """Score htne embeddings on each selected dataset and record the results.

    Reads the module-level ``args`` and ``logger``. The dataset list is
    ``iterate_datasets(dataset=args.dataset)`` sliced with
    ``args.start:args.end``. When ``args.run`` is truthy, ``run_htne`` is
    invoked on that list before evaluation; otherwise only a log line is
    emitted and the embedding files under ``project_dir`` are read as-is.

    For every dataset and history length the file
    ``<project_dir>/<name>.emb<hist_len>`` is loaded, the train/valid/test
    label edges are turned into feature tables with ``edge2tabular``, and the
    four values returned by ``lr_evaluate`` are forwarded to ``write_result``
    under the method name ``"htne"``.

    Args:
        project_dir: Directory holding the embedding files.
    """
    selected = iterate_datasets(dataset=args.dataset)
    selected = selected[args.start:args.end]

    if not args.run:
        logger.info("Use pretrained {} embeddings.".format(args.method))
    else:
        logger.info("Running {} embedding programs.".format(args.method))
        run_htne(dataset=args.dataset, n_jobs=args.n_jobs, fname=selected)
        logger.info("Done training embedding.")

    history_lengths = [20]
    for name in selected:
        logger.info(name)

        label_edges, label_nodes = load_label_edges(dataset=name)
        train_split, valid_split, test_split = id_map(label_edges[0],
                                                      label_nodes[0])

        for hist_len in history_lengths:
            emb_path = "{}/{}.emb{}".format(project_dir, name, hist_len)
            id2idx, embeds = load_embeddings(emb_path, skiprows=1, sep=" ")

            # Build (X, y) for train, valid and test, in the positional order
            # lr_evaluate is called with.
            eval_inputs = []
            for split in (train_split, valid_split, test_split):
                eval_inputs.append(edge2tabular(split, id2idx, embeds))
                eval_inputs.append(split["label"])

            vauc, acc, f1, auc = lr_evaluate(*eval_inputs)
            # "epoch" is recorded as a fixed constant, not read from the run.
            params = {"hist_len": hist_len, "epoch": 50}
            write_result(name, "htne", params, (vauc, acc, f1, auc))
Exemplo n.º 2
0
def evaluate_ctdne(project_dir="/nfs/zty/Graph/Dynamic-Graph/ctdne_embs"):
    """Score ctdne embeddings on each selected dataset and record the results.

    Reads the module-level ``args`` and ``logger``. The dataset list is
    ``iterate_datasets(dataset=args.dataset)`` sliced with
    ``args.start:args.end``. When ``args.run`` is truthy, ``run_ctdne`` is
    dispatched once per dataset through ``Parallel`` with ``args.n_jobs``
    workers before evaluation starts.

    For every dataset the file ``<project_dir>/<name>.emb`` is loaded, the
    train/valid/test label edges are turned into feature tables with
    ``edge2tabular``, and the four values returned by ``lr_evaluate`` are
    forwarded to ``write_result`` under the method name ``"ctdne"``.

    Args:
        project_dir: Directory holding the embedding files.
    """
    selected = iterate_datasets(dataset=args.dataset)
    selected = selected[args.start:args.end]

    if args.run:
        logger.info("Running {} embedding programs.".format(args.method))
        pool = Parallel(n_jobs=args.n_jobs)
        # One job per dataset; each job receives a single-element name list.
        pool(delayed(run_ctdne)(fname=[dataset_name])
             for dataset_name in selected)
        logger.info("Done {} embeddings.".format(args.method))

    for name in selected:
        message = "dataset={}, num_walk=10, walk_length=80, context_window=10"
        logger.info(message.format(name))

        emb_path = "{}/{}.emb".format(project_dir, name)
        id2idx, embeds = load_embeddings(emb_path, skiprows=0, sep=" ")

        label_edges, label_nodes = load_label_edges(dataset=name)
        train_split, valid_split, test_split = id_map(label_edges[0],
                                                      label_nodes[0])

        # Build (X, y) for train, valid and test, in the positional order
        # lr_evaluate is called with.
        eval_inputs = []
        for split in (train_split, valid_split, test_split):
            eval_inputs.append(edge2tabular(split, id2idx, embeds))
            eval_inputs.append(split["label"])

        vauc, acc, f1, auc = lr_evaluate(*eval_inputs)
        # These hyper-parameters are recorded as fixed constants.
        params = {"num_walk": 10, "walk_length": 80, "context_window": 10}
        write_result(name, "ctdne", params, (vauc, acc, f1, auc))
Exemplo n.º 3
0
def evaluate_node2vec(project_dir="/nfs/zty/Graph/0-node2vec/emb"):
    """Score node2vec embeddings over a (p, q) grid and record the results.

    Reads the module-level ``args`` and ``logger``. The dataset list is
    ``iterate_datasets()`` sliced with ``args.start:args.end``. When
    ``args.run`` is truthy, ``run_node2vec`` is invoked before evaluation;
    otherwise only a log line is emitted.

    For every combination of dataset, ``p`` and ``q`` the file
    ``<project_dir>/<name>-<p>-<q>.emb`` (p and q formatted with two
    decimals) is loaded, the train/valid/test label edges are turned into
    feature tables with ``edge2tabular``, and the four values returned by
    ``lr_evaluate`` are forwarded to ``write_result`` under the method name
    ``"node2vec"``.

    Args:
        project_dir: Directory holding the embedding files.
    """
    # NOTE(review): iterate_datasets is called without dataset=args.dataset
    # here, while run_node2vec below does receive args.dataset — confirm this
    # is intended.
    selected = iterate_datasets()
    selected = selected[args.start:args.end]

    if not args.run:
        logger.info("Use pretrained {} embeddings.".format("node2vec"))
    else:
        logger.info("Running {} embedding programs.".format("node2vec"))
        run_node2vec(dataset=args.dataset,
                     n_jobs=args.n_jobs,
                     fname=selected,
                     start=args.start,
                     end=args.end,
                     times=args.times)
        logger.info("Done node2vec embedding.")

    # The same candidate values are used for both p and q.
    grid = [0.25, 0.5, 1, 2, 4]
    for name, p, q in product(selected, grid, grid):
        logger.info("dataset={}, p={:.2f}, q={:.2f}".format(name, p, q))

        label_edges, label_nodes = load_label_edges(dataset=name)
        train_split, valid_split, test_split = id_map(label_edges[0],
                                                      label_nodes[0])

        emb_path = "{}/{}-{p:.2f}-{q:.2f}.emb".format(project_dir, name,
                                                      p=p, q=q)
        id2idx, embs = load_embeddings(emb_path, skiprows=1)

        # Build (X, y) for train, valid and test, in the positional order
        # lr_evaluate is called with.
        eval_inputs = []
        for split in (train_split, valid_split, test_split):
            eval_inputs.append(edge2tabular(split, id2idx, embs))
            eval_inputs.append(split["label"])

        vauc, acc, f1, auc = lr_evaluate(*eval_inputs)
        write_result(name, "node2vec", {"p": p, "q": q}, (vauc, acc, f1, auc))
Exemplo n.º 4
0
def prepare_dataset(dataset):
    """Load a dataset and re-index its node ids and timestamps.

    The first edge split returned by ``load_split_edges`` is concatenated
    into a single frame (index reset), and the first entry returned by
    ``load_label_edges`` is unpacked into train/validation/test label frames.
    In all four frames the ``from_node_id`` and ``to_node_id`` columns are
    replaced through the ``node_id -> id_map`` lookup built from the node
    table, and ``timestamp`` is shifted by the smallest timestamp found in
    the concatenated edge frame.

    The label frames are modified in place, i.e. the objects handed back by
    ``load_label_edges`` are the ones that are returned.

    Args:
        dataset: Dataset identifier forwarded to ``load_split_edges`` and
            ``load_label_edges``.

    Returns:
        Tuple ``(nodes, edges, train_labels, val_labels, test_labels)``.
    """
    split_edges, split_nodes = load_split_edges(dataset=dataset)
    edges = pd.concat(split_edges[0]).reset_index(drop=True)
    nodes = split_nodes[0]
    label_splits, _ = load_label_edges(dataset=dataset)
    train_labels, val_labels, test_labels = label_splits[0]

    # Column-wise zip builds the same lookup as iterating row tuples, without
    # materialising a namedtuple per node.
    id2idx = dict(zip(nodes["node_id"], nodes["id_map"]))

    frames = (edges, train_labels, val_labels, test_labels)
    for frame in frames:
        for column in ("from_node_id", "to_node_id"):
            frame[column] = frame[column].map(id2idx)

    # Timestamps are shifted (not rescaled): the reference point is the
    # earliest timestamp of the concatenated edge frame, for every frame.
    tmin = edges["timestamp"].min()
    for frame in frames:
        frame["timestamp"] = frame["timestamp"] - tmin

    return nodes, edges, train_labels, val_labels, test_labels
Exemplo n.º 5
0
def evaluate_triad(project_dir="/nfs/zty/Graph/2-DynamicTriad/output"):
    """Score triad embeddings for several step sizes and record the results.

    Reads the module-level ``args`` and ``logger``. The dataset list is
    ``iterate_datasets(dataset=args.dataset)`` sliced with
    ``args.start:args.end``. When ``args.run`` is truthy, ``run_triad`` is
    invoked before evaluation; otherwise only a log line is emitted.

    For every combination of dataset and step size, the directory
    ``<project_dir>/<name>-<stepsize>/`` is listed and the embedding file
    that sorts last in natural (numeric-aware) order is loaded. The
    train/valid/test label edges are turned into feature tables with
    ``edge2tabular`` and the four values returned by ``lr_evaluate`` are
    forwarded to ``write_result`` under the method name ``"triad"``.

    Args:
        project_dir: Directory holding one sub-directory of per-step
            embedding files for each ``<name>-<stepsize>`` pair.

    Raises:
        IndexError: If an embedding directory contains no files.
    """
    import re  # local import: only needed for the natural-sort key below

    def _natural_key(filename):
        # re.split with a capturing group alternates text and digit runs, so
        # odd positions are always the numeric chunks. Comparing them as ints
        # makes e.g. "10.out" sort after "9.out".
        chunks = re.split(r"(\d+)", filename)
        return [int(chunk) if pos % 2 else chunk
                for pos, chunk in enumerate(chunks)]

    fname = iterate_datasets(dataset=args.dataset)
    fname = fname[args.start:args.end]
    if args.run:
        logger.info("Running {} embedding programs.".format(args.method))
        run_triad(dataset=args.dataset,
                  n_jobs=args.n_jobs,
                  fname=fname,
                  start=args.start,
                  end=args.end,
                  times=args.times)
        logger.info("Done training embedding.")
    else:
        logger.info("Use pretrained {} embeddings.".format(args.method))

    for name, stepsize in product(fname, [1, 4, 8]):
        logger.info(name)

        edgel, nodel = load_label_edges(dataset=name)
        train_edges, valid_edges, test_edges = id_map(edgel[0], nodel[0])

        fdir = "{}/{}-{}/".format(project_dir, name, stepsize)
        # os.listdir returns entries in arbitrary order, so the listing is
        # sorted explicitly before picking the final step.
        # NOTE(review): assumes the file with the largest step number is the
        # last time step — confirm against the exporter's naming scheme.
        step_files = sorted(os.listdir(fdir), key=_natural_key)
        if not step_files:
            raise IndexError(
                "no step embedding files found in {}".format(fdir))
        # Only the final step is evaluated, so only that file is loaded.
        id2idx, embeds = load_embeddings(fdir + step_files[-1], skiprows=0)

        X_train = edge2tabular(train_edges, id2idx, embeds)
        y_train = train_edges["label"]
        X_valid = edge2tabular(valid_edges, id2idx, embeds)
        y_valid = valid_edges["label"]
        X_test = edge2tabular(test_edges, id2idx, embeds)
        y_test = test_edges["label"]
        vauc, acc, f1, auc = lr_evaluate(X_train, y_train, X_valid, y_valid,
                                         X_test, y_test)
        # beta_1 and beta_2 are recorded as fixed constants.
        write_result(name, "triad", {
            "beta_1": 0.1,
            "beta_2": 0.1,
            "stepsize": stepsize
        }, (vauc, acc, f1, auc))
Exemplo n.º 6
0
def main(argv=None):
    """Train a ``ModelTrainer`` on ``FLAGS.dataset`` and record its scores.

    The first edge/node split from ``load_split_edges`` feeds the trainer.
    Training runs for at most ``FLAGS.epochs`` epochs and stops early once
    ``EarlyStopMonitor.early_stop_check`` returns a truthy value for the
    validation AUC; in that case the epoch index is stored in
    ``trainer.params["epochs"]``. Afterwards the validation and test label
    edges are scored with ``trainer.test`` and, only when ``FLAGS.epochs`` is
    greater than 1, handed to ``write_result``.

    Args:
        argv: Unused.
    """
    print("Loading training data {}.".format(FLAGS.dataset))
    split_edges, split_nodes = load_split_edges(dataset=FLAGS.dataset)
    graph_edges, graph_nodes = split_edges[0], split_nodes[0]
    label_splits, _ = load_label_edges(dataset=FLAGS.dataset)
    label_splits = label_splits[0]
    print("Done loading training data.")

    # test_ratio is consistent with the comparison experiment
    trainer = ModelTrainer(graph_edges, graph_nodes,
                           val_ratio=0.05, test_ratio=0.25)
    stopper = EarlyStopMonitor()
    if FLAGS.pretrain:
        trainer.restore_models()

    train_count = len(trainer.batch.train_idx)
    print("train numbers %d batch_size %d" % (train_count, FLAGS.batch_size))

    for epoch in range(FLAGS.epochs):
        trainer.train(epoch=epoch)
        val_auc = trainer.valid()
        print(f"val_auc: {val_auc}")
        if not stopper.early_stop_check(val_auc):
            continue
        print(f"No improvement over {stopper.max_round} epochs")
        # Record the epoch at which training was cut short.
        trainer.params["epochs"] = epoch
        break

    _, valid_edges, test_edges = label_splits
    valid_pred = trainer.test(valid_edges)
    test_pred = trainer.test(test_edges)

    # Results are only written for runs configured with more than one epoch.
    if FLAGS.epochs <= 1:
        return
    write_result(valid_edges["label"], valid_pred, test_edges["label"],
                 test_pred, trainer.params)
Exemplo n.º 7
0
def main_sage(argv=None):
    """Train a ``SAGETrainer`` on ``FLAGS.dataset`` and record its scores.

    The first edge/node split from ``load_split_edges`` feeds the trainer.
    Training runs for at most ``FLAGS.epochs`` epochs and stops early once
    ``EarlyStopMonitor.early_stop_check`` returns a truthy value for the
    validation AUC; in that case the epoch index is stored in
    ``trainer.params["epochs"]``. Afterwards the validation and test label
    edges are scored with ``trainer.test`` and handed to ``write_result``.

    Args:
        argv: Unused.
    """
    print("Loading training data {}.".format(FLAGS.dataset))
    split_edges, split_nodes = load_split_edges(dataset=FLAGS.dataset)
    graph_edges, graph_nodes = split_edges[0], split_nodes[0]
    label_splits, _ = load_label_edges(dataset=FLAGS.dataset)
    label_splits = label_splits[0]
    print("Done loading training data.")

    trainer = SAGETrainer(graph_edges, graph_nodes,
                          val_ratio=0.05, test_ratio=0.25)
    stopper = EarlyStopMonitor()

    for epoch in range(FLAGS.epochs):
        trainer.train(epoch=epoch)
        val_auc = trainer.valid()
        print(f"val_auc: {val_auc}")
        if not stopper.early_stop_check(val_auc):
            continue
        print(f"No improvement over {stopper.max_round} epochs")
        # Record the epoch at which training was cut short.
        trainer.params["epochs"] = epoch
        break

    _, valid_edges, test_edges = label_splits
    valid_pred = trainer.test(valid_edges)
    test_pred = trainer.test(test_edges)
    write_result(valid_edges["label"], valid_pred, test_edges["label"],
                 test_pred, trainer.params)