Example #1
def test_feed_dict(self):
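    """Build feed dicts for the held-out train/test node ids (column 0 of each split)."""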
    _, _, test_train, test_test = utils.load_pdata(self.dataset)
    test_train = test_train[:, 0]
    test_test = test_test[:, 0]
    t_train_feed = self.node_feed_dict(test_train)
    t_test_feed = self.node_feed_dict(test_test)
    return t_train_feed, t_test_feed
Example #2
import numpy as np
from itertools import izip  # Python 2; on Python 3 use the built-in zip
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# load_pdata and TopKRanker are repo-local (a TopKRanker sketch follows this example)


def feature_test(dataset, train_embeddings, test_embeddings):
    if dataset == 'cora':
        classes = 7
    elif dataset == 'citeseer':
        classes = 6
    elif dataset == 'pubmed':
        classes = 3
    else:
        raise ValueError('unknown dataset name: {}'.format(dataset))

    _, _, train_data, test_data = load_pdata(dataset)
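    # each row of the splits is [node_id, class_label]; labels are 1-indexed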

    test_l = test_data[:, 1]
    test_label = []
    for i in xrange(test_data.shape[0]):
        temp = [0] * classes
        temp[test_data[i][1] - 1] += 1
        test_label.append(temp)
    test_label = np.array(test_label)  # one-hot labels, shape (num_test, classes)

    train_l = train_data[:, 1]
    train_label = []
    for i in xrange(train_data.shape[0]):
        temp = [0] * classes
        temp[train_data[i][1] - 1] += 1
        train_label.append(temp)
    train_label = np.array(train_label)  # one-hot labels, shape (num_train, classes)

    test_in = np.asarray(test_embeddings)
    train_in = np.asarray(train_embeddings)

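    # convert the one-hot rows back into per-node label lists; TopKRanker's
    # predict() needs the number of true labels for each node (top_k_list)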
    y_train_ = sparse.coo_matrix(train_label)
    y_train = [[] for x in xrange(y_train_.shape[0])]
    cy = y_train_.tocoo()
    for i, j in izip(cy.row, cy.col):
        y_train[i].append(j)

    assert sum(len(l) for l in y_train) == y_train_.nnz

    y_test_ = sparse.coo_matrix(test_label)

    y_test = [[] for x in xrange(y_test_.shape[0])]
    cy = y_test_.tocoo()
    for i, j in izip(cy.row, cy.col):
        y_test[i].append(j)
    y_train = np.array(y_train)
    #y_test = np.array(y_test)

    clf = TopKRanker(LogisticRegression())
    clf.fit(train_in, y_train)

    top_k_list = [len(l) for l in y_test]
    preds = clf.predict(test_in, top_k_list)
    acc = accuracy_score(y_test, preds)
    return acc
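
TopKRanker is not defined in this excerpt. A common implementation (the one popularized by DeepWalk's scoring script; this repo's version may differ in detail) subclasses sklearn's OneVsRestClassifier and keeps, for each sample, the k most probable labels, where k is that sample's number of true labels:

from sklearn.multiclass import OneVsRestClassifier
import numpy as np

class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        # one entry per sample: how many labels to keep for that row
        assert X.shape[0] == len(top_k_list)
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            # indices of the k most probable classes for sample i
            labels = self.classes_[probs[i, :].argsort()[-k:]].tolist()
            all_labels.append(labels)
        return all_labels

In the single-label setting above, every entry of top_k_list is 1, so this reduces to ordinary top-1 prediction.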
Example #3
import numpy as np
from fastdtw import fastdtw
from scipy.stats import pearsonr, spearmanr
# load_pdata and the element-wise `cost` are repo-local (a cost sketch follows)


def compute_correlation(dataset, embeddings, rpr_matrix):
    graph, _, _, _ = load_pdata(dataset)
    eu_dists = []
    stru_dists = []
    for node in graph:
        for nei in graph[node]:
            if node == nei:
                continue
            dist_eu = np.linalg.norm(embeddings[node] - embeddings[nei])
            dist_stru, _ = fastdtw(embeddings[node],
                                   embeddings[nei],
                                   radius=1,
                                   dist=cost)
            eu_dists.append(dist_eu)
            stru_dists.append(dist_stru)
    pear_rho, pear_p = pearsonr(stru_dists, eu_dists)
    spea_rho, spea_p = spearmanr(stru_dists, eu_dists)
    return "P ratio and p: {:.2f} + {:.2f}, S ratio and p: {:.2f} + {:.2f}".format(
        pear_rho, pear_p, spea_rho, spea_p)
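
The element-wise cost passed to fastdtw is defined elsewhere in the repo. fastdtw only needs a distance between two sequence elements; a minimal stand-in (an assumption, not necessarily the repo's choice) is the absolute difference:

def cost(a, b):
    # element-wise distance for fastdtw; the repo may use a different metric
    return abs(a - b)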
Example #4
if __name__ == '__main__':
    prefix = sys.argv[1]
    mask_rate = float(sys.argv[2])
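    # usage (assumed): python test.py <dataset_prefix> <mask_rate>;
    # mask_rate is parsed here but not used in this fragment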
    G, feats, train_data, test_data = load_pdata(prefix)
    features = np.asarray(feats.todense())

    test_id = test_data[:, 0]
    train_id = train_data[:, 0]

    feat_train = []
    feat_test = []
    for id_ in train_id:
        feat_train.append(features[id_])
    for id_ in test_id:
        feat_test.append(features[id_])

    acc_f = feature_test(prefix, feat_train, feat_test)
    print("feats: {:.3f}".format(acc_f))
Example #5
dic_p = sys.argv[2]
emb_dic = {}
with open(dic_p, 'r') as f:
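    # the first line is presumably a word2vec-format header; map each node id
    # to the row index of its embedding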
    k = 0
    for line in f:
        if k == 0:
            k += 1
            continue
        word = int(line.strip().split()[0])
        emb_dic[word] = k
        k += 1
classes = int(sys.argv[4])
_, _, train_data, test_data = load_pdata(sys.argv[3])
index = test_data[:, 0]
test_l = test_data[:, 1]
test_label = []
for i in xrange(test_data.shape[0]):
    temp = [0] * classes
    temp[test_data[i][1] - 1] += 1
    test_label.append(temp)
test_label = np.array(test_label)  # one-hot labels, shape (num_test, classes)

train_index = train_data[:, 0]
train_l = train_data[:, 1]
train_label = []
for i in xrange(train_data.shape[0]):
    temp = [0] * classes
    temp[train_data[i][1] - 1] += 1
    train_label.append(temp)
train_label = np.array(train_label)  # one-hot labels, shape (num_train, classes)
Example #6
def main():
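    """Build or load the Rooted PageRank matrix, degree vectors, and training
    pairs, then train the pretrain/aggregate model with periodic evaluation."""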
    G = read_graph()
    if FLAGS.preprocess:
        print(" - Computing Rooted PageRank matrix...")
        rpr_matrix, pairs, rpr_arg = construct_rpr_matrix(G)
        utils.dump_to_disk(rpr_arg, './var/' + FLAGS.train_prefix + '_rpr_arg')
        print(" - RPR matrix completed.")
        degrees, degree_permuted = utils.create_degree(G)
        print(" - Dumping degree vectors to disk...")
        utils.dump_to_disk(degrees, './var/' + FLAGS.train_prefix + '_degrees')
        utils.dump_to_disk(degree_permuted,
                           './var/' + FLAGS.train_prefix + '_degree_permuted')
        print(" - Degree vectors dumped.")
    else:
        print(" - Loading precomputed Rooted PageRank matrix...")
        rpr_file = './var/' + FLAGS.train_prefix + '_rpr.mat'
        rpr_matrix = sio.loadmat(rpr_file)['rpr_matrix']
        rpr_arg = utils.load_pkl('./var/' + FLAGS.train_prefix + '_rpr_arg')
        print(" - RPR matrix loaded.")
        print(" - Loading Degree vectors...")
        degrees = utils.load_pkl('./var/' + FLAGS.train_prefix + '_degrees')
        degree_permuted = utils.load_pkl('./var/' + FLAGS.train_prefix +
                                         '_degree_permuted')
        print(" - Degree vectors loaded.")
        pairs = []
        with open('./var/' + FLAGS.train_prefix + '_normal_walks.txt',
                  'r') as fp:
            for line in fp:
                n_pair = line.split()
                pairs.append((int(n_pair[0]), int(n_pair[1])))
        print(" - Training pairs loaded")

    placeholders = construct_placeholders()

    minibatch = MinibatchIterator(G,
                                  placeholders,
                                  degrees,
                                  rpr_matrix,
                                  pairs,
                                  batchsize=FLAGS.batchsize,
                                  stru_rate=FLAGS.stru_rate,
                                  dataset=FLAGS.train_prefix)

    _, features, _, _ = utils.load_pdata(FLAGS.train_prefix)
    # TODO: this could be done more efficiently with sparse multiplications
    features = np.asarray(features.todense())

    if FLAGS.PRETRAIN:
        from gensim.models.keyedvectors import KeyedVectors
        n2v_embedding = './baselines/{}_{}.embeddings'.format(
            'node2vec', FLAGS.train_prefix)
        n_model = KeyedVectors.load_word2vec_format(n2v_embedding,
                                                    binary=False)
        pretrained = np.asarray(
            [n_model[str(node)] for node in xrange(rpr_matrix.shape[0])])
        model = PretrainModel(placeholders,
                              features,
                              pretrained,
                              len(G.nodes()),
                              degree_permuted,
                              rpr_matrix,
                              rpr_arg,
                              dropout=FLAGS.dropout,
                              nodevec_dim=FLAGS.dim,
                              lr=FLAGS.learning_rate,
                              logging=True)
    else:
        model = AggregateModel(placeholders,
                               features,
                               len(G.nodes()),
                               degree_permuted,
                               rpr_matrix,
                               rpr_arg,
                               dropout=FLAGS.dropout,
                               nodevec_dim=FLAGS.dim,
                               lr=FLAGS.learning_rate,
                               logging=True)

    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    sess = tf.Session(config=config)
    saver = tf.train.Saver(max_to_keep=5)
    merged = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)

    # Init variables
    sess.run(tf.global_variables_initializer())

    # Train model
    total_steps = 0
    average_time = 0.0
    average_test = 0.0
    test_steps = 0
    epoch_test_acc = [0.0]

    for epoch in xrange(FLAGS.epoch):
        minibatch.shuffle()
        _iter = 0
        print("Epoch : %02d" % (epoch + 1),
              "Batchs per epoch : %04d" % (len(pairs) / FLAGS.batchsize))

        while not minibatch.end():
            feed_dict = minibatch.next_minibatch_feed_dict()
            t = time.time()
            # training step
            outs = sess.run(
                [merged, model.opt_op, model.loss, model.embeddings],
                feed_dict=feed_dict)
            train_cost = outs[2]

            average_time = (average_time * total_steps + time.time() -
                            t) / (total_steps + 1)

            if _iter % FLAGS.verbose == 0:
                if FLAGS.CORR:
                    all_feed = minibatch.all_feed_dict()
                    out = sess.run([
                        model.train_inputs_all, model.train_inputs_f,
                        model.embed, model.loss
                    ],
                                   feed_dict=all_feed)
                    str_corr = test.compute_correlation(
                        FLAGS.train_prefix, out[1], rpr_matrix)
                    print("Epoch: ", '%02d' % (epoch + 1), "iter: ",
                          '%03d' % _iter, "loss: ",
                          "{:.3f}".format(train_cost), "corr: ", str_corr,
                          "train time: ", "{:.3f}".format(average_time))
                else:
                    train_feed, test_feed = minibatch.test_feed_dict()
                    out_train = sess.run([
                        model.train_inputs_all, model.train_inputs_f,
                        model.embed
                    ],
                                         feed_dict=train_feed)
                    t1 = time.time()
                    out_test = sess.run([
                        model.train_inputs_all, model.train_inputs_f,
                        model.embed
                    ],
                                        feed_dict=test_feed)
                    average_test = (average_test * test_steps + time.time() -
                                    t1) / (test_steps + 1)
                    test_steps += 1

                    acc_f = test.feature_test(FLAGS.train_prefix, out_train[1],
                                              out_test[1])
                    epoch_test_acc.append(acc_f)
                    print("Epoch: ", '%02d' % (epoch + 1), "iter: ",
                          '%03d' % _iter, "loss: ",
                          "{:.3f}".format(train_cost), "now acc: ",
                          "{:.3f}".format(epoch_test_acc[-1]), "best acc: ",
                          "{:.3f}".format(max(epoch_test_acc)), "train time: ",
                          "{:.3f}".format(average_time), "test time: ",
                          "{:.3f}".format(average_test))

            _iter += 1
            total_steps += 1
        if epoch % FLAGS.save_per_epoch == 0:
            saver.save(sess, os.path.join(log_dir(), 'model.ckpt'), epoch)
    print("Optimization finished !")