Example #1
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = torch.Tensor(
        [float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()])
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)
        loss = loss_function(preds=recovered,
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                          val_edges_false)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(cur_loss), "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))
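The loss_function helper that Examples #1, #2 and #9 call is not shown on this page. As a reference, here is a minimal sketch of it, assuming the weighted-BCE-plus-KL form of Kipf & Welling's gae-pytorch code; the repo's actual implementation may differ in detail.

import torch
import torch.nn.functional as F

def loss_function(preds, labels, mu, logvar, n_nodes, norm, pos_weight):
    # Weighted binary cross-entropy over all node pairs; pos_weight
    # up-weights the sparse positive (edge) entries.
    cost = norm * F.binary_cross_entropy_with_logits(
        preds, labels, pos_weight=pos_weight)
    # KL divergence between the encoder distribution and the unit
    # Gaussian prior, averaged over nodes.
    KLD = -0.5 / n_nodes * torch.mean(
        torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2), 1))
    return cost + KLD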
Example #2
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features, y_test, tx, ty, test_maks, true_labels = load_data(
        args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Before proceeding further, build the structures needed for deepWalk
    if args.dw == 1:
        print('Using deepWalk regularization...')
        G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * args.number_walks
        print("Number of walks: {}".format(num_walks))
        data_size = num_walks * args.walk_length
        print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.model == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    else:
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.dw == 1:
        sg = SkipGram(args.hidden2, adj.shape[0])
        optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

        # Collect the nodes for the random walks; do it up front since the seed is fixed
        nodes_in_G = list(G.nodes())
        chunks = len(nodes_in_G) // args.number_walks
        random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.epochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # Run the deepWalk regularization (SkipGram updates on random-walk pairs) before the GAE loss step
        if args.dw == 1:
            sg.train()
            if args.full_number_walks > 0:
                walks = build_deepwalk_corpus(G,
                                              num_paths=args.full_number_walks,
                                              path_length=args.walk_length,
                                              alpha=0,
                                              rand=random.Random(SEED))
            else:
                walks = build_deepwalk_corpus_iter(
                    G,
                    num_paths=args.number_walks,
                    path_length=args.walk_length,
                    alpha=0,
                    rand=random.Random(SEED),
                    chunk=epoch % chunks,
                    nodes=nodes_in_G)
            for walk in walks:
                if args.context == 1:
                    # Construct the (center, context) pairs for predicting
                    # context nodes for each node treated as a center word.
                    # Note: the original indexed walk[center_node_pos] before
                    # the loop defined it, so the pair is keyed by the walk's
                    # start node here (matching the else branch below).
                    curr_pair = (int(walk[0]), [])
                    for center_node_pos in range(len(walk)):
                        # for each window position
                        for w in range(-args.window_size,
                                       args.window_size + 1):
                            context_node_pos = center_node_pos + w
                            # make sure we don't jump outside the sentence
                            if context_node_pos < 0 or context_node_pos >= len(
                                    walk
                            ) or center_node_pos == context_node_pos:
                                continue
                            context_node_idx = walk[context_node_pos]
                            curr_pair[1].append(int(context_node_idx))
                else:
                    # first item in the walk is the starting node
                    curr_pair = (int(walk[0]), [
                        int(context_node_idx) for context_node_idx in walk[1:]
                    ])

                if args.ns == 1:
                    neg_nodes = []
                    pos_nodes = set(walk)
                    while len(neg_nodes) < args.walk_length - 1:
                        rand_node = random.randint(0, n_nodes - 1)
                        if rand_node not in pos_nodes:
                            neg_nodes.append(rand_node)
                    neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

                # Do actual prediction
                src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
                tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
                optimizer_dw.zero_grad()
                log_pos = sg(src_node, tgt_nodes, neg_sample=False)
                if args.ns == 1:
                    loss_neg = sg(src_node, neg_nodes, neg_sample=True)
                    loss_dw = log_pos + loss_neg
                else:
                    loss_dw = log_pos
                loss_dw.backward(retain_graph=True)
                cur_dw_loss = loss_dw.item()
                optimizer_dw.step()

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                          val_edges_false)

        if args.dw == 1:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}"
                .format(epoch + 1, cur_loss, cur_dw_loss, ap_curr,
                        time.time() - t))
        else:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
                format(epoch + 1, cur_loss, ap_curr,
                       time.time() - t))

        if (epoch + 1) % 10 == 0:
            tqdm.write("Evaluating intermediate results...")
            kmeans = KMeans(n_clusters=args.n_clusters,
                            random_state=0).fit(hidden_emb)
            predict_labels = kmeans.predict(hidden_emb)
            cm = clustering_metrics(true_labels, predict_labels)
            cm.evaluationClusterModelFromLabel(tqdm)
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
            np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    predict_labels = kmeans.predict(hidden_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    cm.evaluationClusterModelFromLabel(tqdm)

    if args.plot == 1:
        cm.plotClusters(tqdm, hidden_emb, true_labels)
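Example #2 assumes a SkipGram module with the call signature sg(src_node, tgt_nodes, neg_sample=...), returning a scalar loss. A minimal sketch of such a module, assuming dot-product scoring with negated logits for negative samples (a hypothetical stand-in for the repo's version):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGram(nn.Module):
    def __init__(self, emb_dim, num_nodes):
        super().__init__()
        self.in_emb = nn.Embedding(num_nodes, emb_dim)   # center-node vectors
        self.out_emb = nn.Embedding(num_nodes, emb_dim)  # context-node vectors

    def forward(self, src, targets, neg_sample=False):
        # Dot-product score between the source node and each target node.
        scores = (self.in_emb(src) * self.out_emb(targets)).sum(dim=-1)
        if neg_sample:
            scores = -scores  # push sampled negatives apart
        return -F.logsigmoid(scores).mean()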
Example #3
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_corpus(args.dataset_str)
    # n_nodes, feat_dim = features.shape
    # print(n_nodes, feat_dim)
    print(type(features))

    print(adj)
    # print(adj[0], adj[1])
    features = sp.identity(features.shape[0])  # featureless

    # print(adj.shape)
    # print(features.shape)

    # Some preprocessing
    features = preprocess_features(features)

    adj_norm = preprocess_adj(adj)
    num_supports = 1
    # model_func = GCN
    adj_norm = torch.FloatTensor(adj_norm.toarray())
    features = torch.FloatTensor(features.toarray())
    n_nodes, feat_dim = features.shape
    print(n_nodes, feat_dim)
    print(type(features))
    print(type(adj_norm))
    print(features.shape)
    print(adj_norm.shape)
    # n_nodes, feat_dim = features.shape
    # print(n_nodes, feat_dim)
    # Store original adjacency matrix (without diagonal entries) for later
    # adj_orig = adj
    # adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    # adj_orig.eliminate_zeros()

    # modified/added by hollis
    # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    # Remove diagonal elements

    # adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    # adj.eliminate_zeros()
    # Check that diag is zero:
    # assert np.diag(adj.todense()).sum() == 0

    # adj_train = sp.csr_matrix(adj)
    # adj_train = adj_train + adj_train.T

    # Some preprocessing
    # adj_norm = normalize_adj(adj)

    # adj_label = adj_train + sp.eye(adj_train.shape[0])

    # adj_label = sparse_to_tuple(adj_label)

    # adj_label = torch.FloatTensor(adj_label.toarray())
    # adj_label = np.array(adj_label, dtype=float)
    # adj_label = torch.FloatTensor(adj_label)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()

    # added by hollis
    # pos_weight = torch.from_numpy(np.array(pos_weight))

    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    model = GCNModelVAE(feat_dim, args.hidden1, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch in range(args.epochs):
        print("in epoch")
        t = time.time()
        model.train()
        optimizer.zero_grad()
        # recovered, mu, logvar = model(features, adj_norm)

        print("before model")
        recovered, mu, logvar = model(features, adj_norm)

        print("before loss")
        loss = loss_function(preds=recovered,
                             labels=adj_norm,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm)
        #loss = loss_function(preds=recovered, labels=adj_label,
        #                     mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                     norm=norm, pos_weight=pos_weight)
        print("befor backword")
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        hidden_emb = np.array(hidden_emb)

        if epoch == 1:
            fni = "./result/emb_init.txt"
            hidden_emb = np.array(hidden_emb)
            np.savetxt(fni, hidden_emb)

        if epoch == args.epochs - 1:
            fnf = "./result/emb.txt"
            hidden_emb = np.array(hidden_emb)
            np.savetxt(fnf, hidden_emb)

        #roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        print(
            "Epoch:",
            '%04d' % (epoch + 1),
            "train_loss=",
            "{:.5f}".format(cur_loss),
            # "val_ap=", "{:.5f}".format(ap_curr),
            "time=",
            "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")
Example #4
def gae_for(args, position):
    print("Using {} dataset".format(args.dataset_str))
    #qhashes, chashes = load_hashes()
    Q, X = load_data()
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"  #"/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(prebuild,
                                                          Q_features,
                                                          X_features,
                                                          D_features,
                                                          k=5)  # ----> 1M
    #cut_size = 800000
    #adj = adj[:cut_size, :cut_size]
    #adj_Q = adj_Q[:, :cut_size]
    #features = features[:cut_size]
    #Q = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy").T.astype(np.float32)
    #X = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_index_fused.npy").T.astype(np.float32)
    #D = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/revisitop1m/revisitDistractors_fused_3s_cq.npy").T.astype(np.float32)
    #X = np.concatenate((X.T,D.T)).T
    # load the distractor too, shape should be (2048, 1M)
    #adj, features = gen_graph_index(Q, X, k=5, k_qe=3, do_qe=False) #-----> 5k

    #adj_Q, features_Q = gen_graph(Q, X, k=5, k_qe=3, do_qe=False) #generate validation/revop evaluation the same way as training ----> 5k
    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)
    #adj_Q = adj_Q.todense()
    #adj_all = np.concatenate([adj_Q, adj.todense()])
    #adj_all = np.pad(adj_all, [[0,0], [Q.shape[1], 0]], "constant")
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)
    rows, columns = adj_all.nonzero()
    print("Making Symmetry")
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break
    #adj_all = sp.csr_matrix(adj_all)
    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)
    #adj = add_neighbours_neighbour(adj)
    #adj1, features1 = load_data(args.dataset_str)
    features = torch.from_numpy(features)
    #features_all = torch.from_numpy(features_all)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig

    #adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(
        adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    #adj_norm_label = preprocess_graph_sp(adj)

    adj_label = adj_train + sp.eye(
        adj_train.shape[0]
    )  #adj_norm_label + sp.eye(adj_train.shape[0]) #adj_train + sp.eye(adj_train.shape[0])
    #rows, columns = adj_label.nonzero()
    #adj_label[columns, rows] = adj_label[rows, columns]
    # adj_label = sparse_to_tuple(adj_label)
    #adj_label = torch.FloatTensor(adj_label.toarray())

    print("adj sum: " + str(adj.sum()))
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] -
                       adj.sum()) / adj.sum()
    print("top part: " +
          str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos wieght: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # for validation data processing:
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()
    val_edges = []
    val_edges_false = []
    pos = {}
    print("getting positive edges")
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validtion: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])
    #for i in range(rows.shape[0]):
    #    sys.stdout.write("\r sampling edges for validation: [" + str(i) + "/" + str(adj_val.shape[0]) + "]")
    #    sys.stdout.flush()
    #    if rows[i] < adj_val.shape[0]:
    #        val_edges.append((rows[i], columns[i]))
    #        if rows[i] not in pos:
    #            pos[rows[i]] = []
    #        pos[rows[i]].append(columns[i])
    #        adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
    #    else:
    #        break
    step = 0
    neg_per_pos = 100
    #for r in pos:
    #    p = pos[r]
    #neg_edges = Parallel(n_jobs=40)(delayed(neg_sample)(pos[i], adj_val.shape[1], neg_per_pos, i) for i in pos)
    #val_edges_false = [(i, item) for i in range(len(neg_edges)) for item in neg_edges[i]]
    #a = np.random.permutation(adj_val.shape[1])
    #a = [i for i in a if i not in p]
    #a = a[:100]
    #for i in a:
    #    val_edges_false.append((r, i))
    ##count = 0
    ##i = 0
    #sys.stdout.write("\r sampling neg edges for validtion: [" + str(step) + "/" + str(len(pos)) + "]")
    #sys.stdout.flush()
    #step += 1
    #while count < 100:
    #    if a[i] not in p:
    #        val_edges_false.append((r, a[i]))
    #        count += 1
    #    i += 1

    print("preprocessing adj_evaluate")
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    #adj_evaluate_norm_label = preprocess_graph_sp(adj_evaluate)

    adj_label_evaluate = adj_evaluate + sp.eye(
        adj_evaluate.shape[0]
    )  #adj_evaluate_norm_label+ sp.eye(adj_evaluate.shape[0]) #adj_evaluate + sp.eye(adj_evaluate.shape[0])
    #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray()) #sparse_mx_to_torch_sparse_tensor(adj_label_evaluate)
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        #adj_label = torch.FloatTensor(adj_label.toarray())
        #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2,
                                  args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2,
                                 args.dropout).cuda()
    #model = torch.nn.DataParallel(model)
    #model = model.cuda()

    # train_dataset = GAEDataset(adj_norm, adj_label, features)
    # train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    #optimizer = pSGLD(model.parameters(), lr=args.lr)
    #optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

    hidden_emb = None
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))
    #ipdb.set_trace()
    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())
    for epoch in range(args.epochs):
        #t = time.time()
        model.train()

        lossVal = 0
        lossValNorm = 0
        backtime = time.time()
        for batchID, (inds) in enumerate(train_loader):
            z = model(features, adj_norm)
            inds = inds[0]
            adj = F.relu(torch.mm(z[inds], z[inds].t()))
            preds = adj
            label_batch = torch.FloatTensor(adj_label[inds, :][:,
                                                               inds].toarray())

            cost = norm * F.binary_cross_entropy_with_logits(
                preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1

            optimizer.zero_grad()

            cost.backward(retain_graph=True)

            # if batchID == 0:
            #     cost.backward(retain_graph=True)
            # else:
            #     cost.backward()

            optimizer.step()
            if batchID >= 10:
                break

        backtime_done = time.time() - backtime
        sys.stdout.write("\r time taken to do epoch: " + str(backtime_done) +
                         "   opt: " + str(time.time() - backtime) + "\n")
        sys.stdout.flush()

        #optimizer.step()
        # sample rows only: non-square
        #for i in range(0, d.shape[0], sample_size):
        #    selection = random_perm[i:i+sample_size]
        #    to_keep = selection
        #    sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :]))
        #    sample_adj = adj[to_keep, :]
        #    sample_features = features
        #    recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #    loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                         norm=norm, pos_weight=pos_weight)
        #    loss.backward()
        #    cur_loss = loss.item()
        #    optimizer.step()
        #    sys.stdout.write("\r                                                                                                                                       ")
        #    sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]))
        #    sys.stdout.flush()

        # sample rows + take their positives and add to rows (make it square)
        #for i in range(0, d.shape[0], sample_size):
        #    selection = random_perm[i:i+sample_size]
        #    to_keep = np.nonzero(d[selection, :])
        #    # (array([0, 1, 2, 2]), array([0, 1, 0, 1]))
        #    the_set = set(list(to_keep[0]) + list(to_keep[1]))
        #    temp = set(list(to_keep[0]))
        #    to_keep = list(the_set - column_exclude)
        #    # column_exclude.union(temp)
        #    # these are the rows and columns that we need to select
        #    sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :][:,to_keep]))
        #    sample_features = features[to_keep, :]
        #    sample_adj_label = adj_label[to_keep, :][:,to_keep]
        #    #print(samplei_adj_norm.shape)
        #    #print(sample_features.shape)
        #    #print(sample_adj_label.shape)
        #    #print(sample.shape)
        #    sample_adj = adj[to_keep, :][:,to_keep]
        #    pos_weight = float(sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) / sample_adj.sum()
        #    pos_weight = torch.from_numpy(np.array(pos_weight))
        #    norm = sample_adj.shape[0] * sample_adj.shape[0] / float((sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) * 2)

        #    n_nodes, feat_dim = sample_features.shape

        #    if mode == "VAE":
        #        recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #        #recovered = recovered[i:i+500]
        #        #sample_adj_label = sample_adj_label[i:i+500]
        #        loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                             mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                             norm=norm, pos_weight=pos_weight)
        #    elif mode == "AE":
        #        recovered = model(sample_features, sample_adj_norm)
        #        loss = loss_function_ae(preds=recovered, labels=sample_adj_label,
        #                                norm=norm, pos_weight=pos_weight)
        #    loss.backward()
        #    cur_loss = loss.item()
        #    optimizer.step()
        #    sys.stdout.write("\r                                                                                                                                       ")
        #    sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]) + "]....size of sample=" + str(len(sample_features)))
        #    sys.stdout.flush()

        sys.stdout.write(
            "\r                                                                                                                                          \r"
        )
        if (epoch + 1) % 1 == 0:
            model.eval()
            #adj_dense = adj_train.todense()
            #adj_val_dense = adj_val.todense()
            #adj_train_ext = np.pad(adj_dense, [[0,0], [adj_val_dense.shape[0], 0]], "constant")
            #adj_evaluate = np.concatenate([adj_val_dense, adj_train_ext])
            #zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
            #adj_train_ext = sp.hstack((zero, adj_train))
            #adj_evaluate = sp.vstack((adj_val, adj_train_ext))
            ##zeros = sp.csr_matrix((adj_evaluate.shape[0], adj_val.shape[1]))
            ##adj_evaluate = sp.hstack((zeros, adj_evaluate))
            #adj_evaluate = sp.csr_matrix(adj_evaluate)
            #rows, columns = adj_evaluate.nonzero()
            #for i in range(rows.shape[0]):
            #    if rows[i] < adj_val.shape[1]:
            #        adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
            #    else:
            #        break

            #adj_evaluate_norm = preprocess_graph(adj_evaluate)

            #adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
            #adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
            ##adj_label_evaluate = sparse_to_tuple(adj_label_evaluate)

            #features_evaluate = np.concatenate([features, features_valid])
            #features_evaluate = torch.from_numpy(features_evaluate)
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            #recovered, mu, logvar = model(features_evaluate, just_adj_evaluate.coalesce().indices(), just_adj_evaluate.coalesce().values())
            #recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            #val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                         norm=norm, pos_weight=pos_weight)
            # if mode == "VAE":
            #     recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                              mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                              norm=norm, pos_weight=pos_weight)
            # elif mode == "AE":
            #     mu = model(features_evaluate, adj_evaluate_norm)
            #     val_loss = loss_function_ae(preds=recovered, labels=adj_label_evaluate,
            #                              norm=norm, pos_weight=pos_weight)
            # elif mode == "VAE_batch":
            #     recovered, mu, logvar, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # elif mode == "AE_batch":
            #     recovered, mu, val_loss = model(features_evaluate, adj_evaluate_norm, labels=adj_label_evaluate, n_nodes=n_nodes, norm=norm, pos_weight=pos_weight, opt=None, training=False)
            # val_emb = mu.data.numpy()
            #roc_curr, ap_curr = get_roc_score(val_emb, val_edges, val_edges_false)

            # do one q at a time
            #revop_map = eval_each_q(model, adj_all, features_all, Q.shape[1])

            # hack by appending stuff on top of adj
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            elif mode == "AE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            hidden_emb = mu.data.numpy()
            ## get validation loss
            #recovered, mu, logvar = model(features, adj_norm)
            #val_loss = loss_function(preds=recovered, labels=adj_label,
            #                         mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                         norm=norm, pos_weight=pos_weight)
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])

            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
                # write it into a file and do egt on that
                #embQ = emb[:Q_end,:].T
                #embX = emb[Q_end:,:].T
                #np.save("/media/jason/28c9eee1-312e-47d0-88ce-572813ebd6f1/graph/gae-pytorch/best_embedding2.npy",hidden_emb)
                #concat = np.concatenate((embQ.T,embX.T))
                #revop_inner_prod = np.matmul(concat, concat.T)
                #revop_preds = np.argsort(-revop_inner_prod,axis=0)
                #if revop_map > 54:
                #    f = open("best_result.txt", "w")
                #    for i in range(revop_preds.shape[1]):
                #        if i < Q_end:
                #            f.write(qhashes[i] + ",")
                #        else:
                #            f.write(chashes[i - Q_end] + ",")
                #        for j in revop_preds[:,i]:
                #            if j < Q_end:
                #                f.write(qhashes[j] + " " + str(int(revop_inner_prod[j,i] * 1000)) + " ")
                #            else:
                #                f.write(chashes[j - Q_end] + " " + str(int(revop_inner_prod[j,i] * 1000)) + " ")
                #        f.write("\n")
                #        f.flush()
                #        #for j in range()
                #    f.close()

            if best_val_cost > -99.0:  #prev_val_loss - val_loss > 0 and prev_val_loss - val_loss > prev_loss - cur_loss and best_val_cost > val_loss:
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map

            #if best_val_roc < roc_curr:
            #    best_val_roc = roc_curr
            #    best_val_roc_revop = revop_map

            #if best_val_ap < ap_curr:
            #    best_val_ap = ap_curr
            #    best_val_ap_revop = revop_map
            print(
                "Epoch:",
                '%04d' % (epoch + 1),
                "train_loss=",
                "{:.5f}".format(lossVal / lossValNorm),
                "val_loss=",
                "{:.5f}".format(-99.0),
                #"val_roc_curr=", "{:.5f}".format(roc_curr),
                #"val_ap_curr=", "{:.5f}".format(ap_curr),
                "revop=",
                "{:.5f}".format(revop_map),
                "best_revop=",
                "{:.5f}".format(best),
                "revop_at_best_val=",
                "{:.5f}".format(best_val_epoch_revop),
                #"revop_at_best_val_roc=", "{:.5f}".format(best_val_roc_revop),
                #"revop_at_best_ap_roc=", "{:.5f}".format(best_val_ap_revop),
                "time=",
                "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            t = time.time()
    print("Optimization Finished!")

    #roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    #print('Test ROC score: ' + str(roc_score))
    #print('Test AP score: ' + str(ap_score))
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
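Example #4 converts scipy matrices to torch sparse tensors via sparse_mx_to_torch_sparse_tensor. A minimal sketch of that helper in the form commonly used in pygcn-style code (the exact dtype handling here is an assumption):

import numpy as np
import scipy.sparse as sp
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    # COO layout maps directly onto torch's sparse tensor format.
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))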
Example #5
    def __init__(self, graph_edgelist, num_actions, dimension, learning_rate=0.01, epochs=300, hidden1=32, hidden2=16,
                 dropout=0., model_str='gcn_vae', use_features=0):

        """Initialize ExactBasis."""
        if graph_edgelist is None:
            raise ValueError('graph cannot be None')

        if dimension < 1:
            raise ValueError('dimension must be >= 1')

        self.__num_actions = BasisFunction._validate_num_actions(num_actions)

        self._dimension = dimension

        adj, features = self.read_graph(graph_edgelist)

        # Store original adjacency matrix (without diagonal entries) for later
        adj_orig = adj
        adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
        adj_orig.eliminate_zeros()

        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
        # adj = adj_train

        if use_features == 0:
            features = sp.identity(features.shape[0])  # featureless

        # Some preprocessing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        model = None
        if model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero, hidden1, hidden2, dimension)
        elif model_str == 'gcn_vae':
            model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, hidden1, dimension)

        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            if model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                              validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm, learning_rate=learning_rate)
            elif model_str == 'gcn_vae':
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                               validate_indices=False), [-1]),
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm, learning_rate=learning_rate)

        # Initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        # Train model
        for epoch in range(epochs):
            t = time.time()
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            feed_dict.update({placeholders['dropout']: dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        print("GCN Optimization Finished!")

        feed_dict.update({placeholders['dropout']: 0})
        self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
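The TensorFlow examples here and below build their feed dictionaries with construct_feed_dict. A minimal sketch, assuming it simply maps the three sparse tuples onto the placeholders as in Kipf's reference gae code (the local version may add keys):

def construct_feed_dict(adj_normalized, adj_label, features, placeholders):
    # Map each preprocessed input onto its placeholder.
    feed_dict = dict()
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['adj']: adj_normalized})
    feed_dict.update({placeholders['adj_orig']: adj_label})
    return feed_dict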
Example #6
def gae(filename, output_dir):

    # Settings
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0.,
                       'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0,
                         'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    # dataset_str = FLAGS.dataset

    # Load data
    # adj, features = load_data(dataset_str)
    adj, R, edges = load_network_data(filename)

    num_edges = np.sum(adj)
    length = adj.shape[0]
    A = np.array(adj, copy=True)
    adj = sp.csr_matrix(adj)
    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges = mask_test_edges(adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(adj.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(
                                  tf.sparse_tensor_to_dense(
                                      placeholders['adj_orig'],
                                      validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(
                                   tf.sparse_tensor_to_dense(
                                       placeholders['adj_orig'],
                                       validate_indices=False), [-1]),
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)
        # Compute average loss
        # avg_cost = outs[1]
        # avg_accuracy = outs[2]
        #
        # if (epoch + 1) % 10 == 0:
        #     print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
        #           "train_acc=", "{:.5f}".format(avg_accuracy), "time=", "{:.5f}".format(time.time() - t))

    print("GAE Optimization Finished!")

    feed_dict.update({placeholders['dropout']: 0})
    emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on test set of edges
    adj_rec = np.dot(emb, emb.T)
    adj_rec = np.array(adj_rec)
    # adj_rec = adj_rec[1:length, :][:, 1:length]
    DD = np.sort(adj_rec.flatten())
    threshold = DD[int(-1 * num_edges)]
    network_C = np.array([[
        0 if adj_rec[i, j] < threshold else 1 for i in range(adj_rec.shape[0])
    ] for j in range(adj_rec.shape[1])],
                         dtype=np.int8)
    # np.save('../data/GAE_network.npy', network_C[1:length, :][:, 1:length])
    os.chdir('../')
    np.save('{}/GAE_network.npy'.format(output_dir),
            network_C[1:length, :][:, 1:length])

    A_copy = adj_rec
    final_network = [A_copy]
    # orinal_network = [A]
    for i in range(1, 5):
        adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
        R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        A_copy = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix),
                                    tf.transpose(R_matrix)),
                          feed_dict={
                              R_matrix: R[i - 1, 0].todense(),
                              adjacent_matrix: A_copy
                          })
        final_network.append(np.array(A_copy))

        # adjacent_matrix = tf.placeholder(tf.float32, shape=A.shape)
        # R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        # A = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix), tf.transpose(R_matrix)),
        #                        feed_dict={R_matrix: R[i - 1, 0].todense(), adjacent_matrix: A})
        # orinal_network.append(A)
    # draw_graph(final_network, edges, output_dir)
    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    DD = np.sort(network_B.flatten())[::-1]
    threshold = DD[edges[0, 0]]
    network_C = np.array([[
        0 if network_B[i, j] < threshold else 1
        for i in range(network_B.shape[0])
    ] for j in range(network_B.shape[1])])
    _A_obs = network_C + network_C.T
    _A_obs[_A_obs > 1] = 1
    _A_obs = np.array(_A_obs)
    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir),
              'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
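The nested list comprehensions that binarize adj_rec and network_B above run O(n^2) Python loops, and because j is the outer loop variable while [i, j] indexes the scores, they actually build the transpose of the thresholded matrix. Assuming adj_rec and threshold from the example, a vectorized equivalent (transpose included) is:

import numpy as np

# 1 where the score clears the threshold, 0 elsewhere; transposed to match
# the original's column-major construction.
network_C = (adj_rec >= threshold).astype(np.int8).T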
Example #7
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
    # 'features_nonzero': tf.placeholder_with_default(features_nonzero, shape=()),
    # 'num_nodes': tf.placeholder_with_default(num_nodes, shape=())
}

# How much to weigh positive examples (true edges) in the cost function
  # Want to weigh less-frequent classes higher, so as to prevent model output bias
  # pos_weight = (num. negative samples) / (num. positive samples)
pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()

# normalize (scale) average weighted cost
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
 
# Create VAE model
model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                   HIDDEN1_DIM, HIDDEN2_DIM)

opt = OptimizerVAE(preds=model.reconstructions,
                           labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                       validate_indices=False), [-1]),
                           model=model, num_nodes=num_nodes,
                           pos_weight=pos_weight,
                           norm=norm,
                           learning_rate=LEARNING_RATE)

cost_val = []
acc_val = []
val_roc_score = []

# Initialize session
sess = tf.Session()
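To make the two constants concrete: for a hypothetical graph with 100 nodes whose adjacency sums to 500, the formulas above give

n, s = 100, 500.0                 # hypothetical node count and adjacency sum
pos_weight = (n * n - s) / s      # (10000 - 500) / 500 = 19.0
norm = n * n / ((n * n - s) * 2)  # 10000 / 19000 ~= 0.526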
Example #8
def gae_scores(
    adj_sparse,
    train_test_split,
    features_matrix=None,
    LEARNING_RATE = 0.01,
    EPOCHS = 250,
    HIDDEN1_DIM = 32,
    HIDDEN2_DIM = 16,
    DROPOUT = 0,
    edge_score_mode="dot-product",
    verbose=1,
    dtype=tf.float32
    ):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack train-test split

    if verbose >= 1:
        print('GAE preprocessing...')

    # start_time = time.time()

    # Train on the CPU (hide the GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Feature conversion: dense matrix --> sparse matrix --> tuple
    # The feature tuple contains: (coordinate list, value list, matrix shape)
    if features_matrix is None:
        x = sp.lil_matrix(np.identity(adj_sparse.shape[0]))
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Get graph properties (to feed into the model)
    num_nodes = adj_sparse.shape[0] # number of nodes in the adjacency matrix
    num_features = features_shape[1] # number of features (columns of the feature matrix)
    features_nonzero = features_tuple[1].shape[0] # number of non-zero entries in the feature matrix (i.e. length of the value list)

    # Store the original adjacency matrix (without diagonal entries) for later
    adj_orig = deepcopy(adj_sparse)
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Normalize the adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Add the diagonal entries back
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in the cost function
      # Want to weigh less-frequent classes higher, so as to prevent model output bias
      # pos_weight = (num. negative samples) / (num. positive samples)
    pos_weight = float(adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) / adj_sparse.sum()

    # normalize (scale) average weighted cost
    norm = adj_sparse.shape[0] * adj_sparse.shape[0] / float((adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create the VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                       HIDDEN1_DIM, HIDDEN2_DIM, dtype=dtype, flatten_output=False)

    opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
                               # labels=placeholders['adj_orig'],
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm,
                               learning_rate=LEARNING_RATE,
                               dtype=tf.float32)

    cost_val = []
    acc_val = []
    val_roc_score = []

    prev_embs = []

    # Initialize the session
    sess = tf.Session()

    if verbose >= 1:
        # Print all trainable variables
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            print("Variable shape: ", shape)
            variable_parameters = 1
            for dim in shape:
                print("Current dimension: ", dim)
                variable_parameters *= dim.value
            print("Variable params: ", variable_parameters)
            total_parameters += variable_parameters
            print('')
        print("TOTAL TRAINABLE PARAMS: ", total_parameters)

        print('Initializing TF variables...')

    sess.run(tf.global_variables_initializer())

    if verbose >= 1:
        print('Starting GAE training!')

    start_time = time.time()
    # Train the model
    train_loss = []
    train_acc = []
    val_roc = []
    val_ap = []
    for epoch in range(EPOCHS):

        t = time.time()
        # Construct the feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple, placeholders)
        feed_dict.update({placeholders['dropout']: DROPOUT})
        # Run a single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        # Evaluate predictions
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

        prev_embs.append(gae_emb)

        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false, gae_score_matrix, apply_sigmoid=True)
        val_roc_score.append(roc_curr)

        # Print results at each iteration
        # if verbose == 2:
        #     print(("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
        #       "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
        #       "val_ap=", "{:.5f}".format(ap_curr),
        #       "time=", "{:.5f}".format(time.time() - t)))

        train_loss.append(avg_cost)
        train_acc.append(avg_accuracy)
        val_roc.append(val_roc_score[-1])
        val_ap.append(ap_curr)

    # Plot training loss/accuracy and validation AUC/AP over training
    #draw_gae_training('hamster', EPOCHS, train_loss, train_acc, val_roc, val_ap)
    runtime = time.time() - start_time
    if verbose == 2:
        print("Optimization Finished!")

    # Print final results
    feed_dict.update({placeholders['dropout']: 0})
    gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    # Dot-product edge scores
    if edge_score_mode == "dot-product":
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        # runtime = time.time() - start_time

        # Compute final scores
        gae_val_roc, gae_val_ap = get_roc_score(val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(test_edges, test_edges_false, gae_score_matrix)
    
    # Bootstrapped edge embeddings (via the Hadamard product)
    elif edge_score_mode == "edge-emb":
        def get_edge_embeddings(edge_list):
            embs = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = gae_emb[node1]
                emb2 = gae_emb[node2]
                edge_emb = np.multiply(emb1, emb2)
                embs.append(edge_emb)
            embs = np.array(embs)
            return embs

        # Training-set edge embeddings
        pos_train_edge_embs = get_edge_embeddings(train_edges)
        neg_train_edge_embs = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])

        # Create training-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Validation-set edge embeddings and labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs = get_edge_embeddings(val_edges)
            neg_val_edge_embs = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings and labels
        pos_test_edge_embs = get_edge_embeddings(test_edges)
        neg_test_edge_embs = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

        # Create test-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train a logistic regression classifier on the training-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0, solver='liblinear')
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being class 1 (a real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        #runtime = time.time() - start_time

        # Compute scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            gae_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_roc_curve = None
            gae_val_ap = None
        
        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        gae_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        gae_test_pr_curve = precision_recall_curve(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores
    gae_scores = {}

    gae_scores['test_roc'] = gae_test_roc
    gae_scores['test_ap'] = gae_test_ap

    gae_scores['val_roc'] = gae_val_roc
    gae_scores['val_ap'] = gae_val_ap

    if edge_score_mode == "edge-emb":
        gae_scores['test_roc_curve'] = gae_test_roc_curve
        gae_scores['val_roc_curve'] = gae_val_roc_curve
        gae_scores['test_pr_curve'] = gae_test_pr_curve

    gae_scores['val_roc_per_epoch'] = val_roc_score
    gae_scores['runtime'] = runtime
    return gae_scores
Example #9
0
def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    lst_result = []
    for i in range(10):
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        hidden_emb = None
        max_roc_ap = 0
        for epoch in range(args.epochs):
            t = time.time()
            model.train()
            optimizer.zero_grad()
            recovered, mu, logvar = model(features, adj_norm)
            loss = loss_function(preds=recovered,
                                 labels=adj_label,
                                 mu=mu,
                                 logvar=logvar,
                                 n_nodes=n_nodes,
                                 norm=norm,
                                 pos_weight=pos_weight)
            loss.backward()
            cur_loss = loss.item()
            optimizer.step()

            hidden_emb = mu.data.numpy()

            roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                              val_edges_false)
            roc_ap = roc_curr + ap_curr
            if max_roc_ap < roc_ap:
                max_roc_ap = roc_ap
                h_emb_best_model = hidden_emb

            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(cur_loss), "val_ap=",
                  "{:.5f}".format(ap_curr), "time=",
                  "{:.5f}".format(time.time() - t))
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))
            print("---------------------------------------")
        print("Optimization Finished!: ", i)
        roc_score, ap_score = get_roc_score(h_emb_best_model, adj_orig,
                                            test_edges, test_edges_false)

        # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        lst_result.append([i, roc_score, ap_score])

        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))
    lst_result = np.array(lst_result)
    csv_info = np.append(
        lst_result,
        [["mean", np.mean(lst_result[:, 1]),
          np.mean(lst_result[:, 2])]],
        axis=0)
    csv_info = np.append(
        csv_info,
        [["std", np.std(lst_result[:, 1]),
          np.std(lst_result[:, 2])]],
        axis=0)

    t = int(time.time())
    folder = Path(os.path.join(os.getcwd(), "csv"))
    folder.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists
    csv_name = "{}_{}_{}_{}_{}.csv".format(args.dataset_str, args.epochs,
                                           args.hidden1, args.hidden2, t)

    df = pd.DataFrame(csv_info, columns=['run', 'ROC', "AP"])
    df.to_csv(os.path.join(folder, csv_name))
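
    # Sketch of the resulting CSV layout (values illustrative, not from a real run):
    #   ,run,ROC,AP
    #   0,0,0.91,0.92
    #   ...
    #   10,mean,0.90,0.91
    #   11,std,0.01,0.01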
Example #10
0
    def gcn_multilayer(self):
        """Neural embedding of a multilayer network"""
        all_nodes = self.get_all_nodes()
        tmp_fname = pjoin(self.out_dir, 'tmp.emb')
        for net_name, net in self.nets.items():
            self.log.info('Run GCN For Net: %s' % net_name)
            # =============================================================
            adjacency_matrix = nx.adjacency_matrix(net)
            adjacency_matrix = adjacency_matrix.todense()
            nodes_count = adjacency_matrix.shape[0]
            adj = adjacency_matrix
            features = sp.identity(nodes_count)
            adj = sp.csr_matrix(adj)
            # ----------------myCode-----------------------------------
            # Store original adjacency matrix (without diagonal entries) for later
            adj_orig = adj
            adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
            adj_orig.eliminate_zeros()
            # tst_actual_matrix = adj.toarray()
            adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
            adj = adj_train
            # -----------------------------myCode-------------------------
            # if FLAGS.features == 0:
            #    features = sp.identity(features.shape[0])  # featureless
            # -----------------------------myCode-------------------------
            # Some preprocessing
            adj_norm = preprocess_graph(adj)
            # Define placeholders
            placeholders = {
                'features': tf.sparse_placeholder(tf.float32),
                'adj': tf.sparse_placeholder(tf.float32),
                'adj_orig': tf.sparse_placeholder(tf.float32),
                'dropout': tf.placeholder_with_default(0., shape=())
            }
            num_nodes = adj.shape[0]
            features = sparse_to_tuple(features.tocoo())
            num_features = features[2][1]
            features_nonzero = features[1].shape[0]
            # Create model
            model = None
            if self.model_str == 'gcn_ae':
                model = GCNModelAE(placeholders, num_features, features_nonzero, self.hidden1, self.hidden2)
            elif self.model_str == 'gcn_vae':
                model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, self.hidden1, self.hidden2)

            pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
            norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

            # Optimizer
            with tf.name_scope('optimizer'):
                if self.model_str == 'gcn_ae':
                    opt = OptimizerAE(preds=model.reconstructions,
                                      labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                                  validate_indices=False), [-1]),
                                      pos_weight=pos_weight,
                                      norm=norm)
                elif self.model_str == 'gcn_vae':
                    opt = OptimizerVAE(preds=model.reconstructions,
                                       labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                                   validate_indices=False), [-1]),
                                       model=model, num_nodes=num_nodes,
                                       pos_weight=pos_weight,
                                       norm=norm)

            # Initialize session
            sess = tf.Session()
            sess.run(tf.global_variables_initializer())

            cost_val = []
            acc_val = []

            def get_roc_score(edges_pos, edges_neg, emb=None):
                if emb is None:
                    feed_dict.update({placeholders['dropout']: 0})
                    emb = sess.run(model.z_mean, feed_dict=feed_dict)

                def sigmoid(x):
                    return 1 / (1 + np.exp(-x))

                # Predict on test set of edges
                adj_rec = np.dot(emb, emb.T)
                preds = []
                pos = []
                for e in edges_pos:
                    preds.append(sigmoid(adj_rec[e[0], e[1]]))
                    pos.append(adj_orig[e[0], e[1]])

                preds_neg = []
                neg = []
                for e in edges_neg:
                    preds_neg.append(sigmoid(adj_rec[e[0], e[1]]))
                    neg.append(adj_orig[e[0], e[1]])

                preds_all = np.hstack([preds, preds_neg])
                labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
                roc_score = roc_auc_score(labels_all, preds_all)
                ap_score = average_precision_score(labels_all, preds_all)

                return roc_score, ap_score
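
            # Illustrative check (assumed numbers): with preds_all = [0.9, 0.8, 0.3, 0.2]
            # and labels_all = [1, 1, 0, 0] the ranking is perfect, so both
            # roc_auc_score and average_precision_score return 1.0.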

            val_roc_score = []
            adj_label = adj_train + sp.eye(adj_train.shape[0])
            adj_label = sparse_to_tuple(adj_label)
            # Train model
            # for epoch in range(FLAGS.epochs):
            # epochs = 10
            dropout = 0
            for epoch in range(self.n_iter):
                self.log.info('Iteration: %d' % epoch)
                t = time.time()
                # Construct feed dictionary
                feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
                # feed_dict.update({placeholders['dropout']: FLAGS.dropout})
                # -----------myCode------------
                feed_dict.update({placeholders['dropout']: dropout})
                # -----------myCode------------
                # Run single weight update
                outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

                # Compute average loss
                avg_cost = outs[1]
                avg_accuracy = outs[2]

                roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
                val_roc_score.append(roc_curr)

                print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

            print("Optimization Finished!")
            roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))

            # ------vector generation -----------------------------
            vectors = sess.run(model.embeddings, feed_dict=feed_dict)
            fname = self.out_dir + net_name + 'vectors.txt'
            # with open(fname, 'a+') as fout:
            #     for line in np.array(vectors):
            #         fout.write(line + "\n")
            np.savetxt(fname, np.array(vectors), fmt="%s", delimiter='  ')
            self.log.info('Saving vectors: %s' % fname)
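            # Read-back sketch (illustrative): the saved matrix can be reloaded
            # later with emb = np.loadtxt(fname); expected shape (num_nodes, hidden2).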
            # ==============================================================
            self.log.info('after exec gcn : %s' % net_name)

        self.log.info('Done!')
Example #11
0
    def run(self):
        if self.file_expr == '':
            # text-image-code combination
            n_by_n, x_train, y_train, train_mask, val_mask, test_mask, idx_supernodes, label_encoder = graph_generator.load_combo(
                self.labels_dict)
        else:
            n_by_n, x_train, y_train, train_mask, val_mask, test_mask, idx_supernodes, label_encoder = graph_generator.load_data(
                self.labels_dict,
                self.file_expr,
                min_valid_triples=self.min_valid_triples,
                sep=self.file_sep,
                select_rels=self.select_rels)
        self.idx_supernodes = idx_supernodes
        adj = nx.adjacency_matrix(nx.from_scipy_sparse_matrix(
            n_by_n))  #nx.adjacency_matrix(nx.from_numpy_array(n_by_n))
        features = scipy.sparse.csr_matrix(x_train)

        # Store original adjacency matrix (without diagonal entries) for later
        adj_orig = adj
        adj_orig = adj_orig - sp.dia_matrix(
            (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
        adj_orig.eliminate_zeros()
        self.adj_orig = adj_orig

        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges2(
            adj)
        adj = adj_train
        # Some preprocessing
        adj_norm = preprocess_graph(adj)
        num_nodes = adj.shape[0]

        if not self.use_features:
            features = sp.identity(features.shape[0])  # featureless
        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        if model_str == 'gcn_ae':
            self.model = GCNModelAE(self.placeholders, num_features,
                                    features_nonzero)
        elif model_str == 'gcn_vae':
            self.model = GCNModelVAE(self.placeholders, num_features,
                                     num_nodes, features_nonzero)

        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        norm = adj.shape[0] * adj.shape[0] / float(
            (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            if model_str == 'gcn_ae':
                opt = OptimizerAE(preds=self.model.reconstructions,
                                  labels=tf.reshape(
                                      tf.sparse_tensor_to_dense(
                                          self.placeholders['adj_orig'],
                                          validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm)
            elif model_str == 'gcn_vae':
                opt = OptimizerVAE(preds=self.model.reconstructions,
                                   labels=tf.reshape(
                                       tf.sparse_tensor_to_dense(
                                           self.placeholders['adj_orig'],
                                           validate_indices=False), [-1]),
                                   model=self.model,
                                   num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        # Initialize session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        val_roc_score = []
        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        #import datetime
        #log_dir="logs/gae/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

        # Train model
        for epoch in range(self.epochs):  #FLAGS.epochs):
            t = time.time()
            # Construct feed dictionary
            self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                                 self.placeholders)
            self.feed_dict.update(
                {self.placeholders['dropout']:
                 self.dropout_rate})  # FLAGS.dropout})
            # Run single weight update
            outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy],
                                 feed_dict=self.feed_dict)

            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]

            roc_curr, ap_curr = self.get_roc_score(val_edges, val_edges_false)
            val_roc_score.append(roc_curr)

            #    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(avg_cost), "train_acc=",
                  "{:.5f}".format(avg_accuracy), "val_roc=",
                  "{:.5f}".format(val_roc_score[-1]), "val_ap=",
                  "{:.5f}".format(ap_curr), "time=",
                  "{:.5f}".format(time.time() - t))

        print("Optimization Finished!")

        roc_score, ap_score = self.get_roc_score(test_edges, test_edges_false)
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

        [supernodes, supernodes_embeddings,
         supernodes_labels] = self.get_embeddings(y_train, label_encoder)
        self.supernodes = [
            supernodes, supernodes_embeddings, supernodes_labels
        ]
Example #12
0
def main(args):
    """ Compute embeddings using GAE/VGAE. """

    # Load edgelist
    oneIndx = False
    E = np.loadtxt(args.inputgraph, delimiter=args.delimiter, dtype=int)
    if np.min(E) == 1:
        oneIndx = True
        E -= 1
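    # e.g. a 1-based edgelist [[1, 2], [2, 3]] is shifted to the 0-based form
    # [[0, 1], [1, 2]] expected by the rest of the pipeline.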

    # Create an unweighted graph
    G = nx.Graph()
    G.add_edges_from(E[:, :2])

    # Get adj matrix of the graph
    tr_A = nx.adjacency_matrix(G, weight=None)
    num_nodes = tr_A.shape[0]

    # Set main diag to 1s and normalize (algorithm requirement)
    adj_norm = preprocess_graph(tr_A)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create empty feature matrix
    features = sp.identity(num_nodes)  # featureless
    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if args.model == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    elif args.model == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

    pos_weight = float(tr_A.shape[0] * tr_A.shape[0] - tr_A.sum()) / tr_A.sum()
    norm = tr_A.shape[0] * tr_A.shape[0] / float(
        (tr_A.shape[0] * tr_A.shape[0] - tr_A.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if args.model == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(
                                  tf.sparse_tensor_to_dense(
                                      placeholders['adj_orig'],
                                      validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif args.model == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(
                                   tf.sparse_tensor_to_dense(
                                       placeholders['adj_orig'],
                                       validate_indices=False), [-1]),
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = tr_A + sp.eye(tr_A.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                        placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                        feed_dict=feed_dict)
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(outs[1]), "train_acc=", "{:.5f}".format(outs[2]))

    # Compute predictions
    feed_dict.update({placeholders['dropout']: 0})
    emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Node similarities
    adj_rec = np.dot(emb, emb.T)

    start = time.time()
    # Read the train edges and compute similarity
    if args.tr_e is not None:
        train_edges = np.loadtxt(args.tr_e,
                                 delimiter=args.delimiter,
                                 dtype=int)
        if oneIndx:
            train_edges -= 1
        scores = list()
        for src, dst in train_edges:
            scores.append(sigmoid(adj_rec[src, dst]))
        np.savetxt(args.tr_pred, scores, delimiter=args.delimiter)

        # Read the test edges and run predictions
        if args.te_e is not None:
            test_edges = np.loadtxt(args.te_e,
                                    delimiter=args.delimiter,
                                    dtype=int)
            if oneIndx:
                test_edges -= 1
            scores = list()
            for src, dst in test_edges:
                scores.append(sigmoid(adj_rec[src, dst]))
            np.savetxt(args.te_pred, scores, delimiter=args.delimiter)

    # If no edge lists provided to predict links, then just store the embeddings
    else:
        np.savetxt(args.output, emb, delimiter=args.delimiter)

    print('Prediction time: {}'.format(time.time() - start))
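
# Invocation sketch (hypothetical flag names mirroring the args.* attributes used
# above; the actual argparse front-end is not shown in this snippet):
#   python gae_embed.py --inputgraph graph.edgelist --model gcn_vae \
#       --tr_e train.edges --tr_pred train_scores.txt --output emb.txt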
Example #13
0
    def fit(self, adj, features, labels):
        adj_orig = adj
        adj_orig = adj_orig - sp.dia_matrix(
            (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
        adj_orig.eliminate_zeros()
        adj_train = gen_train_edges(adj)

        adj = adj_train

        # Some preprocessing
        adj_norm = preprocess_graph(adj)
        num_nodes = adj.shape[0]
        input_feature_dim = features.shape[1]
        features = normalize_vectors(features)

        # Define placeholders
        self.placeholders = {
            'features':
            tf.compat.v1.placeholder(tf.float32,
                                     shape=(None, input_feature_dim)),
            # 'features': tf.compat.v1.sparse_placeholder(tf.float32),
            'adj':
            tf.compat.v1.sparse_placeholder(tf.float32),
            'adj_orig':
            tf.compat.v1.sparse_placeholder(tf.float32),
            'dropout':
            tf.compat.v1.placeholder_with_default(0., shape=())
        }

        if self.model_type == 'gcn_ae':
            self.model = GCNModelAE(self.placeholders, input_feature_dim)
        elif self.model_type == 'gcn_vae':
            self.model = GCNModelVAE(self.placeholders, input_feature_dim,
                                     num_nodes)
        pos_weight = float(adj.shape[0] * adj.shape[0] -
                           adj.sum()) / adj.sum()  # negative edges/pos edges
        # print('positive edge weight', pos_weight)
        norm = adj.shape[0] * adj.shape[0] / float(
            (adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

        # Optimizer
        with tf.compat.v1.name_scope('optimizer'):
            if self.model_type == 'gcn_ae':
                opt = OptimizerAE(preds=self.model.reconstructions,
                                  labels=tf.reshape(
                                      tf.sparse.to_dense(
                                          self.placeholders['adj_orig'],
                                          validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm)
            elif self.model_type == 'gcn_vae':
                opt = OptimizerVAE(preds=self.model.reconstructions,
                                   labels=tf.reshape(
                                       tf.sparse.to_dense(
                                           self.placeholders['adj_orig'],
                                           validate_indices=False), [-1]),
                                   model=self.model,
                                   num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        # Initialize session
        self.sess = tf.compat.v1.Session()
        self.sess.run(tf.compat.v1.global_variables_initializer())

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        # Train model
        for epoch in range(FLAGS.epochs):

            t = time.time()
            # Construct feed dictionary
            self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                                 self.placeholders)
            self.feed_dict.update(
                {self.placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy],
                                 feed_dict=self.feed_dict)

            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]
Example #14
0
def gae_scores(
    adj_sparse,
    train_test_split,
    features_matrix=None,
    LEARNING_RATE = 0.01,
    EPOCHS = 200,
    HIDDEN1_DIM = 32,
    HIDDEN2_DIM = 16,
    DROPOUT = 0,
    edge_score_mode="dot-product",
    verbose=1,
    dtype=tf.float32
    ):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split # Unpack train-test split

    if verbose >= 1:
        print('GAE preprocessing...')

    start_time = time.time()

    # Train on CPU (hide GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features from normal matrix --> sparse matrix --> tuple
    # features_tuple contains: (list of matrix coordinates, list of values, matrix dimensions)
    if features_matrix is None:
        x = sp.lil_matrix(np.identity(adj_sparse.shape[0]))
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Get graph attributes (to feed into model)
    num_nodes = adj_sparse.shape[0] # number of nodes in adjacency matrix
    num_features = features_shape[1] # number of features (columns of features matrix)
    features_nonzero = features_tuple[1].shape[0] # number of non-zero entries in features matrix (or length of values list)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = deepcopy(adj_sparse)
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Normalize adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Add in diagonals
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    placeholders = { # TODO: try making these dense from the get-go
        'features': tf.sparse_placeholder(dtype),
        'adj': tf.sparse_placeholder(dtype),
        'adj_orig': tf.sparse_placeholder(dtype),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in the cost function
    # Weigh less-frequent classes higher, so as to prevent model output bias
    # pos_weight = (num. negative samples) / (num. positive samples)
    pos_weight = float(adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) / adj_sparse.sum()

    # Normalize (scale) the average weighted cost
    norm = adj_sparse.shape[0] * adj_sparse.shape[0] / float((adj_sparse.shape[0] * adj_sparse.shape[0] - adj_sparse.sum()) * 2)
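
    # Worked example (illustrative numbers): for a 4-node graph whose symmetric
    # adjacency has adj_sparse.sum() == 8 nonzero entries out of 16 total,
    # pos_weight = (16 - 8) / 8 = 1.0 and norm = 16 / ((16 - 8) * 2) = 1.0.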

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                       HIDDEN1_DIM, HIDDEN2_DIM, dtype=dtype, flatten_output=False)

    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
                       # labels=placeholders['adj_orig'],
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=dtype)

    cost_val = []
    acc_val = []
    val_roc_score = []

    # Initialize session
    sess = tf.Session()

    if verbose >= 1:
        # Print total number of trainable variables
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            print("Variable shape: ", shape)
            variable_parameters = 1
            for dim in shape:
                print("Current dimension: ", dim)
                variable_parameters *= dim.value
            print("Variable params: ", variable_parameters)
            total_parameters += variable_parameters
            print('')
        print("TOTAL TRAINABLE PARAMS: ", total_parameters)

        print('Initializing TF variables...')

    sess.run(tf.global_variables_initializer())

    if verbose >= 1:
        print('Starting GAE training!')

    # Train model
    for epoch in range(EPOCHS):

        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple, placeholders)
        feed_dict.update({placeholders['dropout']: DROPOUT})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        # Evaluate predictions
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false, gae_score_matrix, apply_sigmoid=True)
        val_roc_score.append(roc_curr)

        # Print results for this epoch
        if verbose == 2:
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy), "val_roc=", "{:.5f}".format(val_roc_score[-1]),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t))

    if verbose == 2:
        print("Optimization Finished!")

    # Print final results
    feed_dict.update({placeholders['dropout']: 0})
    gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    # Dot product edge scores (default)
    if edge_score_mode == "dot-product":
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        runtime = time.time() - start_time

        # Calculate final scores
        gae_val_roc, gae_val_ap = get_roc_score(val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(test_edges, test_edges_false, gae_score_matrix)
    
    # Take bootstrapped edge embeddings (via hadamard product)
    elif edge_score_mode == "edge-emb":
        def get_edge_embeddings(edge_list):
            embs = []
            for edge in edge_list:
                node1 = edge[0]
                node2 = edge[1]
                emb1 = gae_emb[node1]
                emb2 = gae_emb[node2]
                edge_emb = np.multiply(emb1, emb2)
                embs.append(edge_emb)
            embs = np.array(embs)
            return embs

        # Train-set edge embeddings
        pos_train_edge_embs = get_edge_embeddings(train_edges)
        neg_train_edge_embs = get_edge_embeddings(train_edges_false)
        train_edge_embs = np.concatenate([pos_train_edge_embs, neg_train_edge_embs])

        # Create train-set edge labels: 1 = real edge, 0 = false edge
        train_edge_labels = np.concatenate([np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            pos_val_edge_embs = get_edge_embeddings(val_edges)
            neg_val_edge_embs = get_edge_embeddings(val_edges_false)
            val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        pos_test_edge_embs = get_edge_embeddings(test_edges)
        neg_test_edge_embs = get_edge_embeddings(test_edges_false)
        test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

        # Create test-set edge labels: 1 = real edge, 0 = false edge
        test_edge_labels = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of being of class "1" (real edge)
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if len(val_edges) > 0 and len(val_edges_false) > 0:
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            # gae_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_roc_curve = None
            gae_val_ap = None
        
        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        # gae_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores
    gae_scores = {}

    gae_scores['test_roc'] = gae_test_roc
    # gae_scores['test_roc_curve'] = gae_test_roc_curve
    gae_scores['test_ap'] = gae_test_ap

    gae_scores['val_roc'] = gae_val_roc
    # gae_scores['val_roc_curve'] = gae_val_roc_curve
    gae_scores['val_ap'] = gae_val_ap

    gae_scores['val_roc_per_epoch'] = val_roc_score
    gae_scores['runtime'] = runtime
    return gae_scores
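
# Usage sketch (assumes a mask_test_edges-style splitter producing the 7-tuple
# unpacked at the top of gae_scores; names here are illustrative):
#   split = (adj_train, train_edges, train_edges_false,
#            val_edges, val_edges_false, test_edges, test_edges_false)
#   scores = gae_scores(adj_sparse, split, edge_score_mode="edge-emb")
#   print(scores['test_roc'], scores['test_ap'])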
Example #15
0
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z,adj
    Return:
        Embedding from graph
    '''
    # true_labels = np.asarray(true_labels)

    # args.model = 'gcn_vae'
    # args.dw    = 0
    # args.epochs = 200
    # args.hidden1 = 32
    # args.hidden2 = 16
    # args.lr      = 0.01
    # args.dropout = 0.
    # args.dataset_sr = 'cora'
    # args.walk_length = 5
    # args.window_size = 3
    # args.number_walks = 5
    # args.full_number_walks =0
    # args.lr_dw   = 0.001
    # args.context = 0
    # args.ns = 1
    # args.n_clusters = 11
    # args.plot = 0

    # features from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    # if args.dw == 1:
    #     print('Using deepWalk regularization...')
    #     G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
    #     print("Number of nodes: {}".format(len(G.nodes())))
    #     num_walks = len(G.nodes()) * args.number_walks
    #     print("Number of walks: {}".format(num_walks))
    #     data_size = num_walks * args.walk_length
    #     print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    # if args.dw == 1:
    #     sg = SkipGram(args.hidden2, adj.shape[0])
    #     optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

    #     # Construct the nodes for doing random walk. Doing it before since the seed is fixed
    #     nodes_in_G = list(G.nodes())
    #     chunks = len(nodes_in_G) // args.number_walks
    #     random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: '+str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # After back-propagating gae loss, now do the deepWalk regularization
        # if args.dw == 1:
        #     sg.train()
        #     if args.full_number_walks > 0:
        #         walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks,
        #                                       path_length=args.walk_length, alpha=0,
        #                                       rand=random.Random(SEED))
        #     else:
        #         walks = build_deepwalk_corpus_iter(G, num_paths=args.number_walks,
        #                                            path_length=args.walk_length, alpha=0,
        #                                            rand=random.Random(SEED),
        #                                            chunk=epoch % chunks,
        #                                            nodes=nodes_in_G)
        #     for walk in walks:
        #         if args.context == 1:
        #             # Construct the pairs for predicting context node
        #             # for each node, treated as center word
        #             curr_pair = (int(walk[center_node_pos]), [])
        #             for center_node_pos in range(len(walk)):
        #                 # for each window position
        #                 for w in range(-args.window_size, args.window_size + 1):
        #                     context_node_pos = center_node_pos + w
        #                     # make sure we do not jump outside the sentence
        #                     if context_node_pos < 0 or context_node_pos >= len(walk) or center_node_pos == context_node_pos:
        #                         continue
        #                     context_node_idx = walk[context_node_pos]
        #                     curr_pair[1].append(int(context_node_idx))
        #         else:
        #             # first item in the walk is the starting node
        #             curr_pair = (int(walk[0]), [int(context_node_idx) for context_node_idx in walk[1:]])

        #         if args.ns == 1:
        #             neg_nodes = []
        #             pos_nodes = set(walk)
        #             while len(neg_nodes) < args.walk_length - 1:
        #                 rand_node = random.randint(0, n_nodes - 1)
        #                 if rand_node not in pos_nodes:
        #                     neg_nodes.append(rand_node)
        #             neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

        #         # Do actual prediction
        #         src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
        #         tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
        #         optimizer_dw.zero_grad()
        #         log_pos = sg(src_node, tgt_nodes, neg_sample=False)
        #         if args.ns == 1:
        #             loss_neg = sg(src_node, neg_nodes, neg_sample=True)
        #             loss_dw = log_pos + loss_neg
        #         else:
        #             loss_dw = log_pos
        #         loss_dw.backward(retain_graph=True)
        #         cur_dw_loss = loss_dw.item()
        #         optimizer_dw.step()

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO, this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        # if args.dw == 1:
        #     tqdm.write("Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}".format(
        #         epoch + 1, cur_loss, cur_dw_loss,
        #         ap_curr, time.time() - t))
        # else:
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
            format(epoch + 1, cur_loss, ap_curr,
                   time.time() - t))

        # if (epoch + 1) % 10 == 0:
        #     tqdm.write("Evaluating intermediate results...")
        #     kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
        #     predict_labels = kmeans.predict(hidden_emb)
        #     cm = clustering_metrics(true_labels, predict_labels)
        #     cm.evaluationClusterModelFromLabel(tqdm)
        #     roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
        #     tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
        #     np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    # kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    # predict_labels = kmeans.predict(hidden_emb)
    # cm = clustering_metrics(true_labels, predict_labels)
    # cm.evaluationClusterModelFromLabel(tqdm)

    # if args.GAEplot == 1:
    #     cm.plotClusters(tqdm, hidden_emb, true_labels)

    return hidden_emb
Example #16
0
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]  # number of nodes in the network

        # Store the features as a sparse matrix in tuple form: a list of
        # (row, col) coordinate pairs, the value for each pair, and the
        # matrix dimensions
        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]  # feature dimensionality
        features_nonzero = features[1].shape[0]  # number of non-zero feature entries

        # Create model
        model = None
        model = GCNModelVAE(placeholders, num_features, num_nodes,
                            features_nonzero)

        pos_weight = float(adj.shape[0] * adj.shape[0] -
                           adj.sum()) / adj.sum()  # ratio of negative to positive samples
        print('adj.shape', adj.shape[0], adj.sum())
        norm = adj.shape[0] * adj.shape[0] / float(
            (adj.shape[0] * adj.shape[0] - adj.sum()) *
            2)  # shape[0] = 913, adj.sum() = 10734

        # Optimizer
        opt = OptimizerVAE(preds=model.reconstructions,
                           labels=tf.reshape(
                               tf.sparse_tensor_to_dense(
                                   placeholders['adj_orig'],
                                   validate_indices=False), [-1]),
                           model=model,
Example #17
0
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z,adj
    Return:
        Embedding from graph
    '''
    # features from z
    # Louvain
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation
    # adj, features, y_test, tx, ty, test_maks, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: '+str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO, this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
            format(epoch + 1, cur_loss, ap_curr,
                   time.time() - t))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
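
# Usage sketch (hypothetical args object; GCNModelVAE / loss_function /
# mask_test_edges as imported above):
#   from types import SimpleNamespace
#   args = SimpleNamespace(GAEmodel='gcn_vae', GAEhidden1=32, GAEhidden2=16,
#                          GAEdropout=0., GAElr=0.01, GAEepochs=200,
#                          precisionModel='Float')
#   emb = GAEembedding(z, adj, args)  # z: (n_nodes, feat_dim) array, adj: scipy sparse matrix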