def gae_for(args):
    """Fit a GCN variational auto-encoder for link prediction and print its scores.

    The dataset named by ``args.dataset_str`` is loaded, validation and test
    edges are masked out of the adjacency matrix, and ``GCNModelVAE`` is
    optimised with Adam for ``args.epochs`` epochs.  Training loss and
    validation AP are printed after every epoch; ROC and AP on the held-out
    test edges are printed once training is over.

    Args:
        args: namespace providing ``dataset_str``, ``hidden1``, ``hidden2``,
            ``dropout``, ``lr`` and ``epochs``.
    """
    print("Using {} dataset".format(args.dataset_str))
    full_adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Scoring reference: the complete adjacency matrix with its diagonal
    # subtracted and explicit zeros dropped.
    diagonal = sp.dia_matrix(
        (full_adj.diagonal()[np.newaxis, :], [0]), shape=full_adj.shape)
    adj_orig = full_adj - diagonal
    adj_orig.eliminate_zeros()

    (adj_train, _train_edges, val_edges, val_edges_false,
     test_edges, test_edges_false) = mask_test_edges(full_adj)

    # Model input is the normalised training graph; the reconstruction
    # target is the training graph plus the identity, as a dense tensor.
    adj_norm = preprocess_graph(adj_train)
    adj_label = torch.FloatTensor(
        (adj_train + sp.eye(adj_train.shape[0])).toarray())

    # Weighting terms handed to loss_function, derived from the ratio of
    # zero to non-zero mass in the training adjacency matrix.
    n_entries = adj_train.shape[0] * adj_train.shape[0]
    edge_mass = adj_train.sum()
    pos_weight = torch.Tensor([float(n_entries - edge_mass) / edge_mass])
    norm = n_entries / float((n_entries - edge_mass) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    hidden_emb = None
    for epoch_no in range(1, args.epochs + 1):
        started = time.time()

        model.train()
        optimizer.zero_grad()
        recovered, mu, logvar = model(features, adj_norm)
        loss = loss_function(
            preds=recovered,
            labels=adj_label,
            mu=mu,
            logvar=logvar,
            n_nodes=n_nodes,
            norm=norm,
            pos_weight=pos_weight,
        )
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # The embedding that gets scored is the mean produced by the same
        # forward pass that was just used for the gradient step.
        hidden_emb = mu.data.numpy()
        _roc_curr, ap_curr = get_roc_score(
            hidden_emb, adj_orig, val_edges, val_edges_false)

        print("Epoch:", '%04d' % epoch_no,
              "train_loss=", "{:.5f}".format(cur_loss),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - started))

    print("Optimization Finished!")

    roc_score, ap_score = get_roc_score(
        hidden_emb, adj_orig, test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))
def _window_pairs(walk, window_size):
    """Return one ``(center, contexts)`` skip-gram pair per position of ``walk``.

    ``contexts`` lists, in walk order, the nodes at most ``window_size``
    steps before or after the center position (the center itself excluded).
    Positions that end up with no context node are skipped.
    """
    nodes = [int(node) for node in walk]
    pairs = []
    for center_pos, center in enumerate(nodes):
        first = max(0, center_pos - window_size)
        last = center_pos + window_size + 1
        contexts = nodes[first:center_pos] + nodes[center_pos + 1:last]
        if contexts:
            pairs.append((center, contexts))
    return pairs


def _sample_negative_nodes(walk, n_nodes, count):
    """Draw ``count`` node ids in ``[0, n_nodes)`` that do not occur in ``walk``.

    Walk items are converted with ``int`` before the membership test so the
    filter also works when the walk stores node ids as strings.
    """
    positives = {int(node) for node in walk}
    negatives = []
    while len(negatives) < count:
        candidate = random.randint(0, n_nodes - 1)
        if candidate not in positives:
            negatives.append(candidate)
    return torch.from_numpy(np.array(negatives)).long()


def gae_for(args):
    """Train a graph (variational) auto-encoder, optionally with a DeepWalk skip-gram.

    Steps performed:

    1. Load the dataset ``args.dataset_str`` and mask validation/test edges.
    2. Build ``GCNModelVAE`` (``args.model == 'gcn_vae'``) or ``GCNModelAE``.
    3. For every epoch, optionally (``args.dw == 1``) run skip-gram updates
       over random walks on the graph, then take one auto-encoder step.
    4. Log loss / validation AP per epoch; every 10 epochs run k-means on the
       embedding, log clustering and link-prediction metrics, and save the
       embedding to ``logs/emb_epoch_<n>.npy``.
    5. After training, log test ROC/AP and clustering metrics, and plot the
       clusters when ``args.plot == 1``.

    Args:
        args: namespace providing ``dataset_str``, ``model``, ``hidden1``,
            ``hidden2``, ``dropout``, ``lr``, ``epochs``, ``n_clusters``,
            ``plot``, ``dw`` and, when ``dw == 1``, ``lr_dw``,
            ``number_walks``, ``full_number_walks``, ``walk_length``,
            ``context``, ``window_size`` and ``ns``.
    """
    print("Using {} dataset".format(args.dataset_str))
    adj, features, _y_test, _tx, _ty, _test_mask, true_labels = load_data(
        args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj - sp.dia_matrix(
        (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj_orig.eliminate_zeros()

    (adj_train, _train_edges, val_edges, val_edges_false, test_edges,
     test_edges_false) = mask_test_edges(adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    if args.dw == 1:
        print('Using deepWalk regularization...')
        G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * args.number_walks
        print("Number of walks: {}".format(num_walks))
        data_size = num_walks * args.walk_length
        print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.model == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2,
                            args.dropout)
    else:
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.dw == 1:
        sg = SkipGram(args.hidden2, adj.shape[0])
        optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

        # Construct the nodes for doing random walk. Doing it before since
        # the seed is fixed.
        nodes_in_G = list(G.nodes())
        # At least one chunk, otherwise `epoch % chunks` below divides by
        # zero when the graph has fewer nodes than args.number_walks.
        chunks = max(1, len(nodes_in_G) // args.number_walks)
        random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.epochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # DeepWalk regularization: skip-gram updates happen here, before the
        # auto-encoder loss of this epoch is back-propagated.
        # NaN marks "no skip-gram update happened in this epoch" in the log.
        cur_dw_loss = float('nan')
        if args.dw == 1:
            sg.train()
            if args.full_number_walks > 0:
                walks = build_deepwalk_corpus(
                    G,
                    num_paths=args.full_number_walks,
                    path_length=args.walk_length,
                    alpha=0,
                    rand=random.Random(SEED))
            else:
                walks = build_deepwalk_corpus_iter(
                    G,
                    num_paths=args.number_walks,
                    path_length=args.walk_length,
                    alpha=0,
                    rand=random.Random(SEED),
                    chunk=epoch % chunks,
                    nodes=nodes_in_G)

            for walk in walks:
                if args.context == 1:
                    # Every node of the walk is treated as a center word
                    # with its own window of context nodes.  (The previous
                    # code read the loop variable before the loop bound it
                    # and therefore always raised.)
                    pairs = _window_pairs(walk, args.window_size)
                else:
                    # first item in the walk is the starting node
                    pairs = [(int(walk[0]),
                              [int(node) for node in walk[1:]])]

                # One set of negatives per walk, shared by all its pairs.
                neg_nodes = None
                if args.ns == 1:
                    neg_nodes = _sample_negative_nodes(
                        walk, n_nodes, args.walk_length - 1)

                # Do actual prediction
                for center, contexts in pairs:
                    src_node = torch.from_numpy(np.array([center])).long()
                    tgt_nodes = torch.from_numpy(np.array(contexts)).long()
                    optimizer_dw.zero_grad()
                    loss_dw = sg(src_node, tgt_nodes, neg_sample=False)
                    if neg_nodes is not None:
                        loss_dw = loss_dw + sg(
                            src_node, neg_nodes, neg_sample=True)
                    loss_dw.backward(retain_graph=True)
                    cur_dw_loss = loss_dw.item()
                    optimizer_dw.step()

        loss = loss_function(preds=model.dc(z),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges,
                                          val_edges_false)

        if args.dw == 1:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}"
                .format(epoch + 1, cur_loss, cur_dw_loss, ap_curr,
                        time.time() - t))
        else:
            tqdm.write(
                "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
                format(epoch + 1, cur_loss, ap_curr,
                       time.time() - t))

        if (epoch + 1) % 10 == 0:
            tqdm.write("Evaluating intermediate results...")
            kmeans = KMeans(n_clusters=args.n_clusters,
                            random_state=0).fit(hidden_emb)
            predict_labels = kmeans.predict(hidden_emb)
            cm = clustering_metrics(true_labels, predict_labels)
            cm.evaluationClusterModelFromLabel(tqdm)
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
            np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    kmeans = KMeans(n_clusters=args.n_clusters,
                    random_state=0).fit(hidden_emb)
    predict_labels = kmeans.predict(hidden_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    cm.evaluationClusterModelFromLabel(tqdm)

    if args.plot == 1:
        cm.plotClusters(tqdm, hidden_emb, true_labels)
def gae_for(args):
    """Train a featureless GCN-VAE on a corpus graph and dump its embeddings.

    The adjacency matrix comes from ``load_corpus(args.dataset_str)``; the
    loaded features are only used for their row count and are replaced by an
    identity matrix.  The model is trained for ``args.epochs`` epochs against
    the normalised adjacency matrix.  The embedding of the second epoch is
    written to ``./result/emb_init.txt`` and the one of the last epoch to
    ``./result/emb.txt``.  Progress is reported with ``print``.

    Args:
        args: namespace providing ``dataset_str``, ``hidden1``, ``dropout``,
            ``lr`` and ``epochs``.
    """
    print("Using {} dataset".format(args.dataset_str))
    adj, raw_features = load_corpus(args.dataset_str)
    print(type(raw_features))
    print(adj)

    # Featureless setting: one identity row per node.
    identity_features = sp.identity(raw_features.shape[0])

    # Some preprocessing, then densify both inputs for torch.
    sparse_features = preprocess_features(identity_features)
    sparse_adj_norm = preprocess_adj(adj)
    adj_norm = torch.FloatTensor(sparse_adj_norm.toarray())
    features = torch.FloatTensor(sparse_features.toarray())

    n_nodes, feat_dim = features.shape
    print(n_nodes, feat_dim)
    print(type(features))
    print(type(adj_norm))
    print(features.shape)
    print(adj_norm.shape)

    n_entries = adj.shape[0] * adj.shape[0]
    # Still computed as before, but the loss call below does not receive it.
    pos_weight = float(n_entries - adj.sum()) / adj.sum()
    norm = n_entries / float((n_entries - adj.sum()) * 2)

    model = GCNModelVAE(feat_dim, args.hidden1, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    final_epoch = args.epochs - 1
    hidden_emb = None
    for epoch in range(args.epochs):
        print("in epoch")
        tick = time.time()
        model.train()
        optimizer.zero_grad()

        print("before model")
        recovered, mu, logvar = model(features, adj_norm)

        print("before loss")
        # The normalised adjacency matrix doubles as reconstruction target.
        loss = loss_function(
            preds=recovered,
            labels=adj_norm,
            mu=mu,
            logvar=logvar,
            n_nodes=n_nodes,
            norm=norm,
        )

        print("befor backword")
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # Detached copy of the latent means for saving.
        hidden_emb = np.array(mu.data.numpy())

        if epoch == 1:
            np.savetxt("./result/emb_init.txt", hidden_emb)
        if epoch == final_epoch:
            np.savetxt("./result/emb.txt", hidden_emb)

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(cur_loss),
              "time=", "{:.5f}".format(time.time() - tick))

    print("Optimization Finished!")
def gae_for(args, position):
    """Train a graph auto-encoder on a prebuilt graph and track a retrieval score.

    Outline of what the visible code does:

    1. Loads ``Q, X`` with ``load_data()`` and the graphs/features
       ``adj, features, adj_Q, features_Q`` with ``load_from_prebuild`` from
       hard-coded absolute paths.
    2. Builds ``adj_all`` by stacking ``adj_Q`` on top of ``adj`` and
       prepending ``Q.shape[1]`` zero columns, mirrors the entries of the
       first ``Q.shape[1]`` rows, and normalises it with ``preprocess_graph``.
    3. Splits ``adj``/``features`` with ``mask_test_rows`` and prepares the
       training label matrix plus several validation structures.
    4. Builds a model chosen by ``mode`` and, for ``args.epochs`` epochs,
       trains on at most 11 mini-batches of node ids per epoch.
    5. After every epoch embeds ``features_all`` over ``adj_all_norm``,
       scores the embedding with ``get_roc_score_matrix`` and remembers the
       best score.

    Args:
        args: namespace; the code reads ``dataset_str``, ``hidden1``,
            ``hidden2``, ``dropout``, ``batch_size``, ``lr`` and ``epochs``.
        position: not used anywhere in this function.

    Returns:
        ``(best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop)``.
        The last two are never changed from their initial ``0`` here.

    NOTE(review): ``mode`` is read but never assigned in this function —
    presumably a module-level global; confirm.
    NOTE(review): the original indentation of this function was lost; block
    nesting below was reconstructed from the statements and should be
    checked against the upstream file.
    """
    print("Using {} dataset".format(args.dataset_str))
    Q, X = load_data()
    # Hard-coded, machine-specific input locations.
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(
        prebuild, Q_features, X_features, D_features, k=5)

    # Combined feature matrix: features_Q rows first, then features rows.
    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)

    # Combined adjacency: adj_Q stacked over adj, with Q.shape[1] zero
    # columns prepended on the left.
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)
    rows, columns = adj_all.nonzero()
    print("Making Symmetry")
    # Copy each stored entry of the first Q.shape[1] rows to its transposed
    # position.  The loop stops at the first entry from a later row, so it
    # assumes nonzero() lists entries in ascending row order — TODO confirm.
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break
    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)
    features = torch.from_numpy(features)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig
    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(
        adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    # Label matrix: training adjacency plus identity (kept sparse here).
    adj_label = adj_train + sp.eye(adj_train.shape[0])

    print("adj sum: " + str(adj.sum()))
    # This pos_weight is only printed: it is overwritten with a 0.0 tensor
    # before the training loop starts.
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] -
                       adj.sum()) / adj.sum()
    print("top part: " +
          str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos wieght: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # for validation data processing:
    # adj_evaluate stacks adj_val over adj_train, the latter left-padded
    # with adj_val.shape[0] zero columns.
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()
    val_edges = []
    val_edges_false = []  # stays empty: negative sampling is disabled
    pos = {}  # row index -> list of column indices of its stored entries
    print("getting positive edges")
    # Indices of the stored entries that lie in the first adj_val.shape[0] rows.
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validtion: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])
    # Left over from the disabled negative-sampling code; not read below.
    step = 0
    neg_per_pos = 100
    print("preprocessing adj_evaluate")
    # NOTE(review): adj_evaluate_norm, adj_label_evaluate and
    # features_evaluate are prepared here but not consumed by the live code
    # further down.
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    # Model selection.  Only the "VAE" branch densifies the label matrices;
    # only the "AE_batch" branch moves the model with .cuda().
    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2,
                            args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2,
                                  args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2,
                                 args.dropout).cuda()

    # Mini-batches are batches of node ids; the matching label sub-matrix is
    # sliced inside the training loop.
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    hidden_emb = None
    # Replaces the ratio computed above with a constant 0.0 tensor.
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))
    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())
    for epoch in range(args.epochs):
        model.train()
        lossVal = 0  # sum of batch losses in this epoch
        lossValNorm = 0  # number of batches processed in this epoch
        backtime = time.time()
        for batchID, (inds) in enumerate(train_loader):
            # NOTE(review): the result is indexed like a single tensor; in
            # "VAE" mode the evaluation code below unpacks three values
            # from the same model, so this path may only suit the AE
            # variants — confirm.
            z = model(features, adj_norm)
            inds = inds[0]
            # Note: this rebinds `adj` (until now the training adjacency) to
            # the batch's ReLU'd inner-product scores.
            adj = F.relu(torch.mm(z[inds], z[inds].t()))
            preds = adj
            # NOTE(review): in "VAE" mode adj_label was already turned into
            # a dense torch tensor above, so `.toarray()` looks like it
            # would fail there — confirm.
            label_batch = torch.FloatTensor(
                adj_label[inds, :][:, inds].toarray())
            cost = norm * F.binary_cross_entropy_with_logits(
                preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1
            optimizer.zero_grad()
            cost.backward(retain_graph=True)
            optimizer.step()
            # Stop after batch index 10, i.e. at most 11 batches per epoch.
            if batchID >= 10:
                break
        # NOTE(review): whether the timing report below belongs inside the
        # batch loop or after it could not be recovered from the source; it
        # is placed after the loop because the message talks about the
        # epoch.
        backtime_done = time.time() - backtime
        sys.stdout.write("\r time taken to do epoch: " + str(backtime_done) +
                         " opt: " + str(time.time() - backtime) + "\n")
        sys.stdout.flush()
        sys.stdout.write(
            "\r \r"
        )
        # `% 1` is always 0, so evaluation runs after every epoch.
        if (epoch + 1) % 1 == 0:
            model.eval()
            # Rebuilt every epoch; not read by the live code that follows.
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            # hack by appending stuff on top of adj
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            elif mode == "AE_batch":
                mu = model(features_all,
                           adj_all_norm,
                           None,
                           None,
                           None,
                           None,
                           None,
                           just_mu=True,
                           training=False)
            hidden_emb = mu.data.numpy()
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])
            # Remember the best score so far (ties move the best epoch
            # forward because of `<=`).
            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
            # No validation loss is computed; -99.0 is a placeholder.  The
            # condition holds only while best_val_cost has its initial
            # value, so these assignments happen in the first epoch only.
            if best_val_cost > -99.0:
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map
            print(
                "Epoch:",
                '%04d' % (epoch + 1),
                "train_loss=",
                "{:.5f}".format(lossVal / lossValNorm),
                "val_loss=",
                "{:.5f}".format(-99.0),
                "revop=",
                "{:.5f}".format(revop_map),
                "best_revop=",
                "{:.5f}".format(best),
                "revop_at_best_val=",
                "{:.5f}".format(best_val_epoch_revop),
                "time=",
                "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            # Restart the timer used for the "time=" field.
            t = time.time()
    print("Optimization Finished!")
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
def __init__(self, graph_edgelist, num_actions, dimension, learning_rate=0.01, epochs=300, hidden1=32, hidden2=16,
             dropout=0., model_str='gcn_vae', use_features=0):
    """Train a graph auto-encoder on the given graph and keep the node embeddings.

    The graph is read with ``self.read_graph``, a GCN auto-encoder
    (``model_str='gcn_ae'``) or variational auto-encoder
    (``model_str='gcn_vae'``) is built and optimised for ``epochs`` steps,
    and the resulting ``model.z_mean`` is stored in ``self.embeddings``.

    Args:
        graph_edgelist: graph description handed to ``self.read_graph``.
        num_actions: validated by ``BasisFunction._validate_num_actions``.
        dimension: must be >= 1; passed as the last size argument of the
            model constructor.
        learning_rate: forwarded to the optimizer wrapper.
        epochs: number of weight updates to run.
        hidden1: size argument passed to both model variants.
        hidden2: size argument passed to the ``'gcn_ae'`` model only.
        dropout: dropout value fed during training (0 is fed when the
            embeddings are read out).
        model_str: ``'gcn_ae'`` or ``'gcn_vae'``.
        use_features: when 0 the loaded features are replaced by an
            identity matrix (featureless setting).

    Raises:
        ValueError: if ``graph_edgelist`` is None, ``dimension < 1`` or
            ``model_str`` is not one of the supported names.
    """
    if graph_edgelist is None:
        raise ValueError('graph cannot be None')

    if dimension < 1:
        raise ValueError('dimension must be >= 1')

    self.__num_actions = BasisFunction._validate_num_actions(num_actions)
    self._dimension = dimension

    # Fail early and clearly: previously an unknown model string left both
    # the model and the optimizer undefined and crashed much later.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "model_str must be 'gcn_ae' or 'gcn_vae', got {!r}".format(model_str))

    adj, features = self.read_graph(graph_edgelist)

    # Only the training adjacency is needed here (it defines the labels);
    # the edge lists returned alongside it are not used.
    adj_train = mask_test_edges(adj)[0]

    if use_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing.  The model input is the normalised *full* graph,
    # while the labels further down come from adj_train.
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Both optimizers are trained against the flattened dense 'adj_orig'
    # placeholder.
    labels = tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                  validate_indices=False), [-1])

    # Create model and optimizer
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero, hidden1, hidden2, dimension)
        with tf.name_scope('optimizer'):
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm,
                              learning_rate=learning_rate)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, hidden1, dimension)
        with tf.name_scope('optimizer'):
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm,
                               learning_rate=learning_rate)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The feed dictionary gets the same inputs in every epoch, so it is
    # built once; this also keeps it defined when epochs == 0.
    # (Assumes construct_feed_dict just assembles a dict — TODO confirm.)
    feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
    feed_dict.update({placeholders['dropout']: dropout})

    # Initialize session; it is closed again once the embeddings have been
    # extracted so its resources are not leaked.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer())

        # Train model: one weight update per epoch.  Cost and accuracy are
        # fetched as before but not used.
        for _ in range(epochs):
            sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        print("GCN Optimization Finished!")

        # Read the embeddings with dropout switched off.
        feed_dict.update({placeholders['dropout']: 0})
        self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
    finally:
        sess.close()
def gae(filename, output_dir):
    """Train a GAE/VGAE on a network file and save a thresholded reconstruction.

    The network is loaded with ``load_network_data``, a model is trained, and
    the inner products of the embeddings are thresholded into a binary graph.
    One binary graph is saved to ``<output_dir>/GAE_network.npy``; statistics
    of a second, symmetrised one are pickled to
    ``<output_dir>/gae_network_statistics.pickle`` and printed.

    Args:
        filename: Path handed to ``load_network_data``.
        output_dir: Directory for the output files. Relative paths are
            resolved after the ``os.chdir('../')`` performed below.

    Raises:
        ValueError: If ``FLAGS.model`` is not a supported model name, or if
            ``FLAGS.features`` is non-zero (no feature matrix is loaded here).
    """
    # Settings
    # NOTE(review): the flags are defined on every call; a second call in the
    # same process presumably fails with a duplicate-flag error -- confirm.
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0, 'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    if model_str not in ('gcn_ae', 'gcn_vae'):
        raise ValueError(
            "FLAGS.model must be 'gcn_ae' or 'gcn_vae', got {!r}".format(model_str))
    if FLAGS.features != 0:
        # Previously this path hit an unbound `features` variable.
        raise ValueError(
            'FLAGS.features != 0 is not supported: no feature matrix is loaded')

    # Load data
    adj, R, edges = load_network_data(filename)
    num_edges = np.sum(adj)
    length = adj.shape[0]
    adj = sp.csr_matrix(adj)

    adj_train, _ = mask_test_edges(adj)
    adj = adj_train

    features = sp.identity(adj.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Ratio of non-edges to edges, and the matching loss scale.
    n_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_entries - adj.sum()) / adj.sum()
    norm = n_entries / float((n_entries - adj.sum()) * 2)

    # Create model and optimizer
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
            [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The session is closed once the last TensorFlow computation is done.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Train model
        for _ in range(FLAGS.epochs):
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

        print("GAE Optimization Finished!")

        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

        # Pairwise inner products of the embeddings.
        adj_rec = np.array(np.dot(emb, emb.T))

        # Keep the `num_edges` largest scores: DD is ascending, so the cut-off
        # sits `num_edges` entries from the end.
        DD = np.sort(adj_rec.flatten())
        threshold = DD[int(-1 * num_edges)]
        # Vectorised form of the former nested loops, which built entry
        # [j][i] from adj_rec[i, j] (hence the transpose).
        network_C = np.where(adj_rec < threshold, 0, 1).T.astype(np.int8)

        # Process-wide side effect kept from the original: later relative
        # paths resolve against the parent directory.
        os.chdir('../')
        np.save('{}/GAE_network.npy'.format(output_dir),
                network_C[1:length, :][:, 1:length])

        # Project the reconstruction through R[0..3]. Only final_network[0]
        # is consumed below; the loop is kept so that a malformed R still
        # fails here as it did before.
        A_copy = adj_rec
        final_network = [A_copy]
        for i in range(1, 5):
            adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
            R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
            A_copy = sess.run(
                tf.matmul(tf.matmul(R_matrix, adjacent_matrix), tf.transpose(R_matrix)),
                feed_dict={
                    R_matrix: R[i - 1, 0].todense(),
                    adjacent_matrix: A_copy
                })
            final_network.append(np.array(A_copy))

    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    # Descending scores; the cut-off index is taken from edges[0, 0].
    DD = np.sort(network_B.flatten())[::-1]
    threshold = DD[edges[0, 0]]
    network_C = np.where(network_B < threshold, 0, 1).T
    # Symmetrise and clip back to a 0/1 matrix.
    _A_obs = np.minimum(network_C + network_C.T, 1)

    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir), 'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
'adj_orig': tf.sparse_placeholder(tf.float32), 'dropout': tf.placeholder_with_default(0., shape=()) # 'features_nonzero': tf.placeholder_with_default(features_nonzero, shape=()), # 'num_nodes': tf.placeholder_with_default(num_nodes, shape=()) } # How much to weigh positive examples (true edges) in cost print_function # Want to weigh less-frequent classes higher, so as to prevent model output bias # pos_weight = (num. negative samples / (num. positive samples) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() # normalize (scale) average weighted cost norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2) # Create VAE model model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, HIDDEN1_DIM, HIDDEN2_DIM) opt = OptimizerVAE(preds=model.reconstructions, labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False), [-1]), model=model, num_nodes=num_nodes, pos_weight=pos_weight, norm=norm, learning_rate=LEARNING_RATE) cost_val = [] acc_val = [] val_roc_score = [] # Initialize session sess = tf.Session()
def gae_scores(
        adj_sparse,
        train_test_split,
        features_matrix=None,
        LEARNING_RATE=0.01,
        EPOCHS=250,
        HIDDEN1_DIM=32,
        HIDDEN2_DIM=16,
        DROPOUT=0,
        edge_score_mode="dot-product",
        verbose=1,
        dtype=tf.float32
):
    """Train a variational graph auto-encoder and score link prediction.

    Args:
        adj_sparse: Sparse adjacency matrix of the full graph.
        train_test_split: 7-tuple ``(adj_train, train_edges, train_edges_false,
            val_edges, val_edges_false, test_edges, test_edges_false)``.
        features_matrix: Optional feature matrix; an identity matrix is used
            when it is None.
        LEARNING_RATE: Forwarded to ``OptimizerVAE``.
        EPOCHS: Number of training updates.
        HIDDEN1_DIM: Forwarded to ``GCNModelVAE``.
        HIDDEN2_DIM: Forwarded to ``GCNModelVAE``.
        DROPOUT: Dropout value fed during training.
        edge_score_mode: ``"dot-product"`` scores an edge by the inner product
            of its node embeddings; ``"edge-emb"`` fits a logistic regression
            on element-wise products of node embeddings.
        verbose: Values >= 1 print progress; 2 also prints a completion line.
        dtype: Forwarded to ``GCNModelVAE``.

    Returns:
        dict with keys ``test_roc``, ``test_ap``, ``val_roc``, ``val_ap``,
        ``val_roc_per_epoch`` and ``runtime``; in ``"edge-emb"`` mode also
        ``test_roc_curve``, ``val_roc_curve`` and ``test_pr_curve``.

    Raises:
        ValueError: If ``edge_score_mode`` is not one of the two modes above.
    """
    # Reject unknown modes before training; previously they only failed after
    # the full training run, with a NameError on the unset scores.
    if edge_score_mode not in ("dot-product", "edge-emb"):
        raise ValueError(
            'edge_score_mode must be "dot-product" or "edge-emb", got {!r}'.format(
                edge_score_mode))

    # Unpack train-test split
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    if verbose >= 1:
        print('GAE preprocessing...')

    # Train on CPU (hide the GPU) because of memory constraints.
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features: matrix --> sparse matrix --> tuple.
    # The tuple holds (coordinate list, value list, matrix shape).
    if features_matrix is None:
        x = sp.lil_matrix(np.identity(adj_sparse.shape[0]))
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Graph attributes fed to the model.
    num_nodes = adj_sparse.shape[0]  # number of nodes in the adjacency matrix
    num_features = features_shape[1]  # number of feature-matrix columns
    features_nonzero = features_tuple[1].shape[0]  # length of the value list

    # Normalize the training adjacency matrix.
    adj_norm = preprocess_graph(adj_train)

    # Training labels: training adjacency plus the diagonal.
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Weight for positive examples (true edges) in the cost: the rarer class
    # is weighted higher so the model output is not biased.
    # pos_weight = (num. negative samples) / (num. positive samples),
    # computed here from the full graph `adj_sparse`.
    n_entries = adj_sparse.shape[0] * adj_sparse.shape[0]
    pos_weight = float(n_entries - adj_sparse.sum()) / adj_sparse.sum()
    # Scale factor for the average weighted cost.
    norm = n_entries / float((n_entries - adj_sparse.sum()) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create the VAE model.
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                        HIDDEN1_DIM, HIDDEN2_DIM,
                        dtype=dtype, flatten_output=False)

    # NOTE(review): the optimizer receives tf.float32 rather than the `dtype`
    # argument -- confirm this is intended.
    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                        validate_indices=False),
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=tf.float32)

    val_roc_score = []

    if verbose >= 1:
        # Print every trainable variable and the total parameter count.
        total_parameters = 0
        for variable in tf.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            print("Variable shape: ", shape)
            variable_parameters = 1
            for dim in shape:
                print("Current dimension: ", dim)
                variable_parameters *= dim.value
            print("Variable params: ", variable_parameters)
            total_parameters += variable_parameters
            print('')
        print("TOTAL TRAINABLE PARAMS: ", total_parameters)
        print('Initializing TF variables...')

    # The session is closed as soon as the final embeddings are fetched.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if verbose >= 1:
            print('Starting GAE training!')

        # Always set, so `runtime` below is defined for every verbosity level.
        start_time = time.time()

        # Train the model.
        for _ in range(EPOCHS):
            # Construct the feed dictionary.
            feed_dict = construct_feed_dict(adj_norm, adj_label, features_tuple,
                                            placeholders)
            feed_dict.update({placeholders['dropout']: DROPOUT})
            # Single weight update.
            sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

            # Validation score with dropout disabled. Per-epoch embeddings are
            # no longer accumulated in a list; nothing ever read them.
            feed_dict.update({placeholders['dropout']: 0})
            gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)
            gae_score_matrix = np.dot(gae_emb, gae_emb.T)
            roc_curr, _ = get_roc_score(val_edges, val_edges_false,
                                        gae_score_matrix, apply_sigmoid=True)
            val_roc_score.append(roc_curr)

        runtime = time.time() - start_time

        if verbose == 2:
            print("Optimization Finished!")

        # Final embeddings.
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)

    if edge_score_mode == "dot-product":
        # Edge score = inner product of the two node embeddings.
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        gae_val_roc, gae_val_ap = get_roc_score(val_edges, val_edges_false,
                                                gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(test_edges, test_edges_false,
                                                  gae_score_matrix)

    else:
        # "edge-emb": edge embedding = Hadamard product of node embeddings.
        def get_edge_embeddings(edge_list):
            return np.array([np.multiply(gae_emb[edge[0]], gae_emb[edge[1]])
                             for edge in edge_list])

        have_val = len(val_edges) > 0 and len(val_edges_false) > 0

        # Training-set edge embeddings and labels (1 = real edge, 0 = false).
        train_edge_embs = np.concatenate([get_edge_embeddings(train_edges),
                                          get_edge_embeddings(train_edges_false)])
        train_edge_labels = np.concatenate([np.ones(len(train_edges)),
                                            np.zeros(len(train_edges_false))])

        # Validation-set edge embeddings and labels.
        if have_val:
            val_edge_embs = np.concatenate([get_edge_embeddings(val_edges),
                                            get_edge_embeddings(val_edges_false)])
            val_edge_labels = np.concatenate([np.ones(len(val_edges)),
                                              np.zeros(len(val_edges_false))])

        # Test-set edge embeddings and labels.
        test_edge_embs = np.concatenate([get_edge_embeddings(test_edges),
                                         get_edge_embeddings(test_edges_false)])
        test_edge_labels = np.concatenate([np.ones(len(test_edges)),
                                           np.zeros(len(test_edges_false))])

        # Fit a logistic-regression classifier on the training edge embeddings.
        edge_classifier = LogisticRegression(random_state=0, solver='liblinear')
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge score: probability of class 1 (real edge).
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        if have_val:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            gae_val_roc_curve = roc_curve(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_roc_curve = None
            gae_val_ap = None

        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        gae_test_roc_curve = roc_curve(test_edge_labels, test_preds)
        gae_test_pr_curve = precision_recall_curve(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Collect the scores.
    scores = {
        'test_roc': gae_test_roc,
        'test_ap': gae_test_ap,
        'val_roc': gae_val_roc,
        'val_ap': gae_val_ap,
    }
    if edge_score_mode == "edge-emb":
        scores['test_roc_curve'] = gae_test_roc_curve
        scores['val_roc_curve'] = gae_val_roc_curve
        scores['test_pr_curve'] = gae_test_pr_curve
    scores['val_roc_per_epoch'] = val_roc_score
    scores['runtime'] = runtime
    return scores
def gae_for(args):
    """Train a VGAE ten times and write the test ROC/AP of each run to a CSV.

    Each run trains a fresh ``GCNModelVAE`` for ``args.epochs`` epochs and
    keeps the embedding with the highest validation ROC + AP. That embedding
    is scored on the test edges. The per-run scores, followed by their mean
    and standard deviation, are written to ``<cwd>/csv/<name>.csv``.

    Args:
        args: Namespace providing ``dataset_str``, ``hidden1``, ``hidden2``,
            ``dropout``, ``lr`` and ``epochs``.

    Raises:
        RuntimeError: If a run produces no embedding to evaluate (for example
            when ``args.epochs`` is 0).
    """
    print("Using {} dataset".format(args.dataset_str))
    adj, features = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = \
        mask_test_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = torch.FloatTensor(adj_label.toarray())

    # Ratio of non-edges to edges, and the matching loss scale.
    n_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_entries - adj.sum()) / adj.sum()
    norm = n_entries / float((n_entries - adj.sum()) * 2)

    lst_result = []
    for i in range(10):
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        hidden_emb = None
        # Start below every attainable score and with no best embedding, so
        # the final evaluation can never hit an unbound variable.
        max_roc_ap = float("-inf")
        h_emb_best_model = None
        for epoch in range(args.epochs):
            t = time.time()
            model.train()
            optimizer.zero_grad()
            recovered, mu, logvar = model(features, adj_norm)
            loss = loss_function(preds=recovered, labels=adj_label,
                                 mu=mu, logvar=logvar, n_nodes=n_nodes,
                                 norm=norm, pos_weight=pos_weight)
            loss.backward()
            cur_loss = loss.item()
            optimizer.step()

            hidden_emb = mu.data.numpy()
            roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig,
                                              val_edges, val_edges_false)
            # Model selection uses validation ROC + AP only.
            roc_ap = roc_curr + ap_curr
            if max_roc_ap < roc_ap:
                max_roc_ap = roc_ap
                h_emb_best_model = hidden_emb

            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss),
                  "val_ap=", "{:.5f}".format(ap_curr),
                  "time=", "{:.5f}".format(time.time() - t))

            # Test scores of the current epoch, printed for monitoring only.
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig,
                                                test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))
            print("---------------------------------------")

        print("Optimization Finished!: ", i)

        if h_emb_best_model is None:
            raise RuntimeError(
                'no embedding was produced in run {}; check args.epochs'.format(i))

        roc_score, ap_score = get_roc_score(h_emb_best_model, adj_orig,
                                            test_edges, test_edges_false)
        lst_result.append([i, roc_score, ap_score])
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

    lst_result = np.array(lst_result)
    # Appending the labelled rows makes every cell of the table a string.
    csv_info = np.append(
        lst_result,
        [["mean", np.mean(lst_result[:, 1]), np.mean(lst_result[:, 2])]],
        axis=0)
    csv_info = np.append(
        csv_info,
        [["std", np.std(lst_result[:, 1]), np.std(lst_result[:, 2])]],
        axis=0)

    t = int(time.time())
    folder = Path(os.path.join(os.getcwd(), "csv"))
    # Create the output directory; to_csv does not create missing parents.
    folder.mkdir(parents=True, exist_ok=True)
    csv_name = "{}_{}_{}_{}_{}.csv".format(args.dataset_str, args.epochs,
                                           args.hidden1, args.hidden2, t)
    df = pd.DataFrame(csv_info, columns=['run', 'ROC', "AP"])
    df.to_csv(os.path.join(folder, csv_name))
def gcn_multilayer(self):
    """Train a GCN auto-encoder on every layer of a multilayer network.

    For each ``(net_name, net)`` in ``self.nets`` a model is trained for
    ``self.n_iter`` iterations, validation and test ROC/AP are printed, and
    ``model.embeddings`` is written to
    ``self.out_dir + net_name + 'vectors.txt'``.

    Raises:
        ValueError: If ``self.model_str`` is not ``'gcn_ae'`` or ``'gcn_vae'``.
    """
    if self.model_str not in ('gcn_ae', 'gcn_vae'):
        # Previously an unknown name left `opt` unbound and crashed later.
        raise ValueError(
            "model_str must be 'gcn_ae' or 'gcn_vae', got {!r}".format(self.model_str))

    # Result is unused here; the call is kept in case it has side effects --
    # TODO confirm and remove.
    self.get_all_nodes()

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    for net_name, net in self.nets.items():
        self.log.info('Run GCN For Net: %s' % net_name)

        # Build the sparse adjacency matrix directly instead of densifying it
        # and converting back; explicit zeros are dropped, as the dense round
        # trip did.
        adj = sp.csr_matrix(nx.adjacency_matrix(net))
        adj.eliminate_zeros()
        nodes_count = adj.shape[0]
        features = sp.identity(nodes_count)  # featureless

        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = \
            mask_test_edges(adj)
        adj = adj_train

        # Some pre processing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Ratio of non-edges to edges, and the matching loss scale.
        n_entries = adj.shape[0] * adj.shape[0]
        pos_weight = float(n_entries - adj.sum()) / adj.sum()
        norm = n_entries / float((n_entries - adj.sum()) * 2)

        # Create model and optimizer
        if self.model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero,
                               self.hidden1, self.hidden2)
        else:
            model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero,
                                self.hidden1, self.hidden2)

        with tf.name_scope('optimizer'):
            labels = tf.reshape(
                tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
                [-1])
            if self.model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=labels,
                                  pos_weight=pos_weight,
                                  norm=norm)
            else:
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=labels,
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        val_roc_score = []
        dropout = 0

        # One session per network, closed when that network is finished
        # (previously every iteration leaked a session).
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            def get_roc_score(edges_pos, edges_neg, emb=None):
                """Return (ROC AUC, average precision) for the given edge sets."""
                if emb is None:
                    # Reads the feed_dict of the most recent training step.
                    feed_dict.update({placeholders['dropout']: 0})
                    emb = sess.run(model.z_mean, feed_dict=feed_dict)

                # Predict on test set of edges
                adj_rec = np.dot(emb, emb.T)
                preds = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_pos]
                preds_neg = [sigmoid(adj_rec[e[0], e[1]]) for e in edges_neg]

                preds_all = np.hstack([preds, preds_neg])
                labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
                roc_score = roc_auc_score(labels_all, preds_all)
                ap_score = average_precision_score(labels_all, preds_all)

                return roc_score, ap_score

            # Train model
            for epoch in range(self.n_iter):
                self.log.info('Iteration: %d' % epoch)
                t = time.time()
                # Construct feed dictionary
                feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
                feed_dict.update({placeholders['dropout']: dropout})
                # Run single weight update
                outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

                # Compute average loss
                avg_cost = outs[1]
                avg_accuracy = outs[2]

                roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
                val_roc_score.append(roc_curr)

                print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
                      "train_acc=", "{:.5f}".format(avg_accuracy),
                      "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                      "val_ap=", "{:.5f}".format(ap_curr),
                      "time=", "{:.5f}".format(time.time() - t))

            print("Optimization Finished!")

            roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
            print('Test ROC score: ' + str(roc_score))
            print('Test AP score: ' + str(ap_score))

            # Vector generation
            vectors = sess.run(model.embeddings, feed_dict=feed_dict)

        # The path is plain concatenation, so `self.out_dir` must already end
        # with a path separator for the file to land inside it.
        fname = self.out_dir + net_name + 'vectors.txt'
        np.savetxt(fname, np.array(vectors), fmt="%s", delimiter='  ')
        self.log.info('Saving vectors: %s' % fname)
        self.log.info('after exec gcn : %s' % net_name)

    self.log.info('Done!')
def run(self):
    """Load the graph, train the auto-encoder and collect supernode embeddings.

    Data comes from ``graph_generator.load_combo`` when ``self.file_expr`` is
    empty and from ``graph_generator.load_data`` otherwise. After training,
    validation/test ROC and AP are printed and the triple returned by
    ``self.get_embeddings`` is stored in ``self.supernodes``.

    Sets ``self.idx_supernodes``, ``self.adj_orig``, ``self.model``,
    ``self.sess``, ``self.feed_dict`` and ``self.supernodes``.

    Raises:
        ValueError: If ``model_str`` is not ``'gcn_ae'`` or ``'gcn_vae'``.
    """
    # `model_str` is not defined in this method; it is resolved from the
    # enclosing (module) scope.
    if model_str not in ('gcn_ae', 'gcn_vae'):
        # Previously an unknown name left `opt` unbound and crashed later.
        raise ValueError(
            "model_str must be 'gcn_ae' or 'gcn_vae', got {!r}".format(model_str))

    if self.file_expr == '':
        # text-image-code combination
        loaded = graph_generator.load_combo(self.labels_dict)
    else:
        loaded = graph_generator.load_data(
            self.labels_dict,
            self.file_expr,
            min_valid_triples=self.min_valid_triples,
            sep=self.file_sep,
            select_rels=self.select_rels)
    (n_by_n, x_train, y_train, train_mask, val_mask, test_mask,
     idx_supernodes, label_encoder) = loaded
    self.idx_supernodes = idx_supernodes

    # `from_scipy_sparse_matrix` was removed in NetworkX 3.0; prefer its
    # replacement when the installed version provides it.
    graph_from_sparse = getattr(nx, 'from_scipy_sparse_array', None)
    if graph_from_sparse is None:
        graph_from_sparse = nx.from_scipy_sparse_matrix
    adj = nx.adjacency_matrix(graph_from_sparse(n_by_n))

    # Public constructor instead of the deprecated private module path
    # `scipy.sparse.csr.csr_matrix`.
    features = sp.csr_matrix(x_train)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    self.adj_orig = adj_orig

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = \
        mask_test_edges2(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    if not self.use_features:
        features = sp.identity(features.shape[0])  # featureless

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Ratio of non-edges to edges, and the matching loss scale.
    n_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_entries - adj.sum()) / adj.sum()
    norm = n_entries / float((n_entries - adj.sum()) * 2)

    # Create model and optimizer
    if model_str == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, num_features, features_nonzero)
    else:
        self.model = GCNModelVAE(self.placeholders, num_features, num_nodes,
                                 features_nonzero)

    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(self.placeholders['adj_orig'],
                                      validate_indices=False),
            [-1])
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=labels,
                               model=self.model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session. It stays open on the instance because
    # `self.get_roc_score` is called without an embedding and is presumably
    # using `self.sess` / `self.feed_dict` -- verify.
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    val_roc_score = []

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(self.epochs):
        t = time.time()
        # Construct feed dictionary
        self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                             self.placeholders)
        self.feed_dict.update({self.placeholders['dropout']: self.dropout_rate})
        # Run single weight update
        outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy],
                             feed_dict=self.feed_dict)

        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        roc_curr, ap_curr = self.get_roc_score(val_edges, val_edges_false)
        val_roc_score.append(roc_curr)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(avg_cost), "train_acc=",
              "{:.5f}".format(avg_accuracy), "val_roc=",
              "{:.5f}".format(val_roc_score[-1]), "val_ap=",
              "{:.5f}".format(ap_curr), "time=",
              "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    roc_score, ap_score = self.get_roc_score(test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    supernodes, supernodes_embeddings, supernodes_labels = \
        self.get_embeddings(y_train, label_encoder)
    self.supernodes = [supernodes, supernodes_embeddings, supernodes_labels]
def main(args):
    """Compute embeddings using GAE/VGAE.

    Reads an edge list from ``args.inputgraph``, trains the model selected by
    ``args.model`` on the resulting unweighted graph, and then either

    * writes sigmoid edge scores for the node pairs in ``args.tr_e`` to
      ``args.tr_pred`` (and, if ``args.te_e`` is given, for those pairs to
      ``args.te_pred``), or
    * when ``args.tr_e`` is None, writes the embedding matrix to
      ``args.output``.

    Node ids that start at 1 are shifted to start at 0, and the same shift is
    applied to the train/test pairs. Rows of the adjacency matrix, and so of
    the embeddings, are ordered by ascending node id.

    Raises:
        ValueError: If ``args.model`` is not ``'gcn_ae'`` or ``'gcn_vae'``.
    """
    if args.model not in ('gcn_ae', 'gcn_vae'):
        # Previously an unknown name left `opt` unbound and crashed later.
        raise ValueError(
            "args.model must be 'gcn_ae' or 'gcn_vae', got {!r}".format(args.model))

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def load_pairs(path, shift):
        """Load node pairs as a 2-D int array, optionally shifting ids by -1."""
        pairs = np.loadtxt(path, delimiter=args.delimiter, dtype=int, ndmin=2)[:, :2]
        if shift:
            pairs -= 1
        return pairs

    # Load edgelist. ndmin=2 keeps a single-edge file two-dimensional; only
    # the first two columns are node ids, so one-indexing is detected on them
    # alone and any weight column cannot mask it.
    E = np.loadtxt(args.inputgraph, delimiter=args.delimiter, dtype=int, ndmin=2)[:, :2]
    oneIndx = np.min(E) == 1
    if oneIndx:
        E -= 1

    # Create an unweighted graph
    G = nx.Graph()
    G.add_edges_from(E)

    # Get adj matrix of the graph. Rows are ordered by node id, not by the
    # order in which nodes first appear in the edge list, because the scores
    # below are looked up by node id.
    tr_A = nx.adjacency_matrix(G, nodelist=sorted(G.nodes()), weight=None)
    num_nodes = tr_A.shape[0]

    # Set main diag to 1s and normalize (algorithm requirement)
    adj_norm = preprocess_graph(tr_A)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create empty feature matrix
    features = sp.identity(num_nodes)  # featureless
    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Ratio of non-edges to edges, and the matching loss scale.
    n_entries = tr_A.shape[0] * tr_A.shape[0]
    pos_weight = float(n_entries - tr_A.sum()) / tr_A.sum()
    norm = n_entries / float((n_entries - tr_A.sum()) * 2)

    # Create model and optimizer
    if args.model == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    else:
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

    with tf.name_scope('optimizer'):
        labels = tf.reshape(
            tf.sparse_tensor_to_dense(placeholders['adj_orig'], validate_indices=False),
            [-1])
        if args.model == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=labels,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=labels,
                               model=model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    adj_label = tr_A + sp.eye(tr_A.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # The session is closed as soon as the embeddings are fetched.
    # NOTE(review): epochs and dropout come from the module-level FLAGS, not
    # from `args` -- confirm that is intended.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Train model
        for epoch in range(FLAGS.epochs):
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
                  "train_acc=", "{:.5f}".format(outs[2]))

        # Compute predictions
        feed_dict.update({placeholders['dropout']: 0})
        emb = sess.run(model.z_mean, feed_dict=feed_dict)

    # Node similarities
    adj_rec = np.dot(emb, emb.T)

    start = time.time()
    if args.tr_e is not None:
        # Read the train edges and compute similarity
        train_edges = load_pairs(args.tr_e, oneIndx)
        scores = [sigmoid(adj_rec[src, dst]) for src, dst in train_edges]
        np.savetxt(args.tr_pred, scores, delimiter=args.delimiter)

        # Read the test edges and run predictions
        if args.te_e is not None:
            test_edges = load_pairs(args.te_e, oneIndx)
            scores = [sigmoid(adj_rec[src, dst]) for src, dst in test_edges]
            np.savetxt(args.te_pred, scores, delimiter=args.delimiter)
    else:
        # If no edge lists provided to predict links, then just store the embeddings
        np.savetxt(args.output, emb, delimiter=args.delimiter)
    print('Prediction time: {}'.format(time.time() - start))
def fit(self, adj, features, labels):
    """Train the graph auto-encoder on ``adj`` with dense node features.

    Sets ``self.placeholders``, ``self.model``, ``self.sess`` and
    ``self.feed_dict``; the session is left open on the instance.

    Args:
        adj: Sparse adjacency matrix of the graph.
        features: 2-D feature matrix; its second dimension fixes the input
            width of the model.
        labels: Accepted for interface compatibility; not read by this method.

    Raises:
        ValueError: If ``self.model_type`` is not ``'gcn_ae'`` or ``'gcn_vae'``.
    """
    if self.model_type not in ('gcn_ae', 'gcn_vae'):
        # Previously an unknown name left `opt` unbound and crashed in training.
        raise ValueError(
            "model_type must be 'gcn_ae' or 'gcn_vae', got {!r}".format(self.model_type))

    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    features = normalize_vectors(features)

    # Define placeholders; features are fed as a dense matrix.
    self.placeholders = {
        'features': tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, input_feature_dim)),
        'adj': tf.compat.v1.sparse_placeholder(tf.float32),
        'adj_orig': tf.compat.v1.sparse_placeholder(tf.float32),
        'dropout': tf.compat.v1.placeholder_with_default(0., shape=())
    }

    # negative edges / positive edges
    n_entries = adj.shape[0] * adj.shape[0]
    pos_weight = float(n_entries - adj.sum()) / adj.sum()
    # NOTE(review): `norm` counts stored entries (adj.nnz) while `pos_weight`
    # uses adj.sum(); the two agree only for a 0/1 matrix -- confirm intent.
    norm = n_entries / float((n_entries - adj.nnz) * 2)

    # Create model and optimizer
    if self.model_type == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, input_feature_dim)
    else:
        self.model = GCNModelVAE(self.placeholders, input_feature_dim, num_nodes)

    with tf.compat.v1.name_scope('optimizer'):
        target = tf.reshape(
            tf.sparse.to_dense(self.placeholders['adj_orig'], validate_indices=False),
            [-1])
        if self.model_type == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=target,
                              pos_weight=pos_weight,
                              norm=norm)
        else:
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=target,
                               model=self.model,
                               num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    self.sess = tf.compat.v1.Session()
    self.sess.run(tf.compat.v1.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model (epochs and dropout come from the module-level FLAGS).
    for _ in range(FLAGS.epochs):
        # Construct feed dictionary
        self.feed_dict = construct_feed_dict(adj_norm, adj_label, features,
                                             self.placeholders)
        self.feed_dict.update({self.placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        self.sess.run([opt.opt_op, opt.cost, opt.accuracy],
                      feed_dict=self.feed_dict)
def _hadamard_edge_embeddings(node_embs, edge_list):
    """Return one row per edge: the element-wise product of its two node embeddings."""
    return np.array([
        np.multiply(node_embs[edge[0]], node_embs[edge[1]])
        for edge in edge_list
    ])


def gae_scores(
        adj_sparse,
        train_test_split,
        features_matrix=None,
        LEARNING_RATE=0.01,
        EPOCHS=200,
        HIDDEN1_DIM=32,
        HIDDEN2_DIM=16,
        DROPOUT=0,
        edge_score_mode="dot-product",
        verbose=1,
        dtype=tf.float32):
    """Train a variational graph auto-encoder and score held-out edges.

    Args:
        adj_sparse: Sparse adjacency matrix of the full graph; used for the
            node count and for the positive-class weighting.
        train_test_split: 7-tuple ``(adj_train, train_edges,
            train_edges_false, val_edges, val_edges_false, test_edges,
            test_edges_false)``.
        features_matrix: Optional node features; an identity matrix is used
            when ``None``.
        LEARNING_RATE: Learning rate handed to ``OptimizerVAE``.
        EPOCHS: Number of full-batch training steps.
        HIDDEN1_DIM: Passed to ``GCNModelVAE``.
        HIDDEN2_DIM: Passed to ``GCNModelVAE``.
        DROPOUT: Dropout value fed during training steps (0 is fed when
            embeddings are extracted).
        edge_score_mode: ``"dot-product"`` scores node pairs with the inner
            product of their embeddings; ``"edge-emb"`` fits a logistic
            regression on Hadamard-product edge embeddings.
        verbose: ``>= 1`` prints progress and variable sizes, ``2`` also
            prints one line per epoch.
        dtype: Passed to ``GCNModelVAE``.

    Returns:
        dict with keys ``test_roc``, ``test_ap``, ``val_roc``, ``val_ap``,
        ``val_roc_per_epoch`` and ``runtime``.  In ``"edge-emb"`` mode the
        two final validation entries are ``None`` when either validation
        edge list is empty.

    Raises:
        ValueError: If ``edge_score_mode`` is not one of the supported modes.
    """
    # Validate before training: previously an unknown mode surfaced only as
    # an unbound-variable error after all epochs had run.
    if edge_score_mode not in ("dot-product", "edge-emb"):
        raise ValueError(
            "edge_score_mode must be 'dot-product' or 'edge-emb', "
            "got {!r}".format(edge_score_mode))

    # Unpack train-test split
    (adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
     test_edges, test_edges_false) = train_test_split

    if verbose >= 1:
        print('GAE preprocessing...')

    start_time = time.time()

    # Train on CPU (hide GPU) due to memory constraints
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    # Convert features from normal matrix --> sparse matrix --> tuple
    # features_tuple contains:
    # (list of matrix coordinates, list of values, matrix dimensions)
    if features_matrix is None:
        x = sp.lil_matrix(np.identity(adj_sparse.shape[0]))
    else:
        x = sp.lil_matrix(features_matrix)
    features_tuple = sparse_to_tuple(x)
    features_shape = features_tuple[2]

    # Graph attributes fed into the model
    num_nodes = adj_sparse.shape[0]
    num_features = features_shape[1]
    # Length of the values list, i.e. number of stored feature entries
    features_nonzero = features_tuple[1].shape[0]

    # Normalize adjacency matrix
    adj_norm = preprocess_graph(adj_train)

    # Reconstruction target: training adjacency with the diagonal added
    adj_label = sparse_to_tuple(adj_train + sp.eye(adj_train.shape[0]))

    # Define placeholders
    # NOTE(review): placeholders and the optimizer are hard-coded to
    # tf.float16 while the model receives ``dtype`` -- confirm the mix is
    # intended.
    placeholders = {
        'features': tf.sparse_placeholder(tf.float16),
        'adj': tf.sparse_placeholder(tf.float16),
        'adj_orig': tf.sparse_placeholder(tf.float16),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # How much to weigh positive examples (true edges) in the cost function.
    # Less-frequent classes are weighted higher to prevent output bias:
    # pos_weight = (num. negative samples) / (num. positive samples)
    num_pairs = adj_sparse.shape[0] * adj_sparse.shape[0]
    num_positive = adj_sparse.sum()
    pos_weight = float(num_pairs - num_positive) / num_positive

    # Normalize (scale) average weighted cost
    norm = num_pairs / float((num_pairs - num_positive) * 2)

    if verbose >= 1:
        print('Initializing GAE model...')

    # Create VAE model
    model = GCNModelVAE(placeholders, num_features, num_nodes,
                        features_nonzero, HIDDEN1_DIM, HIDDEN2_DIM,
                        dtype=dtype, flatten_output=False)

    opt = OptimizerVAE(preds=model.reconstructions,
                       labels=tf.sparse_tensor_to_dense(
                           placeholders['adj_orig'], validate_indices=False),
                       model=model, num_nodes=num_nodes,
                       pos_weight=pos_weight,
                       norm=norm,
                       learning_rate=LEARNING_RATE,
                       dtype=tf.float16)

    val_roc_score = []

    # Initialize session; closed in ``finally`` so it is released even when
    # training raises.
    sess = tf.Session()
    try:
        if verbose >= 1:
            # Print total number of trainable parameters
            total_parameters = 0
            for variable in tf.trainable_variables():
                # shape is an array of tf.Dimension
                shape = variable.get_shape()
                print("Variable shape:  {}".format(shape))
                variable_parameters = 1
                for dim in shape:
                    print("Current dimension:  {}".format(dim))
                    variable_parameters *= dim.value
                print("Variable params:  {}".format(variable_parameters))
                total_parameters += variable_parameters
                print('')
            print("TOTAL TRAINABLE PARAMS:  {}".format(total_parameters))

            print('Initializing TF variables...')

        sess.run(tf.global_variables_initializer())

        if verbose >= 1:
            print('Starting GAE training!')

        # Train model
        for epoch in range(EPOCHS):
            t = time.time()

            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label,
                                            features_tuple, placeholders)
            feed_dict.update({placeholders['dropout']: DROPOUT})

            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy],
                            feed_dict=feed_dict)
            avg_cost = outs[1]
            avg_accuracy = outs[2]

            # Evaluate predictions with dropout switched off
            feed_dict.update({placeholders['dropout']: 0})
            gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)
            gae_score_matrix = np.dot(gae_emb, gae_emb.T)

            roc_curr, ap_curr = get_roc_score(
                val_edges, val_edges_false, gae_score_matrix,
                apply_sigmoid=True)
            val_roc_score.append(roc_curr)

            # Print results for this epoch (single string so the line reads
            # the same under Python 2 and Python 3)
            if verbose == 2:
                print(
                    "Epoch: {:04d} train_loss= {:.5f} train_acc= {:.5f} "
                    "val_roc= {:.5f} val_ap= {:.5f} time= {:.5f}".format(
                        epoch + 1, avg_cost, avg_accuracy,
                        val_roc_score[-1], ap_curr, time.time() - t))

        if verbose == 2:
            print("Optimization Finished!")

        # Final embeddings, dropout off
        feed_dict.update({placeholders['dropout']: 0})
        gae_emb = sess.run(model.z_mean, feed_dict=feed_dict)
    finally:
        sess.close()

    if edge_score_mode == "dot-product":
        # Dot product edge scores (default)
        gae_score_matrix = np.dot(gae_emb, gae_emb.T)

        runtime = time.time() - start_time

        # Calculate final scores
        gae_val_roc, gae_val_ap = get_roc_score(
            val_edges, val_edges_false, gae_score_matrix)
        gae_test_roc, gae_test_ap = get_roc_score(
            test_edges, test_edges_false, gae_score_matrix)

    else:
        # "edge-emb": bootstrapped edge embeddings (Hadamard product) scored
        # by a logistic-regression classifier.
        has_val = len(val_edges) > 0 and len(val_edges_false) > 0

        # Train-set edge embeddings and labels: 1 = real edge, 0 = false edge
        train_edge_embs = np.concatenate([
            _hadamard_edge_embeddings(gae_emb, train_edges),
            _hadamard_edge_embeddings(gae_emb, train_edges_false)])
        train_edge_labels = np.concatenate([
            np.ones(len(train_edges)), np.zeros(len(train_edges_false))])

        # Val-set edge embeddings, labels
        if has_val:
            val_edge_embs = np.concatenate([
                _hadamard_edge_embeddings(gae_emb, val_edges),
                _hadamard_edge_embeddings(gae_emb, val_edges_false)])
            val_edge_labels = np.concatenate([
                np.ones(len(val_edges)), np.zeros(len(val_edges_false))])

        # Test-set edge embeddings, labels
        test_edge_embs = np.concatenate([
            _hadamard_edge_embeddings(gae_emb, test_edges),
            _hadamard_edge_embeddings(gae_emb, test_edges_false)])
        test_edge_labels = np.concatenate([
            np.ones(len(test_edges)), np.zeros(len(test_edges_false))])

        # Train logistic regression classifier on train-set edge embeddings
        edge_classifier = LogisticRegression(random_state=0)
        edge_classifier.fit(train_edge_embs, train_edge_labels)

        # Predicted edge scores: probability of class "1" (real edge)
        if has_val:
            val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
        test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]

        runtime = time.time() - start_time

        # Calculate scores
        if has_val:
            gae_val_roc = roc_auc_score(val_edge_labels, val_preds)
            gae_val_ap = average_precision_score(val_edge_labels, val_preds)
        else:
            gae_val_roc = None
            gae_val_ap = None

        gae_test_roc = roc_auc_score(test_edge_labels, test_preds)
        gae_test_ap = average_precision_score(test_edge_labels, test_preds)

    # Record scores
    scores = {
        'test_roc': gae_test_roc,
        'test_ap': gae_test_ap,
        'val_roc': gae_val_roc,
        'val_ap': gae_val_ap,
        'val_roc_per_epoch': val_roc_score,
        'runtime': runtime,
    }
    return scores
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering

    Trains a graph auto-encoder (variational when args.GAEmodel is
    'gcn_vae') with z as node features and adj as the graph, logging one
    line per epoch and the test ROC / AP scores through tqdm.write.

    Param:
        z: node features, converted to a torch.FloatTensor
        adj: adjacency matrix of the graph (scipy sparse operations are
            applied to it)
        args: options; GAEmodel, GAEhidden1, GAEhidden2, GAEdropout,
            GAElr, GAEepochs and precisionModel are read
    Return:
        Embedding from graph: the mu output of the last epoch as a numpy
        array
    '''
    features = torch.FloatTensor(z)
    n_nodes, feat_dim = features.shape

    # Diagonal-free copy of the full adjacency, used for the final scoring.
    diagonal = sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]),
                             shape=adj.shape)
    adj_orig = adj - diagonal
    adj_orig.eliminate_zeros()

    (adj_train, train_edges, val_edges, val_edges_false,
     test_edges, test_edges_false) = mask_test_edges(adj)

    # Inputs and targets derived from the training graph.
    adj_norm = preprocess_graph(adj_train)
    adj_label = torch.FloatTensor(
        (adj_train + sp.eye(adj_train.shape[0])).toarray())

    # Class re-weighting terms for the reconstruction loss.
    n_entries = adj_train.shape[0] * adj_train.shape[0]
    n_positive = adj_train.sum()
    pos_weight = float(n_entries - n_positive) / n_positive
    norm = n_entries / float((n_entries - n_positive) * 2)

    model_cls = GCNModelVAE if args.GAEmodel == 'gcn_vae' else GCNModelAE
    model = model_cls(feat_dim, args.GAEhidden1, args.GAEhidden2,
                      args.GAEdropout)
    # NOTE(review): features and adj_label stay float32 when the model is
    # switched to double -- confirm this combination is supported.
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        epoch_start = time.time()

        model.train()
        optimizer.zero_grad()
        latent, mu, logvar = model(features, adj_norm)
        loss = loss_function(preds=model.dc(latent),
                             labels=adj_label,
                             mu=mu,
                             logvar=logvar,
                             n_nodes=n_nodes,
                             norm=norm,
                             pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()

        # TODO: per-epoch validation scoring is disabled, so val_ap is
        # logged as a constant 0.
        ap_curr = 0
        elapsed = time.time() - epoch_start
        tqdm.write(
            "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".
            format(epoch + 1, cur_loss, ap_curr, elapsed))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
'features': tf.sparse_placeholder(tf.float32), 'adj': tf.sparse_placeholder(tf.float32), 'adj_orig': tf.sparse_placeholder(tf.float32), 'dropout': tf.placeholder_with_default(0., shape=()) } num_nodes = adj.shape[0] #网络中结点的个数 features = sparse_to_tuple(features.tocoo( )) #特征转化为稀疏矩阵存储,存成tuple的形式,第一行存一对对关系,第二行存每对关系的值,第三行存行列数 num_features = features[2][1] #特征的维度 features_nonzero = features[1].shape[0] #有特征的节点个数 # Create model model = None model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero) pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum() #负样本与正样本的比例 print('adj.shape', adj.shape[0], adj.sum()) norm = adj.shape[0] * adj.shape[0] / float( (adj.shape[0] * adj.shape[0] - adj.sum()) * 2) # shape[0] = 913, adj.sum() = 10734 # Optimizer opt = OptimizerVAE(preds=model.reconstructions, labels=tf.reshape( tf.sparse_tensor_to_dense( placeholders['adj_orig'], validate_indices=False), [-1]), model=model,
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering

    Fits a graph auto-encoder on the graph adj using z as node features
    and returns the learned node embedding.  Progress and the final test
    ROC / AP scores are reported with tqdm.write.

    Param:
        z: node feature matrix (wrapped in a torch.FloatTensor)
        adj: adjacency matrix; scipy sparse operations are applied to it
        args: settings object providing GAEmodel, GAEhidden1, GAEhidden2,
            GAEdropout, GAElr, GAEepochs and precisionModel
    Return:
        Embedding from graph (numpy array taken from mu in the last epoch)
    '''
    # Node features come straight from z.
    features = torch.FloatTensor(z)
    n_nodes, feat_dim = features.shape

    # Store the original adjacency matrix without diagonal entries; it is
    # needed again when the test edges are scored.
    self_loops = sp.dia_matrix(
        (adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj_orig = adj - self_loops
    adj_orig.eliminate_zeros()

    split = mask_test_edges(adj)
    (adj_train, train_edges, val_edges, val_edges_false, test_edges,
     test_edges_false) = split

    # Some preprocessing
    adj_norm = preprocess_graph(adj_train)
    dense_target = (adj_train + sp.eye(adj_train.shape[0])).toarray()
    adj_label = torch.FloatTensor(dense_target)

    total_cells = adj_train.shape[0] * adj_train.shape[0]
    edge_mass = adj_train.sum()
    negatives = total_cells - edge_mass
    pos_weight = float(negatives) / edge_mass
    norm = total_cells / float(negatives * 2)

    # Anything other than 'gcn_vae' selects the plain auto-encoder.
    if args.GAEmodel != 'gcn_vae':
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                           args.GAEdropout)
    else:
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2,
                            args.GAEdropout)
    # NOTE(review): only the model is cast to double here; features and
    # adj_label remain float32 -- confirm the 'Double' setting works.
    if args.precisionModel == 'Double':
        model = model.double()

    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    log_template = (
        "Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}")
    for epoch in tqdm(range(args.GAEepochs)):
        tic = time.time()

        model.train()
        optimizer.zero_grad()
        encoded, mu, logvar = model(features, adj_norm)

        loss = loss_function(
            preds=model.dc(encoded),
            labels=adj_label,
            mu=mu,
            logvar=logvar,
            n_nodes=n_nodes,
            norm=norm,
            pos_weight=pos_weight,
        )
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        # Keep the latest mean embedding as a numpy array.
        hidden_emb = mu.data.numpy()

        # TODO: validation scoring is switched off; a fixed 0 is reported.
        ap_curr = 0

        tqdm.write(
            log_template.format(epoch + 1, cur_loss, ap_curr,
                                time.time() - tic))

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges,
                                        test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb