def gae_for(args):
    print("Using {} dataset".format(args.dataset_str))
    adj, features, y_test, tx, ty, test_mask, true_labels = load_data(args.dataset_str)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # Before proceeding further, make the structure for doing deepWalk
    if args.dw == 1:
        print('Using deepWalk regularization...')
        G = load_edgelist_from_csr_matrix(adj_orig, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * args.number_walks
        print("Number of walks: {}".format(num_walks))
        data_size = num_walks * args.walk_length
        print("Data size (walks*length): {}".format(data_size))

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.model == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    else:
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.dw == 1:
        sg = SkipGram(args.hidden2, adj.shape[0])
        optimizer_dw = optim.Adam(sg.parameters(), lr=args.lr_dw)

        # Construct the nodes for doing random walks. Done up front since the seed is fixed.
        nodes_in_G = list(G.nodes())
        chunks = len(nodes_in_G) // args.number_walks
        random.Random().shuffle(nodes_in_G)

    hidden_emb = None
    for epoch in tqdm(range(args.epochs)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # DeepWalk regularization (the GAE loss is back-propagated below)
        if args.dw == 1:
            sg.train()
            if args.full_number_walks > 0:
                walks = build_deepwalk_corpus(G, num_paths=args.full_number_walks,
                                              path_length=args.walk_length, alpha=0,
                                              rand=random.Random(SEED))
            else:
                walks = build_deepwalk_corpus_iter(G, num_paths=args.number_walks,
                                                   path_length=args.walk_length, alpha=0,
                                                   rand=random.Random(SEED),
                                                   chunk=epoch % chunks,
                                                   nodes=nodes_in_G)
            for walk in walks:
                if args.context == 1:
                    # Collect windowed context nodes for this walk, keyed by its start node
                    curr_pair = (int(walk[0]), [])
                    for center_node_pos in range(len(walk)):
                        # for each window position
                        for w in range(-args.window_size, args.window_size + 1):
                            context_node_pos = center_node_pos + w
                            # make sure we don't jump outside the walk
                            if context_node_pos < 0 or context_node_pos >= len(walk) or center_node_pos == context_node_pos:
                                continue
                            context_node_idx = walk[context_node_pos]
                            curr_pair[1].append(int(context_node_idx))
                else:
                    # first item in the walk is the starting node
                    curr_pair = (int(walk[0]), [int(context_node_idx) for context_node_idx in walk[1:]])

                if args.ns == 1:
                    neg_nodes = []
                    pos_nodes = set(walk)
                    while len(neg_nodes) < args.walk_length - 1:
                        rand_node = random.randint(0, n_nodes - 1)
                        if rand_node not in pos_nodes:
                            neg_nodes.append(rand_node)
                    neg_nodes = torch.from_numpy(np.array(neg_nodes)).long()

                # Do actual prediction
                src_node = torch.from_numpy(np.array([curr_pair[0]])).long()
                tgt_nodes = torch.from_numpy(np.array(curr_pair[1])).long()
                optimizer_dw.zero_grad()
                log_pos = sg(src_node, tgt_nodes, neg_sample=False)
                if args.ns == 1:
                    loss_neg = sg(src_node, neg_nodes, neg_sample=True)
                    loss_dw = log_pos + loss_neg
                else:
                    loss_dw = log_pos
                loss_dw.backward(retain_graph=True)
                cur_dw_loss = loss_dw.item()
                optimizer_dw.step()

        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)

        if args.dw == 1:
            tqdm.write("Epoch: {}, train_loss_gae={:.5f}, train_loss_dw={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, cur_dw_loss, ap_curr, time.time() - t))
        else:
            tqdm.write("Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
                epoch + 1, cur_loss, ap_curr, time.time() - t))

        if (epoch + 1) % 10 == 0:
            tqdm.write("Evaluating intermediate results...")
            kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
            predict_labels = kmeans.predict(hidden_emb)
            cm = clustering_metrics(true_labels, predict_labels)
            cm.evaluationClusterModelFromLabel(tqdm)
            roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
            tqdm.write('ROC: {}, AP: {}'.format(roc_score, ap_score))
            np.save('logs/emb_epoch_{}.npy'.format(epoch + 1), hidden_emb)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))
    kmeans = KMeans(n_clusters=args.n_clusters, random_state=0).fit(hidden_emb)
    predict_labels = kmeans.predict(hidden_emb)
    cm = clustering_metrics(true_labels, predict_labels)
    cm.evaluationClusterModelFromLabel(tqdm)
    if args.plot == 1:
        cm.plotClusters(tqdm, hidden_emb, true_labels)
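# The training loop above calls a `loss_function` helper that is not shown in
# this file. A minimal sketch, assuming the standard VGAE objective (norm-
# weighted binary cross-entropy on the reconstructed adjacency plus a KL term
# averaged over nodes); the actual implementation in the source repo may differ.
import torch
import torch.nn.functional as F

def loss_function(preds, labels, mu, logvar, n_nodes, norm, pos_weight):
    # Weighted reconstruction loss over all node pairs; pos_weight compensates
    # for the sparsity of positive edges. preds are logits (model.dc(z)).
    cost = norm * F.binary_cross_entropy_with_logits(
        preds, labels, pos_weight=torch.as_tensor(pos_weight))
    # KL divergence to the standard normal prior, averaged over nodes
    # (treating logvar as log sigma, as in common GAE ports).
    kld = -0.5 / n_nodes * torch.mean(
        torch.sum(1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2), 1))
    return cost + kld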
# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}

num_nodes = adj.shape[0]

features = sparse_to_tuple(features.tocoo())
num_features = features[2][1]
features_nonzero = features[1].shape[0]

# Create model
model = None
if model_str == 'gcn_ae':
    model = GCNModelAE(placeholders, num_features, features_nonzero)
elif model_str == 'gcn_vae':
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

# Optimizer
with tf.name_scope('optimizer'):
    if model_str == 'gcn_ae':
        opt = OptimizerAE(preds=model.reconstructions,
                          labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                      validate_indices=False), [-1]),
                          pos_weight=pos_weight,
                          norm=norm)
    elif model_str == 'gcn_vae':
        opt = OptimizerVAE(preds=model.reconstructions,
                           labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                       validate_indices=False), [-1]),
                           model=model, num_nodes=num_nodes,
                           pos_weight=pos_weight,
                           norm=norm)
    'adj_l1_splits': tf.placeholder(dtype=tf.int32),
    'adj_orig_values': tf.placeholder(dtype=tf.float32),
    'adj_orig_l2_splits': tf.placeholder(dtype=tf.int32),
    'adj_orig_l1_splits': tf.placeholder(dtype=tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'pos_weight': tf.placeholder(dtype=tf.float32),
    'norm': tf.placeholder(dtype=tf.float32),
    'ROC_Score': tf.placeholder(dtype=tf.float32),
    'AP': tf.placeholder(dtype=tf.float32)
}
# (?, 3703, 32)

# Create model
model = None
if model_str == 'gcn_ae':
    model = GCNModelAE(placeholders, 300, 0)
elif model_str == 'gcn_vae':
    model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

# Optimizer
with tf.name_scope('optimizer'):
    if model_str == 'gcn_ae':
        labels_placeholders = (placeholders['adj_orig_values'],
                               placeholders['adj_orig_l2_splits'],
                               placeholders['adj_orig_l1_splits'])
        opt = OptimizerAE(preds=model.reconstructions,
                          batch_size=batchsize,
                          labels=labels_placeholders,
                          pos_weight=placeholders['pos_weight'],
                          norm=placeholders['norm'],
                          roc=placeholders['ROC_Score'],
                          ap=placeholders['AP'])
num_nodes = adj.shape[0]

features = sparse_to_tuple(features.tocoo())
num_features = features[2][1]
features_nonzero = features[1].shape[0]

adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
roc_score_arr, aupr_score_arr, precision_arr, recall_arr, accuracy_arr, f_arr = [], [], [], [], [], []
print("split end")

CV = 5
for i in range(CV):
    adj = adj_train[i]

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    model = GCNModelAE(placeholders, num_features, features_nonzero)
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)

    # Initialize session
    sess = tf.Session()
def __init__(self, graph_edgelist, num_actions, dimension, learning_rate=0.01,
             epochs=300, hidden1=32, hidden2=16, dropout=0.,
             model_str='gcn_vae', use_features=0):
    """Initialize ExactBasis."""
    if graph_edgelist is None:
        raise ValueError('graph cannot be None')
    if dimension < 1:
        raise ValueError('dimension must be >= 1')
    self.__num_actions = BasisFunction._validate_num_actions(num_actions)
    self._dimension = dimension

    adj, features = self.read_graph(graph_edgelist)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    # adj = adj_train

    if use_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero, hidden1, hidden2, dimension)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, hidden1, dimension)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm,
                              learning_rate=learning_rate)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm,
                               learning_rate=learning_rate)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(epochs):
        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)

    print("GCN Optimization Finished!")

    feed_dict.update({placeholders['dropout']: 0})
    self.embeddings = sess.run(model.z_mean, feed_dict=feed_dict)
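# All of the variants in this file call `preprocess_graph`, which is not
# reproduced here. A minimal sketch under the usual GCN convention,
# symmetrically normalizing A + I; the TF versions return a sparse tuple,
# while the PyTorch versions typically convert to a torch sparse tensor instead.
import numpy as np
import scipy.sparse as sp

def preprocess_graph(adj):
    # A_hat = D^{-1/2} (A + I) D^{-1/2}, the renormalized adjacency
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
    return sparse_to_tuple(adj_normalized)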
def gae_for(args, position):
    print("Using {} dataset".format(args.dataset_str))
    # qhashes, chashes = load_hashes()
    Q, X = load_data()
    prebuild = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/GEM_wDis_prebuild.bin"
    Q_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_lw_query_feats.npy"
    # "/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy"
    X_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_index.npy"
    D_features = "/media/chundi/3b6b0f74-0ac7-42c7-b76b-00c65f5b3673/revisitop/cnnimageretrieval-pytorch/data/test/matlab_data/roxford5k_GEM_Dis.npy"
    adj, features, adj_Q, features_Q = load_from_prebuild(prebuild, Q_features, X_features, D_features, k=5)  # ----> 1M
    # cut_size = 800000
    # adj = adj[:cut_size, :cut_size]
    # adj_Q = adj_Q[:, :cut_size]
    # features = features[:cut_size]
    # Q = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_query_fused.npy").T.astype(np.float32)
    # X = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/oxfordRe/evaluation/roxHD_index_fused.npy").T.astype(np.float32)
    # D = np.load("/media/jason/cc0aeb62-0bc7-4f3e-99a0-3bba3dd9f8fc/landmarks/revisitop1m/revisitDistractors_fused_3s_cq.npy").T.astype(np.float32)
    # X = np.concatenate((X.T, D.T)).T  # load the distractors too, shape should be (2048, 1M)
    # adj, features = gen_graph_index(Q, X, k=5, k_qe=3, do_qe=False)  # -----> 5k
    # adj_Q, features_Q = gen_graph(Q, X, k=5, k_qe=3, do_qe=False)  # generate validation/revop evaluation the same way as training ----> 5k

    features_all = np.concatenate([features_Q, features])
    features_all = torch.from_numpy(features_all)
    # adj_Q = adj_Q.todense()
    # adj_all = np.concatenate([adj_Q, adj.todense()])
    # adj_all = np.pad(adj_all, [[0, 0], [Q.shape[1], 0]], "constant")
    adj_all = sp.vstack((adj_Q, adj))
    zeros = sp.csr_matrix((adj_all.shape[0], Q.shape[1]))
    adj_all = sp.hstack((zeros, adj_all))
    adj_all = sp.csr_matrix(adj_all)
    rows, columns = adj_all.nonzero()
    print("Making Symmetry")
    for i in range(rows.shape[0]):
        if rows[i] < Q.shape[1]:
            adj_all[columns[i], rows[i]] = adj_all[rows[i], columns[i]]
        else:
            break
    # adj_all = sp.csr_matrix(adj_all)
    print("preprocessing adj_all")
    adj_all_norm = preprocess_graph(adj_all)
    # adj = add_neighbours_neighbour(adj)
    # adj1, features1 = load_data(args.dataset_str)
    features = torch.from_numpy(features)
    # features_all = torch.from_numpy(features_all)
    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    adj = adj_orig

    # adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    print("Sampling validation")
    adj_train, adj_val, features, features_valid = mask_test_rows(adj, features)
    adj = adj_train

    # Some preprocessing
    print("preprocessing adj")
    adj_norm = preprocess_graph(adj)
    # adj_norm_label = preprocess_graph_sp(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = adj_norm_label + sp.eye(adj_train.shape[0])
    # rows, columns = adj_label.nonzero()
    # adj_label[columns, rows] = adj_label[rows, columns]
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.FloatTensor(adj_label.toarray())
    print("adj sum: " + str(adj.sum()))
    pos_weight = float(float(adj.shape[0]) * adj.shape[0] - adj.sum()) / adj.sum()
    print("top part: " + str(float(float(adj.shape[0]) * adj.shape[0] - adj.sum())))
    print("pos weight: " + str(pos_weight))
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # For validation data processing:
    zero = sp.csr_matrix((adj_train.shape[0], adj_val.shape[0]))
    adj_train_ext = sp.hstack((zero, adj_train))
    adj_evaluate = sp.vstack((adj_val, adj_train_ext))
    adj_evaluate = sp.csr_matrix(adj_evaluate)
    rows, columns = adj_evaluate.nonzero()
    val_edges = []
    val_edges_false = []
    pos = {}
    print("getting positive edges")
    all_val = [i for i in range(len(rows)) if rows[i] < adj_val.shape[0]]
    for i in all_val:
        sys.stdout.write("\r sampling edges for validation: [" + str(i) + "]")
        val_edges.append((rows[i], columns[i]))
        if rows[i] not in pos:
            pos[rows[i]] = []
        pos[rows[i]].append(columns[i])
    # for i in range(rows.shape[0]):
    #     sys.stdout.write("\r sampling edges for validation: [" + str(i) + "/" + str(adj_val.shape[0]) + "]")
    #     sys.stdout.flush()
    #     if rows[i] < adj_val.shape[0]:
    #         val_edges.append((rows[i], columns[i]))
    #         if rows[i] not in pos:
    #             pos[rows[i]] = []
    #         pos[rows[i]].append(columns[i])
    #         adj_evaluate[columns[i], rows[i]] = adj_evaluate[rows[i], columns[i]]
    #     else:
    #         break
    step = 0
    neg_per_pos = 100
    # neg_edges = Parallel(n_jobs=40)(delayed(neg_sample)(pos[i], adj_val.shape[1], neg_per_pos, i) for i in pos)
    # val_edges_false = [(i, item) for i in range(len(neg_edges)) for item in neg_edges[i]]
    # for r in pos:
    #     p = pos[r]
    #     a = np.random.permutation(adj_val.shape[1])
    #     a = [i for i in a if i not in p]
    #     a = a[:100]
    #     for i in a:
    #         val_edges_false.append((r, i))
    #     sys.stdout.write("\r sampling neg edges for validation: [" + str(step) + "/" + str(len(pos)) + "]")
    #     sys.stdout.flush()
    #     step += 1
    #     count = 0
    #     i = 0
    #     while count < 100:
    #         if a[i] not in p:
    #             val_edges_false.append((r, a[i]))
    #             count += 1
    #         i += 1
    print("preprocessing adj_evaluate")
    adj_evaluate_norm = preprocess_graph(adj_evaluate)
    # adj_evaluate_norm_label = preprocess_graph_sp(adj_evaluate)
    adj_label_evaluate = adj_evaluate + sp.eye(adj_evaluate.shape[0])
    # adj_label_evaluate = adj_evaluate_norm_label + sp.eye(adj_evaluate.shape[0])
    # adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    # adj_label_evaluate = sparse_mx_to_torch_sparse_tensor(adj_label_evaluate)
    features_evaluate = np.concatenate([features, features_valid])
    features_evaluate = torch.from_numpy(features_evaluate)
    # validation done

    # `mode` is assumed to be set at module level: "VAE", "AE", "VAE_batch", or "AE_batch".
    if mode == "VAE":
        model = GCNModelVAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        adj_label = torch.FloatTensor(adj_label.toarray())
        adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "AE":
        model = GCNModelAE(feat_dim, args.hidden1, args.hidden2, args.dropout)
        # adj_label = torch.FloatTensor(adj_label.toarray())
        # adj_label_evaluate = torch.FloatTensor(adj_label_evaluate.toarray())
    elif mode == "VAE_batch":
        model = GCNModelVAE_batch(feat_dim, args.hidden1, args.hidden2, args.dropout)
    elif mode == "AE_batch":
        model = GCNModelAE_batch(feat_dim, args.hidden1, args.hidden2, args.dropout).cuda()
    # model = torch.nn.DataParallel(model)
    # model = model.cuda()

    # train_dataset = GAEDataset(adj_norm, adj_label, features)
    # train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_ids = torch.tensor(range(features.shape[0]), dtype=torch.long)
    train_dataset = TensorDataset(train_ids)
    train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                              shuffle=True, num_workers=8, pin_memory=True)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # optimizer = pSGLD(model.parameters(), lr=args.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

    hidden_emb = None
    # NOTE: this overrides the pos_weight computed above with 0.
    pos_weight = torch.from_numpy(np.array(0.0, dtype=np.float32))
    # ipdb.set_trace()
    t = time.time()
    best = 0
    best_epoch = 0
    best_val_cost = 99999
    best_val_epoch = 0
    best_val_epoch_revop = 0
    prev_loss = 0
    prev_val_loss = 99999
    best_val_roc = 0
    best_val_ap = 0
    best_val_roc_revop = 0
    best_val_ap_revop = 0
    torch.set_num_threads(20)
    print("NUM THREADS USED")
    print(torch.get_num_threads())

    for epoch in range(args.epochs):
        # t = time.time()
        model.train()
        lossVal = 0
        lossValNorm = 0
        backtime = time.time()
        for batchID, inds in enumerate(train_loader):
            z = model(features, adj_norm)
            inds = inds[0]
            adj = F.relu(torch.mm(z[inds], z[inds].t()))
            preds = adj
            label_batch = torch.FloatTensor(adj_label[inds, :][:, inds].toarray())
            cost = norm * F.binary_cross_entropy_with_logits(preds, label_batch, pos_weight=pos_weight)
            lossVal += cost.item()
            lossValNorm += 1
            optimizer.zero_grad()
            cost.backward(retain_graph=True)
            # if batchID == 0:
            #     cost.backward(retain_graph=True)
            # else:
            #     cost.backward()
            optimizer.step()
            if batchID >= 10:
                break
        backtime_done = time.time() - backtime
        sys.stdout.write("\r time taken to do epoch: " + str(backtime_done) + " opt: " + str(time.time() - backtime) + "\n")
        sys.stdout.flush()
        # optimizer.step()

        # Sample rows only (non-square):
        # for i in range(0, d.shape[0], sample_size):
        #     selection = random_perm[i:i + sample_size]
        #     to_keep = selection
        #     sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :]))
        #     sample_adj = adj[to_keep, :]
        #     sample_features = features
        #     recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #     loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                          norm=norm, pos_weight=pos_weight)
        #     loss.backward()
        #     cur_loss = loss.item()
        #     optimizer.step()
        #     sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]))
        #     sys.stdout.flush()

        # Sample rows + take their positives and add to rows (make it square):
        # for i in range(0, d.shape[0], sample_size):
        #     selection = random_perm[i:i + sample_size]
        #     to_keep = np.nonzero(d[selection, :])
        #     # (array([0, 1, 2, 2]), array([0, 1, 0, 1]))
        #     the_set = set(list(to_keep[0]) + list(to_keep[1]))
        #     temp = set(list(to_keep[0]))
        #     to_keep = list(the_set - column_exclude)
        #     # column_exclude.union(temp)
        #     # these are the rows and columns that we need to select
        #     sample_adj_norm = sparse_mx_to_torch_sparse_tensor(sp.coo_matrix(d[to_keep, :][:, to_keep]))
        #     sample_features = features[to_keep, :]
        #     sample_adj_label = adj_label[to_keep, :][:, to_keep]
        #     sample_adj = adj[to_keep, :][:, to_keep]
        #     pos_weight = float(sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) / sample_adj.sum()
        #     pos_weight = torch.from_numpy(np.array(pos_weight))
        #     norm = sample_adj.shape[0] * sample_adj.shape[0] / float((sample_adj.shape[0] * sample_adj.shape[0] - sample_adj.sum()) * 2)
        #     n_nodes, feat_dim = sample_features.shape
        #     if mode == "VAE":
        #         recovered, mu, logvar = model(sample_features, sample_adj_norm)
        #         loss = loss_function(preds=recovered, labels=sample_adj_label,
        #                              mu=mu, logvar=logvar, n_nodes=n_nodes,
        #                              norm=norm, pos_weight=pos_weight)
        #     elif mode == "AE":
        #         recovered = model(sample_features, sample_adj_norm)
        #         loss = loss_function_ae(preds=recovered, labels=sample_adj_label,
        #                                 norm=norm, pos_weight=pos_weight)
        #     loss.backward()
        #     cur_loss = loss.item()
        #     optimizer.step()
        #     sys.stdout.write("\r" + "sampling [" + str(i) + "/" + str(d.shape[0]) + "]....size of sample=" + str(len(sample_features)))
        #     sys.stdout.flush()
        sys.stdout.write("\r" + " " * 60 + "\r")  # clear the progress line

        if (epoch + 1) % 1 == 0:
            model.eval()
            # (A disabled variant rebuilt adj_evaluate / adj_label_evaluate /
            # features_evaluate here every epoch; the live versions are built
            # once before the training loop above.)
            just_adj_evaluate = sparse_mx_to_torch_sparse_tensor(adj_evaluate)
            # Disabled validation-loss variants, one per mode:
            # recovered, mu, logvar = model(features_evaluate, adj_evaluate_norm)
            # val_loss = loss_function(preds=recovered, labels=adj_label_evaluate,
            #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                          norm=norm, pos_weight=pos_weight)
            # val_emb = mu.data.numpy()
            # roc_curr, ap_curr = get_roc_score(val_emb, val_edges, val_edges_false)

            # Do one q at a time:
            # revop_map = eval_each_q(model, adj_all, features_all, Q.shape[1])

            # Hack by appending stuff on top of adj:
            if mode == "VAE":
                _, mu, _ = model(features_all, adj_all_norm)
            elif mode == "AE":
                mu = model(features_all, adj_all_norm)
            elif mode == "VAE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None, None, just_mu=True, training=False)
            elif mode == "AE_batch":
                mu = model(features_all, adj_all_norm, None, None, None, None, None, just_mu=True, training=False)
            hidden_emb = mu.data.numpy()

            # Get validation loss:
            # recovered, mu, logvar = model(features, adj_norm)
            # val_loss = loss_function(preds=recovered, labels=adj_label,
            #                          mu=mu, logvar=logvar, n_nodes=n_nodes,
            #                          norm=norm, pos_weight=pos_weight)
            revop_map = get_roc_score_matrix(hidden_emb, Q.shape[1])
            if best <= revop_map:
                emb = hidden_emb
                Q_end = Q.shape[1]
                best = revop_map
                best_epoch = epoch + 1
                # Write it into a file and do EGT on that:
                # embQ = emb[:Q_end, :].T
                # embX = emb[Q_end:, :].T
                # np.save("/media/jason/28c9eee1-312e-47d0-88ce-572813ebd6f1/graph/gae-pytorch/best_embedding2.npy", hidden_emb)
                # concat = np.concatenate((embQ.T, embX.T))
                # revop_inner_prod = np.matmul(concat, concat.T)
                # revop_preds = np.argsort(-revop_inner_prod, axis=0)
                # if revop_map > 54:
                #     f = open("best_result.txt", "w")
                #     for i in range(revop_preds.shape[1]):
                #         if i < Q_end:
                #             f.write(qhashes[i] + ",")
                #         else:
                #             f.write(chashes[i - Q_end] + ",")
                #         for j in revop_preds[:, i]:
                #             if j < Q_end:
                #                 f.write(qhashes[j] + " " + str(int(revop_inner_prod[j, i] * 1000)) + " ")
                #             else:
                #                 f.write(chashes[j - Q_end] + " " + str(int(revop_inner_prod[j, i] * 1000)) + " ")
                #         f.write("\n")
                #         f.flush()
                #     f.close()
            if best_val_cost > -99.0:
                # prev_val_loss - val_loss > 0 and prev_val_loss - val_loss > prev_loss - cur_loss and best_val_cost > val_loss
                best_val_cost = -99.0
                best_val_epoch = epoch + 1
                best_val_epoch_revop = revop_map
            # if best_val_roc < roc_curr:
            #     best_val_roc = roc_curr
            #     best_val_roc_revop = revop_map
            # if best_val_ap < ap_curr:
            #     best_val_ap = ap_curr
            #     best_val_ap_revop = revop_map
            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(lossVal / lossValNorm),
                  "val_loss=", "{:.5f}".format(-99.0),
                  # "val_roc_curr=", "{:.5f}".format(roc_curr),
                  # "val_ap_curr=", "{:.5f}".format(ap_curr),
                  "revop=", "{:.5f}".format(revop_map),
                  "best_revop=", "{:.5f}".format(best),
                  "revop_at_best_val=", "{:.5f}".format(best_val_epoch_revop),
                  # "revop_at_best_val_roc=", "{:.5f}".format(best_val_roc_revop),
                  # "revop_at_best_ap_roc=", "{:.5f}".format(best_val_ap_revop),
                  "time=", "{:.5f}".format(time.time() - t))
            prev_val_loss = -99.0
            prev_loss = -99.0
            t = time.time()

    print("Optimization Finished!")
    # roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    # print('Test ROC score: ' + str(roc_score))
    # print('Test AP score: ' + str(ap_score))
    return best, best_val_epoch_revop, best_val_roc_revop, best_val_ap_revop
def GAEembedding(z, adj, args):
    '''
    GAE embedding for clustering
    Param:
        z, adj
    Return:
        Embedding from graph
    '''
    # Default hyperparameters used during development:
    # model='gcn_vae', dw=0, epochs=200, hidden1=32, hidden2=16, lr=0.01,
    # dropout=0., dataset_str='cora', walk_length=5, window_size=3,
    # number_walks=5, full_number_walks=0, lr_dw=0.001, context=0, ns=1,
    # n_clusters=11, plot=0

    # Features from z (Louvain)
    features = z
    # features = torch.DoubleTensor(features)
    features = torch.FloatTensor(features)

    # Old implementation:
    # adj, features, y_test, tx, ty, test_mask, true_labels = load_data(args.dataset_str)

    n_nodes, feat_dim = features.shape

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train

    # (The DeepWalk regularization setup that appears in gae_for above is
    # disabled in this variant.)

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    # adj_label = sparse_to_tuple(adj_label)
    # adj_label = torch.DoubleTensor(adj_label.toarray())
    adj_label = torch.FloatTensor(adj_label.toarray())

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    if args.GAEmodel == 'gcn_vae':
        model = GCNModelVAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    else:
        model = GCNModelAE(feat_dim, args.GAEhidden1, args.GAEhidden2, args.GAEdropout)
    if args.precisionModel == 'Double':
        model = model.double()
    optimizer = optim.Adam(model.parameters(), lr=args.GAElr)

    hidden_emb = None
    for epoch in tqdm(range(args.GAEepochs)):
        t = time.time()
        # mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # print('Mem consumption before training: ' + str(mem))
        model.train()
        optimizer.zero_grad()
        z, mu, logvar = model(features, adj_norm)

        # (The per-walk SkipGram/DeepWalk regularization that appears in
        # gae_for above is disabled in this variant.)

        loss = loss_function(preds=model.dc(z), labels=adj_label,
                             mu=mu, logvar=logvar, n_nodes=n_nodes,
                             norm=norm, pos_weight=pos_weight)
        loss.backward()
        cur_loss = loss.item()
        optimizer.step()

        hidden_emb = mu.data.numpy()
        # TODO: this is prediction
        # roc_curr, ap_curr = get_roc_score(hidden_emb, adj_orig, val_edges, val_edges_false)
        ap_curr = 0

        tqdm.write("Epoch: {}, train_loss_gae={:.5f}, val_ap={:.5f}, time={:.5f}".format(
            epoch + 1, cur_loss, ap_curr, time.time() - t))

    # (Intermediate k-means clustering evaluation is disabled in this variant.)

    tqdm.write("Optimization Finished!")

    roc_score, ap_score = get_roc_score(hidden_emb, adj_orig, test_edges, test_edges_false)
    tqdm.write('Test ROC score: ' + str(roc_score))
    tqdm.write('Test AP score: ' + str(ap_score))

    return hidden_emb
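# A hypothetical invocation of GAEembedding, assuming `z` is a node-by-feature
# numpy array and `adj` a scipy CSR adjacency; the SimpleNamespace mirrors the
# GAE* attributes the function reads. All values here are illustrative.
from types import SimpleNamespace
import numpy as np
import scipy.sparse as sp

z = np.random.rand(100, 64).astype(np.float32)         # 100 nodes, 64-d features
adj = sp.random(100, 100, density=0.05, format='csr')  # random sparse graph
adj = adj + adj.T                                      # symmetrize
adj.data[:] = 1.0                                      # binarize edge weights

args = SimpleNamespace(GAEmodel='gcn_vae', GAEhidden1=32, GAEhidden2=16,
                       GAEdropout=0., GAElr=0.01, GAEepochs=200,
                       precisionModel='Float')
emb = GAEembedding(z, adj, args)  # -> (n_nodes, GAEhidden2) embedding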
def gae(filename, output_dir):
    # Settings
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
    flags.DEFINE_integer('hidden1', 32, 'Number of units in hidden layer 1.')
    flags.DEFINE_integer('hidden2', 16, 'Number of units in hidden layer 2.')
    flags.DEFINE_float('weight_decay', 0., 'Weight for L2 loss on embedding matrix.')
    flags.DEFINE_float('dropout', 0., 'Dropout rate (1 - keep probability).')
    flags.DEFINE_string('filename', 'email-Eu-core.mat', 'dataset')
    flags.DEFINE_string('model', 'gcn_vae', 'Model string.')
    flags.DEFINE_string('dataset', 'cora', 'Dataset string.')
    flags.DEFINE_integer('features', 0, 'Whether to use features (1) or not (0).')

    model_str = FLAGS.model
    # dataset_str = FLAGS.dataset

    # Load data
    # adj, features = load_data(dataset_str)
    adj, R, edges = load_network_data(filename)
    num_edges = np.sum(adj)
    length = adj.shape[0]
    A = np.array(adj, copy=True)
    adj = sp.csr_matrix(adj)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges = mask_test_edges(adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(adj.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if model_str == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    elif model_str == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
        # Compute average loss
        # avg_cost = outs[1]
        # avg_accuracy = outs[2]
        # if (epoch + 1) % 10 == 0:
        #     print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(avg_cost),
        #           "train_acc=", "{:.5f}".format(avg_accuracy), "time=", "{:.5f}".format(time.time() - t))

    print("GAE Optimization Finished!")

    feed_dict.update({placeholders['dropout']: 0})
    emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Predict on test set of edges
    adj_rec = np.dot(emb, emb.T)
    adj_rec = np.array(adj_rec)
    # adj_rec = adj_rec[1:length, :][:, 1:length]
    DD = np.sort(adj_rec.flatten())
    threshold = DD[int(-1 * num_edges)]
    # Binarize the reconstruction at the threshold (note the transposed i/j
    # iteration; adj_rec is symmetric, so the result is unchanged).
    network_C = np.array([[0 if adj_rec[i, j] < threshold else 1
                           for i in range(adj_rec.shape[0])]
                          for j in range(adj_rec.shape[1])], dtype=np.int8)
    # np.save('../data/GAE_network.npy', network_C[1:length, :][:, 1:length])
    os.chdir('../')
    np.save('{}/GAE_network.npy'.format(output_dir), network_C[1:length, :][:, 1:length])

    A_copy = adj_rec
    final_network = [A_copy]
    # orinal_network = [A]
    for i in range(1, 5):
        adjacent_matrix = tf.placeholder(tf.float32, shape=A_copy.shape)
        R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        A_copy = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix), tf.transpose(R_matrix)),
                          feed_dict={R_matrix: R[i - 1, 0].todense(), adjacent_matrix: A_copy})
        final_network.append(np.array(A_copy))
        # adjacent_matrix = tf.placeholder(tf.float32, shape=A.shape)
        # R_matrix = tf.placeholder(tf.float32, shape=R[i - 1, 0].shape)
        # A = sess.run(tf.matmul(tf.matmul(R_matrix, adjacent_matrix), tf.transpose(R_matrix)),
        #              feed_dict={R_matrix: R[i - 1, 0].todense(), adjacent_matrix: A})
        # orinal_network.append(A)

    # draw_graph(final_network, edges, output_dir)
    network_B = final_network[0]
    print('Generating graph by GAE algorithm.')
    DD = np.sort(network_B.flatten())[::-1]
    threshold = DD[edges[0, 0]]
    network_C = np.array([[0 if network_B[i, j] < threshold else 1
                           for i in range(network_B.shape[0])]
                          for j in range(network_B.shape[1])])
    _A_obs = network_C + network_C.T
    _A_obs[_A_obs > 1] = 1
    _A_obs = np.array(_A_obs)
    print('Computing metrics for graph generated by GAE')
    c = compute_graph_statistics(_A_obs)
    with open('{}/gae_network_statistics.pickle'.format(output_dir), 'wb') as handle:
        pickle.dump(c, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(c)
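# The TensorFlow variants above all rely on `sparse_to_tuple` and
# `construct_feed_dict`. Minimal sketches in the style of the original gae
# codebase (assumed here, not reproduced from this file):
import numpy as np
import scipy.sparse as sp

def sparse_to_tuple(sparse_mx):
    # Convert a scipy sparse matrix into the (coords, values, shape) triple
    # expected by tf.sparse_placeholder.
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    return coords, sparse_mx.data, sparse_mx.shape

def construct_feed_dict(adj_normalized, adj, features, placeholders):
    # Map each graph input onto its placeholder.
    return {placeholders['features']: features,
            placeholders['adj']: adj_normalized,
            placeholders['adj_orig']: adj}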
def gcn_multilayer(self):
    """Neural embedding of a multilayer network."""
    all_nodes = self.get_all_nodes()
    tmp_fname = pjoin(self.out_dir, 'tmp.emb')
    for net_name, net in self.nets.items():
        self.log.info('Run GCN For Net: %s' % net_name)
        # =============================================================
        adjacency_matrix = nx.adjacency_matrix(net)
        adjacency_matrix = adjacency_matrix.todense()
        nodes_count = adjacency_matrix.shape[0]
        adj = adjacency_matrix
        features = sp.identity(nodes_count)
        adj = sp.csr_matrix(adj)
        # ----------------myCode-----------------------------------
        # Store original adjacency matrix (without diagonal entries) for later
        adj_orig = adj
        adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
        adj_orig.eliminate_zeros()
        # tst_actual_matrix = adj.toarray()
        adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
        adj = adj_train
        # -----------------------------myCode-------------------------
        # if FLAGS.features == 0:
        #     features = sp.identity(features.shape[0])  # featureless
        # -----------------------------myCode-------------------------

        # Some preprocessing
        adj_norm = preprocess_graph(adj)

        # Define placeholders
        placeholders = {
            'features': tf.sparse_placeholder(tf.float32),
            'adj': tf.sparse_placeholder(tf.float32),
            'adj_orig': tf.sparse_placeholder(tf.float32),
            'dropout': tf.placeholder_with_default(0., shape=())
        }

        num_nodes = adj.shape[0]

        features = sparse_to_tuple(features.tocoo())
        num_features = features[2][1]
        features_nonzero = features[1].shape[0]

        # Create model
        model = None
        if self.model_str == 'gcn_ae':
            model = GCNModelAE(placeholders, num_features, features_nonzero, self.hidden1, self.hidden2)
        elif self.model_str == 'gcn_vae':
            model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero, self.hidden1, self.hidden2)

        pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
        norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

        # Optimizer
        with tf.name_scope('optimizer'):
            if self.model_str == 'gcn_ae':
                opt = OptimizerAE(preds=model.reconstructions,
                                  labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                              validate_indices=False), [-1]),
                                  pos_weight=pos_weight,
                                  norm=norm)
            elif self.model_str == 'gcn_vae':
                opt = OptimizerVAE(preds=model.reconstructions,
                                   labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                               validate_indices=False), [-1]),
                                   model=model, num_nodes=num_nodes,
                                   pos_weight=pos_weight,
                                   norm=norm)

        # Initialize session
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        def get_roc_score(edges_pos, edges_neg, emb=None):
            if emb is None:
                feed_dict.update({placeholders['dropout']: 0})
                emb = sess.run(model.z_mean, feed_dict=feed_dict)

            def sigmoid(x):
                return 1 / (1 + np.exp(-x))

            # Predict on test set of edges
            adj_rec = np.dot(emb, emb.T)
            preds = []
            pos = []
            for e in edges_pos:
                preds.append(sigmoid(adj_rec[e[0], e[1]]))
                pos.append(adj_orig[e[0], e[1]])

            preds_neg = []
            neg = []
            for e in edges_neg:
                preds_neg.append(sigmoid(adj_rec[e[0], e[1]]))
                neg.append(adj_orig[e[0], e[1]])

            preds_all = np.hstack([preds, preds_neg])
            labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
            roc_score = roc_auc_score(labels_all, preds_all)
            ap_score = average_precision_score(labels_all, preds_all)
            return roc_score, ap_score

        cost_val = []
        acc_val = []
        val_roc_score = []

        adj_label = adj_train + sp.eye(adj_train.shape[0])
        adj_label = sparse_to_tuple(adj_label)

        # Train model
        # for epoch in range(FLAGS.epochs):
        # epochs = 10
        dropout = 0
        for epoch in range(self.n_iter):
            self.log.info('Iteration: %d' % epoch)
            t = time.time()
            # Construct feed dictionary
            feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
            # feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            # -----------myCode------------
            feed_dict.update({placeholders['dropout']: dropout})
            # -----------myCode------------
            # Run single weight update
            outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
            # Compute average loss
            avg_cost = outs[1]
            avg_accuracy = outs[2]

            roc_curr, ap_curr = get_roc_score(val_edges, val_edges_false)
            val_roc_score.append(roc_curr)

            print("Epoch:", '%04d' % (epoch + 1),
                  "train_loss=", "{:.5f}".format(avg_cost),
                  "train_acc=", "{:.5f}".format(avg_accuracy),
                  "val_roc=", "{:.5f}".format(val_roc_score[-1]),
                  "val_ap=", "{:.5f}".format(ap_curr),
                  "time=", "{:.5f}".format(time.time() - t))

        print("Optimization Finished!")

        roc_score, ap_score = get_roc_score(test_edges, test_edges_false)
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

        # ------vector generation -----------------------------
        vectors = sess.run(model.embeddings, feed_dict=feed_dict)
        fname = self.out_dir + net_name + 'vectors.txt'
        # with open(fname, 'a+') as fout:
        #     for line in np.array(vectors):
        #         fout.write(line + "\n")
        np.savetxt(fname, np.array(vectors), fmt="%s", delimiter=' ')
        self.log.info('Saving vectors: %s' % fname)
        # ==============================================================
        self.log.info('after exec gcn : %s' % net_name)
    self.log.info('Done!')
def run(self):
    if self.file_expr == '':
        # text-image-code combination
        n_by_n, x_train, y_train, train_mask, val_mask, test_mask, idx_supernodes, label_encoder = \
            graph_generator.load_combo(self.labels_dict)
    else:
        n_by_n, x_train, y_train, train_mask, val_mask, test_mask, idx_supernodes, label_encoder = \
            graph_generator.load_data(self.labels_dict, self.file_expr,
                                      min_valid_triples=self.min_valid_triples,
                                      sep=self.file_sep, select_rels=self.select_rels)
    self.idx_supernodes = idx_supernodes

    adj = nx.adjacency_matrix(nx.from_scipy_sparse_matrix(n_by_n))  # nx.adjacency_matrix(nx.from_numpy_array(n_by_n))
    features = scipy.sparse.csr.csr_matrix(x_train)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    self.adj_orig = adj_orig

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges2(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]

    if not self.use_features:
        features = sp.identity(features.shape[0])  # featureless
    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    if model_str == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, num_features, features_nonzero)
    elif model_str == 'gcn_vae':
        self.model = GCNModelVAE(self.placeholders, num_features, num_nodes, features_nonzero)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if model_str == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(self.placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif model_str == 'gcn_vae':
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(self.placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=self.model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    val_roc_score = []
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # import datetime
    # log_dir = "logs/gae/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Train model
    for epoch in range(self.epochs):  # FLAGS.epochs
        t = time.time()
        # Construct feed dictionary
        self.feed_dict = construct_feed_dict(adj_norm, adj_label, features, self.placeholders)
        self.feed_dict.update({self.placeholders['dropout']: self.dropout_rate})  # FLAGS.dropout
        # Run single weight update
        outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=self.feed_dict)
        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]

        roc_curr, ap_curr = self.get_roc_score(val_edges, val_edges_false)
        val_roc_score.append(roc_curr)
        # tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(avg_cost),
              "train_acc=", "{:.5f}".format(avg_accuracy),
              "val_roc=", "{:.5f}".format(val_roc_score[-1]),
              "val_ap=", "{:.5f}".format(ap_curr),
              "time=", "{:.5f}".format(time.time() - t))

    print("Optimization Finished!")

    roc_score, ap_score = self.get_roc_score(test_edges, test_edges_false)
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    [supernodes, supernodes_embeddings, supernodes_labels] = self.get_embeddings(y_train, label_encoder)
    self.supernodes = [supernodes, supernodes_embeddings, supernodes_labels]
def main(args):
    """Compute embeddings using GAE/VGAE."""
    # Load edgelist
    oneIndx = False
    E = np.loadtxt(args.inputgraph, delimiter=args.delimiter, dtype=int)
    if np.min(E) == 1:
        oneIndx = True
        E -= 1

    # Create an unweighted graph
    G = nx.Graph()
    G.add_edges_from(E[:, :2])

    # Get adj matrix of the graph
    tr_A = nx.adjacency_matrix(G, weight=None)
    num_nodes = tr_A.shape[0]

    # Set main diag to 1s and normalize (algorithm requirement)
    adj_norm = preprocess_graph(tr_A)

    # Define placeholders
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'adj': tf.sparse_placeholder(tf.float32),
        'adj_orig': tf.sparse_placeholder(tf.float32),
        'dropout': tf.placeholder_with_default(0., shape=())
    }

    # Create empty feature matrix
    features = sp.identity(num_nodes)  # featureless
    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    # Create model
    model = None
    if args.model == 'gcn_ae':
        model = GCNModelAE(placeholders, num_features, features_nonzero)
    elif args.model == 'gcn_vae':
        model = GCNModelVAE(placeholders, num_features, num_nodes, features_nonzero)

    pos_weight = float(tr_A.shape[0] * tr_A.shape[0] - tr_A.sum()) / tr_A.sum()
    norm = tr_A.shape[0] * tr_A.shape[0] / float((tr_A.shape[0] * tr_A.shape[0] - tr_A.sum()) * 2)

    # Optimizer
    with tf.name_scope('optimizer'):
        if args.model == 'gcn_ae':
            opt = OptimizerAE(preds=model.reconstructions,
                              labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                          validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif args.model == 'gcn_vae':
            opt = OptimizerVAE(preds=model.reconstructions,
                               labels=tf.reshape(tf.sparse_tensor_to_dense(placeholders['adj_orig'],
                                                                           validate_indices=False), [-1]),
                               model=model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    adj_label = tr_A + sp.eye(tr_A.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):
        # Construct feed dictionary
        feed_dict = construct_feed_dict(adj_norm, adj_label, features, placeholders)
        feed_dict.update({placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=feed_dict)
        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]))

    # Compute predictions
    feed_dict.update({placeholders['dropout']: 0})
    emb = sess.run(model.z_mean, feed_dict=feed_dict)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    # Node similarities
    adj_rec = np.dot(emb, emb.T)

    start = time.time()
    # Read the train edges and compute similarity
    if args.tr_e is not None:
        train_edges = np.loadtxt(args.tr_e, delimiter=args.delimiter, dtype=int)
        if oneIndx:
            train_edges -= 1
        scores = list()
        for src, dst in train_edges:
            scores.append(sigmoid(adj_rec[src, dst]))
        np.savetxt(args.tr_pred, scores, delimiter=args.delimiter)

    # Read the test edges and run predictions
    if args.te_e is not None:
        test_edges = np.loadtxt(args.te_e, delimiter=args.delimiter, dtype=int)
        if oneIndx:
            test_edges -= 1
        scores = list()
        for src, dst in test_edges:
            scores.append(sigmoid(adj_rec[src, dst]))
        np.savetxt(args.te_pred, scores, delimiter=args.delimiter)

    # If no edge lists provided to predict links, then just store the embeddings
    else:
        np.savetxt(args.output, emb, delimiter=args.delimiter)

    print('Prediction time: {}'.format(time.time() - start))
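# main() reads args.inputgraph, args.delimiter, args.model, args.tr_e/args.tr_pred,
# args.te_e/args.te_pred, and args.output. A hypothetical CLI wiring consistent
# with those attributes (flag names and defaults are assumptions inferred from
# the function body, not taken from the original script):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compute embeddings with GAE/VGAE.')
    parser.add_argument('--inputgraph', required=True, help='input edgelist file')
    parser.add_argument('--delimiter', default=',', help='field delimiter')
    parser.add_argument('--model', default='gcn_ae', choices=['gcn_ae', 'gcn_vae'])
    parser.add_argument('--tr_e', default=None, help='train edges to score')
    parser.add_argument('--tr_pred', default='tr_pred.csv', help='output for train-edge scores')
    parser.add_argument('--te_e', default=None, help='test edges to score')
    parser.add_argument('--te_pred', default='te_pred.csv', help='output for test-edge scores')
    parser.add_argument('--output', default='emb.csv', help='output embeddings file')
    main(parser.parse_args())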
def fit(self, adj, features, labels):
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train = gen_train_edges(adj)
    adj = adj_train

    # Some preprocessing
    adj_norm = preprocess_graph(adj)
    num_nodes = adj.shape[0]
    input_feature_dim = features.shape[1]
    features = normalize_vectors(features)

    # Define placeholders
    self.placeholders = {
        'features': tf.compat.v1.placeholder(tf.float32, shape=(None, input_feature_dim)),
        # 'features': tf.compat.v1.sparse_placeholder(tf.float32),
        'adj': tf.compat.v1.sparse_placeholder(tf.float32),
        'adj_orig': tf.compat.v1.sparse_placeholder(tf.float32),
        'dropout': tf.compat.v1.placeholder_with_default(0., shape=())
    }

    if self.model_type == 'gcn_ae':
        self.model = GCNModelAE(self.placeholders, input_feature_dim)
    elif self.model_type == 'gcn_vae':
        self.model = GCNModelVAE(self.placeholders, input_feature_dim, num_nodes)

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()  # negative edges / positive edges
    # print('positive edge weight', pos_weight)
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.nnz) * 2)

    # Optimizer
    with tf.compat.v1.name_scope('optimizer'):
        if self.model_type == 'gcn_ae':
            opt = OptimizerAE(preds=self.model.reconstructions,
                              labels=tf.reshape(tf.sparse.to_dense(self.placeholders['adj_orig'],
                                                                   validate_indices=False), [-1]),
                              pos_weight=pos_weight,
                              norm=norm)
        elif self.model_type == 'gcn_vae':
            opt = OptimizerVAE(preds=self.model.reconstructions,
                               labels=tf.reshape(tf.sparse.to_dense(self.placeholders['adj_orig'],
                                                                    validate_indices=False), [-1]),
                               model=self.model, num_nodes=num_nodes,
                               pos_weight=pos_weight,
                               norm=norm)

    # Initialize session
    self.sess = tf.compat.v1.Session()
    self.sess.run(tf.compat.v1.global_variables_initializer())

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Train model
    for epoch in range(FLAGS.epochs):
        t = time.time()
        # Construct feed dictionary
        self.feed_dict = construct_feed_dict(adj_norm, adj_label, features, self.placeholders)
        self.feed_dict.update({self.placeholders['dropout']: FLAGS.dropout})
        # Run single weight update
        outs = self.sess.run([opt.opt_op, opt.cost, opt.accuracy], feed_dict=self.feed_dict)
        # Compute average loss
        avg_cost = outs[1]
        avg_accuracy = outs[2]
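# OptimizerAE/OptimizerVAE are constructed throughout the TF snippets but never
# defined in this file. A TF1-style sketch of OptimizerVAE following the public
# gae implementation; the learning_rate keyword accepted by some call sites
# above is surfaced as a parameter. Treat the details as assumptions.
import tensorflow as tf

class OptimizerVAE(object):
    def __init__(self, preds, labels, model, num_nodes, pos_weight, norm,
                 learning_rate=0.01):
        # Norm-weighted reconstruction loss over all node pairs.
        self.cost = norm * tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(
                logits=preds, targets=labels, pos_weight=pos_weight))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # KL divergence to the standard normal prior (z_log_std = log sigma).
        self.kl = (0.5 / num_nodes) * tf.reduce_mean(tf.reduce_sum(
            1 + 2 * model.z_log_std - tf.square(model.z_mean)
            - tf.square(tf.exp(model.z_log_std)), 1))
        self.cost -= self.kl
        self.opt_op = self.optimizer.minimize(self.cost)
        # Fraction of correctly reconstructed entries at a 0.5 threshold.
        correct = tf.equal(
            tf.cast(tf.greater_equal(tf.sigmoid(preds), 0.5), tf.int32),
            tf.cast(labels, tf.int32))
        self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))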