def main(args):
    """ Train GAE """
    print("Using {} dataset".format(args.dataset_str))

    # Load data
    np.random.seed(1)
    adj, features = load_data(args.dataset_str)
    N, D = features.shape

    # Store original adjacency matrix (without diagonal entries)
    adj_orig = adj

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)

    # Some preprocessing
    adj_train_norm = preprocess_graph(adj_train)
    adj_train_norm = Variable(make_sparse(adj_train_norm))
    adj_train_labels = Variable(
        torch.FloatTensor(adj_train + sp.eye(adj_train.shape[0]).todense()))
    features = Variable(make_sparse(features))

    n_edges = adj_train_labels.sum()

    data = {
        'adj_norm': adj_train_norm,
        'adj_labels': adj_train_labels,
        'features': features,
    }

    gae = GAE(data,
              n_hidden=32,
              n_latent=16,
              dropout=args.dropout,
              subsampling=args.subsampling)

    optimizer = Adam({"lr": args.lr, "betas": (0.95, 0.999)})

    svi = SVI(gae.model, gae.guide, optimizer, loss="ELBO")

    # Results
    results = defaultdict(list)

    # Full-batch training loop
    for epoch in range(args.num_epochs):
        # Take an ELBO gradient step and accumulate the loss
        epoch_loss = svi.step()

        # Report training diagnostics
        if args.subsampling:
            normalized_loss = epoch_loss / float(2 * n_edges)
        else:
            normalized_loss = epoch_loss / (2 * N * N)

        results['train_elbo'].append(normalized_loss)

        # Training loss
        emb = gae.get_embeddings()
        accuracy, roc_curr, ap_curr = eval_gae(val_edges, val_edges_false,
                                               emb, adj_orig)
        results['accuracy_train'].append(accuracy)
        results['roc_train'].append(roc_curr)
        results['ap_train'].append(ap_curr)

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(normalized_loss),
              "train_acc=", "{:.5f}".format(accuracy),
              "val_roc=", "{:.5f}".format(roc_curr),
              "val_ap=", "{:.5f}".format(ap_curr))

        # Test loss
        if epoch % args.test_freq == 0:
            emb = gae.get_embeddings()
            accuracy, roc_score, ap_score = eval_gae(test_edges,
                                                     test_edges_false,
                                                     emb, adj_orig)
            results['accuracy_test'].append(accuracy)
            results['roc_test'].append(roc_score)
            results['ap_test'].append(ap_score)

    print("Optimization Finished!")

    # Test loss
    emb = gae.get_embeddings()
    accuracy, roc_score, ap_score = eval_gae(test_edges, test_edges_false,
                                             emb, adj_orig)
    print('Test Accuracy: ' + str(accuracy))
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    # Plot
    plot_results(results, args.test_freq,
                 path=args.dataset_str + "_results.png")
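
# `preprocess_graph` is called throughout these scripts but never defined here.
# A minimal sketch, assuming the symmetric normalization
# D^{-1/2} (A + I) D^{-1/2} from Kipf & Welling's GAE reference code (some
# versions return sparse_to_tuple(...) of the result instead of the matrix):
import numpy as np
import scipy.sparse as sp

def preprocess_graph(adj):
    """Symmetrically normalize an adjacency matrix with self-loops added."""
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])   # add self-loops
    rowsum = np.array(adj_.sum(1))      # degree of each node (>= 1 here)
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(
        degree_mat_inv_sqrt).tocoo()
    return adj_normalized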
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
else:
    # Load data
    adj, features = load_data(dataset_str)
    print("Loaded dataset")

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj)
adj = adj_train

if FLAGS.features == 0:
    features = sp.identity(features.shape[0])  # featureless

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}
if dataset_str == 'synthetic':
    adj, features = get_synthetic_data(p=p, attrNoise=attrNoise, m=m)
else:
    adj, features = load_data(dataset_str)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj  # sparse matrix
# adj_orig.diagonal()[np.newaxis, :] is a row vector of the diagonal
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]),
    shape=adj_orig.shape)  # set the diagonal elements to 0
# A sparse matrix should not contain entries equal to 0,
# so always call eliminate_zeros() after such an update.
adj_orig.eliminate_zeros()

adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj, test_percent=10., val_percent=5.)
# This is the adjacency matrix with all validation and test entries masked out.
adj = adj_train

if FLAGS.features == 0:
    features = sp.identity(
        features.shape[0])  # featureless; a sparse coo_matrix

# Some preprocessing
attn_adj_norm = adj + sp.eye(adj.shape[0])
attn_adj_norm = sparse_to_tuple(attn_adj_norm)  # a tuple
adj_norm = preprocess_graph(adj)
def format_data(data_name):
    # Load data
    adj, features, y_test, tx, ty, test_masks, true_labels = load_data(
        data_name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    items = [
        adj, num_features, num_nodes, features_nonzero, pos_weight, norm,
        adj_norm, adj_label, features, true_labels, train_edges, val_edges,
        val_edges_false, test_edges, test_edges_false, adj_orig
    ]

    feas = {}
    for item in items:
        # Map each object back to the name of the variable holding it
        feas[retrieve_name(item)] = item

    return feas
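
# `retrieve_name` is not defined in these snippets. A minimal sketch, assuming
# the common inspect-based trick used in ARGA-style codebases; note it is
# fragile when several caller variables reference the same object:
import inspect

def retrieve_name(var):
    """Best-effort lookup of the caller's variable name bound to `var`."""
    callers_local_vars = inspect.currentframe().f_back.f_locals.items()
    return [name for name, val in callers_local_vars if val is var][0]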
def main(args):
    dataset = args.dataset
    emb_output_dir = args.output
    epochs = args.epochs
    agg = args.agg
    p = args.p
    tr = args.tr
    lam = args.lam
    loss_func = args.loss

    # Preprocess dataset
    adj, views_features = load_data(dataset, num_views=3)
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    # Calculate pairwise similarity.
    views_sim_matrix = {}
    views_feature_matrix = {}
    for view in list(views_features.keys()):
        feature_matrix = csc_matrix.todense(views_features[view])
        views_feature_matrix.update({view: feature_matrix})

    kernel = "rbf"
    if loss_func == 'all':
        attr_sim = cal_attr_sim(views_feature_matrix, dataset)
    else:
        attr_sim = 0

    # Split nodes into train, validation and test sets, and
    # remove test edges from the training adjacency matrix.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(dataset, adj)
    print("Masking edges Done!")

    adj = adj_train
    nx_G = nx.from_numpy_array(adj.toarray())

    num_nodes = adj.shape[0]
    adj_norm = preprocess_graph(adj)

    views_features_num = {}
    views_features_nonzero = {}
    for view in list(views_features.keys()):
        views_features[view] = sparse_to_tuple(views_features[view].tocoo())
        views_features_num.update({view: views_features[view][2][1]})
        views_features_nonzero.update({view: views_features[view][1].shape[0]})

    # Build model
    MagCAE = {}
    for view in list(views_features.keys()):
        x, y = views_features[view][2][0], views_features[view][2][1]
        model = GAE(y, views_features_nonzero[view], adj_norm,
                    math.ceil(2 * p * y), math.ceil(p * y))
        MagCAE.update({view: model})

    # Loss function and optimizer.
    # pos_weight: loss weight taken by each positive entry in the total loss.
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
    optimizer = tf.keras.optimizers.Adam()

    adj_targ = adj_train + sp.eye(adj_train.shape[0])
    adj_targ = sparse_to_tuple(adj_targ)

    indices = np.array(adj_targ[0])
    values = np.array(adj_targ[1])
    dense_shape = np.array(adj_targ[2])
    sparse_targ = tf.SparseTensor(indices=indices,
                                  values=values,
                                  dense_shape=dense_shape)
    sparse_targ = tf.cast(sparse_targ, dtype=tf.float32)

    adj_targ = tf.sparse.to_dense(sparse_targ)
    adj_targ = tf.reshape(adj_targ, [-1])

    # Train and evaluate the model. In each epoch:
    # views -> view embeddings -> aggregated embedding -> total loss -> update gradients
    decoder = Decoder(100)

    for epoch in range(epochs):
        loss = 0
        start = time.time()

        with tf.GradientTape() as tape:
            ag_embedding = {}
            for VAE in list(MagCAE.keys()):
                v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
                ag_embedding.update({VAE: v_embedding})

            # Aggregate embeddings
            embedding, aggregator = aggregate_embeddings(ag_embedding, agg)

            # Reconstruct a_hat
            a_hat = decoder(embedding)
            loss += loss_function(a_hat, adj_targ, pos_weight, norm,
                                  attr_sim, embedding, num_nodes, lam,
                                  loss_func)

        if agg == "weighted_concat":
            variables = (MagCAE['view1'].trainable_variables +
                         MagCAE['view2'].trainable_variables +
                         MagCAE['view3'].trainable_variables +
                         aggregator.trainable_variables)

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Evaluate on the validation set
        embedding = np.array(embedding)
        roc_cur, ap_cur, _, _ = evaluate(val_edges, val_edges_false,
                                         adj_orig, embedding)
        print("Epoch {}: Val_Roc {:.4f}, Val_AP {:.4f}, Time Consumed {:.2f} sec\n".format(
            epoch + 1, roc_cur, ap_cur, time.time() - start))

    print("Training Finished!")

    # Evaluation result on test edges
    test_embedding = {}
    for VAE in list(MagCAE.keys()):
        v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
        test_embedding.update({VAE: v_embedding})

    # Aggregate embeddings
    embedding, aggregator = aggregate_embeddings(test_embedding, agg)
    embedding = np.array(embedding)  # embedding is a tensor; convert to an np array

    test_roc, test_ap, fpr, tpr = evaluate(test_edges, test_edges_false,
                                           adj_orig, embedding)
    print("MagCAE test result on {}".format(dataset))
    print("Test Roc: {}, Test AP: {}, P: {}, Training Ratio: {}, Lambda: {}.".format(
        test_roc, test_ap, p, tr, lam))
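
# `evaluate` is not shown in these snippets. A minimal sketch, assuming the
# standard GAE link-prediction protocol (inner-product decoder + sigmoid,
# scored with scikit-learn); the signature mirrors the calls above, and
# `adj_orig` is accepted only for compatibility and unused in this sketch:
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve

def evaluate(pos_edges, neg_edges, adj_orig, embedding):
    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    # Score each candidate edge by the inner product of its endpoint embeddings.
    preds_pos = [sigmoid(np.dot(embedding[u], embedding[v])) for u, v in pos_edges]
    preds_neg = [sigmoid(np.dot(embedding[u], embedding[v])) for u, v in neg_edges]

    scores = np.concatenate([preds_pos, preds_neg])
    labels = np.concatenate([np.ones(len(preds_pos)), np.zeros(len(preds_neg))])

    roc = roc_auc_score(labels, scores)
    ap = average_precision_score(labels, scores)
    fpr, tpr, _ = roc_curve(labels, scores)
    return roc, ap, fpr, tpr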
x = sp.lil_matrix(features)
features_tuple = sparse_to_tuple(x)
features_shape = features_tuple[2]

# Get graph attributes (to feed into model)
num_nodes = adj.shape[0]  # number of nodes in the adjacency matrix
num_features = features_shape[1]  # number of features (columns of the feature matrix)
features_nonzero = features_tuple[1].shape[0]  # number of non-zero entries in the feature matrix (length of the values list)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

np.random.seed(0)  # IMPORTANT: guarantees consistent train/test splits
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj, test_frac=.3, val_frac=.1)

# Normalize adjacency matrix
adj_norm = preprocess_graph(adj_train)

# Add in diagonals
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)

# Inspect train/test split
print("Total nodes:", adj.shape[0])
print("Total edges:", int(adj.nnz / 2))  # adj is symmetric, so nnz (num non-zero) = 2 * num_edges
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
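
# `sparse_to_tuple` is used throughout but not defined here. A minimal sketch,
# assuming the (coords, values, shape) convention implied by the indexing above
# (features_tuple[2][1] = num columns, features_tuple[1].shape[0] = num non-zeros):
import numpy as np
import scipy.sparse as sp

def sparse_to_tuple(sparse_mx):
    """Convert a scipy sparse matrix to a (coords, values, shape) tuple."""
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape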
def main(args):
    """ Train GAE """
    # Choose the device on which to run
    device = torch.device("cuda" if args.use_cuda else "cpu")

    print("Using {} dataset".format(args.dataset_str))

    # Load data
    np.random.seed(1)
    adj, features = load_data(args.dataset_str)
    N, D = features.shape

    # Store original adjacency matrix (without diagonal entries)
    adj_orig = adj

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)

    # Some preprocessing
    adj_train_norm = preprocess_graph(adj_train)
    adj_train_norm = make_sparse(adj_train_norm)
    adj_train_labels = torch.FloatTensor(
        adj_train + sp.eye(adj_train.shape[0]).todense())
    features = make_sparse(features)

    n_edges = adj_train_labels.sum()

    data = {
        'adj_norm': adj_train_norm,
        'adj_labels': adj_train_labels,
        'features': features,
    }

    gae = GAE(data, n_hidden=32, n_latent=16, dropout=args.dropout)

    # Send the model and data to the available device
    gae.to(device)
    data['adj_norm'] = data['adj_norm'].to(device)
    data['adj_labels'] = data['adj_labels'].to(device)
    data['features'] = data['features'].to(device)

    optimizer = optim.Adam(gae.parameters(),
                           lr=args.lr,
                           betas=(0.95, 0.999),
                           weight_decay=args.weight_decay)

    # Results
    results = defaultdict(list)

    # Full-batch training loop
    for epoch in range(args.num_epochs):
        t = time.time()
        gae.train()
        optimizer.zero_grad()

        # Forward pass
        output = gae(data['features'], data['adj_norm'])

        # Compute the weighted, normalized reconstruction loss
        logits = output
        targets = data['adj_labels']
        loss = gae.norm * F.binary_cross_entropy_with_logits(
            logits, targets, pos_weight=gae.pos_weight)
        loss.backward()
        optimizer.step()

        results['train_elbo'].append(loss.item())

        gae.eval()
        emb = gae.get_embeddings(data['features'], data['adj_norm'])
        accuracy, roc_curr, ap_curr = eval_gae(val_edges, val_edges_false,
                                               emb, adj_orig)
        results['accuracy_train'].append(accuracy)
        results['roc_train'].append(roc_curr)
        results['ap_train'].append(ap_curr)

        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(loss.item()),
              "train_acc=", "{:.5f}".format(accuracy),
              "val_roc=", "{:.5f}".format(roc_curr),
              "val_ap=", "{:.5f}".format(ap_curr))

        # Test loss
        if epoch % args.test_freq == 0:
            with torch.no_grad():
                gae.eval()
                emb = gae.get_embeddings(data['features'], data['adj_norm'])
                accuracy, roc_score, ap_score = eval_gae(
                    test_edges, test_edges_false, emb, adj_orig)
                results['accuracy_test'].append(accuracy)
                results['roc_test'].append(roc_score)
                results['ap_test'].append(ap_score)
            gae.train()

    print("Optimization Finished!")

    with torch.no_grad():
        # Test loss
        gae.eval()
        emb = gae.get_embeddings(data['features'], data['adj_norm'])
        accuracy, roc_score, ap_score = eval_gae(test_edges, test_edges_false,
                                                 emb, adj_orig)
        print('Test Accuracy: ' + str(accuracy))
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

    # Plot
    plot_results(results, args.test_freq,
                 path=args.dataset_str + "_GAE_results.png")
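
# `make_sparse` is not defined in these snippets. A minimal sketch, assuming it
# converts a scipy sparse matrix into a torch sparse COO tensor (the name and
# exact behavior are our assumption):
import numpy as np
import scipy.sparse as sp
import torch

def make_sparse(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse_coo_tensor(indices, values, shape)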
def format_data(data_name):
    # Load data
    adj, features, true_labels = load_data(data_name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    # Label matrix: training adjacency plus self-loops
    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    feas = {}
    feas['adj'] = adj
    feas['num_features'] = num_features
    feas['num_nodes'] = num_nodes
    feas['features_nonzero'] = features_nonzero
    feas['pos_weight'] = pos_weight
    feas['norm'] = norm
    feas['adj_norm'] = adj_norm
    feas['adj_label'] = adj_label
    feas['features'] = features
    feas['true_labels'] = true_labels
    feas['train_edges'] = train_edges
    feas['val_edges'] = val_edges
    feas['val_edges_false'] = val_edges_false
    feas['test_edges'] = test_edges
    feas['test_edges_false'] = test_edges_false
    feas['adj_orig'] = adj_orig

    return feas
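
# The pos_weight / norm terms above correct for the extreme sparsity of real
# adjacency matrices: almost all of the N*N entries are zeros (negatives).
# pos_weight up-weights each positive entry so positives and negatives
# contribute comparably to the cross-entropy, and norm rescales the mean loss.
# A tiny worked example with toy numbers of our own choosing (100 nodes,
# 50 undirected edges stored symmetrically, so adj.sum() == 100):
n_entries = 100 * 100  # 10,000 possible entries
adj_sum = 100          # 2 * 50 symmetric edges
pos_weight = float(n_entries - adj_sum) / adj_sum    # 9900 / 100 = 99.0
norm = n_entries / float((n_entries - adj_sum) * 2)  # 10000 / 19800 ~= 0.505
print(pos_weight, norm)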
def format_data_ui(data_name, has_features=1):
    # Load data
    fpath_dir = '../data/useritem/%s/' % data_name
    fpath_input = '%sinput.pkl' % fpath_dir
    with open(fpath_input, 'rb') as f:
        # Note: `item_features` here is not the features object returned below.
        (n_users, n_items, item_features, train, valid, test) = pkl.load(f)

    ui_graph = defaultdict(list)
    ii_graph = defaultdict(set)
    ii_graph_list = defaultdict(list)

    for edge, value in train.items():
        u, i = edge
        ui_graph[u].append(i)

    # Count how often each item pair is co-consumed by the same user.
    edge_dict = defaultdict(int)
    tmp_u_number = len(ui_graph)
    for index, (u, ilist) in enumerate(ui_graph.items()):
        if index % 500 == 0:
            print('user number: %d/%d' % (index, tmp_u_number))
        for i in ilist:
            for j in ilist:
                if i != j:
                    edge_dict[(i, j)] += 1
                    if len(edge_dict) % 5000 == 0:
                        print('len(edge_dict):%d' % len(edge_dict))
    print('len(edge_dict):%d' % len(edge_dict))

    # Keep only item-item pairs co-visited at least `edge_visit_thresh` times.
    edge_visit_thresh = 2
    for edge, visit_num in edge_dict.items():
        i1, i2 = edge
        if visit_num >= edge_visit_thresh:
            ii_graph_list[i1].append(i2)

    print('%s:get ii mat' % (datetime.datetime.now().isoformat()))
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(ii_graph_list))
    print('adj shape:', adj.get_shape())

    # features: lil_matrix
    features = item_features.tolil()
    # true_labels (the neighborhood ground truth) is not used here or in ARGA.
    true_labels = None

    # -- transform over; from here on the original procedure follows --

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train
    print('%s:mask test edges over' % (datetime.datetime.now().isoformat()))

    if has_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    items = [adj, num_features, num_nodes, features_nonzero, pos_weight, norm,
             adj_norm, adj_label, features, true_labels, train_edges,
             val_edges, val_edges_false, test_edges, test_edges_false,
             adj_orig]

    feas = {}
    for item in items:
        feas[retrieve_name(item)] = item

    return feas
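
# A quick illustration of the co-occurrence construction above, on toy data of
# our own choosing: users 0 and 1 both consume items 1 and 2, user 2 consumes
# items 1 and 3; only pairs seen at least `edge_visit_thresh` times survive.
from collections import defaultdict

toy_train = {(0, 1): 1, (0, 2): 1, (1, 1): 1, (1, 2): 1, (2, 1): 1, (2, 3): 1}
ui = defaultdict(list)
for (u, i), _ in toy_train.items():
    ui[u].append(i)

counts = defaultdict(int)
for u, ilist in ui.items():
    for i in ilist:
        for j in ilist:
            if i != j:
                counts[(i, j)] += 1

ii = {pair: c for pair, c in counts.items() if c >= 2}
print(ii)  # {(1, 2): 2, (2, 1): 2} -> items 1 and 2 are co-consumed twice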
def format_data_ui_concat(data_name, has_features=1):
    '''Concatenate u and i indices to get a single u-u matrix and its index map.'''
    # Load data
    fpath_dir = '../data/useritem/%s/' % data_name
    fpath_input = '%sinput.pkl' % fpath_dir
    with open(fpath_input, 'rb') as f:
        # Note: `item_features` here is not the features object returned below.
        (n_users, n_items, item_features, train, valid, test) = pkl.load(f)

    ui_graph = defaultdict(list)
    ii_graph = defaultdict(set)
    ii_graph_list = defaultdict(list)

    user_set = set()
    item_set = set()
    tag_set = set()
    for edge, value in train.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)
    for edge, value in valid.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)
    for edge, value in test.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)

    # Check whether user ids span [0, n_users - 1]
    print(n_users)
    print('user:len, min, max:', len(user_set), min(user_set), max(user_set))
    print(n_items)
    print('item:len, min, max:', len(item_set), min(item_set), max(item_set))

    # Shift item ids above the user id range so both live in one index space.
    max_user_index_plus1 = max(user_set) + 1
    user_plus_item_num = (max_user_index_plus1 + max(item_set)) + 1

    new_ui_edge_dict = defaultdict(list)
    for edge, value in train.items():
        u, i = edge
        new_ui_edge_dict[u].append(i + max_user_index_plus1)
        new_ui_edge_dict[i + max_user_index_plus1].append(u)

    print('%s:get ii mat' % (datetime.datetime.now().isoformat()))
    G_ui = nx.from_dict_of_lists(new_ui_edge_dict)
    G_ui_nodes_list = list(G_ui.nodes())
    adj = nx.adjacency_matrix(G_ui)

    # Map each (shifted) node id to its row index in adj; covers both u and i.
    ui_to_ui_index_dict = dict()
    for i in range(len(G_ui_nodes_list)):
        ui_to_ui_index_dict[G_ui_nodes_list[i]] = i
    print('adj shape:', adj.get_shape())

    tag_set = set()
    for (item, tag), value in item_features.items():
        tag_set.add(tag)
    max_tag_num = max(tag_set) + 1

    item_features_mapped = sp.dok_matrix((user_plus_item_num, max_tag_num),
                                         dtype=np.int64)
    unused_item_cnt = 0  # items with no user-item interaction
    for (item, tag), value in item_features.items():
        item_mapped = item + max_user_index_plus1
        if item_mapped not in ui_to_ui_index_dict:
            unused_item_cnt += 1
        else:
            item_features_mapped[ui_to_ui_index_dict[item_mapped], tag] = value
    print('unused_item_cnt: %d' % unused_item_cnt)

    # Map features to their new positions.
    features = item_features_mapped.tolil()
    # true_labels (the neighborhood ground truth) is not used here or in ARGA.
    true_labels = None

    # -- transform over; from here on the original procedure follows --

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    # Remove the adjacency matrix's diagonal (offset 0).
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train
    print('%s:mask test edges over' % (datetime.datetime.now().isoformat()))

    if has_features == 0:
        features = sp.identity(features.shape[0])  # featureless: identity features, 1s on the diagonal only

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)

    # Same as format_data_ui, plus the u/i index map and the item id offset.
    items = [adj, num_features, num_nodes, features_nonzero, pos_weight, norm,
             adj_norm, adj_label, features, true_labels, train_edges,
             val_edges, val_edges_false, test_edges, test_edges_false,
             adj_orig, ui_to_ui_index_dict, max_user_index_plus1]

    feas = {}
    for item in items:
        feas[retrieve_name(item)] = item

    return feas
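
# A quick illustration of the index concatenation used above (toy ids of our
# own choosing): with users {0, 1, 2} and items {0, 1}, max_user_index_plus1
# is 3, so item k becomes node 3 + k and users and items share one id space.
user_set, item_set = {0, 1, 2}, {0, 1}
max_user_index_plus1 = max(user_set) + 1  # 3
mapped_items = {i: i + max_user_index_plus1 for i in item_set}
print(mapped_items)  # {0: 3, 1: 4}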
def val_test_data_gen(self):
    """Loop over the graph time sequence and compute a random set as a test set"""
    self.val_edges_list = []
    self.val_edges_false_list = []
    self.test_edges_list = []
    self.test_edges_false_list = []

    # New edges
    self.all_pos_edge_set = []
    self.new_edges_list = []
    self.new_edges_false_list = []

    # Loop over the sequence length: if seq_len is 30, i runs 0..29,
    # i.e. over every graph in the time series.
    for i in range(self.args.seq_len):
        val_test_graph, _ = load_adj_graph(f'{self.data_loc}_t{i}.npz')
        val_test_graph_adj, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
            val_test_graph, test_percent=30., val_percent=20.)

        self.val_edges_list.append(val_edges)
        self.val_edges_false_list.append(val_edges_false)
        self.test_edges_list.append(test_edges)
        self.test_edges_false_list.append(test_edges_false)

        pos_edges = np.concatenate((val_edges, test_edges, train_edges)).tolist()
        self.all_pos_edge_set.append(set(map(tuple, pos_edges)))

    # Loop over the sequence again to get the new edges at each time point.
    for i in range(self.args.seq_len):
        if i == 0:
            # On the first step there is no previous graph to diff against.
            self.new_edges_list.append(None)
            self.new_edges_false_list.append(None)
            continue

        # New edges since the last time step
        new_edges = np.array(list(self.all_pos_edge_set[i] - self.all_pos_edge_set[i - 1]))

        if len(new_edges) == 0:
            # The edge list is empty
            self.new_edges_list.append(None)
            self.new_edges_false_list.append(None)
        else:
            num_edges = len(new_edges)
            self.new_edges_list.append(new_edges)
            self.new_edges_false_list.append(self.test_edges_false_list[i][:num_edges])

    print("Validation and Test edges captured from last graph in the sequence")

    # Set the number of vertices in the graph
    self.num_nodes = val_test_graph.shape[0]
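
# A quick illustration of the new-edge computation above (toy edges of our own
# choosing): edges are stored as tuples in per-timestep sets, so a plain set
# difference yields the edges that appeared between consecutive snapshots.
import numpy as np

edges_t0 = {(0, 1), (1, 2)}
edges_t1 = {(0, 1), (1, 2), (2, 3)}
new_edges = np.array(list(edges_t1 - edges_t0))
print(new_edges)  # [[2 3]] -> the single edge added at t1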
def link_pred_emb(p=8,
                  q=0.5,
                  win_size=10,
                  num_walks=10,
                  walk_length=20,
                  dimension=55,
                  iter=1,
                  rocfile="Plots/roctest.png",
                  result_file_path="results/parameters.txt") -> None:
    """The main function. Link prediction is done here."""
    # Load pickled (adj, feat) tuple
    with open(os.path.join(NETWORK_DIR, PICKLE_FILE), "rb") as file:
        adj, features = pickle.load(file)
    with open(os.path.join(NETWORK_DIR, ID_MAP_FILE), "rb") as file:
        id_map = pickle.load(file)

    g = nx.Graph(adj)  # Recreate graph using node indices (0 to num_nodes-1)

    # Draw the network
    # nx.draw_networkx(g, with_labels=False, node_size=50, node_color="r")
    # plt.show()

    # Preprocessing (train/test split)
    np.random.seed(0)  # make sure the train/test split is consistent
    adj_sparse = nx.to_scipy_sparse_matrix(g)

    # Perform train/test split
    (
        adj_train,
        train_edges,
        train_edges_false,
        val_edges,
        val_edges_false,
        test_edges,
        test_edges_false,
    ) = mask_test_edges(adj_sparse, test_frac=0.3, val_frac=0.1)

    # New graph object with only the non-hidden edges
    g_train = nx.from_scipy_sparse_matrix(adj_train)

    # Inspect train/test split
    print("Total nodes:", adj_sparse.shape[0])
    # adj is symmetric, so nnz (num non-zero) = 2 * num_edges
    print("Total edges:", int(adj_sparse.nnz / 2))
    print("Training edges (positive):", len(train_edges))
    print("Training edges (negative):", len(train_edges_false))
    print("Validation edges (positive):", len(val_edges))
    print("Validation edges (negative):", len(val_edges_false))
    print("Test edges (positive):", len(test_edges))
    print("Test edges (negative):", len(test_edges_false))

    # Train node2vec (learn node embeddings)
    # NOTE: when p = q = 1, node2vec is equivalent to DeepWalk.
    P = p                      # Return hyperparameter
    Q = q                      # In-out hyperparameter
    WINDOW_SIZE = win_size     # Context size for optimization
    NUM_WALKS = num_walks      # Number of walks per source
    WALK_LENGTH = walk_length  # Length of walk per source
    DIMENSIONS = dimension     # Embedding dimension
    DIRECTED = False           # Graph directed/undirected
    WORKERS = 8                # Num. parallel workers
    ITER = iter                # SGD epochs

    # Preprocessing: generate walks from a node2vec graph instance
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH)
    walks = [list(map(str, walk)) for walk in walks]

    # Train skip-gram model
    model = Word2Vec(
        walks,
        size=DIMENSIONS,
        window=WINDOW_SIZE,
        min_count=0,
        sg=1,
        workers=WORKERS,
        iter=ITER,
    )

    # Store embeddings mapping
    emb_mappings = model.wv
    model.wv.save_word2vec_format('Neo-Emb-2.emd')

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = []
    for node_index in range(0, adj_sparse.shape[0]):
        node_str = str(node_index)
        node_emb = emb_mappings[node_str]
        emb_list.append(node_emb)
    emb_matrix = np.vstack(emb_list)

    def get_edge_embeddings(edge_list):
        """Generate bootstrapped edge embeddings (as is done in the node2vec paper).

        Edge embedding for (v1, v2) = Hadamard product of node embeddings for v1, v2.
""" embs = [] for edge in edge_list: node1 = edge[0] node2 = edge[1] emb1 = emb_matrix[node1] emb2 = emb_matrix[node2] edge_emb = np.multiply(emb1, emb2) embs.append(edge_emb) embs = np.array(embs) return embs # Train-set edge embeddings pos_train_edge_embs = get_edge_embeddings(train_edges) neg_train_edge_embs = get_edge_embeddings(train_edges_false) train_edge_embs = np.concatenate( [pos_train_edge_embs, neg_train_edge_embs]) # Create train-set edge labels: 1 = real edge, 0 = false edge train_edge_labels = np.concatenate( [np.ones(len(train_edges)), np.zeros(len(train_edges_false))]) # Val-set edge embeddings, labels pos_val_edge_embs = get_edge_embeddings(val_edges) neg_val_edge_embs = get_edge_embeddings(val_edges_false) val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs]) val_edge_labels = np.concatenate( [np.ones(len(val_edges)), np.zeros(len(val_edges_false))]) # Test-set edge embeddings, labels pos_test_edge_embs = get_edge_embeddings(test_edges) neg_test_edge_embs = get_edge_embeddings(test_edges_false) test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs]) # Create val-set edge labels: 1 = real edge, 0 = false edge test_edge_labels = np.concatenate( [np.ones(len(test_edges)), np.zeros(len(test_edges_false))]) # Train logistic regression classifier on train-set edge embeddings #edge_classifier = LogisticRegression(random_state=0) edge_classifier = RandomForestClassifier(max_depth=10, random_state=0) edge_classifier.fit(train_edge_embs, train_edge_labels) # Predicted edge scores: probability of being of class "1" (real edge) val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1] val_roc = roc_auc_score(val_edge_labels, val_preds) val_ap = average_precision_score(val_edge_labels, val_preds) # Predicted edge scores: probability of being of class "1" (real edge) test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1] test_roc = roc_auc_score(test_edge_labels, test_preds) test_ap = average_precision_score(test_edge_labels, test_preds) result_file = open(result_file_path, "w") for para, value in emb_paras.items(): if para != "roc_file": result_file.write(para + " : " + str(value)) result_file.write("\n") for para, value in tuning_para.items(): if para != "fig_path": result_file.write(para + " : " + str(value)) result_file.write("\n") result_file.write("node2vec Validation ROC score: " + str(val_roc)) result_file.write("\n") result_file.write("node2vec Validation AP score: " + str(val_ap)) result_file.write("\n") result_file.write("node2vec Test ROC score: " + str(test_roc)) result_file.write("\n") result_file.write("node2vec Test AP score: " + str(test_ap)) result_file.write("\n") silhouette_score, purity_score = cluster(**tuning_para) result_file.write("silhouette score:" + str(silhouette_score)) result_file.write("\n") result_file.write("purity score:" + str(purity_score)) result_file.write("\n") result_file.close() fpr = dict() tpr = dict() roc_auc = dict() fpr, tpr, _ = roc_curve(test_edge_labels, test_preds) roc_auc = auc(fpr, tpr) lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='%s (area = %0.2f)' % ('RF', roc_auc)) plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('test') plt.legend(loc='lower right') plt.savefig(rocfile) plt.close()