def test_gae():
    """Smoke-test GAE: encoding, dense/indexed decoding, edge splitting, loss, metrics."""
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    # The encoder is the identity, so the latent codes must equal the input.
    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    # Dense decoding is checked against the sigmoid of the pairwise inner
    # products of the rows of z.
    pairwise_products = torch.Tensor([[+2, -1, +1], [-1, +5, +4],
                                      [+1, +4, +5]])
    adj = model.decode(z)
    assert adj.tolist() == torch.sigmoid(pairwise_products).tolist()

    # Decoding only selected node pairs.
    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode_indices(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    # Ten edges forming a path over nodes 0..10.
    edge_index = torch.tensor([list(range(10)), list(range(1, 11))])
    data = Data(edge_index=edge_index)
    data = model.split_edges(data, val_ratio=0.2, test_ratio=0.3)

    expected_sizes = {
        'val_pos_edge_index': (2, 2),
        'val_neg_edge_index': (2, 2),
        'test_pos_edge_index': (2, 3),
        'test_neg_edge_index': (2, 3),
        'train_pos_edge_index': (2, 5),
        'train_neg_adj_mask': (11, 11),
    }
    for attr_name, expected in expected_sizes.items():
        assert getattr(data, attr_name).size() == expected
    assert data.train_neg_adj_mask.sum().item() == (11**2 - 11) / 2 - 4 - 6 - 5

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    assert loss.item() > 0

    auc, ap = model.test(z, data.val_pos_edge_index, data.val_neg_edge_index)
    assert 0 <= auc <= 1 and 0 <= ap <= 1
def test_gae():
    """Smoke-test GAE with the decoder's ``forward_all`` and ``train_test_split_edges``."""
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    # Identity encoder: latent codes equal the input features.
    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    # Dense decoding is checked against the sigmoid of the pairwise inner
    # products of the rows of z.
    pairwise_products = torch.Tensor([[+2, -1, +1], [-1, +5, +4],
                                      [+1, +4, +5]])
    adj = model.decoder.forward_all(z)
    assert adj.tolist() == torch.sigmoid(pairwise_products).tolist()

    # Decoding only the listed node pairs.
    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    # Ten edges forming a path over nodes 0..10.
    edge_index = torch.tensor([list(range(10)), list(range(1, 11))])
    data = Data(edge_index=edge_index)
    data.num_nodes = edge_index.max().item() + 1
    data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.3)

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    assert loss.item() > 0

    auc, ap = model.test(z, data.val_pos_edge_index, data.val_neg_edge_index)
    assert 0 <= auc <= 1 and 0 <= ap <= 1
def test_gae():
    """Smoke-test GAE on splits produced by the ``RandomLinkSplit`` transform."""
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    # Identity encoder: latent codes equal the input features.
    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    # Dense decoding is checked against the sigmoid of the pairwise inner
    # products of the rows of z.
    pairwise_products = torch.Tensor([[+2, -1, +1], [-1, +5, +4],
                                      [+1, +4, +5]])
    adj = model.decoder.forward_all(z)
    assert adj.tolist() == torch.sigmoid(pairwise_products).tolist()

    # Decoding only the listed node pairs.
    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    # Ten edges forming a path over nodes 0..10.
    edge_index = torch.tensor([list(range(10)), list(range(1, 11))])
    data = Data(edge_index=edge_index, num_nodes=11)
    transform = RandomLinkSplit(split_labels=True,
                                add_negative_train_samples=False)
    train_data, val_data, test_data = transform(data)

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, train_data.pos_edge_label_index)
    assert loss.item() > 0

    auc, ap = model.test(z, val_data.pos_edge_label_index,
                         val_data.neg_edge_label_index)
    assert 0 <= auc <= 1 and 0 <= ap <= 1
def fit_model_once(self):
    """Build the GAE selected by ``self.model_type``, train it and score it.

    Returns:
        A ``(model, auc, ap)`` tuple: the model returned by the trainer's
        ``fit`` and the two metrics returned by ``evaluate`` on the test
        split.

    Raises:
        ValueError: if ``self.model_type`` is not a supported encoder name.
            (Previously this surfaced as an unbound-local error on
            ``encoder``.)
    """
    num_features = self.dataset.num_features

    # One lazily evaluated factory per supported encoder; only the selected
    # encoder is ever constructed.
    encoder_factories = {
        "GCN": lambda: EncoderGCN(in_channels=num_features, out_channels=32),
        "SAGE": lambda: EncoderSAGE(in_channels=num_features,
                                    out_channels=32),
        "GIN": lambda: EncoderGIN(in_channels=num_features, out_channels=32),
        "GAT": lambda: EncoderGAT(in_channels=num_features, out_channels=16,
                                  heads=8),
        "AGNN": lambda: EncoderAGNN(in_channels=num_features,
                                    out_channels=16),
        "GraphUNet": lambda: EncoderGraphUNet(in_channels=num_features,
                                              hidden_channels=32,
                                              out_channels=16),
    }
    try:
        build_encoder = encoder_factories[self.model_type]
    except KeyError:
        raise ValueError(
            "Unknown model type: {!r} (expected one of {})".format(
                self.model_type, sorted(encoder_factories))) from None
    encoder = build_encoder()

    model = GAE(encoder=encoder, decoder=InnerProductDecoder())
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    trainer = TrainerGae(model,
                         self.device,
                         self.data,
                         writer_path='runs/{}/{}/'.format(
                             self.model_type, self.text_encoding) +
                         self.time_mark())
    model = trainer.fit(optimizer,
                        patience=self.patience,
                        num_epochs=self.num_epochs)
    auc, ap = trainer.evaluate(validation=False, test=True)
    return model, auc, ap
def load_data(dataset_name):
    """Load a graph, split its edges and build node-pair feature sets.

    Returns:
        ``(x_tr, y_tr, x_val, y_val, x_test, y_test)`` as produced by
        ``combine_node_pair_features`` for the train, validation and test
        edge splits.
    """
    planetoid_names = ('cora', 'citeseer', 'pubmed')
    if dataset_name not in planetoid_names:
        data = load_wiki.load_data()
    else:
        here = os.path.dirname(os.path.realpath(__file__))
        path = os.path.join(here, '.', 'data', dataset_name)
        data = Planetoid(path, dataset_name)[0]

    data.edge_index = gutils.to_undirected(data.edge_index)
    data = GAE.split_edges(GAE, data)

    features = data.x.numpy()

    # Training negatives are sampled here, one per positive training edge.
    train_pos_edges = data.train_pos_edge_index.numpy()
    num_train_pos = train_pos_edges.shape[1]
    train_neg_edges = sample_negative(count=num_train_pos,
                                      avoid=train_pos_edges,
                                      nodes=features.shape[0])
    x_tr, y_tr = combine_node_pair_features(features, train_pos_edges,
                                            train_neg_edges)

    # Validation and test negatives come from the edge split itself.
    val_pos = data.val_pos_edge_index.numpy()
    val_neg = data.val_neg_edge_index.numpy()
    x_val, y_val = combine_node_pair_features(features, val_pos, val_neg)

    test_pos = data.test_pos_edge_index.numpy()
    test_neg = data.test_neg_edge_index.numpy()
    x_test, y_test = combine_node_pair_features(features, test_pos, test_neg)

    return x_tr, y_tr, x_val, y_val, x_test, y_test
def test_init():
    """Check that each autoencoder variant can be constructed from plain modules."""
    encoder = torch.nn.Linear(16, 32)
    decoder = torch.nn.Linear(32, 16)
    discriminator = torch.nn.Linear(32, 1)

    # Plain and variational autoencoders take (encoder, decoder).
    for autoencoder_cls in (GAE, VGAE):
        autoencoder_cls(encoder, decoder)

    # Adversarial variants additionally take a discriminator.
    for adversarial_cls in (ARGA, ARGVA):
        adversarial_cls(encoder, discriminator, decoder)
def fit_model_once(self):
    """Train one GAE with a GCN encoder and return its test ``(auc, ap)``."""
    encoder = EncoderGCN(in_channels=self.dataset.num_features,
                         out_channels=32)
    gcn_model = GAE(encoder=encoder, decoder=InnerProductDecoder())
    optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.01)

    # Each run writes to its own time-stamped directory.
    writer_path = 'runs/gae_gcn/{}/'.format(self.feature) + self.time_mark()
    trainer_gcn = TrainerGae(gcn_model,
                             self.device,
                             self.data,
                             writer_path=writer_path)

    gcn_model = trainer_gcn.fit(optimizer,
                                patience=self.patience,
                                num_epochs=self.num_epochs)
    auc, ap = trainer_gcn.evaluate(validation=False, test=True)
    return auc, ap
def get_model_and_optimizer(training_method, dataset_name, features_dimension,
                            device):
    """Build the GAE and its Adam optimizer for a training method.

    Args:
        training_method: ``'bp'`` selects ``GraphEncoder``; any other value
            is forwarded to ``DFAGraphEncoder`` as its ``training_method``.
        dataset_name: selects the learning rate for non-``'bp'`` methods.
        features_dimension: input feature size passed to the encoder.
        device: device the returned model is moved to.

    Returns:
        ``(model, optimizer)`` with the model moved to ``device``.

    Raises:
        ValueError: if a non-``'bp'`` method is requested for a dataset with
            no configured learning rate. (Previously this surfaced as an
            unbound-local error on ``optimizer``.)
    """
    if training_method == 'bp':
        learning_rate = 0.01
        model = GAE(GraphEncoder(features_dimension, 16))
    else:
        # Per-dataset learning rates for the alternative training methods.
        learning_rates = {'cora': 0.01, 'citeseer': 0.02, 'pubmed': 0.01}
        if dataset_name not in learning_rates:
            # Checked before the model is built so a bad name fails fast.
            raise ValueError(
                "No learning rate configured for dataset {!r} "
                "(expected one of {})".format(dataset_name,
                                              sorted(learning_rates)))
        learning_rate = learning_rates[dataset_name]
        model = GAE(
            DFAGraphEncoder(features_dimension,
                            16,
                            training_method=training_method))

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    return model.to(device), optimizer
def __init__(self, data, embed_dim, **kwargs):
    """Set up the encoder, the wrapping GAE and the target device.

    Args:
        data: data object; its ``dim`` attribute is used as the input size.
        embed_dim: size of the produced embeddings.
        **kwargs: optional settings:
            ``hidden_dim`` (default ``2 * embed_dim``),
            ``encoder`` (encoder class, default ``batched_SAGEEncoder``),
            ``device`` (default ``'cuda'``; ``'gpu'`` is accepted as an
            alias for ``'cuda'``).
    """
    super(UnsGAE, self).__init__()
    self.data = data
    self.input_dim = self.data.dim
    self.embed_dim = embed_dim

    # for now, we only work with 2-layer encoders
    self.hidden_dim = kwargs.get('hidden_dim', 2 * embed_dim)
    self.encoder = kwargs.get('encoder', batched_SAGEEncoder)
    self.encoder = self.encoder(self.input_dim, self.hidden_dim,
                                self.embed_dim)
    self.model = GAE(self.encoder)

    # preparing the device
    device = kwargs.get('device', 'cuda')
    if device == 'gpu':
        # torch.device does not accept 'gpu'; map it to the CUDA backend.
        device = 'cuda'
    # The fallback has to cover the default 'cuda' (and 'cuda:N') as well;
    # the old check only matched the literal 'gpu' and so never fired for
    # the default.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print('CUDA is not available in PyTorch. the model ' +
              'will be initiated on CPU.')
        device = 'cpu'
    self.device = torch.device(device)
def train(dataset, args, writer=None):
    """Train a node-classification or link-prediction model on ``dataset``.

    Args:
        dataset: dataset handed to ``DataLoader``; also supplies
            ``num_node_features`` and ``num_classes`` for the model.
        args: run configuration; this function reads ``task``,
            ``batch_size``, ``hidden_dim``, ``metrics_for_labels``,
            ``model_name``, ``graph_type``, ``dataset`` and ``epochs``.
        writer: optional summary writer. When ``None`` the loss and the
            periodic metric are printed instead of logged.

    Raises:
        RuntimeError: if ``args.task`` is neither ``'link'`` nor ``'node'``.
    """
    task = args.task
    # NOTE(review): the periodic evaluation below runs on this same
    # (training) loader -- confirm that is intended.
    loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

    if task == 'link':
        model = GAE(
            models.GNNStack(dataset.num_node_features, args.hidden_dim,
                            int(dataset.num_classes), args))
    elif task == 'node':
        model = models.GNNStack(dataset.num_node_features, args.hidden_dim,
                                int(dataset.num_classes), args)
    else:
        raise RuntimeError("Unknown task.")

    # The flag arrives as a string on the command line.
    metrics_for_labels = args.metrics_for_labels == 'True'
    # NOTE(review): the scheduler is returned but never stepped here.
    scheduler, opt = build_optimizer(args, model.parameters())

    print("Training \nModel: {}, Data representation: {}. Dataset: {}, Task type: {}".
          format(args.model_name, args.graph_type, args.dataset, args.task))
    metric_text = 'test accuracy' if task == 'node' else 'test precision'

    for epoch in range(args.epochs):
        total_loss = 0
        model.train()
        for batch in loader:
            opt.zero_grad()
            if task == 'node':
                # Supervised loss on the training nodes only.
                pred = model(batch)
                label = batch.y
                pred = pred[batch.train_mask]
                label = label[batch.train_mask]
                loss = model.loss(pred, label)
            else:
                # Reconstruction loss on the positive training edges.
                train_pos_edge_index = batch.train_pos_edge_index
                z = model.encode(batch)
                loss = model.recon_loss(z, train_pos_edge_index)
            loss.backward()
            opt.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(loader.dataset)

        if writer is None:
            print(total_loss)
        else:
            writer.add_scalar("loss", total_loss, epoch)

        if epoch % 10 == 0:
            test_metric, _ = test(loader, model, task=task)
            if writer is None:
                print(test_metric, metric_text)
            else:
                writer.add_scalar(metric_text, test_metric, epoch)

        # Per-label metrics are reported once, after the final epoch.
        if metrics_for_labels and epoch == args.epochs - 1:
            _, labels_metrics = test(loader,
                                     model,
                                     task=task,
                                     metrics_for_labels=metrics_for_labels)
            print('{} for labels:\n {}'.format(metric_text, labels_metrics))
def load_data(dataset_name):
    """Load a Planetoid graph and attach edge splits, edge masks and batch info.

    Returns:
        ``(data, num_features)``: the augmented first graph of the dataset
        and the dataset's feature count.
    """
    base_dir = osp.dirname(osp.realpath(__file__))
    path = osp.join(base_dir, '.', 'data', dataset_name)
    dataset = Planetoid(path, dataset_name, T.TargetIndegree())
    num_features = dataset.num_features

    data = GAE.split_edges(GAE, dataset[0])

    # Make each positive split undirected; the order here fixes the column
    # order of the concatenated edge_index below.
    split_names = ('train_pos_edge_index', 'val_pos_edge_index',
                   'test_pos_edge_index')
    for split_name in split_names:
        setattr(data, split_name,
                gutils.to_undirected(getattr(data, split_name)))

    splits = [getattr(data, split_name) for split_name in split_names]
    data.edge_index = torch.cat(splits, dim=1)
    split_sizes = [split.size(-1) for split in splits]

    def segment_mask(selected):
        # Ones over the selected split's columns, zeros over the other two.
        pieces = [
            torch.ones(size) if position == selected else torch.zeros(size)
            for position, size in enumerate(split_sizes)
        ]
        return torch.cat(pieces, dim=0).byte()

    data.edge_train_mask = segment_mask(0)
    data.edge_val_mask = segment_mask(1)
    data.edge_test_mask = segment_mask(2)

    # Every edge gets type 0 and every node is assigned to graph 0.
    data.edge_type = torch.zeros((data.edge_index.size(-1), )).long()
    data.batch = torch.zeros((1, data.num_nodes), dtype=torch.int64).view(-1)
    data.num_graphs = 1
    return data, num_features
data = dataset[0] class Encoder(torch.nn.Module): def __init__(self, in_channels, out_channels): super(Encoder, self).__init__() self.conv1 = GCNConv(in_channels, 2 * out_channels) self.conv2 = GCNConv(2 * out_channels, out_channels) def forward(self, x, edge_index): x = F.relu(self.conv1(x, edge_index)) return self.conv2(x, edge_index) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = GAE(Encoder(dataset.num_features, out_channels=16)).to(device) data.train_mask = data.val_mask = data.test_mask = data.y = None data = model.split_edges(data) x, edge_index = data.x.to(device), data.edge_index.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) def train(): model.train() optimizer.zero_grad() z = model.encode(x, edge_index) loss = model.loss(z, data.train_pos_edge_index, data.train_neg_adj_mask) loss.backward() optimizer.step()
link_probs = link_logits.sigmoid() link_labels = get_link_labels(pos_edge_index, neg_edge_index) perfs.append(roc_auc_score(link_labels, link_probs)) return perfs # -------------------------------------------------------------------------------------------------------- # -------------------------------------------------------------------------------------------------------- train_loader = NeighborSampler(data.train_pos_edge_index, batch_size=8, shuffle=True, sizes=[5, 5]) # subgraph_loader = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], batch_size=128, shuffle=False) enc = Encoder() decod = DEDICOMDecoder() model = GAE(enc, decod) optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001) # ---------------------------------------------------------------------------------------------------------- # ---------------------------------------------------------------------------------------------------------- best_val_perf = test_perf = 0 for epoch in range(1, 3): train_loss = train() print(epoch, train_loss) # val_perf = test() # log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}' # print(log.format(epoch, train_loss, val_perf[0]))
def run_experiment(args):
    """Train a GAE/VGAE link predictor and evaluate the graph it reconstructs.

    The model is trained with early stopping on validation AP, the
    checkpoint in ``checkpoint.pt`` is reloaded, test AUC/AP are printed and
    the reconstructed graph is scored either densely (``args.lsh`` unset) or
    by comparing the dense result with the LSH decoder (``args.lsh`` set).

    Args:
        args: experiment configuration (dataset, model, decoder, latent_dim,
            epochs, early-stopping, LSH and sampling options). Note that
            ``args.min_sim_absolute_value`` is filled in when it is ``None``.

    Returns:
        When ``args.lsh`` is set, a dict with ``args``, the test AUC/AP and
        the naive/LSH precision, recall, time and size figures plus their
        mutual comparison. Otherwise ``None``.
    """
    dataset, data = load_data(args.dataset)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define Model
    encoder = create_encoder(args.model, dataset.num_features,
                             args.latent_dim).to(device)
    decoder = create_decoder(args.decoder).to(device)
    if args.model == 'GAE':
        model = GAE(encoder=encoder, decoder=decoder).to(device)
    else:
        model = VGAE(encoder=encoder, decoder=decoder).to(device)

    # Split edges of a torch_geometric.data.Data object into pos negative train/val/test edges
    # default ratios of positive edges: val_ratio=0.05, test_ratio=0.1
    print("Data.edge_index.size", data.edge_index.size(1))
    data = model.split_edges(data)
    node_features, train_pos_edge_index = data.x.to(
        device), data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train_epoch():
        """Run one optimisation step over the training edges.

        :return: dict with the scalar training loss under ``'loss'``
        """
        # Todo: Add logging of results
        model.train()
        optimizer.zero_grad()
        # Compute latent embedding Z
        latent_embeddings = model.encode(node_features, train_pos_edge_index)
        # Reconstruction loss, plus the KL term for the variational model
        loss = model.recon_loss(latent_embeddings, train_pos_edge_index)
        if args.model in ['VGAE']:
            loss = loss + (1 / data.num_nodes) * model.kl_loss()
        # Compute gradients
        loss.backward()
        # Perform optimization step
        optimizer.step()
        # ToDo: Add logging via Tensorboard
        # Store a plain float so the per-epoch logs do not keep every
        # autograd graph alive.
        return {'loss': loss.item()}

    def test(pos_edge_index, neg_edge_index):
        """Return ``(AUC, AP)`` of the model on the given edge sets."""
        model.eval()
        with torch.no_grad():
            # compute latent var
            z = model.encode(node_features, train_pos_edge_index)
            # model.test return - AUC, AP
            return model.test(z, pos_edge_index, neg_edge_index)

    def test_naive_graph(z, sample_size=1000):
        """Score the densely decoded graph (optionally on a node sample)."""
        use_sigmoid = args.decoder == 'dot'
        if args.sample_dense_evaluation:
            graph_type = "sampled"
            z_sample, index_mapping = sample_graph(z, sample_size)
            t = time.time()
            adjacency = model.decoder.forward_all(z_sample,
                                                  sigmoid=use_sigmoid)
        else:
            graph_type = "full"
            t = time.time()
            adjacency = model.decoder.forward_all(z, sigmoid=use_sigmoid)
        print(f"Computing {graph_type} graph took {time.time() - t} seconds.")
        print(
            f"Adjacency matrix takes {adjacency.element_size() * adjacency.nelement() / 10 ** 6} MB of memory."
        )

        if args.min_sim_absolute_value is None:
            args.min_sim_absolute_value, _ = sample_percentile(
                args.min_sim,
                adjacency,
                dist_measure=args.decoder,
                sample_size=sample_size)

        if args.sample_dense_evaluation:
            precision, recall = sampled_dense_precision_recall(
                data, adjacency, index_mapping, args.min_sim_absolute_value)
        else:
            precision, recall = dense_precision_recall(
                data, adjacency, args.min_sim_absolute_value)
        print("Predicted {} adjacency matrix has precision {} and recall {}!".
              format(graph_type, precision, recall))
        return precision, recall

    def sample_graph(z, sample_size):
        """Draw up to ``sample_size`` rows of ``z`` without replacement."""
        N, D = z.shape
        sample_size = min(sample_size, N)
        sample_ix = np.random.choice(np.arange(N),
                                     size=sample_size,
                                     replace=False)
        # Returns the sampled embeddings, and a mapping from their indices to the originals
        return z[sample_ix], {
            i: sample_ix[i]
            for i in np.arange(sample_size)
        }

    def test_compare_lsh_naive_graphs(z, assure_correctness=True):
        """Decode ``z`` densely and with LSH, then score and compare both.

        :param z: latent node embeddings
        :param assure_correctness: forwarded to ``LSHDecoder``
        :return: naive precision, recall, time, size; LSH precision, recall,
            time, size; and precision/recall of LSH w.r.t. the dense result
        """
        # Naive Adjacency-Matrix (Non-LSH-Version)
        t = time.time()
        # NOTE(review): an earlier comment said "don't use sigmoid" so the
        # thresholds match LSH, but the sigmoid IS applied for the 'dot'
        # decoder -- confirm which one is intended.
        naive_adjacency = model.decoder.forward_all(
            z, sigmoid=(args.decoder == 'dot'))
        naive_time = time.time() - t
        naive_size = naive_adjacency.element_size() * naive_adjacency.nelement(
        ) / 10**6

        if args.min_sim_absolute_value is None:
            args.min_sim_absolute_value, _ = sample_percentile(
                args.min_sim, z, dist_measure=args.decoder)

        print(
            "______________________________Naive Graph Computation KPI____________________________________________"
        )
        print(f"Computing naive graph took {naive_time} seconds.")
        print(f"Naive adjacency matrix takes {naive_size} MB of memory.")

        # LSH-Adjacency-Matrix:
        t = time.time()
        lsh_adjacency = LSHDecoder(bands=args.lsh_bands,
                                   rows=args.lsh_rows,
                                   verbose=True,
                                   assure_correctness=assure_correctness,
                                   sim_thresh=args.min_sim_absolute_value)(z)
        lsh_time = time.time() - t
        lsh_size = lsh_adjacency.element_size() * lsh_adjacency._nnz() / 10**6

        print(
            "__________________________________LSH Graph Computation KPI__________________________________________"
        )
        print(f"Computing LSH graph took {lsh_time} seconds.")
        print(f"Sparse adjacency matrix takes {lsh_size} MB of memory.")

        print(
            "________________________________________Precision-Recall_____________________________________________"
        )
        # 1) Evaluation: Both Adjacency matrices against ground truth graph
        naive_precision, naive_recall = dense_precision_recall(
            data, naive_adjacency, args.min_sim_absolute_value)
        lsh_precision, lsh_recall = sparse_precision_recall(
            data, lsh_adjacency)
        print(
            f"Naive-Precision {naive_precision}; Naive-Recall {naive_recall}")
        print(f"LSH-Precision {lsh_precision}; LSH-Recall {lsh_recall}")

        print(
            "_____________________________Comparison Sparse vs Dense______________________________________________"
        )
        # 2) Evaluation: Compare both adjacency matrices against each other
        compare_precision, compare_recall = sparse_v_dense_precision_recall(
            naive_adjacency, lsh_adjacency, args.min_sim_absolute_value)
        print(
            f"LSH sparse matrix has {compare_precision} precision and {compare_recall} recall w.r.t. the naively generated dense matrix!"
        )
        return naive_precision, naive_recall, naive_time, naive_size, lsh_precision, lsh_recall, lsh_time, lsh_size, compare_precision, compare_recall

    # Training routine
    early_stopping = EarlyStopping(args.use_early_stopping,
                                   patience=args.early_stopping_patience,
                                   verbose=True)
    logs = []

    if args.load_model and os.path.isfile("checkpoint.pt"):
        print("Loading model from savefile...")
        model.load_state_dict(torch.load("checkpoint.pt"))

    # Bound up front: training may be skipped entirely, or the loop below
    # may run zero times when args.epochs <= 1.
    epoch = 0
    if not (args.load_model and args.early_stopping_patience == 0):
        for epoch in range(1, args.epochs):
            log = train_epoch()
            logs.append(log)

            # Validation metrics
            val_auc, val_ap = test(data.val_pos_edge_index,
                                   data.val_neg_edge_index)
            print('Validation-Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
                epoch, val_auc, val_ap))

            # Stop training if validation scores have not improved
            early_stopping(val_ap, model)
            if early_stopping.early_stop:
                print("Applying early-stopping")
                break

    # Load best encoder
    print("Load best model for evaluation.")
    model.load_state_dict(torch.load('checkpoint.pt'))
    print(
        "__________________________________________________________________________"
    )

    # Training is finished, calculate test metrics
    test_auc, test_ap = test(data.test_pos_edge_index,
                             data.test_neg_edge_index)
    print('Test Results: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
        epoch, test_auc, test_ap))

    # Check if early stopping was applied or not - if not: model might not be done with training
    if args.epochs == epoch + 1:
        print("Model might need more epochs - Increase number of Epochs!")

    # Evaluate full graph. This is inference only, so autograd is disabled:
    # the dense adjacency decoded from these embeddings would otherwise
    # carry a full computation graph.
    with torch.no_grad():
        latent_embeddings = model.encode(node_features, train_pos_edge_index)

    # Save embeddings to embeddings folder if flag is set
    if args.save_embeddings:
        embeddings_folder = osp.join(osp.dirname(osp.abspath(__file__)),
                                     'embeddings')
        if not osp.isdir(embeddings_folder):
            os.makedirs(embeddings_folder)
        torch.save(
            latent_embeddings,
            osp.join(embeddings_folder,
                     args.dataset + "_" + args.decoder + ".pt"))

    if not args.lsh:
        # Compute precision recall w.r.t the ground truth graph.
        # The metrics are only printed; no result dict is built on this path.
        test_naive_graph(latent_embeddings)
        comparison = None
    else:
        # Precision w.r.t. the generated graph
        comparison = test_compare_lsh_naive_graphs(latent_embeddings)

    # Drop the model references and release cached GPU memory.
    del model
    del encoder
    del decoder
    torch.cuda.empty_cache()

    if comparison is None:
        return None

    naive_precision, naive_recall, naive_time, naive_size, lsh_precision, \
        lsh_recall, lsh_time, lsh_size, \
        compare_precision, compare_recall = comparison
    return {
        'args': args,
        'test_auc': test_auc,
        'test_ap': test_ap,
        'naive_precision': naive_precision,
        'naive_recall': naive_recall,
        'naive_time': naive_time,
        'naive_size': naive_size,
        'lsh_precision': lsh_precision,
        'lsh_recall': lsh_recall,
        'lsh_time': lsh_time,
        'lsh_size': lsh_size,
        'compare_precision': compare_precision,
        'compare_recall': compare_recall
    }
def run_model(dataset, conf):
    """Build the table graph, train a graph auto-encoder on it and evaluate.

    Args:
        dataset: table corpus passed to ``create_corpus`` and
            ``evaluate_model``.
        conf: configuration mapping; reads ``add_attr``, ``shuffle_vocab``,
            ``row_edges_sample``, ``column_edges_sample``, ``vector_size``
            and ``epoch_num``.

    Returns:
        ``(cell_vectors, vec_list, losses, result_score)``. ``losses`` holds
        one detached loss tensor per epoch.
    """
    # ## 1) Build Table graph
    # ### Tables tokenization
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = create_corpus(
        dataset, include_attr=conf["add_attr"])

    if conf["shuffle_vocab"] == True:
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2 ) Run Table Auto-Encoder Model:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    graph_data = graph_data.to(device)
    # Take the tensors from the moved graph so they sit on the same device
    # as the model; the original `nodes`/`edges` stay where they were built.
    x, train_pos_edge_index = graph_data.x, graph_data.edge_index

    class Encoder(torch.nn.Module):
        # NOTE(review): forward returns a (mu, logvar) pair, which is what a
        # variational model consumes, yet the encoder is wrapped in the
        # plain GAE below -- confirm which model is intended.
        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels,
                                       out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = GAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        """Run one optimisation step and return the detached loss."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        #loss = model.kl_loss()
        loss.backward()
        optimizer.step()
        # Detach so the stored history does not keep each epoch's graph.
        return loss.detach()

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        # Exactly one entry per epoch (the loss used to be appended twice).
        losses.append(loss)
        print(epoch, loss)

    # ### 3) Extract the latent cell vectors, generate table vectors:
    def get_cell_vectors(model, x, train_pos_edge_index):
        """Encode all nodes and return the embeddings as tensor and array."""
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            # Move to host memory first; numpy() cannot read a CUDA tensor.
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)
    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # ## 4) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
self.conv2 = GCNConv(2 * out_channels, out_channels, cached=False) # Map into 2*out_channels dimentions with def forward(self, x, edge_index): x = F.relu(self.conv1(x, edge_index)) return self.conv2(x, edge_index) #This is the size of the latent embedding channels = 32 # We have 75 origional features num_features = 75 dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #dev = torch.device('cpu') model = GAE(Encoder(num_features, channels).to(dev)) #data.train_mask = data.val_mask = data.test_mask = data.y = None #data = model.split_edges(data) #x, train_edge_index = data.x.to(dev), data.edge_index.to(dev) optimizer = torch.optim.Adam(model.parameters(), lr=0.005) def train(loader): model.train() loss_all = 0 for data in loader: x, train_edge_index = data.x.to(dev), data.edge_index.to(dev) optimizer.zero_grad() z = model.encode(x, train_edge_index) loss = model.recon_loss(z, train_edge_index) loss.backward()
act = F.relu sum_res = True variational = False path = Path(__file__).parent / "../../test/data/BBA-subset-100.h5" node_feature_path = ( Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy" ) dataset = ContactMapDataset( path, "contact_map", ["rmsd"], node_feature_path=node_feature_path ) loader = DataLoader(dataset, batch_size=1, shuffle=True) # Select node AE if args.linear: node_ae = GAE(LinearEncoder(num_features, node_out_channels)) else: node_ae = GAE(GCNEncoder(num_features, node_out_channels)) # Select graph AE encoder = VariationalGraphEncoder( node_out_channels, hidden_channels, graph_out_channels, depth, pool_ratios, act, variational, ) decoder = VariationalGraphDecoder( graph_out_channels,
Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy" ) dataset = ContactMapDataset( path, "contact_map", ["rmsd"], node_feature_path=node_feature_path ) data = dataset[0]["X"] loader = DataLoader(dataset, batch_size=1, shuffle=True) # Parameters out_channels = 10 num_features = 13 # Model if not args.variational: if not args.linear: model = GAE(GCNEncoder(num_features, out_channels)) else: model = GAE(LinearEncoder(num_features, out_channels)) else: if args.linear: model = VGAE(VariationalLinearEncoder(num_features, out_channels)) else: model = VGAE(VariationalGCNEncoder(num_features, out_channels)) # Hardware device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model, data = model.to(device), data.to(device) # Optimizer optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
dataset = Planetoid(root='tmp', name='PubMed') print("use dataset: PubMed") data = dataset[0] enhanced_data = train_test_split_edges(data.clone(), val_ratio=0.1, test_ratio=0.2) train_data = Data(x=enhanced_data.x, edge_index=enhanced_data['train_pos_edge_index']).to(DEVICE) target_data = data.to(DEVICE) if args.model is 'VGAE': model = VGAE(encoder=VEncoder(data['x'].shape[1])).to(DEVICE) else: model = GAE(encoder=Encoder(data['x'].shape[1])).to(DEVICE) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=5e-4) def model_train(): print("========Start training========") for epoch in range(args.num_epoch): model.train() optimizer.zero_grad() z = model.encode(train_data) recon_loss = model.recon_loss(z, target_data['edge_index']) if args.model is 'VGAE': recon_loss += model.kl_loss() / data['x'].shape[0]
parser.add_argument('--dataset') parser.add_argument('--epochs', type=int, default=200) parser.add_argument('--val-freq', type=int, default=20) parser.add_argument('--runs', type=int, default=10) parser.add_argument('--test', action='store_true', default=False) args = parser.parse_args() if args.dataset in ['cora', 'citeseer', 'pubmed']: path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.', 'data', args.dataset) data = Planetoid(path, args.dataset)[0] else: data = load_wiki.load_data() data.edge_index = gutils.to_undirected(data.edge_index) data = GAE.split_edges(GAE, data) num_features = data.x.shape[1] aucs = [] aps = [] for run in range(args.runs): model = VGAE(VGAE_Encoder(num_features)) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Training loop for epoch in range(args.epochs): model.train() optimizer.zero_grad() z = model.encode(data.x, data.train_pos_edge_index) loss = model.recon_loss( z, data.train_pos_edge_index) #0.01*model.kl_loss()
help='Residual connection') args = parser.parse_args() #download datasets path = os.join(os.dirname(os.realpath(__file__)), '..', 'data', args.dataset) dataset = Planetoid(path, args.dataset) dev = torch.device(args.dev) if args.model == VGAE: model = VGAE( Encoder_VGAE(dataset.num_features, args.hidden1, args.hidden2, args.depth, args.res)).to(dev) else: model = GAE( Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2, args.depth, args.res)).to(dev) auc_score_list = [] ap_score_list = [] print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :", args.res, ", Layer depth:", args.depth, " ") for i in range(1, args.runs + 1): data = dataset[0] data.train_mask = data.val_mask = data.test_mask = data.y = None data = train_test_split_edges(data) x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
def run_GAE(input_data, output_dir, epochs=1000, lr=0.01, weight_decay=0.0005):
    """Train a GAE for link prediction, logging and plotting its progress.

    Args:
        input_data: graph data object; it is cloned, so the caller's object
            is not handed to ``split_edges``.
        output_dir: directory that receives the log and the plots.
        epochs: number of training epochs.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        None. Results are printed, logged via ``write_log`` and plotted via
        ``plot_linkpred``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'GAE')
    print('Model params:{:19} lr: {} weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    data = input_data.clone().to(device)
    in_channels = data.num_features
    out_channels = data.num_classes.item()
    model = GAE(GAEncoder(in_channels, out_channels)).to(device)

    split_data = model.split_edges(data)
    x = split_data.x.to(device)
    train_pos_edge_index = split_data.train_pos_edge_index.to(device)
    split_data.train_idx = split_data.test_idx = data.y = None

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)
    train_losses, test_losses = [], []
    aucs = []
    aps = []

    # Loop-invariant output location, prepared once instead of every epoch.
    figname = os.path.join(
        output_dir, "_".join((GAE.__name__, str(lr), str(weight_decay))))
    makepath(output_dir)

    # Report roughly ten times per run; never 0, which made the modulo
    # below divide by zero for epochs < 10.
    report_every = max(1, epochs // 10)

    for epoch in range(1, epochs + 1):
        # Re-enter training mode each epoch: the evaluation below switches
        # the model to eval mode, which previously stayed in effect for all
        # later training steps.
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(z, train_pos_edge_index)
        # Keep a detached copy so the history does not retain every graph.
        train_losses.append(train_loss.detach())
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, split_data.test_pos_edge_index,
                                 split_data.test_neg_edge_index)
            test_loss = model.recon_loss(z, split_data.test_pos_edge_index)
        test_losses.append(test_loss.item())
        aucs.append(auc)
        aps.append(ap)

        if epoch % report_every == 0:
            print(
                'Epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
        if epoch == epochs:
            print(
                '-' * 65,
                '\nFinal epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
            log = 'Final epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'.format(
                epoch, train_loss, test_loss, auc, ap)
            write_log(log, figname)
            print('-' * 65)

    plot_linkpred(train_losses, test_losses, aucs, aps, output_dir, epochs,
                  figname)
    return
super().__init__() self.rel_emb = Parameter(torch.Tensor(num_relations, hidden_channels)) self.reset_parameters() def reset_parameters(self): torch.nn.init.xavier_uniform_(self.rel_emb) def forward(self, z, edge_index, edge_type): z_src, z_dst = z[edge_index[0]], z[edge_index[1]] rel = self.rel_emb[edge_type] return torch.sum(z_src * rel * z_dst, dim=1) model = GAE( RGCNEncoder(data.num_nodes, hidden_channels=500, num_relations=dataset.num_relations), DistMultDecoder(dataset.num_relations // 2, hidden_channels=500), ) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) def negative_sampling(edge_index, num_nodes): # Sample edges by corrupting either the subject or the object of each edge. mask_1 = torch.rand(edge_index.size(1)) < 0.5 mask_2 = ~mask_1 neg_edge_index = edge_index.clone() neg_edge_index[0, mask_1] = torch.randint(num_nodes, (mask_1.sum(), )) neg_edge_index[1, mask_2] = torch.randint(num_nodes, (mask_2.sum(), )) return neg_edge_index
def test(pos_edge_index, neg_edge_index): model.eval() with torch.no_grad(): for _, _, adjs in train_loader: adjs = [adj.to(dev) for adj in adjs] z = model.encode(x, adjs) return model.test(z, pos_edge_index, neg_edge_index) if __name__ == "__main__": dataset = RetweetDataset(root='./', transform=T.NormalizeFeatures()) latent_dim = 32 dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model = GAE(Encoder(dataset.num_features, latent_dim)).to(dev) data = dataset[0] data.train_mask = data.val_mask = data.test_mask = data.y = None data = train_test_split_edges(data) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) train_loader = NeighborSampler(data.train_pos_edge_index, node_idx=None, sizes=[25, 10], num_nodes=data.num_nodes, batch_size=data.x.shape[0], shuffle=True, num_workers=1) x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev)
class UnsGAE(object):
    """Graph auto-encoder trained on positive/negative node pairs.

    Wraps a ``GAE`` built around a neighbour-sampling encoder and drives
    training (``init_training`` + ``train_one_epoch``), batched embedding
    (``embed_some`` / ``embed_all``) and a precision-based validation.

    The ``data`` object is project-specific. This class reads or calls, among
    others: ``dim``, ``get_neighbor_sampler``, ``train_loader``,
    ``test_loader``, ``get_node_features``, ``pos_pairs``,
    ``load_positive_pairs``, ``neg_sampler``, ``get_negative_sampler``,
    ``_fetch_node_features``, ``n_x``, ``selected_inds``, ``nA``, ``tags``,
    ``selected_ents``, ``studied_ents`` and ``GT``.
    """

    def __init__(self, data, embed_dim, **kwargs):
        """Build the encoder/GAE pair and pick the compute device.

        Keyword Args:
            hidden_dim: hidden width of the encoder (default ``2*embed_dim``).
            encoder: encoder class, called as
                ``encoder(input_dim, hidden_dim, embed_dim)``
                (default ``batched_SAGEEncoder``).
            device: torch device string; ``'gpu'`` is accepted as an alias of
                ``'cuda'`` (default ``'cuda'``). Falls back to the CPU, with a
                printed notice, when CUDA is requested but unavailable.
        """
        super(UnsGAE, self).__init__()
        self.data = data
        self.input_dim = self.data.dim
        self.embed_dim = embed_dim

        # for now, we only work with 2-layer encoders
        self.hidden_dim = kwargs.get('hidden_dim', 2*embed_dim)
        self.encoder = kwargs.get('encoder', batched_SAGEEncoder)
        self.encoder = self.encoder(self.input_dim,
                                    self.hidden_dim,
                                    self.embed_dim)
        self.model = GAE(self.encoder)

        # preparing the device
        device = kwargs.get('device', 'cuda')
        if device == 'gpu':
            # 'gpu' is not a device string torch understands.
            device = 'cuda'
        if str(device).startswith('cuda') and not torch.cuda.is_available():
            print('CUDA is not available in PyTorch. the model ' +
                  'will be initiated on CPU.')
            device = 'cpu'
        self.device = torch.device(device)

    def init_model(self, sizes, weights_path=None):
        """Move the model to the device and prepare neighbour sampling.

        Args:
            sizes: per-layer neighbour sampling sizes; must have one entry
                per encoder layer. Handed to ``data.get_neighbor_sampler``.
            weights_path: optional path to a state dict to load.
        """
        self.model = self.model.to(self.device)

        assert len(sizes) == self.model.encoder.num_layers, \
            'Number of sizes should be equal to the number of layers in the encoder.'
        self.sizes = sizes
        # NOTE(review): this checks for `data.loader`, while the embedding
        # methods use `data.train_loader` / `data.test_loader` -- confirm
        # which attribute `get_neighbor_sampler` actually sets.
        if not hasattr(self.data, 'loader'):
            self.data.get_neighbor_sampler(self.sizes)

        if weights_path is not None:
            self.model.load_state_dict(
                torch.load(weights_path, map_location=self.device))

    def init_training(self, neg_num, optim='Adam', lr=1e-5, smooth_par=0.75,
                      **kwargs):
        """Create the optimizer and make sure the training data is ready.

        Args:
            neg_num: number of negative samples drawn per positive pair.
            optim: ``'Adam'`` or ``'SGD'``.
            lr: learning rate.
            smooth_par: forwarded to ``data.get_negative_sampler`` when the
                data has no ``neg_sampler`` yet.

        Keyword Args:
            pos_samples_path: required when ``data`` has no ``pos_pairs``;
                passed to ``data.load_positive_pairs``.
            include_nodes: optional, forwarded to ``load_positive_pairs``.

        Raises:
            ValueError: for an unsupported ``optim`` or a missing
                ``pos_samples_path``.
        """
        if optim == 'Adam':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        elif optim == 'SGD':
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        else:
            raise ValueError(
                f"Unsupported optimizer {optim!r}; use 'Adam' or 'SGD'.")

        self.train_one_epoch = self._train_edge_batching
        self.neg_num = neg_num

        if not hasattr(self.data, 'pos_pairs'):
            if 'pos_samples_path' not in kwargs:
                raise ValueError(
                    'The provided data does ' +
                    'not come with positive pairs, and we need a path to the ' +
                    'already selected positive samples. You can provide it through ' +
                    'input pos_samples_path .')
            include_nodes = kwargs.get('include_nodes', None)
            self.data.load_positive_pairs(kwargs['pos_samples_path'],
                                          include_nodes)

        if not hasattr(self.data, 'neg_sampler'):
            self.data.get_negative_sampler(smooth_par)

        if not hasattr(self.data, 'x_all'):
            self.data._fetch_node_features()

    def init_validation(self):
        """Fetch the node features if the data object does not hold them."""
        if not hasattr(self.data, 'x_all'):
            self.data._fetch_node_features()

    def embed_some(self, sample_inds, b=100):
        """Embed the given samples in chunks of at most ``b``.

        This will be used in the training, when the embedding of a batch
        of samples are needed.

        Every sample is embedded exactly once; the per-chunk encoder outputs
        are concatenated along dim 0 in input order. An empty input yields an
        empty ``(0, embed_dim)`` tensor.
        """
        if len(sample_inds) == 0:
            return torch.empty((0, self.embed_dim), device=self.device)

        Z = []
        for start in range(0, len(sample_inds), b):
            b_ids = sample_inds[start:start + b]

            # neighbor-sampling for each sample
            _, n_id, adjs = self.data.train_loader.sample(b_ids)
            adjs = [adj.to(self.device) for adj in adjs]

            # get feature vectors through the neighbors sampled above
            batch_X = torch.from_numpy(self.data.get_node_features(n_id))
            batch_X = batch_X.to(torch.float).to(self.device)

            # the encoder's output as the embedding
            Z.append(self.model.encoder(batch_X, adjs))

        return torch.cat(Z, dim=0)

    def embed_all(self):
        """Compute embeddings layer by layer over ``data.test_loader``.

        Inference is used in the evaluation stage (not in training) when
        the embeddings for "all" nodes will be computed. It's written in a
        way that is faster than the forward-passing function which is mostly
        used for single batches in the training.

        Returns:
            The CPU tensor produced by the last encoder layer, concatenated
            over all batches of ``data.test_loader``.
        """
        L = self.model.encoder.num_layers
        pbar = tqdm(total=self.data.n_x * L, position=0, leave=True)
        pbar.set_description('Evaluating')

        self.model.encoder.eval()
        with torch.no_grad():
            for i in range(L):
                xs = []
                for batch_size, n_id, adj in self.data.test_loader:
                    edge_index, _, size = adj.to(self.device)
                    if i == 0:
                        # First layer reads raw node features ...
                        x = torch.from_numpy(self.data.get_node_features(n_id))
                        x = x.to(torch.float).to(self.device)
                    else:
                        # ... later layers read the previous layer's output.
                        x = x_all[n_id, :].to(self.device)
                    x_target = x[:size[1]]
                    x = self.model.encoder.convs[i]((x, x_target), edge_index)
                    if i != L-1:
                        x = F.relu(x)
                    xs.append(x[:batch_size, :].cpu())

                    pbar.update(batch_size)

                x_all = torch.cat(xs, dim=0)

        pbar.close()

        return x_all

    def _train_edge_batching(self, ep, batch_size=5000):
        """Run one training epoch over mini-batches of positive pairs.

        Args:
            ep: epoch number, only used for the progress-bar label.
            batch_size: number of positive pairs per mini-batch.

        Returns:
            The mean reconstruction loss over the processed mini-batches
            (``0.0`` if there are no positive pairs).
        """
        assert hasattr(self.data, 'pos_pairs'), 'Positive and negative ' + \
            'samples must be generated before starting the training'

        self.model.train()
        neg_num = self.neg_num

        torch.multiprocessing.set_sharing_strategy('file_system')

        n_pairs = self.data.pos_pairs.shape[1]
        pbar = tqdm(total=n_pairs, position=0, leave=True)
        pbar.set_description(f'Epoch {ep:02d}')

        total_loss = 0
        n_batches = 0
        # Shuffling the transposed view permutes the columns (pairs) in place.
        np.random.shuffle(self.data.pos_pairs.T)
        # Stepping by batch_size never yields an empty trailing batch.
        for start in range(0, n_pairs, batch_size):

            # positive mini-batch
            # (#: batch size)
            batch_pos_pairs = self.data.pos_pairs[:, start:start + batch_size]
            batch_pos_samples, pos_edge_index = np.unique(
                batch_pos_pairs, return_inverse=True)
            pos_edge_index = pos_edge_index.reshape(batch_pos_pairs.shape)

            # negative mini-batch
            # (#: batch_size * neg_num)
            batch_neg_samples = self.data.neg_sampler.sample(
                torch.Size([neg_num*batch_pos_pairs.shape[1]]))
            # Each positive source is paired with `neg_num` negatives, whose
            # rows in Z come right after the rows of the positive samples.
            neg_edge_index = np.array([
                np.repeat(pos_edge_index[0, :], neg_num),
                np.arange(pos_edge_index.max()+1,
                          pos_edge_index.max()+len(batch_neg_samples)+1)])

            # embeddings of the nodes involved in + and - edges
            self.optimizer.zero_grad()
            unodes = batch_pos_samples.tolist() + batch_neg_samples.tolist()
            Z = self.embed_some(unodes)

            # reconstruction loss
            pos_edge_index = torch.from_numpy(pos_edge_index).to(self.device)
            neg_edge_index = torch.from_numpy(neg_edge_index).to(self.device)
            loss = self.model.recon_loss(Z, pos_edge_index, neg_edge_index)
            loss.backward()
            self.optimizer.step()

            total_loss += float(loss)
            n_batches += 1
            pbar.update(batch_pos_pairs.shape[1])

        pbar.close()

        return total_loss / max(n_batches, 1)

    def validate(self):
        """Return the precision of the 50 best-scored unstudied entities.

        Entities are scored by the dot product of their embedding with the
        embedding of the node(s) tagged ``'prop'``. Entities listed in
        ``data.studied_ents`` are dropped, and the first 50 of the remainder
        are checked against ``data.GT``.
        """
        self.model.eval()

        Z = self.embed_all()
        ents_Z = Z[:-1, :][self.data.selected_inds[:-1] >= self.data.nA, :].detach().numpy()
        prop_Z = Z[self.data.tags == 'prop', :].detach().numpy()

        scores = np.dot(ents_Z, prop_Z.T).squeeze()
        sorted_ents = self.data.selected_ents[np.argsort(-scores)]

        unstudied_sorted_ents = np.array([x for x in sorted_ents
                                          if x not in self.data.studied_ents])
        preds = unstudied_sorted_ents[:50]

        prec = np.isin(preds, self.data.GT).sum() / len(preds)

        return prec
def _edge_cache_path(name, remove_pct=None, add_pct=None):
    """Return the cache path for ``name`` (scores file if no percentages)."""
    if remove_pct is None:
        return f'{ROOT}/cache/edge/{name}.pt'
    return f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt'


def _load_pickle(path):
    """Unpickle ``path``, closing the file afterwards."""
    # NOTE: pickle must only be used on cache files this code wrote itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _set_perturbed_edges(data, edge_index):
    """Store ``edge_index`` on the attribute matching ``data.setting``."""
    if data.setting == 'inductive':
        data.train_edge_index = edge_index
    else:
        data.edge_index = edge_index


def perturb_edges(data, name, remove_pct, add_pct, hidden_channels=16,
                  epochs=400):
    """Replace the edges of ``data`` with a GAE-guided perturbed edge set.

    The result is looked up in, and written to, ``{ROOT}/cache/edge``:

    1. ``{name}_{remove_pct}_{add_pct}.pt`` -- the final perturbed edge index;
    2. ``{name}.pt`` -- ``(A_pred, adj_orig)``, from which an edge set for any
       percentages can be re-sampled with ``sample_graph_det``;
    3. otherwise a GAE is trained, the embedding with the best validation AUC
       is scored with ``sigmoid(z @ z.T)`` and both cache files are written.

    The edges are stored on ``data.train_edge_index`` when
    ``data.setting == 'inductive'`` and on ``data.edge_index`` otherwise.

    Args:
        data: graph object, modified in place.
        name: dataset name used in the cache file names.
        remove_pct: removal percentage forwarded to ``sample_graph_det``.
        add_pct: addition percentage forwarded to ``sample_graph_det``.
        hidden_channels: output width of the GCN encoder.
        epochs: number of GAE training epochs (must be at least 1).

    Returns:
        None. Nothing is touched when both percentages are 0.

    Raises:
        ValueError: if training is required but ``epochs`` is below 1.
    """
    if remove_pct == 0 and add_pct == 0:
        return

    perturbed_path = _edge_cache_path(name, remove_pct, add_pct)
    scores_path = _edge_cache_path(name)

    # 1) Final result already cached.
    try:
        cached = _load_pickle(perturbed_path)
    except FileNotFoundError:
        pass
    else:
        print(f'Use cached edge augmentation for dataset {name}')
        _set_perturbed_edges(data, cached)
        return

    # 2) Edge scores cached: only the sampling step has to be redone.
    try:
        A_pred, adj_orig = _load_pickle(scores_path)
    except FileNotFoundError:
        print(
            f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! Regenerating it now'
        )
    else:
        A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
        perturbed, _ = from_scipy_sparse_matrix(A)
        _set_perturbed_edges(data, perturbed)
        _dump_pickle(perturbed, perturbed_path)
        return

    # 3) Nothing cached: train a GAE to score the edges.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if data.setting == 'inductive':
        train_data = Data(x=data.train_x,
                          ori_x=data.ori_x,
                          edge_index=data.train_edge_index,
                          y=data.train_y)
    else:
        train_data = deepcopy(data)

    # Keep the full edge set; the split below replaces it on train_data.
    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data,
                                        val_ratio=0.1,
                                        test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)

        auc, ap = model.test(z, train_data.val_pos_edge_index,
                             train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
            epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # Score with the best-validation embedding; fall back to the last one
    # if the AUC never rose above 0.
    final_z = best_z if best_z is not None else z
    if final_z is None:
        raise ValueError('epochs must be at least 1 to train the edge scorer')

    A_pred = torch.sigmoid(torch.mm(final_z, final_z.T)).cpu().numpy()

    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    perturbed, _ = from_scipy_sparse_matrix(adj_pred)
    _set_perturbed_edges(data, perturbed)

    _dump_pickle((A_pred, adj_orig), scores_path)
    _dump_pickle(perturbed, perturbed_path)
class VariationalLinearEncoder(torch.nn.Module):
    """Variational encoder with a single GCNConv per output head."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the ``(conv_mu, conv_logstd)`` outputs for the graph."""
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd


in_channels = dataset.num_features
out_channels = 16

# --variational picks VGAE over GAE; --linear picks the single-layer encoder.
if args.variational:
    encoder_cls = VariationalLinearEncoder if args.linear else VariationalGCNEncoder
    model = VGAE(encoder_cls(in_channels, out_channels))
else:
    encoder_cls = LinearEncoder if args.linear else GCNEncoder
    model = GAE(encoder_cls(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    # NOTE(review): only the first statements of this function are visible
    # in this chunk.
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
x = self.conv1(x, edge_index).relu() return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index) if __name__ == "__main__": filePath = '../wholeYear/' if len(sys.argv) > 1 else sys.argv[2] dataset = WholeYearDataset(filePath) d = dataset[0] train_test_split_edges(d) #parameters out_channels = 2 num_features = d.num_features model_gae1 = GAE(GCNEncoder(num_features, out_channels)) areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday = runAutoencoder( model_gae1, d, 1000, torch.optim.Adam, 0.001) plotAUC_AP_Loss(areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday, 1000, "GAE 1: 2 Convolutions") model2 = GAE(GCNEncoder2(num_features, out_channels)) areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2 = runAutoencoder( model2, d, 1000, torch.optim.Adam, 0.001) plotAUC_AP_Loss(areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2, 1000, "GAE 2: 2 Convolutions 1 Linear") modelVgae = VGAE(VariationalGCNEncoder(num_features, out_channels)) runVariational1 = runVariational(modelVgae, d, 1000, torch.optim.Adam, 0.001)