def train_model_and_save_embeddings(dataset, data, epochs, learning_rate, device,
                                    save_path="large_emb.pt"):
    """Train a VGAE with neighbour-sampled mini-batches and export embeddings.

    After every epoch the full graph is encoded and the resulting embedding
    matrix is written to ``save_path`` (overwriting the previous epoch's file).

    Args:
        dataset: Object exposing ``num_nodes``; used to size the encoder and
            to sanity-check the edge index.
        data: Graph object exposing ``x`` and ``edge_index``.
        epochs: Number of passes over the sampler.
        learning_rate: Learning rate handed to Adam.
        device: Device the model and tensors are moved to.
        save_path: Destination of the exported embeddings. Defaults to the
            previously hard-coded ``"large_emb.pt"``.

    Returns:
        The trained model, left in training mode.
    """
    # Define model
    encoder = EmbeddingEncoder(emb_dim=200, out_channels=64,
                               n_nodes=dataset.num_nodes).to(device)
    decoder = CosineSimDecoder().to(device)
    model = VGAE(encoder=encoder, decoder=decoder).to(device)

    node_features = data.x.to(device)
    train_pos_edge_index = data.edge_index.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Every edge endpoint must be a valid node id.
    assert data.edge_index.max().item() < dataset.num_nodes

    data_loader = NeighborSampler(data, size=[25, 10], num_hops=2,
                                  batch_size=10000, shuffle=False,
                                  add_self_loops=False)

    for epoch in tqdm(range(epochs)):
        # Re-enter training mode each epoch: the export below switches to eval.
        model.train()
        epoch_loss = 0.0
        for data_flow in tqdm(data_loader()):
            optimizer.zero_grad()
            data_flow = data_flow.to(device)
            block = data_flow[0]
            # TODO Avoid computation of all node features!
            embeddings = model.encode(node_features[block.n_id],
                                      block.edge_index)
            loss = model.recon_loss(embeddings, block.edge_index)
            loss = loss + (1 / len(block.n_id)) * model.kl_loss()
            epoch_loss += loss.item()
            # Compute gradients
            loss.backward()
            # Perform optimization step
            optimizer.step()

        # Export without autograd bookkeeping and in eval mode.
        # NOTE(review): assumes torch_geometric's VGAE, whose reparametrisation
        # only injects noise while ``model.training`` is True - confirm.
        model.eval()
        with torch.no_grad():
            z = model.encode(node_features, train_pos_edge_index)
        torch.save(z.cpu(), save_path)
        print(f"Loss after epoch {epoch} / {epochs}: {epoch_loss}")

    # Callers previously received the model in training mode; keep that.
    model.train()
    return model
if args.dataset in ['cora', 'citeseer', 'pubmed']: path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.', 'data', args.dataset) data = Planetoid(path, args.dataset)[0] else: data = load_wiki.load_data() data.edge_index = gutils.to_undirected(data.edge_index) data = GAE.split_edges(GAE, data) num_features = data.x.shape[1] aucs = [] aps = [] for run in range(args.runs): model = VGAE(VGAE_Encoder(num_features)) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Training loop for epoch in range(args.epochs): model.train() optimizer.zero_grad() z = model.encode(data.x, data.train_pos_edge_index) loss = model.recon_loss( z, data.train_pos_edge_index) #0.01*model.kl_loss() loss.backward() optimizer.step() # Log validation metrics if epoch % args.val_freq == 0: model.eval() with torch.no_grad():
def main():
    """Train one VGAE per OMIM fold and write disease-gene predictions.

    For each fold a graph is built from the fold's ``train`` disease-gene
    pairs plus the gene-gene and disease-disease edge lists. A VGAE is trained
    on that graph with one-hot node features, and for every disease that has a
    ``test`` pair in the fold the 150 highest-scoring genes are written to the
    fold's prediction file as ``disease<TAB>gene<TAB>score`` lines.
    """
    import os  # local import: only needed to create the output directory

    model_name = 'VGAE'
    disease_gene_files = [
        'data/OMIM/3-fold-1.txt',
        'data/OMIM/3-fold-2.txt',
        'data/OMIM/3-fold-3.txt',
    ]
    disease_disease_file = 'data/MimMiner/MimMiner.txt'
    gene_gene_file = 'data/HumanNetV2/HumanNet_V2.txt'
    prediction_files = [
        f'data/prediction/{model_name}/prediction-3-fold-1.txt',
        f'data/prediction/{model_name}/prediction-3-fold-2.txt',
        f'data/prediction/{model_name}/prediction-3-fold-3.txt',
    ]

    # The loop used to be ``for counter in [3]``, which is out of range for
    # the three-element file lists above (IndexError). Iterate every fold.
    for counter in range(len(disease_gene_files)):
        g_nx = nx.Graph()

        # Only the training pairs of the fold may enter the graph.
        with open(disease_gene_files[counter], 'r') as f:
            for line in f:
                node1, node2, tag = line.strip().split('\t')
                if tag == 'train':
                    g_nx.add_edge(node1, node2, weight=1)

        with open(gene_gene_file, 'r') as f:
            for line in f:
                node1, node2 = line.strip().split('\t')
                g_nx.add_edge(node1, node2, weight=1)

        with open(disease_disease_file, 'r') as f:
            for line in f:
                # The third column is parsed but every edge gets weight 1.
                node1, node2, _weight = line.strip().split('\t')
                g_nx.add_edge(node1, node2, weight=1)
        print('read data success')

        # Map node names to consecutive integer ids.
        name_id = {name: idx for idx, name in enumerate(g_nx.nodes())}
        g_nx = nx.relabel_nodes(g_nx, name_id)

        # transform from networkx to pyg data
        g_nx = g_nx.to_directed() if not nx.is_directed(g_nx) else g_nx
        edge_index = torch.tensor(list(g_nx.edges)).t().contiguous()
        data = torch_geometric.data.Data.from_dict(
            {'edge_index': edge_index.view(2, -1)})
        data.num_nodes = g_nx.number_of_nodes()
        # One-hot identity features, built directly as float32.
        data.x = torch.eye(data.num_nodes)
        data.train_mask = data.val_mask = data.test_mask = data.y = None
        print(
            f'Graph information:\nNode:{data.num_nodes}\nEdge:{data.num_edges}\nFeature:{data.num_node_features}'
        )

        channels = 128
        dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = VGAE(Encoder(data.num_node_features, channels)).to(dev)
        x, train_pos_edge_index = data.x.to(dev), data.edge_index.to(dev)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        for epoch in range(4000):
            model.train()
            optimizer.zero_grad()
            z = model.encode(x, train_pos_edge_index)
            loss = model.recon_loss(
                z, train_pos_edge_index) + (1 / data.num_nodes) * model.kl_loss()
            loss.backward()
            optimizer.step()
            nowTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f'{nowTime}\tepoch:{epoch}\tloss:{loss}')

        # Score all node pairs in eval mode and without autograd.
        # NOTE(review): assumes torch_geometric's VGAE, whose reparametrisation
        # only adds noise while ``model.training`` is True - confirm.
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            pred = model.decoder.forward_all(z).cpu().numpy()

        # Node names are told apart by prefix.
        genes = {name for name in name_id if name.startswith('g_')}
        diseases = {name for name in name_id if name.startswith('d_')}

        test_diseases = set()
        with open(disease_gene_files[counter], 'r') as f:
            for line in f:
                disease, gene, tag = line.strip().split('\t')
                if tag == 'test':
                    test_diseases.add(disease)

        out_dir = os.path.dirname(prediction_files[counter])
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(prediction_files[counter], 'w') as f:
            for disease in test_diseases:
                if disease not in diseases:
                    # Disease absent from the training graph: no signal.
                    sims = {gene: 0 for gene in genes}
                else:
                    # Convert just this row to Python floats so the written
                    # text matches what a full ``tolist()`` would produce.
                    row = pred[name_id[disease]].tolist()
                    sims = {gene: row[name_id[gene]] for gene in genes}
                sorted_sims = sorted(sims.items(),
                                     key=lambda item: item[1],
                                     reverse=True)
                # Keep only the 150 best-ranked genes per disease.
                for gene, sim in sorted_sims[:150]:
                    f.write(disease + '\t' + gene + '\t' + str(sim) + '\n')
Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2, args.depth, args.res)).to(dev) auc_score_list = [] ap_score_list = [] print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :", args.res, ", Layer depth:", args.depth, " ") for i in range(1, args.runs + 1): data = dataset[0] data.train_mask = data.val_mask = data.test_mask = data.y = None data = train_test_split_edges(data) x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) adj_train = train_pos_edge_index adj_train_dense = to_dense_adj(adj_train)[0] adj_train_dense = adj_train_dense norm = adj_train_dense.shape[0] * adj_train_dense.shape[0] / float( (adj_train_dense.shape[0] * adj_train_dense.shape[0] - adj_train_dense.sum()) * 2) z_final = None for epoch in range(1, 100): model.train() optimizer.zero_grad()
def run_VGAE(input_data, output_dir, epochs=1000, lr=0.01, weight_decay=0.0005):
    """Train a VGAE for link prediction, logging and plotting the results.

    Args:
        input_data: Graph object; it is cloned, so the caller's copy is not
            modified by the edge split.
        output_dir: Directory that receives the log file and the plot.
        epochs: Number of training epochs.
        lr: Learning rate for Adam.
        weight_decay: Weight decay for Adam.

    Returns:
        None. Results are printed, logged via ``write_log`` and plotted via
        ``plot_linkpred``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'VGAE')
    print('Model params:{:19} lr: {}   weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    data = input_data.clone().to(device)
    model = VGAE(VGAEncoder(data.num_features,
                            data.num_classes.item())).to(device)
    data = model.split_edges(data)
    x = data.x.to(device)
    train_pos_edge_index = data.train_pos_edge_index.to(device)
    data.train_idx = data.test_idx = data.y = None

    # Honour the hyper-parameters that are printed above and encoded in the
    # log file name; they were previously ignored (lr was hard-coded to 0.01
    # and weight_decay was never applied).
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)

    train_losses = []
    test_losses = []
    aucs = []
    aps = []

    # Loop-invariant output location.
    makepath(output_dir)
    figname = os.path.join(
        output_dir,
        "_".join((VGAE.__name__, str(lr), str(weight_decay), str(epochs))))

    # Report roughly ten times per run; never 0, so epochs < 10 cannot cause a
    # modulo-by-zero.
    report_every = max(1, epochs // 10)

    for epoch in range(1, epochs + 1):
        # The evaluation step below switches to eval mode, so training mode
        # has to be re-enabled at the start of every epoch.
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(
            z, train_pos_edge_index) + (1 / data.num_nodes) * model.kl_loss()
        train_losses.append(train_loss.item())
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, data.test_pos_edge_index,
                                 data.test_neg_edge_index)
            test_loss = model.recon_loss(
                z,
                data.test_pos_edge_index) + (1 / data.num_nodes) * model.kl_loss()
        test_losses.append(test_loss.item())
        aucs.append(auc)
        aps.append(ap)

        if epoch % report_every == 0:
            print(
                'Epoch: {}       Train loss: {}    Test loss: {}     AUC: {}    AP: {:.4f}'
                .format(epoch, train_loss, test_loss, auc, ap))

        if epoch == epochs:
            print(
                '-' * 65,
                '\nFinal epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
            log = 'Final epoch: {}    Train loss: {}    Test loss: {}    AUC: {}    AP: {}'.format(
                epoch, train_loss, test_loss, auc, ap)
            write_log(log, figname)
            print('-' * 65)
            plot_linkpred(train_losses, test_losses, aucs, aps, output_dir,
                          epochs, figname)
    return
def run_model(dataset, conf):
    """Build a table graph, train a VGAE on it and evaluate table vectors.

    Args:
        dataset: Table corpus handed to ``create_corpus`` and
            ``evaluate_model``.
        conf: Mapping read for the keys ``"add_attr"``, ``"shuffle_vocab"``,
            ``"row_edges_sample"``, ``"column_edges_sample"``,
            ``"vector_size"`` and ``"epoch_num"``.

    Returns:
        Tuple ``(cell_vectors, vec_list, losses, result_score)`` where
        ``losses`` holds one ``(recon_loss, kl_loss, total_loss)`` tuple per
        epoch.
    """
    # ## 1) Build Table graph
    # ### Tables tokenization
    tokenized_tables, vocabulary, _cell_dict, _reversed_dictionary = create_corpus(
        dataset, include_attr=conf["add_attr"])

    # Kept as an explicit comparison so only True/1 enables shuffling, exactly
    # as before.
    if conf["shuffle_vocab"] == True:  # noqa: E712
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2) Run Table Auto-Encoder Model:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    graph_data = graph_data.to(device)
    # Train on the tensors that were moved with the graph; the raw
    # ``nodes``/``edges`` tensors were never moved, while the model is.
    x, train_pos_edge_index = graph_data.x, graph_data.edge_index

    class Encoder(torch.nn.Module):
        """Two-layer GCN encoder returning the ``conv_mu`` and ``conv_logvar`` outputs."""

        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels,
                                       out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = VGAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        """Run one optimisation step; return (recon, kl, total) losses."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        rl = model.recon_loss(z, train_pos_edge_index)
        kl = model.kl_loss()
        # NOTE(review): the KL term is not scaled by 1 / num_nodes here -
        # confirm this weighting is intended.
        loss = rl + kl
        loss.backward()
        optimizer.step()
        # Detach so the stored history does not hold on to autograd state.
        return (rl.detach(), kl.detach(), loss.detach())

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        # Record each epoch exactly once (the final entry used to be
        # appended a second time after the loop).
        losses.append(loss)
        print(epoch, loss)

    # ### 3) Extract the latent cell vectors, generate table vectors:
    def get_cell_vectors(model, x, train_pos_edge_index):
        """Encode the graph in eval mode; return (tensor, numpy array)."""
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            # ``.cpu()`` is required before ``.numpy()`` when running on CUDA.
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)
    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # ## 4) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score