# Load the PubMed citation graph and build a link-prediction split.
dataset = Planetoid(root='tmp', name='PubMed')
print("use dataset: PubMed")
data = dataset[0]

# Split edges on a clone so `data` keeps the full edge set, which is used
# below as the reconstruction target.
enhanced_data = train_test_split_edges(data.clone(), val_ratio=0.1,
                                       test_ratio=0.2)
train_data = Data(x=enhanced_data.x,
                  edge_index=enhanced_data['train_pos_edge_index']).to(DEVICE)
target_data = data.to(DEVICE)

# BUG FIX: the original compared with `args.model is 'VGAE'`. `is` tests
# object identity, not equality; for strings that is implementation-dependent
# and can be False even when the text matches. Use `==`.
if args.model == 'VGAE':
    model = VGAE(encoder=VEncoder(data['x'].shape[1])).to(DEVICE)
else:
    model = GAE(encoder=Encoder(data['x'].shape[1])).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                             weight_decay=5e-4)


def model_train():
    """Train the (V)GAE to reconstruct the full edge set.

    NOTE(review): this function is truncated in the visible chunk; the body
    below reproduces exactly the statements that are shown.
    """
    print("========Start training========")
    for epoch in range(args.num_epoch):
        model.train()
        optimizer.zero_grad()
        z = model.encode(train_data)
        recon_loss = model.recon_loss(z, target_data['edge_index'])
        if args.model == 'VGAE':
            # KL regularizer scaled by the number of nodes.
            recon_loss += model.kl_loss() / data['x'].shape[0]
self.conv2 = GCNConv(2 * out_channels, out_channels, cached=False) # Map into 2*out_channels dimentions with def forward(self, x, edge_index): x = F.relu(self.conv1(x, edge_index)) return self.conv2(x, edge_index) #This is the size of the latent embedding channels = 32 # We have 75 origional features num_features = 75 dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #dev = torch.device('cpu') model = GAE(Encoder(num_features, channels).to(dev)) #data.train_mask = data.val_mask = data.test_mask = data.y = None #data = model.split_edges(data) #x, train_edge_index = data.x.to(dev), data.edge_index.to(dev) optimizer = torch.optim.Adam(model.parameters(), lr=0.005) def train(loader): model.train() loss_all = 0 for data in loader: x, train_edge_index = data.x.to(dev), data.edge_index.to(dev) optimizer.zero_grad() z = model.encode(x, train_edge_index) loss = model.recon_loss(z, train_edge_index) loss.backward()
def run_GAE(input_data, output_dir, epochs=1000, lr=0.01, weight_decay=0.0005):
    """Train a GAE link predictor on `input_data` and plot/log the results.

    Args:
        input_data: torch_geometric ``Data`` exposing ``num_features``,
            ``num_classes`` (0-d tensor) and ``edge_attr``.
        output_dir: directory receiving the plot and the log file.
        epochs, lr, weight_decay: training hyper-parameters.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device: '.ljust(32), device)
    print('Model Name: '.ljust(32), 'GAE')
    print('Model params:{:19} lr: {} weight_decay: {}'.format(
        '', lr, weight_decay))
    print('Total number of epochs to run: '.ljust(32), epochs)
    print('*' * 70)

    data = input_data.clone().to(device)
    in_channels = data.num_features
    out_channels = data.num_classes.item()
    model = GAE(GAEncoder(in_channels, out_channels)).to(device)

    data = input_data.clone().to(device)
    split_data = model.split_edges(data)
    x, train_pos_edge_index, edge_attr = (
        split_data.x.to(device),
        split_data.train_pos_edge_index.to(device),
        data.edge_attr.to(device),
    )
    split_data.train_idx = split_data.test_idx = data.y = None
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)

    train_losses, test_losses = [], []
    aucs = []
    aps = []

    # Loop-invariant: the original recomputed the figure path and re-created
    # the output directory every epoch.
    figname = os.path.join(
        output_dir, "_".join((GAE.__name__, str(lr), str(weight_decay))))
    makepath(output_dir)

    for epoch in range(1, epochs + 1):
        # BUG FIX: the original called model.train() once, before the loop,
        # but model.eval() inside it — so every epoch after the first trained
        # in eval mode. Switch back to train mode each epoch.
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        train_loss = model.recon_loss(z, train_pos_edge_index)
        # Store the scalar, matching test_losses (the original appended the
        # live tensor on the train side only).
        train_losses.append(train_loss.item())
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, split_data.test_pos_edge_index,
                                 split_data.test_neg_edge_index)
            # BUG FIX: evaluate on the split's held-out edges; the original
            # read `data.test_pos_edge_index`, which only exists if
            # split_edges happened to mutate `data` in place.
            test_loss = model.recon_loss(z, split_data.test_pos_edge_index)
            test_losses.append(test_loss.item())
            aucs.append(auc)
            aps.append(ap)

        if (epoch % int(epochs / 10) == 0):
            print(
                'Epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
        if (epoch == epochs):
            print(
                '-' * 65,
                '\nFinal epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'
                .format(epoch, train_loss, test_loss, auc, ap))
            log = 'Final epoch: {} Train loss: {} Test loss: {} AUC: {} AP: {}'.format(
                epoch, train_loss, test_loss, auc, ap)
            write_log(log, figname)
    print('-' * 65)

    plot_linkpred(train_losses, test_losses, aucs, aps, output_dir, epochs,
                  figname)
    return
help='Residual connection') args = parser.parse_args() #download datasets path = os.join(os.dirname(os.realpath(__file__)), '..', 'data', args.dataset) dataset = Planetoid(path, args.dataset) dev = torch.device(args.dev) if args.model == VGAE: model = VGAE( Encoder_VGAE(dataset.num_features, args.hidden1, args.hidden2, args.depth, args.res)).to(dev) else: model = GAE( Encoder_GAE(dataset.num_features, args.hidden1, args.hidden2, args.depth, args.res)).to(dev) auc_score_list = [] ap_score_list = [] print("Dataset: ", args.dataset, " Model: ", args.model, ", Residual :", args.res, ", Layer depth:", args.depth, " ") for i in range(1, args.runs + 1): data = dataset[0] data.train_mask = data.val_mask = data.test_mask = data.y = None data = train_test_split_edges(data) x, train_pos_edge_index = data.x.to(dev), data.train_pos_edge_index.to(dev) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
act = F.relu sum_res = True variational = False path = Path(__file__).parent / "../../test/data/BBA-subset-100.h5" node_feature_path = ( Path(__file__).parent / "../../test/data/onehot_bba_amino_acid_labels.npy" ) dataset = ContactMapDataset( path, "contact_map", ["rmsd"], node_feature_path=node_feature_path ) loader = DataLoader(dataset, batch_size=1, shuffle=True) # Select node AE if args.linear: node_ae = GAE(LinearEncoder(num_features, node_out_channels)) else: node_ae = GAE(GCNEncoder(num_features, node_out_channels)) # Select graph AE encoder = VariationalGraphEncoder( node_out_channels, hidden_channels, graph_out_channels, depth, pool_ratios, act, variational, ) decoder = VariationalGraphDecoder( graph_out_channels,
class VariationalLinearEncoder(torch.nn.Module):
    """Two parallel cached GCN layers producing the mean and log-std."""

    def __init__(self, in_channels, out_channels):
        super(VariationalLinearEncoder, self).__init__()
        self.conv_mu = GCNConv(in_channels, out_channels, cached=True)
        self.conv_logstd = GCNConv(in_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd


out_channels = 16
num_features = dataset.num_features

# Pick one of the four encoder/model combinations from the CLI flags.
if args.variational:
    enc_cls = VariationalLinearEncoder if args.linear else VariationalGCNEncoder
    model = VGAE(enc_cls(num_features, out_channels))
else:
    enc_cls = LinearEncoder if args.linear else GCNEncoder
    model = GAE(enc_cls(num_features, out_channels))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
def perturb_edges(data, name, remove_pct, add_pct, hidden_channels=16,
                  epochs=400):
    """Remove/add a percentage of edges using a GAE link predictor.

    A GAE is trained on the (train) graph; its predicted adjacency is used by
    ``sample_graph_det`` to drop ``remove_pct``% of real edges and add
    ``add_pct``% of predicted ones. Both the raw predictions and the final
    edge index are cached on disk under ``cache/edge/``.
    """
    if remove_pct == 0 and add_pct == 0:
        return
    cache_path = f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt'
    try:
        # `with` closes the handle (the original leaked every file object).
        with open(cache_path, 'rb') as fh:
            cached = pickle.load(fh)
        print(f'Use cached edge augmentation for dataset {name}')
        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        try:
            with open(f'{ROOT}/cache/edge/{name}.pt', 'rb') as fh:
                A_pred, adj_orig = pickle.load(fh)
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            with open(cache_path, 'wb') as fh:
                pickle.dump(data.edge_index, fh)
            return
        except FileNotFoundError:
            print(
                f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! Regenerating it now'
            )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if data.setting == 'inductive':
        train_data = Data(x=data.train_x,
                          ori_x=data.ori_x,
                          edge_index=data.train_edge_index,
                          y=data.train_y)
    else:
        train_data = deepcopy(data)
    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data, val_ratio=0.1,
                                        test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, train_data.val_pos_edge_index,
                                 train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
            epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # BUG FIX: build the predicted adjacency from the embedding that scored
    # best on validation AUC. The original tracked `best_z` but then used the
    # last epoch's `z`, making the tracking dead code.
    A_pred = torch.sigmoid(torch.mm(best_z, best_z.T)).cpu().numpy()
    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
        result_edge_index = data.train_edge_index
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)
        result_edge_index = data.edge_index

    with open(f'{ROOT}/cache/edge/{name}.pt', 'wb') as fh:
        pickle.dump((A_pred, adj_orig), fh)
    with open(cache_path, 'wb') as fh:
        pickle.dump(result_edge_index, fh)
def run_model(dataset, conf):
    """Build the table graph, train a graph auto-encoder on it, evaluate.

    Returns:
        (cell_vectors, vec_list, losses, result_score)
    """
    # ## 1) Build Table graph
    # ### Tables tokenization
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(
        dataset, include_attr=conf["add_attr"])
    if conf["shuffle_vocab"] == True:
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2 ) Run Table Auto-Encoder Model:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(torch.arange(graph_data.num_nodes),
                        batch_size=128,
                        shuffle=True)
    graph_data = graph_data.to(device)
    # BUG FIX: use the tensors that were moved to `device`; the original
    # bound the CPU-side `nodes`/`edges`, which crashes once device='cuda'.
    x, train_pos_edge_index = graph_data.x, graph_data.edge_index

    class Encoder(torch.nn.Module):
        # NOTE(review): this encoder returns (mu, logvar) yet is wrapped in a
        # plain GAE below — it looks written for VGAE; confirm intended.
        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels, out_channels,
                                       cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    model = GAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        """One optimization step; returns the reconstruction loss."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        #loss = model.kl_loss()
        loss.backward()
        optimizer.step()
        return loss

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        losses.append(loss)
        print(epoch, loss)
    # BUG FIX: the original appended `loss` once more after the loop,
    # duplicating the final entry in `losses`.

    # ### 3) Extract the latent cell vectors, generate table vectors:
    def get_cell_vectors(model, x, train_pos_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            # BUG FIX: detach to CPU before .numpy(); a CUDA tensor cannot be
            # converted directly.
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)

    vec_list = generate_table_vectors(cell_vectors,
                                      tokenized_tables,
                                      s_vocab=shuffled_vocab)

    # ## 3) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
x = self.conv1(x, edge_index).relu() return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index) if __name__ == "__main__": filePath = '../wholeYear/' if len(sys.argv) > 1 else sys.argv[2] dataset = WholeYearDataset(filePath) d = dataset[0] train_test_split_edges(d) #parameters out_channels = 2 num_features = d.num_features model_gae1 = GAE(GCNEncoder(num_features, out_channels)) areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday = runAutoencoder( model_gae1, d, 1000, torch.optim.Adam, 0.001) plotAUC_AP_Loss(areasUnderCurve_gae_weekday, precisions_gae_weekday, losses_gae_weekday, 1000, "GAE 1: 2 Convolutions") model2 = GAE(GCNEncoder2(num_features, out_channels)) areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2 = runAutoencoder( model2, d, 1000, torch.optim.Adam, 0.001) plotAUC_AP_Loss(areasUnderCurve_gae_weekday_model2, precisions_gae_weekday_model2, losses_gae_weekday_model2, 1000, "GAE 2: 2 Convolutions 1 Linear") modelVgae = VGAE(VariationalGCNEncoder(num_features, out_channels)) runVariational1 = runVariational(modelVgae, d, 1000, torch.optim.Adam, 0.001)
super().__init__() self.rel_emb = Parameter(torch.Tensor(num_relations, hidden_channels)) self.reset_parameters() def reset_parameters(self): torch.nn.init.xavier_uniform_(self.rel_emb) def forward(self, z, edge_index, edge_type): z_src, z_dst = z[edge_index[0]], z[edge_index[1]] rel = self.rel_emb[edge_type] return torch.sum(z_src * rel * z_dst, dim=1) model = GAE( RGCNEncoder(data.num_nodes, hidden_channels=500, num_relations=dataset.num_relations), DistMultDecoder(dataset.num_relations // 2, hidden_channels=500), ) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) def negative_sampling(edge_index, num_nodes): # Sample edges by corrupting either the subject or the object of each edge. mask_1 = torch.rand(edge_index.size(1)) < 0.5 mask_2 = ~mask_1 neg_edge_index = edge_index.clone() neg_edge_index[0, mask_1] = torch.randint(num_nodes, (mask_1.sum(), )) neg_edge_index[1, mask_2] = torch.randint(num_nodes, (mask_2.sum(), )) return neg_edge_index
data = dataset[0]


class Encoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2*out_channels -> out_channels."""

    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv2(x, edge_index)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAE(Encoder(dataset.num_features, out_channels=32)).to(device)
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = model.split_edges(data)
# BUG FIX: after split_edges the training graph lives in
# `train_pos_edge_index`; the original bound the full `edge_index`, which
# leaks held-out edges into the encoder (and may not even exist after the
# split).
x, train_pos_edge_index = data.x.to(device), data.train_pos_edge_index.to(
    device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    """One optimization step of the GAE on the training edges."""
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    # BUG FIX: recon_loss's optional third argument is a negative *edge
    # index*; the original passed `train_neg_adj_mask` (a dense boolean
    # mask), which does not match that contract. Let recon_loss sample its
    # own negatives.
    loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    optimizer.step()