def test_contains_self_loops():
    """Self-loop detection on edge indices built from row/col tensors."""
    # The third column is (0, 0) — a self-loop — so detection must fire.
    src = torch.tensor([0, 1, 0])
    dst = torch.tensor([1, 0, 0])
    assert contains_self_loops(torch.stack([src, dst], dim=0))

    # Here no column has src == dst, so no self-loop exists.
    src = torch.tensor([0, 1, 1])
    dst = torch.tensor([1, 0, 2])
    assert not contains_self_loops(torch.stack([src, dst], dim=0))
def CreateAuxGraph(edge_index, pos_i, pos_j, original_vertex_features, pos):
    """Build an auxiliary graph in which every edge of G becomes a new vertex.

    Each edge (u, v) of the input graph is represented by a fresh vertex,
    positioned at the midpoint of u and v and connected to both endpoints.

    Returns:
        (edge_index_aux, new_vertex_features, new_vertex_pos) for the
        auxiliary graph; new vertices carry zero feature vectors.
    """
    n_orig = original_vertex_features.shape[0]
    n_edges = len(edge_index[0, :])

    # Append one zero-initialized vertex (position + features) per edge.
    pad_pos = torch.zeros(n_edges, pos.shape[1]).to(device)
    pad_feat = torch.zeros(n_edges, original_vertex_features.shape[1]).to(device)
    aux_pos = torch.cat([pos, pad_pos], dim=0)
    aux_features = torch.cat([original_vertex_features, pad_feat], dim=0)

    # Every edge-vertex links to both endpoints of the edge it represents.
    edge_vertex_ids = (n_orig + torch.arange(0, n_edges)).to(device)
    src_links = torch.stack((edge_index[0, :], edge_vertex_ids), dim=0)
    dst_links = torch.stack((edge_index[1, :], edge_vertex_ids), dim=0)

    # Edge-vertices sit at the midpoint of their endpoints' positions.
    aux_pos[n_orig:, :] = (pos_i + pos_j) / 2

    edge_index_aux = torch.cat([dst_links, src_links], dim=1)
    assert (not contains_self_loops(edge_index_aux))
    assert (not contains_isolated_nodes(edge_index_aux))
    return edge_index_aux, aux_features, aux_pos
def load_torch_geometric_data(dataset, name):
    """Download/load a torch_geometric dataset and report its statistics.

    Args:
        dataset: dataset class name as a string (e.g. 'Coauthor', 'WikiCS').
        name: sub-dataset name (e.g. 'CS'); ignored for WikiCS/Flickr.

    Returns:
        (adjacency matrix as ndarray, node features ndarray, labels ndarray).
    """
    cur = os.getcwd()
    root = cur.replace("\\", "/") + "/torch_geometric_data/" + dataset
    # NOTE(review): eval() on a constructed string executes arbitrary code if
    # `dataset`/`name` ever come from untrusted input; a dict mapping names to
    # dataset classes would be safer.
    if dataset in {'WikiCS', 'Flickr'}:
        # These dataset classes take no `name` argument.
        data = eval(dataset + "(root = '" + root + "')")
    else:
        data = eval(dataset + "(root = '" + root + "'," + "name = '" + name + "')")  # e.g. Coauthor(root='...', name = 'CS')

    edge = data[0].edge_index
    if contains_self_loops(edge):
        edge = remove_self_loops(edge)[0]
        print("Original data contains self-loop, it is now removed")
    adj = to_dense_adj(edge)[0].numpy()

    print("Nodes: %d, edges: %d, features: %d, classes: %d. \n"
          % (len(adj[0]), len(edge[0]) / 2, len(data[0].x[0]), len(np.unique(data[0].y))))

    mask = np.transpose(adj) != adj
    col_sum = adj.sum(axis=0)
    # Fixed typo in the original message: "sysmetric" -> "symmetric".
    print("Check adjacency matrix is symmetric: %r" % (mask.sum().item() == 0))
    print("Check the number of isolated nodes: %d" % ((col_sum == 0).sum().item()))
    print("Node degree Max: %d, Mean: %.4f, SD: %.4f" % (col_sum.max(), col_sum.mean(), col_sum.std()))
    return adj, data[0].x.numpy(), data[0].y.numpy()
def forward(self, data, return_hidden_feature=False):
    """Two-stage (covalent then non-covalent) propagation with ligand-only pooling.

    Args:
        data: batched graph with `x`, `edge_attr`, `edge_index`, `batch`
            attributes (mutated in place: moved to GPU, made undirected,
            self-loops added).
        return_hidden_feature: if True — or if
            `self.always_return_hidden_feature` is set — also return the
            intermediate pooled/hidden activations.

    Returns:
        The model output, or a 6-tuple
        (avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x)
        when hidden features are requested.
    """
    #import pdb
    #pdb.set_trace()
    # Move the whole batch to GPU when one is available (in-place on `data`).
    if torch.cuda.is_available():
        data.x = data.x.cuda()
        data.edge_attr = data.edge_attr.cuda()
        data.edge_index = data.edge_index.cuda()
        data.batch = data.batch.cuda()

    # make sure that we have undirected graph
    if not is_undirected(data.edge_index):
        data.edge_index = to_undirected(data.edge_index)

    # make sure that nodes can propagate messages to themselves
    if not contains_self_loops(data.edge_index):
        data.edge_index, data.edge_attr = add_self_loops(
            data.edge_index, data.edge_attr.view(-1))

    # covalent_propagation
    # add self loops to enable self propagation
    covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
        data.edge_index, data.edge_attr)
    (
        non_covalent_edge_index,
        non_covalent_edge_attr,
    ) = self.non_covalent_neighbor_threshold(data.edge_index, data.edge_attr)

    # covalent_propagation and non_covalent_propagation
    covalent_x = self.covalent_propagation(data.x, covalent_edge_index, covalent_edge_attr)
    non_covalent_x = self.non_covalent_propagation(
        covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

    # zero out the protein features then do ligand only gather...hacky sure but it gets the job done
    # NOTE(review): this is an alias, not a copy — the zeroing below also
    # mutates non_covalent_x, so avg_non_covalent_x in the branch further down
    # is computed over the zeroed rows. Confirm this is intended.
    non_covalent_ligand_only_x = non_covalent_x
    non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
    pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

    # fully connected and output layers
    if return_hidden_feature or self.always_return_hidden_feature:
        # return prediction and atomistic features (covalent result, non-covalent result, pool result)
        avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
        avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x, data.batch)
        fc0_x, fc1_x, output_x = self.output(pool_x, return_hidden_feature=True)
        return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
    else:
        return self.output(pool_x)
def test_structured_negative_sampling():
    """Negative samples must be disjoint from positive edges."""
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])
    i, j, k = structured_negative_sampling(edge_index)

    num_edges = edge_index.size(1)
    assert i.size(0) == num_edges
    assert j.size(0) == num_edges
    assert k.size(0) == num_edges

    pos_adj = torch.zeros(4, 4, dtype=torch.bool)
    pos_adj[i, j] = 1
    neg_adj = torch.zeros(4, 4, dtype=torch.bool)
    neg_adj[i, k] = 1
    # No sampled negative may coincide with a positive edge.
    assert (pos_adj & neg_adj).sum() == 0

    # Test with no self-loops:
    edge_index = torch.LongTensor([[0, 0, 1, 1, 2], [1, 2, 0, 2, 1]])
    i, j, k = structured_negative_sampling(edge_index, num_nodes=4,
                                           contains_neg_self_loops=False)
    neg_edge_index = torch.vstack([i, k])
    assert not contains_self_loops(neg_edge_index)
def forward(self, node_x: torch.Tensor, solution_x: torch.Tensor,
            edge_index: torch.Tensor, batch):
    """Fuse node and solution features, then derive per-edge embeddings.

    Args:
        node_x: node feature tensor, same shape as `solution_x`.
        solution_x: solution feature tensor.
        edge_index: edge index of the graph; must contain no self-loops.
        batch: batch assignment passed to the norm layer and stored for
            the message-passing hooks.

    Returns:
        The edge embedding produced as a side effect of `self.propagate`.
    """
    assert node_x.size() == solution_x.size()
    num_nodes, _ = node_x.size()
    # NOTE(review): this compares the node count to the number of edge-index
    # columns (edges) — confirm the caller guarantees num_nodes == num_edges.
    assert num_nodes == edge_index.size(1)
    assert not contains_self_loops(edge_index)

    node_x = self.node_lin(node_x)
    solution_x = self.solution_lin(solution_x)
    # x = self.norm_x(node_x + solution_x, batch)
    # x = F.relu(x)
    x = F.gelu(node_x + solution_x)
    x = self.norm_x(x, batch)

    self._batch = batch
    # propagate() is expected to populate self._edge via the message hooks;
    # the cached value is consumed once and cleared.
    self.propagate(edge_index, x=x, size=None)
    edge_embedding = self._edge
    self._edge = None
    return edge_embedding
def print_stats():
    """Print per-dataset size statistics and structural property flags."""
    for ds_name in DATASETS:
        dataset = load_data(ds_name)
        num_graphs = len(dataset)
        avg_nodes = dataset.data.x.size(0) / num_graphs
        avg_edges = dataset.data.edge_index.size(1) / num_graphs
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        print(
            f'{ds_name}\t{num_graphs}\t{avg_nodes}\t{avg_edges}\t{num_features}\t{num_classes}',
            end='\t')

        # Structural flags aggregated over every graph in the dataset.
        undirected, self_loops, isolated_nodes, onehot = True, False, False, True
        for graph in dataset:
            if not is_undirected(graph.edge_index, num_nodes=graph.num_nodes):
                undirected = False
            if contains_self_loops(graph.edge_index):
                self_loops = True
            if contains_isolated_nodes(graph.edge_index, num_nodes=graph.num_nodes):
                isolated_nodes = True
            # One-hot check: exactly one strictly-positive entry per row.
            if ((graph.x > 0).sum(dim=1) != 1).sum() > 0:
                onehot = False
        print(f'{undirected}\t{self_loops}\t{isolated_nodes}\t{onehot}')
def contains_self_loops(self):
    """Returns :obj:`True`, if the graph contains self-loops."""
    return contains_self_loops(self.edge_index)
def test_contains_self_loops():
    """A (0, 0) column is a self-loop; detection must match."""
    with_loop = torch.tensor([[0, 1, 0], [1, 0, 0]])
    assert contains_self_loops(with_loop)

    without_loop = torch.tensor([[0, 1, 1], [1, 0, 2]])
    assert not contains_self_loops(without_loop)
def contains_self_loops(self):
    """Returns :obj:`True`, if the graph contains self-loops."""
    return contains_self_loops(self.edge_index)
def read_files(self, verbose=True):
    """Load features, labels, edge index and split masks for the configured dataset.

    Side effects: populates self.edge_index_data, self.x_data, self.y_data,
    self.node2id, self.edge_type_data, self.split_masks and self.vocab_size.

    Args:
        verbose: when True, print dataset statistics and timing information.
    """
    start = time.time()
    if verbose:
        print("="*100 + "\n\t\t\t\t Preparing Data for {}\n".format(self.config['data_name']) + "="*100)
        print("\n\n==>> Loading feature matrix and adj matrix....")

    # All dataset files live under <data_path>/<data_name>/.
    base = os.path.join(self.config['data_path'], self.config['data_name'])
    relational = self.config['model_name'] in ['rgcn', 'rgat', 'rsage']
    # Removed no-op `.format(...)` calls on filenames with no placeholders.
    if self.config['data_name'] in ['gossipcop', 'politifact']:
        x_file = os.path.join(base, 'feat_matrix_lr_train_30_5.npz')
        y_file = os.path.join(base, 'all_labels_lr_train_30_5.json')
        edge_index_file = os.path.join(base, 'adj_matrix_lr_train_30_5_edge.npy')
        node2id_file = os.path.join(base, 'node2id_lr_train_30_5.json')
        node_type_file = os.path.join(base, 'node_type_lr_train_30_5.npy')
        split_mask_file = os.path.join(base, 'split_mask_lr_30_5.json')
        if relational:
            edge_type_file = os.path.join(base, 'edge_type_lr_train_30_5_edge.npy')
    else:
        x_file = os.path.join(base, 'feat_matrix_lr_top10_train.npz')
        y_file = os.path.join(base, 'all_labels_lr_top10_train.json')
        edge_index_file = os.path.join(base, 'adj_matrix_lr_top10_train_edge.npy')
        node2id_file = os.path.join(base, 'node2id_lr_top10_train.json')
        node_type_file = os.path.join(base, 'node_type_lr_top10_train.npy')
        split_mask_file = os.path.join(base, 'split_mask_top10.json')
        if relational:
            edge_type_file = os.path.join(base, 'edge_type_lr_top10_edge.npy')

    # BUG FIX: the original called torch.from_numpy(edge_index_data) on an
    # undefined bare name; it must read the attribute just assigned.
    self.edge_index_data = np.load(edge_index_file)
    self.edge_index_data = torch.from_numpy(self.edge_index_data).long()

    self.x_data = load_npz(x_file)
    self.x_data = torch.from_numpy(self.x_data.toarray())
    self.y_data = json.load(open(y_file, 'r'))
    self.y_data = torch.LongTensor(self.y_data['all_labels'])
    self.node2id = json.load(open(node2id_file, 'r'))

    # Edge types are only needed by the relational model variants.
    if relational:
        self.edge_type_data = np.load(edge_type_file)
        self.edge_type_data = torch.from_numpy(self.edge_type_data).long()
    else:
        self.edge_type_data = None

    self.split_masks = json.load(open(split_mask_file, 'r'))

    num_nodes, self.vocab_size = self.x_data.shape
    if self.config['model_name'] != 'HGCN':
        isolated_nodes = contains_isolated_nodes(edge_index=self.edge_index_data)
        self_loops = contains_self_loops(edge_index=self.edge_index_data)

    if verbose:
        print("\n\n" + "-"*50 + "\nDATA STATISTICS:\n" + "-"*50)
        if self.config['model_name'] != 'HGCN':
            print("Contains isolated nodes = ", isolated_nodes)
            print("Contains self loops = ", self_loops)
        print("Vocabulary size = ", self.vocab_size)
        print('No. of nodes in graph = ', num_nodes)
        # BUG FIX: the original printed an undefined `new_num_nodes`
        # (NameError when verbose) — isolated nodes are never removed in
        # this method, so that statistic is dropped until removal exists.
        print("No. of edges in graph = ", self.data.num_edges)
        print("\nNo.of train instances = ", self.data.train_mask.sum().item())
        print("No.of val instances = ", self.data.val_mask.sum().item())
        print("No.of test instances = ", num_nodes - self.data.train_mask.sum().item() - self.data.val_mask.sum().item())
        end = time.time()
        hours, minutes, seconds = calc_elapsed_time(start, end)
        print("\n" + "-"*50 + "\nTook {:0>2} hours: {:0>2} mins: {:05.2f} secs to Prepare Data\n".format(hours, minutes, seconds))
def forward(self, data, return_hidden_feature=False):
    """Covalent / non-covalent propagation followed by ligand-only pooling.

    Args:
        data: batched graph with `x`, `edge_attr`, `edge_index`, `batch`
            attributes (mutated in place: moved to GPU, made undirected,
            self-loops added).
        return_hidden_feature: if True, also return the intermediate
            pooled/hidden activations.

    Returns:
        The model output, or a 6-tuple
        (avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x)
        when hidden features are requested.
    """
    # BUG FIX: guard the GPU transfer so CPU-only machines do not crash —
    # this matches the sibling forward() implementation in this file.
    if torch.cuda.is_available():
        data.x = data.x.cuda()
        data.edge_attr = data.edge_attr.cuda()
        data.edge_index = data.edge_index.cuda()
        data.batch = data.batch.cuda()

    # make sure that we have undirected graph
    if not is_undirected(data.edge_index):
        data.edge_index = to_undirected(data.edge_index)

    # make sure that nodes can propagate messages to themselves
    if not contains_self_loops(data.edge_index):
        data.edge_index, data.edge_attr = add_self_loops(
            data.edge_index, data.edge_attr.view(-1))

    # (Removed a dead, commented-out top-k nearest-neighbor pruning
    # experiment that was parked here inside a no-op string literal.)

    # covalent_propagation
    # add self loops to enable self propagation
    covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
        data.edge_index, data.edge_attr)
    (
        non_covalent_edge_index,
        non_covalent_edge_attr,
    ) = self.non_covalent_neighbor_threshold(data.edge_index, data.edge_attr)

    # covalent_propagation and non_covalent_propagation
    covalent_x = self.covalent_propagation(data.x, covalent_edge_index, covalent_edge_attr)
    non_covalent_x = self.non_covalent_propagation(
        covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

    # zero out the protein features then do ligand only gather...hacky sure but it gets the job done
    # NOTE(review): this is an alias, not a copy — the zeroing below also
    # mutates non_covalent_x, which the hidden-feature branch then averages.
    # Confirm that is intended.
    non_covalent_ligand_only_x = non_covalent_x
    non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
    pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

    # fully connected and output layers
    if return_hidden_feature:
        # return prediction and atomistic features (covalent result, non-covalent result, pool result)
        avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
        avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x, data.batch)
        fc0_x, fc1_x, output_x = self.output(pool_x, return_hidden_feature=True)
        return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
    else:
        return self.output(pool_x)