def visualize_graph_dataset(dataset_name):
    """Load the requested graph dataset with dataset visualization enabled.

    Builds a minimal config and hands it to ``load_graph_data``; whatever that
    call returns is discarded.

    Args:
        dataset_name: Name of the dataset to load (the original note mentions
            Cora or PPI).
    """
    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    viz_config = {}
    viz_config['dataset_name'] = dataset_name
    # The layer type is irrelevant for visualization; it is only supplied
    # because load_graph_data needs it to work.
    viz_config['layer_type'] = LayerType.IMP3
    viz_config['should_visualize'] = True  # ask the loader to visualize the dataset

    load_graph_data(viz_config, device)
def visualize_graph_dataset(dataset_name):
    """Trigger the dataset visualization for ``dataset_name``.

    The work is delegated to ``load_graph_data``, which is called with
    ``should_visualize`` set to ``True``. Nothing is returned.

    Args:
        dataset_name: Name of the dataset passed through to the loader.
    """
    # Use the GPU when present, otherwise the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    load_graph_data(
        {
            'dataset_name': dataset_name,
            'layer_type': LayerType.IMP3,  # value does not matter here ("don't care")
            'should_visualize': True,  # visualize the dataset
        },
        device,
    )
def train_gat(config):
    """Train a GAT model, optionally test it, and save the training state.

    Args:
        config: Dict of settings. Keys read here: 'num_of_layers',
            'num_heads_per_layer', 'num_features_per_layer',
            'add_skip_connection', 'bias', 'dropout', 'layer_type', 'lr',
            'weight_decay', 'patience_period', 'num_of_epochs' and
            'should_test'. It is also passed to ``load_graph_data``,
            ``get_main_loop`` and ``utils.get_training_state``.

    Side effects:
        * Resets the module-level early-stopping variables.
        * Writes ``config['test_acc']`` (the test result, or -1 when testing
          is disabled).
        * Saves the training state under ``BINARIES_PATH``.
    """
    # PATIENCE_CNT must be listed here too; otherwise the reset below would
    # only create an unused local and leave the module-level counter untouched.
    # NOTE(review): presumably the loop returned by get_main_loop reads these
    # globals - confirm.
    global BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT

    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 1: load the graph data
    node_features, node_labels, edge_index, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # attention logging is only needed for visualization
    ).to(device)

    # Step 3: prepare the loss, the optimizer and the shared train/val/test loop
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # A single loop function is reused for all phases to avoid duplicating
    # the train and validation code.
    main_loop = get_main_loop(
        config,
        gat,
        loss_fn,
        optimizer,
        node_features,
        node_labels,
        edge_index,
        train_indices,
        val_indices,
        test_indices,
        config['patience_period'],
        time.time())

    BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: run the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, epoch=epoch)

        # Validation loop - no gradients are needed here
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, epoch=epoch)
            except Exception as e:
                # The validation phase signals "patience has run out" via an
                # exception. NOTE(review): this also swallows any unrelated
                # error raised during validation; the concrete exception type
                # is not visible here, so the catch is left broad.
                print(str(e))
                break  # stop training

    # Step 5: optionally evaluate on the test split.
    # Only report test numbers once the model has been tuned on the
    # validation split, to avoid overfitting to the test data.
    if config['should_test']:
        test_acc = main_loop(phase=LoopPhase.TEST)
        config['test_acc'] = test_acc
        print(f'Test accuracy = {test_acc}')
    else:
        config['test_acc'] = -1

    # Save the latest GAT in the binaries directory
    torch.save(utils.get_training_state(config, gat), os.path.join(BINARIES_PATH, utils.get_available_binary_name()))
def visualize_gat_properties(model_name=r'gat_000000.pth', dataset_name=DatasetType.CORA.name, visualization_type=VisualizationType.ATTENTION):
    """Visualize properties of a trained GAT checkpoint.

    Loads the checkpoint ``model_name`` from ``BINARIES_PATH``, runs one
    forward pass over the dataset with attention logging enabled and then
    produces one of three visualizations:

    * ``VisualizationType.ATTENTION`` - for a handful of nodes, plots the
      neighborhood with edge widths proportional to the attention weights.
    * ``VisualizationType.EMBEDDINGS`` - a 2D t-SNE scatter plot of the model
      output scores, colored by ground-truth label. For intuition on tuning
      t-SNE see https://distill.pub/2016/misread-tsne/
    * ``VisualizationType.ENTROPY`` - for every layer and attention head, a
      histogram of neighborhood attention entropies next to the entropies of
      uniform distributions of the same size; each figure is also saved to
      ``DATA_DIR_PATH``.

    Args:
        model_name: File name of the checkpoint inside ``BINARIES_PATH``.
        dataset_name: Name of the dataset to load.
        visualization_type: Which of the visualizations above to produce.

    Raises:
        Exception: If ``visualization_type`` is not one of the three
            supported values.
    """
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = {
        'dataset_name': dataset_name,
        'layer_type': LayerType.IMP3,
        'should_visualize': False  # don't visualize the dataset
    }

    # Step 1: Prepare the data
    node_features, node_labels, topology, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # Step 2: Prepare the model
    model_path = os.path.join(BINARIES_PATH, model_name)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model_state = torch.load(model_path, map_location=device)

    gat = GAT(
        num_of_layers=model_state['num_of_layers'],
        num_heads_per_layer=model_state['num_heads_per_layer'],
        num_features_per_layer=model_state['num_features_per_layer'],
        add_skip_connection=model_state['add_skip_connection'],
        bias=model_state['bias'],
        dropout=model_state['dropout'],
        layer_type=name_to_layer_type(model_state['layer_type']),
        log_attention_weights=True  # the attention weights are read back below
    ).to(device)

    print_model_metadata(model_state)
    gat.load_state_dict(model_state["state_dict"], strict=True)
    gat.eval()  # layers such as dropout behave differently in train vs eval mode

    # Step 3: Run predictions and collect the high dimensional data.
    # no_grad avoids storing activations for backprop - we only do inference.
    with torch.no_grad():
        all_nodes_unnormalized_scores, _ = gat((node_features, topology))  # shape = (N, num of classes)
        all_nodes_unnormalized_scores = all_nodes_unnormalized_scores.cpu().numpy()

    # Several visualizations need the edge index form of the topology.
    # Imp 3 works with an edge index while the others work with adjacency info.
    if config['layer_type'] == LayerType.IMP3:
        edge_index = topology
    else:
        edge_index = convert_adj_to_edge_index(topology)

    # Step 4: Perform the requested visualization
    if visualization_type == VisualizationType.ATTENTION:
        # Number of highest-degree nodes to plot; the same number of random
        # nodes is added, so 2x this many neighborhoods are drawn.
        num_nodes_of_interest = 4  # arbitrary, feel free to change
        head_to_visualize = 0  # plot attention from this attention head
        gat_layer_id = 1  # plot attention from this GAT layer

        # Build the complete graph.
        # node_features shape = (N, FIN): N nodes, FIN input features.
        total_num_of_nodes = len(node_features)
        complete_graph = ig.Graph()
        complete_graph.add_vertices(total_num_of_nodes)  # igraph creates ids [0, total_num_of_nodes - 1]
        edge_index_tuples = list(zip(edge_index[0, :], edge_index[1, :]))  # igraph requires this format
        complete_graph.add_edges(edge_index_tuples)

        # Pick the target nodes: highest-degree nodes plus random nodes.
        # The two sets may overlap, which is unlikely and harmless.
        nodes_of_interest_ids = np.argpartition(complete_graph.degree(), -num_nodes_of_interest)[-num_nodes_of_interest:]
        random_node_ids = np.random.randint(low=0, high=total_num_of_nodes, size=num_nodes_of_interest)
        nodes_of_interest_ids = np.append(nodes_of_interest_ids, random_node_ids)
        np.random.shuffle(nodes_of_interest_ids)

        target_node_ids = edge_index[1]
        source_nodes = edge_index[0]

        # Attention was logged during the forward pass above. The trailing
        # singleton dim is squeezed away; the result is indexed below as
        # [edge mask, head]. It does not depend on the target node, so it is
        # fetched once.
        all_attention_weights = gat.gat_net[gat_layer_id].attention_weights.squeeze(dim=-1)

        for target_node_id in nodes_of_interest_ids:
            # Step 1: Find the nodes with an edge into the target node.
            # The lookup of the target node in id_to_igraph_id below relies on
            # the target being among them (self edge).
            src_nodes_indices = torch.eq(target_node_ids, target_node_id)
            source_node_ids = source_nodes[src_nodes_indices].cpu().numpy()
            size_of_neighborhood = len(source_node_ids)

            # Step 2: Fetch their labels
            labels = node_labels[source_node_ids].cpu().numpy()

            # Step 3: Fetch the attention weights of the incoming edges for the chosen head
            attention_weights = all_attention_weights[src_nodes_indices, head_to_visualize].cpu().numpy()
            print(f'Max attention weight = {np.max(attention_weights)} and min = {np.min(attention_weights)}')
            attention_weights /= np.max(attention_weights)  # rescale the biggest weight to 1 for nicer plotting

            # Build the neighborhood graph whose attention we want to visualize.
            # igraph works with a contiguous id range, so original node ids are
            # mapped to 0..size_of_neighborhood-1.
            id_to_igraph_id = dict(zip(source_node_ids, range(len(source_node_ids))))
            ig_graph = ig.Graph()
            ig_graph.add_vertices(size_of_neighborhood)
            ig_graph.add_edges([(id_to_igraph_id[neighbor], id_to_igraph_id[target_node_id]) for neighbor in source_node_ids])

            # Prepare the visualization settings dictionary and plot
            visual_style = {
                "edge_width": attention_weights,  # edge thickness follows the attention weight
                "layout": ig_graph.layout_reingold_tilford_circular()  # layout for tree-like graphs
            }
            # Only the color map is Cora specific.
            if dataset_name.lower() == DatasetType.CORA.name.lower():
                visual_style["vertex_color"] = [cora_label_to_color_map[label] for label in labels]
            else:
                print('Add custom color scheme for your specific dataset. Using igraph default coloring.')

            ig.plot(ig_graph, **visual_style)

    elif visualization_type == VisualizationType.EMBEDDINGS:  # visualize embeddings (using t-SNE)
        node_labels = node_labels.cpu().numpy()
        num_classes = len(set(node_labels))

        # Perplexity is arguably the most important t-SNE parameter: it
        # controls the size of the neighborhoods considered in the original
        # high dimensional space.
        t_sne_embeddings = TSNE(n_components=2, perplexity=30, method='barnes_hut').fit_transform(all_nodes_unnormalized_scores)

        for class_id in range(num_classes):
            # Points sharing a true label get the same color. Well separated
            # clusters suggest that the model learned good representations.
            # NOTE(review): the color map is the Cora one regardless of dataset_name.
            plt.scatter(
                t_sne_embeddings[node_labels == class_id, 0],
                t_sne_embeddings[node_labels == class_id, 1],
                s=20,
                color=cora_label_to_color_map[class_id],
                edgecolors='black',
                linewidths=0.2)
        plt.show()

    # Attention over a neighborhood is a probability distribution. If it is
    # (close to) uniform the model is not using attention in a meaningful
    # way. Entropy histograms show how far the learned distributions are from
    # the uniform reference.
    elif visualization_type == VisualizationType.ENTROPY:
        num_heads_per_layer = [layer.num_of_heads for layer in gat.gat_net]
        num_layers = len(num_heads_per_layer)

        num_of_nodes = len(node_features)
        target_node_ids = edge_index[1].cpu().numpy()

        # For every GAT layer and every attention head plot the entropy histogram
        for layer_id in range(num_layers):
            # Attention was logged during the forward pass above; the trailing
            # singleton dim is squeezed away, leaving one column per head.
            all_attention_weights = gat.gat_net[layer_id].attention_weights.squeeze(dim=-1).cpu().numpy()

            for head_id in range(num_heads_per_layer[layer_id]):
                uniform_dist_entropy_list = []  # entropies of the uniform reference distributions
                neighborhood_entropy_list = []

                for target_node_id in range(num_of_nodes):  # visit the neighborhood of every node
                    # Select the incoming edges of this node for the CURRENT
                    # head only. Without the head index the weights of all
                    # heads would be mixed together and every head of a layer
                    # would produce the same histogram.
                    neighborhood_attention = all_attention_weights[target_node_ids == target_node_id, head_id]
                    # Reference uniform distribution of the same length
                    ideal_uniform_attention = np.ones(len(neighborhood_attention)) / len(neighborhood_attention)

                    # Entropy intro: https://www.youtube.com/watch?v=ErfnhcEV1O8
                    neighborhood_entropy_list.append(entropy(neighborhood_attention, base=2))
                    uniform_dist_entropy_list.append(entropy(ideal_uniform_attention, base=2))

                title = f'Cora entropy histogram layer={layer_id}, attention head={head_id}'
                draw_entropy_histogram(uniform_dist_entropy_list, title, color='orange', uniform_distribution=True)
                draw_entropy_histogram(neighborhood_entropy_list, title, color='dodgerblue')

                fig = plt.gcf()  # grab the figure before show() so it can still be saved
                plt.show()
                fig.savefig(os.path.join(DATA_DIR_PATH, f'layer_{layer_id}_head_{head_id}.jpg'))
                plt.close()
    else:
        raise Exception(f'Visualization type {visualization_type} not supported.')
def train_gat_cora(config):
    """Build a GAT model from ``config`` and run one forward pass (plus a
    backward/optimizer step in the training phase).

    NOTE(review): as written this function reads several names that are not
    bound anywhere inside it: ``phase``, ``graph_data``, ``node_dim``,
    ``node_indices``, ``cross_entropy_loss`` and ``gt_node_labels``. Unless
    they exist at module level the call fails with NameError. The second half
    looks like the body of a per-phase loop function pasted in - confirm the
    intended structure before relying on this function.

    Args:
        config: Dict of settings. Keys read here: 'num_of_layers',
            'num_heads_per_layer', 'num_features_per_layer',
            'add_skip_connection', 'bias', 'dropout', 'layer_type', 'lr' and
            'weight_decay'. It is also passed to ``load_graph_data``.
    """
    # Use the GPU when available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 1: load the graph data
    node_features, node_labels, edge_index, train_indices, val_indices, test_indices = load_graph_data(
        config, device)
    # Author's note kept from the original: "BUG: node_features vary in AR"
    # (meaning unclear from this file - TODO confirm).
    # edge_index is a representation of the edges of the graph.
    # The original carried a commented-out assignment here:
    #   graph_data = (node_features, edge_index)
    # NOTE(review): graph_data is used below but never assigned in this function.

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # attention logging is only needed for visualizations
    ).to(device)

    # Step 3: Prepare other training related utilities (loss & optimizer)
    # NOTE(review): loss_fn is created here but the loss below is computed
    # with cross_entropy_loss, which is not defined in this function.
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # NOTE(review): phase is not a parameter or local of this function.
    if phase == LoopPhase.TRAIN:
        gat.train()
    else:
        gat.eval()

    # Do a forward pass and keep only the scores of the relevant nodes
    # (train/val or test ones). [0] extracts the node features part of the
    # model output (index 1 contains the edge_index).
    # Per the original note: shape = (N, C), N nodes in the split, C classes.
    # NOTE(review): node_dim and node_indices are not defined in this function.
    nodes_unnormalized_scores = gat(graph_data)[0].index_select(node_dim, node_indices)

    # Example: the output for a single Cora node is a vector of 7 unnormalized
    # scores such as V = [-1.393, 3.0765, -2.4445, 9.6219, 2.1658, -5.5243, -4.6247].
    # Cross entropy loss first applies a softmax to every such vector, giving
    # [1.6421e-05, 1.4338e-03, 5.7378e-06, 0.99797, 5.7673e-04, 2.6376e-07, 6.4848e-07],
    # then takes the probability of the correct class (say 3 -> 0.99797) and
    # uses -log(0.99797) as that node's loss. The per-node losses are averaged.
    # As the probability of the correct class approaches 1 the loss goes to 0.
    # NOTE(review): gt_node_labels is not defined in this function.
    loss = cross_entropy_loss(nodes_unnormalized_scores, gt_node_labels)

    if phase == LoopPhase.TRAIN:
        optimizer.zero_grad()  # clear the gradients stored in the .grad fields of the trainable weights
        loss.backward()  # compute the gradients for every trainable weight
        optimizer.step()  # apply the gradients to the weights
def train_gat_ppi(config):
    """Train a GAT model on PPI, optionally test it, and save the training state.

    Very similar to Cora's training script. The main differences are:
        1. Data loaders are used since this is an inductive setting with
           multiple graphs per batch.
        2. It is multi-label classification (BCEWithLogitsLoss) and micro-F1
           is reported instead of accuracy.
        3. Model architecture and hyperparams are a bit different (as
           reported in the GAT paper).

    Args:
        config: Dict of settings. Keys read here: 'force_cpu',
            'num_of_layers', 'num_heads_per_layer', 'num_features_per_layer',
            'add_skip_connection', 'bias', 'dropout', 'layer_type', 'lr',
            'weight_decay', 'patience_period', 'num_of_epochs', 'should_test'
            and 'dataset_name'. It is also passed to ``load_graph_data``,
            ``get_main_loop`` and ``utils.get_training_state``.

    Side effects:
        * Resets the module-level early-stopping variables.
        * Writes ``config['test_perf']`` (test micro-F1, or -1 when testing
          is disabled).
        * Saves the training state under ``BINARIES_PATH``.
    """
    # PATIENCE_CNT must be listed here too; otherwise the reset below would
    # only create an unused local and leave the module-level counter untouched.
    # NOTE(review): presumably the loop returned by get_main_loop reads these
    # globals - confirm.
    global BEST_VAL_PERF, BEST_VAL_LOSS, PATIENCE_CNT

    # PPI training requires almost 8 GB of VRAM, so 'force_cpu' allows
    # running on the CPU even when a (too weak) GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() and not config['force_cpu'] else "cpu")

    # Step 1: prepare the data loaders
    data_loader_train, data_loader_val, data_loader_test = load_graph_data(config, device)

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # attention logging is only needed for visualizations
    ).to(device)

    # Step 3: prepare the loss, the optimizer and the shared train/val/test loop
    loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # A single loop function is reused for all phases to avoid duplicating
    # the train and validation code.
    main_loop = get_main_loop(
        config,
        gat,
        loss_fn,
        optimizer,
        config['patience_period'],
        time.time())

    BEST_VAL_PERF, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: run the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, data_loader=data_loader_train, epoch=epoch)

        # Validation loop - no gradients are needed here
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, data_loader=data_loader_val, epoch=epoch)
            except Exception as e:
                # The validation phase signals "patience has run out" via an
                # exception. NOTE(review): this also swallows any unrelated
                # error raised during validation; the concrete exception type
                # is not visible here, so the catch is left broad.
                print(str(e))
                break  # stop training

    # Step 5: optionally evaluate on the test split.
    # Only report test numbers once the model has been tuned on the
    # validation split, to avoid overfitting to the test data.
    if config['should_test']:
        micro_f1 = main_loop(phase=LoopPhase.TEST, data_loader=data_loader_test)
        config['test_perf'] = micro_f1
        print('*' * 50)
        print(f'Test micro-F1 = {micro_f1}')
    else:
        config['test_perf'] = -1

    # Save the latest GAT in the binaries directory
    torch.save(
        utils.get_training_state(config, gat),
        os.path.join(BINARIES_PATH, utils.get_available_binary_name(config['dataset_name']))
    )
def train_gat_ppi(config):
    """Train a ``GAT_ppi`` model, optionally test it, and save the training state.

    Args:
        config: Dict of settings. Keys read here: 'force_cpu',
            'num_of_layers', 'num_heads_per_layer', 'num_features_per_layer',
            'add_skip_connection', 'bias', 'dropout', 'lr', 'weight_decay',
            'patience_period', 'num_of_epochs', 'should_test' and
            'dataset_name'. It is also passed to ``load_graph_data``,
            ``get_main_loop`` and ``utils.get_training_state``.

    Side effects:
        * Resets the module-level early-stopping variables.
        * Writes ``config['test_perf']`` (test micro-F1, or -1 when testing
          is disabled).
        * Saves the training state under ``BINARIES_PATH``.
    """
    # Module-level bookkeeping: best validation micro-F1, best validation loss
    # and the patience counter. PATIENCE_CNT must be listed here too;
    # otherwise the reset below would only create an unused local and leave
    # the module-level counter untouched.
    # NOTE(review): presumably the loop returned by get_main_loop reads these
    # globals - confirm.
    global BEST_VAL_MICRO_F1, BEST_VAL_LOSS, PATIENCE_CNT

    # Use the GPU only when one is available and the CPU is not forced.
    device = torch.device("cuda" if torch.cuda.is_available() and not config['force_cpu'] else "cpu")

    # Step 1: load the data
    data_loader_train, data_loader_val, data_loader_test = load_graph_data(config, device)

    # Step 2: prepare the model
    gat = GAT_ppi(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        log_attention_weights=False
    ).to(device)

    # Step 3: prepare the training utilities
    loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # One loop function is shared by all phases to improve code reuse.
    main_loop = get_main_loop(
        config=config,
        gat=gat,
        sigmoid_cross_entropy_loss=loss_fn,
        optimizer=optimizer,
        patience_period=config['patience_period'],
        time_start=time.time())

    BEST_VAL_MICRO_F1, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: run the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, data_loader=data_loader_train, epoch=epoch)

        # Validation loop - no gradients are needed here
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, data_loader=data_loader_val, epoch=epoch)
            except Exception as e:
                # NOTE(review): any exception raised during validation ends
                # training here; presumably this is the early-stopping signal
                # - confirm what the loop raises before narrowing the catch.
                print(str(e))
                break

    # Step 5: optionally evaluate on the test split
    if config['should_test']:
        micro_f1 = main_loop(phase=LoopPhase.TEST, data_loader=data_loader_test)
        config['test_perf'] = micro_f1
        print('*' * 50)
        print(f'Test micro-F1 = {micro_f1}')
    else:
        config['test_perf'] = -1

    # Save the latest GAT model state as a binary file
    torch.save(
        utils.get_training_state(config, gat),
        os.path.join(BINARIES_PATH, utils.get_available_binary_name(config['dataset_name'])))