def __init__(self,
                 av,
                 gr: PermGnnGraph,
                 num_hash_tables=10,
                 subset_size=8):
        super(LSH, self).__init__()
        assert (subset_size <= av.HASHCODE_DIM)
        self.av = av
        self.gr = gr
        self.all_nodes = self.gr.get_num_nodes()

        self.num_hash_tables = num_hash_tables
        #No. of buckets in a hashTable is 2^subset_size
        self.subset_size = subset_size
        self.powers_of_two = cudavar(
            self.av,
            torch.from_numpy(1 << np.arange(self.subset_size -
                                            1, -1, -1)).type(
                                                torch.FloatTensor))
        self.hash_functions = None
        self.init_hash_functions()

        #This contains +1 or -1 entries; used for bucketing nodes into hash tables
        self.hashcode_mat = cudavar(self.av, torch.tensor([]))
        self.all_hash_tables = []
        self.candidate_set = np.zeros(
            (self.gr.get_num_nodes(), self.gr.get_num_nodes()))
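For intuition, here is a minimal standalone sketch (plain PyTorch/NumPy, no av/cudavar, values made up) of how powers_of_two turns the +/-1 bits selected by one hash function into an integer bucket id, mirroring the logic used later in assign_bucket:

import numpy as np
import torch

subset_size = 4
# Same construction as above: [8., 4., 2., 1.]
powers_of_two = torch.from_numpy(1 << np.arange(subset_size - 1, -1, -1)).type(torch.FloatTensor)

node_hash_code = torch.tensor([1., -1., 1., 1.])   # +/-1 bits picked out by one hash function
binary_id = torch.clamp(node_hash_code, min=0.)    # replace -1 with 0 -> [1., 0., 1., 1.]
bucket_id = int(powers_of_two @ binary_id)         # 8 + 0 + 2 + 1
print(bucket_id)                                   # 11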
Example #2
    def __init__(self, av, gr: PermGnnGraph):
        super(PermutationGenerator, self).__init__()
        self.av = av
        self.gr = gr
        self.features = nn.Embedding(self.gr.get_num_nodes(),
                                     self.gr.get_num_features())
        self.features.weight = nn.Parameter(cudavar(
            self.av, torch.FloatTensor(self.gr.node_features)),
                                            requires_grad=False)
        self.feature_dim = self.gr.get_num_features()

        self.adj_list = self.gr.adjacency_list
        #Max set size is one greater than the max node outdegree, accounting for the presence of the node itself in the set
        self.max_set_size = self.gr.get_max_node_outdegree() + 1
        # Lookup table set_size -> mask. Given set_size k and max_set_size n, the mask
        # sets the top-left k*k square to 1 inside an n*n array; all other elements are 0
        self.set_size_to_mask_map = [
            torch.cat((torch.repeat_interleave(
                torch.tensor([1, 0]),
                torch.tensor([x, self.max_set_size - x])).repeat(x, 1),
                       torch.repeat_interleave(
                           torch.tensor([1, 0]),
                           torch.tensor([0, self.max_set_size
                                         ])).repeat(self.max_set_size - x, 1)))
            for x in range(1, self.max_set_size + 1)
        ]
        # List of tensors, one per node. Each tensor is the input sequence of neighbourhood features
        #if self.av.TASK == "1Perm":
        self.neighbour_features_all = [
            self.features(
                cudavar(self.av,
                        torch.LongTensor(sorted(list(self.adj_list[node])))))
            for node in range(self.gr.get_num_nodes())
        ]
        #else:
        #  self.neighbour_features_all = [self.features(cudavar(self.av,torch.LongTensor(list(self.adj_list[node])))) for node in range(self.gr.get_num_nodes())]
        #numpy array of set sizes for all node ids. Used later for variable length LSTM code
        self.set_size_all = np.array(
            [len(x) for x in self.neighbour_features_all])
        #Generate boolean mask for each node based on its set_size. Used for masked sinkhorn normalization
        self.sets_maskB_all = cudavar(
            self.av,
            torch.stack([
                self.set_size_to_mask_map[x - 1] == 0
                for x in self.set_size_all
            ]))
        #Generates padded tensor of dim(num_nodes*max_set_size*feature_dimension)
        self.padded_neighbour_features_all = pad_sequence(
            self.neighbour_features_all, batch_first=True)
        self.latent_dim = self.av.PERM_NETWORK_LATENT_DIM
        self.output_dim = self.max_set_size
        self.linear1 = nn.Linear(self.feature_dim, self.latent_dim)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(self.latent_dim, self.output_dim)
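To make the mask construction above concrete, here is a small illustration (plain PyTorch; max_set_size=4 and set_size=2 are chosen only for this example) of one entry of set_size_to_mask_map:

import torch

max_set_size, set_size = 4, 2
mask = torch.cat((
    torch.repeat_interleave(torch.tensor([1, 0]),
                            torch.tensor([set_size, max_set_size - set_size])).repeat(set_size, 1),
    torch.repeat_interleave(torch.tensor([1, 0]),
                            torch.tensor([0, max_set_size])).repeat(max_set_size - set_size, 1)))
print(mask)
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [0, 0, 0, 0],
#         [0, 0, 0, 0]])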
Example #3
 def init_non_nbr_mat(self, list_training_edges):
     for (a, b) in list_training_edges:
         self.non_nbr_mat[a][b] = 1
     z = cudavar(
         self.av,
         torch.zeros(self.gr.get_num_nodes(), self.gr.get_num_nodes()))
     o = cudavar(
         self.av,
         torch.ones(self.gr.get_num_nodes(), self.gr.get_num_nodes()))
     reverse = torch.where(self.non_nbr_mat == 0, o, z)
     self.non_nbr_mat = reverse
    def init_hash_functions(self):
        self.hash_functions = cudavar(self.av, torch.LongTensor([]))

        hash_code_dim = self.av.HASHCODE_DIM
        indices = list(range(hash_code_dim))
        for i in range(self.num_hash_tables):
            random.shuffle(indices)
            self.hash_functions = torch.cat(
                (self.hash_functions,
                 cudavar(self.av, torch.LongTensor(
                     [indices[:self.subset_size]]))),
                dim=0)
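As a hedged illustration of what init_hash_functions produces, each row below is a random subset_size-element subset of hashcode bit positions, one row per hash table (standalone sketch, dimensions made up):

import random
import torch

hash_code_dim, subset_size, num_hash_tables = 16, 8, 3
indices = list(range(hash_code_dim))
hash_functions = []
for _ in range(num_hash_tables):
    random.shuffle(indices)
    hash_functions.append(list(indices[:subset_size]))  # the first subset_size shuffled positions
hash_functions = torch.LongTensor(hash_functions)       # shape: (num_hash_tables, subset_size)
print(hash_functions.shape)                              # torch.Size([3, 8])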
Example #5
    def computeLoss(self, nodes):
        """
      :param   nodes  : batch of node ids from range 0..NUM_NODES
      :return  loss   : Hinge ranking loss
    """
        loss = 0
        all_nodes = list(range(self.gr.get_num_nodes()))
        all_embeds = cudavar(self.av, torch.tensor([]))

        if self.av.TASK == "Multiperm":
            all_embeds_perms = []
            for rep in range(self.av.NUM_PERMS):
                temp = cudavar(self.av, torch.tensor([]))
                #batch and send nodes to avoid memory limit crash for larger graphs
                for i in range(0, self.gr.get_num_nodes(), self.av.BATCH_SIZE):
                    batch_nodes = all_nodes[i:i + self.av.BATCH_SIZE]
                    temp = torch.cat((temp, self.forward(batch_nodes)), dim=0)
                all_embeds_perms.append(temp)
            all_embeds = torch.mean(torch.stack(all_embeds_perms), 0)
        else:
            #batch and send nodes to avoid memory limit crash for larger graphs
            for i in range(0, self.gr.get_num_nodes(), self.av.BATCH_SIZE):
                batch_nodes = all_nodes[i:i + self.av.BATCH_SIZE]
                all_embeds = torch.cat((all_embeds, self.forward(batch_nodes)),
                                       dim=0)

        #Filter for query_nodes
        nodes = list(set(self.gr.query_node_list).intersection(set(nodes)))

        for i in range(len(nodes)):
            selfemb = all_embeds[nodes[i]]

            nbrs = all_embeds[list(self.gr.query_node_nbr[nodes[i]])]
            nonnbrs = all_embeds[list(self.gr.query_node_non_nbr[nodes[i]])]

            #https://pytorch.org/docs/master/generated/torch.nn.CosineSimilarity.html
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            pos_scores = cos(nbrs, selfemb.unsqueeze(0))
            neg_scores = cos(nonnbrs, selfemb.unsqueeze(0))

            len_pos = pos_scores.shape[0]
            len_neg = neg_scores.shape[0]
            expanded_pos_scores = pos_scores.unsqueeze(1).expand(
                len_pos, len_neg)
            expanded_neg_scores = neg_scores.unsqueeze(0).expand(
                len_pos, len_neg)

            loss += torch.max(
                self.av.MARGIN + expanded_neg_scores - expanded_pos_scores,
                cudavar(self.av, torch.tensor([0.]))).sum()

        return loss
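For reference, a self-contained toy run of the same hinge ranking loss (plain PyTorch, made-up cosine scores, margin=0.1 for illustration only): every (positive, negative) pair contributes max(0, margin + neg_score - pos_score).

import torch

margin = 0.1
pos_scores = torch.tensor([0.9, 0.6])        # similarities to true neighbours
neg_scores = torch.tensor([0.2, 0.5, 0.7])   # similarities to non-neighbours

expanded_pos = pos_scores.unsqueeze(1).expand(2, 3)   # each row repeats one positive score
expanded_neg = neg_scores.unsqueeze(0).expand(2, 3)   # each column repeats one negative score
loss = torch.clamp(margin + expanded_neg - expanded_pos, min=0.).sum()
print(loss)   # only the pair (pos=0.6, neg=0.7) violates the margin -> tensor(0.2000)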
Example #6
def fetch_permgnn_embeddings(av,gr: PermGnnGraph):
  avTask = av.TASK

  av.TASK = "PermGNN"
  pickle_fp = "./data/embeddingPickles/"+av.TASK+"_"+av.DATASET_NAME+"_tfrac_"+str(av.TEST_FRAC)+"_vfrac_"+str(av.VAL_FRAC) + "_embedding_mat.pkl"
  if not os.path.exists(pickle_fp):
    query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

    prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)
    device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
    permNet = PermutationGenerator(av,gr).to(device)
    permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
    #if VAL_FRAC is 0, we fetch model weights from the last trained epoch
    #else we fetch the best-performing model on the validation dataset
    if av.VAL_FRAC==0:
      checkpoint = load_model(av)
      logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
    else:
      es = EarlyStoppingModule(av)
      checkpoint = es.load_best_model()
      logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

    permGNN.load_state_dict(checkpoint['model_state_dict'])

    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    all_embeds = cudavar(av,torch.tensor([]))
    for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) :
      batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
      set_size = permGNN.permNet.set_size_all[batch_nodes]
      neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
      all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

    logger.info("Creating permgnn embedding pickle at %s",pickle_fp)
    with open(pickle_fp, 'wb') as f:
      pickle.dump(all_embeds, f)

  else:
    logger.info("Loading permgnn embedding pickle from %s",pickle_fp)
    with open(pickle_fp, 'rb') as f:
      all_embeds = pickle.load(f)

  av.TASK = avTask
  return cudavar(av,all_embeds)
Example #7
    def computeLoss(self, nodes):
        """
      :param   nodes  : batch of node ids from range 0..NUM_NODES
      :return  loss   : Hinge ranking loss
    """
        loss1 = loss2 = loss3 = 0
        all_hashcodes = self.forward(nodes)
        num_nodes = len(nodes)

        for i in range(len(nodes)):
            selfcode = all_hashcodes[i]

            loss1 = loss1 + torch.abs(torch.sum(selfcode))
            loss2 = loss2 + torch.norm(torch.abs(selfcode) - 1, p=1)

        indices = cudavar(self.av, torch.tensor(nodes))
        non_nbrs = torch.index_select(
            torch.index_select(self.non_nbr_mat, 0, indices), 1, indices)
        similarity_mat = torch.mul(
            torch.abs(
                torch.mm(all_hashcodes, torch.transpose(all_hashcodes, 0, 1))),
            non_nbrs)

        loss3 = torch.sum(similarity_mat) - torch.sum(
            torch.diagonal(similarity_mat))

        return loss1, loss2, loss3, num_nodes
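A hedged reading of the three terms, illustrated on a toy hashcode matrix (plain PyTorch, vectorized here for brevity, data made up): loss1 pushes each code towards an equal number of +1 and -1 bits, loss2 pushes entries towards +/-1, and loss3 penalizes large absolute inner products between codes of non-neighbouring nodes.

import torch

hashcodes = torch.tensor([[ 0.9, -0.8,  0.7, -0.6],
                          [ 0.5,  0.4, -0.9,  0.8]])
non_nbrs = torch.tensor([[0., 1.],
                         [1., 0.]])                      # 1 marks a non-neighbour pair

loss1 = torch.abs(hashcodes.sum(dim=1)).sum()            # bit-balance: |sum of bits| per node
loss2 = torch.norm(torch.abs(hashcodes) - 1, p=1)        # distance of entries from +/-1
sim = torch.abs(hashcodes @ hashcodes.t()) * non_nbrs    # masked |inner products|
loss3 = sim.sum() - sim.diagonal().sum()                 # off-diagonal (pairwise) part only
print(loss1.item(), loss2.item(), loss3.item())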
Example #8
def compute_scores(av,permGNN,query_nodes,list_test_edges,list_test_non_edges):
  all_nodes = list(range(permGNN.gr.get_num_nodes()))
  all_embeds = cudavar(av,torch.tensor([]))
  #batch and send nodes to avoid memory limit crash for larger graphs
  for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
    batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
    all_embeds = torch.cat((all_embeds,permGNN.forward(batch_nodes).data),dim=0)
  return compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges) 
 def assign_bucket(self, function_id, node_hash_code):
     func = self.hash_functions[function_id]
     # Convert the sequence of -1s and 1s to binary by replacing each -1 with 0
     binary_id = torch.max(
         torch.index_select(node_hash_code, dim=0, index=func),
         cudavar(self.av, torch.tensor([0.])))
     #Map the binary sequence to an int, which is the bucket id
     bucket_id = self.powers_of_two @ binary_id
     return bucket_id.item()
Example #10
 def getEmbeddingForFeatures(self,
                             set_size,
                             neighbour_features,
                             diagnostic_mode=False):
     """
   :param  set_size           : neighbourhood set sizes for each node. 
                                 Needed for variable length LSTM code
   :param  neighbour_features : permutation of neighbour feature vectors for each node
                                For node_set_size k and max_set_size n last (n-k) rows are padded with 0
   :return node_embeddings    : Embedding dim currently same as input feature dimension. 
 """
     #The 3 steps below (pack_padded_sequence -> LSTM -> pad_packed_sequence)
     #implement a variable-length LSTM, so 0-padded rows are not fed to the LSTM network
     packed_neighbour_features = pack_padded_sequence(neighbour_features,
                                                      set_size,
                                                      batch_first=True,
                                                      enforce_sorted=False)
     packed_lstm_output, (ht, ct) = self.lstm(packed_neighbour_features)
     padded_lstm_output = pad_packed_sequence(packed_lstm_output,
                                              batch_first=True)
     #Append a 1 in the bias column, except for rows that are pads
     aug_lstm_output = torch.cat(
         (padded_lstm_output[0],
          pad_sequence([
              cudavar(self.av, torch.ones([x])).unsqueeze(0).t()
              for x in padded_lstm_output[1].tolist()
          ],
                       batch_first=True)),
         dim=2)
     node_embeddings = torch.sum(
         self.fully_connected_layer(aug_lstm_output), dim=1)
     #Diagnostic mode was added later to instrument sensitivity to permutations across layers
     if diagnostic_mode:
         lstm_output_flat = padded_lstm_output[0].flatten(1)
         zero_pad = cudavar(
             self.av,
             torch.zeros(padded_lstm_output[0].shape[0],
                         (self.permNet.max_set_size -
                          padded_lstm_output[0].shape[1]) *
                         padded_lstm_output[0].shape[2]))
         final = torch.cat((lstm_output_flat, zero_pad), 1)
         return final.data, node_embeddings.data
     else:
         return node_embeddings
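A minimal standalone sketch of the variable-length pattern used above, pack_padded_sequence -> LSTM -> pad_packed_sequence, with made-up dimensions (nothing here comes from the repo's configuration):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

feature_dim, hidden_dim = 5, 3
lstm = nn.LSTM(feature_dim, hidden_dim, num_layers=1, batch_first=True)

# Two "neighbourhood sets" of different sizes, padded to a common length.
sets = [torch.randn(4, feature_dim), torch.randn(2, feature_dim)]
set_sizes = torch.tensor([4, 2])
padded = pad_sequence(sets, batch_first=True)             # shape (2, 4, 5)

packed = pack_padded_sequence(padded, set_sizes, batch_first=True, enforce_sorted=False)
packed_out, (ht, ct) = lstm(packed)
padded_out, lengths = pad_packed_sequence(packed_out, batch_first=True)
print(padded_out.shape, lengths)                          # torch.Size([2, 4, 3]) tensor([4, 2])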
Example #11
 def __init__(self, av, gr: PermGnnGraph):
     super(HashCodeGenerator, self).__init__()
     self.av = av
     self.gr = gr
     self.all_embeddings = nn.Embedding(self.gr.get_num_nodes(),
                                        self.av.EMBEDDING_DIM)
     self.non_nbr_mat = cudavar(
         self.av,
         torch.zeros(self.gr.get_num_nodes(), self.gr.get_num_nodes()))
     #Reusing PERM_NETWORK_LATENT_DIM here because why not ¯\_(ツ)_/¯
     self.latent_dim = self.av.PERM_NETWORK_LATENT_DIM
     self.hash_linear1 = nn.Linear(self.av.EMBEDDING_DIM,
                                   self.av.HASHCODE_DIM)
     self.hash_tanh1 = nn.Tanh()
     nn.init.normal_(self.hash_linear1.weight)
Example #12
def run_graph_lp_hash_gaussian(av, gr: PermGnnGraph):
    pickle_fp = av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_gaussian_" + av.DATASET_NAME + "_tfrac_" + str(
        av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_hashcode_mat.pkl"
    if not os.path.exists(pickle_fp):
        #fetch permGNN embeddings
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        all_embeds = fetch_permgnn_embeddings(av, gr)
        fp = av.DIR_PATH + "/data/hashcodePickles/gauss_hplanes_D_16.pkl"
        hplanes = pickle.load(open(fp, 'rb'))
        projections = all_embeds.cpu().numpy() @ np.transpose(hplanes)
        hcode = np.sign(projections)
        all_hashcodes = cudavar(av, torch.tensor(hcode))
        logger.info("Dumping gaussian hashcode pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_hashcodes, f)
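The pickled gauss_hplanes_D_16.pkl is loaded above rather than shown; as a hedged sketch, signed random-hyperplane projections of this kind can be reproduced with plain NumPy (the dimensions below are illustrative, not the repo's settings):

import numpy as np

rng = np.random.default_rng(0)
embedding_dim, hashcode_dim = 16, 16
hplanes = rng.standard_normal((hashcode_dim, embedding_dim))   # one Gaussian hyperplane per bit

embeds = rng.standard_normal((5, embedding_dim))               # toy node embeddings
hcode = np.sign(embeds @ hplanes.T)                            # entries are (almost surely) +/-1
print(hcode.shape)                                             # (5, 16)
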
def compute_loss(av,gr, all_embeds):
    loss=0
    nodes = gr.query_node_list
    for i in range(len(nodes)):
      selfemb = all_embeds[nodes[i]]

      nbrs = all_embeds[list(gr.query_node_nbr[nodes[i]])]
      nonnbrs = all_embeds[list(gr.query_node_non_nbr[nodes[i]])]
      
      #https://pytorch.org/docs/master/generated/torch.nn.CosineSimilarity.html
      cos = nn.CosineSimilarity(dim=1, eps=1e-6)
      pos_scores = cos(nbrs,selfemb.unsqueeze(0))
      neg_scores = cos(nonnbrs,selfemb.unsqueeze(0))  
      
      len_pos = pos_scores.shape[0]
      len_neg = neg_scores.shape[0]
      expanded_pos_scores = pos_scores.unsqueeze(1).expand(len_pos,len_neg)
      expanded_neg_scores = neg_scores.unsqueeze(0).expand(len_pos,len_neg)
        
      loss += torch.max(av.MARGIN + expanded_neg_scores - expanded_pos_scores,cudavar(av,torch.tensor([0.]))).sum()

    return loss.item()
Example #14
    def __init__(self, av, gr: PermGnnGraph, permNet: PermutationGenerator):
        super(PermutationInvariantGNN, self).__init__()
        self.av = av
        self.gr = gr
        self.features = nn.Embedding(self.gr.get_num_nodes(),
                                     self.gr.get_num_features())
        self.features.weight = nn.Parameter(cudavar(
            self.av, torch.FloatTensor(self.gr.node_features)),
                                            requires_grad=False)
        self.adj_list = self.gr.adjacency_list

        self.lstm_input_size = self.gr.get_num_features()
        self.lstm_hidden_size = self.av.LSTM_HIDDEN_DIM
        self.fclayer_output_size = self.av.EMBEDDING_DIM
        #LSTM layer init
        self.lstm = nn.LSTM(self.lstm_input_size,
                            self.lstm_hidden_size,
                            num_layers=1,
                            batch_first=True)
        #FC layer init. The bias is folded into the weight matrix, so aug_lstm_output is built in getEmbeddingForFeatures to compensate
        self.fully_connected_layer = nn.Linear(self.lstm_hidden_size + 1,
                                               self.fclayer_output_size,
                                               bias=False)
        self.permNet = permNet
 def init_hash_code_mat(self, all_hashcodes):
     signed_codes = torch.sign(all_hashcodes)
     if (signed_codes == 0).any():
         logger.info("Hashcode had 0 bits. Replacing them all with 1")
         signed_codes[signed_codes == 0] = 1
     self.hashcode_mat = cudavar(self.av, signed_codes)
Example #16
def compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges):
  cos = nn.CosineSimilarity(dim=1, eps=1e-6)
  #per qnode
  #all_qnode_auc = [] 
  all_qnode_ap = []
  all_qnode_rr = []
  #all_qnode_ndcg = []
  for qnode in query_nodes : 
    qnode_edges = list(filter(lambda x: x[0]==qnode or x[1]==qnode, list_test_edges))
    qnode_non_edges = list(filter(lambda x: x[0]==qnode or x[1]==qnode, list_test_non_edges))
    if len(qnode_edges)==0 or len(qnode_non_edges)==0: 
      continue
    a,b = zip(*qnode_edges)
    self_tensors = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(a)))
    nbr_tensors  = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(b)))
    pos_scores   = cos(self_tensors,nbr_tensors)

    a,b = zip(*qnode_non_edges)
    self_tensors = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(a)))
    nbr_tensors  = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(b)))
    neg_scores   = cos(self_tensors,nbr_tensors)

    if av.has_cuda and av.want_cuda:
      all_scores = torch.cat((pos_scores,neg_scores)).cpu().numpy()
    else:
      all_scores = torch.cat((pos_scores,neg_scores)).numpy()

    all_labels = np.hstack([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
    auc_score  = roc_auc_score(all_labels, all_scores)
    ap_score   = average_precision_score(all_labels, all_scores)
    #ndcg       = ndcg_score([all_labels],[all_scores])

    so = np.argsort(all_scores)[::-1]
    labels_rearranged = all_labels[so]
    rr_score = 1/(labels_rearranged.tolist().index(1)+1)
    
    #all_qnode_auc.append(auc_score)
    all_qnode_ap.append(ap_score)
    all_qnode_rr.append(rr_score)
    #all_qnode_ndcg.append(ndcg)
  #agglo
  pos_scores = []
  neg_scores = []

  a,b = zip(*list_test_edges)
  self_tensors = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(a)))
  nbr_tensors  = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(b)))
  pos_scores   = cos(self_tensors,nbr_tensors)

  a,b = zip(*list_test_non_edges)
  self_tensors = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(a)))
  nbr_tensors  = torch.index_select(all_embeds,dim=0,index=cudavar(av,torch.tensor(b)))
  neg_scores   = cos(self_tensors,nbr_tensors)

  if av.has_cuda and av.want_cuda:
    all_scores = torch.cat((pos_scores,neg_scores)).cpu().numpy()
  else:
    all_scores = torch.cat((pos_scores,neg_scores)).numpy()

  all_labels = np.hstack([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
  auc_score  = roc_auc_score(all_labels, all_scores)
  ap_score   = average_precision_score(all_labels, all_scores)
  #ndcg       = ndcg_score([all_labels],[all_scores])
  
  #so = np.argsort(all_scores)[::-1]
  #labels_rearranged = all_labels[so]
  #rr_score = 1/(labels_rearranged.tolist().index(1)+1)

  return auc_score, ap_score, np.mean(all_qnode_ap), np.mean(all_qnode_rr)
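A toy example of the per-query reciprocal-rank computation used above (made-up scores and labels, plain NumPy):

import numpy as np

all_scores = np.array([0.2, 0.9, 0.4, 0.7])
all_labels = np.array([1., 0., 0., 0.])          # 1 marks the true test edge

so = np.argsort(all_scores)[::-1]                # indices in descending score order: [1, 3, 2, 0]
labels_rearranged = all_labels[so]               # [0., 0., 0., 1.]
rr_score = 1 / (labels_rearranged.tolist().index(1) + 1)
print(rr_score)                                  # true edge ranked 4th -> 0.25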
Example #17
def run_graph_lp_hash(av, gr: PermGnnGraph):
    pickle_fp = av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_" + av.DATASET_NAME + "_tfrac_" + str(
        av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_L1_" + str(
            av.LAMBDA1) + "_L2_" + str(av.LAMBDA2) + "_hashcode_mat.pkl"
    if not os.path.exists(pickle_fp):
        #if av.has_cuda:
        #  torch.cuda.reset_max_memory_allocated(0)
        #fetch permGNN embeddings
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        query_nodes, \
          list_training_edges, \
          list_training_non_edges, \
          list_test_edges, \
          list_test_non_edges, \
          list_val_edges, \
          list_val_non_edges = fetch_lp_data_split(av,gr)

        prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                           list_training_non_edges, list_val_edges,
                           list_test_edges, list_val_non_edges,
                           list_test_non_edges)
        all_embeds = fetch_permgnn_embeddings(av, gr)

        hashCodeGenerator = HashCodeGenerator(av, gr).to(device)
        hashCodeGenerator.init_embeddings(all_embeds)
        hashCodeGenerator.init_non_nbr_mat(list_training_edges)

        es = EarlyStoppingModule(av, 50, 0.001)

        optimizerFunc = torch.optim.SGD(hashCodeGenerator.parameters(),
                                        lr=av.LEARNING_RATE_FUNC)
        nodes = list(range(gr.get_num_nodes()))
        epoch = 0
        #if VAL_FRAC is 0, we train the model for NUM_EPOCHS
        #else we train the model until the early stopping criterion is met
        while av.VAL_FRAC != 0 or epoch < av.NUM_EPOCHS:
            random.shuffle(nodes)
            start_time = time.time()
            totalEpochLoss = 0
            for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
                nodes_batch = nodes[i:i + av.BATCH_SIZE]
                hashCodeGenerator.zero_grad()
                loss1, loss2, loss3, num_nodes = hashCodeGenerator.computeLoss(
                    nodes_batch)
                totalLoss = (av.LAMBDA1 / num_nodes) * loss1 + (
                    av.LAMBDA2 / num_nodes) * loss2 + (
                        (1 -
                         (av.LAMBDA1 + av.LAMBDA2)) / (num_nodes**2)) * loss3
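                # LAMBDA1 and LAMBDA2 weight the per-node bit-balance (loss1) and
                # +/-1 saturation (loss2) terms, each averaged over num_nodes; the
                # remaining (1 - LAMBDA1 - LAMBDA2) weights the pairwise non-neighbour
                # similarity term loss3, normalised by num_nodes**2 since it sums over node pairs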
                totalLoss.backward()
                optimizerFunc.step()
                totalEpochLoss = totalEpochLoss + totalLoss.item()
            end_time = time.time()
            logger.info("Epoch: %d totalEpochLoss: %f time: %.2f", epoch,
                        totalEpochLoss, end_time - start_time)
            if av.VAL_FRAC != 0:
                if es.check([-totalEpochLoss], hashCodeGenerator, epoch):
                    break
            epoch += 1
        if av.has_cuda:
            logger.info("Max gpu memory used: %.6f ",
                        torch.cuda.max_memory_allocated(device=0) / (1024**3))

        #generate and dump hashcode  pickles
        all_nodes = list(range(gr.get_num_nodes()))
        all_hashcodes = cudavar(av, torch.tensor([]))
        for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
            all_hashcodes = torch.cat(
                (all_hashcodes, hashCodeGenerator.forward(batch_nodes).data),
                dim=0)
        logger.info("Dumping trained hashcode pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_hashcodes, f)
    def compute_lp_scores(self,
                          all_embeds,
                          query_nodes,
                          candidate_list,
                          k,
                          use_tensor=False):
        """
      Return aggregate AP/NDCG and per-query-node MAP/mean NDCG (plus timing info) for the given candidate_list
    """
        agglo_k = len(query_nodes) * k
        time_dict = {}
        time_dict['start_score_computation'] = time.time()
        #cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        #a,b = zip(*candidate_list)
        #self_tensors = torch.index_select(all_embeds,dim=0,index=cudavar(self.av,torch.tensor(a)))
        #nbr_tensors  = torch.index_select(all_embeds,dim=0,index=cudavar(self.av,torch.tensor(b)))
        #scores   = cos(self_tensors,nbr_tensors).tolist()
        #cos = nn.CosineSimilarity(dim=0, eps=1e-6)
        #scores = []
        #for (a,b) in candidate_list:
        #  scores.append( cos(all_embeds[a],all_embeds[b]))
        if use_tensor:
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            a, b = zip(*candidate_list)
            self_tensors = torch.index_select(all_embeds,
                                              dim=0,
                                              index=cudavar(
                                                  self.av, torch.tensor(a)))
            nbr_tensors = torch.index_select(all_embeds,
                                             dim=0,
                                             index=cudavar(
                                                 self.av, torch.tensor(b)))
            scores = cos(self_tensors, nbr_tensors).tolist()
        else:
            cos = nn.CosineSimilarity(dim=0, eps=1e-6)
            scores = []
            for (a, b) in candidate_list:
                scores.append(cos(all_embeds[a], all_embeds[b]))
            scores = torch.stack(scores).tolist()

        time_dict['end_score_computation'] = time.time()
        time_dict['start_heap_procedure'] = time.time()
        score_heap = []
        heap_size = 0
        qnode_heap_dict = {}
        qnode_heap_size_dict = {}
        for node in query_nodes:
            qnode_heap_dict[node] = []
            qnode_heap_size_dict[node] = 0

        for i in range(len(candidate_list)):
            if heap_size < agglo_k:
                heap_size = heap_size + 1
                heapq.heappush(score_heap, (scores[i], candidate_list[i]))
            else:
                heapq.heappushpop(score_heap, (scores[i], candidate_list[i]))
            for node in candidate_list[i]:
                if node in query_nodes:
                    if qnode_heap_size_dict[node] < k:
                        qnode_heap_size_dict[
                            node] = qnode_heap_size_dict[node] + 1
                        heapq.heappush(qnode_heap_dict[node],
                                       (scores[i], candidate_list[i]))
                    else:
                        heapq.heappushpop(qnode_heap_dict[node],
                                          (scores[i], candidate_list[i]))

        time_dict['end_heap_procedure'] = time.time()
        scores, predicted_edges = list(zip(*score_heap))
        all_scores = list(scores)
        all_labels = np.array([
            self.candidate_set[a][b] for (a, b) in list(predicted_edges)
        ])
        all_labels[all_labels == -1] = 0
        if np.all(all_labels == 1):
            ap_score = 1
        elif np.all(all_labels == 0):
            ap_score = 0
        else:
            ap_score = average_precision_score(all_labels, all_scores)

        ndcg = ndcg_score([all_labels], [all_scores])  #,k=agglo_k)

        ap_score_agglo = ap_score
        ndcg_score_agglo = ndcg

        all_qnode_ap = []
        all_qnode_ndcg = []
        for qnode in query_nodes:
            if qnode_heap_size_dict[qnode] == 0:
                continue
            scores, predicted_edges = list(zip(*qnode_heap_dict[qnode]))
            all_scores = list(scores)
            all_labels = np.array([
                self.candidate_set[a][b]
                for (a, b) in list(predicted_edges)
            ])
            all_labels[all_labels == -1] = 0
            if np.all(all_labels == 1):
                ap_score = 1
                ndcg = 1
            elif np.all(all_labels == 0):
                ap_score = 0
                ndcg = 0
            else:
                ap_score = average_precision_score(all_labels, all_scores)
                ndcg = ndcg_score([all_labels], [all_scores])
            all_qnode_ap.append(ap_score)
            all_qnode_ndcg.append(ndcg)

        return ap_score_agglo, ndcg_score_agglo, np.mean(
            all_qnode_ap), np.mean(all_qnode_ndcg), time_dict
Example #19
def lp_permute_test_result(av,gr: PermGnnGraph):
  query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

  prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)

  device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
  permNet = PermutationGenerator(av,gr).to(device)
  permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
  #if VAL_FRAC is 0, we fetch model weights from the last trained epoch
  #else we fetch the best-performing model on the validation dataset
  if av.VAL_FRAC==0:
    checkpoint = load_model(av)
    logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
  else:
    es = EarlyStoppingModule(av)
    checkpoint = es.load_best_model()
    logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

  permGNN.load_state_dict(checkpoint['model_state_dict'])

  logger.info("Test scores  with canonical input sequence")
  start_time = time.time()
  
  all_nodes = list(range(permGNN.gr.get_num_nodes()))
  all_embeds = cudavar(av,torch.tensor([]))
  #batch and send nodes to avoid memory limit crash for larger graphs
  for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
    batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
    set_size = permGNN.permNet.set_size_all[batch_nodes]
    neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
    all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

  auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges)

  end_time = time.time()
  logger.info("auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",auc_score,ap_score,map_score,mrr_score ,end_time-start_time)

  logger.info("Test scores with randomly permuted input sequence")
  for num_run in range(10):
    start_time = time.time()
  
    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    all_embeds = cudavar(av,torch.tensor([]))
    #permute neighbour features
    perm_neighbour_features = pad_sequence([mat[torch.randperm(int(size))] \
                                            for (mat,size) in zip(permGNN.permNet.padded_neighbour_features_all,permGNN.permNet.set_size_all)],\
                                           batch_first=True)
    #batch and send nodes to avoid memory limit crash for larger graphs
    for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
      batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
      set_size = permGNN.permNet.set_size_all[batch_nodes]
      #neighbour_features = permGNN.padded_neighbour_features_all[batch_nodes]
      neighbour_features = perm_neighbour_features[batch_nodes]      
      all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

    auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges)

    end_time = time.time()
    logger.info("auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",auc_score,ap_score,map_score,mrr_score ,end_time-start_time)
def performance_analysis(av,gr: PermGnnGraph):
  query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

  prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)
  
  #num_perms = 5
  num_perms = 1
  all_info = generate_global_permutations(av,gr,num_perms) 

  device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
  permNet = PermutationGenerator(av,gr).to(device)
  permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
  #if VAL_FRAC is 0, we fetch model weights from the last trained epoch
  #else we fetch the best-performing model on the validation dataset
  if av.VAL_FRAC==0:
    checkpoint = load_model(av)
    logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
  else:
    es = EarlyStoppingModule(av)
    checkpoint = es.load_best_model()
    logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

  permGNN.load_state_dict(checkpoint['model_state_dict'])

  cos = nn.CosineSimilarity(dim=1, eps=1e-6)
  
  all_nodes = list(range(permGNN.gr.get_num_nodes()))

  canonical_lstm_op = cudavar(av,torch.tensor([]))
  canonical_embeds = cudavar(av,torch.tensor([]))
  #batch and send nodes to avoid memory limit crash for larger graphs
  for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
    batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
    set_size = permGNN.permNet.set_size_all[batch_nodes]
    neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
    lstm_op,embeds = permGNN.getEmbeddingForFeatures(set_size,neighbour_features,True)
    canonical_lstm_op = torch.cat((canonical_lstm_op,lstm_op),dim=0)
    canonical_embeds = torch.cat((canonical_embeds,embeds),dim=0)
  canonical_inputs = permGNN.permNet.padded_neighbour_features_all.flatten(1) 
   
  canonical_tr_loss = compute_loss(av,gr,canonical_embeds)   
  
  for sample_frac in [0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1]:
    for perm_type in ['rand','rev']:
      for n_perm in range(num_perms):
        perm_info = all_info[sample_frac][perm_type][n_perm]  
        all_embeds = cudavar(av,torch.tensor([]))
        all_lstm_op = cudavar(av,torch.tensor([]))
        #permute neighbour features
        perm_neighbour_features = []
        for node in range(gr.get_num_nodes()): 
          node_feats_orig = permGNN.permNet.padded_neighbour_features_all[node]
          node_feats_perm = node_feats_orig[torch.tensor(perm_info[node])]
          perm_neighbour_features.append(node_feats_perm)
        perm_neighbour_features = pad_sequence(perm_neighbour_features,batch_first=True)
        #batch and send nodes to avoid memory limit crash for larger graphs
        for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
          batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
          set_size = permGNN.permNet.set_size_all[batch_nodes]
          neighbour_features = perm_neighbour_features[batch_nodes]
          lstm_op,embeds = permGNN.getEmbeddingForFeatures(set_size,neighbour_features,True)
          all_lstm_op = torch.cat((all_lstm_op,lstm_op),dim=0)
          all_embeds = torch.cat((all_embeds,embeds),dim=0)
        
        all_info[sample_frac][perm_type][n_perm]['inputs_sens_score_list'] = cos(canonical_inputs, perm_neighbour_features.flatten(1))
        all_info[sample_frac][perm_type][n_perm]['lstm_op_sens_score_list'] = cos(canonical_lstm_op,all_lstm_op) 
        all_info[sample_frac][perm_type][n_perm]['embeds_sens_score_list'] = cos(canonical_embeds,all_embeds)

        perm_tr_loss = compute_loss(av,gr,all_embeds)
        all_info[sample_frac][perm_type][n_perm]['loss_var'] = abs(perm_tr_loss-canonical_tr_loss)/canonical_tr_loss
  fname = av.DIR_PATH+"/data/KTAU_var_data/" + "Ktau_variation_data"+"_"+av.TASK+"_"+av.DATASET_NAME+"_tfrac_"+str(av.TEST_FRAC)+"_vfrac_"+str(av.VAL_FRAC) + "_data.pkl"
  pickle.dump(all_info,open(fname,"wb"))
Example #21
 def forward(self, nodes):
     node_embeddings = self.all_embeddings(
         cudavar(self.av, torch.LongTensor(nodes)))
     node_hashcodes = self.hash_tanh1(self.hash_linear1(node_embeddings))
     return node_hashcodes
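Finally, a minimal hedged sketch of the same pattern in isolation: a linear layer followed by tanh gives near-+/-1 codes, which init_hash_code_mat later binarises with sign (the dimensions and node count below are made up):

import torch
import torch.nn as nn

embedding_dim, hashcode_dim = 16, 8
all_embeddings = nn.Embedding(10, embedding_dim)        # 10 toy nodes
hash_linear = nn.Linear(embedding_dim, hashcode_dim)
nn.init.normal_(hash_linear.weight)

nodes = torch.LongTensor([0, 3, 7])
codes = torch.tanh(hash_linear(all_embeddings(nodes)))  # values in (-1, 1)
binary_codes = torch.sign(codes)                        # binarised +/-1 hashcodes
print(codes.shape, binary_codes[0])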