Example No. 1
 def forward(self, mmfeats, ious1dmask):
     """
     Inputs:
         mmfeats (tensor[B, seg, hdim])
     Returns:
         iou_predict (tensor(B, seg*num_anchors))
         box_predict (tensor(B, seg*num_anchors, 2))
         ious1dmask (tensor(1, seg*num_anchors))
     """
     B, seg = mmfeats.shape[0], mmfeats.shape[1]
     # Predict Alignment Score
     if self.causal:
         if self.training:
             # De-confound Training
             self.d = 0.9*self.d + 0.1*mmfeats.detach().mean(0, keepdim=True) # [1, seg, hdim]
             mmfeats = F.normalize(mmfeats, dim=2)
             iou_predict = self.tau * self.fc_score(mmfeats.transpose(-1, -2)) / \
                             (torch.norm(self.fc_score.weight[:,:,0],dim=1)[None,:,None] + self.gamma) # [B, num_anchors, seg]
         else:
             # counterfactual TDE inference
             bias = self.cos_sim(mmfeats, self.d).unsqueeze(2) * F.normalize(self.d, dim=2) # [1, seg, hdim]
             mmfeats = F.normalize(mmfeats, dim=2) - self.alpha*bias
             iou_predict = self.tau * self.fc_score(mmfeats.transpose(-1, -2)) / \
                             (torch.norm(self.fc_score.weight[:,:,0],dim=1)[None,:,None] + self.gamma) # [B, num_anchors, seg]
     else:
         iou_predict = self.fc_score(mmfeats.transpose(-1, -2)) # [B, num_anchors, seg]
     iou_predict = torch.sigmoid(iou_predict).transpose(-1, -2) # [B, seg, num_anchors]
     iou_predict = iou_predict.contiguous().view(B, -1) * ious1dmask.float()
     # Predict Box Offset
     box_offset = self.fc_reg(mmfeats.transpose(-1, -2)).transpose(-1, -2)
     box_offset = box_offset.contiguous().view(B, seg * self.num_anchors, 2) # [B, seg*num_anchor, 2]
     
     return iou_predict, box_offset
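
A minimal shape check for the non-causal path above. The layer types are assumptions, not shown in the example: treating fc_score as nn.Conv1d(hdim, num_anchors, 1) and fc_reg as nn.Conv1d(hdim, 2*num_anchors, 1) reproduces the shapes in the comments (and is consistent with the fc_score.weight[:,:,0] indexing in the causal branch).

import torch
import torch.nn as nn

B, seg, hdim, num_anchors = 4, 64, 512, 3
fc_score = nn.Conv1d(hdim, num_anchors, kernel_size=1)    # assumed head
fc_reg = nn.Conv1d(hdim, 2 * num_anchors, kernel_size=1)  # assumed head
mmfeats = torch.randn(B, seg, hdim)
ious1dmask = torch.ones(1, seg * num_anchors)

x = mmfeats.transpose(-1, -2)                             # [B, hdim, seg]
iou = torch.sigmoid(fc_score(x)).transpose(-1, -2)        # [B, seg, num_anchors]
iou = iou.contiguous().view(B, -1) * ious1dmask           # [B, seg*num_anchors]
offset = fc_reg(x).transpose(-1, -2).contiguous().view(B, seg * num_anchors, 2)
print(iou.shape, offset.shape)                            # [4, 192], [4, 192, 2]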
Example No. 2
def extract_data(sample):
    states_tuple = tuple(t.state for t in sample)
    actions_tuple = tuple(t.action for t in sample)
    next_states_tuple = tuple(t.next_state for t in sample)
    rewards_tuple = tuple(t.reward for t in sample)

    compressed_rewards = torch.cat(rewards_tuple, dim=0)
    compressed_states = torch.cat(states_tuple, dim=0).requires_grad_()
    compressed_actions = torch.cat(actions_tuple, dim=0).requires_grad_()
    compressed_next_states = torch.cat(next_states_tuple, dim=0)

    return (F.normalize(compressed_states), F.normalize(compressed_actions),
            F.normalize(compressed_next_states), compressed_rewards)
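
A hedged usage sketch for extract_data. The Transition container is an assumption; any objects with .state/.action/.next_state/.reward tensors of shape [1, dim] (so torch.cat stacks them along dim 0) behave the same way.

from collections import namedtuple
import torch

Transition = namedtuple("Transition", ["state", "action", "next_state", "reward"])
sample = [Transition(torch.randn(1, 8), torch.randn(1, 2),
                     torch.randn(1, 8), torch.randn(1, 1)) for _ in range(32)]
states, actions, next_states, rewards = extract_data(sample)
print(states.shape, rewards.shape)  # [32, 8], [32, 1]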
Example No. 3
    def forward(self, mmfeats, proposals):
        """
        Inputs:
            mmfeats (tensor(B, seg, hdim))
            proposals  (tensor(num_prop, 2))
        Returns:
            iou_predict (tensor(B, seg*num_anchors))
            box_predict (tensor(B, seg*num_anchors, 2))
            ious1dmask (tensor(1, seg*num_anchors))
        """
        B, seg = mmfeats.shape[0], mmfeats.shape[1]
        mmfeats = self.norm(mmfeats.transpose(-1, -2)) # [B, hdim, seg]
        iou_predict = []
        box_offset = []
        
        for k in range(self.num_anchors):
            # Predict Alignment Score
            if self.causal:
                if self.training:
                    self.d = 0.9*self.d + 0.1*mmfeats.detach().mean(0, keepdim=True) # [1, hdim, seg]
                    mmfeats = F.normalize(mmfeats, dim=1)
                    iou_predict.append(
                        self.tau * self.fc_score[k](mmfeats).squeeze(1) / (torch.norm(self.fc_score[k].weight) + self.gamma)
                    )
                else:
                    bias = self.cos_sim(mmfeats, self.d).unsqueeze(1) * F.normalize(self.d, dim=1)
                    mmfeats = F.normalize(mmfeats, dim=1)
                    iou_predict.append(
                        self.tau * self.fc_score[k](mmfeats - self.alpha*bias).squeeze(1) / (torch.norm(self.fc_score[k].weight) + self.gamma)
                    )
            else:
                iou_predict.append(
                    self.fc_score[k](mmfeats).squeeze(1)
                ) # [B, num_prop_width]
            # Predict Box Offset
            box_offset.append(
                self.fc_reg[k](mmfeats)
            ) # [B, 2, num_prop_width]
        # Gather alignment scores across anchors
        iou_predict = torch.cat(iou_predict, dim=1)
        iou_predict = torch.sigmoid(iou_predict) # [B, num_prop]
        # Gather box offsets and add them to the anchor proposals
        box_offset = torch.cat(box_offset, dim=2)
        box_offset = box_offset.transpose(-1, -2)  # [B, num_prop, 2]
        box_anchor = proposals
        box_predict = box_anchor + box_offset # [B, num_prop, 2]

        return iou_predict, box_predict
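
The per-anchor heads are not defined in the snippet. A minimal sketch under the assumption that each fc_score[k]/fc_reg[k] is a strided Conv1d whose kernel spans one anchor width, so coarser anchors emit fewer, wider proposals; the widths and strides here are hypothetical.

import torch
import torch.nn as nn

hdim, seg = 256, 64
widths = [8, 16, 32]  # hypothetical anchor widths
fc_score = nn.ModuleList(nn.Conv1d(hdim, 1, w, stride=w // 2) for w in widths)
fc_reg = nn.ModuleList(nn.Conv1d(hdim, 2, w, stride=w // 2) for w in widths)

mmfeats = torch.randn(2, hdim, seg)                        # after self.norm
scores = [fc_score[k](mmfeats).squeeze(1) for k in range(len(widths))]
print([s.shape[1] for s in scores])                        # proposals per anchor: [15, 7, 3]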
Example No. 4
 def forward(self, semantic_aware_seg_feats, seg_masks):
     """ Perform Regression
     Args:
         semantic_aware_seg_feats: segment-level features; [B,seg,D]
         seg_masks: masks for effective segments in video; [B,seg]
     Returns:
         loc: prediction of normalized time span (t^s, t^e); [B,2]
         att_w: temporal attention weights (o); [B,seg]
     """
     if self.causal:
         # De-confound Training
         semantic_aware_seg_feats = F.normalize(semantic_aware_seg_feats, dim=2)
         summarized_vfeat, att_w = self.tatt(semantic_aware_seg_feats, seg_masks)
         if self.training:
             self.d = 0.9*self.d + 0.1*summarized_vfeat.mean(0, keepdim=True) # [1, hdim]
         else:
             # counterfactual TDE inference
             bias = self.cos_sim(summarized_vfeat, self.d).unsqueeze(1) * self.d # [B, hdim]
             summarized_vfeat = summarized_vfeat - self.alpha*bias
     else:
         # perform Eq. (13) and (14)
         summarized_vfeat, att_w = self.tatt(semantic_aware_seg_feats, seg_masks)
     # perform Eq. (15)
     loc = self.MLP_reg(summarized_vfeat) # loc = [t^s, t^e]
     return loc, att_w
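
The counterfactual TDE step in isolation, assuming cos_sim is nn.CosineSimilarity(dim=1) and d is the running mean feature of shape [1, hdim]: the component of each summarized feature along the confounder direction d is estimated and subtracted.

import torch
import torch.nn as nn

B, hdim, alpha = 4, 256, 1.0
cos_sim = nn.CosineSimilarity(dim=1)
summarized_vfeat = torch.randn(B, hdim)
d = torch.randn(1, hdim)                               # running mean feature
bias = cos_sim(summarized_vfeat, d).unsqueeze(1) * d   # [B, hdim]
debiased = summarized_vfeat - alpha * bias             # remove the direct effect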
Example No. 5
 def forward(self, batch):
     """
     Takes a data batch as input that should contain two word vectors. They are composed two times to get two
     representations. Both composition functions share the transformations but for each representation a different
     weighting is applied. A final representation is constructed using a weighted summation of both representations.
     :param batch: a dictionary
     :return: the final composed phrase, representation 1, representation 2
     """
     device = batch["device"]
     self._representation_1 = self.compose(
         word1=batch["w1"].to(device),
         word2=batch["w2"].to(device),
         combining_tensor=self.combining_tensor_1,
         combining_bias=self.combining_bias_1)
     self._representation_2 = self.compose(
         word1=batch["w1"].to(device),
         word2=batch["w2"].to(device),
         combining_tensor=self.combining_tensor_2,
         combining_bias=self.combining_bias_2)
     self._composed_phrase = self.representation_1 + self.representation_2
     if self.normalize_embeddings:
         self._composed_phrase = F.normalize(self.composed_phrase,
                                             p=2,
                                             dim=1)
     return self.composed_phrase, self.representation_1, self.representation_2
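
For reference, the effect of the final F.normalize call in isolation: with p=2, dim=1, each row of a [B, dim] matrix is rescaled to unit L2 norm (the shapes here are arbitrary assumptions).

import torch
import torch.nn.functional as F

composed = torch.randn(8, 300)                 # a batch of composed phrases
normalized = F.normalize(composed, p=2, dim=1)
print(normalized.norm(dim=1))                  # all ones, up to floating point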
Example No. 6
    def forward(self, x, idx):
        contrast_out = None
        l, ab = torch.split(x, [1, 2], dim=1)

        feat_l, all_feats_l = self.resnet_l(l)
        feat_ab, all_feats_ab = self.resnet_ab(ab)

        # Normalize features
        feat_l = F.normalize(feat_l)
        feat_ab = F.normalize(feat_ab)

        feats = torch.cat([feat_l, feat_ab], 1)
        dense_feats = torch.cat([all_feats_l[-2], all_feats_ab[-2]], 1)

        if self.run_selfsup:
            self.contrast.float()
            contrast_out = self.contrast(feat_l.float(), feat_ab.float(), idx)

        return feats, dense_feats, contrast_out
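
The torch.split call above separates a Lab image batch channel-wise into one luminance channel and two color channels, so each stream gets its own encoder; a quick shape sketch:

import torch

x = torch.randn(4, 3, 224, 224)          # Lab images: [B, 3, H, W]
l, ab = torch.split(x, [1, 2], dim=1)    # [B, 1, H, W] and [B, 2, H, W]
print(l.shape, ab.shape)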
Example No. 7
def video2feats(feat_file, vids, num_pre_clips, dataset_name):
    assert exists(feat_file)
    vid_feats = {}
    with h5py.File(feat_file, 'r') as f:
        for vid in vids:
            if dataset_name == "activitynet":
                feat = f[vid]['c3d_features'][:]
            else:
                feat = f[vid][:]
            feat = F.normalize(torch.from_numpy(feat), dim=1)
            vid_feats[vid] = avgfeats(feat, num_pre_clips)
    return vid_feats
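
avgfeats is not defined in this example. A plausible sketch (an assumption, not the original helper) that mean-pools a [T, dim] clip sequence to a fixed [num_pre_clips, dim] by averaging contiguous chunks:

import torch

def avgfeats(feats, num_pre_clips):
    num_src_clips = feats.size(0)
    idxs = torch.arange(0, num_pre_clips + 1, 1.0) / num_pre_clips * num_src_clips
    idxs = idxs.round().long().clamp(max=num_src_clips - 1)
    meanfeats = []
    for i in range(num_pre_clips):
        s, e = idxs[i], idxs[i + 1]
        # Fall back to a single clip when a chunk would be empty.
        meanfeats.append(feats[s:e].mean(dim=0) if s < e else feats[s])
    return torch.stack(meanfeats)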
Example No. 8
 def compose(self, word1, word2, training):
     composed_phrase = transweigh(
         word1=word1,
         word2=word2,
         transformation_tensor=self.transformation_tensor,
         transformation_bias=self.transformation_bias,
         combining_bias=self.combining_bias,
         combining_tensor=self.combining_tensor,
         dropout_rate=self.dropout_rate,
         training=training)
     if self.normalize_embeddings:
         composed_phrase = F.normalize(composed_phrase, p=2, dim=1)
     return composed_phrase
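
transweigh itself is not shown in these examples. A rough sketch of the transformation-weighting idea, with shapes that are assumptions rather than the original implementation: t shared transformations of the concatenated pair, dropout, then a learned combination into one vector.

import torch
import torch.nn.functional as F

def transweigh(word1, word2, transformation_tensor, transformation_bias,
               combining_tensor, combining_bias, dropout_rate, training):
    uv = torch.cat([word1, word2], dim=1)                 # [B, 2d]
    # transformation_tensor assumed [t, 2d, d]: t transformed views per pair
    transformed = torch.relu(
        torch.einsum("bi,tio->bto", uv, transformation_tensor) + transformation_bias)
    transformed = F.dropout(transformed, p=dropout_rate, training=training)
    # combining_tensor assumed [t, d, d]: weight the t views into one vector
    return torch.einsum("bto,tod->bd", transformed, combining_tensor) + combining_bias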
Example No. 9
 def compose(self, word1, word2):
     """
     This functions composes two input representations with the transformation weighting model. If set to True,
     the composed representation is normalized
     :param word1: the representation of the first word (torch tensor)
     :param word2: the representation of the second word (torch tensor)
     :param training: True if the model should be trained, False if the model is in inference
     :return: the composed vector representation, eventually normalized to unit norm
     """
     composed_phrase = transweigh(
         word1=word1,
         word2=word2,
         transformation_tensor=self.transformation_tensor,
         transformation_bias=self.transformation_bias,
         combining_bias=self.combining_bias,
         combining_tensor=self.combining_tensor,
         dropout_rate=self.dropout_rate,
         training=self.training)
     if self.normalize_embeddings:
         composed_phrase = F.normalize(composed_phrase, p=2, dim=1)
     return composed_phrase
Example No. 10
 def compose(self, word1, word2, combining_tensor, combining_bias):
     """
     This function composes two input representations with the transformation weighting model. If
     `normalize_embeddings` is set to True, the composed representation is normalized.
     :param word1: the representation of the first word (torch tensor)
     :param word2: the representation of the second word (torch tensor)
     :param combining_tensor: the tensor used for weighting the transformed input vectors into one representation
     :param combining_bias: the corresponding bias
     :return: a composed representation
     """
     composed_phrase = transweigh(
         word1=word1,
         word2=word2,
         transformation_tensor=self.transformation_tensor,
         transformation_bias=self.transformation_bias,
         combining_bias=combining_bias,
         combining_tensor=combining_tensor,
         dropout_rate=self.dropout_rate,
         training=self.training)
     if self.normalize_embeddings:
         composed_phrase = F.normalize(composed_phrase, p=2, dim=1)
     return composed_phrase
Example No. 11
 def forward(self, queries, wordlens, map2d):
     queries = self.encode_query(queries, wordlens)[:, :, None, None]
     map2d = self.conv(map2d)
     return F.normalize(queries * map2d)
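
A shape sketch for the broadcast above (the layer internals are assumptions): the encoded query [B, hdim] is reshaped to [B, hdim, 1, 1] so it broadcasts against the [B, hdim, N, N] 2D proposal map, and F.normalize defaults to dim=1, L2-normalizing each fused feature along the channel axis.

import torch
import torch.nn.functional as F

B, hdim, N = 2, 256, 16
queries = torch.randn(B, hdim)[:, :, None, None]  # [B, hdim, 1, 1]
map2d = torch.randn(B, hdim, N, N)                # 2D temporal proposal map
fused = F.normalize(queries * map2d)              # [B, hdim, N, N], unit norm over dim=1
print(fused.shape)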