def forward(self, mmfeats, ious1dmask):
    """
    Score every (segment, anchor) pair and regress its two offsets.

    Inputs:
        mmfeats (tensor[B, seg, hdim])
        ious1dmask (tensor(1, seg*num_anchors)): cast to float and
            multiplied into the flattened scores.
    Returns:
        iou_predict (tensor(B, seg*num_anchors))
        box_offset (tensor(B, seg*num_anchors, 2))

    Side effect: when ``self.causal`` and ``self.training`` are both set,
    ``self.d`` is replaced by ``0.9 * self.d + 0.1 * batch_mean`` where the
    batch mean is taken over dim 0 of the detached, un-normalised input.
    """
    batch_size, num_seg = mmfeats.size(0), mmfeats.size(1)

    if not self.causal:
        # Plain scoring head on the raw features.
        scores = self.fc_score(mmfeats.transpose(-1, -2))
    else:
        if self.training:
            # De-confound training: update the running mean first (from the
            # raw features), then L2-normalise along the feature dim.
            self.d = 0.9 * self.d + 0.1 * mmfeats.detach().mean(0, keepdim=True)
            mmfeats = F.normalize(mmfeats, dim=2)
        else:
            # Counterfactual TDE inference: subtract alpha times the
            # component of the features lying along the running mean.
            bias = self.cos_sim(mmfeats, self.d).unsqueeze(2) * F.normalize(self.d, dim=2)
            mmfeats = F.normalize(mmfeats, dim=2) - self.alpha * bias
        # Both causal modes share the same temperature-scaled head: the raw
        # response is divided by (per-output weight norm + gamma).
        weight_norm = torch.norm(self.fc_score.weight[:, :, 0], dim=1)[None, :, None]
        scores = self.tau * self.fc_score(mmfeats.transpose(-1, -2)) / (weight_norm + self.gamma)

    # Put the anchor axis last, then flatten so anchors vary fastest.
    probs = torch.sigmoid(scores).transpose(-1, -2)
    iou_predict = probs.contiguous().view(batch_size, -1) * ious1dmask.float()

    # Offsets are regressed from the same tensor the scoring head consumed,
    # i.e. the normalised / de-biased features when self.causal is set.
    offsets = self.fc_reg(mmfeats.transpose(-1, -2)).transpose(-1, -2)
    box_offset = offsets.contiguous().view(batch_size, num_seg * self.num_anchors, 2)
    return iou_predict, box_offset
def extract_data(sample):
    """Concatenate the fields of every record in ``sample`` into batch tensors.

    Each element of ``sample`` must expose ``state``, ``action``,
    ``next_state`` and ``reward`` tensor attributes. Every field is
    concatenated along dim 0.

    Returns a 4-tuple ``(states, actions, next_states, rewards)`` where:
      * states and actions are flagged ``requires_grad_()`` before being
        normalised with ``F.normalize`` (default arguments),
      * next_states is normalised the same way without the grad flag,
      * rewards is returned as concatenated, un-normalised.
    """
    def gather(field):
        # One pass over `sample` per field, mirroring a per-field collection.
        return torch.cat(tuple(getattr(record, field) for record in sample), dim=0)

    rewards = gather("reward")
    states = gather("state").requires_grad_()
    actions = gather("action").requires_grad_()
    next_states = gather("next_state")

    return (
        F.normalize(states),
        F.normalize(actions),
        F.normalize(next_states),
        rewards,
    )
def forward(self, mmfeats, proposals):
    """
    Score and refine proposals with one scoring / regression head per anchor.

    Inputs:
        mmfeats (tensor(B, seg, hdim))
        proposals (tensor(num_prop, 2)): anchor boxes the offsets are added to
    Returns:
        iou_predict (tensor(B, num_prop)): sigmoid of the concatenated
            per-anchor scores
        box_predict (tensor(B, num_prop, 2)): proposals + regressed offsets

    Side effect: when ``self.causal`` and ``self.training`` are both set,
    ``self.d`` is updated once per anchor (see the note in the loop).
    """
    feats = self.norm(mmfeats.transpose(-1, -2))  # [B, hdim, seg] per the original annotation

    anchor_scores = []
    anchor_offsets = []
    for k in range(self.num_anchors):
        score_head = self.fc_score[k]

        # --- alignment score for anchor k ---
        if not self.causal:
            score = score_head(feats).squeeze(1)
        else:
            if self.training:
                # NOTE(review): this running-mean update sits inside the
                # anchor loop, so self.d is updated num_anchors times per
                # call; from the second anchor on it averages the already
                # normalised features. Kept as-is; confirm this is intended.
                self.d = 0.9 * self.d + 0.1 * feats.detach().mean(0, keepdim=True)
                feats = F.normalize(feats, dim=1)
                head_input = feats
            else:
                # Counterfactual inference: remove alpha times the component
                # of the features along the running mean before scoring.
                bias = self.cos_sim(feats, self.d).unsqueeze(1) * F.normalize(self.d, dim=1)
                feats = F.normalize(feats, dim=1)
                head_input = feats - self.alpha * bias
            score = self.tau * score_head(head_input).squeeze(1) / (
                torch.norm(score_head.weight) + self.gamma
            )
        anchor_scores.append(score)

        # --- offset regression for anchor k ---
        # Uses `feats` (normalised but NOT de-biased when self.causal is set).
        anchor_offsets.append(self.fc_reg[k](feats))

    iou_predict = torch.sigmoid(torch.cat(anchor_scores, dim=1))
    box_offset = torch.cat(anchor_offsets, dim=2).transpose(-1, -2)
    box_predict = proposals + box_offset
    return iou_predict, box_predict
def forward(self, semantic_aware_seg_feats, seg_masks):
    """ Perform Regression
    Args:
        semantic_aware_seg_feats: segment-level features; [B,seg,D]
        seg_masks: masks for effective segments in video; [B,seg]
    Returns:
        loc: prediction of normalized time span (t^s, t^e); [B,2]
        att_w: temporal attention weights (o); [B,seg]

    Side effect: when ``self.causal`` and ``self.training`` are both set,
    ``self.d`` is replaced by ``0.9 * self.d + 0.1 * batch_mean`` of the
    attended feature. The batch mean is detached so the running statistic
    does not carry autograd history.
    """
    if self.causal:
        # The causal variant attends over L2-normalised segment features.
        semantic_aware_seg_feats = F.normalize(semantic_aware_seg_feats, dim=2)

    # perform Eq. (13) and (14)
    summarized_vfeat, att_w = self.tatt(semantic_aware_seg_feats, seg_masks)

    if self.causal:
        if self.training:
            # De-confound training: update the running mean of the attended
            # feature. `.detach()` keeps self.d a plain statistic; without it
            # self.d would require grad and each update would chain this
            # step's graph onto every previous step's graph.
            self.d = 0.9 * self.d + 0.1 * summarized_vfeat.detach().mean(0, keepdim=True)  # [1, hdim]
        else:
            # counterfactual TDE inference: subtract alpha times the
            # cosine-weighted running mean from the attended feature.
            bias = self.cos_sim(summarized_vfeat, self.d).unsqueeze(1) * self.d  # [1, hdim]
            summarized_vfeat = summarized_vfeat - self.alpha * bias

    # perform Eq. (15)
    loc = self.MLP_reg(summarized_vfeat)  # loc = [t^s, t^e]
    return loc, att_w
def forward(self, batch):
    """
    Compose the two words of a batch twice and sum the two results.

    Both compositions go through ``self.compose`` on the same inputs and
    differ only in the combining tensor / bias pair they receive
    (``combining_tensor_1`` / ``combining_bias_1`` versus
    ``combining_tensor_2`` / ``combining_bias_2``). The results are stored
    on ``self._representation_1`` and ``self._representation_2``; their sum
    is stored on ``self._composed_phrase`` and L2-normalised along dim 1
    when ``self.normalize_embeddings`` is truthy.

    :param batch: a dictionary with the keys "device", "w1" and "w2"
    :return: the final composed phrase, representation 1, representation 2
    """
    device = batch["device"]

    def compose_with(tensor, bias):
        # Inputs are moved to the target device on every call.
        return self.compose(
            word1=batch["w1"].to(device),
            word2=batch["w2"].to(device),
            combinig_tensor=tensor,
            combining_bias=bias,
        )

    self._representation_1 = compose_with(self.combining_tensor_1, self.combining_bias_1)
    self._representation_2 = compose_with(self.combining_tensor_2, self.combining_bias_2)

    # Read back through the public accessors rather than the private fields.
    self._composed_phrase = self.representation_1 + self.representation_2
    if self.normalize_embeddings:
        self._composed_phrase = F.normalize(self.composed_phrase, p=2, dim=1)

    return self.composed_phrase, self.representation_1, self.representation_2
def forward(self, x, idx):
    """Encode the two channel groups of ``x`` with separate backbones.

    ``x`` is split along dim 1 into a 1-channel and a 2-channel part, fed to
    ``self.resnet_l`` and ``self.resnet_ab`` respectively. Each backbone
    must return ``(feature, list_of_intermediate_features)``.

    Returns a 3-tuple:
      * feats: the two normalised features concatenated along dim 1,
      * dense_feats: the second-to-last intermediate feature of each
        backbone, concatenated along dim 1,
      * contrast_out: ``self.contrast(feat_l, feat_ab, idx)`` on float
        inputs when ``self.run_selfsup`` is truthy, otherwise ``None``.
    """
    chan_l, chan_ab = torch.split(x, [1, 2], dim=1)
    feat_l, stages_l = self.resnet_l(chan_l)
    feat_ab, stages_ab = self.resnet_ab(chan_ab)

    # Normalize features (F.normalize defaults)
    feat_l = F.normalize(feat_l)
    feat_ab = F.normalize(feat_ab)

    feats = torch.cat([feat_l, feat_ab], 1)
    dense_feats = torch.cat([stages_l[-2], stages_ab[-2]], 1)

    if not self.run_selfsup:
        return feats, dense_feats, None

    # The contrast module is switched to float before it is called on the
    # float-cast features.
    self.contrast.float()
    contrast_out = self.contrast(feat_l.float(), feat_ab.float(), idx)
    return feats, dense_feats, contrast_out
def video2feats(feat_file, vids, num_pre_clips, dataset_name):
    """Load, normalise and pool the stored features of each requested video.

    Args:
        feat_file: path of an HDF5 file; asserted to exist.
        vids: iterable of keys to look up in the file.
        num_pre_clips: forwarded unchanged to ``avgfeats``.
        dataset_name: when equal to "activitynet" the array is read from the
            'c3d_features' entry under each key; otherwise the key is read
            directly.

    Returns:
        dict mapping each vid to ``avgfeats(normalised_features, num_pre_clips)``,
        where the features are normalised along dim 1 with ``F.normalize``.
    """
    assert exists(feat_file)
    nested_layout = dataset_name == "activitynet"

    pooled = {}
    with h5py.File(feat_file, 'r') as h5:
        for vid in vids:
            entry = h5[vid]
            raw = entry['c3d_features'][:] if nested_layout else entry[:]
            unit = F.normalize(torch.from_numpy(raw), dim=1)
            pooled[vid] = avgfeats(unit, num_pre_clips)
    return pooled
def compose(self, word1, word2, training):
    """Compose two word representations via ``transweigh``.

    :param word1: the representation of the first word
    :param word2: the representation of the second word
    :param training: forwarded to ``transweigh`` as its ``training`` flag
    :return: the composed representation, L2-normalised along dim 1 when
        ``self.normalize_embeddings`` is truthy
    """
    phrase = transweigh(
        word1=word1,
        word2=word2,
        transformation_tensor=self.transformation_tensor,
        transformation_bias=self.transformation_bias,
        combining_bias=self.combining_bias,
        combining_tensor=self.combining_tensor,
        dropout_rate=self.dropout_rate,
        training=training,
    )
    if not self.normalize_embeddings:
        return phrase
    return F.normalize(phrase, p=2, dim=1)
def compose(self, word1, word2):
    """
    Compose two input representations with the transformation weighting
    model, using the parameters stored on this instance. The model's own
    ``self.training`` flag is forwarded to ``transweigh``.

    :param word1: the representation of the first word (torch tensor)
    :param word2: the representation of the second word (torch tensor)
    :return: the composed vector representation, normalized to unit L2 norm
        along dim 1 if ``self.normalize_embeddings`` is set
    """
    raw_phrase = transweigh(
        word1=word1,
        word2=word2,
        transformation_tensor=self.transformation_tensor,
        transformation_bias=self.transformation_bias,
        combining_bias=self.combining_bias,
        combining_tensor=self.combining_tensor,
        dropout_rate=self.dropout_rate,
        training=self.training,
    )
    return (
        F.normalize(raw_phrase, p=2, dim=1)
        if self.normalize_embeddings
        else raw_phrase
    )
def compose(self, word1, word2, combinig_tensor, combining_bias):
    """
    Compose two input representations with the transformation weighting
    model, using a caller-supplied combining tensor and bias. The
    transformation tensor / bias, dropout rate and training flag are read
    from this instance.

    :param word1: the representation of the first word (torch tensor)
    :param word2: the representation of the second word (torch tensor)
    :param combinig_tensor: the tensor used for weighting the transformed
        input vectors into one representation (the parameter name is kept
        as spelled, since it may be passed by keyword)
    :param combining_bias: the corresponding bias
    :return: the composed representation, normalized to unit L2 norm along
        dim 1 if ``self.normalize_embeddings`` is set
    """
    result = transweigh(
        word1=word1,
        word2=word2,
        transformation_tensor=self.transformation_tensor,
        transformation_bias=self.transformation_bias,
        combining_bias=combining_bias,
        combining_tensor=combinig_tensor,
        dropout_rate=self.dropout_rate,
        training=self.training,
    )
    if self.normalize_embeddings:
        return F.normalize(result, p=2, dim=1)
    return result
def forward(self, queries, wordlens, map2d):
    """Fuse an encoded query with a convolved 2D map.

    The output of ``self.encode_query(queries, wordlens)`` gets two trailing
    singleton axes so that it broadcasts against ``self.conv(map2d)``. The
    element-wise product is then normalised with ``F.normalize`` defaults
    (L2 along dim 1).

    Returns the normalised product.
    """
    encoded = self.encode_query(queries, wordlens)
    query_map = encoded[:, :, None, None]
    conv_map = self.conv(map2d)
    fused = query_map * conv_map
    return F.normalize(fused)