def forward(self, meta_parameter: TensorList, feat, label, sample_weight=None):
    """Return the data residual and the regularization residual.

    args:
        meta_parameter: TensorList whose first element is the filter.
            Assumes multiple filters, i.e.
            (sequences, filters, feat_dim, fH, fW).
        feat: Feature maps. The first dimension is the number of images;
            when the tensor is 5-D the second dimension is taken as the
            number of sequences.
        label: Target values; viewed to the shape of the filter response.
        sample_weight: Optional weighting of the data residual. ``None``
            selects the constant sqrt(1 / num_images). A tensor with as
            many elements as the scores is viewed to the score shape; a
            1-D tensor is viewed to (-1, 1, 1, 1, 1). Anything else is
            used unchanged.

    returns:
        TensorList([data_residual, reg_residual]).
    """
    weights = meta_parameter[0]

    n_images = feat.shape[0]
    n_sequences = feat.shape[1] if feat.dim() == 5 else 1

    # Filter response on the input features
    scores = filter_layer.apply_filter(
        feat, weights, dilation_factors=self.filter_dilation_factors)

    # Bring the sample weight into a form that broadcasts with the scores
    weight_is_tensor = isinstance(sample_weight, torch.Tensor)
    if sample_weight is None:
        sample_weight = math.sqrt(1.0 / n_images)
    elif weight_is_tensor and sample_weight.numel() == scores.numel():
        sample_weight = sample_weight.view(scores.shape)
    elif weight_is_tensor and sample_weight.dim() == 1:
        sample_weight = sample_weight.view(-1, 1, 1, 1, 1)

    target = label.view(scores.shape)
    data_residual = sample_weight * (scores - target)

    # Regularization residual. Put batch in second dimension
    reg_residual = self.filter_reg * weights.view(1, n_sequences, -1)

    return TensorList([data_residual, reg_residual])
def regress(self, weights, feat):
    """Apply the regressor (filter) to the features and rectify the output.

    args:
        weights: Filter weights passed to ``filter_layer.apply_filter``.
        feat: Features the filter is applied to.

    returns:
        The filter response after ``torch.relu``.
    """
    return torch.relu(filter_layer.apply_filter(feat, weights))
def classify(self, weights, feat):
    """Apply the classifier (filter) to the features.

    args:
        weights: Filter weights passed to ``filter_layer.apply_filter``.
        feat: Features the filter is applied to.

    returns:
        The filter response, passed through ``self.output_activation``
        when that attribute is not None.
    """
    raw_scores = filter_layer.apply_filter(feat, weights)

    # No activation configured: hand back the raw response
    if self.output_activation is None:
        return raw_scores

    return self.output_activation(raw_scores)
def track_frame(self, filter_weights, backbone_feat):
    """Compute filter scores for backbone features.

    args:
        filter_weights: Filter weights applied to the extracted features.
        backbone_feat: Backbone features. A 5-D tensor has its leading
            dimensions flattened (keeping the last three) and its second
            dimension is forwarded as the number of sequences; otherwise
            the number of sequences is passed as None.

    returns:
        The response of ``filter_layer.apply_filter`` on the
        classification features.
    """
    num_sequences = None
    if backbone_feat.dim() == 5:
        num_sequences = backbone_feat.shape[1]
        trailing_dims = backbone_feat.shape[-3:]
        backbone_feat = backbone_feat.reshape(-1, *trailing_dims)

    test_feat = self.extract_classification_feat(backbone_feat, num_sequences)
    return filter_layer.apply_filter(test_feat, filter_weights)
def forward(self, meta_parameter: TensorList, feat, bb, sample_weight=None, is_distractor=None):
    """Return the data residual and the regularization residual.

    Labels, target masks and spatial weights are predicted from a distance
    map around the box centers.

    args:
        meta_parameter: TensorList whose first element is the filter.
        feat: Feature maps. The first dimension is the number of images;
            when the tensor is 5-D the second dimension is taken as the
            number of sequences.
        bb: Boxes whose last dimension holds a position followed by a size;
            the center is computed as ``bb[..., :2] + bb[..., 2:] / 2`` and
            divided by ``self.feat_stride``.
        sample_weight: Optional per-sample weight. ``None`` selects the
            constant sqrt(1 / num_images). A tensor is square-rooted and
            multiplied with the spatial weight: if it has exactly
            num_images * num_sequences elements it is laid out as
            (num_images, num_sequences, 1, 1); otherwise it is reshaped to
            (-1, 1, 1, 1) and relies on broadcasting. Other values are used
            unchanged.
        is_distractor: Optional boolean mask; the centers of the selected
            entries are moved to 99999 before the distance map is built.

    returns:
        TensorList([data_residual, reg_residual]).
    """
    weights = meta_parameter[0]

    num_images = feat.shape[0]
    num_sequences = feat.shape[1] if feat.dim() == 5 else 1

    # Compute scores
    scores = filter_layer.apply_filter(feat, weights)

    # Compute distance map. flip() swaps the two center coordinates and
    # returns a copy, so the in-place write below does not touch ``bb``.
    center = ((bb[..., :2] + bb[..., 2:] / 2) / self.feat_stride).reshape(-1, 2).flip((1,))
    if is_distractor is not None:
        center[is_distractor.reshape(-1), :] = 99999
    dist_map = self.distance_map(center, scores.shape[-2:])

    # Compute label map masks and weight
    map_shape = (num_images, num_sequences, dist_map.shape[-2], dist_map.shape[-1])
    label_map = self.label_map_predictor(dist_map).reshape(map_shape)
    target_mask = self.target_mask_predictor(dist_map).reshape(map_shape)
    spatial_weight = self.spatial_weight_predictor(dist_map).reshape(map_shape)

    if sample_weight is None:
        sample_weight = math.sqrt(1.0 / num_images) * spatial_weight
    elif isinstance(sample_weight, torch.Tensor):
        # A flat (-1, 1, 1, 1) layout only broadcasts correctly against the
        # (images, sequences, H, W) spatial weight when there is a single
        # sequence. Use the explicit layout whenever one weight per
        # (image, sequence) pair was supplied.
        if sample_weight.numel() == num_images * num_sequences:
            weight_shape = (num_images, num_sequences, 1, 1)
        else:
            weight_shape = (-1, 1, 1, 1)
        sample_weight = sample_weight.sqrt().reshape(weight_shape) * spatial_weight

    # Compute data residual
    scores_act = self.score_activation(scores, target_mask)
    data_residual = sample_weight * (scores_act - label_map)

    # Compute regularization residual. Put batch in second dimension
    reg_residual = self.filter_reg * weights.reshape(1, num_sequences, -1)

    return TensorList([data_residual, reg_residual])
def classify(self, weights, feat):
    """Apply the classifier (filter) to the features.

    args:
        weights: Filter weights passed to ``filter_layer.apply_filter``.
        feat: Features the filter is applied to.

    returns:
        The unmodified filter response.
    """
    return filter_layer.apply_filter(feat, weights)
def forward(self, weights, feat, bb, sample_weight=None, num_iter=None, compute_losses=True):
    """Run the iterative filter optimizer. [] denotes an optional dimension.

    args:
        weights: Initial weights. Dims (sequences, feat_dim, wH, wW).
        feat: Input feature maps. Dims (images_in_sequence, [sequences], feat_dim, H, W).
        bb: Target bounding boxes (x, y, w, h) in the image coords.
            Dims (images_in_sequence, [sequences], 4).
        sample_weight: Optional weight for each sample.
            Dims: (images_in_sequence, [sequences]).
        num_iter: Number of iterations to run; ``self.num_iter`` when None.
        compute_losses: Whether to compute the (train) loss in each iteration.

    returns:
        weights: The final optimized weights.
        weight_iterates: The weights computed in each iteration
            (including initial input and final output).
        losses: Train losses.
    """
    # Sizes
    n_iter = self.num_iter if num_iter is None else num_iter
    n_images = feat.shape[0]
    n_sequences = feat.shape[1] if feat.dim() == 5 else 1
    filt_h, filt_w = weights.shape[-2], weights.shape[-1]
    filter_sz = (filt_h, filt_w)
    # The output grows by one along each dimension where the filter size is even
    output_sz = (feat.shape[-2] + (filt_h + 1) % 2,
                 feat.shape[-1] + (filt_w + 1) % 2)

    # Learnable scalars
    step_scale = torch.exp(self.log_step_length)
    reg = (self.filter_reg * self.filter_reg).clamp(min=self.min_filter_reg**2)

    # Distance map around the box centers (coordinate order flipped),
    # shifted by half a cell along odd filter dimensions
    dmap_offset = (torch.Tensor(filter_sz).to(bb.device) % 2) / 2.0
    box_center = (bb[..., :2] + bb[..., 2:] / 2) / self.feat_stride
    center = box_center.view(-1, 2).flip((1, )) - dmap_offset
    dist_map = self.distance_map(center, output_sz)

    # Label map, target mask and spatial weight predicted from the distances
    map_hw = dist_map.shape[-2:]
    label_map = self.label_map_predictor(dist_map).view(n_images, n_sequences, *map_hw)
    target_mask = self.target_mask_predictor(dist_map).view(n_images, n_sequences, *map_hw)
    spatial_weight = self.spatial_weight_predictor(dist_map).view(n_images, n_sequences, *map_hw)

    # Total sample weights
    if sample_weight is None:
        sample_weight = math.sqrt(1.0 / n_images) * spatial_weight
    elif isinstance(sample_weight, torch.Tensor):
        per_sample = sample_weight.sqrt().view(n_images, n_sequences, 1, 1)
        sample_weight = per_sample * spatial_weight

    weight_iterates = [weights]
    losses = []

    for it in range(n_iter):
        # Periodically cut the graph through the iterates
        if it > 0 and it % self.detach_length == 0:
            weights = weights.detach()

        # Residuals
        scores = filter_layer.apply_filter(feat, weights)
        scores_act = self.score_activation(scores, target_mask)
        score_mask = self.score_activation_deriv(scores, target_mask)
        residuals = sample_weight * (scores_act - label_map)

        if compute_losses:
            data_term = (residuals**2).sum()
            reg_term = reg * (weights**2).sum()
            losses.append((data_term + reg_term) / n_sequences)

        # Gradient
        residuals_mapped = score_mask * (sample_weight * residuals)
        weights_grad = filter_layer.apply_feat_transpose(
            feat, residuals_mapped, filter_sz, training=self.training) + reg * weights

        # Map the gradient with the Jacobian
        scores_grad = filter_layer.apply_filter(feat, weights_grad)
        scores_grad = sample_weight * (score_mask * scores_grad)

        # Step length: squared gradient norm over its mapped squared norm
        alpha_num = (weights_grad * weights_grad).sum(dim=(1, 2, 3))
        mapped_sq = (scores_grad * scores_grad).view(n_images, n_sequences, -1).sum(dim=(0, 2))
        alpha_den = (mapped_sq + reg * alpha_num).clamp(1e-8)
        alpha = alpha_num / alpha_den

        # Update filter and record the iterate
        weights = weights - (step_scale * alpha.view(-1, 1, 1, 1)) * weights_grad
        weight_iterates.append(weights)

    if compute_losses:
        # Loss of the final iterate
        scores = filter_layer.apply_filter(feat, weights)
        scores = self.score_activation(scores, target_mask)
        data_term = ((sample_weight * (scores - label_map))**2).sum()
        reg_term = reg * (weights**2).sum()
        losses.append((data_term + reg_term) / n_sequences)

    return weights, weight_iterates, losses
def forward(self, weights, feat, bb, radius=0, dim=4, sample_weight=None, num_iter=None, compute_losses=True):
    """Run the iterative optimizer for the offset-map regressor.

    args:
        weights: Initial weights. The update below treats them as 5-D,
            e.g. (sequences, dim, feat_dim, wH, wW).
        feat: Input feature maps. Dims (images_in_sequence, [sequences], feat_dim, H, W).
        bb: Target bounding boxes (x, y, w, h) in the image coords.
            Dims (images_in_sequence, [sequences], 4).
        radius: The size of vicinity of the target center.
        dim: Dims of offset maps, default is 4, indicating the distance from
            the center to four sides of the target.
        sample_weight: Optional weight for each sample. ``None`` selects the
            constant sqrt(1 / num_images); a tensor is square-rooted and
            viewed as (num_images, num_sequences, 1, 1, 1).
        num_iter: Number of iterations to run; ``self.num_iter`` when None.
        compute_losses: Whether to compute the (train) loss in each iteration.

    returns:
        weights: The final optimized weights.
        weight_iterates: The weights computed in each iteration
            (including initial input and final output).
        losses: Train losses.
    """
    # Sizes
    n_iter = self.num_iter if num_iter is None else num_iter
    n_images = feat.shape[0]
    n_sequences = feat.shape[1] if feat.dim() == 5 else 1
    filt_h, filt_w = weights.shape[-2], weights.shape[-1]
    filter_sz = (filt_h, filt_w)
    # The output grows by one along each dimension where the filter size is even
    output_sz = (feat.shape[-2] + (filt_h + 1) % 2,
                 feat.shape[-1] + (filt_w + 1) % 2)

    # Learnable scalars
    step_scale = torch.exp(self.log_step_length)
    reg = (self.filter_reg * self.filter_reg).clamp(min=self.min_filter_reg**2)

    # Regression targets and the mask selecting where they apply,
    # e.g. shape (num_images, num_sequences, 4, 72, 72)
    w2h2_label, label_mask = self.generate_w2h2_label(
        bb, n_images, n_sequences, radius=radius, output_sz=output_sz, dim=dim)
    # The targets never receive gradients
    target = w2h2_label.detach()

    # Total sample weights
    if sample_weight is None:
        sample_weight = math.sqrt(1.0 / n_images)
    elif isinstance(sample_weight, torch.Tensor):
        sample_weight = sample_weight.sqrt().view(n_images, n_sequences, 1, 1, 1)

    weight_iterates = [weights]
    losses = []

    for it in range(n_iter):
        # Periodically cut the graph through the iterates
        if it > 0 and it % self.detach_length == 0:
            weights = weights.detach()

        # Residuals
        scores = filter_layer.apply_filter(feat, weights)
        residuals = sample_weight * label_mask * (scores - target)

        if compute_losses:
            losses.append((residuals**2).mean())

        # Gradient
        residuals_mapped = sample_weight * residuals
        weights_grad = filter_layer.apply_feat_transpose(
            feat, residuals_mapped, filter_sz, training=self.training) + reg * weights

        # Map the gradient with the Jacobian
        scores_grad = filter_layer.apply_filter(feat, weights_grad)
        scores_grad = sample_weight * scores_grad

        # Step length: squared gradient norm over its mapped squared norm
        alpha_num = (weights_grad * weights_grad).view(n_sequences, -1).sum(dim=1)
        mapped_sq = (scores_grad * scores_grad).view(n_images, n_sequences, -1).sum(dim=(0, 2))
        alpha_den = (mapped_sq + reg * alpha_num).clamp(1e-8)
        alpha = alpha_num / alpha_den

        # Update filter and record the iterate
        weights = weights - (step_scale * alpha.view(-1, 1, 1, 1, 1)) * weights_grad
        weight_iterates.append(weights)

    if compute_losses:
        # Loss of the final iterate
        scores = filter_layer.apply_filter(feat, weights)
        losses.append(((sample_weight * label_mask * (scores - target))**2).mean())

    return weights, weight_iterates, losses
def forward(self, weights, feat, bb, sample_weight=None, num_iter=None, compute_losses=True):
    """Run the iterative filter optimizer. [] denotes an optional dimension.

    The scores are normalised with a (regularised) softmax and compared to
    a label density built around the box centers.

    args:
        weights: Initial weights. Dims (sequences, feat_dim, wH, wW).
        feat: Input feature maps. Dims (images_in_sequence, [sequences], feat_dim, H, W).
        bb: Target bounding boxes (x, y, w, h) in the image coords.
            Dims (images_in_sequence, [sequences], 4).
        sample_weight: Optional weight for each sample.
            Dims: (images_in_sequence, [sequences]).
        num_iter: Number of iterations to run; ``self.num_iter`` when None.
        compute_losses: Whether to compute the (train) loss in each iteration.

    returns:
        weights: The final optimized weights.
        weight_iterates: The weights computed in each iteration
            (including initial input and final output).
        losses: Train losses.
    """
    # Sizes
    n_iter = self.num_iter if num_iter is None else num_iter
    n_images = feat.shape[0]
    n_sequences = feat.shape[1] if feat.dim() == 5 else 1
    filt_h, filt_w = weights.shape[-2], weights.shape[-1]
    filter_sz = (filt_h, filt_w)
    # The output grows by one along each dimension where the filter size is even
    output_sz = (feat.shape[-2] + (filt_h + 1) % 2,
                 feat.shape[-1] + (filt_w + 1) % 2)

    # Learnable scalars
    step_scale = torch.exp(self.log_step_length)
    reg = (self.filter_reg * self.filter_reg).clamp(min=self.min_filter_reg**2)

    # Label density around the box centers (coordinate order flipped),
    # shifted by half a cell along odd filter dimensions
    offset = (torch.Tensor(filter_sz).to(bb.device) % 2) / 2.0
    box_center = (bb[..., :2] + bb[..., 2:] / 2) / self.feat_stride
    center = box_center.flip((-1,)) - offset
    label_density = self.get_label_density(center, output_sz)

    # Total sample weights
    if sample_weight is None:
        sample_weight = torch.Tensor([1.0 / n_images]).to(feat.device)
    elif isinstance(sample_weight, torch.Tensor):
        sample_weight = sample_weight.reshape(n_images, n_sequences, 1, 1)

    # Constant added inside the log-partition term; zero without regularisation
    exp_reg = 0 if self.softmax_reg is None else math.exp(self.softmax_reg)

    def _compute_loss(scores, weights):
        # Weighted (log-partition minus expected score) plus weight penalty
        log_partition = torch.log(scores.exp().sum(dim=(-2, -1)) + exp_reg)
        expected_score = (label_density * scores).sum(dim=(-2, -1))
        flat_weight = sample_weight.reshape(sample_weight.shape[0], -1)
        data_term = torch.sum(flat_weight * (log_partition - expected_score)) / n_sequences
        reg_term = reg * (weights ** 2).sum() / n_sequences
        return data_term + reg_term

    weight_iterates = [weights]
    losses = []

    for it in range(n_iter):
        # Periodically cut the graph through the iterates
        if it > 0 and it % self.detach_length == 0:
            weights = weights.detach()

        # "Residuals": softmax of the scores minus the label density
        scores = filter_layer.apply_filter(feat, weights)
        flat_scores = scores.reshape(n_images, n_sequences, -1)
        scores_softmax = activation.softmax_reg(
            flat_scores, dim=2, reg=self.softmax_reg).reshape(scores.shape)
        res = sample_weight * (scores_softmax - label_density)

        if compute_losses:
            losses.append(_compute_loss(scores, weights))

        # Gradient
        weights_grad = filter_layer.apply_feat_transpose(
            feat, res, filter_sz, training=self.training) + reg * weights

        # Map the gradient with the Hessian
        scores_grad = filter_layer.apply_filter(feat, weights_grad)
        sm_scores_grad = scores_softmax * scores_grad
        sm_grad_total = torch.sum(sm_scores_grad, dim=(-2, -1), keepdim=True)
        hes_scores_grad = sm_scores_grad - scores_softmax * sm_grad_total
        grad_hes_grad = (scores_grad * hes_scores_grad).reshape(
            n_images, n_sequences, -1).sum(dim=2).clamp(min=0)
        flat_weight = sample_weight.reshape(sample_weight.shape[0], -1)
        grad_hes_grad = (flat_weight * grad_hes_grad).sum(dim=0)

        # Step length
        alpha_num = (weights_grad * weights_grad).sum(dim=(1, 2, 3))
        alpha_den = (grad_hes_grad + (reg + self.alpha_eps) * alpha_num).clamp(1e-8)
        alpha = alpha_num / alpha_den

        # Update filter and record the iterate
        weights = weights - (step_scale * alpha.reshape(-1, 1, 1, 1)) * weights_grad
        weight_iterates.append(weights)

    if compute_losses:
        # Loss of the final iterate
        scores = filter_layer.apply_filter(feat, weights)
        losses.append(_compute_loss(scores, weights))

    return weights, weight_iterates, losses
def forward(self, filter, feat, label, compute_losses=True, sample_weight=None, num_iter=None, train_bb=None, is_distractor=None, test_feat=None, test_label=None, test_anno=None):
    """Run the iterative filter optimizer and collect train/test losses.

    args:
        filter: Initial filter. Its first dimension indexes the filters that
            each get their own step length; the last two are the filter size.
        feat: Training features. The first dimension is the number of
            images; when the tensor is 5-D the second dimension is taken as
            the number of sequences.
        label: Only its last two dimensions are used, as the size of the
            distance map.
        compute_losses: Whether to record losses.
        sample_weight: Optional per-sample weight. ``None`` selects
            1 / num_images. A tensor with exactly num_images * num_sequences
            elements is laid out as (num_images, num_sequences, 1, 1);
            otherwise it is viewed as (-1, 1, 1, 1) and relies on
            broadcasting. Both are multiplied by the squared spatial weight.
        num_iter: Number of iterations; ``self.num_iter`` when None.
        train_bb: Training boxes (required). The center is computed as
            ``train_bb[..., :2] + train_bb[..., 2:] / 2``.
        is_distractor: Optional boolean mask; the centers of the selected
            entries are moved to 99999 before the distance map is built.
        test_feat, test_label, test_anno: Optional test data forwarded to
            ``self._compute_test_loss`` when ``test_feat`` is not None.

    returns:
        filter: The final filter.
        losses: Dict with the lists 'train' and 'test'.

    raises:
        ValueError: If ``train_bb`` is None.
    """
    # train_bb defaults to None only to keep the keyword signature; the label
    # maps cannot be built without it, so fail with a clear message.
    if train_bb is None:
        raise ValueError("train_bb must be provided to compute the label maps")

    if num_iter is None:
        num_iter = self.num_iter

    num_images = feat.shape[0]
    num_sequences = feat.shape[1] if feat.dim() == 5 else 1
    filter_sz = (filter.shape[-2], filter.shape[-1])

    # Learnable scalars
    step_length = torch.exp(self.log_step_length)
    reg_weight = self.filter_reg * self.filter_reg

    # Compute distance map. flip() swaps the two center coordinates and
    # returns a copy, so the in-place write below does not touch train_bb.
    center = ((train_bb[..., :2] + train_bb[..., 2:] / 2) / self.feat_stride).view(-1, 2).flip((1,))
    if is_distractor is not None:
        center[is_distractor.view(-1), :] = 99999
    dist_map = self.distance_map(center, label.shape[-2:])

    # Compute label map masks and weight
    map_shape = (num_images, num_sequences, dist_map.shape[-2], dist_map.shape[-1])
    label_map = self.label_map_predictor(dist_map).view(map_shape)
    target_mask = self.target_mask_predictor(dist_map).view(map_shape)
    spatial_weight = self.spatial_weight_predictor(dist_map).view(map_shape)

    background_mask = 1.0 - target_mask

    if sample_weight is None:
        sample_weight = (1.0 / feat.shape[0]) * (spatial_weight * spatial_weight)
    elif isinstance(sample_weight, torch.Tensor):
        # A flat (-1, 1, 1, 1) layout only broadcasts correctly against the
        # (images, sequences, H, W) spatial weight when there is a single
        # sequence. Use the explicit layout whenever one weight per
        # (image, sequence) pair was supplied.
        if sample_weight.numel() == num_images * num_sequences:
            weight_shape = (num_images, num_sequences, 1, 1)
        else:
            weight_shape = (-1, 1, 1, 1)
        sample_weight = sample_weight.view(weight_shape) * (spatial_weight * spatial_weight)

    losses = {'train': [], 'test': []}

    for _ in range(num_iter):
        # Compute gradient. Scores are rectified outside the target region.
        scores = filter_layer.apply_filter(feat, filter)
        scores = target_mask * scores + background_mask * F.relu(scores)
        score_mask = (scores.detach() > 0).float() * background_mask + target_mask
        residuals = sample_weight * (scores - label_map)
        filter_grad = filter_layer.apply_feat_transpose(
            feat, residuals, filter_sz, training=self.training) + reg_weight * filter

        # Map the gradient
        scores_grad = filter_layer.apply_filter(feat, filter_grad)
        scores_grad = sample_weight * (score_mask * scores_grad)
        filter_q = filter_layer.apply_feat_transpose(
            feat, scores_grad, filter_sz, training=self.training) + reg_weight * filter_grad

        # Compute step length
        alpha_num = (filter_grad * filter_grad).view(filter.shape[0], -1).sum(dim=1)
        alpha_den = (filter_grad * filter_q).view(filter.shape[0], -1).sum(dim=1).abs().clamp(1e-4)
        alpha = alpha_num / alpha_den

        # Update filter
        filter = filter - (step_length * alpha.view(-1, 1, 1, 1)) * filter_grad

        if compute_losses:
            # The train loss uses the scores computed before this update;
            # the test loss is evaluated with the updated filter.
            losses['train'].append((sample_weight * (scores - label_map)**2).mean())
            if test_feat is not None:
                losses['test'].append(
                    self._compute_test_loss(filter, test_feat, test_label, test_anno))

    if compute_losses:
        # Losses of the final filter
        scores = filter_layer.apply_filter(feat, filter)
        scores = target_mask * scores + background_mask * F.relu(scores)
        losses['train'].append((sample_weight * (scores - label_map)**2).mean())
        if test_feat is not None:
            losses['test'].append(
                self._compute_test_loss(filter, test_feat, test_label, test_anno))

    return filter, losses
def _compute_test_loss(self, filter, feat, label, target_bb=None):
    """Evaluate ``self.test_loss`` on the filter response.

    args:
        filter: Filter applied to the features.
        feat: Features the filter is applied to.
        label: Forwarded to ``self.test_loss`` as the second argument.
        target_bb: Forwarded to ``self.test_loss`` as the third argument.

    returns:
        The value returned by ``self.test_loss``.
    """
    response = filter_layer.apply_filter(feat, filter)
    return self.test_loss(response, label, target_bb)
def apply_target_model(self, weights, feat):
    """Apply the target model to obtain the mask encodings.

    args:
        weights: Filter weights of the target model.
        feat: Features the filter is applied to.

    returns:
        The response of ``filter_layer.apply_filter`` using
        ``self.filter_dilation_factors`` as dilation factors.
    """
    return filter_layer.apply_filter(
        feat, weights, dilation_factors=self.filter_dilation_factors)