def extract_transformed(self, im, dp, pos, scale, image_sz, transforms):
    """Extract features from a set of transformed image samples.
    args:
        im: Image.
        dp: Depth map.
        pos: Center position for extraction.
        scale: Image scale to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        transforms: A set of image transforms to apply.
    """
    # Get image and depth patches
    im_patch, dp_patch, _ = sample_patch(im, dp, pos, scale * image_sz, image_sz)

    # Apply transforms to both modalities
    im_patches = torch.cat([T(im_patch) for T in transforms])
    dp_patches = torch.cat([T(dp_patch) for T in transforms])

    # Compute features
    feature_map = TensorList([f.get_feature(im_patches, dp_patches) for f in self.features]).unroll()

    return feature_map
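# --- Example: batching transformed patches (illustrative sketch, not from the repo) ---
# Each transform maps a (1, C, H, W) patch to another (1, C, H, W) patch; torch.cat
# along dim 0 builds a single batch that every feature extractor consumes at once.
import torch

im_patch = torch.rand(1, 3, 288, 288)          # stand-in for a sample_patch() output
transforms = [lambda x: x,                     # identity
              lambda x: x.flip(-1)]            # horizontal flip
im_patches = torch.cat([T(im_patch) for T in transforms])
print(im_patches.shape)                        # torch.Size([2, 3, 288, 288])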
def generate_init_samples(self, im: torch.Tensor, target_pos, sample_scale) -> TensorList:
    """Generate augmented initial samples."""

    # Compute augmentation size
    aug_expansion_factor = getattr(self.params, 'augmentation_expansion_factor', None)
    aug_expansion_sz = self.img_sample_sz.clone()
    aug_output_sz = None
    if aug_expansion_factor is not None and aug_expansion_factor != 1:
        aug_expansion_sz = (self.img_sample_sz * aug_expansion_factor).long()
        aug_expansion_sz += (aug_expansion_sz - self.img_sample_sz.long()) % 2
        aug_expansion_sz = aug_expansion_sz.float()
        aug_output_sz = self.img_sample_sz.long().tolist()

    # Random shift operator (always None here: no random shifts are applied)
    get_rand_shift = lambda: None

    # Create transformations
    self.transforms = [augmentation.Identity(aug_output_sz)]
    if 'shift' in self.params.augmentation_method:
        self.transforms.extend([augmentation.Translation(shift, aug_output_sz)
                                for shift in self.params.augmentation_method['shift']])
    if 'relativeshift' in self.params.augmentation_method:
        get_absolute = lambda shift: (torch.Tensor(shift) * self.img_sample_sz / 2).long().tolist()
        self.transforms.extend([augmentation.Translation(get_absolute(shift), aug_output_sz)
                                for shift in self.params.augmentation_method['relativeshift']])
    if 'fliplr' in self.params.augmentation_method and self.params.augmentation_method['fliplr']:
        self.transforms.append(augmentation.FlipHorizontal(aug_output_sz, get_rand_shift()))
    if 'blur' in self.params.augmentation_method:
        self.transforms.extend([augmentation.Blur(sigma, aug_output_sz, get_rand_shift())
                                for sigma in self.params.augmentation_method['blur']])
    if 'scale' in self.params.augmentation_method:
        self.transforms.extend([augmentation.Scale(scale_factor, aug_output_sz, get_rand_shift())
                                for scale_factor in self.params.augmentation_method['scale']])
    if 'rotate' in self.params.augmentation_method:
        self.transforms.extend([augmentation.Rotate(angle, aug_output_sz, get_rand_shift())
                                for angle in self.params.augmentation_method['rotate']])

    # Extract the base sample and apply the transforms
    init_sample = sample_patch(im, target_pos, sample_scale * aug_expansion_sz, aug_expansion_sz)
    init_samples = torch.cat([T(init_sample) for T in self.transforms])
    if not self.params.augmentation:
        init_samples = init_samples[0:1, ...]

    return init_samples
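# --- Example: an augmentation_method dict (illustrative sketch, values are made up) ---
# The keys below are the ones generate_init_samples consumes: 'shift' entries are
# absolute pixel shifts, 'relativeshift' entries are fractions of the sample size
# (scaled by img_sample_sz / 2 above), 'blur' holds Gaussian sigmas, 'scale' holds
# scale factors, and 'rotate' holds angles.
augmentation_method = {
    'fliplr': True,
    'shift': [[8, 8], [-8, -8]],
    'relativeshift': [[0.25, 0.25], [-0.25, -0.25]],
    'blur': [(2, 0.2), (0.2, 2)],
    'scale': [0.95, 1.05],
    'rotate': [5, -5],
}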
def extract(self, im, pos, scales, image_sz, return_patches=False):
    """Extract features.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        return_patches: If True, also return the extracted image patches.
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches and their coordinates in the original image
    patch_iter, coord_iter = zip(*(sample_patch(im, pos, s * image_sz, image_sz,
                                                mode=self.patch_mode) for s in scales))
    im_patches = torch.cat(list(patch_iter))
    patch_coords = torch.cat(list(coord_iter))

    # Compute features
    feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    if return_patches:
        return feature_map, patch_coords, im_patches
    return feature_map, patch_coords
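# --- Example: splitting per-scale (patch, coord) pairs (illustrative sketch) ---
# The zip(*...) idiom above separates the (patch, coordinate) tuples returned for
# each scale into two sequences, which are then batched with torch.cat. Stand-in
# data shown here; fake_sample_patch is a hypothetical placeholder for sample_patch.
import torch

def fake_sample_patch(s):
    return torch.rand(1, 3, 64, 64), torch.tensor([[0, 0, 64 * s, 64 * s]])

patch_iter, coord_iter = zip(*(fake_sample_patch(s) for s in [0.97, 1.0, 1.03]))
im_patches = torch.cat(list(patch_iter))     # (3, 3, 64, 64): one patch per scale
patch_coords = torch.cat(list(coord_iter))   # (3, 4): crop coordinates per scale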
def extract_transformed(self, im, pos, scale, image_sz, transforms):
    """Extract features from a set of transformed image samples.
    args:
        im: Image.
        pos: Center position for extraction.
        scale: Image scale to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        transforms: A set of image transforms to apply.
    """
    # Get image patch
    im_patch = sample_patch(im, pos, scale * image_sz, image_sz)

    # Apply transforms
    im_patches = torch.cat([T(im_patch) for T in transforms])

    # Compute features
    feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    return feature_map
def extract_transformed(self, im, pos, scale, image_sz, transforms, debug_save_name=None):
    """Extract features from a set of transformed image samples.
    args:
        im: Image.
        pos: Center position for extraction.
        scale: Image scale to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        transforms: A set of image transforms to apply.
        debug_save_name: If given, save the transformed patches to this .npy file.
    """
    # Get image patch
    im_patch = sample_patch(im, pos, scale * image_sz, image_sz)

    # Apply transforms
    with fluid.dygraph.guard(fluid.CPUPlace()):
        im_patches = np.stack([T(im_patch) for T in transforms])
        if debug_save_name is not None:
            np.save(debug_save_name, im_patches)

        # NHWC -> NCHW
        im_patches = np.transpose(im_patches, (0, 3, 1, 2))

        # Compute features
        feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    return feature_map
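# --- Example: inspecting a debug dump (usage sketch, hypothetical file name) ---
# The debug_save_name hook writes the stacked patches before the NCHW transpose,
# so they reload with shape (num_transforms, H, W, C).
import numpy as np

patches = np.load('init_patches.npy')
print(patches.shape)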
def extract(self, im, pos, scales, image_sz, debug_save_name=None):
    """Extract features.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract features from.
        image_sz: Size to resize the image samples to before extraction.
        debug_save_name: If given, save the sampled patches to this .npy file.
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches
    with fluid.dygraph.guard(fluid.CPUPlace()):
        im_patches = np.stack([sample_patch(im, pos, s * image_sz, image_sz) for s in scales])
        if debug_save_name is not None:
            np.save(debug_save_name, im_patches)

        # NHWC -> NCHW
        im_patches = np.transpose(im_patches, (0, 3, 1, 2))

        # Compute features
        feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    return feature_map
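# --- Example: the NHWC -> NCHW transpose (illustrative sketch) ---
# The numpy-based sample_patch here returns HxWxC patches; the (0, 3, 1, 2)
# transpose converts the stacked batch to NCHW, the layout the backbones expect.
import numpy as np

patches_nhwc = np.zeros((3, 288, 288, 3), dtype=np.float32)
patches_nchw = np.transpose(patches_nhwc, (0, 3, 1, 2))
print(patches_nchw.shape)  # (3, 3, 288, 288)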
def extract(self, im, pos, scales, image_sz):
    """Extract features, or raw patches for single-channel input."""
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches
    im_patches = torch.cat([sample_patch(im, pos, s * image_sz, image_sz) for s in scales])

    # For single-channel input (e.g. a depth map), return the raw patches directly
    if im.shape[1] == 1:
        return im_patches

    # Compute features and concatenate them along the channel dimension
    feature_map = torch.cat(TensorList([f.get_feature(im_patches) for f in self.features]).unroll(),
                            dim=1)

    return feature_map
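# --- Example: channel-wise feature concatenation (illustrative sketch, stand-in shapes) ---
# The multi-channel branch above concatenates the unrolled per-extractor feature maps
# along dim=1, producing one tensor with all feature channels per scale.
import torch

feat_a = torch.rand(3, 256, 18, 18)   # e.g. one backbone layer, 3 scales
feat_b = torch.rand(3, 512, 18, 18)   # e.g. a deeper layer
feature_map = torch.cat([feat_a, feat_b], dim=1)
print(feature_map.shape)              # torch.Size([3, 768, 18, 18])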
def extract(self, im, pos, scales, image_sz):
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches (NHWC -> NCHW)
    im_patches = np.stack([sample_patch(im, pos, s * image_sz, image_sz) for s in scales])
    im_patches = np.transpose(im_patches, (0, 3, 1, 2))

    # Compute features
    feature_map = layers.concat(TensorList([f.get_feature(im_patches) for f in self.features]).unroll(),
                                axis=1)

    return feature_map
def extract(self, im, pos, scales, image_sz):
    """Extract features.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract features from.
        image_sz: Size to resize the image samples to before extraction.
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches
    im_patches = torch.cat([sample_patch(im, pos, s * image_sz, image_sz) for s in scales])

    # Compute features
    feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    return feature_map
def extract_v2(self, im, pos, scales, image_sz):
    """Extract image patches and features.
    args:
        im: Image.
        pos: Center position for extraction.
        scales: Image scales to extract features from.
        image_sz: Size to resize the image samples to before extraction.
    returns:
        The first image patch as an HxWxC uint8 numpy array, and the feature map.
    """
    if isinstance(scales, (int, float)):
        scales = [scales]

    # Get image patches
    im_patches = torch.cat([sample_patch(im, pos, s * image_sz, image_sz) for s in scales])

    # Keep the first patch as a numpy image (e.g. for visualization)
    im_patches_np = im_patches[0:1].squeeze().permute(1, 2, 0).numpy().astype(np.uint8)

    # Compute features
    feature_map = TensorList([f.get_feature(im_patches) for f in self.features]).unroll()

    return im_patches_np, feature_map
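# --- Example: tensor patch to numpy image (illustrative sketch) ---
# The same permute/astype chain extract_v2 uses: take the first patch, reorder
# CxHxW to HxWxC, and cast to uint8 for display or saving.
import torch
import numpy as np

im_patches = torch.rand(2, 3, 288, 288) * 255
patch_np = im_patches[0:1].squeeze().permute(1, 2, 0).numpy().astype(np.uint8)
print(patch_np.shape, patch_np.dtype)  # (288, 288, 3) uint8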
def track(self, image, info: dict = None) -> dict:
    self.debug_info = {}

    self.frame_num += 1
    self.debug_info['frame_num'] = self.frame_num

    # Obtain the merged segmentation prediction for the previous frames. This is used to update the
    # target model and to determine the search region for the current frame.
    if self.object_id is None:
        prev_segmentation_prob_im = info['previous_output']['segmentation_raw']
    else:
        prev_segmentation_prob_im = info['previous_output']['segmentation_raw'][self.object_id]

    prev_segmentation_prob_im = torch.from_numpy(prev_segmentation_prob_im).unsqueeze(0).unsqueeze(0).float()

    # *********************************************************************************** #
    # ------- Update the target model using merged masks from the previous frame ------- #
    # *********************************************************************************** #
    if self.frame_num > 2:
        # Crop the segmentation mask for the previous search area
        if self.params.get('update_target_model', True):
            prev_segmentation_prob_crop, _ = sample_patch(
                prev_segmentation_prob_im, self.prev_pos,
                self.prev_scale * self.img_sample_sz, self.img_sample_sz,
                mode=self.params.get('border_mode', 'replicate'),
                max_scale_change=self.params.get('patch_max_scale_change'),
                is_mask=True)

            # Update the target model
            self.update_target_model(self.prev_test_x, prev_segmentation_prob_crop.clone())

    # ******************************************************************************************* #
    # -------- Estimate the target box using the merged segmentation mask from prev. frame ----- #
    # --- The estimated target box is used to obtain the search region for the current frame --- #
    # ******************************************************************************************* #
    self.pos, self.target_sz = self.get_target_state(prev_segmentation_prob_im.squeeze())

    new_target_scale = torch.sqrt(self.target_sz.prod() / self.base_target_sz.prod())

    if self.params.get('max_scale_change') is not None:
        # Do not allow drastic scale changes, as these might be caused by occlusions or an
        # incorrect mask prediction
        new_target_scale = self.clip_scale_change(new_target_scale)

    # Update target size and scale using the filtered target size
    self.target_scale = new_target_scale
    self.target_sz = self.base_target_sz * self.target_scale

    # ********************************************************************** #
    # ---------- Predict segmentation mask for the current frame ----------- #
    # ********************************************************************** #

    # Convert image
    im = numpy_to_torch(image)

    # Extract backbone features
    backbone_feat, sample_coords, im_patches = self.extract_backbone_features(
        im, self.get_centered_sample_pos(), self.target_scale, self.img_sample_sz)

    # Save the search region information, as it is needed to merge the segmentation masks
    # for the next frame update
    self.prev_pos = self.get_centered_sample_pos()
    self.prev_scale = self.target_scale

    # Extract the features input to the target model
    test_x = self.get_target_model_features(backbone_feat)

    # Location of sample
    sample_pos, sample_scale = self.get_sample_location(sample_coords)

    # Predict the segmentation mask. Note: these are raw scores, before the sigmoid
    segmentation_scores = self.segment_target(test_x, backbone_feat)

    self.prev_test_x = test_x

    # Get the segmentation scores for the full image.
    # Regions outside the search region are assigned low scores (-100)
    segmentation_scores_im = self.convert_scores_crop_to_image(segmentation_scores, im,
                                                               sample_scale, sample_pos)

    segmentation_mask_im = (segmentation_scores_im > 0.0).float()  # Binary segmentation mask
    segmentation_prob_im = torch.sigmoid(segmentation_scores_im)   # Probability of being target at each pixel

    # ************************************************************************ #
    # ---------- Output estimated segmentation mask and target box ----------- #
    # ************************************************************************ #

    # Get the target box from the predicted segmentation
    pred_pos, pred_target_sz = self.get_target_state(segmentation_prob_im.squeeze())
    new_state = torch.cat((pred_pos[[1, 0]] - (pred_target_sz[[1, 0]] - 1) / 2,
                           pred_target_sz[[1, 0]]))
    output_state = new_state.tolist()

    if self.object_id is None:
        # In single-object mode, merge is not called. Hence return the probabilities
        segmentation_output = segmentation_prob_im
    else:
        # In multi-object mode, return the raw scores
        segmentation_output = segmentation_scores_im

    segmentation_mask_im = segmentation_mask_im.view(*segmentation_mask_im.shape[-2:]).cpu().numpy()
    segmentation_output = segmentation_output.cpu().numpy()

    if self.visdom is not None:
        self.visdom.register(segmentation_scores_im, 'heatmap', 2, 'Seg Scores' + self.id_str)
        self.visdom.register(self.debug_info, 'info_dict', 1, 'Status')

    out = {'segmentation': segmentation_mask_im,
           'target_bbox': output_state,
           'segmentation_raw': segmentation_output}
    return out
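# --- Example: raw-score vs. probability thresholding (illustrative sketch) ---
# Thresholding the raw scores at 0 is equivalent to thresholding the sigmoid
# probabilities at 0.5 (sigmoid(0) == 0.5), which is why the binary mask above can
# be computed directly from segmentation_scores_im before the sigmoid.
import torch

scores = torch.tensor([-100.0, -0.5, 0.2, 3.0])
mask_from_scores = (scores > 0.0).float()
mask_from_probs = (torch.sigmoid(scores) > 0.5).float()
assert torch.equal(mask_from_scores, mask_from_probs)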
def track(self, image):
    self.frame_num += 1

    # For debugging and display only
    if self.params.output_image:
        image_show = image.copy()

    # Initialization
    hard_flag = False

    # Convert to tensor and move to GPU
    image_cuda = self.numpy_to_tensor_gpu(image)

    # ------- LOCALIZATION ------- #
    sample_pos = self.target_pos.clone()
    sample_scale = self.sample_scale.clone()
    target_sample_sz = self.target_sample_sz.clone()

    # Sample and extract backbone features
    test_sample = sample_patch(image_cuda, sample_pos, sample_scale * self.img_sample_sz, self.img_sample_sz)
    test_backbone_features = self.params.model.extract_backbone_features(test_sample)

    # Extract locator features and calculate the localization score
    test_locator_proposals = self.get_locator_proposals(target_sample_sz)
    test_locator_features = self.params.model.extract_locator_features(test_backbone_features, test_locator_proposals).squeeze()
    test_locator_score = torch.mm(test_locator_features, self.locator_model)

    # Window the output and find the argmax
    if getattr(self.params, 'window_output', False):
        test_locator_score = test_locator_score * self.output_window
    max_score, max_id = torch.max(test_locator_score, dim=0)
    max_score, max_id = max_score.item(), max_id.item()

    # When the target is found
    if max_score > self.params.target_not_found:
        # Update the target position
        self.target_pos[1] += (self.locator_proposals_xc[max_id].item() - self.img_sample_sz[1] * 0.5) * sample_scale  # x
        self.target_pos[0] += (self.locator_proposals_yc[max_id].item() - self.img_sample_sz[0] * 0.5) * sample_scale  # y

        # ------- REFINEMENT ------- #
        # Extract IoU backbone features and refine the target box
        test_iou_backbone_features = self.params.model.extract_iou_features(test_backbone_features)
        new_target_box = self.refine_target_box(self.target_pos, self.target_sz, sample_pos, sample_scale, test_iou_backbone_features)

        # Update the target box
        if new_target_box is not None:
            self.target_pos = sample_pos + (new_target_box[:2] + new_target_box[2:] / 2 - (self.img_sample_sz - 1) / 2).flip((0,)) * sample_scale
            self.target_sz = self.params.scale_damp * self.target_sz + (1 - self.params.scale_damp) * new_target_box[2:].flip((0,)) * sample_scale
            self.target_sz = torch.min(self.target_sz, self.initial_target_sz * self.max_scale_factor)
            self.target_sz = torch.max(self.target_sz, self.initial_target_sz * self.min_scale_factor)

        # Update the sampling state
        self.search_area = torch.prod(self.target_sz * self.params.search_padding)
        self.sample_scale = torch.sqrt(self.search_area / self.params.img_sample_area)
        self.target_sample_sz = self.target_sz / self.sample_scale

    # ------- UPDATE FEATURE MODEL ------- #
    train_sample = sample_patch(image_cuda, self.target_pos, self.sample_scale * self.img_sample_sz, self.img_sample_sz)
    train_backbone_features = self.params.model.extract_backbone_features(train_sample)

    # Extract locator features
    train_locator_proposals = self.get_locator_proposals(self.target_sample_sz)
    train_locator_features = self.params.model.extract_locator_features(train_backbone_features, train_locator_proposals).squeeze()

    # Hard negative mining and adaptive learning rate
    if self.params.hard_negative_mining:
        train_locator_score = torch.mm(train_locator_features, self.locator_model)
        max_score = train_locator_score.max()
        train_locator_score = train_locator_score * self.hard_negative_region_mask
        if (train_locator_score.max() > self.params.hard_negative_threshold * max_score) and (train_locator_score.max() > self.params.target_not_found):
            hard_flag = True

    if hard_flag:
        learning_rate = self.params.hard_negative_learning_rate
    else:
        learning_rate = self.params.learning_rate

    # Update the locator model statistics
    self.locator_XTX = (1 - learning_rate) * self.locator_XTX + learning_rate * torch.mm(train_locator_features.t(), train_locator_features)
    self.locator_XTY = (1 - learning_rate) * self.locator_XTY + learning_rate * torch.mm(train_locator_features.t(), self.locator_labels)

    # Adjust the weight of the initial frame
    self.current_initial_frame_weight = (1 - learning_rate) * self.current_initial_frame_weight
    if self.current_initial_frame_weight < self.params.init_samples_minimum_weight:
        diff = self.params.init_samples_minimum_weight - self.current_initial_frame_weight
        coff = diff / (1 - self.current_initial_frame_weight)
        self.locator_XTX = (1 - coff) * self.locator_XTX + coff * self.locator_XTX_initial
        self.locator_XTY = (1 - coff) * self.locator_XTY + coff * self.locator_XTY_initial
        self.current_initial_frame_weight = self.params.init_samples_minimum_weight

    # ------- TRAIN ------- #
    if (self.frame_num % self.params.train_skipping == 0) or hard_flag:
        self.locator_model = self.train_locator_model(self.locator_XTX + self.locator_regularization, self.locator_XTY, self.locator_model)

    # ------- RETURN ------- #
    # Return the new state, clamped to the image bounds
    new_state = torch.cat((self.target_pos[[1, 0]] - self.target_sz[[1, 0]] * 0.5, self.target_sz[[1, 0]]))
    new_state[0], new_state[1] = new_state[0].clamp(0), new_state[1].clamp(0)
    new_state[2] = new_state[2].clamp(0, self.IMG_WIDTH - new_state[0])
    new_state[3] = new_state[3].clamp(0, self.IMG_HEIGHT - new_state[1])

    # Output the result image
    if self.params.output_image:
        self.output_result_image(image_show, new_state)

    return new_state.tolist()
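# --- Example: the normal-equations solve behind train_locator_model (assumed sketch) ---
# locator_XTX and locator_XTY accumulate X^T X and X^T y, so the locator weights are the
# ridge-regression solution of (X^T X + reg) w = X^T y. The actual train_locator_model
# may solve this iteratively (it is seeded with the previous model); a closed-form
# equivalent with stand-in values looks like this:
import torch

n_features = 512
XTX = torch.eye(n_features)             # stand-in for self.locator_XTX
XTY = torch.rand(n_features, 1)         # stand-in for self.locator_XTY
reg = 0.1 * torch.eye(n_features)       # stand-in for self.locator_regularization
w = torch.linalg.solve(XTX + reg, XTY)  # locator model weights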
def track(self, image):
    self.frame_num += 1

    # Convert to tensor and move to GPU
    image_cuda = self.numpy_to_tensor_gpu(image)

    # ------- LOCALIZATION ------- #
    sample_pos = self.target_pos.clone()
    sample_scale = self.sample_scale.clone()
    target_sample_sz = self.target_sample_sz.clone()

    # Sample and extract features
    test_sample = sample_patch(image_cuda, sample_pos, sample_scale * self.img_sample_sz, self.img_sample_sz)
    test_locator_proposals = self.get_locator_proposals(target_sample_sz)
    self.params.model.extract(test_sample, test_locator_proposals)

    # Calculate the localization score
    test_locator_score = torch.mm(self.params.model.locator_features, self.locator_model)
    if getattr(self.params, 'window_output', False):
        test_locator_score = test_locator_score * self.output_window
    max_score, max_id = torch.max(test_locator_score, dim=0)
    max_score, max_id = max_score.item(), max_id.item()

    # When the target is not found, keep the previous target position and size
    if max_score < self.params.target_not_found_threshold:
        new_state = torch.cat((self.target_pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))
        return new_state.tolist()

    # Update the target position
    self.target_pos[0] = self.target_pos[0] + (self.proposals_yc[max_id].item() - self.img_sample_sz[1] * 0.5) * sample_scale
    self.target_pos[1] = self.target_pos[1] + (self.proposals_xc[max_id].item() - self.img_sample_sz[0] * 0.5) * sample_scale

    # Refine the target position and size with IoU-Net
    new_pos, new_target_sz = self.refine_target_box(self.target_pos, self.target_sz, sample_pos, sample_scale)

    # Bound the target size
    if new_target_sz is not None:
        new_target_sz = torch.min(new_target_sz, self.initial_target_sz * self.max_scale_factor)
        new_target_sz = torch.max(new_target_sz, self.initial_target_sz * self.min_scale_factor)

    # Update the target and sampling state
    if new_pos is not None:
        self.target_pos = new_pos.clone()
        self.target_sz = new_target_sz.clone()
        self.search_area = torch.prod(self.target_sz * self.params.search_area_scale)
        self.sample_scale = torch.sqrt(self.search_area / self.params.img_sample_area)
        self.target_sample_sz = self.target_sz / self.sample_scale

    # Compute the new state to return
    new_state = torch.cat((self.target_pos[[1, 0]] - (self.target_sz[[1, 0]] - 1) / 2, self.target_sz[[1, 0]]))

    # ------- UPDATE MODEL ------- #
    train_sample = sample_patch(image_cuda, self.target_pos, self.sample_scale * self.img_sample_sz, self.img_sample_sz)
    train_locator_proposals = self.get_locator_proposals(self.target_sample_sz)
    self.params.model.extract(train_sample, train_locator_proposals, only_locator=True)

    # Hard negative mining and adaptive learning rate
    hard_flag = False
    if self.params.hard_negative_mining:
        train_locator_score = torch.mm(self.params.model.locator_features, self.locator_model)
        train_locator_score = train_locator_score * self.hard_negative_region_mask
        max_score, _ = torch.max(train_locator_score, dim=0)
        if max_score > self.params.hard_negative_threshold:
            hard_flag = True

    if hard_flag:
        learning_rate = self.params.hard_negative_learning_rate
    else:
        learning_rate = self.params.learning_rate

    # Update the locator feature model
    self.locator_features_model = (1 - learning_rate) * self.locator_features_model + learning_rate * self.params.model.locator_features

    # Adjust the weight of the initial frame
    self.current_initial_frame_weight = (1 - learning_rate) * self.current_initial_frame_weight
    if self.current_initial_frame_weight < self.params.init_samples_minimum_weight:
        diff = self.params.init_samples_minimum_weight - self.current_initial_frame_weight
        coff = diff / (1 - self.current_initial_frame_weight)
        self.locator_features_model = (1 - coff) * self.locator_features_model + coff * self.initial_locator_features
        self.current_initial_frame_weight = self.params.init_samples_minimum_weight

    # Retrain the locator model
    if (self.frame_num % self.params.train_skipping == 0) or hard_flag:
        self.locator_model = self.train_locator_model(self.locator_features_model, self.locator_model)

    return new_state.tolist()
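# --- Example: the initial-frame weight floor (illustrative sketch) ---
# The exponential moving average above decays the initial frame's weight by (1 - lr)
# each update. Once it drops below init_samples_minimum_weight, blending with
# coefficient coff = (floor - w) / (1 - w) restores exactly the floor weight, since
# w + coff * (1 - w) == floor.
lr, floor = 0.01, 0.25
w = 1.0                                  # current weight of the initial frame
for _ in range(150):
    w *= (1 - lr)
    if w < floor:
        coff = (floor - w) / (1 - w)
        w = w + coff * (1 - w)           # == floor
print(round(w, 4))                       # 0.25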