def __getitem__(self, idx):
    """
    Load one sample and match its ground-truth boxes with the prior boxes.

    The image is resized to (self.image_size, self.image_size), scaled to
    the range [-1, 1] and returned channel-first. Ground-truth boxes are
    divided by the original image width/height before matching.

    :param idx: index into self.dataset_list
    :return img_tensor: image tensor, dim: (channels, image_size, image_size)
    :return bbox_tensor: matched bounding box, dim: (num_priors, 4)
    :return bbox_label: matched classification label, dim: (num_priors)
    """
    # 1. Load image as well as the bounding boxes with their labels.
    item = self.dataset_list[idx]
    # The per-sample values stay on the instance (as before) in case other
    # methods of the class read them.
    self.image_path = item['image_path']
    self.labels = torch.Tensor(np.asarray(item['labels']))
    self.bound_boxes = torch.Tensor(np.asarray(item['bound_boxes']))

    img = Image.open(self.image_path)
    w, h = img.size

    # 2. Scale box coordinates by the original image size.
    # NOTE(review): the boxes are forwarded in whatever form the dataset list
    # stores them (no corner/center conversion here) - confirm this is the
    # form match_priors expects.
    self.bound_boxes /= torch.Tensor([w, h, w, h]).expand_as(self.bound_boxes)

    # 3. Resize the image. Image.ANTIALIAS was an alias of Image.LANCZOS and
    # no longer exists in recent Pillow releases.
    img = img.resize((self.image_size, self.image_size), Image.LANCZOS)

    # 4. Map pixel values from [0, 255] to [-1, 1].
    pixels = np.asarray(img, dtype=np.float32)
    pixels = (pixels / 255.0) * 2 - 1

    # 5. HWC -> CHW. This must be a permute: view() would only reinterpret the
    # flat memory and scramble the pixels across channels.
    img_tensor = torch.from_numpy(pixels).permute(2, 0, 1).contiguous()

    # 6. Do the matching prior and generate ground-truth labels as well as
    # the boxes.
    bbox_tensor, bbox_label_tensor = match_priors(
        self.prior_bound_boxes,
        self.bound_boxes,
        self.labels,
        iou_threshold=0.5)

    # [DEBUG] check the output.
    assert isinstance(bbox_label_tensor, torch.Tensor)
    assert isinstance(bbox_tensor, torch.Tensor)
    assert bbox_tensor.dim() == 2
    assert bbox_tensor.shape[1] == 4
    assert bbox_label_tensor.dim() == 1
    assert bbox_label_tensor.shape[0] == bbox_tensor.shape[0]
    return img_tensor, bbox_tensor, bbox_label_tensor
def test_priorbb(self):
    """Print prior boxes, an IoU sample and a prior-matching result for manual inspection."""
    # Every layer shares the same aspect ratios; only the name, the square
    # feature-map size and the square box size change.
    ratios = (1.0, 1 / 2, 1 / 3, 2.0, 3.0, 1.0)
    layer_table = (
        ('Conv5', 38, 30),
        ('Conv11', 19, 60),
        ('Conv14_2', 10, 111),
        ('Conv15_2', 5, 162),
        ('Conv16_2', 3, 213),
        ('Conv17_2', 1, 264),
    )
    prior_layer_cfg = [
        {
            'layer_name': layer,
            'feature_dim_hw': (grid, grid),
            'bbox_size': (box, box),
            'aspect_ratio': ratios,
        }
        for layer, grid, box in layer_table
    ]

    priors = generate_prior_bboxes(prior_layer_cfg)
    print(priors[0:1], priors[39:40])

    overlap = iou(priors[0:6], priors[0:1])
    print('iou', overlap)

    gt_label = torch.tensor([1])
    matched = match_priors(priors[0:38], priors[38:39], gt_label, 0.5)
    print('matching', matched)

    np.set_printoptions(threshold=np.inf)
    # Inputs for the ssd_size_bounds_to_values check, which is currently
    # disabled.
    size_bounds = [0.2, 0.9]
    img_shape = [300, 300]

    # Placeholder assertion so the test case reports a result.
    self.assertEqual('foo'.upper(), 'FOO')
def __getitem__(self, idx):
    """
    Load one image with its annotations, augment it and match the
    ground-truth boxes with the prior boxes.

    :param idx: index into self.img_dir_list and self.json_dir_list
    :return sample_img: image as returned by SSDAugmentation
    :return bbox_tensor: first output of match_priors
    :return bbox_label_tensor: second output of match_priors, cast to int64
    :return output_prior_bboxes: self.prior_bboxes, returned unchanged
    :raises FileNotFoundError: if the image file cannot be read
    """
    img_dir = self.img_dir_list[idx]
    json_dir = self.json_dir_list[idx]

    # The second argument of cv2.imread is an IMREAD_* flag, not a colour
    # conversion code, so the BGR -> RGB conversion needs its own call.
    bgr_img = cv2.imread(img_dir)
    if bgr_img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError('Could not read image: {}'.format(img_dir))
    sample_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)

    gt_bboxes, gt_labels = get_bbox_label(json_dir)
    gt_bboxes = torch.tensor(gt_bboxes, dtype=torch.float32)
    gt_labels = torch.tensor(gt_labels, dtype=torch.int32)

    # Data augmentation (behaviour selected by self.mode).
    data_augmentation = SSDAugmentation(mode=self.mode)
    sample_img = np.array(sample_img, dtype=np.float64)
    sample_img, gt_bboxes, gt_labels = data_augmentation(
        sample_img, gt_bboxes, gt_labels)

    # Do the matching prior and generate ground-truth labels as well as the
    # boxes.
    bbox_tensor, bbox_label_tensor = match_priors(
        self.prior_bboxes, gt_bboxes, gt_labels)
    output_prior_bboxes = self.prior_bboxes
    return sample_img, bbox_tensor, bbox_label_tensor.long(), output_prior_bboxes
def __getitem__(self, idx):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes.

    :param idx: index into self.dataset_list
    :return bbox_offset_tensor: third output of match_priors
    :return bbox_label: matched classification label, dim: (num_priors)
    :return img_tensor: normalized image as a float32 tensor
    """
    # 1. Load image as well as the bounding boxes with their labels.
    item = self.dataset_list[idx]
    img_path = item['img']
    h = item['h']
    sample_labels = item['labels']
    sample_bboxes_corner = item['bboxes']
    img = Image.open(img_path)

    # 2. Augmentation: random flip (training only), then crop and resize.
    if self.train:
        img, sample_bboxes_corner = self.random_flip(
            img, sample_bboxes_corner)
    img, sample_bboxes_corner, sample_labels = self.crop_img(
        img, sample_bboxes_corner, sample_labels, img_path, h)
    img = img.resize((self.img_size, self.img_size))

    # 3. Convert the bounding box from corner form (left-top, right-bottom):
    # [(x,y), (x+w, y+h)] to center form: [(center_x, center_y, w, h)].
    lt = sample_bboxes_corner[:, 0, :]
    rb = sample_bboxes_corner[:, 1, :]
    wh = rb - lt
    c = lt + wh / 2
    # NOTE(review): all four values are divided by h, including the x ones;
    # presumably crop_img yields a square crop of side h - confirm.
    sample_bboxes = np.stack(
        (c[:, 0] / h, c[:, 1] / h, wh[:, 0] / h, wh[:, 1] / h), axis=1)

    # 4. Normalize the image with self.mean and self.std. The np.float alias
    # no longer exists in current NumPy; np.float64 is the same type.
    sample_img = (np.array(img, dtype=np.float64) - self.mean) / self.std

    img_tensor = torch.from_numpy(sample_img).float()
    sample_bboxes = torch.from_numpy(np.asarray(sample_bboxes)).float()
    sample_labels = torch.from_numpy(np.asarray(sample_labels)).float()

    # 5. Matching prior, generate ground-truth labels and boxes.
    bbox_tensor, bbox_label_tensor, bbox_offset_tensor = match_priors(
        self.prior_bboxes.cpu(),
        sample_bboxes,
        sample_labels,
        iou_threshold=0.5)

    if self.show:
        self.show_bbox(img, sample_bboxes.numpy(),
                       self.prior_bboxes.cpu().numpy(),
                       bbox_label_tensor.numpy())

    # [DEBUG] check the output.
    assert isinstance(bbox_label_tensor, torch.Tensor)
    assert isinstance(bbox_tensor, torch.Tensor)
    assert bbox_tensor.dim() == 2
    assert bbox_tensor.shape[1] == 4
    assert bbox_label_tensor.dim() == 1
    assert bbox_label_tensor.shape[0] == bbox_tensor.shape[0]
    return bbox_offset_tensor, bbox_label_tensor, img_tensor
def __getitem__(self, index):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes. Labels include [car, traffic sign, person];
    irrelevant objects are set to 0.

    A one-line progress message is printed on every call, and the call is
    profiled with cProfile when self.is_debug is set.

    :param index: index into self.dataset_list
    :return image: normalized image tensor, channel-first
    :return confidences: output of helper.match_priors, or the confidences
        from self.sanitize when the sample has no non-zero confidence
    :return: self.prior_bboxes, or the locations from self.sanitize when the
        sample has no non-zero confidence
    """
    # Alert current dataset status.
    total = len(self.dataset_list)
    width = len(str(total))
    self.prepared_index += self.num_worker
    current_index = self.prepared_index % total
    print('\rPreparing dataset at index [{0:{w}d}][{1:{w}d}/{2}][{3:6.2f}%]'
          .format(index, current_index, total,
                  current_index * 100. / total, w=width),
          end='')

    pr = None
    if self.is_debug:
        pr = cProfile.Profile()
        pr.enable()

    # Prepare configurations.
    item = self.dataset_list[index]
    self.imgWidth = float(item['imgWidth'])
    self.imgHeight = float(item['imgHeight'])
    self.resize_ratio = min(self.imgHeight / 300., self.imgWidth / 300.)
    image = Image.open(item['file'])
    confidences, locations = self.sanitize(item)

    # Return early when there is no match at all.
    if confidences.nonzero().shape[0] == 0:
        image = self.resize(image)
        # torch.Size only accepts integers, while the stored dimensions are
        # floats.
        expected_shape = torch.Size(
            [int(self.imgHeight), int(self.imgWidth), 3])
        if image.shape != expected_shape:
            # Filter out broken input image by a 300x300x3 black patch.
            image = torch.zeros([300, 300, 3])
        else:
            # Crop the top left 300x300x3 patch if image is not corrupted.
            image = image[0:300, 0:300, :]
        image = self.normalize(image)
        # HWC -> CHW must be a permute; view() would only reinterpret the
        # memory and fails outright on the non-contiguous crop above.
        image = image.permute(2, 0, 1).contiguous()
        if pr is not None:
            # Do not leave the profiler running past this call.
            pr.disable()
        return image, confidences, locations

    # Resize the image and label first.
    image = self.resize(image)
    locations = self.resize(locations)

    # Prepare image array first to update crop.
    image = self.crop(image)
    image = self.brighten(image)
    image = self.normalize(image)

    # Prepare labels second to apply crop.
    locations = self.crop(locations)
    locations = self.normalize(locations)

    # Do the matching prior and generate ground-truth labels as well as the
    # boxes.
    confidences = helper.match_priors(
        self.prior_bboxes,
        locations,
        iou_threshold=self.matching_iou_threshold)

    if pr is not None:
        pr.disable()
        pr.print_stats(sort='time')

    # Move the channel axis first (HWC -> CHW).
    image = image.permute(2, 0, 1).contiguous()
    return image, confidences, self.prior_bboxes
def __getitem__(self, idx):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes.

    A square window is cropped at a random x offset (retrying until at least
    one box lies fully inside it horizontally), resized to 300x300, randomly
    flipped, and normalized with self.mean / self.std.

    :param idx: index into self.dataset_list
    :return bbox_tensor: matched bounding box, dim: (num_priors, 4)
    :return bbox_label: matched classification label, dim: (num_priors)
    :return img_tensor: normalized image as a float32 tensor in the array
        layout produced by the PIL image (not transposed here)
    :raises ValueError: if the sample has no bounding box, or the fallback
        crop built from the first box has no positive width
    """
    max_x = 2048          # boxes reaching past this x are clipped to it
    default_crop = 1024   # side of the random square crop window
    max_attempts = 200    # after this many tries, crop around the first box
    resized_size = 300

    # 1. Load image as well as the bounding box with its label.
    item = self.dataset_list[idx]
    img = Image.open(item['img_path'])
    label = item['label']
    bbox_arr = np.array(item['bbox']).reshape(-1, 4)  # tuple to array
    if len(bbox_arr) == 0:
        # Without this check the retry loop below ends in an IndexError,
        # which Python's legacy iteration protocol treats as "end of data".
        raise ValueError(
            'Sample {} has no bounding boxes'.format(item['img_path']))

    # Some annotations carry a right edge beyond the image width, e.g.
    # bamberg_000000_000441_gtCoarse_polygons.json.
    bbox_arr[:, 2] = np.minimum(bbox_arr[:, 2], max_x)

    # 2. Random crop to 1024*1024.
    bbox_croped = []
    label_croped = []
    attempts = 0
    while not bbox_croped:
        attempts += 1
        crop_startX = random.uniform(0, 1) * default_crop
        crop_size = default_crop
        if attempts == max_attempts:
            # Still no usable position: crop exactly around the first box.
            crop_startX = bbox_arr[0][0]
            crop_size = bbox_arr[0][2] - bbox_arr[0][0]
        for i, box in enumerate(bbox_arr):
            if box[0] >= crop_startX and box[2] <= crop_startX + crop_size:
                # Only x is shifted; the window always starts at y = 0.
                bbox_croped.append(
                    [box[0] - crop_startX, box[1],
                     box[2] - crop_startX, box[3]])
                label_croped.append(label[i])

    if crop_size <= 0:
        raise ValueError(
            'Degenerate crop for sample {}'.format(item['img_path']))

    crop_pos = (crop_startX, 0, crop_startX + crop_size, crop_size)
    img_resized = img.crop(crop_pos).resize((resized_size, resized_size))
    bbox_resized = np.divide(bbox_croped, crop_size / resized_size)

    # 3. Convert the bounding box from corner form (left-top, right-bottom):
    # [(x,y), (x+w, y+h)] to center form: [(center_x, center_y, w, h)].
    bbox_center_form = bbox_helper.corner2center(torch.tensor(bbox_resized))

    # 4. Random horizontal flip.
    if random.uniform(0, 1) > 0.5:
        # Mirror the x coordinate of each box center.
        bbox_center_form[:, 0] = resized_size - bbox_center_form[:, 0]
        img_resized = img_resized.transpose(Image.FLIP_LEFT_RIGHT)

    # 5. Normalize the image; convert the PIL image to an array explicitly
    # instead of relying on implicit array arithmetic.
    img_normalized = (np.asarray(img_resized) - self.mean) / self.std

    # 6. Normalize the bounding box position value from 0 to 1.
    sample_labels = torch.tensor(label_croped, dtype=torch.float32)
    sample_bboxes = (bbox_center_form / resized_size).to(torch.float32)
    img_tensor = torch.from_numpy(
        np.asarray(img_normalized, dtype=np.float32))

    # 7. Do the matching prior and generate ground-truth labels as well as
    # the boxes.
    bbox_tensor, bbox_label_tensor = bbox_helper.match_priors(
        self.prior_bboxes, sample_bboxes, sample_labels, iou_threshold=0.5)

    # [DEBUG] check the output.
    assert isinstance(bbox_label_tensor, torch.Tensor)
    assert isinstance(bbox_tensor, torch.Tensor)
    assert bbox_tensor.dim() == 2
    assert bbox_tensor.shape[1] == 4
    assert bbox_label_tensor.dim() == 1
    assert bbox_label_tensor.shape[0] == bbox_tensor.shape[0]
    return bbox_tensor, bbox_label_tensor, img_tensor
def __getitem__(self, idx):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes.

    When self.n_augmented is non-zero, consecutive groups of
    self.n_augmented indices map to the same underlying sample, which is
    then randomly cropped and flipped.

    :param idx: dataset index (augmented index when self.n_augmented > 0)
    :return sample_img_tensor: normalized image tensor (float32)
    :return bbox_label: matched classification label, dim: (num_priors)
    :return bbox_tensor: matched bounding box, dim: (num_priors, 4)
    :return: self.prior_bboxes
    """
    # Compare by value: "is not 0" tests object identity.
    sample_idx = idx // self.n_augmented if self.n_augmented != 0 else idx
    sample = self.dataset_list[sample_idx]

    # 1. Load image as well as the bounding box with its label.
    image = Image.open(sample['image_path'])
    label = sample['label']
    cls = sample['class']
    bbox = sample['bbox']

    # 2. Convert the image and bbox to numpy arrays. The boxes are copied
    # because they are rescaled in place below and must not alter the stored
    # sample.
    image = np.asarray(image, dtype=np.uint8)
    bbox_cr = np.array(bbox, dtype=np.float32)

    show_list = []

    def snapshot(img, boxes, title):
        # Intermediate copies are only needed by the debug viewer.
        if self.debug:
            show_list.append({
                'image': img.copy(),
                'bbox_cr': boxes.copy(),
                'label': label.copy(),
                'title': title
            })

    snapshot(image, bbox_cr, 'Original')

    # 3. Do the augmentation if needed: random crop followed by flip.
    if self.n_augmented > 0:
        image, bbox_cr, cls, label = self.crop(
            image, bbox_cr, cls, label, is_random=True)
        snapshot(image, bbox_cr, 'Cropping(Random)')

        image, bbox_cr = self.flip(image, bbox_cr)
        snapshot(image, bbox_cr, 'Flipping')

    # 4. Resize the image (H, W, C) to net size (300, 300), scaling the
    # boxes by the same factors.
    bbox_cr[:, [0, 2]] = bbox_cr[:, [0, 2]] * (
        self.net_size[0] / image.shape[1])  # Width
    bbox_cr[:, [1, 3]] = bbox_cr[:, [1, 3]] * (
        self.net_size[1] / image.shape[0])  # Height
    image = cv2.resize(image, dsize=self.net_size,
                       interpolation=cv2.INTER_CUBIC)
    snapshot(image, bbox_cr, 'Resizing')

    # Check intermediate input.
    if self.debug:
        self.show_image(show_list)

    # 5. Convert the bounding box from corner form (left-top, right-bottom):
    # [(x,y), (x+w, y+h)] to center form: [(center_x, center_y, w, h)].
    center_xy = (bbox_cr[:, 2:] + bbox_cr[:, :2]) / 2.
    center_wh = bbox_cr[:, 2:] - bbox_cr[:, :2]
    bbox_ct = np.concatenate((center_xy, center_wh), axis=1)

    # 6. Normalize the image with self.mean and self.std.
    image_norm = np.divide(
        (np.asarray(image, dtype=np.float32) - self.mean), self.std)

    # Normalize the bounding box position value from 0 to 1.
    bbox_ct[:, [0, 2]] = bbox_ct[:, [0, 2]] / self.net_size[0]
    bbox_ct[:, [1, 3]] = bbox_ct[:, [1, 3]] / self.net_size[1]

    # 7. Do the matching prior and generate ground-truth labels as well as
    # the boxes.
    sample_labels = torch.from_numpy(np.asarray(cls)).type(torch.long)
    sample_bboxes = torch.from_numpy(bbox_ct).type(torch.float32)
    if torch.cuda.is_available():
        sample_labels = sample_labels.cuda()
        sample_bboxes = sample_bboxes.cuda()

    bbox_tensor, bbox_label_tensor = match_priors(
        self.prior_bboxes, sample_bboxes, sample_labels, iou_threshold=0.5)

    # NOTE(review): transpose() without arguments reverses every axis, so an
    # (H, W, C) array becomes (C, W, H) rather than (C, H, W). Kept as is
    # because show_tensor_image may rely on this layout - confirm and switch
    # to transpose(2, 0, 1) if it does not.
    sample_img_tensor = torch.from_numpy(image_norm.transpose()).type(
        torch.float32)

    if torch.cuda.is_available():
        bbox_tensor = bbox_tensor.cuda()
        bbox_label_tensor = bbox_label_tensor.cuda()
        sample_img_tensor = sample_img_tensor.cuda()

    # Check the final tensor input.
    if self.debug:
        self.show_tensor_image(sample_img_tensor.clone(),
                               bbox_tensor.clone(),
                               bbox_label_tensor.clone(),
                               label.copy(), sample_idx)

    # [DEBUG] check the output.
    assert isinstance(sample_img_tensor, torch.Tensor)
    assert isinstance(bbox_label_tensor, torch.Tensor)
    assert isinstance(bbox_tensor, torch.Tensor)
    assert bbox_tensor.dim() == 2
    assert bbox_tensor.shape[1] == 4
    assert bbox_label_tensor.dim() == 1
    assert bbox_label_tensor.shape[0] == bbox_tensor.shape[0]
    return sample_img_tensor, bbox_label_tensor, bbox_tensor, self.prior_bboxes
def __getitem__(self, idx):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes.

    After self.crop, one of four equally likely augmentations is applied:
    brightness change, horizontal flip, blur/sharpen, or none.

    :param idx: index into self.dataset_list
    :return img_tensor: normalized image tensor, dim: (channels, 300, 300)
    :return bbox_tensor: matched bounding box, dim: (num_priors, 4)
    :return bbox_label: matched classification label, dim: (num_priors)
    """
    # 1. Load image as well as the bounding box with its label.
    item = self.dataset_list[idx]
    file_path = item['file_path']
    ground_truth = item['label']
    sample_labels = np.asarray(ground_truth[0], dtype=np.float32)
    sample_bboxes = np.asarray(ground_truth[1], dtype=np.float32)
    sample_img = Image.open(file_path)

    # 2. Augmentation. The mode is drawn before cropping, as before, so the
    # sequence of random draws is unchanged. Mode 3 applies nothing extra.
    augmentation = np.random.randint(0, 4)
    sample_img, sample_bboxes, sample_labels = self.crop(
        sample_img, sample_bboxes, sample_labels)

    if augmentation == 0:
        factor = np.random.randint(5, 25) / 10.0
        sample_img = ImageEnhance.Brightness(sample_img).enhance(factor)
    elif augmentation == 1:
        # Horizontal flip: mirror the image and swap the mirrored left/right
        # edges. The result goes into a fresh array so the boxes that were
        # passed in are never modified.
        sample_img = sample_img.transpose(Image.FLIP_LEFT_RIGHT)
        width = sample_img.size[0]
        flipped_boxes = sample_bboxes.copy()
        flipped_boxes[:, 0] = width - sample_bboxes[:, 2]
        flipped_boxes[:, 2] = width - sample_bboxes[:, 0]
        sample_bboxes = flipped_boxes
    elif augmentation == 2:
        if random.choice([True, False]):
            sample_img = sample_img.filter(ImageFilter.BLUR)
        else:
            sample_img = sample_img.filter(ImageFilter.SHARPEN)

    # 3. Resize to the network input size and normalize the image with
    # self.mean and self.std.
    img = sample_img.resize((300, 300))
    img_array = (np.asarray(img) - self.mean) / self.std

    # 4. Normalize the bounding box position value from 0 to 1 using the
    # size of the cropped (pre-resize) image, then convert from corner form
    # to center form: [(center_x, center_y, w, h)].
    img_w, img_h = sample_img.size[0], sample_img.size[1]
    sample_bboxes = torch.Tensor(sample_bboxes) / torch.Tensor(
        [img_w, img_h, img_w, img_h])
    sample_bboxes = corner2center(sample_bboxes)

    # 5. Do the matching prior and generate ground-truth labels as well as
    # the boxes.
    # NOTE(review): the unconditional .cuda() calls require a GPU inside the
    # data-loading code - confirm this is intended.
    bbox_tensor, bbox_label_tensor = match_priors(
        self.prior_bboxes.cuda(),
        sample_bboxes.cuda(),
        torch.Tensor(sample_labels).cuda(),
        iou_threshold=0.45)

    # HWC -> CHW must be a permute; view() would only reinterpret the flat
    # memory and scramble the pixels across channels.
    img_tensor = torch.Tensor(img_array).permute(2, 0, 1).contiguous()

    # [DEBUG] check the output.
    assert isinstance(bbox_label_tensor, torch.Tensor)
    assert isinstance(bbox_tensor, torch.Tensor)
    assert bbox_tensor.dim() == 2
    assert bbox_tensor.shape[1] == 4
    assert bbox_label_tensor.dim() == 1
    assert bbox_label_tensor.shape[0] == bbox_tensor.shape[0]
    return img_tensor, bbox_tensor, bbox_label_tensor
def __getitem__(self, idx):
    """
    Load the data from list, and match the ground-truth bounding boxes with
    prior bounding boxes.

    :param idx: index into self.dataset_list
    :return img_tensor: normalized image tensor, channel-first, on the GPU
    :return bbox_tensor: first output of match_priors
    :return bbox_label: second output of match_priors
    """
    # 1. Load image as well as the bounding box with its label.
    item = self.dataset_list[idx]
    image_path = item['image_path']
    labels = torch.Tensor(np.asarray(item['labels'])).cuda()
    locations = torch.Tensor(item['bboxes']).cuda()

    image = Image.open(image_path)
    self.imgWidth, self.imgHeight = image.size
    self.resize_ratio = min(self.imgHeight / 300., self.imgWidth / 300.)

    # 2. Convert the boxes to center form, then resize image and boxes.
    locations = helper.corner2center(locations)
    image = self.resize(image)
    locations = self.resize(locations)

    # Prepare image array first to update crop.
    image = self.crop(image)
    image = self.brighten(image)
    image = self.normalize(image)

    # Prepare labels second to apply crop.
    locations = self.crop(locations)
    locations = self.normalize(locations)

    # 3. HWC -> CHW must be a permute; view() would only reinterpret the flat
    # memory and scramble the pixels across channels.
    img_tensor = image.permute(2, 0, 1).contiguous().cuda()

    # Keep only the selected labels.
    # NOTE(review): self.ios_index is not set in this method; presumably
    # self.crop(locations) records which boxes survive - confirm.
    labels = labels[self.ios_index]

    # 4. Do the matching prior and generate ground-truth labels as well as
    # the boxes (match_priors receives the boxes in corner form).
    bbox_tensor, bbox_label_tensor = match_priors(
        self.prior_bboxes,
        helper.center2corner(locations),
        labels,
        iou_threshold=0.5)
    return img_tensor, bbox_tensor, bbox_label_tensor