def __prepare_batch(self): im_batch = numpy.zeros((self.batch_size, self.num_image_channels, self.net_input_height, self.net_input_width), dtype=numpy.float32) label_batch_list = [numpy.zeros((self.batch_size, self.num_output_channels, v, v), dtype=numpy.float32) for v in self.feature_map_size_list] mask_batch_list = [numpy.zeros((self.batch_size, self.num_output_channels, v, v), dtype=numpy.float32) for v in self.feature_map_size_list] data_batch = DataBatch(self.mxnet_module) loop = 0 while loop < self.batch_size: if loop < self.num_neg_images_per_batch: # fill neg images first rand_idx = random.choice(self.negative_index) im, _, __ = self.data_provider.read_by_index(rand_idx) random_resize_factor = random.random() * (self.neg_image_resize_factor_interval[1] - self.neg_image_resize_factor_interval[0]) + self.neg_image_resize_factor_interval[0] im = cv2.resize(im, (0, 0), fy=random_resize_factor, fx=random_resize_factor) h_interval = im.shape[0] - self.net_input_height w_interval = im.shape[1] - self.net_input_width if h_interval >= 0: y_top = random.randint(0, h_interval) else: y_pad = int(-h_interval / 2) if w_interval >= 0: x_left = random.randint(0, w_interval) else: x_pad = int(-w_interval / 2) im_input = numpy.zeros((self.net_input_height, self.net_input_width, self.num_image_channels), dtype=numpy.uint8) if h_interval >= 0 and w_interval >= 0: im_input[:, :, :] = im[y_top:y_top + self.net_input_height, x_left:x_left + self.net_input_width, :] elif h_interval >= 0 and w_interval < 0: im_input[:, x_pad:x_pad + im.shape[1], :] = im[y_top:y_top + self.net_input_height, :, :] elif h_interval < 0 and w_interval >= 0: im_input[y_pad:y_pad + im.shape[0], :, :] = im[:, x_left:x_left + self.net_input_width, :] else: im_input[y_pad:y_pad + im.shape[0], x_pad:x_pad + im.shape[1], :] = im[:, :, :] # data augmentation if self.enable_horizon_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'h') if self.enable_vertical_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'v') if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # display for debug------------------------------------------------- # cv2.imshow('im', im_pad.astype(dtype=numpy.uint8)) # cv2.waitKey() im_input = im_input.astype(numpy.float32) im_input = im_input.transpose([2, 0, 1]) im_batch[loop] = im_input for label_batch in label_batch_list: label_batch[loop, 1, :, :] = 1 for mask_batch in mask_batch_list: mask_batch[loop, 0:2, :, :] = 1 else: rand_idx = random.choice(self.positive_index) im, _, bboxes_org = self.data_provider.read_by_index(rand_idx) num_bboxes = bboxes_org.shape[0] bboxes = bboxes_org.copy() # data augmentation if self.enable_horizon_flip and random.random() > 0.5: im = Augmentor.flip(im, 'h') bboxes[:, 0] = im.shape[1] - (bboxes[:, 0] + bboxes[:, 2]) if self.enable_vertical_flip and random.random() > 0.5: im = Augmentor.flip(im, 'v') bboxes[:, 1] = im.shape[0] - (bboxes[:, 1] + bboxes[:, 3]) # display for debug------------------------------------------- # im_show = im.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n,0]),int(bboxes[n,1])), (int(bboxes[n,0]+bboxes[n,2]),int(bboxes[n,1]+bboxes[n,3])), (255,255,0), 1) # cv2.imshow('im_show', im_show) # cv2.waitKey() # randomly select a bbox bbox_idx = random.randint(0, num_bboxes - 1) # randomly select a reasonable scale for the selected bbox (selection strategy may vary from task to task) target_bbox = bboxes[bbox_idx, :] longer_side = max(target_bbox[2:]) if longer_side <= self.bbox_small_list[0]: scale_idx = 0 elif longer_side <= self.bbox_small_list[1]: scale_idx = random.randint(0, 1) # elif longer_side <= self.bbox_small_list[2]: # scale_idx = random.randint(0, 2) else: if random.random() > 0.9: scale_idx = random.randint(0, self.num_output_scales) else: scale_idx = random.randint(0, self.num_output_scales - 1) # choose a side length in the selected scale if scale_idx == self.num_output_scales: scale_idx -= 1 side_length = self.bbox_large_list[-1] + random.randint(0, self.bbox_large_list[-1] * 0.5) else: side_length = self.bbox_small_list[scale_idx] + \ random.randint(0, self.bbox_large_list[scale_idx] - self.bbox_small_list[scale_idx]) target_scale = float(side_length) / longer_side # resize bboxes bboxes = bboxes * target_scale target_bbox = target_bbox * target_scale # determine the states of a bbox in each scale green = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] gray = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] valid = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] for i in range(num_bboxes): temp_bbox = bboxes[i, :] large_side = max(temp_bbox[2:]) for j in range(self.num_output_scales): if self.bbox_small_list[j] <= large_side <= self.bbox_large_list[j]: green[j][i] = True valid[j][i] = True elif self.bbox_small_gray_list[j] <= large_side <= self.bbox_large_gray_list[j]: gray[j][i] = True valid[j][i] = True # resize the original image im = cv2.resize(im, None, fx=target_scale, fy=target_scale) # crop the original image centered on the center of the selected bbox with vibration vibration_length = int(self.receptive_field_stride[scale_idx] / 2) offset_x = random.randint(-vibration_length, vibration_length) offset_y = random.randint(-vibration_length, vibration_length) crop_left = int(target_bbox[0] + target_bbox[2] / 2 + offset_x - self.net_input_width / 2.0) if crop_left < 0: crop_left_pad = -int(crop_left) crop_left = 0 else: crop_left_pad = 0 crop_top = int(target_bbox[1] + target_bbox[3] / 2 + offset_y - self.net_input_height / 2.0) if crop_top < 0: crop_top_pad = -int(crop_top) crop_top = 0 else: crop_top_pad = 0 crop_right = int(target_bbox[0] + target_bbox[2] / 2 + offset_x + self.net_input_width / 2.0) if crop_right > im.shape[1]: crop_right = im.shape[1] crop_bottom = int(target_bbox[1] + target_bbox[3] / 2 + offset_y + self.net_input_height / 2.0) if crop_bottom > im.shape[0]: crop_bottom = im.shape[0] im = im[crop_top:crop_bottom, crop_left:crop_right, :] im_input = numpy.zeros((self.net_input_height, self.net_input_width, 3), dtype=numpy.uint8) im_input[crop_top_pad:crop_top_pad + im.shape[0], crop_left_pad:crop_left_pad + im.shape[1], :] = im # image augmentation if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # display for debug------------------------------------------------- # im_show = im_input.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n, 0] - crop_left + crop_left_pad), int(bboxes[n, 1] - crop_top + crop_top_pad)), # (int(bboxes[n, 0] + bboxes[n, 2] - crop_left + crop_left_pad),int(bboxes[n, 1] + bboxes[n, 3] - crop_top + crop_top_pad)), # (255, 0, 255), 1) # cv2.imshow('im_show', im_show) # cv2.waitKey() im_input = im_input.astype(dtype=numpy.float32) im_input = im_input.transpose([2, 0, 1]) # construct GT feature maps for each scale label_list = [] mask_list = [] for i in range(self.num_output_scales): # compute the center coordinates of all RFs receptive_field_centers = numpy.array( [self.receptive_field_center_start[i] + w * self.receptive_field_stride[i] for w in range(self.feature_map_size_list[i])])
def __prepare_batch(self): im_batch = numpy.zeros((self.batch_size, self.num_image_channels, self.net_input_height, self.net_input_width), dtype=numpy.float32) label_batch_list = [numpy.zeros((self.batch_size, self.num_output_channels, v, v), dtype=numpy.float32) for v in self.feature_map_size_list] mask_batch_list = [numpy.zeros((self.batch_size, self.num_output_channels, v, v), dtype=numpy.float32) for v in self.feature_map_size_list] data_batch = DataBatch(self.mxnet_module) loop = 0 while loop < self.batch_size: if loop < self.num_neg_images_per_batch: # fill neg images first rand_idx = random.choice(self.negative_index) im, _, __ = self.data_provider.read_by_index(rand_idx) random_resize_factor = random.random() * (self.neg_image_resize_factor_interval[1] - self.neg_image_resize_factor_interval[0]) + self.neg_image_resize_factor_interval[0] im = cv2.resize(im, (0, 0), fy=random_resize_factor, fx=random_resize_factor) h_interval = im.shape[0] - self.net_input_height w_interval = im.shape[1] - self.net_input_width if h_interval >= 0: y_top = random.randint(0, h_interval) else: y_pad = int(-h_interval / 2) if w_interval >= 0: x_left = random.randint(0, w_interval) else: x_pad = int(-w_interval / 2) im_input = numpy.zeros((self.net_input_height, self.net_input_width, self.num_image_channels), dtype=numpy.uint8) if h_interval >= 0 and w_interval >= 0: im_input[:, :, :] = im[y_top:y_top + self.net_input_height, x_left:x_left + self.net_input_width, :] elif h_interval >= 0 and w_interval < 0: im_input[:, x_pad:x_pad + im.shape[1], :] = im[y_top:y_top + self.net_input_height, :, :] elif h_interval < 0 and w_interval >= 0: im_input[y_pad:y_pad + im.shape[0], :, :] = im[:, x_left:x_left + self.net_input_width, :] else: im_input[y_pad:y_pad + im.shape[0], x_pad:x_pad + im.shape[1], :] = im[:, :, :] # data augmentation if self.enable_horizon_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'h') if self.enable_vertical_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'v') if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # # display for debug------------------------------------------------- # cv2.imshow('im', im_pad.astype(dtype=numpy.uint8)) # cv2.waitKey() im_input = im_input.astype(numpy.float32) im_input = im_input.transpose([2, 0, 1]) im_batch[loop] = im_input for label_batch in label_batch_list: label_batch[loop, 1, :, :] = 1 for mask_batch in mask_batch_list: mask_batch[loop, 0:2, :, :] = 1 else: rand_idx = random.choice(self.positive_index) im, _, bboxes_org = self.data_provider.read_by_index(rand_idx) num_bboxes = bboxes_org.shape[0] bboxes = bboxes_org.copy() # data augmentation ---- if self.enable_horizon_flip and random.random() > 0.5: im = Augmentor.flip(im, 'h') bboxes[:, 0] = im.shape[1] - (bboxes[:, 0] + bboxes[:, 2]) if self.enable_vertical_flip and random.random() > 0.5: im = Augmentor.flip(im, 'v') bboxes[:, 1] = im.shape[0] - (bboxes[:, 1] + bboxes[:, 3]) # display for debug------------------------------------------- # im_show = im.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n,0]),int(bboxes[n,1])), (int(bboxes[n,0]+bboxes[n,2]),int(bboxes[n,1]+bboxes[n,3])), (255,255,0), 1) # cv2.imshow('im_show', im_show) # cv2.waitKey() # randomly select a bbox bbox_idx = random.randint(0, num_bboxes - 1) # randomly select a reasonable scale for the selected bbox (selection strategy may vary from task to task) target_bbox = bboxes[bbox_idx, :] longer_side = max(target_bbox[2:]) if longer_side <= self.bbox_small_list[0]: scale_idx = 0 elif longer_side <= self.bbox_small_list[1]: scale_idx = random.randint(0, 1) elif longer_side <= self.bbox_small_list[2]: scale_idx = random.randint(0, 2) else: if random.random() > 0.8: scale_idx = random.randint(0, self.num_output_scales) else: scale_idx = random.randint(0, self.num_output_scales - 1) scale_counter[scale_idx] += 1 # choose a side length in the selected scale if scale_idx == self.num_output_scales: scale_idx -= 1 side_length = self.bbox_large_list[-1] + random.randint(0, self.bbox_large_list[-1] * 0.5) else: side_length = self.bbox_small_list[scale_idx] + random.randint(0, self.bbox_large_list[scale_idx] - self.bbox_small_list[scale_idx]) target_scale = float(side_length) / longer_side # resize bboxes bboxes = bboxes * target_scale target_bbox = target_bbox * target_scale # determine the states of a bbox in each scale green = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] gray = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] valid = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] for i in range(num_bboxes): temp_bbox = bboxes[i, :] large_side = max(temp_bbox[2:]) for j in range(self.num_output_scales): if self.bbox_small_list[j] <= large_side <= self.bbox_large_list[j]: green[j][i] = True valid[j][i] = True elif self.bbox_small_gray_list[j] <= large_side <= self.bbox_large_gray_list[j]: gray[j][i] = True valid[j][i] = True # resize the original image im = cv2.resize(im, None, fx=target_scale, fy=target_scale) # crop the original image centered on the center of the selected bbox with vibration (it can be regarded as an augmentation) vibration_length = int(self.receptive_field_stride[scale_idx] / 2) offset_x = random.randint(-vibration_length, vibration_length) offset_y = random.randint(-vibration_length, vibration_length) crop_left = int(target_bbox[0] + target_bbox[2] / 2 + offset_x - self.net_input_width / 2.0) if crop_left < 0: crop_left_pad = -int(crop_left) crop_left = 0 else: crop_left_pad = 0 crop_top = int(target_bbox[1] + target_bbox[3] / 2 + offset_y - self.net_input_height / 2.0) if crop_top < 0: crop_top_pad = -int(crop_top) crop_top = 0 else: crop_top_pad = 0 crop_right = int(target_bbox[0] + target_bbox[2] / 2 + offset_x + self.net_input_width / 2.0) if crop_right > im.shape[1]: crop_right = im.shape[1] crop_bottom = int(target_bbox[1] + target_bbox[3] / 2 + offset_y + self.net_input_height / 2.0) if crop_bottom > im.shape[0]: crop_bottom = im.shape[0] im = im[crop_top:crop_bottom, crop_left:crop_right, :] im_input = numpy.zeros((self.net_input_height, self.net_input_width, 3), dtype=numpy.uint8) im_input[crop_top_pad:crop_top_pad + im.shape[0], crop_left_pad:crop_left_pad + im.shape[1], :] = im # image augmentation ---- if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # display for debug------------------------------------------------- # im_show = im_input.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n, 0] - crop_left + crop_left_pad), int(bboxes[n, 1] - crop_top + crop_top_pad)), # (int(bboxes[n, 0] + bboxes[n, 2] - crop_left + crop_left_pad),int(bboxes[n, 1] + bboxes[n, 3] - crop_top + crop_top_pad)), # (255, 0, 255), 1) # cv2.imshow('im_show', im_show) # cv2.waitKey() im_input = im_input.astype(dtype=numpy.float32) im_input = im_input.transpose([2, 0, 1]) # construct GT feature maps for each scale label_list = [] mask_list = [] for i in range(self.num_output_scales): # compute the center coordinates of all RFs receptive_field_centers = numpy.array( [self.receptive_field_center_start[i] + w * self.receptive_field_stride[i] for w in range(self.feature_map_size_list[i])]) shift_x = (self.net_input_width / 2.0 - target_bbox[2] / 2) - target_bbox[0] - offset_x shift_y = (self.net_input_height / 2.0 - target_bbox[3] / 2) - target_bbox[1] - offset_y temp_label = numpy.zeros((self.num_output_channels, self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.float32) temp_mask = numpy.zeros((self.num_output_channels, self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.float32) temp_label[1, :, :] = 1 temp_mask[0:2, :, :] = 1 score_map_green = numpy.zeros((self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.int32) score_map_gray = numpy.zeros((self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.int32) for j in range(num_bboxes): if not valid[i][j]: continue temp_bbox = bboxes[j, :] # skip the bbox that does not appear in the cropped area if temp_bbox[0] + temp_bbox[2] + shift_x <= 0 or temp_bbox[0] + shift_x >= self.net_input_width \ or temp_bbox[1] + temp_bbox[3] + shift_y <= 0 or temp_bbox[1] + shift_y >= self.net_input_height: continue temp_bbox_left_bound = temp_bbox[0] + shift_x temp_bbox_right_bound = temp_bbox[0] + temp_bbox[2] + shift_x temp_bbox_top_bound = temp_bbox[1] + shift_y temp_bbox_bottom_bound = temp_bbox[1] + temp_bbox[3] + shift_y left_RF_center_index = max(0, math.ceil((temp_bbox_left_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) right_RF_center_index = min(self.feature_map_size_list[i] - 1, math.floor((temp_bbox_right_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) top_RF_center_index = max(0, math.ceil((temp_bbox_top_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) bottom_RF_center_index = min(self.feature_map_size_list[i] - 1, math.floor((temp_bbox_bottom_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) # ignore the face with no RF centers inside if right_RF_center_index < left_RF_center_index or bottom_RF_center_index < top_RF_center_index: continue if gray[i][j]: score_map_gray[top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = 1 else: score_map_green[top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] += 1 x_centers = receptive_field_centers[left_RF_center_index:right_RF_center_index + 1] y_centers = receptive_field_centers[top_RF_center_index:bottom_RF_center_index + 1] x0_location_regression = (x_centers - temp_bbox_left_bound) / self.normalization_constant[i] y0_location_regression = (y_centers - temp_bbox_top_bound) / self.normalization_constant[i] x1_location_regression = (x_centers - temp_bbox_right_bound) / self.normalization_constant[i] y1_location_regression = (y_centers - temp_bbox_bottom_bound) / self.normalization_constant[i] temp_label[2, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(x0_location_regression, [bottom_RF_center_index - top_RF_center_index + 1, 1]) temp_label[3, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(y0_location_regression, [right_RF_center_index - left_RF_center_index + 1, 1]).T temp_label[4, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(x1_location_regression, [bottom_RF_center_index - top_RF_center_index + 1, 1]) temp_label[5, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(y1_location_regression, [right_RF_center_index - left_RF_center_index + 1, 1]).T score_gray_flag = numpy.logical_or(score_map_green > 1, score_map_gray > 0) location_green_flag = score_map_green == 1 temp_label[0, :, :][location_green_flag] = 1 temp_label[1, :, :][location_green_flag] = 0 for c in range(self.num_output_channels): if c == 0 or c == 1: temp_mask[c, :, :][score_gray_flag] = 0 continue # for bbox regression, only green area is available temp_mask[c, :, :][location_green_flag] = 1 # display for debug---------------------------------------------------------------- # temp_label_score_show = temp_label[0, :, :] * temp_mask[0, :, :] # temp_label_score_show = temp_label_score_show * 255 # cv2.imshow('temp_label_score_show', cv2.resize(temp_label_score_show.astype(dtype=numpy.uint8), (0, 0), fx=2, fy=2)) # cv2.waitKey() label_list.append(temp_label) mask_list.append(temp_mask) im_batch[loop] = im_input for n in range(self.num_output_scales): label_batch_list[n][loop] = label_list[n] mask_batch_list[n][loop] = mask_list[n] loop += 1 data_batch.append_data(im_batch) for n in range(self.num_output_scales): data_batch.append_label(mask_batch_list[n]) data_batch.append_label(label_batch_list[n]) return data_batch
def __prepare_batch(self): im_batch = numpy.zeros((self.batch_size, self.num_image_channels, self.net_input_height, self.net_input_width), dtype=numpy.float32) label_batch_list = [ numpy.zeros( ( self.batch_size, self.num_output_channels, # 6 v, v), dtype=numpy.float32) for v in self.feature_map_size_list ] # for i in range(len(label_batch_list)): # print('label_batch_list : ',i+1,')',label_batch_list[i].shape) mask_batch_list = [ numpy.zeros((self.batch_size, self.num_output_channels, v, v), dtype=numpy.float32) for v in self.feature_map_size_list ] data_batch = DataBatch(self.torch_module) loop = 0 while loop < self.batch_size: # 先获得足够的负样本,正负样本的比例为10:1 if loop < self.num_neg_images_per_batch: # fill neg images first # 随机选择一个负样本(图片中没有行人)下标 rand_idx = random.choice(self.negative_index) # 通过负样本(图片中没有行人)下标获得图片 im, _, __ = self.data_provider.read_by_index(rand_idx) # 获得一个书记缩放大小的因子random_resize_factor random_resize_factor = random.random() * ( self.neg_image_resize_factor_interval[1] - self.neg_image_resize_factor_interval[0] ) + self.neg_image_resize_factor_interval[0] # 把原图缩放到原来的random_resize_factor倍 im = cv2.resize(im, (0, 0), fy=random_resize_factor, fx=random_resize_factor) # 输入图像的高框都减去640(默认配置) # print(' self.net_input_height,self.net_input_width : ',self.net_input_height,self.net_input_width) h_interval = im.shape[0] - self.net_input_height w_interval = im.shape[1] - self.net_input_width # 如果图片的高能够大于640(默认配置)大小 if h_interval >= 0: y_top = random.randint(0, h_interval) else: # 如果图片的长小于640, y_pad = int(-h_interval / 2) # 在不足的地方使用0像素填充 if w_interval >= 0: # 如果图片的宽大于640 x_left = random.randint(0, w_interval) # 随机在图片左侧选择一个x值坐标 else: x_pad = int(-w_interval / 2) # 要是图片的宽小于640,使用0进行填补 # 输入图片初始化为0 im_input = numpy.zeros( (self.net_input_height, self.net_input_width, self.num_image_channels), dtype=numpy.uint8) # 对于该剪切的图片进行剪切,该填补的的像素使用默认0处理 if h_interval >= 0 and w_interval >= 0: im_input[:, :, :] = im[y_top:y_top + self.net_input_height, x_left:x_left + self.net_input_width, :] elif h_interval >= 0 and w_interval < 0: im_input[:, x_pad:x_pad + im.shape[1], :] = im[y_top:y_top + self.net_input_height, :, :] elif h_interval < 0 and w_interval >= 0: im_input[y_pad:y_pad + im.shape[0], :, :] = im[:, x_left:x_left + self.net_input_width, :] else: im_input[y_pad:y_pad + im.shape[0], x_pad:x_pad + im.shape[1], :] = im[:, :, :] # data augmentation 数据增强,进行垂直或者水平翻转 if self.enable_horizon_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'h') if self.enable_vertical_flip and random.random() > 0.5: im_input = Augmentor.flip(im_input, 'v') # 数据增强,对比度,模糊,亮度等等 if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # display for debug------------------------------------------------- # cv2.imshow('im', im_pad.astype(dtype=numpy.uint8)) # cv2.waitKey() # cv2.namedWindow('dataIter',0) # cv2.imshow('dataIter',im_input) # cv2.waitKey(0) # 根据 PyTorch 要求进行数据类型 及通道转换, im_input = im_input.astype(numpy.float32) im_input = im_input.transpose([2, 0, 1]) # 以上是对一张图片的处理,把获得的图像加载到im_batch im_batch[loop] = im_input for label_batch in label_batch_list: # 标记为负样本,box以及后面 # 输出的的6个通道,第1个通道标记为1,其余为0 label_batch[loop, 1, :, :] = 1 for mask_batch in mask_batch_list: # 输出的6个通道,第0和第1个标记为1,其余为0 mask_batch[loop, 0:2, :, :] = 1 else: # 随机获得一张正样本的图片,以及初始的bboxes rand_idx = random.choice(self.positive_index) im, _, bboxes_org = self.data_provider.read_by_index(rand_idx) # 获得boxes的个数,并且进行拷贝 num_bboxes = bboxes_org.shape[0] bboxes = bboxes_org.copy() # data augmentation ,如果进行了水平或者或者垂直翻转,则对应的boxes也要进行翻转 if self.enable_horizon_flip and random.random() > 0.5: im = Augmentor.flip(im, 'h') bboxes[:, 0] = im.shape[1] - (bboxes[:, 0] + bboxes[:, 2]) if self.enable_vertical_flip and random.random() > 0.5: im = Augmentor.flip(im, 'v') bboxes[:, 1] = im.shape[0] - (bboxes[:, 1] + bboxes[:, 3]) # display for debug------------------------------------------- # im_show = im.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n,0]),int(bboxes[n,1])), (int(bboxes[n,0]+bboxes[n,2]),int(bboxes[n,1]+bboxes[n,3])), (255,255,0), 1) # cv2.imshow('im_show', im_show) # cv2.waitKey() # randomly select a bbox,随机选择一个boxes,这个是为了保证,后续在进行随机剪切的时候,图片中至少存在一个box # 确保其为一个正样本 bbox_idx = random.randint(0, num_bboxes - 1) # randomly select a reasonable scale for the selected bbox (selection strategy may vary from task to task) # 根据boxes的尺寸大小,选择一个合理的缩放 target_bbox = bboxes[bbox_idx, :] # 获得随机选择boxes的长度 longer_side = max(target_bbox[2:]) if longer_side <= self.bbox_small_list[ 0]: # bbox_small_list : [10, 20, 40, 80, 160] scale_idx = 0 elif longer_side <= self.bbox_small_list[1]: scale_idx = random.randint(0, 1) # elif longer_side <= self.bbox_small_list[2]: # scale_idx = random.randint(0, 2) else: if random.random() > 0.9: scale_idx = random.randint(0, self.num_output_scales) else: scale_idx = random.randint(0, self.num_output_scales - 1) # choose a side length in the selected scale # 保证side_length在选择的尺度之间 if scale_idx == self.num_output_scales: scale_idx -= 1 side_length = self.bbox_large_list[-1] + random.randint( 0, self.bbox_large_list[-1] * 0.5) else: side_length = self.bbox_small_list[scale_idx] + \ random.randint(0, self.bbox_large_list[scale_idx] - self.bbox_small_list[scale_idx]) # 获得缩放比例 target_scale = float(side_length) / longer_side # resize bboxes # resize bboxes,对bboxes中的所有box以及target_bbox进行缩放。 bboxes = bboxes * target_scale target_bbox = target_bbox * target_scale # determine the states of a bbox in each scale # 确定每个尺寸的box对应的green(论文eRF区域),gray(RF-eRF,注意不全是)区域,valid[RF] green = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] gray = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] valid = [[False for i in range(num_bboxes)] for j in range(self.num_output_scales)] # 对每个box进行处理,num_output_scales默认为6 for i in range(num_bboxes): temp_bbox = bboxes[i, :] large_side = max(temp_bbox[2:]) for j in range(self.num_output_scales): # 对每个缩放的尺寸的特征图进行处理 ,判断中心点落在那个尺寸图对应的green,gray,valid if self.bbox_small_list[ j] <= large_side <= self.bbox_large_list[j]: green[j][i] = True valid[j][i] = True elif self.bbox_small_gray_list[ j] <= large_side <= self.bbox_large_gray_list[ j]: gray[j][i] = True valid[j][i] = True # resize the original image 把图片按照计算出来的缩放因子进行缩放 im = cv2.resize(im, None, fx=target_scale, fy=target_scale) # crop the original image centered on the center of the selected bbox with vibration # 绕着target_bbox的中心进行剪切,并且保证中心在特征图之内 vibration_length = int(self.receptive_field_stride[scale_idx] / 2) # 随机设置偏离box中心的偏移量 offset_x = random.randint(-vibration_length, vibration_length) offset_y = random.randint(-vibration_length, vibration_length) crop_left = int(target_bbox[0] + target_bbox[2] / 2 + offset_x - self.net_input_width / 2.0) if crop_left < 0: crop_left_pad = -int(crop_left) crop_left = 0 else: crop_left_pad = 0 crop_top = int(target_bbox[1] + target_bbox[3] / 2 + offset_y - self.net_input_height / 2.0) if crop_top < 0: crop_top_pad = -int(crop_top) crop_top = 0 else: crop_top_pad = 0 crop_right = int(target_bbox[0] + target_bbox[2] / 2 + offset_x + self.net_input_width / 2.0) if crop_right > im.shape[1]: crop_right = im.shape[1] crop_bottom = int(target_bbox[1] + target_bbox[3] / 2 + offset_y + self.net_input_height / 2.0) if crop_bottom > im.shape[0]: crop_bottom = im.shape[0] im = im[crop_top:crop_bottom, crop_left:crop_right, :] im_input = numpy.zeros( (self.net_input_height, self.net_input_width, 3), dtype=numpy.uint8) im_input[crop_top_pad:crop_top_pad + im.shape[0], crop_left_pad:crop_left_pad + im.shape[1], :] = im # image augmentation 数据增强 if random.random() > 0.5: random.shuffle(self.pixel_augmentor_func_list) for augmentor in self.pixel_augmentor_func_list: im_input = augmentor(im_input) # cv2.namedWindow('dataIter',0) # cv2.imshow('dataIter',im_input) # cv2.waitKey(0) # display for debug------------------------------------------------- # im_show = im_input.copy() # for n in range(num_bboxes): # cv2.rectangle(im_show, (int(bboxes[n, 0] - crop_left + crop_left_pad), int(bboxes[n, 1] - crop_top + crop_top_pad)), # (int(bboxes[n, 0] + bboxes[n, 2] - crop_left + crop_left_pad),int(bboxes[n, 1] + bboxes[n, 3] - crop_top + crop_top_pad)), # (255, 0, 255), 1) # cv2.namedWindow('im_show',0) # cv2.imshow('im_show', im_show) # cv2.waitKey() # # print('---------------------------------------------------------------------->>>> im_input .shape',im_input.shape) im_input = im_input.astype(dtype=numpy.float32) im_input = im_input.transpose([2, 0, 1]) # construct GT feature maps for each scale ,为每个尺寸的特征图构建map label_list = [] mask_list = [] # print('self.num_output_scales : ',self.num_output_scales)# 5 # print('self.net_input_width height : ',self.net_input_width,self.net_input_height)# 640,640 # print('self.num_output_channels : ',self.num_output_channels)# 6 # 分别对每个尺寸的feature maps进行处理 for i in range(self.num_output_scales): # 5 # receptive_field_stride = [4, 8, 16, 32, 64] # feature_map_size_list = [159, 79, 39, 19, 9] # receptive_field_center_start = [3, 7, 15, 31, 63] # compute the center coordinates of all RFs # 计算所有RFs对应的中心点 receptive_field_centers = numpy.array([ self.receptive_field_center_start[i] + w * self.receptive_field_stride[i] for w in range(self.feature_map_size_list[i]) ]) # print('receptive_field_centers : ',receptive_field_centers)# example : receptive_field_centers : [ 63 127 191 255 319 383 447 511 575] # print(i,') receptive_field_centers shape',receptive_field_centers.shape)# example : 9 # 求出box与剪切之后图片中心的偏移值 shift_x = (self.net_input_width / 2.0 - target_bbox[2] / 2) - target_bbox[0] - offset_x shift_y = (self.net_input_height / 2.0 - target_bbox[3] / 2) - target_bbox[1] - offset_y temp_label = numpy.zeros((self.num_output_channels, self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.float32) temp_mask = numpy.zeros((self.num_output_channels, self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.float32) temp_label[1, :, :] = 1 temp_mask[0:2, :, :] = 1 # 用来保存计算出来特征图对应的eRF区域为人脸的概率值 score_map_green = numpy.zeros( (self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.int32) # 用来保存特征图的gray=RF-eRF区域为人脸的概率值 score_map_gray = numpy.zeros( (self.feature_map_size_list[i], self.feature_map_size_list[i]), dtype=numpy.int32) for j in range(num_bboxes): # 找到合适尺寸的,该预测box中心点的落在该尺寸的特征图上 if not valid[i][j]: continue # 获得该 bbox temp_bbox = bboxes[j, :] # skip the bbox that does not appear in the cropped area # 如果box已经不在剪切之后图片的区域,则跳过对该box的处理 if temp_bbox[0] + temp_bbox[2] + shift_x <= 0 or temp_bbox[0] + shift_x >= self.net_input_width \ or temp_bbox[1] + temp_bbox[3] + shift_y <= 0 or temp_bbox[1] + shift_y >= self.net_input_height: continue # box最左边,最右边,最上边,最下边 temp_bbox_left_bound = temp_bbox[0] + shift_x temp_bbox_right_bound = temp_bbox[0] + temp_bbox[ 2] + shift_x temp_bbox_top_bound = temp_bbox[1] + shift_y temp_bbox_bottom_bound = temp_bbox[1] + temp_bbox[ 3] + shift_y # 把原图中的box映射到特征图上,得到特征图上的box,也就是RF对应在特征图像素的下标 left_RF_center_index = max( 0, math.ceil((temp_bbox_left_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) right_RF_center_index = min( self.feature_map_size_list[i] - 1, math.floor((temp_bbox_right_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) top_RF_center_index = max( 0, math.ceil((temp_bbox_top_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) bottom_RF_center_index = min( self.feature_map_size_list[i] - 1, math.floor((temp_bbox_bottom_bound - self.receptive_field_center_start[i]) / self.receptive_field_stride[i])) # print('left_RF_center_index,right_RF_center_index,top_RF_center_index,bottom_RF_center_index: ',left_RF_center_index,right_RF_center_index,top_RF_center_index,bottom_RF_center_index) # ignore the face with no RF centers inside 忽略掉没有RF中心的脸 if right_RF_center_index < left_RF_center_index or bottom_RF_center_index < top_RF_center_index: continue # 如果存在灰度区域,则存在gray区域,即RF-eRF。对gray进行赋值,全部为1 if gray[i][j]: score_map_gray[ top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = 1 # 如果不在灰色区域,原图中的box映射到box的eRF上面 else: score_map_green[ top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] += 1 # 根据下标获得中心,并且进行了正则化 x_centers = receptive_field_centers[ left_RF_center_index:right_RF_center_index + 1] y_centers = receptive_field_centers[ top_RF_center_index:bottom_RF_center_index + 1] # 这里是机器要学习的东西,不是坐标点,而是距离中心坐标的的偏移量 x0_location_regression = ( x_centers - temp_bbox_left_bound ) / self.normalization_constant[i] y0_location_regression = ( y_centers - temp_bbox_top_bound ) / self.normalization_constant[i] x1_location_regression = ( x_centers - temp_bbox_right_bound ) / self.normalization_constant[i] y1_location_regression = ( y_centers - temp_bbox_bottom_bound ) / self.normalization_constant[i] # print(' x_centers:{}, y_centers:{}, x0:{}, y0:{}, x1:{}, y1:{} '.format(x_centers,y_centers,x0_location_regression,y0_location_regression,x1_location_regression,y1_location_regression)) # 对temp_label进行复制,temp_label是特征图的大小 temp_label[2, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(x0_location_regression, [bottom_RF_center_index - top_RF_center_index + 1, 1]) # print('temp_label 2 : ',temp_label[2, top_RF_center_index:bottom_RF_center_index + 1,left_RF_center_index:right_RF_center_index + 1]) temp_label[3, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(y0_location_regression, [right_RF_center_index - left_RF_center_index + 1, 1]).T temp_label[4, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(x1_location_regression, [bottom_RF_center_index - top_RF_center_index + 1, 1]) temp_label[5, top_RF_center_index:bottom_RF_center_index + 1, left_RF_center_index:right_RF_center_index + 1] = \ numpy.tile(y1_location_regression, [right_RF_center_index - left_RF_center_index + 1, 1]).T # 举例: score_gray_flag[59,59], 对gray区域进行标记,离开eRF=green越远,score越低 score_gray_flag = numpy.logical_or(score_map_green > 1, score_map_gray > 0) location_green_flag = score_map_green == 1 # 标记为这是一个正样本 temp_label[0, :, :][location_green_flag] = 1 temp_label[1, :, :][location_green_flag] = 0 # 对第0个和第一个通道,不进行mask操作,因为其是用来标记正负样本的,其余的通道可以看作同样的mask for c in range(self.num_output_channels): if c == 0 or c == 1: temp_mask[c, :, :][score_gray_flag] = 0 continue # for bbox regression, only green area is available temp_mask[c, :, :][location_green_flag] = 1 # display for debug---------------------------------------------------------------- # temp_label_score_show = temp_label[0, :, :] * temp_mask[0, :, :] # temp_label_score_show = temp_label_score_show * 255 # cv2.imshow('temp_label_score_show',0) # cv2.imshow('temp_label_score_show', cv2.resize(temp_label_score_show.astype(dtype=numpy.uint8), (0, 0), fx=2, fy=2)) # cv2.waitKey() label_list.append(temp_label) mask_list.append(temp_mask) im_batch[loop] = im_input for n in range(self.num_output_scales): label_batch_list[n][loop] = label_list[n] mask_batch_list[n][loop] = mask_list[n] loop += 1 data_batch.append_data(im_batch) for n in range(self.num_output_scales): data_batch.append_label(mask_batch_list[n]) data_batch.append_label(label_batch_list[n]) return data_batch