def bbox_reg(self, boxes, box_deltas, im):
    """Decode regression deltas against RoIs and clip the result to the image.

    Args:
        boxes: object exposing ``.data``; column 0 is dropped before decoding
            (presumably a batch index -- TODO confirm against the caller).
        box_deltas: object exposing ``.data`` holding the deltas handed to
            ``bbox_transform_inv``.
        im: image tensor; the last two entries of ``im.size()`` are used as
            the clipping bounds.

    Returns:
        The clipped boxes wrapped by ``_tovar``.
    """
    roi_tensor = boxes.data[:, 1:]
    delta_tensor = box_deltas.data
    if CUDA_AVAILABLE:
        # Copy to host memory before converting to numpy.
        roi_tensor = roi_tensor.cpu()
        delta_tensor = delta_tensor.cpu()

    decoded = bbox_transform_inv(roi_tensor.numpy(), delta_tensor.numpy())
    return _tovar(clip_boxes(decoded, im.size()[-2:]))
def get_roi_boxes(self, anchors, rpn_map, rpn_bbox_deltas, im):
    """Turn raw RPN outputs into scored, NMS-filtered region proposals.

    Steps: decode the deltas against ``anchors``, clip to the image, drop
    boxes rejected by ``filter_boxes``, keep the ``self.pre_nms_topN`` best
    scoring boxes, run ``nms`` and keep the first ``self.post_nms_topN``.

    Args:
        anchors: anchor boxes passed to ``bbox_transform_inv``.
        rpn_map: object exposing ``.data``; channels from
            ``self._num_anchors`` onward are read as the foreground scores.
        rpn_bbox_deltas: object exposing ``.data``; permuted with
            ``(0, 2, 3, 1)`` and flattened to rows of four values.
        im: image tensor; the last two entries of ``im.size()`` are the
            clipping bounds.

    Returns:
        Tuple ``(proposals, scores)`` of numpy arrays restricted to the
        boxes that survived NMS.
    """
    # TODO fix this!!! The image info is hard-coded; only entry 2 (used as a
    # scale factor for min_size below) is actually read.
    im_info = (100, 100, 1)

    def _to_numpy(tensor):
        # Only go through .cpu() when the module is configured for CUDA.
        return tensor.cpu().numpy() if CUDA_AVAILABLE else tensor.numpy()

    bbox_deltas = _to_numpy(rpn_bbox_deltas.data)
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # The first _num_anchors channels are background probabilities, the
    # remaining ones are the foreground probabilities we want.
    scores = _to_numpy(rpn_map.data[:, self._num_anchors:, :, :])
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # 1. anchors + deltas -> proposals, 2. clip to the image.
    proposals = clip_boxes(bbox_transform_inv(anchors, bbox_deltas),
                           im.size()[-2:])

    # 3. drop boxes whose height or width is below the (scaled) threshold.
    big_enough = filter_boxes(proposals, self.min_size * im_info[2])
    proposals = proposals[big_enough, :]
    scores = scores[big_enough]

    # 4./5. rank by score, best first, and keep the pre-NMS top N.
    ranking = scores.ravel().argsort()[::-1]
    if self.pre_nms_topN > 0:
        ranking = ranking[:self.pre_nms_topN]
    proposals = proposals[ranking, :]
    scores = scores[ranking]

    # 6.-8. NMS, then keep the post-NMS top N.
    survivors = nms(np.hstack((proposals, scores)), self.nms_thresh)
    if self.post_nms_topN > 0:
        survivors = survivors[:self.post_nms_topN]

    return proposals[survivors, :], scores[survivors]
def get_proposal_boxes(self, rpn_bbox_deltas, rpn_cls_probs):
    """Apply RPN bbox deltas to the anchor boxes to get region proposals.

    The proposals are optionally filtered to those inside the image
    (training only), cut to the ``self.pre_nms_limit`` highest scoring
    boxes, and passed through ``non_max_suppression`` with
    ``self.nms_thresh`` and ``self.post_nms_limit``.

    Arguments:
        rpn_bbox_deltas (Tensor) : (9*fH*fW, 4)
        rpn_cls_probs (Tensor) : (9*fH*fW, 2); column 1 is read as the
            positive (foreground) score.

    Return:
        proposals_boxes (Ndarray) : ( # proposal boxes, 4)
        scores (Ndarray) : ( # proposal boxes, )
        Both are empty lists when no proposal is left before NMS.
    """
    all_anchor_boxes = self.all_anchor_boxes
    nms_thresh = self.nms_thresh  # prob thresh
    pre_nms_limit = self.pre_nms_limit
    post_nms_limit = self.post_nms_limit  # eval with different numbers at test

    rpn_bbox_deltas = rpn_bbox_deltas.data.cpu().numpy()
    pos_score = rpn_cls_probs.data.cpu().numpy()[:, 1]

    # 1. Convert anchors into proposals via bbox transformation.
    proposal_boxes = bbox_transform_inv(all_anchor_boxes, rpn_bbox_deltas)

    height, width = self.feature_map_dim[-2:]

    # 2. Ignore out-of-bounds proposals during training. The image extent is
    #    taken as 16x the feature-map size.
    if not self.test:
        indices = filter_cross_boundary_boxes(
            proposal_boxes, (height * 16, width * 16))
        proposal_boxes = proposal_boxes[indices]
        pos_score = pos_score[indices]

    # Nothing left to rank or suppress.
    if len(proposal_boxes) == 0:
        return [], []

    # 3. Pre-NMS limit. argsort is ascending, so reverse it first; otherwise
    #    the slice would keep the *lowest* scoring proposals.
    limit = np.argsort(pos_score)[::-1][:pre_nms_limit]
    proposal_boxes = proposal_boxes[limit]
    pos_score = pos_score[limit]

    # 4. Apply NMS (e.g. threshold = 0.7).
    proposal_boxes, scores = non_max_suppression(
        proposal_boxes, pos_score, nms_thresh, post_nms_limit)
    return proposal_boxes, scores
def generate_proposals(data):
    """Run the backbone and RPN on one image and return the top-N proposals.

    Relies on module-level ``CNN_model_cut``, ``RPN_model``, ``image_size``,
    ``ANCHOR_RATIOS``, ``ANCHOR_SCALES``, ``anchor_number`` and ``N``.

    Args:
        data: array with at least three dimensions; it is reshaped to a
            batch of shape ``(-1, data.shape[0], data.shape[1], data.shape[2])``.

    Returns:
        Tuple ``(proposals, anchor_probs)``: the ``N`` highest scoring boxes
        (clipped to ``(data.shape[0], data.shape[1])``) and their scores as a
        column vector.
    """
    # Backbone features, padded by one cell on each spatial side.
    batch = data.reshape(-1, data.shape[0], data.shape[1], data.shape[2])
    feature_map = CNN_model_cut.predict(batch)
    padded_fcmap = np.pad(feature_map, ((0, 0), (1, 1), (1, 1), (0, 0)),
                          mode='constant')

    # RPN heads: element 0 holds the scores, element 1 the box regressions.
    rpn_out = RPN_model.predict(padded_fcmap)
    anchor_probs = rpn_out[0].reshape((-1, 1))
    anchor_targets = rpn_out[1].reshape((-1, 4))

    # Anchor grid; the feature map is treated as square (side = shape[1]).
    feature_size = feature_map.shape[1]
    number_feature_points = feature_size * feature_size
    feature_stride = int(image_size / feature_size)
    base_anchors = generate_anchors(feature_stride, feature_stride,
                                    ratios=ANCHOR_RATIOS,
                                    scales=ANCHOR_SCALES)
    offsets = np.arange(0, feature_size) * feature_stride
    grid_x, grid_y = np.meshgrid(offsets, offsets)
    shifts = np.vstack((grid_x.ravel(), grid_y.ravel(),
                        grid_x.ravel(), grid_y.ravel())).transpose()
    per_cell_shifts = shifts.reshape(
        (1, number_feature_points, 4)).transpose((1, 0, 2))
    anchors = base_anchors.reshape((1, anchor_number, 4)) + per_cell_shifts
    anchors = anchors.reshape((-1, 4))

    # Decode the RPN regressions and clip the boxes to the image.
    proposals = bbox_transform_inv(anchors, anchor_targets)
    proposals = clip_boxes(proposals, (data.shape[0], data.shape[1]))

    # Keep the N best-scoring proposals, best first.
    best = anchor_probs.ravel().argsort()[::-1][0:N]
    proposals = proposals[best, :]
    anchor_probs = anchor_probs[best]

    # Drop the references to the large intermediates before returning.
    del anchors, rpn_out, feature_map, padded_fcmap
    return proposals, anchor_probs
def define_bbox(pred_bbox_delta, ANCHOR_BOX):
    """Decode predicted deltas against anchors and clamp boxes to the image.

    Args:
        pred_bbox_delta: tensor whose dimension 2 has four entries, read as
            ``(delta_x, delta_y, delta_w, delta_h)``.
        ANCHOR_BOX: 2-D tensor whose columns 0..3 are read as the anchor
            ``x, y, w, h`` (x/y are used as box centres below).

    Returns:
        Tensor built by stacking the output of ``util.bbox_transform_inv`` on
        the clamped corner coordinates and swapping dimensions 1 and 2.
    """
    delta_x, delta_y, delta_w, delta_h = torch.unbind(pred_bbox_delta, dim=2)

    # set_anchors(mc, scale)
    anchor_x, anchor_y, anchor_w, anchor_h = (
        ANCHOR_BOX[:, col] for col in range(4))

    center_x = anchor_x + delta_x * anchor_w
    center_y = anchor_y + delta_y * anchor_h
    # NOTE: plain exp without clamping; a util.safe_exp(..., EXP_THRESH)
    # variant existed here as commented-out code.
    width = anchor_w * torch.exp(delta_w)
    height = anchor_h * torch.exp(delta_h)

    # ok, this needs to be done on CPU side
    corners = util.bbox_transform([center_x, center_y, width, height])
    xmins, ymins, xmaxs, ymaxs = (
        corner.cpu().detach().numpy() for corner in corners)

    # The max x position is IMAGE_WIDTH - 1 since we use zero-based pixels.
    # Same for y.
    last_col = IMAGE_WIDTH-1.0
    last_row = IMAGE_HEIGHT-1.0
    xmins = np.minimum(np.maximum(0.0, xmins), last_col)
    ymins = np.minimum(np.maximum(0.0, ymins), last_row)
    xmaxs = np.maximum(np.minimum(last_col, xmaxs), 0.0)
    ymaxs = np.maximum(np.minimum(last_row, ymaxs), 0.0)

    # this is not needed for hardware implementation
    clamped = torch.FloatTensor([xmins, ymins, xmaxs, ymaxs])
    restacked = torch.stack(util.bbox_transform_inv(clamped))
    return torch.transpose(restacked, 1, 2)
def get_predictions(self, img, ignore_background=True):
    """Run the network on ``img`` and return decoded boxes with class ids.

    :param img: input forwarded unchanged to ``self.forward``.
    :param ignore_background: when true, rows whose class id is 0 are
        dropped from the result.
    :return: predicted_targets: (N, x1, y1, x2, y2, C) -- the boxes decoded
        by ``bbox_transform_inv`` with the arg-max class id appended as the
        last column.
    """
    # Only the detection-head outputs are used; the RPN outputs are ignored.
    _rpn_probs, _rpn_deltas, pred_label, pred_bbox_deltas = self.forward(img)
    proposals, _proposal_scores = self.get_rpn_proposals()

    # Arg-max class per proposal.
    pred_class = pred_label.max(dim=1)[1].cpu().long()
    class_ids = pred_class.data

    # Pick, for every proposal, the deltas of its predicted class.
    all_deltas = pred_bbox_deltas.data.cpu()
    row_idx = torch.arange(0, len(pred_class)).long()
    top_class_deltas = all_deltas[row_idx, class_ids.long()].numpy()

    pred_boxes = bbox_transform_inv(proposals, top_class_deltas)
    class_column = class_ids.cpu().numpy().reshape(-1, 1)
    pred_targets = np.hstack([pred_boxes, class_column])

    if not ignore_background:
        return pred_targets
    return pred_targets[pred_targets[:, -1] != 0]
def bbox_reg(self, boxes, box_deltas, im):
    """Decode regression deltas against RoIs and clip the result to the image.

    Args:
        boxes: object exposing ``.data``; column 0 is dropped before decoding
            (presumably a batch index -- TODO confirm against the caller).
        box_deltas: object exposing ``.data`` holding the deltas handed to
            ``bbox_transform_inv``.
        im: image tensor; the last two entries of ``im.size()`` are used as
            the clipping bounds.

    Returns:
        The clipped boxes wrapped by ``to_var``.
    """
    # .numpy() only works on host tensors; .cpu() is a no-op for tensors that
    # already live on the CPU, so this also accepts CUDA inputs.
    boxes = boxes.data[:, 1:].cpu().numpy()
    box_deltas = box_deltas.data.cpu().numpy()

    pred_boxes = bbox_transform_inv(boxes, box_deltas)
    pred_boxes = clip_boxes(pred_boxes, im.size()[-2:])
    return to_var(pred_boxes)
def forward(self, input):
    """Build per-image RoIs from RPN (Feature Pyramid Network) outputs.

    Parameters
    ----------
    input - list; this method reads:
        input[0]: class probabilities; ``[:, :, 1]`` is taken as the
            foreground score (batch_size x num_rois);
        input[1]: box regression deltas (batch_size x num_rois x 4);
        input[2]: im_info, forwarded to ``clip_boxes``.

    Returns
    ----------
    Tensor of shape (batch_size, self.post_nms_topN, 5); each row is
    [batch_index, x1, y1, x2, y2], zero-padded after the last kept proposal.

    Algorithm:
        generate anchors for all pyramid levels
        apply predicted bbox deltas to the anchors
        clip predicted boxes to image
        sort all (proposal, score) pairs by score from highest to lowest
        take top pre_nms_topN proposals before NMS
        apply NMS to the remaining proposals
        take post_nms_topN proposals after NMS
    """
    scores = input[0][:, :, 1]  # batch_size x num_rois
    bbox_deltas = input[1]      # batch_size x num_rois x 4
    im_info = input[2]

    # NOTE(review): feat_shapes, batch_size and pre_nms_topN are not assigned
    # anywhere in this method; they have to come from an enclosing/module
    # scope -- confirm they are actually defined there.
    anchor_array = generate_anchors_all_pyramids(
        self.fpn_scales, self.anchor_ratios, feat_shapes,
        self.feat_strides, self.fpn_anchor_stride)
    anchors = torch.from_numpy(anchor_array).type_as(scores)
    num_anchors = anchors.size(0)
    anchors = anchors.view(1, num_anchors, 4).expand(
        batch_size, num_anchors, 4)

    # 1. anchors + deltas -> proposals, 2. clip them to the image.
    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)
    proposals = clip_boxes(proposals, im_info, batch_size)

    # Min-size filtering (step 3) is disabled: every proposal is kept.
    # Descending sort along the proposal axis.
    _, order = torch.sort(scores, 1, True)

    output = scores.new(batch_size, self.post_nms_topN, 5).zero_()
    for i in range(batch_size):
        # 4./5. best-first ranking, truncated to the pre-NMS top N.
        ranked = order[i]
        if 0 < pre_nms_topN < scores.numel():
            ranked = ranked[:pre_nms_topN]
        boxes_i = proposals[i][ranked, :]
        scores_i = scores[i][ranked].view(-1, 1)

        # 6.-8. NMS, then keep the post-NMS top N.
        kept = nms(boxes_i, scores_i, self.rpn_nms_thresh).long().view(-1)
        if self.post_nms_topN > 0:
            kept = kept[:self.post_nms_topN]
        boxes_i = boxes_i[kept, :]
        scores_i = scores_i[kept, :]

        # Column 0 carries the batch index; unused rows stay zero.
        num_proposal = boxes_i.size(0)
        output[i, :, 0] = i
        output[i, :num_proposal, 1:] = boxes_i

    return output


def backward(self, top, propagate_down, bottom):
    """No-op: this layer does not propagate gradients."""
    pass


def reshape(self, bottom, top):
    """No-op: reshaping happens during the call to forward."""
    pass


def _filter_boxes(self, boxes, min_size):
    """Return a mask of boxes whose width and height are both >= min_size.

    ``boxes`` is indexed as ``boxes[:, :, k]`` with k = 0..3 read as
    x1, y1, x2, y2; sizes are computed inclusively (``+ 1``).
    """
    widths = boxes[:, :, 2] - boxes[:, :, 0] + 1
    heights = boxes[:, :, 3] - boxes[:, :, 1] + 1
    return (widths >= min_size) & (heights >= min_size)
def produce_batch(feature_map, gt_boxes, h_w=None, category=None):
    """Build one detection-head training batch from a feature map.

    Proposals are produced by the module-level ``rpn_model``, filtered,
    ranked, passed through NMS, merged with the ground-truth boxes and then
    sampled into foreground/background RoIs.

    Args:
        feature_map: array whose axes 1 and 2 are the feature height/width.
        gt_boxes: ground-truth boxes; appended to the proposals and used for
            the overlap computation.
        h_w: indexable ``(height, width)`` of the image. It is indexed
            unconditionally, so the ``None`` default is not usable.
        category: index written into the 200-slot per-RoI target arrays.

    Returns:
        Tuple ``(rois, batch_box, batch_categories)``: the sampled RoIs, the
        regression targets flattened to ``(rois_num, 800)`` and the category
        indicator flattened to ``(rois_num, 200)``.
    """
    fmap_shape = np.shape(feature_map)
    height = fmap_shape[1]
    width = fmap_shape[2]
    num_feature_map = width * height

    w_stride = h_w[1] / width
    h_stride = h_w[0] / height

    # Base anchors are defined relative to one tile
    # (0, 0, w_stride-1, h_stride-1); anchors_num of them per tile.
    base_anchors = generate_anchors(w_stride, h_stride)
    grid_x, grid_y = np.meshgrid(np.arange(0, width) * w_stride,
                                 np.arange(0, height) * h_stride)
    shifts = np.vstack((grid_x.ravel(), grid_y.ravel(),
                        grid_x.ravel(), grid_y.ravel())).transpose()
    per_tile_shifts = shifts.reshape(
        (1, num_feature_map, 4)).transpose((1, 0, 2))
    all_anchors = base_anchors.reshape((1, anchors_num, 4)) + per_tile_shifts
    all_anchors = all_anchors.reshape((num_feature_map * anchors_num, 4))

    # Query the trained RPN to obtain scores and deltas.
    res = rpn_model.query_cnn(feature_map)
    scores = res[0].reshape(-1, 1)
    deltas = np.reshape(res[1], (-1, 4))

    # Turn the deltas into box coordinates and clip them to the image.
    proposals = clip_boxes(bbox_transform_inv(all_anchors, deltas),
                           (h_w[0], h_w[1]))

    # Remove small boxes (the original comment cites a 40 pixel threshold).
    big_enough = filter_boxes(proposals, small_box_threshold)
    proposals = proposals[big_enough, :]
    scores = scores[big_enough]

    # Sort by score, best first, and only keep the top 6000.
    pre_nms_topN = 6000
    ranking = scores.ravel().argsort()[::-1][:pre_nms_topN]
    proposals = proposals[ranking, :]
    scores = scores[ranking]

    # Apply NMS to those, then keep the top 300.
    post_nms_topN = 300
    survivors = py_cpu_nms(np.hstack((proposals, scores)), 0.7)
    survivors = survivors[:post_nms_topN]
    proposals = proposals[survivors, :]
    scores = scores[survivors]

    # Add the ground-truth boxes to the proposals as well.
    proposals = np.vstack((proposals, gt_boxes))

    # Overlap of every proposal with every ground-truth box.
    overlaps = bbox_overlaps(proposals, gt_boxes)
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)

    # Foreground: sampled without replacement, capped at BATCH * FG_FRAC.
    fg_inds = np.where(max_overlaps >= FG_THRESH)[0]
    fg_rois_per_this_image = min(int(BATCH * FG_FRAC), fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image,
                             replace=False)

    # Background: overlap in [BG_THRESH_LO, BG_THRESH_HI); fills the rest of
    # the batch, also sampled without replacement.
    in_bg_range = ((max_overlaps < BG_THRESH_HI) &
                   (max_overlaps >= BG_THRESH_LO))
    bg_inds = np.where(in_bg_range)[0]
    bg_rois_per_this_image = min(BATCH - fg_rois_per_this_image,
                                 bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image,
                             replace=False)

    # Foreground indices first, then background.
    keep_inds = np.append(fg_inds, bg_inds)
    rois = proposals[keep_inds]
    gt_rois = gt_boxes[gt_assignment[keep_inds]]
    targets = bbox_transform(rois, gt_rois)

    # Regression targets and category indicator go into slot `category`.
    rois_num = targets.shape[0]
    batch_box = np.zeros((rois_num, 200, 4))
    batch_categories = np.zeros((rois_num, 200, 1))
    for row in range(rois_num):
        batch_box[row, category] = targets[row]
        batch_categories[row, category] = 1
    batch_box = np.reshape(batch_box, (rois_num, -1))
    batch_categories = np.reshape(batch_categories, (rois_num, -1))

    return rois, batch_box, batch_categories
imdb = eyelevel5k( '/media/zehao/WD/Dataset/processed/car_dataset/Rendered/eyelevel5K/images', '/media/zehao/WD/Dataset/processed/car_dataset/Rendered/eyelevel5K/annotations' ) net = caffe.Net(network_proto_path, network_model_path, caffe.TEST) transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) transformer.set_input_scale('data', 0.1) transformer.set_mean('data', np.array([104, 117, 123])) transformer.set_transpose('data', (2, 0, 1)) while True: # Generate test images img, bboxes = imdb.read() jittered_bboxes = bbox_jitter(bboxes[0], 0.1, 3) gt_bbox = bbox_transform_inv(bboxes[0]) j_box = bbox_transform_inv(jittered_bboxes[0]) test_img = img[j_box[1]:j_box[1] + j_box[3], j_box[0]:j_box[0] + j_box[2], :] test_img = caffe.io.resize(test_img, [48, 48, 3]) offset = net.forward_all( data=np.asarray([transformer.preprocess('data', test_img)])) print offset['conv6'][0, :] print j_box out_box = shift(j_box, offset['conv6'][0, :]) cv2.rectangle(img, (int(out_box[0]), int(out_box[1])), (int(out_box[2]), int(out_box[3])), (0, 255, 0), 1) cv2.rectangle(img, (int(j_box[0]), int(j_box[1])), (int(j_box[2]), int(j_box[3])), (255, 0, 0), 1) cv2.rectangle(img, (int(gt_bbox[0]), int(gt_bbox[1])),
#restorer = tf.train.Saver(variables_to_restore) #restorer.restore(sess, MODEL_CKPT) #lr, train_opt = construct_graph(net, sess) saver =tf.train.Saver() saver.restore(sess,"./checkpoint.ckpt") #init = tf.global_variables_initializer() #sess.run(init) for i in range(0,150): blob = loader.fetch() roi_score, rois,rpn_cls_prob,cls_pred,bbox_pred = net.test_image(sess,blob["data"],blob["im_info"]) #roi_score, rois, rpn_cls_prob = net.test_image_train(sess,blob["data"],blob["im_info"],blob['gt_boxes']) index = np.where(cls_pred == 1)[0] print("roi_score_num : "+str(bbox_pred.shape[0])+" roi_index_num : "+str(index.shape[0]) ) #print(rois[index][:,1:5]) bbox = bbox_transform_inv(rois[:,1:5],bbox_pred) print(bbox) print(bbox.shape) bbox = bbox[index] print(bbox.shape) #overlaps = bbox_overlaps(rois[index][:,1:5], blob["gt_boxes"]) print("bbox_overlaps debug") #print(overlaps[np.where(overlaps>threshold_overlaps)]) #print(rois[np.where(overlaps>threshold_overlaps)[0]]) #high_prob_roi = rois[index][np.where(overlaps>threshold_overlaps)[0]] img = blob["pil_im"] brush = ImageDraw.Draw(img)
import os.path
import cv2
from dataset.eyelevel5K import eyelevel5k
from utils import bbox_transform_inv, bbox_jitter, cal_offset

# Output directories for the generated offset labels and cropped images.
LABEL_OUT_PATH = '/media/zehao/Local2/car_regression_data/labels'
IMAGE_OUT_PATH = '/media/zehao/Local2/car_regression_data/images'

if __name__ == '__main__':
    # For every image: jitter the first ground-truth box, save each jittered
    # crop as <i>_<j>.jpg and the offset computed by cal_offset(gt, jittered)
    # as <i>_<j>.txt.
    imdb = eyelevel5k('/media/zehao/WD/Dataset/processed/car_dataset/Rendered/eyelevel5K/images',
                      '/media/zehao/WD/Dataset/processed/car_dataset/Rendered/eyelevel5K/annotations')
    for i in range(imdb.num_images):
        img, bboxes = imdb.read()
        jittered_bboxes = bbox_jitter(img, bboxes[0], 0.2, 15)
        gt_bbox = bbox_transform_inv(bboxes[0])
        for j, j_bbox in enumerate(jittered_bboxes):
            j_bbox = bbox_transform_inv(j_bbox)
            # The transformed box is indexed as (x1, y1, x2, y2) for the crop.
            j_img = img[j_bbox[1]:j_bbox[3], j_bbox[0]:j_bbox[2], :]

            sample_name = str(i)+'_'+str(j)
            image_path = os.path.join(IMAGE_OUT_PATH, sample_name)+'.jpg'
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(image_path)
            cv2.imwrite(image_path, j_img)

            offset = cal_offset(gt_bbox, j_bbox)
            label_path = os.path.join(LABEL_OUT_PATH, sample_name+'.txt')
            with open(label_path, 'w') as f:
                # Four space-separated offset values.
                f.write(' '.join(str(offset[k]) for k in range(4)))
def produce_batch(filepath, gt_boxes, h_w, category):
    """Build one detection-head training batch from an image file.

    The image is loaded, rescaled by the module-level ``scale`` and fed to
    ``pretrained_model``; proposals from ``rpn_model`` are then filtered,
    ranked, passed through NMS, merged with the ground-truth boxes and
    sampled into foreground/background RoIs.

    Args:
        filepath: path handed to ``load_img``.
        gt_boxes: ground-truth boxes; appended to the proposals and used for
            the overlap computation.
        h_w: indexable ``(height, width)`` used for the strides and as the
            clipping bounds.
        category: index written into the 200-slot per-RoI target arrays.

    Returns:
        Tuple ``(rois, batch_box, batch_categories)``: the sampled RoIs, the
        regression targets flattened to ``(rois_num, 800)`` and the category
        indicator flattened to ``(rois_num, 200)``.
    """
    # Load and rescale the image (scale[0] applies to rows, scale[1] to
    # columns).
    image = load_img(filepath)
    src_shape = np.shape(image)
    img_width = src_shape[1] * scale[1]
    img_height = src_shape[0] * scale[0]
    image = image.resize((int(img_width), int(img_height)))

    # Feed the image to the pretrained model to get the feature map.
    net_input = np.expand_dims(img_to_array(image), axis=0)
    feature_map = pretrained_model.predict(net_input)
    fmap_shape = np.shape(feature_map)
    height = fmap_shape[1]
    width = fmap_shape[2]
    num_feature_map = width * height

    # Output strides along width and height.
    w_stride = h_w[1] / width
    h_stride = h_w[0] / height

    # Base anchors are 9 anchors relative to one tile
    # (0, 0, w_stride-1, h_stride-1).
    base_anchors = generate_anchors(w_stride, h_stride)

    # One shift per feature-map cell, i.e. per image tile.
    grid_x, grid_y = np.meshgrid(np.arange(0, width) * w_stride,
                                 np.arange(0, height) * h_stride)
    shifts = np.vstack((grid_x.ravel(), grid_y.ravel(),
                        grid_x.ravel(), grid_y.ravel())).transpose()

    # Apply the base anchors to every tile: num_feature_map * 9 anchors.
    per_tile_shifts = shifts.reshape(
        (1, num_feature_map, 4)).transpose((1, 0, 2))
    all_anchors = base_anchors.reshape((1, 9, 4)) + per_tile_shifts
    all_anchors = all_anchors.reshape((num_feature_map * 9, 4))

    # Feed the feature map to the RPN model: scores and box deltas.
    res = rpn_model.predict(feature_map)
    scores = res[0].reshape(-1, 1)
    deltas = np.reshape(res[1], (-1, 4))

    # Decode to (x1, y1, x2, y2) boxes and clip them to the image.
    proposals = clip_boxes(bbox_transform_inv(all_anchors, deltas),
                           (h_w[0], h_w[1]))

    # Remove small boxes; the threshold is 40 pixels.
    big_enough = filter_boxes(proposals, 40)
    proposals = proposals[big_enough, :]
    scores = scores[big_enough]

    # Sort by score, best first, and only keep the top 6000.
    pre_nms_topN = 6000
    ranking = scores.ravel().argsort()[::-1][:pre_nms_topN]
    proposals = proposals[ranking, :]
    scores = scores[ranking]

    # Apply NMS to those, then keep the top 300.
    post_nms_topN = 300
    survivors = py_cpu_nms(np.hstack((proposals, scores)), 0.7)
    survivors = survivors[:post_nms_topN]
    proposals = proposals[survivors, :]
    scores = scores[survivors]

    # Add the ground-truth boxes to the proposals.
    proposals = np.vstack((proposals, gt_boxes))

    # Overlap of every proposal with every ground-truth box.
    overlaps = bbox_overlaps(proposals, gt_boxes)
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)

    # Foreground: sampled without replacement, capped at BATCH * FG_FRAC.
    fg_inds = np.where(max_overlaps >= FG_THRESH)[0]
    fg_rois_per_this_image = min(int(BATCH * FG_FRAC), fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image,
                             replace=False)

    # Background: overlap in [BG_THRESH_LO, BG_THRESH_HI); fills the rest of
    # the batch, also sampled without replacement.
    in_bg_range = ((max_overlaps < BG_THRESH_HI) &
                   (max_overlaps >= BG_THRESH_LO))
    bg_inds = np.where(in_bg_range)[0]
    bg_rois_per_this_image = min(BATCH - fg_rois_per_this_image,
                                 bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image,
                             replace=False)

    # Foreground indices first, then background.
    keep_inds = np.append(fg_inds, bg_inds)
    rois = proposals[keep_inds]
    gt_rois = gt_boxes[gt_assignment[keep_inds]]
    targets = bbox_transform(rois, gt_rois)

    # Regression targets and category indicator go into slot `category`.
    rois_num = targets.shape[0]
    batch_box = np.zeros((rois_num, 200, 4))
    batch_categories = np.zeros((rois_num, 200, 1))
    for row in range(rois_num):
        batch_box[row, category] = targets[row]
        batch_categories[row, category] = 1
    batch_box = np.reshape(batch_box, (rois_num, -1))
    batch_categories = np.reshape(batch_categories, (rois_num, -1))

    return rois, batch_box, batch_categories