def get_detection_output(self, nms_th=0.45, cls_th=0.6):
    output = dict()
    ps = [[] for _ in range(len(self.input_names))]
    prior = self.net.mbox_prior.astype(np.float32)
    loc = self.net.mbox_loc.data[0]
    conf = self.net.mbox_conf_softmax_reahpe.data[0]  # attribute name as spelled in the network definition
    cand = []
    loc = ssd.decoder(loc, prior)
    for label in range(1, 21):
        # cand_score = np.where(conf[:, label] > cls_th)
        scores = conf[:, label]  # [cand_score]
        cand_loc = loc  # [cand_score]
        k = bbox.nms(cand_loc, scores, nms_th, 300)
        for i in k:
            if scores[i] > cls_th:
                cand.append(np.hstack([[label], [scores[i]], cand_loc[i]]))
                ps[0].append((float(scores[i]), int(label),
                              float(cand_loc[i][0]), float(cand_loc[i][1]),
                              float(cand_loc[i][2]), float(cand_loc[i][3])))
    output[self.input_names[0]] = ps[0]
    return output
def detect_faces(fnames, model, pre_thresh, resize=512):
    results, faces = [], []
    for fname in tqdm(fnames, desc='detecting faces'):
        try:
            img = cv2.imread(fname)
            height, width, _ = img.shape
            ratio = resize / height
            # cv2.resize expects dsize as (width, height)
            img = cv2.resize(img, (int(ratio * width), resize))
        except Exception as e:
            print('{} @{}'.format(e, fname))
            continue  # skip unreadable images instead of detecting on stale data
        bboxlist = detect(model, img)
        keep = nms(bboxlist, 0.3)
        bboxlist = bboxlist[keep, :]
        keep = bboxlist[:, 4] > pre_thresh
        bboxlist = bboxlist[keep, :]
        bboxlist[:, :4] = bboxlist[:, :4] / ratio
        results.append(Face(fname, bboxlist, None))
        for b in bboxlist:
            # x1, y1, x2, y2, s = b
            face = Face(fname, b, b[4])
            faces.append(face)
    return results, faces
def detect(prior, loc, conf, nms_th=0.45, cls_th=0.6):
    cand = []
    loc = ssd.decoder(loc, prior)
    for label in range(1, 21):
        cand_score = np.where(conf[:, label] > cls_th)
        scores = conf[:, label][cand_score]
        cand_loc = loc[cand_score]
        k = bbox.nms(cand_loc, scores, nms_th, 300)
        for i in k:
            cand.append(np.hstack([[label], [scores[i]], cand_loc[i]]))
    cand = np.array(cand)
    return cand
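# The `bbox.nms(...)` helper used above is not shown in this section. As a rough
# reference, a minimal greedy IoU-based NMS over [x1, y1, x2, y2] boxes could look
# like the sketch below (pure NumPy; the function name and the `top_k` cap are
# assumptions for illustration, not the project's actual implementation).
import numpy as np

def nms_sketch(boxes, scores, iou_thresh=0.45, top_k=300):
    """Greedy NMS: keep highest-scoring boxes, drop overlaps above iou_thresh."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1][:top_k]  # candidate indices, best score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the best remaining box with the other candidates
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_thresh]
    return keep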
def detect_faces(net: nn.Module, img: np.ndarray, minscale: int = 3,
                 ovr_threshhold: float = 0.3,
                 score_threshhold: float = 0.5) -> List[Tuple]:
    """Returns a list of tuples describing bounding boxes: (x1, y1, x2, y2, score).

    Setting minscale to 0 finds the smallest faces, but takes the longest.
    """
    bboxlist = detect(net, img, minscale)
    keep_idx = nms(bboxlist, ovr_threshhold)
    bboxlist = bboxlist[keep_idx, :]
    out = []
    for b in bboxlist:
        x1, y1, x2, y2, s = b
        if s < score_threshhold:  # use the parameter instead of a hard-coded 0.5
            continue
        out.append((int(x1), int(y1), int(x2), int(y2), s))
    return out
def detect_faces(fnames, model, pre_thresh=0.3, resize=512):
    results = []
    for fname in tqdm(fnames, desc='detecting faces'):
        try:
            img = cv2.imread(fname)
            height, width, _ = img.shape
            ratio = resize / height
            if ratio < 1:
                # cv2.resize expects dsize as (width, height)
                img = cv2.resize(img, (int(ratio * width), resize))
            else:
                ratio = 1.0  # image not resized, so do not rescale the boxes later
        except Exception as e:
            print('{} @{}'.format(e, fname))
            continue  # skip unreadable images
        bboxlist = detect(model, img)
        keep = nms(bboxlist, 0.3)
        bboxlist = bboxlist[keep, :]
        keep = bboxlist[:, 4] > pre_thresh
        bboxlist = bboxlist[keep, :]
        bboxlist[:, :4] = bboxlist[:, :4] / ratio
        results.append({"fname": os.path.basename(fname), "bboxes": bboxlist})
    return results
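# A minimal, hypothetical usage sketch for the batch helper above. The file names,
# model construction, and threshold values are illustrative assumptions, not values
# from this project:
#
#   model = build_face_detector()          # however the detector is actually constructed
#   fnames = ['photos/a.jpg', 'photos/b.jpg']
#   results = detect_faces(fnames, model, pre_thresh=0.3, resize=512)
#   for r in results:
#       print(r['fname'], r['bboxes'].shape)  # one (N, 5) array of x1, y1, x2, y2, score per image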
def inference(args):
    model_file = osp.join(args.output_dir, args.ex_dir,
                          'epoch_{:d}'.format(args.epoch_num) + '.ckpt')
    if not os.path.exists(os.path.join(args.output_dir, args.ex_dir)):
        os.makedirs(os.path.join(args.output_dir, args.ex_dir))
    if not os.path.exists(os.path.join(args.output_dir, args.ex_dir, 'eval')):
        os.makedirs(os.path.join(args.output_dir, args.ex_dir, 'eval'))
    tfconfig = tf.ConfigProto(allow_soft_placement=True)  # fall back to another device if the selected one is unavailable
    tfconfig.gpu_options.allow_growth = True  # allocate only as much GPU memory as the run actually needs
    colors = [l.color for l in label_defs[args.test_set]]
    AP = APCalculate(args.iou_threshold, args.test_set)
    if args.batch_size % args.num_gpus != 0:
        print('please enter batch_size and num_gpus again (batch_size must be divisible by num_gpus)')
        exit()
    args.batch_size = args.batch_size // args.num_gpus
    with tf.device('/cpu:0'):
        dataset = Dataset(os.path.dirname(__file__), args.batch_size, 'test', args.num_gpus)
        num_valid = dataset.valid_train
        args.num_cl = dataset.num_classes
        net = SSD(args, args.backbone_dir)
        for i in range(args.num_gpus):
            with tf.device('gpu:' + str(i)):
                with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                    net.create_network(i * args.batch_size, (i + 1) * args.batch_size)
        saver = tf.train.Saver()
        # if i == 0:
        #     # Evaluate model (with test logits, for dropout to be disabled)
        #     correct_pred = tf.equal(tf.argmax(logits_test, 1), tf.argmax(_y, 1))
        #     accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        total_batch = num_valid // (args.batch_size * args.num_gpus)
    tfconfig = tf.ConfigProto(allow_soft_placement=True)  # fall back to another device if the selected one is unavailable
    tfconfig.gpu_options.allow_growth = True  # allocate only as much GPU memory as the run actually needs
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4  # fraction of GPU memory to use (0 to 1)
    sess = tf.Session(config=tfconfig)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    sess.run(init)
    iterator = dataset.get_iterator()
    next_element = iterator.get_next()
    saver.restore(sess, model_file)
    sess.run(iterator.initializer)
    end_flag = 0
    for iter, _ in enumerate(tqdm(range(total_batch + 1),
                                  desc='# ' + args.test_set + ' evaluation processing')):
        # e.g. batch_size = 32, num_gpus = 2 -> fetch 32 * 2 = 64 images per step
        # TODO: get_batch
        test_batch = dict()
        output_data = sess.run(next_element)
        if iter == 0:
            input_first = output_data
        if iter == total_batch:
            if num_valid % (args.batch_size * args.num_gpus) == 0:
                break
            else:
                # pad the last, partial batch with images from the first batch
                test_batch['image'] = np.append(
                    output_data[0],
                    input_first[0][:args.batch_size * args.num_gpus
                                   - num_valid % (args.batch_size * args.num_gpus)],
                    axis=0)
        if iter < total_batch:
            test_batch['image'] = output_data[0]  # pixel values in 0~255
        feed_dict = {net.image: test_batch['image']}
        run_list = [tf.get_collection('bbox_pred1'), tf.get_collection('bbox_pred2'),
                    tf.get_collection('bbox_pred3'), tf.get_collection('bbox_pred4'),
                    tf.get_collection('bbox_pred5'), tf.get_collection('bbox_pred6')]
        pred1, pred2, pred3, pred4, pred5, pred6 = sess.run(run_list, feed_dict=feed_dict)
        for j in range(args.num_gpus):
            for l in range(args.batch_size):
                pred = [pred1[j][l], pred2[j][l], pred3[j][l],
                        pred4[j][l], pred5[j][l], pred6[j][l]]
                bbox_list = list()
                for k, prediction in enumerate(pred):
                    bbox_list += bbox.translate_pred_to_bbox(
                        prediction, (args.img_h, args.img_w), args.num_cl, net.pred_infos[k])
                bbox_list.sort(reverse=True, key=lambda b: b.conf)
                top_k = 200
                bbox_list = bbox_list[:top_k]
                bbox_list2 = defaultdict(list)
                for box in bbox_list:
                    bbox_list2[box.cl].append(box)
                bbox_list_nms = list()
                for k in range(args.num_cl):
                    bbox_list_nms += bbox.nms(bbox_list2[k], args.nms_threshold)
                img = test_batch['image'][j * args.batch_size + l].astype(np.uint8)
                test_box_img = np.copy(img)
                for box in bbox_list_nms:
                    bbox.draw_box(test_box_img, box,
                                  label_defs[args.test_set][box.cl].name, colors[box.cl])
                test_box_img = cv2.cvtColor(test_box_img, cv2.COLOR_BGR2RGB)
                if args.show_img:
                    cv2.imshow('box_img', test_box_img)
                    cv2.waitKey(0)
                # last 10..4 characters of the file path give the six-digit image id
                gt_id = output_data[1][j * args.batch_size + l][-10:-4].decode("utf-8")
                gt_label = output_data[2][j * args.batch_size + l]
                gt_boxes_cord = output_data[4][j * args.batch_size + l]
                # read gt_label and box coordinates (fixed size 100)
                AP.seperate_class(gt_boxes_cord, gt_label, gt_id, bbox_list_nms)
                if args.save_img:
                    cv2.imwrite(os.path.join(args.output_dir, args.ex_dir, 'eval',
                                             gt_id + '.png'), test_box_img)
                if iter == total_batch and \
                        j * args.batch_size + l + 1 == num_valid % (args.batch_size * args.num_gpus):
                    end_flag = 1
                    break
            if end_flag == 1:
                break
    AP.compute_ap()
    print('\n------ VOC07 metric ------')
    for i in range(args.num_cl):
        print(label_defs[args.test_set][i].name, "AP : {:.2f}".format(AP.AP[i]))
    print('mAP : ', AP.APs)
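# `APCalculate.compute_ap()` is not shown in this section. The printed "VOC07 metric"
# usually refers to the 11-point interpolated average precision; a minimal NumPy sketch
# of that formula (an assumption about what the class computes, not its actual code) is:
import numpy as np

def voc07_ap_sketch(recall, precision):
    """11-point interpolated AP: mean of max precision at recall >= 0.0, 0.1, ..., 1.0."""
    ap = 0.0
    for t in np.arange(0.0, 1.1, 0.1):
        mask = recall >= t
        p = np.max(precision[mask]) if np.any(mask) else 0.0
        ap += p / 11.0
    return ap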
def train(args):
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    train_log_dir = os.path.join(args.log_dir, 'train')
    valid_log_dir = os.path.join(args.log_dir, 'valid')
    if not os.path.exists(train_log_dir):
        os.makedirs(train_log_dir)
    if not os.path.exists(valid_log_dir):
        os.makedirs(valid_log_dir)
    if not os.path.exists(os.path.join(args.output_dir, args.ex_dir)):
        os.makedirs(os.path.join(args.output_dir, args.ex_dir))
    if not os.path.exists(os.path.join(args.output_dir, args.ex_dir, 'image')):
        os.makedirs(os.path.join(args.output_dir, args.ex_dir, 'image'))
    train_log_writer = SummaryWriter(train_log_dir)
    # valid_log_writer = SummaryWriter(valid_log_dir)
    if args.batch_size % args.num_gpus != 0:
        print('please enter batch_size and num_gpus again (batch_size must be divisible by num_gpus)')
        exit()
    args.batch_size = args.batch_size // args.num_gpus
    with tf.device('/cpu:0'):
        tower_grads = []
        loss_tmp = []
        cls_loss_tmp = []
        loc_loss_tmp = []
        regul_loss_tmp = []
        learning_rate = tf.placeholder(tf.float32, shape=[])
        optim = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        dataset = Dataset(os.path.dirname(__file__), args.batch_size, 'train', args.num_gpus)
        num_trains = dataset.num_train
        args.num_cl = dataset.num_classes
        net = SSD(args, args.backbone_dir)
        for i in range(args.num_gpus):
            with tf.device('gpu:' + str(i)):
                with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                    net.create_network(i * args.batch_size, (i + 1) * args.batch_size)
                    cls_loss, loc_loss, regul_loss, total_loss = net.create_basic_loss(
                        i * args.batch_size, (i + 1) * args.batch_size)
                    loss_tmp.append(total_loss)
                    loc_loss_tmp.append(loc_loss)
                    cls_loss_tmp.append(cls_loss)
                    regul_loss_tmp.append(regul_loss)
                    # train_op = optim.minimize(total_loss)
                    # first half of minimize(): compute the gradients per tower
                    grads = optim.compute_gradients(total_loss)
                    tower_grads.append(grads)
        total_batch = num_trains // (args.batch_size * args.num_gpus)
        tower_grads = average_gradients(tower_grads)
        # second half of minimize(): apply the learning-rate-scaled gradients to the parameters
        train_op = optim.apply_gradients(tower_grads)
        total_losss = tf.reduce_mean(loss_tmp)
        global_step = tf.Variable(0, trainable=False)
        # train_op = optim.minimize(total_loss, global_step, colocate_gradients_with_ops=True)
        cls_losss = tf.reduce_mean(cls_loss_tmp)
        loc_losss = tf.reduce_mean(loc_loss_tmp)
        regul_losss = tf.reduce_mean(regul_loss_tmp)  # same as tf.add_n(tf.get_collection('regul_loss'))
    tfconfig = tf.ConfigProto(allow_soft_placement=True)  # fall back to another device if the selected one is unavailable
    tfconfig.gpu_options.allow_growth = True  # allocate only as much GPU memory as the run actually needs
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4  # fraction of GPU memory to use (0 to 1)
    sess = tf.Session(config=tfconfig)
    # Initialize the variables (i.e. assign their default value)
    init = tf.global_variables_initializer()
    sess.run(init)
    global_iter = 0
    lr = args.lr
    iterator = dataset.get_iterator()
    next_element = iterator.get_next()
    for epoch in range(args.max_epoch):
        sess.run(iterator.initializer)
        # e.g. total_images = 10000, batch = 10 -> total_batch = 1000
        # e.g. batch_size = 32, num_gpus = 2 -> fetch 32 * 2 = 64 samples per step
        for iter in range(total_batch + 1):
            # print(objgraph.show_growth())
            # TODO: get_batch
            input_data = sess.run(next_element)
            train_batch = dict()
            if iter == 0:
                input_first = input_data
            if iter == total_batch:
                if num_trains % (args.batch_size * args.num_gpus) == 0:
                    break
                else:
                    # pad the last, partial batch with samples from the first batch
                    train_batch['image'] = np.append(
                        input_data[0],
                        input_first[0][:args.batch_size * args.num_gpus
                                       - num_trains % (args.batch_size * args.num_gpus)],
                        axis=0)
                    # train_list = np.append(input_data[2], input_first[2][:args.batch_size - num_trains % (args.batch_size * args.num_gpus)], axis=0)
                    imsis = np.append(
                        input_data[1],
                        input_first[1][:args.batch_size * args.num_gpus
                                       - num_trains % (args.batch_size * args.num_gpus)],
                        axis=0)
                    train_batch['gt_cl'] = imsis[:, :, :args.num_cl + 1]
                    train_batch['gt_loc'] = imsis[:, :, args.num_cl + 1:]
            if iter < total_batch:
                train_batch['image'] = input_data[0]  # pixel values in 0~255
                train_batch['gt_cl'] = input_data[1][:, :, :args.num_cl + 1]  # (32, 8732, 9)
                train_batch['gt_loc'] = input_data[1][:, :, args.num_cl + 1:]  # (32, 8732, 4)
                train_list = input_data[2]
                # print(train_list)
            feed_dict = {
                learning_rate: lr,
                net.image: train_batch['image'],
                net.gt_loc: train_batch['gt_loc'],
                net.gt_cl: train_batch['gt_cl']
            }
            # the losses we want to inspect plus the gradient-update op (train_op)
            run_list = [cls_losss, loc_losss, regul_losss, total_losss, train_op]
            all_cls_loss, all_loc_loss, all_regul_loss, all_total_loss, _ = sess.run(
                run_list, feed_dict=feed_dict)
            print("Epoch %d/%d, Iter: %d/%d" % (epoch, args.max_epoch, iter, total_batch + 1))
            print("\tClass loss: %f\n\tLoc loss: %f\n\tRegul loss: %f" %
                  (all_cls_loss, all_loc_loss, all_regul_loss))
            print("\tTotal Loss : %f" % all_total_loss)
            train_log_writer.add_scalar('cl_loss', all_cls_loss, global_iter)
            train_log_writer.add_scalar('loc_loss', all_loc_loss, global_iter)
            train_log_writer.add_scalar('total_loss', all_total_loss, global_iter)
            if global_iter == 40000:
                lr *= 0.1
            if global_iter == 50000:
                lr *= 0.1
            global_iter += 1
            if iter == total_batch and epoch % 10 == 0 and args.save_img:
                feed_dict = {net.image: train_batch['image']}
                run_list = [tf.get_collection('bbox_pred1'), tf.get_collection('bbox_pred2'),
                            tf.get_collection('bbox_pred3'), tf.get_collection('bbox_pred4'),
                            tf.get_collection('bbox_pred5'), tf.get_collection('bbox_pred6')]
                pred1, pred2, pred3, pred4, pred5, pred6 = sess.run(run_list, feed_dict=feed_dict)
                for j in range(args.num_gpus):
                    for l in range(args.batch_size):
                        pred = [pred1[j][l], pred2[j][l], pred3[j][l],
                                pred4[j][l], pred5[j][l], pred6[j][l]]
                        bbox_list = list()
                        for k, prediction in enumerate(pred):
                            bbox_list += bbox.translate_pred_to_bbox(
                                prediction, (args.img_w, args.img_h),
                                args.num_cl, net.pred_infos[k])
                        bbox_list.sort(reverse=True, key=lambda b: b.conf)
                        top_k = 200
                        bbox_list = bbox_list[:top_k]
                        bbox_list2 = defaultdict(list)
                        for box in bbox_list:
                            bbox_list2[box.cl].append(box)
                        bbox_list_nms = list()
                        for k in range(args.num_cl):
                            bbox_list_nms += bbox.nms(bbox_list2[k], 0.45)
                        img = train_batch['image'][j * args.batch_size + l].astype(np.uint8)
                        train_box_img = np.copy(img)
                        for box in bbox_list_nms:
                            bbox.draw_box(train_box_img, box,
                                          inference.label_defs[args.train_set][box.cl].name,
                                          inference.label_defs[args.train_set][box.cl].color)
                        train_box_img = cv2.cvtColor(train_box_img, cv2.COLOR_BGR2RGB)
                        train_save_path = os.path.join(
                            args.output_dir, args.ex_dir, 'image',
                            'epoch_{:d}img_batch_{:d}.png'.format(epoch, j * args.batch_size + l))
                        cv2.imwrite(train_save_path, train_box_img)
                        if args.show_img:
                            cv2.imshow('train_box_img', train_box_img)
                            cv2.waitKey(0)
        if epoch % 20 == 0 and epoch != 0:
            print('save detection_result........................')
            saver = tf.train.Saver()
            snapshot(sess, saver, epoch, args.ex_dir)
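# `average_gradients(tower_grads)` above is an external helper. A common version of the
# multi-GPU gradient-averaging pattern (as in the classic TF multi-tower examples) is
# sketched below; this is an assumption about its behavior, not this project's exact code.
# Assumes `import tensorflow as tf` with the TF 1.x graph-mode API used above.
def average_gradients_sketch(tower_grads):
    """Average per-GPU lists of (gradient, variable) pairs into one shared list."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars holds the same variable's gradient from every tower
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads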
net.cuda()
net.eval()  # set dropout and batch normalization layers to evaluation mode
if args.path == 'CAMERA':
    cap = cv2.VideoCapture(0)
    # cap.set(cv2.CAP_PROP_FRAME_WIDTH, 160)
    # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 120)
cv2.namedWindow('image', cv2.WINDOW_NORMAL)
while True:
    if args.path == 'CAMERA':
        ret, img = cap.read()
    else:
        img = cv2.imread(args.path)
    t1 = time.time()
    bboxlist = detect(net, img)
    t2 = time.time()
    print("time: {}".format(t2 - t1))
    keep = nms(bboxlist, 0.3)
    bboxlist = bboxlist[keep, :]
    for b in bboxlist:
        x1, y1, x2, y2, s = b
        if s < 0.5:
            continue
        cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 1)
    cv2.imshow('image', img)
    if args.path == 'CAMERA':
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    else:
        # single-image mode: save the result, wait for a keypress, then exit
        cv2.imwrite(args.path[:-4] + '_output.png', img)
        cv2.waitKey(0)
        break
def __init__(self, inputs):
    """
    Region proposal net - inputs should be a list of
    [convolution model, tuple(image_h, image_w, image_scale)]
    """
    # inputs is a convolutional net (i.e. VGG or ZFNet) before the fully-connected layers.
    self.conv_in, self.im_info = inputs
    super(RPN, self).__init__(inputs)

    in_filters = self.conv_in.output_size[1]  # 512
    # RPN conv layers
    classes = 2
    n_anchors = 9
    min_size = 16
    anchor_size = 16
    nms_thresh = 0.7
    topN = 2000
    self.conv = Conv2D(inputs=self.conv_in, n_filters=in_filters,
                       filter_size=(3, 3), stride=(1, 1),
                       activation='relu', border_mode='full')
    self.cls_score = Conv2D(inputs=self.conv, n_filters=classes * n_anchors,
                            filter_size=(1, 1), stride=(1, 1),
                            activation='linear', border_mode='valid')
    # need to dimshuffle/flatten it down to get the softmax class probabilities for each class of `classes`
    cls_shape = self.cls_score.get_outputs().shape
    cls_score = self.cls_score.get_outputs().reshape((cls_shape[0], classes, -1, cls_shape[3]))
    # shuffle to (classes, batch, row, col)
    cls_shuffle = cls_score.dimshuffle((1, 0, 2, 3))
    # flatten to (classes, batch*row*col)
    cls_flat = cls_shuffle.flatten(2)
    # shuffle to (batch*row*col, classes)
    cls_flat = cls_flat.dimshuffle((1, 0))
    # softmax for probability!
    cls_probs_flat = T.nnet.softmax(cls_flat)
    # now shuffle back up to 4D output from cls_score (undo what we did)
    cls_probs = cls_probs_flat.dimshuffle((1, 0)).reshape(cls_shuffle.shape)
    cls_probs = cls_probs.dimshuffle((1, 0, 2, 3))
    self.cls_probs = cls_probs.reshape(cls_shape)

    self.bbox_pred = Conv2D(inputs=self.conv, n_filters=4 * n_anchors,
                            filter_size=(1, 1), stride=(1, 1),
                            activation='linear', border_mode='valid')

    ###############
    # 1. Generate proposals from bbox deltas and shifted anchors (ROIs)
    ###############
    anchors = theano.shared(generate_anchors(anchor_size))
    object_probs = self.cls_probs[:, n_anchors:, :, :]
    bbox_deltas = self.bbox_pred.get_outputs()
    # height and width of convolution features
    H, W = object_probs.shape[-2:]
    # essentially do numpy's meshgrid by tiling anchors across height and width of convolution features
    shift_x = (T.arange(0, W) * anchor_size).reshape((1, W))
    shift_y = (T.arange(0, H) * anchor_size).reshape((1, H))
    shift_x = T.tile(shift_x, (H, 1))
    shift_y = T.tile(shift_y.T, (1, W))
    shifts = T.stack([shift_x.ravel(), shift_y.ravel(),
                      shift_x.ravel(), shift_y.ravel()]).T
    # Enumerate all shifted anchors:
    #   add A anchors (1, A, 4) to
    #   cell K shifts (K, 1, 4) to get
    #   shifted anchors (K, A, 4)
    #   reshape to (K*A, 4) shifted anchors
    A = n_anchors
    K = shifts.shape[0]
    anchors = anchors.reshape((1, A, 4)) + shifts.reshape((K, 1, 4))
    anchors = anchors.reshape((K * A, 4))
    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #   bbox deltas will be (1, 4 * A, H, W) format
    #   transpose to (1, H, W, 4 * A)
    #   reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    #   in slowest to fastest order
    bbox_deltas = bbox_deltas.dimshuffle((0, 2, 3, 1)).reshape((-1, 4))
    # Same story for the object scores:
    #   scores are (1, A, H, W) format
    #   transpose to (1, H, W, A)
    #   reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = object_probs.dimshuffle((0, 2, 3, 1)).reshape((-1, 1))
    # Convert anchors into proposals via bbox transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas)
    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, self.im_info[:2])
    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = filter_boxes(proposals, min_size * self.im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]
    # 4. sort all (proposal, score) pairs by score from highest to lowest
    order = scores.ravel().argsort()[::-1]
    proposals = proposals[order, :]
    scores = scores[order]
    # 6. apply nms (e.g. threshold = 0.7)
    # 7. take after_nms_topN (e.g. 2000)
    # 8. return the top proposals (-> RoIs top)
    keep, self.updates = nms(T.concatenate([proposals, scores], axis=1), nms_thresh)
    keep = keep[:topN]
    self.proposals = proposals[keep, :]
    self.scores = scores[keep]

    self.outputs = [self.proposals, self.scores]
    # self.output_size = [self.cls_score.output_size, self.bbox_pred.output_size]
    self.params = {}
    self.params.update(p_dict("rpn_conv/3x3_", self.conv))
    self.params.update(p_dict("rpn_cls_score_", self.cls_score))
    self.params.update(p_dict("rpn_bbox_pred_", self.bbox_pred))
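# `bbox_transform_inv`, `clip_boxes`, and `filter_boxes` are imported helpers and are not
# shown here. For orientation, a minimal NumPy sketch of the standard Faster R-CNN delta
# decoding (which `bbox_transform_inv` presumably mirrors symbolically in Theano) might
# look like this; treat it as an illustrative assumption, not this project's code.
import numpy as np

def bbox_transform_inv_sketch(anchors, deltas):
    """Apply (dx, dy, dw, dh) deltas to anchor boxes given as (x1, y1, x2, y2)."""
    widths = anchors[:, 2] - anchors[:, 0] + 1.0
    heights = anchors[:, 3] - anchors[:, 1] + 1.0
    ctr_x = anchors[:, 0] + 0.5 * widths
    ctr_y = anchors[:, 1] + 0.5 * heights

    dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]
    pred_ctr_x = dx * widths + ctr_x
    pred_ctr_y = dy * heights + ctr_y
    pred_w = np.exp(dw) * widths
    pred_h = np.exp(dh) * heights

    pred = np.zeros_like(deltas)
    pred[:, 0] = pred_ctr_x - 0.5 * pred_w  # x1
    pred[:, 1] = pred_ctr_y - 0.5 * pred_h  # y1
    pred[:, 2] = pred_ctr_x + 0.5 * pred_w  # x2
    pred[:, 3] = pred_ctr_y + 0.5 * pred_h  # y2
    return pred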