def test(self):
    """Run ROIAlign on a synthetic binary image and print the pooled result.

    ``self.N`` random boxes are drawn, the stride-scaled footprint of every
    box is painted with ones on an all-zero image of shape
    ``(1, self.height, self.width, 1)``, and a 5x5 grid is pooled per box
    with ``ROIAlign``.  The evaluated features are stored in ``self.feats``.
    Nothing is asserted; shape and values are printed for inspection only.
    """
    # Entering the session installs it as the default session, which
    # `feats.eval()` needs.  The session object itself is never referenced,
    # so it is not bound to a name.
    with tf.Session():
        # A random image used to be generated here and was overwritten by
        # the zeros image on the very next statement; that dead draw has
        # been removed.
        npimg = np.zeros((1, self.height, self.width, 1), dtype=np.float32)

        # Boxes are [x1, y1, x2, y2]: a random corner in [0, 50) plus a
        # random extent in [20, 30) along each axis.
        corners = np.random.randint(0, 50, [self.N, 2])
        extents = np.random.randint(20, 30, [self.N, 2])
        boxes = np.hstack((corners, corners + extents)).astype(np.float32)

        stride = 2.0
        for box in boxes:
            # Paint the box footprint in feature-map coordinates; the end
            # bounds are inclusive, hence the "+ 1".
            x1, y1, x2, y2 = box / stride
            npimg[:, int(y1):int(y2 + 1), int(x1):int(x2 + 1), :] = 1

        img = tf.constant(npimg)
        pooled_height = 5
        pooled_width = 5
        # Every box belongs to image 0 of the single-image batch.
        batch_inds = tf.convert_to_tensor(np.zeros((self.N, ), dtype=np.int32))

        # This ROIAlign variant returns a pair; only the first element (the
        # pooled features) is used here.
        feats, _ = ROIAlign(
            img,
            boxes,
            batch_inds,
            stride=stride,
            pooled_height=pooled_height,
            pooled_width=pooled_width,
        )
        self.feats = feats.eval()
        print(self.feats.shape)
        print(self.feats.reshape((self.N, pooled_height, pooled_width)))
def test(self):
    """Run ROIAlign on a synthetic binary image and print the pooled result.

    ``self.N`` random boxes are drawn, the stride-scaled footprint of every
    box is painted with ones on an all-zero image of shape
    ``(1, self.height, self.width, 1)``, and a 5x5 grid is pooled per box
    with ``ROIAlign``.  The evaluated features are stored in ``self.feats``.
    Nothing is asserted; shape and values are printed for inspection only.
    """
    # Entering the session installs it as the default session, which
    # `feats.eval()` needs.  The session object itself is never referenced,
    # so it is not bound to a name.
    with tf.Session():
        # A random image used to be generated here and was overwritten by
        # the zeros image on the very next statement; that dead draw has
        # been removed.
        npimg = np.zeros((1, self.height, self.width, 1), dtype=np.float32)

        # Boxes are [x1, y1, x2, y2]: a random corner in [0, 50) plus a
        # random extent in [20, 30) along each axis.
        corners = np.random.randint(0, 50, [self.N, 2])
        extents = np.random.randint(20, 30, [self.N, 2])
        boxes = np.hstack((corners, corners + extents)).astype(np.float32)

        stride = 2.0
        for box in boxes:
            # Paint the box footprint in feature-map coordinates; the end
            # bounds are inclusive, hence the "+ 1".
            x1, y1, x2, y2 = box / stride
            npimg[:, int(y1):int(y2 + 1), int(x1):int(x2 + 1), :] = 1

        img = tf.constant(npimg)
        pooled_height = 5
        pooled_width = 5
        # Every box belongs to image 0 of the single-image batch.
        batch_inds = tf.convert_to_tensor(np.zeros((self.N, ), dtype=np.int32))

        # This ROIAlign variant returns the pooled features directly.
        feats = ROIAlign(
            img,
            boxes,
            batch_inds,
            stride=stride,
            pooled_height=pooled_height,
            pooled_width=pooled_width,
        )
        self.feats = feats.eval()
        print(self.feats.shape)
        print(self.feats.reshape((self.N, pooled_height, pooled_width)))
def build_heads(pyramid, ih, iw, num_classes, base_anchors, is_training=False):
    """Build the RPN, box-refinement and mask heads on each pyramid level.

    Levels are handled independently, from ``P5`` down to ``P2``; level
    ``Pi`` uses stride ``2**i``.  For every level this function:

      1. builds the RPN conv head (box regression and class logits),
      2. decodes the RPN output against the level's anchors into ROIs,
      3. samples ROIs and crops 7x7 features with ``ROIAlign``,
      4. runs the fully connected refinement head (class and box),
      5. decodes the refined boxes,
      6. crops 14x14 features and runs the mask head.

    No losses are built here.

    Args:
      pyramid: mapping indexed by level name (``'P2'`` .. ``'P5'``) whose
        values are the feature tensors of the levels.
      ih, iw: forwarded unchanged to ``anchor_decoder`` and ``roi_decoder``
        (presumably the input image height and width -- confirm against the
        decoders).
      num_classes: number of class logits produced by the refinement head.
      base_anchors: anchors per spatial location; the RPN emits
        ``base_anchors * 4`` box channels and ``base_anchors * 2`` class
        channels.
      is_training: enables dropout, and selects which boxes feed the mask
        head (sampled RPN ROIs when true, refined boxes otherwise).

    Returns:
      A dict keyed by level name.  Each entry holds ``'rpn'`` (``box``,
      ``cls``), ``'roi'`` (``box``, ``scores``, ``cropped``,
      ``cropped_mask``), ``'refined'`` (``box``, ``cls``) and ``'mask'``
      (``mask``, ``classes``, ``scores``).
    """
    outputs = {}
    conv_defaults = _extra_conv_arg_scope(activation_fn=None)

    with slim.arg_scope(conv_defaults):
        # NOTE: layers without an explicit scope are auto-named by slim in
        # creation order, so the order of the layer calls below matters.
        for i in range(5, 1, -1):
            level = 'P{:d}'.format(i)
            stride = 2 ** i
            features = pyramid[level]
            level_out = outputs[level] = {}

            # ---- RPN head ----
            shape = tf.shape(features)
            height, width = shape[1], shape[2]
            rpn_feat = slim.conv2d(
                features, 256, [3, 3], stride=1,
                activation_fn=tf.nn.relu, scope=level + '/rpn')
            rpn_box = slim.conv2d(
                rpn_feat, base_anchors * 4, [1, 1], stride=1,
                scope=level + '/rpn/box')
            rpn_cls = slim.conv2d(
                rpn_feat, base_anchors * 2, [1, 1], stride=1,
                scope=level + '/rpn/cls')
            level_out['rpn'] = {'box': rpn_box, 'cls': rpn_cls}

            # ---- decode, sample and crop ----
            all_anchors = gen_all_anchors(height, width, stride)
            # Softmax over the trailing pair of logits of every anchor, then
            # flatten back to the conv layout.
            pair_logits = tf.reshape(
                rpn_cls, [1, shape[1], shape[2], base_anchors, 2])
            pair_probs = tf.nn.softmax(pair_logits)
            cls_prob = tf.reshape(
                pair_probs, [1, shape[1], shape[2], base_anchors * 2])
            rois, _, scores = anchor_decoder(
                rpn_box, cls_prob, all_anchors, ih, iw)
            rois, scores = sample_rpn_outputs(rois, scores)
            cropped = ROIAlign(
                features, rois, False,
                stride=stride, pooled_height=7, pooled_width=7)
            # ROIs of an image, sampled from the RPN output.
            level_out['roi'] = {
                'box': rois,
                'scores': scores,
                'cropped': cropped,
            }

            # ---- refinement head ----
            hidden = slim.flatten(cropped)
            hidden = slim.fully_connected(
                hidden, 1024, activation_fn=tf.nn.relu)
            hidden = slim.dropout(
                hidden, keep_prob=0.75, is_training=is_training)
            hidden = slim.fully_connected(
                hidden, 1024, activation_fn=tf.nn.relu)
            hidden = slim.dropout(
                hidden, keep_prob=0.75, is_training=is_training)
            refined_cls = slim.fully_connected(
                hidden, num_classes, activation_fn=None)
            refined_box = slim.fully_connected(
                hidden, num_classes * 4, activation_fn=None)
            level_out['refined'] = {'box': refined_box, 'cls': refined_cls}

            # ---- decode the refinement output ----
            refined_prob = tf.nn.softmax(refined_cls)
            final_boxes, classes, scores = roi_decoder(
                refined_box, refined_prob, rois, ih, iw)

            # At test time the mask head consumes the refined boxes instead
            # of the sampled RPN ROIs.
            mask_rois = rois if is_training else final_boxes

            # ---- mask head ----
            mask_feat = ROIAlign(
                features, mask_rois, False,
                stride=stride, pooled_height=14, pooled_width=14)
            level_out['roi']['cropped_mask'] = mask_feat
            for _ in range(4):
                mask_feat = slim.conv2d(
                    mask_feat, 256, [3, 3], stride=1, padding='SAME',
                    activation_fn=tf.nn.relu)
            mask_feat = slim.conv2d_transpose(
                mask_feat, 256, [2, 2], stride=2, padding='VALID',
                activation_fn=tf.nn.relu)
            mask_feat = slim.conv2d(
                mask_feat, num_classes * 2, [1, 1], stride=1,
                padding='VALID', activation_fn=None)
            # Mask logits together with the predicted classes and scores.
            level_out['mask'] = {
                'mask': mask_feat,
                'classes': classes,
                'scores': scores,
            }

    return outputs
def build_heads(pyramid, py_scope, slim_scope, image_height, image_width,
                num_classes, base_anchors, is_training=False, gt_boxes=None):
    """Build the RPN, RCNN (class/box) and mask heads on top of the pyramid.

    Steps:
      1. Build an RPN head on every pyramid level ``P5`` .. ``P2``.
      2. Concatenate the per-level RPN outputs and decode them into
         proposal boxes.
      3. Sample proposals (together with ``gt_boxes`` when training).
      4. Assign the sampled ROIs to pyramid levels and crop 14x14 features
         per level with ``ROIAlign``.
      5. Run the RCNN head and decode its output into final boxes.
      6. Crop features for the mask head -- from RPN ROIs when training,
         from sampled RCNN boxes otherwise -- and run the mask head.

    No losses are built here.

    Args:
      pyramid: mapping indexed by level name (``'P2'`` .. ``'P5'``) whose
        values are the feature tensors of the levels.
      py_scope: scope handed to ``tf.name_scope``.
      slim_scope: arg scope handed to ``slim.arg_scope``.
      image_height, image_width: forwarded to ``anchor_decoder``,
        ``roi_decoder`` and ``ROIAlign``.
      num_classes: number of RCNN class logits and of mask channels.
      base_anchors: anchors per spatial location; the RPN emits
        ``base_anchors * 4`` box channels and ``base_anchors * 2`` class
        channels.
      is_training: enables dropout and selects the training sampling and
        mask-cropping path.  Interpreted by plain truthiness.
      gt_boxes: ground-truth boxes, passed to the sampler when training.

    Returns:
      ``(outputs, py_scope, slim_scope)`` where ``outputs`` is a dict of
      tensors (``rpn``, ``rpn_*``, ``rcnn_*``, ``mask_*`` keys) and the two
      scopes are the values yielded by the ``tf.name_scope`` and
      ``slim.arg_scope`` context managers.
    """
    outputs = {}
    # Pyramid levels are always visited coarse-to-fine: P5, P4, P3, P2.
    levels = (5, 4, 3, 2)

    with tf.name_scope(py_scope) as py_scope:
        with slim.arg_scope(slim_scope) as slim_scope:
            # ---- RPN head on every pyramid level ----
            # The RPN takes the features of each level; strides are
            # [4, 8, 16, 32] for P2, P3, P4, P5 and a single anchor scale of
            # 8 is used on every level.  It produces
            #   box: box regression values, stored in the last dimension;
            #        presumably (1, pyramid_height, pyramid_width,
            #        num_anchors * 4),
            #   cls: class logits (before softmax); presumably
            #        (1, pyramid_height, pyramid_width, num_anchors * 2).
            outputs['rpn'] = {}
            for i in levels:
                p = 'P%d' % i
                stride = 2 ** i

                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]
                rpn = slim.conv2d(
                    pyramid[p], 256, [3, 3], stride=1,
                    activation_fn=tf.nn.relu, scope='pyramid/%s/rpn' % p)
                box = slim.conv2d(
                    rpn, base_anchors * 4, [1, 1], stride=1,
                    scope='pyramid/%s/rpn/box' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.001),
                    activation_fn=None, normalizer_fn=None)
                cls = slim.conv2d(
                    rpn, base_anchors * 2, [1, 1], stride=1,
                    scope='pyramid/%s/rpn/cls' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.01),
                    activation_fn=None, normalizer_fn=None)

                anchor_scales = [8]
                print("anchor_scales = ", anchor_scales)
                all_anchors = gen_all_anchors(
                    height, width, stride, anchor_scales)
                outputs['rpn'][p] = {
                    'box': box,
                    'cls': cls,
                    'anchor': all_anchors,
                    'shape': shape,
                }

            # ---- gather boxes, class logits and anchors of all levels ----
            rpn_boxes = [
                tf.reshape(outputs['rpn']['P%d' % i]['box'], [-1, 4])
                for i in levels
            ]
            rpn_clses = [
                tf.reshape(outputs['rpn']['P%d' % i]['cls'], [-1, 1])
                for i in levels
            ]
            rpn_anchors = [
                tf.reshape(outputs['rpn']['P%d' % i]['anchor'], [-1, 4])
                for i in levels
            ]
            rpn_boxes = tf.concat(values=rpn_boxes, axis=0)
            rpn_clses = tf.concat(values=rpn_clses, axis=0)
            rpn_anchors = tf.concat(values=rpn_anchors, axis=0)

            # Softmax over logit pairs to get probabilities.
            rpn_probs = tf.nn.softmax(tf.reshape(rpn_clses, [-1, 2]))

            # Decode anchors and regression values into proposal boxes.
            rpn_final_boxes, rpn_final_clses, rpn_final_scores = \
                anchor_decoder(rpn_boxes, rpn_probs, rpn_anchors,
                               image_height, image_width)

            outputs['rpn_boxes'] = rpn_boxes
            outputs['rpn_clses'] = rpn_clses
            outputs['rpn_anchor'] = rpn_anchors
            outputs['rpn_final_boxes'] = rpn_final_boxes
            outputs['rpn_final_clses'] = rpn_final_clses
            outputs['rpn_final_scores'] = rpn_final_scores

            # ---- sample proposals ----
            # Plain truthiness is used so that this branch always agrees
            # with the `if is_training:` check guarding the mask cropping
            # further down.  The former `is_training is True` test sent a
            # truthy non-`True` flag (e.g. 1 or numpy.bool_) down the
            # inference sampler and then into the training mask branch,
            # which failed with a NameError on `rpn_rois_to_mask`.
            if is_training:
                # Training: both the RCNN and the mask network take RPN
                # proposals as input.
                (rpn_rois_to_rcnn, _, rpn_batch_inds_to_rcnn,
                 rpn_rois_to_mask, _, rpn_batch_inds_to_mask) = \
                    sample_rpn_outputs_with_gt(
                        rpn_final_boxes, rpn_final_scores, gt_boxes,
                        is_training=is_training, only_positive=False)
            else:
                # Testing: only the RCNN takes RPN boxes; the mask network
                # takes RCNN boxes (see below).
                rpn_rois_to_rcnn, _, rpn_batch_inds_to_rcnn = \
                    sample_rpn_outputs(
                        rpn_final_boxes, rpn_final_scores,
                        only_positive=False)

            # ---- assign RCNN ROIs to pyramid levels and crop ----
            [rcnn_assigned_rois, rcnn_assigned_batch_inds, _] = \
                assign_boxes(rpn_rois_to_rcnn,
                             [rpn_rois_to_rcnn, rpn_batch_inds_to_rcnn],
                             [2, 3, 4, 5])

            # Cropping level by level changes the order of the ROIs, so the
            # ROIs are collected in the same (new) order.
            rcnn_cropped_features = []
            rcnn_ordered_rois = []
            for i in levels:
                p = 'P%d' % i
                level_rois = rcnn_assigned_rois[i - 2]
                level_batch_inds = rcnn_assigned_batch_inds[i - 2]
                # Only the cropped features of the returned pair are used.
                level_features, _ = ROIAlign(
                    pyramid[p], level_rois, level_batch_inds,
                    image_height, image_width,
                    stride=2 ** i, pooled_height=14, pooled_width=14)
                rcnn_cropped_features.append(level_features)
                rcnn_ordered_rois.append(level_rois)
            rcnn_cropped_features = tf.concat(
                values=rcnn_cropped_features, axis=0)
            rcnn_ordered_rois = tf.concat(values=rcnn_ordered_rois, axis=0)

            # ---- RCNN head ----
            # Takes the cropped features and produces
            #   rcnn_boxes: per-class box regression values; presumably
            #               (num_ROIs, num_classes * 4),
            #   rcnn_clses: class logits (before softmax); presumably
            #               (num_ROIs, num_classes).
            # The 3x3 / stride-2 max pool halves the 14x14 crops.
            rcnn = slim.max_pool2d(
                rcnn_cropped_features, [3, 3], stride=2, padding='SAME')
            rcnn = slim.flatten(rcnn)
            rcnn = slim.fully_connected(
                rcnn, 1024, activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001),
                scope="pyramid/fully_connected")
            rcnn = slim.dropout(
                rcnn, keep_prob=0.75, is_training=is_training)
            rcnn = slim.fully_connected(
                rcnn, 1024, activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001),
                scope="pyramid/fully_connected_1")
            rcnn = slim.dropout(
                rcnn, keep_prob=0.75, is_training=is_training)
            rcnn_clses = slim.fully_connected(
                rcnn, num_classes, activation_fn=None, normalizer_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001),
                scope="pyramid/fully_connected_2")
            rcnn_boxes = slim.fully_connected(
                rcnn, num_classes * 4, activation_fn=None,
                normalizer_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001),
                scope="pyramid/fully_connected_3")

            # Softmax to get class probabilities.
            rcnn_scores = tf.nn.softmax(rcnn_clses)

            # Decode ROIs and regression values into bounding boxes.
            rcnn_final_boxes, rcnn_final_classes, rcnn_final_scores = \
                roi_decoder(rcnn_boxes, rcnn_scores, rcnn_ordered_rois,
                            image_height, image_width)

            outputs['rcnn_ordered_rois'] = rcnn_ordered_rois
            outputs['rcnn_cropped_features'] = rcnn_cropped_features
            tf.add_to_collection('__CROPPED__', rcnn_cropped_features)
            outputs['rcnn_boxes'] = rcnn_boxes
            outputs['rcnn_clses'] = rcnn_clses
            outputs['rcnn_scores'] = rcnn_scores
            outputs['rcnn_final_boxes'] = rcnn_final_boxes
            outputs['rcnn_final_clses'] = rcnn_final_classes
            outputs['rcnn_final_scores'] = rcnn_final_scores

            # ---- crop features for the mask network ----
            mask_cropped_features = []
            mask_ordered_rois = []
            if is_training:
                # Training: mask ROIs come from the RPN sampler.
                [mask_assigned_rois, mask_assigned_batch_inds, _] = \
                    assign_boxes(rpn_rois_to_mask,
                                 [rpn_rois_to_mask, rpn_batch_inds_to_mask],
                                 [2, 3, 4, 5])

                # As above, cropping reorders the ROIs.
                for i in levels:
                    p = 'P%d' % i
                    level_rois = mask_assigned_rois[i - 2]
                    level_batch_inds = mask_assigned_batch_inds[i - 2]
                    level_features, _ = ROIAlign(
                        pyramid[p], level_rois, level_batch_inds,
                        image_height, image_width,
                        stride=2 ** i, pooled_height=14, pooled_width=14)
                    mask_cropped_features.append(level_features)
                    mask_ordered_rois.append(level_rois)
                mask_cropped_features = tf.concat(
                    values=mask_cropped_features, axis=0)
                mask_ordered_rois = tf.concat(
                    values=mask_ordered_rois, axis=0)
            else:
                # Testing: the mask network takes sampled RCNN boxes, and
                # their classes and scores are carried along in the same
                # (reordered) order.
                (rcnn_rois_to_mask, rcnn_clses_to_mask,
                 rcnn_scores_to_mask, rcnn_batch_inds_to_mask) = \
                    sample_rcnn_outputs(
                        rcnn_final_boxes, rcnn_final_classes, rcnn_scores,
                        class_agnostic=False)
                [mask_assigned_rois, mask_assigned_clses,
                 mask_assigned_scores, mask_assigned_batch_inds, _] = \
                    assign_boxes(rcnn_rois_to_mask,
                                 [rcnn_rois_to_mask, rcnn_clses_to_mask,
                                  rcnn_scores_to_mask,
                                  rcnn_batch_inds_to_mask],
                                 [2, 3, 4, 5])

                mask_ordered_clses = []
                mask_ordered_scores = []
                for i in levels:
                    p = 'P%d' % i
                    level_rois = mask_assigned_rois[i - 2]
                    level_clses = mask_assigned_clses[i - 2]
                    level_scores = mask_assigned_scores[i - 2]
                    level_batch_inds = mask_assigned_batch_inds[i - 2]
                    level_features, _ = ROIAlign(
                        pyramid[p], level_rois, level_batch_inds,
                        image_height, image_width,
                        stride=2 ** i, pooled_height=14, pooled_width=14)
                    mask_cropped_features.append(level_features)
                    mask_ordered_rois.append(level_rois)
                    mask_ordered_clses.append(level_clses)
                    mask_ordered_scores.append(level_scores)
                mask_cropped_features = tf.concat(
                    values=mask_cropped_features, axis=0)
                mask_ordered_rois = tf.concat(
                    values=mask_ordered_rois, axis=0)
                mask_ordered_clses = tf.concat(
                    values=mask_ordered_clses, axis=0)
                mask_ordered_scores = tf.concat(
                    values=mask_ordered_scores, axis=0)
                outputs['mask_final_clses'] = mask_ordered_clses
                outputs['mask_final_scores'] = mask_ordered_scores

            # ---- mask head ----
            # Takes the cropped features and produces one mask channel per
            # class (logits, before sigmoid).  The stride-2 transposed conv
            # doubles the spatial size of the 14x14 crops.
            m = mask_cropped_features
            m = slim.conv2d(
                m, 256, [3, 3], stride=1, padding='SAME',
                activation_fn=tf.nn.relu, scope="pyramid/Conv")
            m = slim.conv2d(
                m, 256, [3, 3], stride=1, padding='SAME',
                activation_fn=tf.nn.relu, scope="pyramid/Conv_1")
            m = slim.conv2d(
                m, 256, [3, 3], stride=1, padding='SAME',
                activation_fn=tf.nn.relu, scope="pyramid/Conv_2")
            m = slim.conv2d(
                m, 256, [3, 3], stride=1, padding='SAME',
                activation_fn=tf.nn.relu, scope="pyramid/Conv_3")
            m = slim.conv2d_transpose(
                m, 256, 2, stride=2, padding='VALID',
                activation_fn=tf.nn.relu, scope="pyramid/Conv2d_transpose")
            tf.add_to_collection('__TRANSPOSED__', m)
            m = slim.conv2d(
                m, num_classes, [1, 1], stride=1, padding='VALID',
                activation_fn=None, normalizer_fn=None,
                scope="pyramid/Conv_4")

            outputs['mask_ordered_rois'] = mask_ordered_rois
            outputs['mask_cropped_features'] = mask_cropped_features
            outputs['mask_mask'] = m
            outputs['mask_final_mask'] = tf.nn.sigmoid(m)

    return outputs, py_scope, slim_scope
def build_heads(pyramid, ih, iw, num_classes, base_anchors,
                is_training=False, gt_boxes=None):
    """Build the RPN, box-refinement and mask heads on top of the pyramid.

    Steps:
      1. Build an RPN head on every pyramid level ``P5`` .. ``P2``.
      2. Concatenate the per-level RPN outputs and decode them into ROIs.
      3. Sample ROIs with ``sample_rpn_outputs_with_gt``.
      4. Assign the ROIs to pyramid levels and crop 14x14 features per
         level with ``ROIAlign``.
      5. Run the refinement head and decode its output into final boxes.
      6. Run the mask head on the cropped features.

    No losses are built here.

    Args:
      pyramid: mapping indexed by level name (``'P2'`` .. ``'P5'``) whose
        values are the feature tensors of the levels.
      ih, iw: forwarded unchanged to ``anchor_decoder`` and ``roi_decoder``
        (presumably the input image height and width -- confirm against the
        decoders).
      num_classes: number of refinement class logits and of mask channels.
      base_anchors: anchors per spatial location; the RPN emits
        ``base_anchors * 4`` box channels and ``base_anchors * 2`` class
        channels.
      is_training: enables dropout, is forwarded to the sampler, and
        disables the second cropping pass used at test time.
      gt_boxes: ground-truth boxes, passed to the sampler.

    Returns:
      A dict with the entries ``'rpn'`` (per-level dicts plus the
      concatenated ``box``/``cls``/``anchor``), ``'roi'`` (``box``,
      ``score``, ``cropped_rois``), ``'refined'`` (``box``, ``cls``) and
      ``'mask'`` (``mask``, ``cls``, ``score``).
    """
    outputs = {}
    arg_scope = _extra_conv_arg_scope(activation_fn=None)
    # Activation applied to both box-regression outputs; None means the
    # regression outputs stay linear.
    my_sigmoid = None
    # Pyramid levels are always visited coarse-to-fine: P5, P4, P3, P2.
    levels = (5, 4, 3, 2)

    with slim.arg_scope(arg_scope):
        with tf.variable_scope('pyramid'):
            # ---- RPN head on every pyramid level ----
            outputs['rpn'] = {}
            for i in levels:
                p = 'P%d' % i
                stride = 2 ** i

                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]
                rpn = slim.conv2d(
                    pyramid[p], 256, [3, 3], stride=1,
                    activation_fn=tf.nn.relu, scope='%s/rpn' % p)
                box = slim.conv2d(
                    rpn, base_anchors * 4, [1, 1], stride=1,
                    scope='%s/rpn/box' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.001),
                    activation_fn=my_sigmoid)
                cls = slim.conv2d(
                    rpn, base_anchors * 2, [1, 1], stride=1,
                    scope='%s/rpn/cls' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.01))

                # Three anchor scales per level, growing with the level.
                anchor_scales = [2 ** (i - 2), 2 ** (i - 1), 2 ** i]
                all_anchors = gen_all_anchors(
                    height, width, stride, anchor_scales)
                outputs['rpn'][p] = {
                    'box': box,
                    'cls': cls,
                    'anchor': all_anchors,
                }

            # ---- gather the RPN outputs of all levels ----
            rpn_boxes = [
                tf.reshape(outputs['rpn']['P%d' % i]['box'], [-1, 4])
                for i in levels
            ]
            rpn_clses = [
                tf.reshape(outputs['rpn']['P%d' % i]['cls'], [-1, 1])
                for i in levels
            ]
            rpn_anchors = [
                tf.reshape(outputs['rpn']['P%d' % i]['anchor'], [-1, 4])
                for i in levels
            ]
            rpn_boxes = tf.concat(values=rpn_boxes, axis=0)
            rpn_clses = tf.concat(values=rpn_clses, axis=0)
            rpn_anchors = tf.concat(values=rpn_anchors, axis=0)

            outputs['rpn']['box'] = rpn_boxes
            outputs['rpn']['cls'] = rpn_clses
            outputs['rpn']['anchor'] = rpn_anchors

            # ---- decode and sample ----
            rpn_probs = tf.nn.softmax(tf.reshape(rpn_clses, [-1, 2]))
            rois, _, scores = anchor_decoder(
                rpn_boxes, rpn_probs, rpn_anchors, ih, iw)
            # The sampler is fed column 1 of the softmax output as the
            # score.  Its three mask-related return values are not used by
            # this variant.
            rois, scores, batch_inds, _, _, _ = sample_rpn_outputs_with_gt(
                rois, rpn_probs[:, 1], gt_boxes, is_training=is_training)
            outputs['roi'] = {'box': rois, 'score': scores}

            # ---- assign ROIs to pyramid levels and crop ----
            [assigned_rois, assigned_batch_inds, _] = \
                assign_boxes(rois, [rois, batch_inds], [2, 3, 4, 5])

            cropped_list = []
            for i in levels:
                p = 'P%d' % i
                cropped = ROIAlign(
                    pyramid[p], assigned_rois[i - 2],
                    assigned_batch_inds[i - 2],
                    stride=2 ** i, pooled_height=14, pooled_width=14)
                cropped_list.append(cropped)
            cropped_rois = tf.concat(values=cropped_list, axis=0)
            outputs['roi']['cropped_rois'] = cropped_rois
            tf.add_to_collection('__CROPPED__', cropped_rois)

            # ---- refinement head ----
            # The 3x3 / stride-2 max pool reduces the 14x14 crops to 7x7.
            cropped_regions = slim.max_pool2d(
                cropped_rois, [3, 3], stride=2, padding='SAME')
            refine = slim.flatten(cropped_regions)
            refine = slim.fully_connected(
                refine, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(
                refine, keep_prob=0.75, is_training=is_training)
            refine = slim.fully_connected(
                refine, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(
                refine, keep_prob=0.75, is_training=is_training)
            cls2 = slim.fully_connected(
                refine, num_classes, activation_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.01))
            box = slim.fully_connected(
                refine, num_classes * 4, activation_fn=my_sigmoid,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001))
            outputs['refined'] = {'box': box, 'cls': cls2}

            # ---- decode the refinement output ----
            cls2_prob = tf.nn.softmax(cls2)
            final_boxes, classes, scores = roi_decoder(
                box, cls2_prob, rois, ih, iw)

            # ---- second cropping pass at test time ----
            if not is_training:
                # The crops must be collected in a fresh list: at this
                # point `cropped_rois` is already the concatenated tensor,
                # and the previous code called `.append` on it, which
                # raised AttributeError as soon as is_training was false.
                #
                # NOTE(review): the refined boxes (`final_boxes`) are not
                # re-assigned to pyramid levels, so this pass re-crops the
                # RPN-based assignment computed above -- confirm whether
                # cropping the refined boxes was intended.
                test_cropped_list = []
                for i in levels:
                    p = 'P%d' % i
                    cropped = ROIAlign(
                        pyramid[p], assigned_rois[i - 2],
                        assigned_batch_inds[i - 2],
                        stride=2 ** i, pooled_height=14, pooled_width=14)
                    test_cropped_list.append(cropped)
                cropped_rois = tf.concat(values=test_cropped_list, axis=0)

            # ---- mask head ----
            m = cropped_rois
            for _ in range(4):
                m = slim.conv2d(
                    m, 256, [3, 3], stride=1, padding='SAME',
                    activation_fn=tf.nn.relu)
            # The stride-2 transposed conv upsamples 14x14 to 28x28.
            m = slim.conv2d_transpose(
                m, 256, 2, stride=2, padding='VALID',
                activation_fn=tf.nn.relu)
            tf.add_to_collection('__TRANSPOSED__', m)
            m = slim.conv2d(
                m, num_classes, [1, 1], stride=1, padding='VALID',
                activation_fn=None)

            # Mask logits together with the predicted classes and scores.
            outputs['mask'] = {'mask': m, 'cls': classes, 'score': scores}

    return outputs
def build_heads(pyramid, ih, iw, num_classes, base_anchors,
                is_training=False, gt_boxes=None):
    """Build the RPN, RCNN (class/box) and mask heads on top of the pyramid.

    Steps:
      1. Build an RPN head on every pyramid level ``P5`` .. ``P2``.
      2. Concatenate the per-level RPN outputs and decode them into
         proposal boxes; the decoder also returns an index tensor
         (``indexs``) that is carried through every later stage.
      3. Sample proposals (together with ``gt_boxes`` when training).
      4. Assign the sampled ROIs to pyramid levels and crop 14x14 features
         per level with ``ROIAlign``.
      5. Run the RCNN head and decode its output into final boxes.
      6. Crop features for the mask head -- from RPN ROIs when training,
         from sampled RCNN boxes otherwise -- and run the mask head.

    No losses are built here.

    Args:
      pyramid: mapping indexed by level name (``'P2'`` .. ``'P5'``) whose
        values are the feature tensors of the levels.
      ih, iw: forwarded to ``anchor_decoder``, ``roi_decoder`` and
        ``ROIAlign`` (presumably the input image height and width --
        confirm against those helpers).
      num_classes: number of RCNN class logits and of mask channels.
      base_anchors: anchors per spatial location; the RPN emits
        ``base_anchors * 4`` box channels and ``base_anchors * 2`` class
        channels.
      is_training: enables dropout and selects the training sampling and
        mask-cropping path.  Interpreted by plain truthiness.
      gt_boxes: ground-truth boxes, passed to the sampler when training.

    Returns:
      A dict of tensors with the keys ``rpn`` (per-level dicts holding
      ``box``, ``cls``, ``anchor`` and ``index``), ``rpn_*``, ``rcnn_*`` and
      ``mask_*``.
    """
    outputs = {}
    if _BN is True:
        arg_scope = _extra_conv_arg_scope_with_bn()
    else:
        arg_scope = _extra_conv_arg_scope(activation_fn=tf.nn.relu)
    # Pyramid levels are always visited coarse-to-fine: P5, P4, P3, P2.
    levels = (5, 4, 3, 2)

    with slim.arg_scope(arg_scope):
        with tf.variable_scope('pyramid'):
            # ---- RPN head on every pyramid level ----
            outputs['rpn'] = {}
            for i in levels:
                p = 'P%d' % i
                stride = 2 ** i

                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]
                rpn = slim.conv2d(
                    pyramid[p], 256, [3, 3], stride=1,
                    activation_fn=tf.nn.relu, scope='%s/rpn' % p)
                box = slim.conv2d(
                    rpn, base_anchors * 4, [1, 1], stride=1,
                    scope='%s/rpn/box' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.001),
                    activation_fn=None, normalizer_fn=None)
                cls = slim.conv2d(
                    rpn, base_anchors * 2, [1, 1], stride=1,
                    scope='%s/rpn/cls' % p,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.01),
                    activation_fn=None, normalizer_fn=None)

                # The same five anchor scales are used on every level.
                anchor_scales = [2, 4, 8, 16, 32]
                print("anchor_scales = ", anchor_scales)
                all_anchors = gen_all_anchors(
                    height, width, stride, anchor_scales)
                outputs['rpn'][p] = {
                    'box': box,
                    'cls': cls,
                    'anchor': all_anchors,
                }

            # ---- gather the RPN outputs of all levels ----
            rpn_boxes = [
                tf.reshape(outputs['rpn']['P%d' % i]['box'], [-1, 4])
                for i in levels
            ]
            rpn_clses = [
                tf.reshape(outputs['rpn']['P%d' % i]['cls'], [-1, 1])
                for i in levels
            ]
            rpn_anchors = [
                tf.reshape(outputs['rpn']['P%d' % i]['anchor'], [-1, 4])
                for i in levels
            ]
            rpn_boxes = tf.concat(values=rpn_boxes, axis=0)
            rpn_clses = tf.concat(values=rpn_clses, axis=0)
            rpn_anchors = tf.concat(values=rpn_anchors, axis=0)

            rpn_probs = tf.nn.softmax(tf.reshape(rpn_clses, [-1, 2]))
            rpn_final_boxes, rpn_final_clses, rpn_final_scores, indexs = \
                anchor_decoder(rpn_boxes, rpn_probs, rpn_anchors, ih, iw)

            # ---- split `indexs` back into per-level slices ----
            # P5 comes first in the concatenation, so it owns the leading
            # rows.  Every following level starts at "last index value of
            # the previous level + 1" and spans as many rows as that level
            # has boxes.
            p5_count = tf.shape(
                tf.reshape(outputs['rpn']['P5']['box'], [-1, 4]))[0]
            outputs['rpn']['P5']['index'] = indexs[0:p5_count]
            for i in range(4, 1, -1):
                p = 'P%d' % i
                prev_index = outputs['rpn']['P%d' % (i + 1)]['index']
                # Computed once and reused for both slice bounds.
                start = prev_index[-1] + 1
                count = tf.shape(
                    tf.reshape(outputs['rpn'][p]['box'], [-1, 4]))[0]
                outputs['rpn'][p]['index'] = indexs[start:start + count]

            outputs['rpn_boxes'] = rpn_boxes
            outputs['rpn_clses'] = rpn_clses
            outputs['rpn_anchor'] = rpn_anchors
            outputs['rpn_final_boxes'] = rpn_final_boxes
            outputs['rpn_final_clses'] = rpn_final_clses
            outputs['rpn_final_scores'] = rpn_final_scores
            outputs['rpn_indexs'] = indexs

            # ---- sample proposals ----
            # Plain truthiness is used so that this branch always agrees
            # with the `if is_training:` check guarding the mask cropping
            # further down.  The former `is_training is True` test sent a
            # truthy non-`True` flag (e.g. 1 or numpy.bool_) down the
            # inference sampler and then into the training mask branch,
            # which failed with a NameError on `rpn_rois_to_mask`.
            if is_training:
                # Training: both the RCNN and the mask network take RPN
                # boxes as input.
                (rpn_rois_to_rcnn, _, rpn_batch_inds_to_rcnn,
                 rpn_indexs_to_rcnn,
                 rpn_rois_to_mask, _, rpn_batch_inds_to_mask,
                 rpn_indexs_to_mask) = \
                    sample_rpn_outputs_with_gt(
                        rpn_final_boxes, rpn_final_scores, gt_boxes, indexs,
                        is_training=is_training, only_positive=False)
            else:
                # Testing: only the RCNN takes RPN boxes; the mask network
                # takes RCNN boxes (see below).
                (rpn_rois_to_rcnn, _, rpn_batch_inds_to_rcnn,
                 rpn_indexs_to_rcnn) = \
                    sample_rpn_outputs(
                        rpn_final_boxes, rpn_final_scores, indexs,
                        only_positive=True)

            # ---- assign RCNN ROIs to pyramid levels and crop ----
            [rcnn_assigned_rois, rcnn_assigned_batch_inds,
             rcnn_assigned_indexs, _] = \
                assign_boxes(rpn_rois_to_rcnn,
                             [rpn_rois_to_rcnn, rpn_batch_inds_to_rcnn,
                              rpn_indexs_to_rcnn],
                             [2, 3, 4, 5])

            # ROIs and their indices are collected level by level, in the
            # same order as the cropped features.
            rcnn_cropped_features = []
            rcnn_ordered_rois = []
            rcnn_ordered_index = []
            for i in levels:
                p = 'P%d' % i
                level_rois = rcnn_assigned_rois[i - 2]
                level_batch_inds = rcnn_assigned_batch_inds[i - 2]
                level_index = rcnn_assigned_indexs[i - 2]
                # Only the cropped features of the returned 4-tuple are
                # used.
                level_features, _, _, _ = ROIAlign(
                    pyramid[p], level_rois, level_batch_inds, ih, iw,
                    stride=2 ** i, pooled_height=14, pooled_width=14)
                rcnn_cropped_features.append(level_features)
                rcnn_ordered_rois.append(level_rois)
                rcnn_ordered_index.append(level_index)
            rcnn_cropped_features = tf.concat(
                values=rcnn_cropped_features, axis=0)
            rcnn_ordered_rois = tf.concat(values=rcnn_ordered_rois, axis=0)
            rcnn_ordered_index = tf.concat(
                values=rcnn_ordered_index, axis=0)

            # ---- RCNN head ----
            # The 3x3 / stride-2 max pool reduces the 14x14 crops to 7x7.
            rcnn = slim.max_pool2d(
                rcnn_cropped_features, [3, 3], stride=2, padding='SAME')
            rcnn = slim.flatten(rcnn)
            rcnn = slim.fully_connected(
                rcnn, 1024, activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001))
            rcnn = slim.dropout(
                rcnn, keep_prob=0.75, is_training=is_training)
            rcnn = slim.fully_connected(
                rcnn, 1024, activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001))
            rcnn = slim.dropout(
                rcnn, keep_prob=0.75, is_training=is_training)
            rcnn_clses = slim.fully_connected(
                rcnn, num_classes, activation_fn=None, normalizer_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001))
            rcnn_boxes = slim.fully_connected(
                rcnn, num_classes * 4, activation_fn=None,
                normalizer_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=0.001))
            rcnn_scores = tf.nn.softmax(rcnn_clses)

            # Decode the RCNN output into final boxes.
            rcnn_final_boxes, rcnn_final_classes, rcnn_final_scores = \
                roi_decoder(rcnn_boxes, rcnn_scores, rcnn_ordered_rois,
                            ih, iw)

            outputs['rcnn_ordered_rois'] = rcnn_ordered_rois
            outputs['rcnn_ordered_index'] = rcnn_ordered_index
            outputs['rcnn_cropped_features'] = rcnn_cropped_features
            tf.add_to_collection('__CROPPED__', rcnn_cropped_features)
            outputs['rcnn_boxes'] = rcnn_boxes
            outputs['rcnn_clses'] = rcnn_clses
            outputs['rcnn_scores'] = rcnn_scores
            outputs['rcnn_final_boxes'] = rcnn_final_boxes
            outputs['rcnn_final_clses'] = rcnn_final_classes
            outputs['rcnn_final_scores'] = rcnn_final_scores

            # ---- crop features for the mask network ----
            mask_cropped_features = []
            mask_ordered_rois = []
            mask_ordered_indexs = []
            if is_training:
                # Training: mask ROIs come from the RPN sampler.
                [mask_assigned_rois, mask_assigned_batch_inds,
                 mask_assigned_indexs, _] = \
                    assign_boxes(rpn_rois_to_mask,
                                 [rpn_rois_to_mask, rpn_batch_inds_to_mask,
                                  rpn_indexs_to_mask],
                                 [2, 3, 4, 5])

                for i in levels:
                    p = 'P%d' % i
                    level_rois = mask_assigned_rois[i - 2]
                    level_batch_inds = mask_assigned_batch_inds[i - 2]
                    level_index = mask_assigned_indexs[i - 2]
                    level_features, _, _, _ = ROIAlign(
                        pyramid[p], level_rois, level_batch_inds, ih, iw,
                        stride=2 ** i, pooled_height=14, pooled_width=14)
                    mask_cropped_features.append(level_features)
                    mask_ordered_rois.append(level_rois)
                    mask_ordered_indexs.append(level_index)
                mask_cropped_features = tf.concat(
                    values=mask_cropped_features, axis=0)
                mask_ordered_rois = tf.concat(
                    values=mask_ordered_rois, axis=0)
                mask_ordered_indexs = tf.concat(
                    values=mask_ordered_indexs, axis=0)
            else:
                # Testing: the mask network takes sampled RCNN boxes, and
                # their classes, scores and indices are carried along in
                # the same order.
                (rcnn_rois_to_mask, rcnn_clses_to_mask,
                 rcnn_scores_to_mask, rcnn_batch_inds_to_mask,
                 rcnn_indexs_to_mask) = \
                    sample_rcnn_outputs(
                        rcnn_final_boxes, rcnn_final_classes, rcnn_scores,
                        rcnn_ordered_index)
                [mask_assigned_rois, mask_assigned_clses,
                 mask_assigned_scores, mask_assigned_batch_inds,
                 mask_assigned_indexs, _] = \
                    assign_boxes(rcnn_rois_to_mask,
                                 [rcnn_rois_to_mask, rcnn_clses_to_mask,
                                  rcnn_scores_to_mask,
                                  rcnn_batch_inds_to_mask,
                                  rcnn_indexs_to_mask],
                                 [2, 3, 4, 5])

                mask_ordered_clses = []
                mask_ordered_scores = []
                for i in levels:
                    p = 'P%d' % i
                    level_rois = mask_assigned_rois[i - 2]
                    level_clses = mask_assigned_clses[i - 2]
                    level_scores = mask_assigned_scores[i - 2]
                    level_batch_inds = mask_assigned_batch_inds[i - 2]
                    level_index = mask_assigned_indexs[i - 2]
                    level_features, _, _, _ = ROIAlign(
                        pyramid[p], level_rois, level_batch_inds, ih, iw,
                        stride=2 ** i, pooled_height=14, pooled_width=14)
                    mask_cropped_features.append(level_features)
                    mask_ordered_rois.append(level_rois)
                    mask_ordered_indexs.append(level_index)
                    mask_ordered_clses.append(level_clses)
                    mask_ordered_scores.append(level_scores)
                mask_cropped_features = tf.concat(
                    values=mask_cropped_features, axis=0)
                mask_ordered_rois = tf.concat(
                    values=mask_ordered_rois, axis=0)
                mask_ordered_indexs = tf.concat(
                    values=mask_ordered_indexs, axis=0)
                mask_ordered_clses = tf.concat(
                    values=mask_ordered_clses, axis=0)
                mask_ordered_scores = tf.concat(
                    values=mask_ordered_scores, axis=0)
                outputs['mask_final_clses'] = mask_ordered_clses
                outputs['mask_final_scores'] = mask_ordered_scores

            # ---- mask head ----
            m = mask_cropped_features
            for _ in range(4):
                m = slim.conv2d(
                    m, 256, [3, 3], stride=1, padding='SAME',
                    activation_fn=tf.nn.relu)
            # The stride-2 transposed conv upsamples 14x14 to 28x28.
            m = slim.conv2d_transpose(
                m, 256, 2, stride=2, padding='VALID',
                activation_fn=tf.nn.relu)
            tf.add_to_collection('__TRANSPOSED__', m)
            m = slim.conv2d(
                m, num_classes, [1, 1], stride=1, padding='VALID',
                activation_fn=None, normalizer_fn=None)

            outputs['mask_ordered_rois'] = mask_ordered_rois
            outputs['mask_ordered_indexs'] = mask_ordered_indexs
            outputs['mask_cropped_features'] = mask_cropped_features
            outputs['mask_mask'] = m
            outputs['mask_final_mask'] = tf.nn.sigmoid(m)

    return outputs
def build_heads(pyramid, ih, iw, num_classes, base_anchors, is_training=False, gt_boxes=None):
    """Build the 3-way outputs, i.e., class, box and mask in the pyramid.

    Algo
    ----
    For each layer:
      1. Build anchor layer
      2. Process the results of anchor layer, decode the output into rois
      3. Sample rois
      4. Build roi layer
      5. Process the results of roi layer, decode the output into boxes
      6. Build the mask layer
      7. Build losses

    Args:
      pyramid: mapping indexed by 'P5'..'P2' that holds the feature map of
        each pyramid level (presumably NHWC tensors -- confirm with caller).
      ih, iw: forwarded unchanged to `anchor_decoder`, `ROIAlign_` and
        `roi_decoder` (presumably the input image height/width -- confirm).
      num_classes: output size of the refine classifier; the refine box layer
        gets `num_classes * 4` outputs and the mask layer `num_classes`
        channels.
      base_anchors: the RPN box / cls convolutions get `base_anchors * 4` and
        `base_anchors * 2` output channels respectively.
      is_training: forwarded to dropout and `sample_rpn_outputs_with_gt`.
        When false, the crops fed to the mask head are recomputed with
        `ROIAlign`.
      gt_boxes: forwarded to `sample_rpn_outputs_with_gt`.

    Returns:
      The `outputs` dict with the keys 'rpn', 'roi', 'assigned_rois',
      'assigned_layer_inds', 'ordered_rois', 'pyramid_feature', 'refined',
      'final_boxes' and 'mask'.
    """
    outputs = {}
    arg_scope = _extra_conv_arg_scope_with_bn(activation_fn=None)
    my_sigmoid = None
    with slim.arg_scope(arg_scope):
        with tf.variable_scope('pyramid'):
            outputs['rpn'] = {}
            for i in range(5, 1, -1):
                p = 'P%d' % i
                stride = 2**i

                ## rpn head
                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]
                rpn = slim.conv2d(pyramid[p], 256, [3, 3], stride=1,
                                  activation_fn=tf.nn.relu, scope='%s/rpn' % p)
                box = slim.conv2d(rpn, base_anchors * 4, [1, 1], stride=1,
                                  scope='%s/rpn/box' % p,
                                  weights_initializer=tf.truncated_normal_initializer(stddev=0.001),
                                  activation_fn=my_sigmoid)
                cls = slim.conv2d(rpn, base_anchors * 2, [1, 1], stride=1,
                                  scope='%s/rpn/cls' % p,
                                  weights_initializer=tf.truncated_normal_initializer(stddev=0.01))

                anchor_scales = [2**(i - 2), 2**(i - 1), 2**(i)]
                print("anchor_scales = ", anchor_scales)
                all_anchors = gen_all_anchors(height, width, stride, anchor_scales)
                outputs['rpn'][p] = {'box': box, 'cls': cls, 'anchor': all_anchors}

            ## gather all rois
            rpn_boxes = [tf.reshape(outputs['rpn']['P%d' % p]['box'], [-1, 4])
                         for p in range(5, 1, -1)]
            rpn_clses = [tf.reshape(outputs['rpn']['P%d' % p]['cls'], [-1, 1])
                         for p in range(5, 1, -1)]
            rpn_anchors = [tf.reshape(outputs['rpn']['P%d' % p]['anchor'], [-1, 4])
                           for p in range(5, 1, -1)]
            rpn_boxes = tf.concat(values=rpn_boxes, axis=0)
            rpn_clses = tf.concat(values=rpn_clses, axis=0)
            rpn_anchors = tf.concat(values=rpn_anchors, axis=0)

            outputs['rpn']['box'] = rpn_boxes
            outputs['rpn']['cls'] = rpn_clses
            outputs['rpn']['anchor'] = rpn_anchors

            rpn_probs = tf.nn.softmax(tf.reshape(rpn_clses, [-1, 2]))
            rois, roi_clses, scores, = anchor_decoder(rpn_boxes, rpn_probs, rpn_anchors, ih, iw)
            rois, scores, batch_inds, mask_rois, mask_scores, mask_batch_inds = \
                sample_rpn_outputs_with_gt(rois, rpn_probs[:, 1], gt_boxes, is_training=is_training)

            outputs['roi'] = {'box': rois, 'score': scores}

            ## cropping regions
            [assigned_rois, assigned_batch_inds, assigned_layer_inds] = \
                assign_boxes(rois, [rois, batch_inds], [2, 3, 4, 5])

            outputs['assigned_rois'] = assigned_rois
            outputs['assigned_layer_inds'] = assigned_layer_inds

            cropped_rois = []
            ordered_rois = []
            pyramid_feature = []
            for i in range(5, 1, -1):
                print(i)
                p = 'P%d' % i
                # assign_boxes was called with levels [2, 3, 4, 5], so level i
                # lives at list index i - 2.
                splitted_rois = assigned_rois[i - 2]
                batch_inds = assigned_batch_inds[i - 2]
                cropped, boxes_in_crop = ROIAlign_(pyramid[p], splitted_rois, batch_inds, ih, iw,
                                                   stride=2**i, pooled_height=14, pooled_width=14)
                cropped_rois.append(cropped)
                ordered_rois.append(splitted_rois)
                pyramid_feature.append(tf.transpose(pyramid[p], [0, 3, 1, 2]))

            cropped_rois = tf.concat(values=cropped_rois, axis=0)
            ordered_rois = tf.concat(values=ordered_rois, axis=0)

            outputs['ordered_rois'] = ordered_rois
            outputs['pyramid_feature'] = pyramid_feature
            outputs['roi']['cropped_rois'] = cropped_rois
            tf.add_to_collection('__CROPPED__', cropped_rois)

            ## refine head
            # to 7 x 7 (stride-2 pooling of the 14 x 14 crops)
            cropped_regions = slim.max_pool2d(cropped_rois, [3, 3], stride=2, padding='SAME')
            refine = slim.flatten(cropped_regions)
            refine = slim.fully_connected(refine, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(refine, keep_prob=0.75, is_training=is_training)
            refine = slim.fully_connected(refine, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(refine, keep_prob=0.75, is_training=is_training)
            cls2 = slim.fully_connected(refine, num_classes, activation_fn=None,
                                        weights_initializer=tf.truncated_normal_initializer(stddev=0.05))
            box = slim.fully_connected(refine, num_classes * 4, activation_fn=my_sigmoid,
                                       weights_initializer=tf.truncated_normal_initializer(stddev=0.05))

            outputs['refined'] = {'box': box, 'cls': cls2}

            ## decode refine net outputs
            cls2_prob = tf.nn.softmax(cls2)
            final_boxes, classes, scores = \
                roi_decoder(box, cls2_prob, ordered_rois, ih, iw)

            outputs['final_boxes'] = {'box': final_boxes, 'cls': classes, 'prob': cls2_prob}

            ## for testing, maskrcnn takes refined boxes as inputs
            if not is_training:
                rois = final_boxes
                # NOTE(review): the boxes are not re-assigned to levels here, so
                # the crops below still come from `assigned_rois`, not from
                # `final_boxes` -- confirm this is intended.
                #
                # `cropped_rois` / `ordered_rois` were rebound to concatenated
                # tensors above and tensors have no `.append`; collect the new
                # crops in fresh lists instead of crashing with AttributeError.
                cropped_rois = []
                ordered_rois = []
                for i in range(5, 1, -1):
                    p = 'P%d' % i
                    splitted_rois = assigned_rois[i - 2]
                    batch_inds = assigned_batch_inds[i - 2]
                    cropped = ROIAlign(pyramid[p], splitted_rois, batch_inds, stride=2**i,
                                       pooled_height=14, pooled_width=14)
                    cropped_rois.append(cropped)
                    ordered_rois.append(splitted_rois)
                cropped_rois = tf.concat(values=cropped_rois, axis=0)
                ordered_rois = tf.concat(values=ordered_rois, axis=0)

            ## mask head
            m = cropped_rois
            for _ in range(4):
                m = slim.conv2d(m, 256, [3, 3], stride=1, padding='SAME', activation_fn=tf.nn.relu)
            # to 28 x 28 (stride-2 transposed conv of the 14 x 14 crops)
            m = slim.conv2d_transpose(m, 256, 2, stride=2, padding='VALID', activation_fn=tf.nn.relu)
            tf.add_to_collection('__TRANSPOSED__', m)

            ### add the capsule block between the convolutional layers
            # NOTE(review): nothing computed in this capsule block feeds the
            # final mask convolution below (which still consumes `m`); the
            # block only adds ops/variables to the graph. Scope boundaries were
            # reconstructed from a flattened source -- confirm.
            with tf.variable_scope('PrimaryCaps_layer'):
                primaryCaps, activation = capslayer.layers.primaryCaps(
                    m, filters=32, kernel_size=3, strides=2,
                    out_caps_shape=[8, 1], padding='SAME')
            with tf.variable_scope('fc_reshape_Caps_layer'):
                fc_Caps, activation = capslayer.layers.fully_connected(
                    primaryCaps, activation, num_outputs=7 * 7 * 1,
                    out_caps_shape=[8, 1], routing_method='DynamicRouting')
                # The leading dimension used to be a hard-coded 0, which only
                # fits an empty tensor; -1 lets TensorFlow infer it from the
                # number of crops.
                fc_Caps = tf.reshape(fc_Caps, shape=[-1, 7, 7, 1, 8, 1])
            with tf.variable_scope('dePrimaryCaps_layer'):
                output = capslayer.layers.dePrimaryCaps(
                    fc_Caps, activation, num_outputs=128, kernel_size=3, strides=2)
                # Result intentionally left unbound, exactly as in the
                # original code; see the NOTE(review) above.
                tf.layers.conv2d_transpose(output, 256, kernel_size=9, strides=2)

            m = slim.conv2d(m, num_classes, [1, 1], stride=1, padding='VALID', activation_fn=None)

            # add a mask, given the predicted boxes and classes
            outputs['mask'] = {'mask': m, 'cls': classes, 'score': scores}

    return outputs
# Script-style smoke test: read one COCO tfrecord shard, preprocess it, build
# an ROIAlign op on the preprocessed image for four fixed boxes, then
# repeatedly evaluate the input tensors together with the ROIAlign output.
image, ih, iw, gt_boxes, gt_masks, num_instances, img_id = \
    coco.read('./data/coco/records/coco_trainval2014_00000-of-00048.tfrecord')
image, gt_boxes, gt_masks = \
    preprocess_coco.preprocess_image(image, gt_boxes, gt_masks)
sess = tf.Session()
# Built before the ROIAlign op below, so it only covers variables that exist
# at this point.
init_op = tf.group(tf.global_variables_initializer(),
                   tf.local_variables_initializer())
# init_op = tf.initialize_all_variables()
# Four hand-picked boxes (presumably [x1, y1, x2, y2] in image pixels --
# TODO confirm against ROIAlign).
boxes = [[100, 100, 200, 200], [50, 50, 100, 100], [100, 100, 750, 750],
         [50, 50, 60, 60]]
# boxes = np.zeros((0, 4))
boxes = tf.constant(boxes, tf.float32)
# Positional arguments after the boxes: False, 16, 7, 7 (presumably
# batch_inds, stride, pooled_height, pooled_width -- TODO confirm against the
# ROIAlign signature).
feat = ROIAlign(image, boxes, False, 16, 7, 7)
sess.run(init_op)
# NOTE(review): queue runners are started without a tf.train.Coordinator and
# the session is never closed in the visible code.
tf.train.start_queue_runners(sess=sess)
with sess.as_default():
    for i in range(20000):
        # One sess.run per iteration fetches the inputs and the ROIAlign
        # features together.
        image_np, ih_np, iw_np, gt_boxes_np, gt_masks_np, num_instances_np, img_id_np, \
            feat_np = \
            sess.run([image, ih, iw, gt_boxes, gt_masks, num_instances, img_id, feat])
        # print (image_np.shape, gt_boxes_np.shape, gt_masks_np.shape)
        # Report progress every 100 iterations.
        if i % 100 == 0:
            print('%d, image_id: %s, instances: %d' % (i, str(img_id_np), num_instances_np))
            # Rescales the fetched image as 256 * (x * 0.5 + 0.5); presumably
            # undoes a [-1, 1] normalisation from preprocessing -- TODO
            # confirm. The result is not used in the visible code.
            image_np = 256 * (image_np * 0.5 + 0.5)
def build_head(pyramid, num_classes, base_anchors, is_training=False):
    """Build the 3-way outputs, i.e., class, box and mask in the pyramid.

    Algo
    ----
    For each layer:
      1. Build anchor layer
      2. Process the results of anchor layer
      3. Build roi layer
      4. Process the results of roi layer
      5. Build the mask layer
      6. Build losses

    Args:
      pyramid: mapping holding the network input under 'inputs' and one
        feature map per level under 'P5'..'P2'. The static shapes of these
        tensors are read, with axis 1 taken as height and axis 2 as width.
      num_classes: number of classes; sizes the RPN, refine and mask output
        layers.
      base_anchors: multiplier on the RPN box / cls output channels.
      is_training: forwarded to dropout; when false, the mask head crops the
        refined boxes instead of the sampled RPN rois.

    Returns:
      dict keyed by level name ('P5'..'P2'); each value is a dict with the
      entries 'rpn', 'refined' and 'mask'.
    """
    outputs = {}
    inshape = pyramid['inputs'].get_shape()
    ih, iw = inshape[1].value, inshape[2].value
    arg_scope = _extra_conv_arg_scope(activation_fn=None)
    with slim.arg_scope(arg_scope):
        for i in range(5, 1, -1):
            p = 'P%d' % i
            stride = 2**i
            outputs[p] = {}

            # rpn head
            # Width is axis 2; the previous code read axis 1 twice, which gave
            # a wrong anchor grid for non-square feature maps.
            level_shape = pyramid[p].get_shape()
            height, width = level_shape[1].value, level_shape[2].value
            rpn = slim.conv2d(pyramid[p], 256, [3, 3], stride=1,
                              activation_fn=tf.nn.relu, scope='%s/rpn' % p)
            box = slim.conv2d(rpn, num_classes * base_anchors * 4, [1, 1], stride=1,
                              scope='%s/rpn/box' % p)
            cls = slim.conv2d(rpn, num_classes * base_anchors * 2, [1, 1], stride=1,
                              scope='%s/rpn/cls' % p)
            outputs[p]['rpn'] = {'box': box, 'classes': cls}

            # decode, sample and crop
            all_anchors = gen_all_anchors(height, width, stride)
            rois, classes, scores = \
                anchor_decoder(box, cls, all_anchors, ih, iw)
            rois, class_ids, scores = sample_rpn_outputs(rois, scores)
            cropped = ROIAlign(pyramid[p], rois, False, stride=stride,
                               pooled_height=7, pooled_width=7)

            # refine head
            # NOTE(review): `cropped` is passed to the fully connected layer
            # without an explicit flatten -- confirm the intended input rank.
            refine = slim.fully_connected(cropped, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(refine, keep_prob=0.75, is_training=is_training)
            refine = slim.fully_connected(refine, 1024, activation_fn=tf.nn.relu)
            refine = slim.dropout(refine, keep_prob=0.75, is_training=is_training)
            cls2 = slim.fully_connected(refine, num_classes, activation_fn=None)
            box = slim.fully_connected(refine, num_classes * 4, activation_fn=None)
            outputs[p]['refined'] = {'box': box, 'classes': cls2}

            # decode refine net outputs
            final_boxes, classes, scores = \
                roi_decoder(box, cls2, rois, ih, iw)

            # for testing, maskrcnn takes refined boxes as inputs
            if not is_training:
                rois = final_boxes

            # mask head
            m = ROIAlign(pyramid[p], rois, False, stride=stride,
                         pooled_height=14, pooled_width=14)
            # `_` keeps this counter from clobbering the pyramid-level index.
            for _ in range(4):
                m = slim.conv2d(m, 256, [3, 3], stride=1, padding='SAME',
                                activation_fn=tf.nn.relu)
            m = slim.conv2d_transpose(m, 256, [2, 2], stride=2, padding='VALID',
                                      activation_fn=tf.nn.relu)
            # One mask channel per class (was a hard-coded 81, identical when
            # num_classes == 81).
            m = slim.conv2d(m, num_classes, [1, 1], stride=1, padding='VALID',
                            activation_fn=None)

            # add a mask, given the predicted boxes and classes
            outputs[p]['mask'] = {'mask': m, 'classes': classes, 'scores': scores}

    return outputs