Example #1
File: model.py Project: fendaq/mask_yi
def rpn_graph(feature_map, anchor_per_location, anchor_stride):
    """
    根据特征图建立RPN网络的计算图,对应网络的输出
    :param feature_map:  特征图,形状为[批,高,宽,通道数]
    :param anchor_per_location:  int,每个像素点产生多少个anchor
    :param anchor_stride: 一般取1,表示特征图上,每个点都产生anchor
    :return: 一个列表,有三个元素,依次是anchor的logits,probs, bbox回归
    """
    batch_size, height, width, channal = feature_map.shape
    num_anchor = (height // anchor_stride) * (width // anchor_stride) * anchor_per_location  # total anchors this feature map produces; the stride thins out both spatial dims
    shared = conv2d(feature_map, 512, 3, anchor_stride, name='rpn_conv_shared')
    shared = tf.nn.relu(shared)
    # out_channal = 2 * anchor_per_location: two scores per anchor, object vs. non-object
    x = conv2d(inputs=shared,
               out_channal=2 * anchor_per_location,
               kernel_size=1,
               strides=1,
               name='rpn_class_raw')
    # Reshape to [batch, num_anchors, 2], where 2 means object/non-object
    # rpn_binary_logits = tf.reshape(x, shape=[batch_size, num_anchor, 2])
    rpn_binary_logits = tf.reshape(x, [batch_size, -1, 2])

    rpn_probs = tf.nn.softmax(rpn_binary_logits)

    x = conv2d(inputs=shared,
               out_channal=4 * anchor_per_location,
               kernel_size=1,
               strides=1,
               name='rpn_bbox_pred')
    # box-coordinate regression
    rpn_bbox = tf.reshape(x, [batch_size, -1, 4])

    return [rpn_binary_logits, rpn_probs, rpn_bbox]
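A minimal NumPy sketch (not from either repo) of the reshape that follows the 1x1 classification convolution, just to pin down the anchor ordering; the sizes here are made up:

import numpy as np

# Made-up sizes: batch 1, an 8x8 feature map, 3 anchors per location.
batch, height, width, k = 1, 8, 8, 3

# Stand-in for the rpn_class_raw output: one (object, non-object) score
# pair per anchor, stored along the channel axis as [batch, H, W, 2k].
x = np.random.randn(batch, height, width, 2 * k).astype(np.float32)

# The reshape used in rpn_graph: [batch, H, W, 2k] -> [batch, H*W*k, 2].
logits = x.reshape(batch, -1, 2)

assert logits.shape == (batch, height * width * k, 2)
# Row i holds the two scores of the anchor at row i // k // width,
# column (i // k) % width, ratio index i % k; softmax over the last
# axis then gives per-anchor object/non-object probabilities.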
Example #2
File: model.py Project: fendaq/mask_yi
def build_fpn_mask_graph(rois,
                         feature_maps,
                         image_shape,
                         pool_size,
                         num_class,
                         train_bn=True,
                         name=None):
    """
    构建mask
    :param rois:  Proposals, [batch, num_rois, (x1, y1, x2, y2)]
    :param feature_maps:  一个列表,[p2, p3, p4, p5] 代表四个层级的特征图
    :param image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
    :param pool_size: ROI Pooling后的大小,在论文中,对于mask是14*14
    :param num_class: 分类数,他决定了最终的通道数
    :param train_bn:
    :return: [num_boxes, 28, 28, num_classes]
    """
    # [num_boxes, height, width, channels], ROI Pooling后的结果
    x = pyramidROIAlign(pool_size, rois, image_shape, feature_maps)
    for _ in range(4):
        x = conv2d(inputs=x,
                   out_channal=256,
                   kernel_size=3,
                   strides=1,
                   use_bias=False)
        x = batch_norm(x, train_bn)
        x = tf.nn.relu(x)
    x = conv2d_transpose(inputs=x, out_channal=256, kernel_size=3, strides=2)
    x = tf.nn.relu(x)
    x = conv2d(x, num_class, 1, 1, use_bias=True, name=name)
    return x
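The repo's conv2d_transpose wrapper is not shown, so its padding mode is an assumption; under the standard TensorFlow output-size rules, both this example's 3x3 deconvolution and Example #4's 2x2 one double the 14x14 mask features to the documented 28x28. A quick sketch:

def deconv_out_size(in_size, kernel, stride, padding):
    """Output size of tf.layers.conv2d_transpose along one spatial dim."""
    if padding == 'same':
        return in_size * stride
    if padding == 'valid':
        return (in_size - 1) * stride + kernel
    raise ValueError(padding)

# Both forks double the 14x14 mask features to the promised 28x28:
assert deconv_out_size(14, kernel=3, stride=2, padding='same') == 28   # this example
assert deconv_out_size(14, kernel=2, stride=2, padding='valid') == 28  # Example #4

# With kernel_size=3, 'valid' padding would give 29, so the 3x3 variant
# only matches the documented 28x28 under 'same' padding.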
Example #3
File: model.py Project: fendaq/mask_yi
def fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_shape,
                         pool_size, num_classes):
    """
    构建FPN的分类与回归
    :param rois: Proposals, [batch, num_rois, (x1, y1, x2, y2)]
    :param mrcnn_feature_maps: 一个列表,[p2, p3, p4, p5] 代表四个层级的特征图
    其相对于输入图片的缩放倍数依次是8, 16, 32, 64
    :param input_image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
    :param pool_size: ROI Pooling后的大小,一般是7*7
    :param num_classes: 分类数,他决定了最终的通道数,因为我们用global avarage pool
    :return:
    """

    # [num_boxes, height, width, channels], the result of ROI pooling
    x = pyramidROIAlign(pool_size, rois, input_image_shape, mrcnn_feature_maps)

    num_boxes = tf.shape(x)[0]
    # TODO From this point on, batch sizes greater than 1 are abandoned entirely!!
    # This is effectively a fully connected layer, with num_boxes standing in for the batch dimension

    x = conv2d(inputs=x,
               out_channal=1024,
               kernel_size=pool_size[0],
               strides=pool_size[0],
               use_bias=True,
               name="mrcnn_class_conv1")

    # This acts as a fully connected layer, so no batch_norm here
    x = tf.nn.relu(x)
    x = conv2d(inputs=x,
               out_channal=1024,
               kernel_size=1,
               strides=1,
               use_bias=True,
               name="mrcnn_class_conv2")
    # At this point x has shape [num_boxes, 1, 1, 1024]; after the line below it becomes [num_boxes, 1024]
    shared = tf.squeeze(tf.nn.relu(x), axis=[1, 2])
    # Below we split into two heads, one for regression and one for classification
    mrcnn_class_logits = dense(inputs=shared,
                               out_dimension=num_classes,
                               use_biase=True,
                               name="mrcnn_class_logits")
    mrcnn_class_probs = tf.nn.softmax(mrcnn_class_logits,
                                      name="mrcnn_class_probs")

    mrcnn_bbox = dense(inputs=shared,
                       out_dimension=4 * num_classes,
                       use_biase=True)
    mrcnn_bbox = tf.reshape(mrcnn_bbox,
                            shape=[num_boxes, num_classes, 4],
                            name="mrcnn_class_bbox")
    # mrcnn_class_logits and mrcnn_class_probs both have shape [num_boxes, num_classes]
    # mrcnn_bbox has shape [num_boxes, num_classes, (dx, dy, log(h), log(w))]
    return mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox
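The comments above treat the kernel_size = pool_size convolution as a fully connected layer in disguise. A small self-contained NumPy check of that equivalence (not repo code; sizes assume the usual 7x7 pool and 256-channel FPN features) for a VALID convolution covering the whole pooled window:

import numpy as np

num_boxes, pool, channels, out_dim = 5, 7, 256, 1024
x = np.random.randn(num_boxes, pool, pool, channels)
w = np.random.randn(pool, pool, channels, out_dim)

# A VALID convolution whose kernel spans the whole 7x7 window emits a
# single 1x1 output per box: a sum over (h, w, c) of x * w.
conv_out = np.einsum('nhwc,hwco->no', x, w)

# The same numbers come from flattening each ROI and applying a dense layer.
dense_out = x.reshape(num_boxes, -1) @ w.reshape(-1, out_dim)

assert np.allclose(conv_out, dense_out)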
Example #4
File: model.py Project: GuoYi0/mask_yi
def build_fpn_mask_graph(rois,
                         feature_maps,
                         image_shape,
                         pool_size,
                         num_class,
                         train_bn=True,
                         name=None):
    """
    构建mask
    :param rois:  Proposals, [num_rois, (x1, y1, x2, y2)]
    :param feature_maps:  一个列表,[p2, p3, p4, p5] 代表四个层级的特征图
    :param image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
    :param pool_size: ROI Pooling后的大小,在论文中,对于mask是14*14
    :param num_class: 分类数,他决定了最终的通道数
    :param train_bn:
    :return: [num_boxes, 28, 28, num_classes]
    """
    # [num_boxes, height, width, channels], ROI Pooling后的结果
    x = pyramidROIAlign(pool_size, rois, image_shape, feature_maps)
    asserts = tf.Assert(tf.shape(x)[0] > 0, data=[tf.shape(x)])
    with tf.control_dependencies([asserts]):
        x = tf.identity(x)
    for i in range(4):
        x = conv2d(inputs=x,
                   out_channal=256,
                   kernel_size=3,
                   strides=1,
                   use_bias=False,
                   name=name + "_conv" + str(i + 1))
        x = batch_norm(x, train_bn, name=name + "_bn" + str(i + 1))
        x = tf.nn.relu(x)
    x = conv2d_transpose(inputs=x,
                         out_channal=256,
                         kernel_size=2,
                         strides=2,
                         name=name + "_deconv")
    x = tf.nn.relu(x)
    x = conv2d(x, num_class, 1, 1, use_bias=True, name=name)
    return x
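This version guards against an empty ROI batch with the TF1 graph-mode assertion idiom. A minimal standalone sketch of the pattern (an assumption: tensorflow.compat.v1 with eager execution disabled, since these ops predate TF2 eager mode):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.placeholder(tf.float32, [None, 14, 14, 256])
check = tf.Assert(tf.shape(x)[0] > 0, data=[tf.shape(x)])
with tf.control_dependencies([check]):
    # tf.identity re-emits x with the Assert attached as a dependency;
    # without it, nothing downstream would force the Assert to execute.
    x = tf.identity(x)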
Example #5
File: model.py Project: GuoYi0/mask_yi
    def build_model(self,
                    mode,
                    input_image,
                    gt_boxes=None,
                    class_ids=None,
                    input_gt_mask=None,
                    anchor_labels=None,
                    anchor_deltas=None):
        """
        feature map有五个层级,p2, p3, p4, p5, p6,其相对于输入图片的缩放倍数依次是8, 16, 32, 64, 128
        在特征图上每个像素点的位置都要产生3个不同ratio的anchor.假设第s层有N个像素点,则在s层产生的anchor数是3N,
        其shape为[3N, (x1, y1, x2, y2)]。有五个层级,则调用tf.cancat函数在第0维拼接起来,形成的shape是
        [num_anchor, (x1, y1, x2, y2)]。最后,按照批数拼接起来,最终的shape是[batch, num_anchor, (x1, y1, x2, y2)]
        我们采用正则化坐标,故所有的坐标值的范围都必须在区间[0,1]里面

        mode:  必须是'training','validation','inference'三者之一。mode是'training' 或者 'validation'时,所有参数都不能是None,
        mode是'inference'时,只需要提供input_image即可

        :param mode:  必须是'training','validation','inference'三者之一
        :param input_image: [1, 高, 宽, 3]  # 简单一点,每次一张图片, float32
        :param gt_boxes: shape=[1, gt个数, 4], float32
        :param class_ids: shape=[1, gt个数], tf.int32
        :param input_gt_mask: [1,gt个数,高,宽], bool
        :param anchor_labels: [批数,anchor个数],其中1表示正例,0表示负例,-1表示不予考虑, int32
        :param anchor_deltas: anchor与gt之间的回归差异,[批数,anchor个数,(dx, dy, log(h), log(w))], float32
        :return:
        """
        mode_validation = mode in ['training', 'validation', 'inference']
        with tf.control_dependencies(
            [tf.Assert(mode_validation, data=["invalid mode"])]):
            batch_size = input_image.shape[0]
            resnet = Model(resnetlist=resnet50, version=1)
        training = (mode == 'training')

        # layer is a list containing c2, c3, c4, c5, whose downscaling factors relative to the input image are 8, 16, 32, 64
        # resolution is the list of downscaling factors of the input image relative to the feature maps, i.e. [8, 16, 32, 64]
        layer, resolution = resnet(inputs=input_image, training=training)
        P5 = conv2d(inputs=layer[3],
                    out_channal=256,
                    kernel_size=1,
                    strides=1,
                    name='fpn_c5p5')

        P4 = tf.add_n([
            conv2d_transpose(inputs=P5,
                             out_channal=256,
                             kernel_size=1,
                             strides=2,
                             name="fpn_trans4"),
            conv2d(inputs=layer[2],
                   out_channal=256,
                   kernel_size=1,
                   strides=1,
                   name="fpn_c4p4")
        ],
                      name="fpn_p4add")

        P3 = tf.add_n([
            conv2d_transpose(inputs=P4,
                             out_channal=256,
                             kernel_size=1,
                             strides=2,
                             name="fpn_trans3"),
            conv2d(inputs=layer[1],
                   out_channal=256,
                   kernel_size=1,
                   strides=1,
                   name="fpn_c3p3")
        ],
                      name="fpn_p3add")

        P2 = tf.add_n([
            conv2d_transpose(inputs=P3,
                             out_channal=256,
                             kernel_size=1,
                             strides=2,
                             name="fpn_trans2"),
            conv2d(inputs=layer[0],
                   out_channal=256,
                   kernel_size=1,
                   strides=1,
                   name="fpn_c2p2")
        ],
                      name="fpn_p2add")

        # Following FPN, apply one final convolution to get the output feature maps. No nonlinearity.
        p2 = conv2d(P2, 256, 3, 1, name="fpn_p2")
        p3 = conv2d(P3, 256, 3, 1, name="fpn_p3")
        p4 = conv2d(P4, 256, 3, 1, name="fpn_p4")
        p5 = conv2d(P5, 256, 3, 1, name="fpn_p5")

        # feature map 6 is used only for the RPN, not for proposal classification
        p6 = tf.layers.max_pooling2d(inputs=p5,
                                     pool_size=1,
                                     strides=2,
                                     name='feature_map6')
        resolution6 = resolution[-1] * 2
        resolution.append(resolution6)

        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        # Define a list whose length is the number of output feature-map levels, holding each level's RPN output.
        # Each RPN output is [rpn_binary_logits, rpn_probs, rpn_bbox],
        # with shapes [batch, anchors at this level, 2], [batch, anchors, 2], [batch, anchors, 4]
        layer_output = []

        for i, p in enumerate(rpn_feature_maps):
            layer_output.append(
                rpn_graph(p,
                          config.anchor_per_location,
                          anchor_stride=1,
                          name=str(i)))

        # Join the per-level outputs: [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_name = ['rpn_binary_logits', 'rpn_binary_probs', 'rpn_bbox']
        outputs = list(zip(*layer_output))
        # Concatenate from the bottom level up; afterwards the anchor axis holds the anchors of all five levels
        outputs = [
            tf.concat(list(o), axis=1, name=n)
            for o, n in zip(outputs, output_name)
        ]
        # [batch, num_anchors, 2], [batch, num_anchors, 2], [batch, num_anchors, 4]
        rpn_binary_logits, rpn_binary_probs, rpn_bbox_pred = outputs

        # number of proposals to keep
        num_proposal = config.POST_NMS_ROIS_TRAINING if mode == 'training' else config.POST_NMS_ROIS_INFERENCE

        if self.anchors is None:
            self.get_anchors(config.batch_size, resolution, config.input_shape,
                             config.smallest_anchor_size)

        # From the anchors, generate the proposals left after non-maximum suppression; shape is [count, 4]
        proposal = proposalLayer(
            inputs=[rpn_binary_probs, rpn_bbox_pred, self.anchors],
            max_proposal=num_proposal,
            nms_thresh=config.RPN_NMS_THRESHOLD,
            name="ROI")

        if mode == 'inference':
            mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
                proposal,
                mrcnn_feature_maps,
                config.IMAGE_SHAPE,
                config.POOL_SIZE,
                config.NUM_CLASSES,
                name="mrcnn")
            # Boxes after final processing, [x1, y1, x2, y2], with their classes and probabilities
            boxes, ids, probs = detectionLayer(proposal, mrcnn_class_probs,
                                               mrcnn_bbox, config.IMAGE_SHAPE)
            mask = build_fpn_mask_graph(tf.expand_dims(boxes, 0),
                                        mrcnn_feature_maps,
                                        config.IMAGE_SHAPE,
                                        config.MASK_POOL_SIZE,
                                        config.NUM_CLASSES,
                                        train_bn=training,
                                        name="mrcnn_mask")
            mask = filter_mask(mask, ids)
            mask = tf.nn.sigmoid(mask)
            return [boxes, ids, probs, mask]

        else:
            # Call detection_targets to process the proposals and return them with their classes, regressions, and masks.
            # Batching is awkward here, so we painfully process one image at a time.

            # [N, (x1, y1, x2, y2)]; [N]; [N, 4]; [N, height, width], float32
            rois, target_class_ids, target_bbox, target_mask = detection_targets(
                proposal,
                gt_class_ids=class_ids[0],
                gt_boxes=gt_boxes[0],
                gt_masks=input_gt_mask[0])

            # mrcnn_class_logits and mrcnn_class_probs both have shape [num_boxes, num_classes]
            # mrcnn_bbox has shape [num_boxes, num_classes, (dx, dy, log(h), log(w))]
            mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
                rois,
                mrcnn_feature_maps,
                config.IMAGE_SHAPE,
                config.POOL_SIZE,
                config.NUM_CLASSES,
                name="mrcnn")

            # [num_boxes, 28, 28, num_classes]
            mrcnn_mask_logits = build_fpn_mask_graph(rois,
                                                     mrcnn_feature_maps,
                                                     config.IMAGE_SHAPE,
                                                     config.MASK_POOL_SIZE,
                                                     config.NUM_CLASSES,
                                                     train_bn=training,
                                                     name="mrcnn_mask")

            # rpn loss
            rpn_binary_loss = rpn_binary_loss_graph(anchor_labels,
                                                    rpn_binary_logits)
            rpn_bbox_loss = rpn_bbox_loss_graph(anchor_deltas, rpn_bbox_pred,
                                                anchor_labels)
            proposal_class_loss, targets_id = proposal_class_loss_graph(
                target_class_ids, mrcnn_class_logits, config.NUM_CLASSES)
            proposal_bbox_loss = proposal_bbox_loss_graph(
                target_bbox, mrcnn_bbox, target_class_ids)
            mask_loss = mask_loss_graph(target_mask, mrcnn_mask_logits,
                                        target_class_ids, config.NUM_CLASSES)

            rpn_loss = rpn_binary_loss + rpn_bbox_loss  # RPN loss
            proposal_loss = proposal_class_loss + proposal_bbox_loss  # proposal loss
            total_loss = rpn_loss + proposal_loss + mask_loss
            # Return the RPN loss, proposal loss, mask loss, and total loss
            return [rpn_loss, proposal_loss, mask_loss, total_loss]
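To make the docstring's anchor bookkeeping concrete, a back-of-the-envelope count in plain Python; the strides 8, 16, 32, 64, 128 and the 3 anchors per location come from the docstring, while the 1024x1024 input size is a made-up assumption:

# Per-level and total anchor counts for the five RPN feature maps.
image_size = 1024
strides = [8, 16, 32, 64, 128]
anchors_per_location = 3

per_level = [(image_size // s) ** 2 * anchors_per_location for s in strides]
total = sum(per_level)
print(per_level)  # [49152, 12288, 3072, 768, 192]
print(total)      # 65472: the size of the num_anchor axis after tf.concat(axis=1)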
Example #6
File: model.py Project: fendaq/mask_yi
def build_model(mode,
                input_image,
                gt_boxes=None,
                class_ids=None,
                input_gt_mask=None,
                rpn_binary_gt=None,
                rpn_bbox_gt=None,
                anchors=None):
    # TODO In the input layer, anchors are generated following feat_strides in config, i.e. 128, 64, 32, 16, 8
    """
    feature map有五个层级,p2, p3, p4, p5, p6,其相对于输入图片的缩放倍数依次是8, 16, 32, 64, 128
    在特征图上每个像素点的位置都要产生k^2个anchor.假设第s层有N个像素点,则在s层产生的anchor数是N*k^2,
    其shape为[N*k^2, (x1, y1, x2, y2)]。有五个层级,则调用tf.cancat函数在第0维拼接起来,形成的shape是
    [num_anchor, (x1, y1, x2, y2)]。最后,按照批数拼接起来,最终的shape是[batch, num_anchor, (x1, y1, x2, y2)]
    我们采用正则化坐标,故所有的坐标值的范围都必须在区间[0,1]里面

    mode:  必须是'training','validation','inference'三者之一。mode是'training' 或者 'validation'时,所有参数都不能是None,
    mode是'inference'时,只需要提供input_image即可

    :param mode:  必须是'training','validation','inference'三者之一
    :param input_image: [1, 高, 宽, 3]  # 简单一点,每次一张图片
    :param gt_boxes: shape=[1, 个数, 4]
    :param class_ids: shape=[1, 个数]
    :param input_gt_mask: [1,高,宽]
    :param rpn_binary_gt: shape=[1, None, 1] anchor的标签,0表示背景,1表示instance,-1表示不关心
    :param rpn_bbox_gt: shape=[1, None, 4], anchor的回归目标值
    :param anchors: [1, num_anchor, (x1, y1, x2, y2)]
    :return:
    """
    mode_validation = mode in ['training', 'validation', 'inference']
    with tf.control_dependencies(
        [tf.Assert(mode_validation, data=["invalid mode"])]):
        batch_size = input_image.shape[0]
        resnet = Model(resnetlist=resnet50, version=2)
    training = (mode == 'training')

    # layer is a list containing c2, c3, c4, c5, whose downscaling factors relative to the input image are 8, 16, 32, 64
    # resolution is the downscaling factor of the input image relative to the feature map; its value is 64
    layer, resolution = resnet(inputs=input_image, training=training)
    P5 = conv2d(inputs=layer[3],
                out_channal=256,
                kernel_size=1,
                strides=1,
                name='fpn_c5p5')
    P4 = tf.add_n([
        conv2d_transpose(inputs=P5,
                         out_channal=256,
                         kernel_size=1,
                         strides=2,
                         name="fpn_trans4"),
        conv2d(inputs=layer[2],
               out_channal=256,
               kernel_size=1,
               strides=1,
               name="fpn_c4p4")
    ],
                  name="fpn_p4add")

    P3 = tf.add_n([
        conv2d_transpose(inputs=P4,
                         out_channal=256,
                         kernel_size=1,
                         strides=2,
                         name="fpn_trans4"),
        conv2d(inputs=layer[1],
               out_channal=256,
               kernel_size=1,
               strides=1,
               name="fpn_c3p3")
    ],
                  name="fpn_p3add")

    P2 = tf.add_n([
        conv2d_transpose(inputs=P3,
                         out_channal=256,
                         kernel_size=1,
                         strides=2,
                         name="fpn_trans4"),
        conv2d(inputs=layer[0],
               out_channal=256,
               kernel_size=1,
               strides=1,
               name="fpn_c2p2")
    ],
                  name="fpn_p2add")

    # Following FPN, apply one final convolution to get the output feature maps. No nonlinearity.
    p2 = conv2d(P2, 256, 3, 1, name="fpn_p2")
    p3 = conv2d(P3, 256, 3, 1, name="fpn_p3")
    p4 = conv2d(P4, 256, 3, 1, name="fpn_p4")
    p5 = conv2d(P5, 256, 3, 1, name="fpn_p5")

    # feature map 6 is used only for the RPN, not for classification
    p6 = tf.layers.max_pooling2d(inputs=p5,
                                 pool_size=1,
                                 strides=2,
                                 name='feature_map6')
    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    # Define a list whose length is the number of output feature-map levels, holding each level's RPN output.
    # Each RPN output is [rpn_binary_logits, rpn_probs, rpn_bbox],
    # with shapes [batch, anchors at this level, 2], [batch, anchors, 2], [batch, anchors, 4]
    layer_output = []
    for p in rpn_feature_maps:
        layer_output.append(
            rpn_graph(p, config.anchor_per_location, anchor_stride=1))

    # Join the per-level outputs: [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_name = ['rpn_binary_logits', 'rpn_binary_probs', 'rpn_bbox']
    outputs = list(zip(*layer_output))
    # After concatenation, the anchor axis holds the anchors of all five levels
    outputs = [
        tf.concat(list(o), axis=1, name=n)
        for o, n in zip(outputs, output_name)
    ]
    # [batch, num_anchors, 2], [batch, num_anchors, 2], [batch, num_anchors, 4]
    rpn_binary_logits, rpn_binary_probs, rpn_bbox_pred = outputs

    # number of proposals to keep
    num_proposal = config.POST_NMS_ROIS_TRAINING if mode == 'training' else config.POST_NMS_ROIS_INFERENCE

    # Generate the proposals left after non-maximum suppression; shape is [batch, count, 4]
    proposal = proposalLayer(inputs=[rpn_binary_probs, rpn_bbox_pred, anchors],
                             max_proposal=num_proposal,
                             nms_thresh=config.RPN_NMS_THRESHOLD,
                             name="ROI")

    if mode == 'inference':
        mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
            proposal, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE,
            config.NUM_CLASSES)
        # Boxes after final processing, [x1, y1, x2, y2], with their classes and probabilities
        boxes, ids, probs = detectionLayer(proposal, mrcnn_class_probs,
                                           mrcnn_bbox, config.IMAGE_SHAPE)
        mask = build_fpn_mask_graph(tf.expand_dims(boxes, 0),
                                    mrcnn_feature_maps,
                                    config.IMAGE_SHAPE,
                                    config.MASK_POOL_SIZE,
                                    config.NUM_CLASSES,
                                    train_bn=training)
        mask = filter_mask(mask, ids)
        mask = tf.nn.sigmoid(mask)
        return [boxes, ids, probs, mask]

    else:

        # Call detection_targets to return the proposals with their classes, regressions, and masks.
        # Batching is awkward here, so we painfully process one image at a time.
        rois_list, target_class_ids_list, target_bbox_list, target_mask_list = [], [], [], []
        for i in range(batch_size):
            # roi_gt_class_ids [M]: the class of each proposal
            # gt_deltas [M, (dx, dy, log(h), log(w))]:
            # the regression of each proposal relative to its gt
            # masks [M, height, width]
            # [N, (x1, y1, x2, y2)]; [N]; [N, 4]; [N, height, width]
            rois, target_class_ids, target_bbox, target_mask = detection_targets(
                proposal[i],
                gt_class_ids=class_ids[i],
                gt_boxes=gt_boxes[i],
                gt_masks=input_gt_mask[i])
            rois_list.append(rois)
            target_bbox_list.append(target_bbox)
            target_class_ids_list.append(target_class_ids)
            target_mask_list.append(target_mask)
        rois = tf.convert_to_tensor(rois_list)
        target_bbox = tf.convert_to_tensor(target_bbox_list)
        target_class_ids = tf.convert_to_tensor(target_class_ids_list)
        target_mask = tf.convert_to_tensor(target_mask_list)  # [batch, N, height, width]

        # mrcnn_class_logits and mrcnn_class_probs both have shape [num_boxes, num_classes]
        # mrcnn_bbox has shape [num_boxes, num_classes, (dx, dy, log(h), log(w))]
        mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
            rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE,
            config.NUM_CLASSES)

        # [num_boxes, 28, 28, num_classes]
        mrcnn_mask_logits = build_fpn_mask_graph(rois,
                                                 mrcnn_feature_maps,
                                                 config.IMAGE_SHAPE,
                                                 config.MASK_POOL_SIZE,
                                                 config.NUM_CLASSES,
                                                 train_bn=training,
                                                 name="mrcnn_mask_logits")

        # rpn loss
        rpn_binary_loss = rpn_binary_loss_graph(rpn_binary_gt,
                                                rpn_binary_logits)
        rpn_bbox_loss = rpn_bbox_loss_graph(rpn_bbox_gt, rpn_bbox_pred,
                                            rpn_binary_gt)
        # proposal loss
        proposal_class_loss = proposal_class_loss_graph(
            target_class_ids, mrcnn_class_logits, config.NUM_CLASSES)
        proposal_bbox_loss = proposal_bbox_loss_graph(target_bbox, mrcnn_bbox,
                                                      target_class_ids)
        mask_loss = mask_loss_graph(target_mask, mrcnn_mask_logits,
                                    target_class_ids, config.NUM_CLASSES)
        rpn_loss = rpn_binary_loss + rpn_bbox_loss  # RPN loss
        proposal_loss = proposal_class_loss + proposal_bbox_loss  # proposal loss
        total_loss = rpn_loss + proposal_loss + mask_loss
        # Return the RPN loss, proposal loss, mask loss, and total loss
        return [rpn_loss, proposal_loss, mask_loss, total_loss]
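Both build_model examples regroup the per-level RPN outputs with zip(*...) before concatenating. A two-line check of that transposition (plain Python, not repo code):

# Per-level output triples become per-output lists of levels.
layer_output = [['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']]
outputs = list(zip(*layer_output))
assert outputs == [('a1', 'a2'), ('b1', 'b2'), ('c1', 'c2')]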