Example #1
    def test_return_var_tuple(self):
        """
        pseudocode:

        if True:
            return 1, True
        else:
            return 3, 2
        """
        def true_func():
            return layers.fill_constant(shape=[1, 2], dtype='int32',
                                        value=1), layers.fill_constant(
                                            shape=[2, 3],
                                            dtype='bool',
                                            value=True)

        def false_func():
            return layers.fill_constant(shape=[3, 4], dtype='float32',
                                        value=3), layers.fill_constant(
                                            shape=[4, 5],
                                            dtype='int64',
                                            value=2)

        main_program = Program()
        startup_program = Program()
        with program_guard(main_program, startup_program):
            pred = layers.fill_constant(shape=[1], dtype='bool', value=True)
            out = layers.cond(pred, true_func, false_func)
            # out is a tuple containing 2 tensors

        place = fluid.CUDAPlace(
            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
        exe = fluid.Executor(place)
        ret = exe.run(main_program, fetch_list=out)
        self.assertTrue(
            np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32)))
        self.assertTrue(
            np.allclose(np.asarray(ret[1]), np.full((2, 3), True, bool)))
Example #2
    def test_pass_and_modify_var(self):
        """
        pseudocode:
        for i in range(5):
            a = 7
            if i % 2 == 0:
                a = a * (i + 1)
            else:
                a = a - (i - 1)
        """

        def true_func(a, i):
            a = a * (i + 1)
            return a

        def false_func(a, i):
            a = a - (i - 1)
            return a

        main_program = Program()
        startup_program = Program()
        with program_guard(main_program, startup_program):
            a = layers.fill_constant(shape=[3, 2, 1], dtype='int32', value=7)
            i = fluid.data(name="i", shape=[1], dtype='int32')
            pred = ((i % 2) == 0)
            a = layers.cond(pred, lambda: true_func(a, i),
                            lambda: false_func(a, i))
        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
        exe = fluid.Executor(place)
        for feed_i in range(5):
            expected_a = 7 * (feed_i + 1) if feed_i % 2 == 0 else 8 - feed_i
            ret = exe.run(main_program,
                          feed={'i': np.full((1), feed_i, np.int32)},
                          fetch_list=[a])
            self.assertTrue(
                np.allclose(
                    np.asarray(ret), np.full((3, 2, 1), expected_a, np.int32)))
Example #3
    def test_extremely_simple_net_with_op_in_condition(self):
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            a = fluid.layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=1.23)
            a.stop_gradient = False
            b = fluid.layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=1.25)
            b.stop_gradient = False
            out = layers.cond(a - b < -1.0, lambda: a, lambda: b)
        append_backward(out)

        place = fluid.CUDAPlace(
            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
        exe = fluid.Executor(place)
        ret = exe.run(main_program, fetch_list=[out, a.grad_name, b.grad_name])
        # Note: fill_constant can lose precision, so assertEqual must compare
        # against values that are exactly representable in floating point.
        self.assertEqual(ret[0][0], 1.25)
        self.assertEqual(ret[1][0], 0.0)
        self.assertEqual(ret[2][0], 1.0)
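The precision caveat in that comment is easy to verify outside Paddle: 1.25 (a sum of powers of two) is exactly representable in binary floating point, while 1.23 is not. A minimal NumPy-only sketch:

import numpy as np

# 1.25 = 1 + 1/4 has a finite binary expansion, so float32 stores it exactly;
# 1.23 does not, so float32 only approximates it.
print(np.float32(1.25) == 1.25)   # True  -> safe to use with assertEqual
print(np.float32(1.23) == 1.23)   # False -> assertEqual would spuriously fail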
Example #4
    def build_program(self):
        def true_func():
            return layers.fill_constant(shape=[1, 2], dtype='int32',
                                        value=1), layers.fill_constant(
                                            shape=[2, 3],
                                            dtype='bool',
                                            value=True)

        def false_func():
            return layers.fill_constant(shape=[3, 4], dtype='float32',
                                        value=3), layers.fill_constant(
                                            shape=[4, 5],
                                            dtype='int64',
                                            value=2)

        main_program = Program()
        startup_program = Program()
        with program_guard(main_program, startup_program):
            x = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
            y = layers.fill_constant(shape=[1], dtype='float32', value=0.23)
            pred = layers.less_than(x, y)
            out = layers.cond(pred, true_func, false_func)
            # out is a tuple containing 2 tensors
            return main_program, startup_program, out
Example #5
 def branch(i, img, label):
     return layers.cond(
         (i % 2) == 0,
         lambda: simple_fc_net_with_inputs(img, label, class_num=10),
         lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
Example #6
 def cond_func(i, img, label):
     predicate = ((i % 2) == 0)
     return layers.cond(
         predicate,
         lambda: simple_fc_net_with_inputs(img, label, class_num=10),
         lambda: batchnorm_fc_with_inputs(img, label, class_num=10))
Example #7
 def greater_equal_branch(i, a):
     return layers.cond(i < 8.0, lambda: layers.elementwise_mul(a, a),
                        lambda: layers.elementwise_div(a, a))
Example #8
 def less_than_branch(i, a):
     return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a),
                        lambda: layers.elementwise_sub(a, a))
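Examples #7 and #8 define branch functions meant to be nested inside an outer cond. A minimal sketch of how they could be combined (the outer predicate i < 5.0 and the fluid.data inputs are illustrative assumptions, not part of the original snippets):

# hypothetical outer cond dispatching to the two branch functions above
i = fluid.data(name='i', shape=[1], dtype='float32')
a = fluid.data(name='a', shape=[1], dtype='float32')
out = layers.cond(i < 5.0, lambda: less_than_branch(i, a),
                  lambda: greater_equal_branch(i, a))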
Example #9
def fastnms(all_pred_boxes, all_pred_scores, resize_shape, origin_shape,
            conf_thresh, nms_thresh, keep_top_k, nms_top_k, use_yolo_box):
    '''
    :param all_pred_boxes:      [batch_size, -1, 4]
    :param all_pred_scores:     [batch_size, -1, 80]
    :param resize_shape:        [batch_size, 2]
    :param origin_shape:        [batch_size, 2]
    '''
    conf_preds = P.transpose(all_pred_scores, perm=[0, 2, 1])  # [1, 80, -1]
    cur_scores = conf_preds[0]  # [80, -1]
    conf_scores = P.reduce_max(cur_scores, dim=0)  # [-1, ]
    # If keep is [None] and is then used in gather(), the error
    # "cudaGetLastError: invalid configuration argument errno: 9" occurs.
    # To avoid that, keep must never be [None], so when it is we assign it
    # the coordinate [[0]] instead.
    keep = P.where(conf_scores > conf_thresh)

    def exist_objs_1(keep):
        return keep

    def no_objs_1():
        keep_extra = P.zeros((1, 1), 'int64')
        return keep_extra

    keep = P.cond(P.shape(keep)[0] == 0, no_objs_1, lambda: exist_objs_1(keep))
    scores = P.gather(all_pred_scores[0], keep)
    scores = P.transpose(scores, perm=[1, 0])
    boxes = P.gather(all_pred_boxes[0], keep)
    boxes, scores, classes = fast_nms(boxes, scores, conf_thresh, nms_thresh,
                                      keep_top_k, nms_top_k)

    # Filter by score once more. As noted above, a box is kept as long as its
    # highest class score exceeds the threshold. But in the matrix computed above,
    # each box is effectively duplicated 80 times, one copy per class. Copies for
    # the non-top classes may score below the threshold and must be filtered out.
    # Hence fastnms has this quirk: if a box's top score > threshold and one of its
    # non-top class scores also exceeds the threshold, both copies may survive,
    # leaving two boxes with identical xywh.
    keep = P.where(scores > conf_thresh)

    def exist_objs_2(keep, boxes, classes, scores):
        boxes = P.gather(boxes, keep)
        classes = P.gather(classes, keep)
        scores = P.gather(scores, keep)
        return boxes, classes, scores

    def no_objs_2(boxes, classes, scores):
        keep = P.zeros((1, 1), 'int64')
        boxes = P.gather(boxes, keep)
        classes = P.gather(classes, keep)
        scores = P.gather(scores, keep)
        scores -= 2.0  # cleverly push the scores negative so the Python side can filter them out
        return boxes, classes, scores

    boxes, classes, scores = P.cond(
        P.shape(keep)[0] == 0, lambda: no_objs_2(boxes, classes, scores),
        lambda: exist_objs_2(keep, boxes, classes, scores))
    # convert to top-left and bottom-right corner coordinates
    boxes = P.concat(
        [boxes[:, :2] - boxes[:, 2:] * 0.5, boxes[:, :2] + boxes[:, 2:] * 0.5],
        axis=-1)

    # scale back to the original image size
    resize_shape_f = P.cast(resize_shape, 'float32')
    origin_shape_f = P.cast(origin_shape, 'float32')
    if use_yolo_box:
        scale = origin_shape_f
    else:
        scale = origin_shape_f / resize_shape_f
    scale = P.expand(scale, [1, 2])
    boxes *= scale  # only valid for batch size 1: dim 0 of scale is the batch size, while dim 0 of boxes is the number of objects predicted in this image

    # batch dimension first
    boxes = P.reshape(boxes, (1, -1, 4), name='boxes')
    scores = P.reshape(scores, (1, -1), name='scores')
    classes = P.reshape(classes, (1, -1), name='classes')
    return [boxes, scores, classes]
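The sentinel trick above (scores -= 2.0 in no_objs_2) means an "empty" detection set comes back from the executor as a single dummy row with a negative score, which the Python side can then drop. A hypothetical post-processing sketch (the array names are illustrative):

# boxes_np [1, -1, 4], scores_np [1, -1], classes_np [1, -1] fetched via exe.run
mask = scores_np[0] > 0.0           # sentinel rows carry negative scores
final_boxes = boxes_np[0][mask]
final_scores = scores_np[0][mask]
final_classes = classes_np[0][mask]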
Example #10
 def begin_localsgd():
     layers.cond(step - last_step == k_steps, communicate_avg_loss)
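Here layers.cond is called with only a true_fn: when false_fn is omitted (it defaults to None), nothing runs on the false path, and the true branch, here communicate_avg_loss (shown in full in Example #11), executes purely for its side effects and returns None. A minimal standalone sketch of this side-effect-only form (the counter variable is an illustrative assumption):

counter = layers.create_global_var(name='counter', shape=[1], value=0,
                                   dtype='int64', persistable=True)

def reset_counter():
    # side effect only: write 0 back into the global var, return nothing
    zero = layers.fill_constant(shape=[1], dtype='int64', value=0)
    layers.assign(zero, counter)

# no false_fn, so the false path is a no-op
layers.cond(counter == 10, reset_counter)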
Example #11
    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        minimized = self.inner_opt.minimize(loss,
                                            startup_program=startup_program)

        init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[
            'init_k_steps']
        begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[
            'begin_step']

        if startup_program is None:
            startup_program = default_startup_program()
        main_block = loss.block

        self.nrings = 2
        collective_helper = CollectiveHelper(self.role_maker, self.nrings)
        collective_helper.update_startup_program(startup_program)
        p2s = self.create_snapshot_vars(startup_program)
        self.init_snapshot_vars(startup_program, p2s)

        p2s = self.create_snapshot_vars(main_block.program)
        with program_guard(main_block.program, startup_program):
            step = layers.autoincreased_step_counter(begin=1)

            k_steps = layers.create_global_var(name="k_steps",
                                               shape=[1],
                                               value=int(init_k_steps),
                                               dtype='int64',
                                               persistable=True)

            begin_step = layers.create_global_var(name="begin_step",
                                                  shape=[1],
                                                  value=int(begin_step_value),
                                                  dtype='int64',
                                                  persistable=True)

            last_step = layers.create_global_var(name="last_step",
                                                 shape=[1],
                                                 value=int(0),
                                                 dtype='int64',
                                                 persistable=True)

            avg_loss = layers.create_global_var(name="avg_loss",
                                                shape=[1],
                                                value=float(0),
                                                dtype=loss.dtype,
                                                persistable=True)

            lr_0 = layers.create_global_var(name="lr_0",
                                            shape=[1],
                                            value=float(0),
                                            dtype='float32',
                                            persistable=True)

            loss_0 = layers.create_global_var(name="loss_0",
                                              shape=[1],
                                              value=float(0),
                                              dtype='float32',
                                              persistable=True)

            global_lr = self.inner_opt._global_learning_rate()

            def initialize():
                self._generate_avg_loss(main_block, loss, avg_loss)
                layers.assign(avg_loss, loss_0)
                layers.assign(global_lr, lr_0)

            layers.cond(step == 1, initialize)

            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
                for param, snapshot in p2s:
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='c_sync_calc_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    ring_id = (ring_id + 1) % self.nrings
                    sub_block.append_op(type='c_allreduce_sum',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for ring_id in range(self.nrings):
                    sub_block.append_op(type='c_sync_comm_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for param, snapshot in p2s:
                    sub_block.append_op(type='scale',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'scale':
                                            1.0 /
                                            self.role_maker._worker_num(),
                                            OP_ROLE_KEY:
                                            OpRole.Optimize
                                        })
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='assign',
                                        inputs={'X': [param]},
                                        outputs={'Out': [snapshot]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                layers.assign(step, last_step)

            def communicate_avg_loss():
                communicate()
                self._generate_avg_loss(main_block, loss, avg_loss)
                next_local_steps = layers.cast(layers.ceil(
                    layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                                float(init_k_steps))),
                                               dtype='int64')
                max_local_steps = layers.fill_constant(shape=[1],
                                                       dtype='int64',
                                                       value=16)
                min_local_steps = layers.fill_constant(shape=[1],
                                                       dtype='int64',
                                                       value=1)
                next_local_steps = layers.elementwise_min(
                    next_local_steps, max_local_steps)
                next_local_steps = layers.elementwise_max(
                    next_local_steps, min_local_steps)
                layers.assign(next_local_steps, k_steps)

            def begin_localsgd():
                layers.cond(step - last_step == k_steps, communicate_avg_loss)

            layers.cond(step > begin_step, begin_localsgd, communicate)

        return minimized
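The adaptive rule in communicate_avg_loss recomputes the local-step count as ceil(sqrt(lr_0 * avg_loss / (global_lr * loss_0) * init_k_steps)) and clips it to [1, 16], so the synchronization interval shrinks as the loss ratio avg_loss / loss_0 falls and grows as the learning rate decays. As a worked example with illustrative numbers: lr_0 * avg_loss / (global_lr * loss_0) = 0.5 and init_k_steps = 8 give next_local_steps = ceil(sqrt(0.5 * 8)) = ceil(2.0) = 2.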
Example #12
def _create_cond_block_and_update_optimizer(
        main_program, cond_var, new_params_to_grads: List[Tuple[Any, Any]],
        param_to_gradient_merge: Dict[str, Any], optimize_ops_desc: List[Any],
        k_steps, avg):
    def true_apply_gradient():
        cur_block_idx = main_program.current_block_idx
        cur_block = main_program.current_block()

        # cur_block's forward_block & backward_block is itself
        cur_block._set_forward_block_idx(cur_block_idx)
        op_maker = core.op_proto_and_checker_maker
        if avg:
            for param, new_grad in new_params_to_grads:
                # grad /= k_steps
                cur_block.append_op(type='scale',
                                    inputs={'X': new_grad},
                                    outputs={'Out': new_grad},
                                    attrs={
                                        'scale': 1.0 / k_steps,
                                        'bias': 0.0,
                                        'bias_after_scale': False
                                    })
                new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                      op_maker.OpRole.Optimize)

        # append optimizer ops
        for op_desc in optimize_ops_desc:
            new_op_desc = cur_block.desc.append_op()
            new_op_desc.copy_from(op_desc)

            # update input/output
            for input_name in new_op_desc.input_arg_names():
                if input_name in new_params_to_grads:
                    new_op_desc._rename_input(input_name,
                                              new_params_to_grads[input_name])

            for output_name in new_op_desc.output_arg_names():
                if output_name in new_params_to_grads:
                    new_op_desc._rename_output(
                        output_name, new_params_to_grads[output_name])

            # remove op_role_var
            if new_op_desc.has_attr(op_maker.kOpRoleVarAttrName()):
                new_op_desc.remove_attr(op_maker.kOpRoleVarAttrName())

            # update the op's Grad input
            if core.grad_var_suffix() in new_op_desc.input_arg_names():
                grad_value = new_op_desc.input("Grad")[0]
                # TODO FIXME(xym) support fp16
                grad_merge_value = grad_value + '@GradientMerge'
                new_op_desc.set_input("Grad", [grad_merge_value])

        main_program.global_block()._sync_with_cpp()
        cur_block._sync_with_cpp()

        # clear gradient_merge_vars
        for param, new_grad in new_params_to_grads:
            layers.fill_constant(shape=new_grad.shape,
                                 dtype=new_grad.dtype,
                                 value=0.0,
                                 out=new_grad)
            new_grad.op._set_attr(op_maker.kOpRoleAttrName(),
                                  op_maker.OpRole.Optimize)

    layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None)
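Example #12 receives cond_var already built; it gates the optimizer so the merged gradient is applied only once every k_steps mini-batches. A hedged sketch of one plausible construction, reusing the step-counter pattern from Examples #11 and #15 (the mod-based predicate and variable names are assumptions, not taken from this snippet):

step = layers.autoincreased_step_counter(begin=1)
k_steps_var = layers.fill_constant(shape=[1], dtype='int64', value=k_steps)
zero = layers.fill_constant(shape=[1], dtype='int64', value=0)
# true on every k_steps-th step: step % k_steps == 0
cond_var = layers.equal(layers.elementwise_mod(step, k_steps_var), zero)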
Example #13
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, program_guard


# Question: what if true_func or false_func needs input arguments when it
# executes? (see the sketch after this example)
def true_func():
    return layers.fill_constant(shape=[1,2], dtype='int32', value=1), \
     layers.fill_constant(shape=[2,3], dtype='bool', value=True)


def false_func():
    return layers.fill_constant(shape=[3,4], dtype='float32', value=3), \
     layers.fill_constant(shape=[4,5], dtype='int64', value=2)


main_program = Program()
startup_program = Program()
with program_guard(main_program, startup_program):
    x = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
    y = layers.fill_constant(shape=[1], dtype='float32', value=0.23)
    pred = layers.less_than(x, y)
    out = layers.cond(pred, true_func, false_func)

    place = fluid.CUDAPlace(
        0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
    exe = fluid.Executor(place)
    ret = exe.run(main_program, fetch_list=out)
    print(ret)
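The question in the comment above has a standard answer, used throughout Examples #2, #5, and #9: cond only accepts zero-argument callables, so wrap the call in a lambda and let the arguments come in through closure capture. A minimal sketch, to be placed inside the same program_guard block so pred, x, and y are in scope:

def true_func_with_args(x, y):
    return layers.elementwise_add(x, y)

def false_func_with_args(x, y):
    return layers.elementwise_sub(x, y)

# the lambdas bind x and y at graph-construction time
out2 = layers.cond(pred, lambda: true_func_with_args(x, y),
                   lambda: false_func_with_args(x, y))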
Example #14
 def cond_func_simple_net_at_false(i, img, label):
     return layers.cond(i < 5, lambda: layers.mean(img),
                        lambda: branch(i, img, label))
Example #15
    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        minimized = self.inner_opt.minimize(loss,
                                            startup_program=startup_program)

        k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
        begin_step_value = self.user_defined_strategy.localsgd_configs[
            'begin_step']

        if startup_program is None:
            startup_program = default_startup_program()
        main_block = loss.block

        self.nrings = 2
        collective_helper = CollectiveHelper(self.role_maker, self.nrings)
        collective_helper.update_startup_program(startup_program)
        p2s = self.create_snapshot_vars(startup_program)
        self.init_snapshot_vars(startup_program, p2s)

        p2s = self.create_snapshot_vars(main_block.program)
        with program_guard(main_block.program, startup_program):
            step = layers.autoincreased_step_counter(begin=1)
            k_steps = layers.create_global_var(name="k_steps",
                                               shape=[1],
                                               value=k_steps_value,
                                               dtype='int64',
                                               persistable=True)

            begin_step = layers.create_global_var(name="begin_step",
                                                  shape=[1],
                                                  value=begin_step_value,
                                                  dtype='int64',
                                                  persistable=True)

            last_step = layers.create_global_var(name="last_step",
                                                 shape=[1],
                                                 value=begin_step_value,
                                                 dtype='int64',
                                                 persistable=True)

            def communicate():
                sub_block = default_main_program().current_block()
                ring_id = -1
                for param, snapshot in p2s:
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='c_sync_calc_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    ring_id = (ring_id + 1) % self.nrings
                    sub_block.append_op(type='c_allreduce_sum',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for ring_id in range(self.nrings):
                    sub_block.append_op(type='c_sync_comm_stream',
                                        inputs={'X': param},
                                        outputs={'Out': param},
                                        attrs={
                                            'ring_id': ring_id,
                                            OP_ROLE_KEY: OpRole.Optimize
                                        })

                for param, snapshot in p2s:
                    sub_block.append_op(type='scale',
                                        inputs={'X': [param]},
                                        outputs={'Out': [param]},
                                        attrs={
                                            'scale':
                                            1.0 /
                                            self.role_maker._worker_num(),
                                            OP_ROLE_KEY:
                                            OpRole.Optimize
                                        })
                    sub_block.append_op(type='elementwise_sub',
                                        inputs={
                                            'X': [snapshot],
                                            'Y': [param]
                                        },
                                        outputs={'Out': [param]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                    sub_block.append_op(type='assign',
                                        inputs={'X': [param]},
                                        outputs={'Out': [snapshot]},
                                        attrs={OP_ROLE_KEY: OpRole.Optimize})
                layers.assign(step, last_step)

            def begin_localsgd():
                layers.cond(step - last_step == k_steps, communicate)

            layers.cond(step > begin_step, begin_localsgd, communicate)
        return minimized
Example #16
 def cond_func(i, img, label):
     return layers.cond(i < 5, lambda: branch(i, img, label, True),
                        lambda: branch(i, img, label, False))
Example #17
    def get_seg_single(self, cate_preds, mask_proto, kernel_preds,
                       featmap_size, resize_shape, ori_shape):
        '''
        :param cate_preds:     [num_grid_cells, 80]
        :param mask_proto:     [1, 256, s4, s4]   mask prototypes
        :param kernel_preds:   [num_grid_cells, 256]   the conv kernel generated by each grid cell; a 1x1 kernel whose input channel count is 256, i.e. the number of mask prototype channels.
        :param featmap_size:   (s4, s4)
        :param resize_shape:   shape=[3, ]
        :param ori_shape:      shape=[3, ]
        :return:
        '''
        # overall info.
        upsampled_size_out = (featmap_size[0] * 4,
                              featmap_size[1] * 4)  # size of the image fed into the network
        cfg = self.nms_cfg

        # First filtering pass: filter by score
        inds = L.where(cate_preds > cfg['score_thr'])  # [M, 2]

        # if len(inds) == 0:
        #     return None
        # Writing conditionals in a static graph is painful.
        def exist_objs_1(inds, cate_preds):
            inds.stop_gradient = True
            scores = L.gather_nd(cate_preds, inds)  # [M, ]   scores of the M objects
            return inds, scores

        def no_objs_1(cate_preds):
            inds = L.zeros((1, 2), np.int64)
            inds.stop_gradient = True
            scores = L.gather_nd(cate_preds, inds) - 99.0  # [M, ]   scores of the M objects; filtered out later
            return inds, scores

        # Are there any objects?
        inds, scores = L.cond(
            L.shape(inds)[0] == 0, lambda: no_objs_1(cate_preds),
            lambda: exist_objs_1(inds, cate_preds))

        classes = inds[:, 1]  # [M, ]   class ids of the M objects
        kernel_preds = L.gather(kernel_preds, inds[:, 0])  # [M, 256]   conv kernels of the M objects

        n_stage = len(self.seg_num_grids)  # 5 output levels
        strides = []
        for ind_ in range(n_stage):
            st = L.zeros((1, ), dtype=np.float32) + self.strides[ind_]
            st = L.expand(st, [
                self.seg_num_grids[ind_]**2,
            ])  # [40*40, ]
            strides.append(st)
        strides = L.concat(strides, axis=0)
        strides.stop_gradient = True
        strides = L.gather(strides, inds[:, 0])  # [M, ]   downsampling stride of each of the M objects

        # mask encoding. In the original SOLO this is written as a convolution:
        # convolving the mask prototypes with the 1x1 kernels yields the masks.
        # M, C = kernel_preds.shape
        # kernel_preds = kernel_preds.view(M, C, 1, 1)    # used as conv kernels
        # seg_preds = F.conv2d(seg_preds, kernel_preds, stride=1).squeeze(0).sigmoid()
        # Convolving mask prototypes with 1x1 kernels is equivalent to matrix
        # multiplication (not so for 3x3 kernels). No equivalent API was found here,
        # so matrix multiplication is used instead; SOLOv2 and YOLACT do the same.
        mask_proto = L.squeeze(mask_proto, axes=[0])  # [256, s4, s4]
        mask_proto = L.transpose(mask_proto, perm=[1, 2, 0])  # [s4, s4, 256]
        masks = L.matmul(mask_proto, kernel_preds,
                         transpose_y=True)  # [s4, s4, M]
        masks = L.sigmoid(masks)  # [s4, s4, M]
        masks = L.transpose(masks, perm=[2, 0, 1])  # [M, s4, s4]

        # mask.
        seg_masks = L.cast(masks > cfg['mask_thr'],
                           'float32')  # [M, s4, s4]   1 where foreground
        sum_masks = L.reduce_sum(seg_masks, dim=[1, 2])  # [M, ]   mask area of each of the M objects

        # Second filtering pass: filter by stride. A mask is kept only if its area exceeds its downsampling stride.
        keep = L.where(sum_masks > strides)

        # if keep.sum() == 0:
        #     return None

        # Writing conditionals in a static graph is painful.
        def exist_objs_2(keep, seg_masks, masks, sum_masks, scores, classes):
            keep = L.reshape(keep, (-1, ))  # [M2, ]
            keep.stop_gradient = True
            seg_masks = L.gather(seg_masks, keep)  # [M2, s4, s4]   masks of the M2 objects
            masks = L.gather(masks, keep)  # [M2, s4, s4]   mask probabilities of the M2 objects
            sum_masks = L.gather(sum_masks, keep)  # [M2, ]   mask areas of the M2 objects
            scores = L.gather(scores, keep)  # [M2, ]   scores of the M2 objects
            classes = L.gather(classes, keep)  # [M2, ]   class ids of the M2 objects
            return seg_masks, masks, sum_masks, scores, classes

        def no_objs_2(seg_masks, masks, sum_masks, scores, classes):
            keep = L.zeros((1, ), np.int64)
            keep.stop_gradient = True
            seg_masks = L.gather(seg_masks, keep)  # [M2, s4, s4]   masks of the M2 objects
            masks = L.gather(masks, keep)  # [M2, s4, s4]   mask probabilities of the M2 objects
            sum_masks = L.gather(sum_masks, keep)  # [M2, ]   mask areas of the M2 objects
            scores = L.gather(scores, keep) - 99.0  # [M2, ]   scores of the M2 objects; negative, filtered out later
            classes = L.gather(classes, keep)  # [M2, ]   class ids of the M2 objects
            return seg_masks, masks, sum_masks, scores, classes

        # Are there any objects?
        seg_masks, masks, sum_masks, scores, classes = L.cond(
            L.shape(keep)[0] == 0,
            lambda: no_objs_2(seg_masks, masks, sum_masks, scores, classes),
            lambda: exist_objs_2(keep, seg_masks, masks, sum_masks, scores,
                                 classes))

        # mask scoring.
        # [M2, ]   sum of foreground mask probabilities divided by mask area,
        # i.e. the average foreground mask probability of each of the M2 objects
        avg_prob = L.reduce_sum(masks * seg_masks, dim=[1, 2]) / sum_masks
        scores *= avg_prob  # [M2, ]   final score = classification prob * average mask prob

        # Third filtering pass: keep only the top cfg['nms_pre'] objects by score
        _, sort_inds = L.argsort(scores, axis=-1,
                                 descending=True)  # descending final scores: index of the max, of the 2nd max, ...
        sort_inds = sort_inds[:cfg['nms_pre']]  # at most cfg['nms_pre'] objects

        seg_masks = L.gather(seg_masks, sort_inds)  # [M3, s4, s4]   masks of the M3 objects
        masks = L.gather(masks, sort_inds)  # [M3, s4, s4]   mask probabilities of the M3 objects
        sum_masks = L.gather(sum_masks, sort_inds)  # [M3, ]   mask areas of the M3 objects
        scores = L.gather(scores, sort_inds)  # [M3, ]   scores of the M3 objects
        classes = L.gather(classes, sort_inds)  # [M3, ]   class ids of the M3 objects

        # Matrix NMS
        scores = matrix_nms(seg_masks,
                            classes,
                            scores,
                            kernel=cfg['kernel'],
                            sigma=cfg['sigma'],
                            sum_masks=sum_masks)

        # Fourth filtering pass: filter by score
        keep = L.where(scores >= cfg['update_thr'])

        # if keep.sum() == 0:
        #     return None

        def exist_objs_3(keep, masks, classes, scores, upsampled_size_out,
                         resize_shape, ori_shape):
            keep = L.reshape(keep, (-1, ))
            keep.stop_gradient = True
            masks = L.gather(masks, keep)  # [M4, s4, s4]   mask probabilities of the M4 objects
            scores = L.gather(scores, keep)  # [M4, ]   scores of the M4 objects
            classes = L.gather(classes, keep)  # [M4, ]   class ids of the M4 objects

            # Fifth filtering pass: keep only the top cfg['max_per_img'] objects by score
            _, sort_inds = L.argsort(scores, axis=-1, descending=True)
            sort_inds = sort_inds[:cfg['max_per_img']]
            sort_inds.stop_gradient = True

            masks = L.gather(masks, sort_inds)  # [M5, s4, s4]   mask probabilities of the M5 objects
            scores = L.gather(scores, sort_inds)  # [M5, ]   scores of the M5 objects
            classes = L.gather(classes, sort_inds)  # [M5, ]   class ids of the M5 objects

            masks = L.resize_bilinear(
                L.unsqueeze(masks, axes=[0]),
                out_shape=upsampled_size_out,
                align_corners=False,
                align_mode=0)[:, :, :resize_shape[0], :resize_shape[1]]  # crop off the padded borders
            masks = L.resize_bilinear(masks,
                                      out_shape=ori_shape[:2],
                                      align_corners=False,
                                      align_mode=0)  # interpolate to the original image size
            masks = L.cast(masks > cfg['mask_thr'], 'float32')[0]
            return masks, classes, scores

        def no_objs_3():
            masks = L.zeros([1, 1, 1], 'float32') - 1.0
            classes = L.zeros([
                1,
            ], 'int64') - 1
            scores = L.zeros([
                1,
            ], 'float32') - 2.0
            return masks, classes, scores

        # Are there any objects?
        masks, classes, scores = L.cond(
            L.shape(keep)[0] == 0, no_objs_3,
            lambda: exist_objs_3(keep, masks, classes, scores,
                                 upsampled_size_out, resize_shape, ori_shape))
        return masks, classes, scores