Exemplo n.º 1
0
def freeze_model(model, config, score_threshold=0.5):
    """Wrap a model in an inference graph that outputs filtered detections.

    Args:
        model: Model whose ``outputs`` unpack into ``(classification,
            regression)`` and whose ``input`` is the image tensor.
        config: Object exposing ``input_shape``, ``sizes``, ``ratios``,
            ``scales`` and ``strides``; all are forwarded to
            ``anchors_for_shape``.
        score_threshold: Threshold forwarded to ``FilterDetections``.

    Returns:
        A model named ``'efficientdet_p'`` that maps ``model.input`` to the
        output of the ``FilterDetections`` layer.
    """
    cls_scores, box_deltas = model.outputs

    # Anchors are generated for the configured input shape on the fixed
    # pyramid levels 3-7, then given a leading axis of size one.
    anchor_array = anchors_for_shape(
        image_shape=config.input_shape,
        sizes=config.sizes,
        ratios=config.ratios,
        scales=config.scales,
        strides=config.strides,
        pyramid_levels=[3, 4, 5, 6, 7],
        shapes_callback=None,
    )
    anchor_tensor = tf.convert_to_tensor(anchor_array)
    anchor_batch = tf.expand_dims(anchor_tensor, axis=0)

    # Only the first four regression channels are used as box deltas.
    decoded = RegressBoxes(name='boxes')([anchor_batch, box_deltas[..., :4]])
    clipped = ClipBoxes(name='clipped_boxes')([model.input, decoded])

    # Filter detections (NMS / score threshold / top-k). The quadrangle
    # variant is not wired up here: only boxes and class scores are passed.
    detection_filter = FilterDetections(name='filtered_detections',
                                        score_threshold=score_threshold)
    detections = detection_filter([clipped, cls_scores])

    return models.Model(inputs=model.input,
                        outputs=detections,
                        name='efficientdet_p')
Exemplo n.º 2
0
    def __init__(self, num_classes, block, layers):
        """Build the residual backbone, FPN, prediction heads and losses.

        Args:
            num_classes: Number of classes passed to ``ClassificationModel``.
            block: Residual block class; must be ``BasicBlock`` or
                ``Bottleneck``.
            layers: Sequence with (at least) four block counts, one per
                residual stage ``layer1`` .. ``layer4``.

        Raises:
            ValueError: If ``block`` is neither ``BasicBlock`` nor
                ``Bottleneck``.
        """
        # Set before the stages are built; presumably read and updated by
        # ``_make_layer`` (defined outside this view).
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # The FPN input widths are the output channels of the last conv in
        # the last block of each stage; which conv that is depends on the
        # block type.
        if block == BasicBlock:
            last_conv = 'conv2'
        elif block == Bottleneck:
            last_conv = 'conv3'
        else:
            raise ValueError(f"Block type {block} not understood")

        stages = (self.layer1, self.layer2, self.layer3, self.layer4)
        fpn_sizes = [
            getattr(stage[depth - 1], last_conv).out_channels
            for stage, depth in zip(stages, layers)
        ]

        self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2], fpn_sizes[3])

        self.regressionModel = RegressionModel(256)
        self.classificationModel = ClassificationModel(256, num_classes=num_classes)
        self.contextModel = LevelAttentionModel(256)

        self.anchors = Anchors()

        self.regressBoxes = BBoxTransform()

        self.clipBoxes = ClipBoxes()

        self.contextLoss = losses.Con()

        self.focalLoss = losses.FocalLoss()

        # Default initialisation: normal init with std sqrt(2 / n) for conv
        # weights, where n = kernel area * out_channels; unit scale and zero
        # shift for batch norm.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        prior = 0.01

        # Zero weights and a bias of -log((1 - prior) / prior) on the
        # classification output layer.
        self.classificationModel.output.weight.data.fill_(0)
        self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior))

        self.regressionModel.output.weight.data.fill_(0)
        self.regressionModel.output.bias.data.fill_(0)

        # The level-attention head is stored as ``contextModel`` above; the
        # previous code addressed a never-assigned ``levelattentionModel``
        # attribute here, which raised AttributeError during construction.
        self.contextModel.conv5.weight.data.fill_(0)
        self.contextModel.conv5.bias.data.fill_(0)

        self.freeze_bn()
Exemplo n.º 3
0
def efficientdet(phi, num_classes=20, num_anchors=9, weighted_bifpn=False, freeze_bn=False,
                 score_threshold=0.01, detect_quadrangle=False, anchor_parameters=None, separable_conv=True):
    """Build the EfficientDet training model and its detection wrapper.

    Args:
        phi: Index in ``range(7)`` used to look up the image size, BiFPN
            width/depth, head depth and backbone.
        num_classes: Forwarded to ``ClassNet``.
        num_anchors: Forwarded to ``BoxNet`` and ``ClassNet``.
        weighted_bifpn: Use ``build_wBiFPN`` instead of ``build_BiFPN``.
        freeze_bn: Forwarded to the backbone, the BiFPN cells and the heads.
        score_threshold: Forwarded to ``FilterDetections``.
        detect_quadrangle: Forwarded to ``BoxNet``; when true,
            ``FilterDetections`` additionally receives regression channels
            ``4:8`` and channel ``8``.
        anchor_parameters: Forwarded to ``anchors_for_shape`` as
            ``anchor_params``.
        separable_conv: Forwarded to ``BoxNet`` and ``ClassNet``.

    Returns:
        ``(model, prediction_model)``: ``model`` outputs
        ``[classification, regression]``; ``prediction_model`` outputs the
        result of ``FilterDetections``. Both take the image input only.
    """
    assert phi in range(7)
    side = image_sizes[phi]
    image = layers.Input((side, side, 3))
    bifpn_width = w_bifpns[phi]
    bifpn_depth = d_bifpns[phi]
    head_depth = d_heads[phi]
    make_backbone = backbones[phi]

    # Backbone features are refined by ``bifpn_depth`` stacked BiFPN cells;
    # the flag only selects which cell builder is used.
    pyramid = make_backbone(input_tensor=image, freeze_bn=freeze_bn)
    bifpn_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        pyramid = bifpn_cell(pyramid, bifpn_width, cell_index, freeze_bn=freeze_bn)

    # The heads share the BiFPN width and are applied to every level.
    box_net = BoxNet(bifpn_width, head_depth, num_anchors=num_anchors,
                     separable_conv=separable_conv, freeze_bn=freeze_bn,
                     detect_quadrangle=detect_quadrangle, name='box_net')
    class_net = ClassNet(bifpn_width, head_depth, num_classes=num_classes,
                         num_anchors=num_anchors, separable_conv=separable_conv,
                         freeze_bn=freeze_bn, name='class_net')
    per_level_cls = [class_net([level_map, level]) for level, level_map in enumerate(pyramid)]
    classification = layers.Concatenate(axis=1, name='classification')(per_level_cls)
    per_level_reg = [box_net([level_map, level]) for level, level_map in enumerate(pyramid)]
    regression = layers.Concatenate(axis=1, name='regression')(per_level_reg)

    model = models.Model(inputs=[image], outputs=[classification, regression], name='efficientdet')

    # Apply the predicted regression (first four channels) to the anchors.
    anchor_array = anchors_for_shape((side, side), anchor_params=anchor_parameters)
    anchor_batch = np.expand_dims(anchor_array, axis=0)
    decoded = RegressBoxes(name='boxes')([anchor_batch, regression[..., :4]])
    clipped = ClipBoxes(name='clipped_boxes')([image, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    if detect_quadrangle:
        detection_filter = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
            detect_quadrangle=True,
        )
        filter_inputs = [clipped, classification, regression[..., 4:8], regression[..., 8]]
    else:
        detection_filter = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
        )
        filter_inputs = [clipped, classification]
    detections = detection_filter(filter_inputs)

    prediction_model = models.Model(inputs=[image], outputs=detections, name='efficientdet_p')
    return model, prediction_model
Exemplo n.º 4
0
def efficientdet(phi,
                 num_classes=20,
                 weighted_bifpn=False,
                 freeze_bn=False,
                 score_threshold=0.01):
    """Build an EfficientDet model whose detection wrapper takes anchors as input.

    Args:
        phi: Index in ``range(7)``; selects image size, BiFPN width and
            backbone, and sets BiFPN depth to ``2 + phi`` and head depth to
            ``3 + int(phi / 3)``.
        num_classes: Forwarded to ``build_class_head``.
        weighted_bifpn: Use ``build_wBiFPN`` instead of ``build_BiFPN``.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` maps the image to
        ``[regression, classification]``; ``prediction_model`` takes
        ``[image, anchors]`` and outputs the filtered detections.
    """
    assert phi in range(7)
    side = image_sizes[phi]
    image = layers.Input((side, side, 3))
    bifpn_width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_depth = 3 + int(phi / 3)
    make_backbone = backbones[phi]

    # Backbone features are refined in place by the stacked BiFPN cells; the
    # flag only selects which cell builder is used.
    pyramid = make_backbone(input_tensor=image, freeze_bn=freeze_bn)
    bifpn_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        pyramid = bifpn_cell(pyramid, bifpn_width, cell_index, freeze_bn=freeze_bn)

    # The heads share the BiFPN width and are applied to every level.
    regress_head = build_regress_head(bifpn_width, head_depth)
    class_head = build_class_head(bifpn_width, head_depth, num_classes=num_classes)
    per_level_reg = [regress_head(level_map) for level_map in pyramid]
    regression = layers.Concatenate(axis=1, name='regression')(per_level_reg)
    per_level_cls = [class_head(level_map) for level_map in pyramid]
    classification = layers.Concatenate(axis=1, name='classification')(per_level_cls)

    model = models.Model(inputs=[image],
                         outputs=[regression, classification],
                         name='efficientdet')

    # Anchors are supplied by the caller at prediction time through a second
    # model input rather than being baked into the graph.
    anchors_in = layers.Input((None, 4))
    decoded = RegressBoxes(name='boxes')([anchors_in, regression])
    clipped = ClipBoxes(name='clipped_boxes')([image, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(name='filtered_detections',
                                        score_threshold=score_threshold)
    detections = detection_filter([clipped, classification])

    prediction_model = models.Model(inputs=[image, anchors_in],
                                    outputs=detections,
                                    name='efficientdet_p')
    return model, prediction_model
Exemplo n.º 5
0
def efficientdet(num_anchors, num_classes, num_properties, w_bifpn, d_bifpn,
                 d_head, score_threshold, nms_threshold):
    """Build an EfficientDet model with an extra per-anchor properties head.

    Args:
        num_anchors: Forwarded to all three heads.
        num_classes: Forwarded to ``classification_coco``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of stacked ``build_wBiFPN`` cells.
        d_head: Head depth forwarded to all three heads.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` maps the image to
        ``[reg, cls, pro]``; ``prediction_model`` takes ``[image, anchors]``
        and outputs the filtered detections.
    """
    image = layers.Input(shape=(None, None, 3))
    # The backbone is always the first entry of ``backbones``.
    make_backbone = backbones[0]
    pyramid = make_backbone(input_tensor=image)

    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    # Heads reuse the BiFPN width.
    reg_out = regression_coco(pyramid, w_bifpn, d_head, num_anchors)
    cls_out = classification_coco(pyramid, w_bifpn, d_head, num_anchors,
                                  num_classes)
    prop_out = properties_sand(pyramid, w_bifpn, d_head, num_anchors,
                               num_properties)
    model = models.Model(inputs=[image],
                         outputs=[reg_out, cls_out, prop_out],
                         name='efficientdet')

    # Anchors are supplied by the caller through a second model input.
    anchors_in = layers.Input((None, 4), name='anchors_input')
    decoded = RegressBoxes(name='boxes')([anchors_in, reg_out])
    clipped = ClipBoxes(name='clipped_boxes')([image, decoded])

    # Filter detections (apply NMS / score threshold / select top-k); the
    # properties output is passed along with boxes and class scores.
    detection_filter = FilterDetections(name='filtered_detections',
                                        score_threshold=score_threshold,
                                        nms_threshold=nms_threshold,
                                        class_specific_filter=True,
                                        max_detections=100)
    detections = detection_filter([clipped, cls_out, prop_out])

    prediction_model = models.Model(inputs=[image, anchors_in],
                                    outputs=detections,
                                    name='efficientdet_p')
    return model, prediction_model
Exemplo n.º 6
0
def efficientdet_sand(num_anchors, num_classes, num_properties, w_bifpn,
                      d_bifpn, d_head, score_threshold, nms_threshold):
    """Build the "sand" heads on top of a COCO-initialised EfficientDet trunk.

    A 90-class COCO model is built first and its weights are loaded by name
    from ``weights/efficientdet-d0.h5`` next to this module. Five of its BiFPN
    batch-norm outputs are then used as the features for new regression,
    classification and properties heads.

    Args:
        num_anchors: Forwarded to every head.
        num_classes: Forwarded to ``classification_sand``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of stacked ``build_wBiFPN`` cells.
        d_head: Head depth forwarded to every head.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(sand_model, prediction_model)``: ``sand_model`` maps the image to
        ``[sand_reg, sand_cls, sand_pro]``; ``prediction_model`` takes
        ``[image, anchors]`` and outputs the filtered detections.
    """
    image = layers.Input(shape=(None, None, 3))
    # The backbone is always the first entry of ``backbones``.
    make_backbone = backbones[0]
    pyramid = make_backbone(input_tensor=image)

    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    # COCO model: exists so the pretrained weights can be loaded by name.
    coco_reg = regression_coco(pyramid, w_bifpn, d_head, num_anchors)
    coco_cls = classification_coco(pyramid, w_bifpn, d_head, num_anchors, 90)
    coco_model = models.Model(inputs=[image],
                              outputs=[coco_reg, coco_cls],
                              name='efficientdet_coco')
    weights_path = os.path.join(os.path.dirname(__file__),
                                'weights/efficientdet-d0.h5')
    coco_model.load_weights(weights_path, by_name=True)

    # Feature taps, ordered P3..P7.
    # NOTE(review): the names are fixed to ``cell_2`` regardless of
    # ``d_bifpn`` - confirm they exist for the depth being used.
    tap_names = (
        'fpn_cells/cell_2/fnode3/op_after_combine8/bn',
        'fpn_cells/cell_2/fnode2/op_after_combine7/bn',
        'fpn_cells/cell_2/fnode1/op_after_combine6/bn',
        'fpn_cells/cell_2/fnode0/op_after_combine5/bn',
        'fpn_cells/cell_2/fnode7/op_after_combine12/bn',
    )
    tapped = [coco_model.get_layer(name=tap).output for tap in tap_names]

    sand_reg = regression_sand(tapped, w_bifpn, d_head, num_anchors)
    sand_cls = classification_sand(tapped, w_bifpn, d_head,
                                   num_anchors, num_classes)
    sand_pro = properties_sand(tapped, w_bifpn, d_head, num_anchors,
                               num_properties)
    sand_model = models.Model(inputs=[image],
                              outputs=[sand_reg, sand_cls, sand_pro],
                              name='efficientdet_sand')

    # Anchors are supplied by the caller through a second model input.
    anchors_in = layers.Input((None, 4), name='anchors_input')
    decoded = RegressBoxes(name='boxes')([anchors_in, sand_reg])
    clipped = ClipBoxes(name='clipped_boxes')([image, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
        nms_threshold=nms_threshold,
        class_specific_filter=True,
        max_detections=100)
    detections = detection_filter([clipped, sand_cls, sand_pro])

    prediction_model = models.Model(inputs=[image, anchors_in],
                                    outputs=detections,
                                    name='efficientdet_p')

    return sand_model, prediction_model
Exemplo n.º 7
0
def sapd(
    phi,
    soft_select=False,
    num_classes=20,
    freeze_bn=False,
    max_gt_boxes=100,
    batch_size=32,
    score_threshold=0.01,
):
    """Build the SAPD training model and its detection wrapper.

    Args:
        phi: Index in ``range(7)``; selects image size, BiFPN width and
            backbone, and sets BiFPN depth to ``2 + phi`` and head depth to
            ``3 + int(phi / 3)``.
        soft_select: Forwarded to ``MetaSelectWeight``; when true the weights
            are derived from the meta-selection prediction, otherwise from
            the meta-selection target.
        num_classes: Forwarded to the class head and ``SAPDTarget``.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        max_gt_boxes: Second-to-last dimension of the ground-truth boxes
            input; also forwarded to ``MetaSelectWeight``.
        batch_size: Forwarded to ``MetaSelectWeight``.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` takes
        ``[image, gt_boxes, num_gt_boxes, fm_shapes]`` and outputs the three
        losses followed by predictions and targets; ``prediction_model``
        takes the image only and outputs the filtered detections.
    """
    assert phi in range(7)
    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))
    gt_boxes_input = layers.Input((max_gt_boxes, 5))
    num_gt_boxes_input = layers.Input((1, ), dtype='int32')
    fm_shapes_input = layers.Input((5, 2), dtype='int32')

    make_backbone = backbones[phi]
    pyramid = make_backbone(input_tensor=image_input, freeze_bn=freeze_bn)
    width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_depth = 3 + int(phi / 3)
    for cell_index in range(bifpn_depth):
        pyramid = build_BiFPN(pyramid, width, cell_index, freeze_bn=freeze_bn)

    # Heads and the meta-selection net all reuse the BiFPN width.
    regr_head = build_regress_head(width, head_depth)
    cls_head = build_class_head(width, head_depth, num_classes=num_classes)
    per_level_cls = [cls_head(level_map) for level_map in pyramid]
    cls_pred = layers.Concatenate(axis=1, name='classification')(per_level_cls)
    per_level_regr = [regr_head(level_map) for level_map in pyramid]
    regr_pred = layers.Concatenate(axis=1, name='regression')(per_level_regr)

    # Meta-selection network and its loss (scaled by 0.1, the lambda value
    # the original comment attributes to the paper).
    meta_select_net = build_meta_select_net(width=width)
    meta_select_input, gt_boxes_batch_ids = MetaSelectInput()(
        [gt_boxes_input, *pyramid])
    meta_select_pred = meta_select_net(meta_select_input)
    meta_select_target = MetaSelectTarget()(
        [cls_pred, regr_pred, fm_shapes_input, gt_boxes_input])
    meta_select_loss = layers.Lambda(
        lambda t: 0.1 * losses.sparse_categorical_crossentropy(t[0], t[1]),
        output_shape=(1, ),
        name="meta_select_loss")([meta_select_target, meta_select_pred])

    # Soft selection weights by the prediction, hard selection by the target.
    weight_source = meta_select_pred if soft_select else meta_select_target
    meta_select_weight = MetaSelectWeight(
        max_gt_boxes=max_gt_boxes,
        soft_select=soft_select,
        batch_size=batch_size,
    )([weight_source, gt_boxes_batch_ids, num_gt_boxes_input])

    cls_target, regr_target = SAPDTarget(num_classes=num_classes)(
        [fm_shapes_input, gt_boxes_input, meta_select_weight])

    focal_loss = focal_with_weight_and_mask()
    iou_loss = iou_with_weight_and_mask()
    cls_loss = layers.Lambda(focal_loss, output_shape=(1, ),
                             name="cls_loss")([cls_target, cls_pred])
    regr_loss = layers.Lambda(iou_loss, output_shape=(1, ),
                              name="regr_loss")([regr_target, regr_pred])

    training_inputs = [
        image_input, gt_boxes_input, num_gt_boxes_input, fm_shapes_input
    ]
    training_outputs = [
        cls_loss, regr_loss, meta_select_loss, cls_pred, regr_pred,
        cls_target, regr_target
    ]
    model = models.Model(inputs=training_inputs,
                         outputs=training_outputs,
                         name='sapd')

    locations, strides = Locations()(pyramid)

    # Apply the predicted regression to the per-level locations.
    decoded = RegressBoxes(name='boxes')([locations, strides, regr_pred])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(name='filtered_detections',
                                        score_threshold=score_threshold)
    detections = detection_filter([clipped, cls_pred])

    prediction_model = models.Model(inputs=[image_input],
                                    outputs=detections,
                                    name='sapd_p')

    return model, prediction_model
Exemplo n.º 8
0
def apply_subnets_to_feature_maps(box_net, class_net, rotation_net,
                                  translation_net, fpn_feature_maps,
                                  image_input, camera_parameters_input,
                                  input_size, anchor_parameters):
    """
    Run every subnetwork on all BiFPN levels and decode boxes and translations.

    Args:
        box_net, class_net, rotation_net, translation_net: Subnetworks; each
            is called with ``[feature_map, level_index]``.
        fpn_feature_maps: Sequence of the BiFPN feature maps of the different
            levels (P3, P4, P5, P6, P7).
        image_input, camera_parameters_input: The image and camera parameter
            input layers. Columns 0-5 of the camera parameters are passed on
            as fx, fy, px, py, tz_scale and image_scale.
        input_size: Integer representing the input image resolution.
        anchor_parameters: Struct containing anchor parameters. If None,
            default values are used.

    Returns:
        classification: Classification outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, num_classes).
        bbox_regression: Deltas of anchor boxes to the GT 2D bounding boxes.
            Shape (batch_size, num_anchor_boxes, 4).
        rotation: Rotation outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, num_rotation_parameters).
        translation: Translation outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, 3).
        transformation: Rotation and translation concatenated on the last
            axis, shape (batch_size, num_anchor_boxes,
            num_rotation_parameters + 3). They are joined because a Keras
            loss function takes a single GT tensor and a single prediction
            tensor, while the transformation loss needs both.
        bboxes: 2D bounding boxes for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, 4).
    """

    def _apply_to_all_levels(subnet, output_name):
        # One call per pyramid level, concatenated along the anchor axis.
        per_level = [
            subnet([feature, level])
            for level, feature in enumerate(fpn_feature_maps)
        ]
        return layers.Concatenate(axis=1, name=output_name)(per_level)

    classification = _apply_to_all_levels(class_net, 'classification')
    bbox_regression = _apply_to_all_levels(box_net, 'regression')
    rotation = _apply_to_all_levels(rotation_net, 'rotation')
    translation_raw = _apply_to_all_levels(translation_net,
                                           'translation_raw_outputs')

    # Get the anchors and apply the predicted translation offsets to the
    # translation anchors.
    anchors, translation_anchors = anchors_for_shape(
        (input_size, input_size), anchor_params=anchor_parameters)
    translation_anchor_batch = np.expand_dims(translation_anchors, axis=0)

    translation_xy_Tz = RegressTranslation(name='translation_regression')(
        [translation_anchor_batch, translation_raw])
    txty_layer = CalculateTxTy(name='translation')
    camera_column_names = ('fx', 'fy', 'px', 'py', 'tz_scale', 'image_scale')
    camera_kwargs = {
        key: camera_parameters_input[:, column]
        for column, key in enumerate(camera_column_names)
    }
    translation = txty_layer(translation_xy_Tz, **camera_kwargs)

    # Apply the predicted 2D bbox regression (first four channels) to the
    # anchors.
    anchor_batch = np.expand_dims(anchors, axis=0)
    bboxes = RegressBoxes(name='boxes')(
        [anchor_batch, bbox_regression[..., :4]])
    bboxes = ClipBoxes(name='clipped_boxes')([image_input, bboxes])

    # Join rotation and translation into one output for the transformation
    # loss. A standard Concatenate layer reports a shape mismatch here
    # (translation's dim 2 is known via translation_anchors while rotation's
    # is None), so a Lambda around tf.concat is used instead.
    transformation = layers.Lambda(
        lambda input_list: tf.concat(input_list, axis=-1),
        name="transformation")([rotation, translation])

    return classification, bbox_regression, rotation, translation, transformation, bboxes
def yolo_body(num_classes=20, score_threshold=0.01):
    """
    Build the FSAF training and detection models on top of a Darknet body.

    Args:
        num_classes: Number of class channels per location; each head
            outputs ``4 + num_classes`` channels.
        score_threshold: Threshold forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` (named ``'fsaf'``) takes
        ``[image, gt_boxes, grid_shapes]`` and outputs
        ``[cls_loss, regr_loss]``; ``prediction_model`` (named
        ``'fsaf_detection'``) takes the image and outputs the filtered
        detections.
    """
    image_input = Input(shape=(None, None, 3), name='image_input')
    darknet = Model([image_input], darknet_body(image_input))
    head_channels = 4 + num_classes

    ##################################################
    # build fsaf head
    ##################################################
    x, coarse_head = make_last_layers(darknet.output, 512, head_channels)
    head_outputs = [coarse_head]
    # Each further stage upsamples, merges a skip connection taken from a
    # fixed Darknet layer index, and emits another head output.
    for filters, skip_index in ((256, 152), (128, 92)):
        x = compose(darknet_conv2d_bn_leaky(filters, (1, 1)),
                    UpSampling2D(2))(x)
        x = Concatenate()([x, darknet.layers[skip_index].output])
        x, stage_head = make_last_layers(x, filters, head_channels)
        head_outputs.append(stage_head)

    # Flatten every head's spatial grid and stack all locations on axis 1.
    flattened = [Reshape((-1, head_channels))(head) for head in head_outputs]
    stacked = Concatenate(axis=1)(flattened)
    # Channels 4 onwards are class logits, the first four are regression.
    batch_cls_pred = Lambda(lambda t: t[..., 4:])(stacked)
    batch_regr_pred = Lambda(lambda t: t[..., :4])(stacked)
    batch_cls_pred = Activation('sigmoid')(batch_cls_pred)
    batch_regr_pred = Activation('relu')(batch_regr_pred)

    gt_boxes_input = Input(shape=(config.MAX_NUM_GT_BOXES, 5),
                           name='gt_boxes_input')
    grid_shapes_input = Input((len(config.STRIDES), 2),
                              dtype='int32',
                              name='grid_shapes_input')
    batch_gt_box_levels = LevelSelect(name='level_select')(
        [batch_cls_pred, batch_regr_pred, grid_shapes_input, gt_boxes_input])
    fsaf_target = FSAFTarget(num_classes=num_classes, name='fsaf_target')
    (batch_cls_target, batch_cls_mask, batch_cls_num_pos,
     batch_regr_target, batch_regr_mask) = fsaf_target(
         [batch_gt_box_levels, grid_shapes_input, gt_boxes_input])

    focal_loss_graph = focal_with_mask()
    iou_loss_graph = iou_with_mask()
    cls_loss = Lambda(focal_loss_graph, output_shape=(1, ), name="cls_loss")(
        [batch_cls_target, batch_cls_pred, batch_cls_mask, batch_cls_num_pos])
    regr_loss = Lambda(iou_loss_graph, output_shape=(1, ), name="regr_loss")(
        [batch_regr_target, batch_regr_pred, batch_regr_mask])
    model = Model(inputs=[image_input, gt_boxes_input, grid_shapes_input],
                  outputs=[cls_loss, regr_loss],
                  name='fsaf')

    # Locations are computed from the unflattened head outputs.
    locations, strides = Locations(strides=config.STRIDES)(head_outputs)

    # Apply the predicted regression to the locations.
    decoded = RegressBoxes(name='boxes')(
        [locations, strides, batch_regr_pred])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        nms=True,
        class_specific_filter=True,
        name='filtered_detections',
        score_threshold=score_threshold)
    detections = detection_filter([clipped, batch_cls_pred])

    prediction_model = Model(inputs=image_input,
                             outputs=detections,
                             name='fsaf_detection')
    return model, prediction_model