def freeze_model(model, config, score_threshold=0.5):
    """Wrap ``model`` in an inference graph that decodes and filters its outputs.

    Args:
        model: Keras model whose ``outputs`` unpack into
            ``(classification, regression)``.
        config: Object exposing ``input_shape``, ``sizes``, ``ratios``,
            ``scales`` and ``strides``; these are forwarded to
            ``anchors_for_shape``.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        A ``models.Model`` named ``'efficientdet_p'`` mapping ``model.input``
        to the output of ``FilterDetections``.
    """
    cls_scores, box_deltas = model.outputs

    # Anchors are generated once for the configured input shape.
    anchor_boxes = anchors_for_shape(
        image_shape=config.input_shape,
        sizes=config.sizes,
        ratios=config.ratios,
        scales=config.scales,
        strides=config.strides,
        pyramid_levels=[3, 4, 5, 6, 7],
        shapes_callback=None,
    )
    # Add a leading axis of size 1 in front of the anchor tensor.
    batched_anchors = tf.expand_dims(tf.convert_to_tensor(anchor_boxes), axis=0)

    # Apply the predicted regression (first four channels only) to the anchors.
    decode_layer = RegressBoxes(name='boxes')
    decoded = decode_layer([batched_anchors, box_deltas[..., :4]])
    clip_layer = ClipBoxes(name='clipped_boxes')
    clipped = clip_layer([model.input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = filter_layer([clipped, cls_scores])

    return models.Model(
        inputs=model.input,
        outputs=detections,
        name='efficientdet_p',
    )
def __init__(self, num_classes, block, layers):
    """Assemble the backbone, feature pyramid, prediction heads and losses.

    Args:
        num_classes: Number of classes, forwarded to ``ClassificationModel``.
        block: Residual block class; must compare equal to ``BasicBlock`` or
            ``Bottleneck``.
        layers: Indexable with four entries, one per residual stage. Each
            entry is passed to ``_make_layer`` and ``layers[i] - 1`` is used
            as the index of the block whose output width feeds the FPN.

    Raises:
        ValueError: If ``block`` is neither ``BasicBlock`` nor ``Bottleneck``.
    """
    self.inplanes = 64
    super(ResNet, self).__init__()

    # Stem.
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                           bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    # Residual stages.
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

    # The channel count handed to the FPN is read from the final conv of the
    # block at index layers[i] - 1; that conv's attribute name depends on the
    # block type.
    if block == BasicBlock:
        last_conv_name = 'conv2'
    elif block == Bottleneck:
        last_conv_name = 'conv3'
    else:
        raise ValueError(f"Block type {block} not understood")

    stages = (self.layer1, self.layer2, self.layer3, self.layer4)
    fpn_sizes = [
        getattr(stage[layers[idx] - 1], last_conv_name).out_channels
        for idx, stage in enumerate(stages)
    ]

    self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2],
                               fpn_sizes[3])

    # Heads.
    self.regressionModel = RegressionModel(256)
    self.classificationModel = ClassificationModel(256,
                                                   num_classes=num_classes)
    self.contextModel = LevelAttentionModel(256)

    # Box utilities and losses.
    self.anchors = Anchors()
    self.regressBoxes = BBoxTransform()
    self.clipBoxes = ClipBoxes()
    self.contextLoss = losses.Con()
    self.focalLoss = losses.FocalLoss()

    # Weight initialisation: zero-mean normal with std sqrt(2 / n) for convs
    # (n = kernel area * out_channels), and identity scale / zero shift for
    # batch norm.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()

    # Output layers start from zero weights. The classification bias is set
    # to -log((1 - prior) / prior), i.e. the logit of `prior`
    # (presumably so an initial sigmoid score equals `prior` -- confirm
    # against ClassificationModel).
    prior = 0.01

    self.classificationModel.output.weight.data.fill_(0)
    self.classificationModel.output.bias.data.fill_(
        -math.log((1.0 - prior) / prior))

    self.regressionModel.output.weight.data.fill_(0)
    self.regressionModel.output.bias.data.fill_(0)

    # The level-attention module is registered as `self.contextModel`; no
    # `self.levelattentionModel` attribute is assigned in this constructor,
    # so the zero-init must go through `self.contextModel`.
    self.contextModel.conv5.weight.data.fill_(0)
    self.contextModel.conv5.bias.data.fill_(0)

    self.freeze_bn()
def efficientdet(phi, num_classes=20, num_anchors=9, weighted_bifpn=False,
                 freeze_bn=False, score_threshold=0.01,
                 detect_quadrangle=False, anchor_parameters=None,
                 separable_conv=True):
    """Build the EfficientDet training model and its prediction counterpart.

    Args:
        phi: Index in ``range(7)`` used to look up the image size, BiFPN
            width/depth, head depth and backbone.
        num_classes: Forwarded to ``ClassNet``.
        num_anchors: Forwarded to ``BoxNet`` and ``ClassNet``.
        weighted_bifpn: If true, stack ``build_wBiFPN`` cells, otherwise
            ``build_BiFPN`` cells.
        freeze_bn: Forwarded to the backbone, the BiFPN cells and both heads.
        score_threshold: Forwarded to ``FilterDetections``.
        detect_quadrangle: Forwarded to ``BoxNet``; when true, regression
            channels ``4:8`` and ``8`` are also fed to ``FilterDetections``.
        anchor_parameters: Forwarded to ``anchors_for_shape`` as
            ``anchor_params``.
        separable_conv: Forwarded to both heads.

    Returns:
        ``(model, prediction_model)``: ``model`` outputs
        ``[classification, regression]``; ``prediction_model`` outputs the
        filtered detections. Both take the image input only.
    """
    assert phi in range(7)

    resolution = image_sizes[phi]
    image_input = layers.Input((resolution, resolution, 3))

    bifpn_width = w_bifpns[phi]
    bifpn_depth = d_bifpns[phi]
    head_width = bifpn_width
    head_depth = d_heads[phi]

    backbone_fn = backbones[phi]
    pyramid = backbone_fn(input_tensor=image_input, freeze_bn=freeze_bn)

    # Both BiFPN flavours share the same call signature; pick one and stack it.
    bifpn_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        pyramid = bifpn_cell(pyramid, bifpn_width, cell_index,
                             freeze_bn=freeze_bn)

    box_net = BoxNet(
        head_width,
        head_depth,
        num_anchors=num_anchors,
        separable_conv=separable_conv,
        freeze_bn=freeze_bn,
        detect_quadrangle=detect_quadrangle,
        name='box_net',
    )
    class_net = ClassNet(
        head_width,
        head_depth,
        num_classes=num_classes,
        num_anchors=num_anchors,
        separable_conv=separable_conv,
        freeze_bn=freeze_bn,
        name='class_net',
    )

    # Each head receives the feature map together with its level index.
    per_level_cls = [
        class_net([fmap, level]) for level, fmap in enumerate(pyramid)
    ]
    classification = layers.Concatenate(axis=1, name='classification')(
        per_level_cls)
    per_level_reg = [
        box_net([fmap, level]) for level, fmap in enumerate(pyramid)
    ]
    regression = layers.Concatenate(axis=1, name='regression')(per_level_reg)

    model = models.Model(
        inputs=[image_input],
        outputs=[classification, regression],
        name='efficientdet',
    )

    # Apply predicted regression to anchors.
    anchor_boxes = anchors_for_shape((resolution, resolution),
                                     anchor_params=anchor_parameters)
    batched_anchors = np.expand_dims(anchor_boxes, axis=0)
    decoded = RegressBoxes(name='boxes')(
        [batched_anchors, regression[..., :4]])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    if detect_quadrangle:
        filter_layer = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
            detect_quadrangle=True,
        )
        filter_inputs = [
            clipped,
            classification,
            regression[..., 4:8],
            regression[..., 8],
        ]
    else:
        filter_layer = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
        )
        filter_inputs = [clipped, classification]
    detections = filter_layer(filter_inputs)

    prediction_model = models.Model(
        inputs=[image_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def efficientdet(phi, num_classes=20, weighted_bifpn=False, freeze_bn=False,
                 score_threshold=0.01):
    """Build an EfficientDet training model and an anchor-fed prediction model.

    Args:
        phi: Index in ``range(7)``; selects the image size, BiFPN width and
            backbone, and sets the BiFPN depth (``2 + phi``) and head depth
            (``3 + int(phi / 3)``).
        num_classes: Forwarded to ``build_class_head``.
        weighted_bifpn: If true, stack ``build_wBiFPN`` cells, otherwise
            ``build_BiFPN`` cells.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` takes the image and outputs
        ``[regression, classification]``; ``prediction_model`` takes
        ``[image, anchors]`` and outputs the filtered detections.
    """
    assert phi in range(7)

    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))

    bifpn_width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_width = bifpn_width
    head_depth = 3 + int(phi / 3)

    backbone_fn = backbones[phi]
    pyramid = backbone_fn(input_tensor=image_input, freeze_bn=freeze_bn)

    # Both BiFPN flavours are stacked the same way; only the builder differs.
    bifpn_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        pyramid = bifpn_cell(pyramid, bifpn_width, cell_index,
                             freeze_bn=freeze_bn)

    regress_head = build_regress_head(head_width, head_depth)
    class_head = build_class_head(head_width, head_depth,
                                  num_classes=num_classes)

    # The same head is applied to every pyramid level; results are joined
    # along axis 1.
    per_level_reg = [regress_head(fmap) for fmap in pyramid]
    regression = layers.Concatenate(axis=1, name='regression')(per_level_reg)
    per_level_cls = [class_head(fmap) for fmap in pyramid]
    classification = layers.Concatenate(axis=1, name='classification')(
        per_level_cls)

    model = models.Model(
        inputs=[image_input],
        outputs=[regression, classification],
        name='efficientdet',
    )

    # Apply predicted regression to anchors; the anchors are supplied by the
    # caller as a second model input.
    anchors_input = layers.Input((None, 4))
    decoded = RegressBoxes(name='boxes')([anchors_input, regression])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = filter_layer([clipped, classification])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def efficientdet(num_anchors, num_classes, num_properties, w_bifpn, d_bifpn,
                 d_head, score_threshold, nms_threshold):
    """Build a three-headed EfficientDet (boxes, classes, properties).

    The backbone is always ``backbones[0]`` and the neck is ``d_bifpn``
    stacked ``build_wBiFPN`` cells of width ``w_bifpn``.

    Args:
        num_anchors: Forwarded to all three head builders.
        num_classes: Forwarded to ``classification_coco``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of BiFPN cells to stack.
        d_head: Head depth forwarded to all three head builders.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` takes the image and outputs
        ``[reg, cls, pro]``; ``prediction_model`` takes ``[image, anchors]``
        and outputs the filtered detections.
    """
    image_input = layers.Input(shape=(None, None, 3))

    backbone_fn = backbones[0]
    pyramid = backbone_fn(input_tensor=image_input)
    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    # The heads share the BiFPN width.
    box_out = regression_coco(pyramid, w_bifpn, d_head, num_anchors)
    cls_out = classification_coco(pyramid, w_bifpn, d_head, num_anchors,
                                  num_classes)
    prop_out = properties_sand(pyramid, w_bifpn, d_head, num_anchors,
                               num_properties)

    model = models.Model(
        inputs=[image_input],
        outputs=[box_out, cls_out, prop_out],
        name='efficientdet',
    )

    # Anchors are supplied by the caller as a second input of the
    # prediction model.
    anchors_input = layers.Input((None, 4), name='anchors_input')
    decoded = RegressBoxes(name='boxes')([anchors_input, box_out])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
        nms_threshold=nms_threshold,
        class_specific_filter=True,
        max_detections=100,
    )
    detections = filter_layer([clipped, cls_out, prop_out])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def efficientdet_sand(num_anchors, num_classes, num_properties, w_bifpn,
                      d_bifpn, d_head, score_threshold, nms_threshold):
    """Attach "sand" heads to BiFPN features of a weight-loaded COCO model.

    A COCO-style EfficientDet (90-class head, ``backbones[0]``) is built and
    its weights are loaded by layer name from ``weights/efficientdet-d0.h5``
    next to this file. Five named intermediate layer outputs of that model
    are then used as the feature pyramid for new regression, classification
    and properties heads.

    Args:
        num_anchors: Forwarded to every head builder.
        num_classes: Forwarded to ``classification_sand``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of ``build_wBiFPN`` cells to stack.
        d_head: Head depth forwarded to every head builder.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(sand_model, prediction_model)``: ``sand_model`` takes the image
        and outputs ``[sand_reg, sand_cls, sand_pro]``; ``prediction_model``
        takes ``[image, anchors]`` and outputs the filtered detections.
    """
    image_input = layers.Input(shape=(None, None, 3))

    backbone_fn = backbones[0]
    pyramid = backbone_fn(input_tensor=image_input)
    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    # COCO heads exist only so that the pretrained weights can be loaded.
    coco_reg = regression_coco(pyramid, w_bifpn, d_head, num_anchors)
    coco_cls = classification_coco(pyramid, w_bifpn, d_head, num_anchors, 90)
    coco_model = models.Model(
        inputs=[image_input],
        outputs=[coco_reg, coco_cls],
        name='efficientdet_coco',
    )
    weights_path = os.path.join(os.path.dirname(__file__),
                                'weights/efficientdet-d0.h5')
    coco_model.load_weights(weights_path, by_name=True)

    # Layer outputs reused as the feature pyramid for the new heads,
    # in the order P3_out, P4_td, P5_td, P6_td, P7_out.
    tap_layer_names = (
        'fpn_cells/cell_2/fnode3/op_after_combine8/bn',
        'fpn_cells/cell_2/fnode2/op_after_combine7/bn',
        'fpn_cells/cell_2/fnode1/op_after_combine6/bn',
        'fpn_cells/cell_2/fnode0/op_after_combine5/bn',
        'fpn_cells/cell_2/fnode7/op_after_combine12/bn',
    )
    tapped_features = [
        coco_model.get_layer(name=layer_name).output
        for layer_name in tap_layer_names
    ]

    sand_reg = regression_sand(tapped_features, w_bifpn, d_head, num_anchors)
    sand_cls = classification_sand(tapped_features, w_bifpn, d_head,
                                   num_anchors, num_classes)
    sand_pro = properties_sand(tapped_features, w_bifpn, d_head, num_anchors,
                               num_properties)

    sand_model = models.Model(
        inputs=[image_input],
        outputs=[sand_reg, sand_cls, sand_pro],
        name='efficientdet_sand',
    )

    # Anchors are supplied by the caller as a second input of the
    # prediction model.
    anchors_input = layers.Input((None, 4), name='anchors_input')
    decoded = RegressBoxes(name='boxes')([anchors_input, sand_reg])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
        nms_threshold=nms_threshold,
        class_specific_filter=True,
        max_detections=100,
    )
    detections = filter_layer([clipped, sand_cls, sand_pro])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return sand_model, prediction_model
def sapd(
    phi,
    soft_select=False,
    num_classes=20,
    freeze_bn=False,
    max_gt_boxes=100,
    batch_size=32,
    score_threshold=0.01,
):
    """Build the SAPD training model (with in-graph losses) and predictor.

    Args:
        phi: Index in ``range(7)``; selects the image size, BiFPN width and
            backbone, and sets the BiFPN depth (``2 + phi``) and head depth
            (``3 + int(phi / 3)``).
        soft_select: Forwarded to ``MetaSelectWeight``. When true, the
            weights are computed from the meta-selection prediction,
            otherwise from the meta-selection target.
        num_classes: Forwarded to ``build_class_head`` and ``SAPDTarget``.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        max_gt_boxes: First dimension of the ground-truth boxes input;
            also forwarded to ``MetaSelectWeight``.
        batch_size: Forwarded to ``MetaSelectWeight``.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``. ``model`` takes
        ``[image, gt_boxes, num_gt_boxes, fm_shapes]`` and outputs
        ``[cls_loss, regr_loss, meta_select_loss, cls_pred, regr_pred,
        cls_target, regr_target]``. ``prediction_model`` takes the image
        and outputs the filtered detections.
    """
    assert phi in range(7)

    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))
    gt_boxes_input = layers.Input((max_gt_boxes, 5))
    num_gt_boxes_input = layers.Input((1, ), dtype='int32')
    fm_shapes_input = layers.Input((5, 2), dtype='int32')

    backbone_fn = backbones[phi]
    pyramid_features = backbone_fn(input_tensor=image_input,
                                   freeze_bn=freeze_bn)

    bifpn_width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_width = bifpn_width
    head_depth = 3 + int(phi / 3)
    for cell_index in range(bifpn_depth):
        pyramid_features = build_BiFPN(pyramid_features, bifpn_width,
                                       cell_index, freeze_bn=freeze_bn)

    regr_head = build_regress_head(head_width, head_depth)
    cls_head = build_class_head(head_width, head_depth,
                                num_classes=num_classes)

    # The same heads run on every pyramid level; outputs are joined on axis 1.
    per_level_cls = [cls_head(fmap) for fmap in pyramid_features]
    cls_pred = layers.Concatenate(axis=1, name='classification')(
        per_level_cls)
    per_level_regr = [regr_head(fmap) for fmap in pyramid_features]
    regr_pred = layers.Concatenate(axis=1, name='regression')(per_level_regr)

    # Meta select net.
    meta_select_net = build_meta_select_net(width=head_width)
    meta_select_input, gt_boxes_batch_ids = MetaSelectInput()(
        [gt_boxes_input, *pyramid_features])
    meta_select_pred = meta_select_net(meta_select_input)
    meta_select_target = MetaSelectTarget()(
        [cls_pred, regr_pred, fm_shapes_input, gt_boxes_input])

    # lambda == 0.1 in paper
    meta_select_loss = layers.Lambda(
        lambda pair: 0.1 * losses.sparse_categorical_crossentropy(
            pair[0], pair[1]),
        output_shape=(1, ),
        name="meta_select_loss",
    )([meta_select_target, meta_select_pred])

    # Soft selection weights come from the prediction; hard selection
    # weights come from the target. Everything else is identical.
    weight_source = meta_select_pred if soft_select else meta_select_target
    meta_select_weight = MetaSelectWeight(
        max_gt_boxes=max_gt_boxes,
        soft_select=soft_select,
        batch_size=batch_size,
    )([weight_source, gt_boxes_batch_ids, num_gt_boxes_input])

    cls_target, regr_target = SAPDTarget(num_classes=num_classes)(
        [fm_shapes_input, gt_boxes_input, meta_select_weight])

    focal_loss = focal_with_weight_and_mask()
    iou_loss = iou_with_weight_and_mask()
    cls_loss = layers.Lambda(focal_loss, output_shape=(1, ),
                             name="cls_loss")([cls_target, cls_pred])
    regr_loss = layers.Lambda(iou_loss, output_shape=(1, ),
                              name="regr_loss")([regr_target, regr_pred])

    model = models.Model(
        inputs=[
            image_input,
            gt_boxes_input,
            num_gt_boxes_input,
            fm_shapes_input,
        ],
        outputs=[
            cls_loss,
            regr_loss,
            meta_select_loss,
            cls_pred,
            regr_pred,
            cls_target,
            regr_target,
        ],
        name='sapd',
    )

    # Apply predicted regression to the per-level locations.
    locations, strides = Locations()(pyramid_features)
    decoded = RegressBoxes(name='boxes')([locations, strides, regr_pred])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = filter_layer([clipped, cls_pred])

    prediction_model = models.Model(
        inputs=[image_input],
        outputs=detections,
        name='sapd_p',
    )
    return model, prediction_model
def apply_subnets_to_feature_maps(box_net, class_net, rotation_net,
                                  translation_net, fpn_feature_maps,
                                  image_input, camera_parameters_input,
                                  input_size, anchor_parameters):
    """Run every subnetwork on every BiFPN level and decode the outputs.

    Args:
        box_net, class_net, rotation_net, translation_net: Subnetworks; each
            is called with ``[feature_map, level_index]``.
        fpn_feature_maps: Sequence of the BiFPN feature maps of the
            different levels (P3, P4, P5, P6, P7).
        image_input: The image input layer.
        camera_parameters_input: The camera parameter input layer. Columns
            0-5 are read as fx, fy, px, py, tz_scale and image_scale.
        input_size: Integer representing the input image resolution.
        anchor_parameters: Struct containing anchor parameters. If None,
            default values are used.

    Returns:
        A tuple ``(classification, bbox_regression, rotation, translation,
        transformation, bboxes)``:

        classification: Classification outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, num_classes).
        bbox_regression: Deltas of anchor boxes to the GT 2D bounding boxes
            for all anchor boxes. Shape (batch_size, num_anchor_boxes, 4).
        rotation: Rotation outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, num_rotation_parameters).
        translation: Translation outputs for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, 3).
        transformation: Rotation and translation concatenated on the last
            axis. Shape (batch_size, num_anchor_boxes,
            num_rotation_parameters + 3). They are concatenated because a
            Keras loss function takes only one GT and one prediction tensor,
            but the transformation loss needs both.
        bboxes: 2D bounding boxes for all anchor boxes.
            Shape (batch_size, num_anchor_boxes, 4).
    """

    def _apply_to_all_levels(subnet, output_name):
        # Call the subnet once per pyramid level, then join along axis 1.
        per_level = [
            subnet([fmap, level])
            for level, fmap in enumerate(fpn_feature_maps)
        ]
        return layers.Concatenate(axis=1, name=output_name)(per_level)

    classification = _apply_to_all_levels(class_net, 'classification')
    bbox_regression = _apply_to_all_levels(box_net, 'regression')
    rotation = _apply_to_all_levels(rotation_net, 'rotation')
    translation_raw = _apply_to_all_levels(translation_net,
                                           'translation_raw_outputs')

    # Get anchors and apply the predicted translation offsets to the
    # translation anchors.
    anchors, translation_anchors = anchors_for_shape(
        (input_size, input_size), anchor_params=anchor_parameters)
    batched_translation_anchors = np.expand_dims(translation_anchors, axis=0)
    translation_xy_Tz = RegressTranslation(name='translation_regression')(
        [batched_translation_anchors, translation_raw])

    # Each camera parameter is one column of the camera parameter input.
    txty_layer = CalculateTxTy(name='translation')
    camera_columns = ('fx', 'fy', 'px', 'py', 'tz_scale', 'image_scale')
    camera_kwargs = {
        key: camera_parameters_input[:, column]
        for column, key in enumerate(camera_columns)
    }
    translation = txty_layer(translation_xy_Tz, **camera_kwargs)

    # Apply predicted 2D bbox regression to anchors.
    batched_anchors = np.expand_dims(anchors, axis=0)
    bboxes = RegressBoxes(name='boxes')(
        [batched_anchors, bbox_regression[..., :4]])
    bboxes = ClipBoxes(name='clipped_boxes')([image_input, bboxes])

    # Concatenate rotation and translation into a single transformation
    # output for the loss. The standard Concatenate layer reports a shape
    # mismatch here (translation's dim 2 is known via translation_anchors
    # while rotation's dim 2 is None), so a Lambda around tf.concat is used.
    transformation = layers.Lambda(
        lambda input_list: tf.concat(input_list, axis=-1),
        name="transformation",
    )([rotation, translation])

    return (classification, bbox_regression, rotation, translation,
            transformation, bboxes)
def yolo_body(num_classes=20, score_threshold=0.01):
    """Build an FSAF-style detector on a Darknet body, plus its predictor.

    Three output maps are produced by a top-down pass over the Darknet
    features, flattened, and split into a sigmoid classification part and a
    ReLU regression part (the first four channels). Losses are computed
    inside the graph from the ground-truth and grid-shape inputs.

    Args:
        num_classes: Number of classes; each output map has
            ``4 + num_classes`` channels.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``: ``model`` takes
        ``[image, gt_boxes, grid_shapes]`` and outputs
        ``[cls_loss, regr_loss]``; ``prediction_model`` takes the image and
        outputs the filtered detections.
    """
    image_input = Input(shape=(None, None, 3), name='image_input')
    darknet = Model([image_input], darknet_body(image_input))

    ##################################################
    # build fsaf head
    ##################################################
    out_channels = 4 + num_classes
    x, coarsest_map = make_last_layers(darknet.output, 512, out_channels)
    feature_maps = [coarsest_map]
    # Top-down pass: 1x1 conv + 2x upsample, merge with an earlier Darknet
    # layer output, then produce the next output map.
    for filters, skip_layer_index in ((256, 152), (128, 92)):
        x = compose(darknet_conv2d_bn_leaky(filters, (1, 1)),
                    UpSampling2D(2))(x)
        x = Concatenate()([x, darknet.layers[skip_layer_index].output])
        x, level_map = make_last_layers(x, filters, out_channels)
        feature_maps.append(level_map)

    # Flatten the spatial dimensions of every map and stack them on axis 1.
    flattened = [Reshape((-1, out_channels))(fmap) for fmap in feature_maps]
    stacked = Concatenate(axis=1)(flattened)

    batch_cls_pred = Lambda(lambda x: x[..., 4:])(stacked)
    batch_regr_pred = Lambda(lambda x: x[..., :4])(stacked)
    batch_cls_pred = Activation('sigmoid')(batch_cls_pred)
    batch_regr_pred = Activation('relu')(batch_regr_pred)

    gt_boxes_input = Input(shape=(config.MAX_NUM_GT_BOXES, 5),
                           name='gt_boxes_input')
    grid_shapes_input = Input((len(config.STRIDES), 2), dtype='int32',
                              name='grid_shapes_input')

    batch_gt_box_levels = LevelSelect(name='level_select')(
        [batch_cls_pred, batch_regr_pred, grid_shapes_input, gt_boxes_input])

    fsaf_targets = FSAFTarget(num_classes=num_classes, name='fsaf_target')(
        [batch_gt_box_levels, grid_shapes_input, gt_boxes_input])
    (batch_cls_target, batch_cls_mask, batch_cls_num_pos,
     batch_regr_target, batch_regr_mask) = fsaf_targets

    focal_loss_graph = focal_with_mask()
    iou_loss_graph = iou_with_mask()
    cls_loss = Lambda(focal_loss_graph, output_shape=(1, ), name="cls_loss")(
        [batch_cls_target, batch_cls_pred, batch_cls_mask, batch_cls_num_pos])
    regr_loss = Lambda(iou_loss_graph, output_shape=(1, ), name="regr_loss")(
        [batch_regr_target, batch_regr_pred, batch_regr_mask])

    model = Model(
        inputs=[image_input, gt_boxes_input, grid_shapes_input],
        outputs=[cls_loss, regr_loss],
        name='fsaf',
    )

    # Compute the per-level locations and apply the predicted regression.
    locations, strides = Locations(strides=config.STRIDES)(feature_maps)
    decoded = RegressBoxes(name='boxes')(
        [locations, strides, batch_regr_pred])
    clipped = ClipBoxes(name='clipped_boxes')([image_input, decoded])

    # Filter detections (apply NMS / score threshold / select top-k).
    filter_layer = FilterDetections(
        nms=True,
        class_specific_filter=True,
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = filter_layer([clipped, batch_cls_pred])

    prediction_model = Model(inputs=image_input, outputs=detections,
                             name='fsaf_detection')
    return model, prediction_model