def efficientdet(phi, num_classes=20, num_anchors=9, weighted_bifpn=False,
                 freeze_bn=False, score_threshold=0.01, detect_quadrangle=False,
                 anchor_parameters=None, separable_conv=True):
    """Assemble an EfficientDet training model and a matching detection model.

    Args:
        phi: Scaling index; must satisfy ``phi in range(7)``. It selects the
            input size, BiFPN width/depth, head depth and backbone from the
            module-level lookup tables.
        num_classes: Forwarded to ``ClassNet``.
        num_anchors: Forwarded to both ``BoxNet`` and ``ClassNet``.
        weighted_bifpn: When true the BiFPN cells are built with
            ``build_wBiFPN``; otherwise ``build_BiFPN`` is used.
        freeze_bn: Forwarded to the backbone, the BiFPN cells and both heads.
        score_threshold: Forwarded to ``FilterDetections``.
        detect_quadrangle: Forwarded to ``BoxNet``. When true the detection
            filter is also built in quadrangle mode and additionally receives
            ``regression[..., 4:8]`` and ``regression[..., 8]``.
        anchor_parameters: Forwarded to ``anchors_for_shape`` as
            ``anchor_params``.
        separable_conv: Forwarded to both heads.

    Returns:
        ``(model, prediction_model)``. ``model`` maps the image input to
        ``[classification, regression]``; ``prediction_model`` maps the same
        image input to the output of ``FilterDetections``.
    """
    assert phi in range(7)

    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))

    bifpn_width = w_bifpns[phi]
    bifpn_depth = d_bifpns[phi]
    head_width = bifpn_width
    head_depth = d_heads[phi]
    make_backbone = backbones[phi]

    pyramid = make_backbone(input_tensor=image_input, freeze_bn=freeze_bn)

    # Both BiFPN flavours share one call signature, so pick the builder once.
    build_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        pyramid = build_cell(pyramid, bifpn_width, cell_index, freeze_bn=freeze_bn)

    box_net = BoxNet(
        head_width,
        head_depth,
        num_anchors=num_anchors,
        separable_conv=separable_conv,
        freeze_bn=freeze_bn,
        detect_quadrangle=detect_quadrangle,
        name='box_net',
    )
    class_net = ClassNet(
        head_width,
        head_depth,
        num_classes=num_classes,
        num_anchors=num_anchors,
        separable_conv=separable_conv,
        freeze_bn=freeze_bn,
        name='class_net',
    )

    # Each head receives the feature map together with its pyramid level index.
    per_level_scores = []
    for level, feature in enumerate(pyramid):
        per_level_scores.append(class_net([feature, level]))
    classification = layers.Concatenate(axis=1, name='classification')(per_level_scores)

    per_level_boxes = []
    for level, feature in enumerate(pyramid):
        per_level_boxes.append(box_net([feature, level]))
    regression = layers.Concatenate(axis=1, name='regression')(per_level_boxes)

    model = models.Model(
        inputs=[image_input],
        outputs=[classification, regression],
        name='efficientdet',
    )

    # Apply the predicted regression to anchors computed for the fixed input size.
    anchors = anchors_for_shape((side, side), anchor_params=anchor_parameters)
    anchors_input = np.expand_dims(anchors, axis=0)
    box_decoder = RegressBoxes(name='boxes')
    boxes = box_decoder([anchors_input, regression[..., :4]])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    if detect_quadrangle:
        detection_filter = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
            detect_quadrangle=True,
        )
        filter_inputs = [boxes, classification, regression[..., 4:8], regression[..., 8]]
    else:
        detection_filter = FilterDetections(
            name='filtered_detections',
            score_threshold=score_threshold,
        )
        filter_inputs = [boxes, classification]
    detections = detection_filter(filter_inputs)

    prediction_model = models.Model(
        inputs=[image_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def freeze_model(model, config, score_threshold=0.5):
    """Wrap ``model`` with box decoding, clipping and detection filtering.

    Args:
        model: Model whose ``outputs`` unpack to ``(classification, regression)``
            and whose ``input`` is used as the image input.
        config: Object exposing ``input_shape``, ``sizes``, ``ratios``,
            ``scales`` and ``strides``; these are forwarded to
            ``anchors_for_shape``.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        A ``models.Model`` named ``'efficientdet_p'`` mapping ``model.input`` to
        the filtered detections.
    """
    classification, regression = model.outputs

    anchor_settings = dict(
        image_shape=config.input_shape,
        sizes=config.sizes,
        ratios=config.ratios,
        scales=config.scales,
        strides=config.strides,
        pyramid_levels=[3, 4, 5, 6, 7],
        shapes_callback=None,
    )
    anchors = anchors_for_shape(**anchor_settings)

    # Apply the predicted regression to the anchors (given a leading axis of 1).
    anchors_input = tf.expand_dims(tf.convert_to_tensor(anchors), axis=0)
    box_decoder = RegressBoxes(name='boxes')
    boxes = box_decoder([anchors_input, regression[..., :4]])
    boxes = ClipBoxes(name='clipped_boxes')([model.input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    # NOTE: only boxes and class scores are passed on; the quadrangle variant
    # of the filter is not wired up in this function.
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = detection_filter([boxes, classification])

    prediction_model = models.Model(
        inputs=model.input,
        outputs=detections,
        name='efficientdet_p',
    )
    return prediction_model
def efficientdet(phi, num_classes=20, weighted_bifpn=False, freeze_bn=False,
                 score_threshold=0.01):
    """Build an EfficientDet model pair whose detector takes anchors as an input.

    Args:
        phi: Scaling index; must satisfy ``phi in range(7)``. It selects the
            input size, BiFPN width and backbone from the module-level tables;
            BiFPN depth is ``2 + phi`` and head depth is ``3 + int(phi / 3)``.
        num_classes: Forwarded to ``build_class_head``.
        weighted_bifpn: When true the BiFPN cells are built with
            ``build_wBiFPN``; otherwise ``build_BiFPN`` is used.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``. ``model`` maps the image input to
        ``[regression, classification]``. ``prediction_model`` takes
        ``[image_input, anchors_input]`` and yields the filtered detections.
    """
    assert phi in range(7)

    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))

    bifpn_width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_width = bifpn_width
    head_depth = 3 + int(phi / 3)
    make_backbone = backbones[phi]

    features = make_backbone(input_tensor=image_input, freeze_bn=freeze_bn)

    # Both BiFPN flavours share one call signature, so pick the builder once.
    build_cell = build_wBiFPN if weighted_bifpn else build_BiFPN
    for cell_index in range(bifpn_depth):
        features = build_cell(features, bifpn_width, cell_index, freeze_bn=freeze_bn)

    regress_head = build_regress_head(head_width, head_depth)
    class_head = build_class_head(head_width, head_depth, num_classes=num_classes)

    # The same head is applied to every pyramid level; results are joined on axis 1.
    per_level_boxes = []
    for feature in features:
        per_level_boxes.append(regress_head(feature))
    regression = layers.Concatenate(axis=1, name='regression')(per_level_boxes)

    per_level_scores = []
    for feature in features:
        per_level_scores.append(class_head(feature))
    classification = layers.Concatenate(axis=1, name='classification')(per_level_scores)

    model = models.Model(
        inputs=[image_input],
        outputs=[regression, classification],
        name='efficientdet',
    )

    # Apply the predicted regression to anchors supplied by the caller at run time.
    anchors_input = layers.Input((None, 4))
    boxes = RegressBoxes(name='boxes')([anchors_input, regression])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = detection_filter([boxes, classification])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def efficientdet(num_anchors, num_classes, num_properties, w_bifpn, d_bifpn,
                 d_head, score_threshold, nms_threshold):
    """Build an EfficientDet variant with an extra "properties" output head.

    The backbone is always ``backbones[0]`` and the BiFPN cells are always
    built with ``build_wBiFPN``.

    Args:
        num_anchors: Forwarded to all three heads.
        num_classes: Forwarded to ``classification_coco``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of BiFPN cells to stack.
        d_head: Head depth, forwarded to all three heads.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``. ``model`` maps the image input to
        ``[reg, cls, pro]``. ``prediction_model`` takes
        ``[image_input, anchors_input]`` and yields the filtered detections.
    """
    image_input = layers.Input(shape=(None, None, 3))
    head_width = w_bifpn
    make_backbone = backbones[0]

    # Original author's shape note (not verified here):
    # [(?, 256, 256, 16), (?, 128, 128, 24), (?, 64, 64, 24), (?, 32, 32, 24), (?, 16, 16, 24)]
    pyramid = make_backbone(input_tensor=image_input)
    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    reg = regression_coco(pyramid, head_width, d_head, num_anchors)
    cls = classification_coco(pyramid, head_width, d_head, num_anchors, num_classes)
    pro = properties_sand(pyramid, head_width, d_head, num_anchors, num_properties)

    model = models.Model(
        inputs=[image_input],
        outputs=[reg, cls, pro],
        name='efficientdet',
    )

    # Anchors are supplied by the caller as a second model input.
    anchors_input = layers.Input((None, 4), name='anchors_input')
    boxes = RegressBoxes(name='boxes')([anchors_input, reg])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    # Original author's shape note (not verified here):
    # boxes (?, 49104, 4) (?, 49104, 1) (?, 49104, 3)
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
        nms_threshold=nms_threshold,
        class_specific_filter=True,
        max_detections=100,
    )
    detections = detection_filter([boxes, cls, pro])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return model, prediction_model
def efficientdet_sand(num_anchors, num_classes, num_properties, w_bifpn,
                      d_bifpn, d_head, score_threshold, nms_threshold):
    """Build the "sand" detector on top of a pretrained EfficientDet trunk.

    A model named ``'efficientdet_coco'`` is built first and its weights are
    loaded by name from ``weights/efficientdet-d0.h5`` next to this file. New
    ``*_sand`` heads are then attached to five named BiFPN outputs of that
    model. No layers are frozen in this function.

    Args:
        num_anchors: Forwarded to every head.
        num_classes: Forwarded to ``classification_sand``.
        num_properties: Forwarded to ``properties_sand``.
        w_bifpn: BiFPN width; also used as the head width.
        d_bifpn: Number of BiFPN cells to stack.
        d_head: Head depth, forwarded to every head.
        score_threshold: Forwarded to ``FilterDetections``.
        nms_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(sand_model, prediction_model)``. ``sand_model`` maps the image input
        to ``[sand_reg, sand_cls, sand_pro]``. ``prediction_model`` takes
        ``[image_input, anchors_input]`` and yields the filtered detections.
    """
    image_input = layers.Input(shape=(None, None, 3))
    head_width = w_bifpn
    make_backbone = backbones[0]

    # Original author's shape note (not verified here):
    # [(?, 256, 256, 16), (?, 128, 128, 24), (?, 64, 64, 24), (?, 32, 32, 24), (?, 16, 16, 24)]
    pyramid = make_backbone(input_tensor=image_input)
    for cell_index in range(d_bifpn):
        pyramid = build_wBiFPN(pyramid, w_bifpn, cell_index)

    # Heads of the pretrained model; the class head is built with a fixed 90
    # classes (presumably to match the checkpoint -- confirm against the weights).
    reg = regression_coco(pyramid, head_width, d_head, num_anchors)
    cls = classification_coco(pyramid, head_width, d_head, num_anchors, 90)
    coco_model = models.Model(
        inputs=[image_input],
        outputs=[reg, cls],
        name='efficientdet_coco',
    )
    weights_path = os.path.join(os.path.dirname(__file__), 'weights/efficientdet-d0.h5')
    coco_model.load_weights(weights_path, by_name=True)

    # Tap five intermediate outputs of the pretrained model by layer name; the
    # order below is the order in which they are handed to the new heads.
    tap_names = (
        'fpn_cells/cell_2/fnode3/op_after_combine8/bn',
        'fpn_cells/cell_2/fnode2/op_after_combine7/bn',
        'fpn_cells/cell_2/fnode1/op_after_combine6/bn',
        'fpn_cells/cell_2/fnode0/op_after_combine5/bn',
        'fpn_cells/cell_2/fnode7/op_after_combine12/bn',
    )
    tapped_features = [coco_model.get_layer(name=tap).output for tap in tap_names]

    sand_reg = regression_sand(tapped_features, head_width, d_head, num_anchors)
    sand_cls = classification_sand(tapped_features, head_width, d_head, num_anchors, num_classes)
    sand_pro = properties_sand(tapped_features, head_width, d_head, num_anchors, num_properties)

    sand_model = models.Model(
        inputs=[image_input],
        outputs=[sand_reg, sand_cls, sand_pro],
        name='efficientdet_sand',
    )

    # Anchors are supplied by the caller as a second model input.
    anchors_input = layers.Input((None, 4), name='anchors_input')
    boxes = RegressBoxes(name='boxes')([anchors_input, sand_reg])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    # Original author's shape note (not verified here):
    # boxes (?, 49104, 4) (?, 49104, 1) (?, 49104, 3)
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
        nms_threshold=nms_threshold,
        class_specific_filter=True,
        max_detections=100,
    )
    detections = detection_filter([boxes, sand_cls, sand_pro])

    prediction_model = models.Model(
        inputs=[image_input, anchors_input],
        outputs=detections,
        name='efficientdet_p',
    )
    return sand_model, prediction_model
def sapd(
    phi,
    soft_select=False,
    num_classes=20,
    freeze_bn=False,
    max_gt_boxes=100,
    batch_size=32,
    score_threshold=0.01,
):
    """Build the SAPD training model (losses as outputs) and its detector.

    Args:
        phi: Scaling index; must satisfy ``phi in range(7)``. It selects the
            input size, BiFPN width and backbone from the module-level tables;
            BiFPN depth is ``2 + phi`` and head depth is ``3 + int(phi / 3)``.
        soft_select: Forwarded to ``MetaSelectWeight``. When true that layer is
            fed the meta-selection prediction; otherwise it is fed the
            meta-selection target.
        num_classes: Forwarded to ``build_class_head`` and ``SAPDTarget``.
        freeze_bn: Forwarded to the backbone and the BiFPN cells.
        max_gt_boxes: Size of the first axis of the ground-truth box input;
            also forwarded to ``MetaSelectWeight``.
        batch_size: Forwarded to ``MetaSelectWeight``.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``. ``model`` takes
        ``[image, gt_boxes, num_gt_boxes, fm_shapes]`` and outputs
        ``[cls_loss, regr_loss, meta_select_loss, cls_pred, regr_pred,
        cls_target, regr_target]``. ``prediction_model`` takes only the image
        and yields the filtered detections.
    """
    assert phi in range(7)

    side = image_sizes[phi]
    image_input = layers.Input((side, side, 3))
    gt_boxes_input = layers.Input((max_gt_boxes, 5))
    num_gt_boxes_input = layers.Input((1, ), dtype='int32')
    fm_shapes_input = layers.Input((5, 2), dtype='int32')

    make_backbone = backbones[phi]
    # Original author's note: the backbone yields (C1, C2, C3, C4, C5).
    pyramid_features = make_backbone(input_tensor=image_input, freeze_bn=freeze_bn)

    bifpn_width = w_bifpns[phi]
    bifpn_depth = 2 + phi
    head_width = bifpn_width
    head_depth = 3 + int(phi / 3)
    for cell_index in range(bifpn_depth):
        pyramid_features = build_BiFPN(
            pyramid_features, bifpn_width, cell_index, freeze_bn=freeze_bn)

    regr_head = build_regress_head(head_width, head_depth)
    cls_head = build_class_head(head_width, head_depth, num_classes=num_classes)

    # The same head is applied to every pyramid level; results are joined on axis 1.
    per_level_cls = []
    for feature in pyramid_features:
        per_level_cls.append(cls_head(feature))
    cls_pred = layers.Concatenate(axis=1, name='classification')(per_level_cls)

    per_level_regr = []
    for feature in pyramid_features:
        per_level_regr.append(regr_head(feature))
    regr_pred = layers.Concatenate(axis=1, name='regression')(per_level_regr)

    # Meta selection net.
    meta_select_net = build_meta_select_net(width=head_width)
    meta_select_input, gt_boxes_batch_ids = MetaSelectInput()(
        [gt_boxes_input] + list(pyramid_features))
    meta_select_pred = meta_select_net(meta_select_input)
    meta_select_target = MetaSelectTarget()(
        [cls_pred, regr_pred, fm_shapes_input, gt_boxes_input])

    # The 0.1 factor is the loss weight ("lambda == 0.1 in paper" per the
    # original author's note).
    meta_select_loss = layers.Lambda(
        lambda x: 0.1 * losses.sparse_categorical_crossentropy(x[0], x[1]),
        output_shape=(1, ),
        name="meta_select_loss",
    )([meta_select_target, meta_select_pred])

    # Soft selection weights by the prediction, hard selection by the target.
    weight_source = meta_select_pred if soft_select else meta_select_target
    meta_select_weight = MetaSelectWeight(
        max_gt_boxes=max_gt_boxes,
        soft_select=soft_select,
        batch_size=batch_size,
    )([weight_source, gt_boxes_batch_ids, num_gt_boxes_input])

    cls_target, regr_target = SAPDTarget(num_classes=num_classes)(
        [fm_shapes_input, gt_boxes_input, meta_select_weight])

    focal_loss = focal_with_weight_and_mask()
    iou_loss = iou_with_weight_and_mask()
    cls_loss = layers.Lambda(
        focal_loss, output_shape=(1, ), name="cls_loss")([cls_target, cls_pred])
    regr_loss = layers.Lambda(
        iou_loss, output_shape=(1, ), name="regr_loss")([regr_target, regr_pred])

    training_inputs = [image_input, gt_boxes_input, num_gt_boxes_input, fm_shapes_input]
    training_outputs = [
        cls_loss,
        regr_loss,
        meta_select_loss,
        cls_pred,
        regr_pred,
        cls_target,
        regr_target,
    ]
    model = models.Model(inputs=training_inputs, outputs=training_outputs, name='sapd')

    locations, strides = Locations()(pyramid_features)

    # Decode the predicted regression against the per-level locations and strides.
    boxes = RegressBoxes(name='boxes')([locations, strides, regr_pred])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = detection_filter([boxes, cls_pred])

    prediction_model = models.Model(
        inputs=[image_input],
        outputs=detections,
        name='sapd_p',
    )
    return model, prediction_model
def build_EfficientPose(phi,
                        num_classes=8,
                        num_anchors=9,
                        freeze_bn=False,
                        score_threshold=0.5,
                        anchor_parameters=None,
                        num_rotation_parameters=3,
                        print_architecture=True):
    """
    Construct the EfficientPose training and prediction models.

    Args:
        phi: EfficientPose scaling hyperparameter; must satisfy ``phi in range(7)``
        num_classes: Number of classes
        num_anchors: Number of anchors, usually 3 scales and 3 aspect ratios resulting in 3 * 3 = 9 anchors
        freeze_bn: Boolean indicating whether the batch norm layers should be frozen during training
        score_threshold: Minimum score threshold at which a prediction is not filtered out
        anchor_parameters: Struct containing anchor parameters. If None, default values are used.
        num_rotation_parameters: Number of rotation parameters, e.g. 3 for axis angle representation
        print_architecture: Boolean indicating whether the model architecture should be printed

    Returns:
        efficientpose_train: EfficientPose model without NMS, used for training
        efficientpose_prediction: EfficientPose model including NMS, used for evaluation and inference
        all_layers: De-duplicated list of the layers of the training model and of every subnet.
                    Weights can be loaded layer by layer from it; otherwise a subnet may be treated as a
                    single unit and skipped entirely when only its output dimension mismatches the weight file.
    """
    # Select the architecture parameters for the given phi.
    assert phi in range(7)
    params = get_scaled_parameters(phi)

    input_size = params["input_size"]
    bifpn_width = subnet_width = params["bifpn_width"]
    bifpn_depth = params["bifpn_depth"]
    subnet_depth = params["subnet_depth"]
    subnet_num_iteration_steps = params["subnet_num_iteration_steps"]
    num_groups_gn = params["num_groups_gn"]
    backbone_class = params["backbone_class"]

    # Input layers. The second input carries the camera parameters and image
    # scale used for calculating the translation vector from 2D x-, y-coordinates.
    image_input = layers.Input((input_size, input_size, 3))
    camera_parameters_input = layers.Input((6, ))

    # EfficientNet backbone followed by the BiFPN.
    backbone_feature_maps = backbone_class(input_tensor=image_input,
                                           freeze_bn=freeze_bn)
    fpn_feature_maps = build_BiFPN(backbone_feature_maps, bifpn_depth,
                                   bifpn_width, freeze_bn)

    # Build the four subnets.
    box_net, class_net, rotation_net, translation_net = build_subnets(
        num_classes,
        subnet_width,
        subnet_depth,
        subnet_num_iteration_steps,
        num_groups_gn,
        num_rotation_parameters,
        freeze_bn,
        num_anchors,
    )

    # Apply the subnets to the feature maps.
    subnet_outputs = apply_subnets_to_feature_maps(
        box_net,
        class_net,
        rotation_net,
        translation_net,
        fpn_feature_maps,
        image_input,
        camera_parameters_input,
        input_size,
        anchor_parameters,
    )
    classification, bbox_regression, rotation, translation, transformation, bboxes = subnet_outputs

    # Training model: no NMS, and rotation and translation are combined in the
    # transformation output because of the loss calculation.
    model_inputs = [image_input, camera_parameters_input]
    efficientpose_train = models.Model(
        inputs=model_inputs,
        outputs=[classification, bbox_regression, transformation],
        name='efficientpose',
    )

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        num_rotation_parameters=num_rotation_parameters,
        num_translation_parameters=3,
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    filtered_detections = detection_filter(
        [bboxes, classification, rotation, translation])

    efficientpose_prediction = models.Model(
        inputs=[image_input, camera_parameters_input],
        outputs=filtered_detections,
        name='efficientpose_prediction',
    )

    if print_architecture:
        print_models(efficientpose_train, box_net, class_net, rotation_net,
                     translation_net)

    # Collect every layer so weights can be loaded per layer; sometimes a whole
    # subnet is skipped on an output-shape mismatch instead of just its output layer.
    layer_pool = []
    for net in (efficientpose_train, box_net, class_net, rotation_net, translation_net):
        layer_pool.extend(net.layers)
    all_layers = list(set(layer_pool))

    return efficientpose_train, efficientpose_prediction, all_layers
def yolo_body(num_classes=20, score_threshold=0.01):
    """
    Build a darknet-based body with an FSAF head, for training and detection.

    Args:
        num_classes: Number of class channels; each head emits ``4 + num_classes``
            channels, of which the first 4 are treated as regression values.
        score_threshold: Forwarded to ``FilterDetections``.

    Returns:
        ``(model, prediction_model)``. ``model`` takes
        ``[image_input, gt_boxes_input, grid_shapes_input]`` and outputs
        ``[cls_loss, regr_loss]``. ``prediction_model`` takes only the image
        and yields the filtered detections.
    """
    image_input = Input(shape=(None, None, 3), name='image_input')
    darknet = Model([image_input], darknet_body(image_input))
    head_channels = 4 + num_classes

    ##################################################
    # build fsaf head
    ##################################################
    trunk, y1 = make_last_layers(darknet.output, 512, head_channels)

    # Upsample and merge with the output of darknet layer 152.
    trunk = compose(darknet_conv2d_bn_leaky(256, (1, 1)), UpSampling2D(2))(trunk)
    trunk = Concatenate()([trunk, darknet.layers[152].output])
    trunk, y2 = make_last_layers(trunk, 256, head_channels)

    # Upsample and merge with the output of darknet layer 92.
    trunk = compose(darknet_conv2d_bn_leaky(128, (1, 1)), UpSampling2D(2))(trunk)
    trunk = Concatenate()([trunk, darknet.layers[92].output])
    trunk, y3 = make_last_layers(trunk, 128, head_channels)

    # Flatten each head output and join the three along axis 1.
    flattened = [Reshape((-1, head_channels))(head) for head in (y1, y2, y3)]
    y = Concatenate(axis=1)(flattened)

    # Channels 4 and up are class scores, the first 4 are regression values.
    batch_cls_pred = Lambda(lambda x: x[..., 4:])(y)
    batch_regr_pred = Lambda(lambda x: x[..., :4])(y)
    batch_cls_pred = Activation('sigmoid')(batch_cls_pred)
    batch_regr_pred = Activation('relu')(batch_regr_pred)

    gt_boxes_input = Input(shape=(config.MAX_NUM_GT_BOXES, 5),
                           name='gt_boxes_input')
    grid_shapes_input = Input((len(config.STRIDES), 2),
                              dtype='int32',
                              name='grid_shapes_input')

    batch_gt_box_levels = LevelSelect(name='level_select')(
        [batch_cls_pred, batch_regr_pred, grid_shapes_input, gt_boxes_input])

    fsaf_targets = FSAFTarget(num_classes=num_classes, name='fsaf_target')(
        [batch_gt_box_levels, grid_shapes_input, gt_boxes_input])
    (batch_cls_target, batch_cls_mask, batch_cls_num_pos,
     batch_regr_target, batch_regr_mask) = fsaf_targets

    focal_loss_graph = focal_with_mask()
    iou_loss_graph = iou_with_mask()
    cls_loss = Lambda(focal_loss_graph, output_shape=(1, ), name="cls_loss")(
        [batch_cls_target, batch_cls_pred, batch_cls_mask, batch_cls_num_pos])
    regr_loss = Lambda(iou_loss_graph, output_shape=(1, ), name="regr_loss")(
        [batch_regr_target, batch_regr_pred, batch_regr_mask])

    model = Model(
        inputs=[image_input, gt_boxes_input, grid_shapes_input],
        outputs=[cls_loss, regr_loss],
        name='fsaf',
    )

    # Compute the per-level locations and strides from the raw head outputs.
    locations, strides = Locations(strides=config.STRIDES)([y1, y2, y3])

    # Decode the predicted regression against those locations.
    boxes = RegressBoxes(name='boxes')([locations, strides, batch_regr_pred])
    boxes = ClipBoxes(name='clipped_boxes')([image_input, boxes])

    # Filter detections (apply NMS / score threshold / select top-k).
    detection_filter = FilterDetections(
        nms=True,
        class_specific_filter=True,
        name='filtered_detections',
        score_threshold=score_threshold,
    )
    detections = detection_filter([boxes, batch_cls_pred])

    prediction_model = Model(
        inputs=image_input,
        outputs=detections,
        name='fsaf_detection',
    )
    return model, prediction_model