def sampling(args):
    """Draw a latent sample with the reparameterization trick.

    Args:
        args: A two-element sequence ``(z_mean, z_log_var)`` holding the
            mean and the log-variance tensors of the latent Gaussian.

    Returns:
        ``z_mean + sigma * epsilon`` where ``epsilon`` is standard-normal
        noise and ``sigma = exp(0.5 * z_log_var)``.
    """
    z_mean, z_log_var = args
    # Take the noise shape from z_mean itself so the function does not rely
    # on a module-level `latent_dim` being defined and in sync.
    epsilon = K.random_normal(shape=K.shape(z_mean), mean=0., stddev=1.)
    # z_log_var is log(sigma^2); the standard deviation is exp(0.5 * log_var).
    # Scaling by exp(z_log_var) would multiply the noise by the variance.
    return z_mean + K.exp(0.5 * z_log_var) * epsilon
def _smooth_labels(y_true, label_smoothing):
    """Apply label smoothing to `y_true`.

    Each target is scaled by ``1 - label_smoothing`` and
    ``label_smoothing / num_classes`` is added to every entry, where
    ``num_classes`` is the size of the last axis of `y_true`.
    """
    class_count = tf.cast(K.shape(y_true)[-1], dtype=K.floatx())
    smoothing = K.constant(label_smoothing, dtype=K.floatx())
    keep_fraction = 1.0 - smoothing
    uniform_share = smoothing / class_count
    return y_true * keep_fraction + uniform_share
def build_model(hidden_dim, max_seq_len, vocabulary_size):
    """Build a GRU encoder-decoder and two inference models sharing its layers.

    Args:
        hidden_dim: Number of units of the encoder and decoder GRU layers.
        max_seq_len: Length of the `encoder_in` / `decoder_in` inputs of the
            training model.
        vocabulary_size: Depth of the one-hot encoding and size of the
            softmax output layer.

    Returns:
        A tuple ``(model, encoder_model, decoder_model)``:

        * ``model`` maps ``[encoder_in, decoder_in, ith, ith_str, word]`` to
          ``[decoder_out, specific_word]``.
        * ``encoder_model`` maps ``[encoder_in, ith_str, word]`` (with a
          variable-length ``encoder_in``) to ``[encoder_out, state]``.
        * ``decoder_model`` maps ``[decoder_in_one_word, encoder_out,
          decoder_state_in, ith, word]`` to ``[decoder_out, decoder_state]``.

        The two inference models reuse the layer objects of ``model``.
    """
    # --- Encoder inputs and the shared one-hot layer ---
    encoder_in = Input((max_seq_len, ), dtype='int32', name='encoder_in')
    ith_str = Input((1, ), dtype='int32', name='ith_str')
    word = Input((1, ), dtype='int32', name='word')
    OneHot = Lambda(lambda x: K.one_hot(x, vocabulary_size), name='OneHot')

    # --- Encoder: the ids of ith_str, word and the sequence are concatenated,
    # one-hot encoded and fed to the GRU ---
    encoder_in_and_word = Concatenate()([ith_str, word, encoder_in])
    encoder_GRU = GRU(hidden_dim, return_state=True, return_sequences=True)
    encoder_out, state = encoder_GRU(OneHot(encoder_in_and_word))
    # Only the last time step of the encoder output is kept and repeated
    # max_seq_len times so it can be concatenated to every decoder step.
    encoder_out_dup = RepeatVector(max_seq_len)(encoder_out[:, -1])

    # --- Decoder inputs and layers ---
    decoder_in = Input((max_seq_len, ), dtype='int32', name='decoder_in')
    ith = Input((1, ), dtype='int32', name='ith')
    decoder_GRU = GRU(hidden_dim, return_sequences=True, return_state=True)
    decoder_Dense = Dense(vocabulary_size, activation='softmax', name='decoder_out')

    # --- Decoder: every step sees ith (as a float), the one-hot word, the
    # one-hot decoder token and the repeated encoder output ---
    ith_dup = RepeatVector(max_seq_len)(K.cast(ith, 'float'))
    word_dup = K.reshape(RepeatVector(max_seq_len)(word), (-1, max_seq_len))
    x = Concatenate()(
        [ith_dup, OneHot(word_dup), OneHot(decoder_in), encoder_out_dup])
    # The decoder GRU starts from the final encoder state.
    x, _ = decoder_GRU(x, initial_state=state)
    decoder_out = decoder_Dense(x)

    # --- Select, for each sample, the decoder output at position `ith` ---
    # `gather` pairs each batch index with that sample's `ith` value so that
    # gather_nd reads decoder_out[b, ith[b]].
    gather = K.concatenate(
        [K.reshape(tf.range(K.shape(decoder_out)[0]), (-1, 1)), ith])
    specific_word = tf.gather_nd(decoder_out, gather)
    specific_word = Lambda(tf.identity, name='word_out')(
        specific_word
    )  # Identity layer added only to give the gather_nd output a readable name
    model = Model([encoder_in, decoder_in, ith, ith_str, word],
                  [decoder_out, specific_word])

    # --- Single-step decoder model driven by an externally supplied encoder
    # output and GRU state ---
    decoder_in_one_word = Input((1, ), dtype='int32', name='decoder_in_one_word')
    decoder_state_in = Input((hidden_dim, ), name='decoder_state_in')
    # NOTE: `encoder_out` is rebound here to a new Input of shape (hidden_dim,).
    # Presumably callers feed the last time step of the encoder output, as the
    # training graph does -- TODO confirm against the inference code.
    encoder_out = Input((hidden_dim, ), name='decoder_encoder_out')
    x = Concatenate()([
        K.cast(ith, 'float')[:, tf.newaxis],
        OneHot(word),
        OneHot(decoder_in_one_word), encoder_out[:, tf.newaxis]
    ])
    # Same decoder_GRU / decoder_Dense objects as in `model`, so the layers
    # are shared rather than re-created.
    x, decoder_state = decoder_GRU(x, initial_state=decoder_state_in)
    decoder_out = decoder_Dense(x)
    decoder_model = Model(
        [decoder_in_one_word, encoder_out, decoder_state_in, ith, word],
        [decoder_out, decoder_state])

    # --- Stand-alone encoder model accepting a variable-length sequence ---
    # `encoder_in`, `encoder_in_and_word`, `encoder_out` and `state` are
    # rebound; the encoder_GRU layer object is reused.
    encoder_in = Input((None, ), dtype='int32')
    encoder_in_and_word = Concatenate()([ith_str, word, encoder_in])
    encoder_out, state = encoder_GRU(OneHot(encoder_in_and_word))
    encoder_model = Model([encoder_in, ith_str, word], [encoder_out, state])
    return model, encoder_model, decoder_model
def resize_bilinear(x):
    """Bilinearly resize `x` so that axes 1 and 2 grow by RESIZE_FACTOR."""
    dynamic_shape = K.shape(x)
    target_size = [
        dynamic_shape[1] * RESIZE_FACTOR,
        dynamic_shape[2] * RESIZE_FACTOR,
    ]
    return tf.compat.v1.image.resize_bilinear(x, size=target_size)
this equates to the Dice score when delta = 0.5 smooth: smoothing constant to prevent division by zero errors """ delta = 0.5 smooth = 0.000001 axis = identify_axis(y_true.get_shape()) # Calculate true positives (tp), false negatives (fn) and false positives (fp) tp = K.sum(y_true * y_pred, axis=axis) fn = K.sum(y_true * (1-y_pred), axis=axis) fp = K.sum((1-y_true) * y_pred, axis=axis) # Calculate Dice score dice_class = (tp + smooth)/(tp + delta*fn + (1-delta)*fp + smooth) # Sum up classes to one score dice_loss = K.sum(1-dice_class, axis=[-1]) # adjusts loss to account for number of classes num_classes = K.cast(K.shape(y_true)[-1],'float32') dice_loss = dice_loss / num_classes return dice_loss # Tversky loss def tversky_loss(y_true, y_pred): """ Paper: Tversky loss function for image segmentation using 3D fully convolutional deep networks Link: https://arxiv.org/abs/1706.05721 delta: controls weight given to false positive and false negatives. this equates to the Tversky index when delta = 0.7 smooth: smoothing constant to prevent division by zero errors """
def yolo5_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0, elim_grid_sense=True, use_focal_loss=False, use_focal_obj_loss=False, use_softmax_loss=False, use_giou_loss=False, use_diou_loss=True):
    '''
    YOLOv5 loss function.

    Parameters
    ----------
    args: sequence of tensors. The first ``len(anchors)//3`` entries are
        used as the model outputs (one per prediction head), the remaining
        entries as the matching ground-truth tensors.
    anchors: anchor sizes; indexed with a list of anchor ids per head, so
        presumably a NumPy array of shape (N, 2) -- TODO confirm.
    num_classes: integer, forwarded to yolo5_decode
    ignore_thresh: float. Only referenced by the commented-out ignore-mask
        code below, so it currently has no effect.
    label_smoothing: if truthy, the class targets are smoothed with
        _smooth_labels using this value.
    elim_grid_sense: not read by the active code; the per-head scale_x_y
        values are hard-coded.
    use_focal_loss: use a focal loss for the classification term.
    use_focal_obj_loss: use a sigmoid focal loss for the objectness term.
    use_softmax_loss: use softmax-style instead of sigmoid-style
        classification loss.
    use_giou_loss: use GIoU as location loss (checked first).
    use_diou_loss: use DIoU as location loss (checked when use_giou_loss
        is false).

    Returns
    -------
    A tuple (loss, total_location_loss, total_confidence_loss,
    total_class_loss). ``loss`` is the sum over all heads with one trailing
    axis added; the other three are the per-term sums over all heads.

    Raises
    ------
    ValueError: if neither use_giou_loss nor use_diou_loss is set.
    '''
    num_layers = len(anchors)//3 # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]

    # gains for box, class and confidence loss
    # from https://github.com/ultralytics/yolov5/blob/master/data/hyp.scratch.yaml
    box_loss_gain = 0.05
    class_loss_gain = 0.5
    confidence_loss_gain = 1.0

    # balance weights for confidence (objectness) loss
    # on different predict heads (x/32, x/16, x/8),
    # here the order is reversed from ultralytics PyTorch version
    # from https://github.com/ultralytics/yolov5/blob/master/utils/loss.py#L109
    confidence_balance_weights = [0.4, 1.0, 4.0]

    if num_layers == 3:
        anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
        # YOLOv5 enable "elim_grid_sense" by default
        scale_x_y = [2.0, 2.0, 2.0] #if elim_grid_sense else [None, None, None]
    else:
        # Any other layer count is treated as a two-head model.
        anchor_mask = [[3,4,5], [0,1,2]]
        scale_x_y = [1.05, 1.05] #if elim_grid_sense else [None, None]

    # Input size is derived from the first head, which is assumed to have
    # stride 32; cast to the dtype of the ground truth.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    grid_shapes = [K.cast(K.shape(yolo_outputs[i])[1:3], K.dtype(y_true[0])) for i in range(num_layers)]
    loss = 0
    total_location_loss = 0
    total_confidence_loss = 0
    total_class_loss = 0
    batch_size = K.shape(yolo_outputs[0])[0] # batch size, tensor
    batch_size_f = K.cast(batch_size, K.dtype(yolo_outputs[0]))

    for i in range(num_layers):
        # Channel 4 of the ground truth is the object mask, channels 5+ the
        # class targets, channels 0-3 the box.
        object_mask = y_true[i][..., 4:5]
        true_class_probs = y_true[i][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)
            #true_objectness_probs = _smooth_labels(object_mask, label_smoothing)
        #else:
            #true_objectness_probs = object_mask

        grid, raw_pred, pred_xy, pred_wh = yolo5_decode(yolo_outputs[i],
             anchors[anchor_mask[i]], num_classes, input_shape, scale_x_y=scale_x_y[i], calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        # NOTE: raw_true_xy / raw_true_wh are only consumed by the
        # commented-out YOLOv3 location loss further down.
        raw_true_xy = y_true[i][..., :2]*grid_shapes[i][::-1] - grid
        raw_true_wh = K.log(y_true[i][..., 2:4] / anchors[anchor_mask[i]] * input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh)) # avoid log(0)=-inf
        #box_loss_scale = 2 - y_true[i][...,2:3]*y_true[i][...,3:4]

        # Disabled: per-sample ignore mask (predictions whose best IoU with
        # any true box is below ignore_thresh).
        #ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        #object_mask_bool = K.cast(object_mask, 'bool')
        #def loop_body(b, ignore_mask):
            #true_box = tf.boolean_mask(y_true[i][b,...,0:4], object_mask_bool[b,...,0])
            #iou = box_iou(pred_box[b], true_box)
            #best_iou = K.max(iou, axis=-1)
            #ignore_mask = ignore_mask.write(b, K.cast(best_iou<ignore_thresh, K.dtype(true_box)))
            #return b+1, ignore_mask
        #_, ignore_mask = tf.while_loop(lambda b,*args: b<batch_size, loop_body, [0, ignore_mask])
        #ignore_mask = ignore_mask.stack()
        #ignore_mask = K.expand_dims(ignore_mask, -1)

        if use_giou_loss:
            # Calculate GIoU loss as location loss
            raw_true_box = y_true[i][...,0:4]
            giou = box_giou(raw_true_box, pred_box)
            giou_loss = object_mask * (1 - giou)
            location_loss = giou_loss
            iou = giou
        elif use_diou_loss:
            # Calculate DIoU loss as location loss
            raw_true_box = y_true[i][...,0:4]
            diou = box_diou(raw_true_box, pred_box)
            diou_loss = object_mask * (1 - diou)
            location_loss = diou_loss
            iou = diou
        else:
            raise ValueError('Unsupported IOU loss type')
            # Standard YOLOv3 location loss
            # K.binary_crossentropy is helpful to avoid exp overflow.
            #xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(raw_true_xy, raw_pred[...,0:2], from_logits=True)
            #wh_loss = object_mask * box_loss_scale * 0.5 * K.square(raw_true_wh-raw_pred[...,2:4])
            #xy_loss = K.sum(xy_loss) / batch_size_f
            #wh_loss = K.sum(wh_loss) / batch_size_f
            #location_loss = xy_loss + wh_loss

        # use box iou for positive sample as objectness ground truth,
        # to calculate confidence loss
        # from https://github.com/ultralytics/yolov5/blob/master/utils/loss.py#L127
        # The IoU-style score is clamped at 0 before being used as a target.
        true_objectness_probs = K.maximum(iou, 0)

        if use_focal_obj_loss:
            # Focal loss for objectness confidence
            confidence_loss = confidence_balance_weights[i] * sigmoid_focal_loss(true_objectness_probs, raw_pred[...,4:5])
        else:
            # Positive cells are trained towards the IoU target, negative
            # cells towards object_mask (i.e. 0); no ignore mask is applied.
            #confidence_loss = K.binary_crossentropy(true_objectness_probs, raw_pred[...,4:5], from_logits=True) * confidence_balance_weights[i]
            confidence_loss = confidence_balance_weights[i] * (object_mask * K.binary_crossentropy(true_objectness_probs, raw_pred[...,4:5], from_logits=True)+ \
                (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True))# * ignore_mask

        if use_focal_loss:
            # Focal loss for classification score
            # NOTE(review): unlike the non-focal branch, this result is not
            # multiplied by object_mask here -- confirm the focal helpers
            # are meant to cover all cells.
            if use_softmax_loss:
                class_loss = softmax_focal_loss(true_class_probs, raw_pred[...,5:])
            else:
                class_loss = sigmoid_focal_loss(true_class_probs, raw_pred[...,5:])
        else:
            if use_softmax_loss:
                # use softmax style classification output
                class_loss = object_mask * K.expand_dims(K.categorical_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True), axis=-1)
            else:
                # use sigmoid style classification output
                class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True)

        # Reduce each term to a scalar: gain * sum over all elements,
        # divided by the batch size.
        confidence_loss = confidence_loss_gain * K.sum(confidence_loss) / batch_size_f
        class_loss = class_loss_gain * K.sum(class_loss) / batch_size_f
        location_loss = box_loss_gain * K.sum(location_loss) / batch_size_f

        loss += location_loss + confidence_loss + class_loss
        total_location_loss += location_loss
        total_confidence_loss += confidence_loss
        total_class_loss += class_loss

    # Fit for tf 2.0.0 loss shape
    loss = K.expand_dims(loss, axis=-1)

    return loss, total_location_loss, total_confidence_loss, total_class_loss
def call(self, x, mask=None):
    '''
    Return an anchor box tensor based on the shape of the input tensor.

    The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.

    Note that this tensor does not participate in any graph computations at runtime. It is being created
    as a constant once during graph creation and is just being output along with the rest of the model output
    during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
    to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.

    Arguments:
        x (tensor): 4D tensor of shape `(batch, height, width, channels)` if
            `K.image_data_format()` is `'channels_last'`, otherwise
            `(batch, channels, height, width)`. The spatial dimensions are read from
            `x.shape` and used as NumPy sizes, so they must be statically known.
            The input for this layer must be the output of the localization predictor layer.
        mask: Not used.

    Returns:
        A float32 tensor of shape `(batch, feature_map_height, feature_map_width, n_boxes, 8)`.
        The last axis holds the four box coordinates (in the format selected by
        `self.coords`) followed by the four values of `self.variances`.
    '''

    # Compute box width and height for each aspect ratio
    # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
    size = min(self.img_height, self.img_width)
    # Compute the box widths and heights for all aspect ratios
    wh_list = []
    for ar in self.aspect_ratios:
        if (ar == 1):
            # Compute the regular anchor box for aspect ratio 1.
            box_height = box_width = self.this_scale * size
            wh_list.append((box_width, box_height))
            if self.two_boxes_for_ar1:
                # Compute one slightly larger version using the geometric mean of this scale value and the next.
                box_height = box_width = np.sqrt(
                    self.this_scale * self.next_scale) * size
                wh_list.append((box_width, box_height))
        else:
            box_height = self.this_scale * size / np.sqrt(ar)
            box_width = self.this_scale * size * np.sqrt(ar)
            wh_list.append((box_width, box_height))
    wh_list = np.array(wh_list)

    # We need the shape of the input tensor
    if K.image_data_format() == 'channels_last':
        batch_size, feature_map_height, feature_map_width, feature_map_channels = x.shape
    else:  # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future
        batch_size, feature_map_channels, feature_map_height, feature_map_width = x.shape

    # Compute the grid of box center points. They are identical for all aspect ratios.

    # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
    # NOTE: if `this_steps` is neither None, a 2-element list/tuple nor a number,
    # neither branch assigns the step variables and the code below fails with a NameError.
    if (self.this_steps is None):
        step_height = self.img_height / feature_map_height
        step_width = self.img_width / feature_map_width
    else:
        if isinstance(self.this_steps,
                      (list, tuple)) and (len(self.this_steps) == 2):
            step_height = self.this_steps[0]
            step_width = self.this_steps[1]
        elif isinstance(self.this_steps, (int, float)):
            step_height = self.this_steps
            step_width = self.this_steps
    # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
    # Offsets are expressed in units of the step size (default: half a step).
    # NOTE: the same fall-through caveat as for `this_steps` applies to `this_offsets`.
    if (self.this_offsets is None):
        offset_height = 0.5
        offset_width = 0.5
    else:
        if isinstance(self.this_offsets,
                      (list, tuple)) and (len(self.this_offsets) == 2):
            offset_height = self.this_offsets[0]
            offset_width = self.this_offsets[1]
        elif isinstance(self.this_offsets, (int, float)):
            offset_height = self.this_offsets
            offset_width = self.this_offsets
    # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
    cy = np.linspace(offset_height * step_height,
                     (offset_height + feature_map_height - 1) * step_height,
                     feature_map_height)
    cx = np.linspace(offset_width * step_width,
                     (offset_width + feature_map_width - 1) * step_width,
                     feature_map_width)
    cx_grid, cy_grid = np.meshgrid(cx, cy)
    cx_grid = np.expand_dims(
        cx_grid, -1
    )  # This is necessary for np.tile() to do what we want further down
    cy_grid = np.expand_dims(
        cy_grid, -1
    )  # This is necessary for np.tile() to do what we want further down

    # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
    # where the last dimension will contain `(cx, cy, w, h)`
    boxes_tensor = np.zeros(
        (feature_map_height, feature_map_width, self.n_boxes, 4))

    boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))  # Set cx
    boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))  # Set cy
    boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
    boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

    # Convert `(cx, cy, w, h)` to corner format. The clipping and normalization
    # below treat indices 0 and 2 as x and indices 1 and 3 as y, i.e. the
    # layout is `(xmin, ymin, xmax, ymax)`.
    boxes_tensor = convert_coordinates(boxes_tensor,
                                       start_index=0,
                                       conversion='centroids2corners')

    # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
    if self.clip_boxes:
        # Fancy indexing returns a copy, so the clipped values are written back explicitly.
        x_coords = boxes_tensor[:, :, :, [0, 2]]
        x_coords[x_coords >= self.img_width] = self.img_width - 1
        x_coords[x_coords < 0] = 0
        boxes_tensor[:, :, :, [0, 2]] = x_coords
        y_coords = boxes_tensor[:, :, :, [1, 3]]
        y_coords[y_coords >= self.img_height] = self.img_height - 1
        y_coords[y_coords < 0] = 0
        boxes_tensor[:, :, :, [1, 3]] = y_coords

    # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
    if self.normalize_coords:
        boxes_tensor[:, :, :, [0, 2]] /= self.img_width
        boxes_tensor[:, :, :, [1, 3]] /= self.img_height

    # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
    # Any other value of `self.coords` leaves the boxes in corner format.
    if self.coords == 'centroids':
        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='corners2centroids',
                                           border_pixels='half')
    elif self.coords == 'minmax':
        # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='corners2minmax',
                                           border_pixels='half')

    # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
    # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
    variances_tensor = np.zeros_like(
        boxes_tensor
    )  # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
    variances_tensor += self.variances  # Long live broadcasting
    # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
    boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)

    # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along
    # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
    # The batch size is taken from the dynamic shape of `x`, so it may be unknown at graph-construction time.
    boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
    boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                          (K.shape(x)[0], 1, 1, 1, 1))

    return boxes_tensor
def call(self, inputs):
    """Sample from a Gaussian given its mean and log-variance.

    `inputs` is a pair ``(mu, log_var)``. The result is
    ``mu + exp(log_var / 2) * epsilon`` with standard-normal ``epsilon``
    of the same shape as ``mu``.
    """
    mean, log_variance = inputs
    # The noise is created first, mirroring the original op order.
    noise = K.random_normal(shape=K.shape(mean), mean=0., stddev=1.)
    std_dev = K.exp(log_variance / 2)
    return mean + std_dev * noise
def call(self, x, mask=None):
    """Pool a fixed-size grid out of every region of interest.

    `x` is a two-element sequence ``(img, rois)``. Only the first batch
    entry of `rois` is read; each RoI is addressed as ``(x, y, w, h)``.
    With ``dim_ordering == 'th'`` every RoI is max-pooled cell by cell,
    with ``'tf'`` the RoI crop is resized to ``pool_size x pool_size``.
    The pooled pieces are concatenated, reshaped to
    ``(1, num_rois, pool_size, pool_size, nb_channels)`` and permuted
    according to the dimension ordering.
    """
    assert (len(x) == 2)

    feature_map, roi_batch = x[0], x[1]
    map_shape = K.shape(feature_map)
    grid_size = self.pool_size

    pooled = []
    for idx in range(self.num_rois):
        left = roi_batch[0, idx, 0]
        top = roi_batch[0, idx, 1]
        width = roi_batch[0, idx, 2]
        height = roi_batch[0, idx, 3]

        # Size of one pooling cell along x and y.
        cell_w = width / float(self.pool_size)
        cell_h = height / float(self.pool_size)

        # NOTE: the RoiPooling implementation differs between theano and
        # tensorflow due to the lack of a resize op in theano. The theano
        # implementation is much less efficient and leads to long compile times
        if self.dim_ordering == 'th':
            for row in range(grid_size):
                for col in range(grid_size):
                    x_start = left + col * cell_w
                    x_stop = x_start + cell_w
                    y_start = top + row * cell_h
                    y_stop = y_start + cell_h

                    x_start = K.cast(x_start, 'int32')
                    x_stop = K.cast(x_stop, 'int32')
                    y_start = K.cast(y_start, 'int32')
                    y_stop = K.cast(y_stop, 'int32')

                    # Make every cell at least one pixel wide and high.
                    x_stop = x_start + K.maximum(1, x_stop - x_start)
                    y_stop = y_start + K.maximum(1, y_stop - y_start)

                    cell_shape = [
                        map_shape[0], map_shape[1],
                        y_stop - y_start, x_stop - x_start
                    ]
                    cell = feature_map[:, :, y_start:y_stop, x_start:x_stop]
                    cell = K.reshape(cell, cell_shape)
                    pooled.append(K.max(cell, axis=(2, 3)))

        elif self.dim_ordering == 'tf':
            col0 = K.cast(left, 'int32')
            row0 = K.cast(top, 'int32')
            span_w = K.cast(width, 'int32')
            span_h = K.cast(height, 'int32')

            crop = feature_map[:, row0:row0 + span_h, col0:col0 + span_w, :]
            resized = tf.image.resize_images(
                crop, (self.pool_size, self.pool_size))
            pooled.append(resized)

    stacked = K.concatenate(pooled, axis=0)
    stacked = K.reshape(stacked,
                        (1, self.num_rois, self.pool_size, self.pool_size,
                         self.nb_channels))

    if self.dim_ordering == 'th':
        axis_order = (0, 1, 4, 2, 3)
    else:
        axis_order = (0, 1, 2, 3, 4)
    return K.permute_dimensions(stacked, axis_order)
def yolo2_loss(args, anchors, num_classes, label_smoothing=0, elim_grid_sense=False, use_crossentropy_loss=False, use_crossentropy_obj_loss=False, rescore_confidence=False, use_giou_loss=False, use_diou_loss=False):
    """
    YOLOv2 loss function.

    Parameters
    ----------
    args : pair ``(yolo_output, y_true)``
        ``yolo_output`` is the raw model output passed to yolo2_decode.
        ``y_true`` is the ground truth; channels 0-3 are read as the box,
        channel 4 as the object mask and channel 5 as the class index.

    anchors : tensor
        Anchor boxes for model.

    num_classes : int
        Number of object classes.

    label_smoothing : float, default=0
        If truthy, the one-hot class targets are smoothed with
        _smooth_labels.

    elim_grid_sense : bool, default=False
        If true, yolo2_decode is called with ``scale_x_y=1.05``,
        otherwise with ``None``.

    use_crossentropy_loss : bool, default=False
        Use categorical cross-entropy instead of squared error for the
        classification term.

    use_crossentropy_obj_loss : bool, default=False
        Use binary cross-entropy instead of squared error for the
        confidence term.

    rescore_confidence : bool, default=False
        If true then set confidence target to IOU of best predicted box with
        the closest matching ground truth box.

    use_giou_loss, use_diou_loss : bool, default=False
        Select GIoU (checked first) or DIoU as location loss; when both
        are false the squared-error box loss is used.

    Returns
    -------
    tuple ``(total_loss, location_loss_sum, confidence_loss_sum,
    classification_loss_sum)``. ``total_loss`` is half the sum of the
    three terms, with one trailing axis added; each term is summed over
    all elements and divided by the batch size.

    """
    (yolo_output, y_true) = args
    # NOTE: num_anchors is not used in the rest of this function.
    num_anchors = len(anchors)
    scale_x_y = 1.05 if elim_grid_sense else None
    yolo_output_shape = K.shape(yolo_output)
    # The input size is derived from the output grid assuming a stride of 32.
    input_shape = K.cast(yolo_output_shape[1:3] * 32, K.dtype(y_true))
    grid_shape = K.cast(yolo_output_shape[1:3], K.dtype(y_true)) # height, width
    batch_size_f = K.cast(yolo_output_shape[0], K.dtype(yolo_output)) # batch size, float tensor
    # Weights of the individual loss terms.
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    location_scale = 1

    grid, raw_pred, pred_xy, pred_wh = yolo2_decode(
        yolo_output, anchors, num_classes, input_shape, scale_x_y=scale_x_y, calc_loss=True)
    pred_confidence = K.sigmoid(raw_pred[..., 4:5])
    pred_class_prob = K.softmax(raw_pred[..., 5:])

    object_mask = y_true[..., 4:5]

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_boxes = K.concatenate([pred_xy, pred_wh])
    pred_boxes = K.expand_dims(pred_boxes, 4)

    raw_true_boxes = y_true[...,0:4]
    raw_true_boxes = K.expand_dims(raw_true_boxes, 4)

    iou_scores = box_iou(pred_boxes, raw_true_boxes)
    # NOTE(review): squeezing axis 0 presumes box_iou returns a leading
    # singleton axis -- confirm against box_iou.
    iou_scores = K.squeeze(iou_scores, axis=0)

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # Determine confidence weights from object and no_object weights.
    # NOTE: YOLOv2 does not use binary cross-entropy. Here we try it.
    # Only cells that neither hold an object nor overlap one above the
    # threshold are penalized as background.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - object_mask))
    if use_crossentropy_obj_loss:
        no_objects_loss = no_object_weights * K.binary_crossentropy(K.zeros(K.shape(pred_confidence)), pred_confidence, from_logits=False)

        if rescore_confidence:
            objects_loss = (object_scale * object_mask *
                            K.binary_crossentropy(best_ious, pred_confidence, from_logits=False))
        else:
            objects_loss = (object_scale * object_mask *
                            K.binary_crossentropy(K.ones(K.shape(pred_confidence)), pred_confidence, from_logits=False))
    else:
        no_objects_loss = no_object_weights * K.square(-pred_confidence)

        if rescore_confidence:
            objects_loss = (object_scale * object_mask *
                            K.square(best_ious - pred_confidence))
        else:
            objects_loss = (object_scale * object_mask *
                            K.square(1 - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLOv2 does not use categorical cross-entropy loss.
    #       Here we try it.
    matching_classes = K.cast(y_true[..., 5], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)

    if label_smoothing:
        matching_classes = _smooth_labels(matching_classes, label_smoothing)

    if use_crossentropy_loss:
        classification_loss = (class_scale * object_mask *
                               K.expand_dims(K.categorical_crossentropy(matching_classes, pred_class_prob, from_logits=False), axis=-1))
    else:
        classification_loss = (class_scale * object_mask *
                               K.square(matching_classes - pred_class_prob))

    if use_giou_loss:
        # Calculate GIoU loss as location loss
        giou = box_giou(raw_true_boxes, pred_boxes)
        giou = K.squeeze(giou, axis=-1)
        giou_loss = location_scale * object_mask * (1 - giou)
        location_loss = giou_loss
    elif use_diou_loss:
        # Calculate DIoU loss as location loss
        diou = box_diou(raw_true_boxes, pred_boxes)
        diou = K.squeeze(diou, axis=-1)
        diou_loss = location_scale * object_mask * (1 - diou)
        location_loss = diou_loss
    else:
        # YOLOv2 location loss for matching detection boxes.
        # Darknet trans box to calculate loss.
        trans_true_xy = y_true[..., :2]*grid_shape[::-1] - grid
        trans_true_wh = K.log(y_true[..., 2:4] / anchors * input_shape[::-1])
        trans_true_wh = K.switch(object_mask, trans_true_wh, K.zeros_like(trans_true_wh)) # avoid log(0)=-inf
        trans_true_boxes = K.concatenate([trans_true_xy, trans_true_wh])

        # Unadjusted box predictions for loss.
        trans_pred_boxes = K.concatenate(
            (K.sigmoid(raw_pred[..., 0:2]), raw_pred[..., 2:4]), axis=-1)

        location_loss = (location_scale * object_mask *
                         K.square(trans_true_boxes - trans_pred_boxes))

    # Reduce every term to a scalar: sum over all elements / batch size.
    confidence_loss_sum = K.sum(confidence_loss) / batch_size_f
    classification_loss_sum = K.sum(classification_loss) / batch_size_f
    location_loss_sum = K.sum(location_loss) / batch_size_f
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + location_loss_sum)

    # Fit for tf 2.0.0 loss shape
    total_loss = K.expand_dims(total_loss, axis=-1)

    return total_loss, location_loss_sum, confidence_loss_sum, classification_loss_sum
def hw_flatten(x):
    """Merge axes 1 and 2 of a 4-D tensor into a single axis.

    The result has shape ``(d0, d1 * d2, d3)`` where ``d0..d3`` are the
    dynamic dimensions of `x`.
    """
    dims = K.shape(x)
    merged = dims[1] * dims[2]
    return K.reshape(x, shape=[dims[0], merged, dims[3]])
def yolo_eval(yolo_outputs, anchors, num_classes, image_shape, max_boxes=20, score_threshold=.6, iou_threshold=.5, eager=False):
    """Turn raw YOLO feature maps into per-class NMS-filtered detections.

    When `eager` is true, the last element of `yolo_outputs` is taken as
    the image shape (overriding `image_shape`) and the remaining elements
    are the feature layers. Returns ``(boxes, scores, classes)``
    concatenated over all classes.
    """
    if eager:
        image_shape = K.reshape(yolo_outputs[-1], [-1])
        layer_count = len(yolo_outputs) - 1
    else:
        # Number of feature layers.
        layer_count = len(yolo_outputs)

    # Feature layer 1 uses anchors 6, 7, 8;
    # feature layer 2 uses anchors 3, 4, 5;
    # feature layer 3 uses anchors 0, 1, 2.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    input_shape = K.shape(yolo_outputs[0])[1:3] * 32

    # Decode every feature layer into boxes and per-class scores.
    decoded = [
        yolo_boxes_and_scores(yolo_outputs[idx],
                              anchors[anchor_mask[idx]],
                              num_classes, input_shape, image_shape)
        for idx in range(layer_count)
    ]
    layer_boxes = [pair[0] for pair in decoded]
    layer_scores = [pair[1] for pair in decoded]

    # Stack the results of all feature layers.
    boxes = K.concatenate(layer_boxes, axis=0)
    box_scores = K.concatenate(layer_scores, axis=0)

    mask = box_scores >= score_threshold
    max_boxes_tensor = K.constant(max_boxes, dtype='int32')

    kept_boxes = []
    kept_scores = []
    kept_classes = []
    for class_id in range(num_classes):
        # Keep the boxes and scores with box_scores >= score_threshold.
        class_mask = mask[:, class_id]
        candidate_boxes = tf.boolean_mask(boxes, class_mask)
        candidate_scores = tf.boolean_mask(box_scores[:, class_id], class_mask)

        # Non-maximum suppression: drop boxes that overlap a better one.
        selected = tf.image.non_max_suppression(candidate_boxes,
                                                candidate_scores,
                                                max_boxes_tensor,
                                                iou_threshold=iou_threshold)

        # Collect the survivors: box position, score and class id.
        candidate_boxes = K.gather(candidate_boxes, selected)
        candidate_scores = K.gather(candidate_scores, selected)
        class_ids = K.ones_like(candidate_scores, 'int32') * class_id

        kept_boxes.append(candidate_boxes)
        kept_scores.append(candidate_scores)
        kept_classes.append(class_ids)

    boxes_ = K.concatenate(kept_boxes, axis=0)
    scores_ = K.concatenate(kept_scores, axis=0)
    classes_ = K.concatenate(kept_classes, axis=0)

    return boxes_, scores_, classes_
def __call__(self, labels, outputs, anchors, num_classes, ignore_thresh=.5,
             label_smoothing=0, elim_grid_sense=True, use_focal_loss=False,
             use_focal_obj_loss=False, use_softmax_loss=False,
             use_giou_loss=False, use_diou_loss=True):  # pylint: disable=R0915
    """
    YOLOv3 loss function.

    :param labels: mapping whose values are the ground-truth tensors, one per
        prediction head (only ``.values()`` is read, in iteration order)
    :param outputs: mapping whose values are the model output tensors, one per
        prediction head (only ``.values()`` is read, in iteration order)
    :param anchors: anchor sizes; converted to a float array and reshaped to (N, 2)
    :param num_classes: integer
    :param ignore_thresh: float, predictions whose best IoU with every true box
        is below this value contribute to the no-object confidence loss
    :param label_smoothing: if truthy, class and objectness targets are smoothed
    :param elim_grid_sense: if true, per-head ``scale_x_y`` values are passed to
        ``yolo3_decode``; otherwise ``None`` is passed
    :param use_focal_loss: use a focal loss for the classification term
    :param use_focal_obj_loss: use a sigmoid focal loss for the objectness term
    :param use_softmax_loss: softmax-style instead of sigmoid-style class loss
    :param use_giou_loss: not read by this method
    :param use_diou_loss: not read by this method; the DIoU location loss is
        always used
    :return: tuple ``(loss, total_location_loss, total_confidence_loss,
        total_class_loss)``; ``loss`` has one trailing axis added
    """
    anchors = np.array(anchors).astype(float).reshape(-1, 2)
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = list(outputs.values())  # args[:num_layers]
    y_true = list(labels.values())  # args[num_layers:]
    # NOTE: the mask and the scale list are hard-coded for three heads.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    scale_x_y = [1.05, 1.1, 1.2] if elim_grid_sense else [None, None, None]

    # Input size is derived from the first head assuming a stride of 32.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    loss = 0
    total_location_loss = 0
    total_confidence_loss = 0
    total_class_loss = 0
    batch_size = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    batch_size_f = K.cast(batch_size, K.dtype(yolo_outputs[0]))

    for i in range(num_layers):
        # Channel 4 of the ground truth is the object mask, channels 5+ the
        # class targets, channels 0-3 the box.
        object_mask = y_true[i][..., 4:5]
        true_class_probs = y_true[i][..., 5:]
        if label_smoothing:
            true_class_probs = self._smooth_labels(true_class_probs,
                                                   label_smoothing)
            true_objectness_probs = self._smooth_labels(
                object_mask, label_smoothing)
        else:
            true_objectness_probs = object_mask

        raw_pred, pred_xy, pred_wh = self.yolo3_decode(
            yolo_outputs[i], anchors[anchor_mask[i]], num_classes,
            input_shape, scale_x_y=scale_x_y[i])
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Weight that grows as the product of box channels 2 and 3 shrinks
        # (presumably normalized width * height -- TODO confirm).
        box_loss_scale = 2 - y_true[i][..., 2:3] * y_true[i][..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # True boxes of sample b, selected by its object mask.
            true_box = tf.boolean_mask(y_true[i][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = self.box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            # 1 where the prediction does not overlap any true box enough.
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *args: b < batch_size,
                                       loop_body, [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # NOTE(review): epsilon is added to the raw predictions, which are
        # then used as logits -- intent unclear, confirm.
        raw_pred = raw_pred + K.epsilon()
        if use_focal_obj_loss:
            # Focal loss for objectness confidence
            confidence_loss = self.sigmoid_focal_loss(
                true_objectness_probs, raw_pred[..., 4:5])
        else:
            # Object cells use the objectness target; background cells are
            # only penalized where ignore_mask is set.
            confidence_loss = (object_mask * K.binary_crossentropy(true_objectness_probs, raw_pred[...,4:5], from_logits=True)) \
                + ((1-object_mask) * ignore_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True))

        if use_focal_loss:
            # Focal loss for classification score
            if use_softmax_loss:
                class_loss = self.softmax_focal_loss(
                    true_class_probs, raw_pred[..., 5:])
            else:
                class_loss = self.sigmoid_focal_loss(
                    true_class_probs, raw_pred[..., 5:])
        else:
            if use_softmax_loss:
                # use softmax style classification output
                class_loss = object_mask \
                    * K.expand_dims(K.categorical_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True), axis=-1)
            else:
                # use sigmoid style classification output
                class_loss = object_mask \
                    * K.binary_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True)

        # Location loss: DIoU, regardless of use_giou_loss / use_diou_loss.
        raw_true_box = y_true[i][..., 0:4]
        diou = self.box_diou(raw_true_box, pred_box)
        diou_loss = object_mask * box_loss_scale * (1 - diou)
        diou_loss = K.sum(diou_loss) / batch_size_f
        location_loss = diou_loss

        # Reduce the remaining terms: sum over all elements / batch size.
        confidence_loss = K.sum(confidence_loss) / batch_size_f
        class_loss = K.sum(class_loss) / batch_size_f
        loss += location_loss + confidence_loss + class_loss
        total_location_loss += location_loss
        total_confidence_loss += confidence_loss
        total_class_loss += class_loss

    loss = K.expand_dims(loss, axis=-1)

    return loss, total_location_loss, total_confidence_loss, total_class_loss
def sampling(args):
    """Reparameterized draw from a Gaussian.

    `args` is a pair ``(z_mean, z_log_var)``; the result is
    ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is
    random-normal noise shaped like ``z_mean``.
    """
    mean, log_variance = args
    # The noise is created first, mirroring the original op order.
    noise = K.random_normal(shape=K.shape(mean))
    std_dev = K.exp(0.5 * log_variance)
    return mean + std_dev * noise
def yolo_loss(inputs, num_anchors):
    """Compute the YOLO loss summed over every detection scale.

    Relies on the module-level names ``anchors``, ``num_classes`` and
    ``calc_iou``.

    Args:
        inputs: Sequence whose first ``num_anchors // 3`` entries are the
            model predictions (one per scale) and whose remaining entries are
            the matching ground-truth tensors.
        num_anchors: Total number of anchors; three anchors are used per scale.

    Returns:
        Sum over scales of the xy, wh, confidence and class losses, each
        summed over all elements and divided by the batch size.
    """
    iou_ignore_limit = .5  # best-IoU threshold used to build the ignore mask
    scale_count = num_anchors // 3
    predictions = inputs[:scale_count]
    targets = inputs[scale_count:]
    # Anchor indices that belong to each scale.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    target_dtype = K.dtype(targets[0])
    # Input size is taken as 32x the first scale's grid (e.g. 13 * 32 = 416).
    input_shape = K.cast(K.shape(predictions[0])[1:3] * 32, target_dtype)
    # Spatial size of each scale's grid, cast to the target dtype.
    grid_shapes = [
        K.cast(K.shape(predictions[idx])[1:3], target_dtype)
        for idx in range(scale_count)
    ]

    m = K.shape(predictions[0])[0]  # dynamic batch size
    batch_size = K.cast(m, K.dtype(predictions[0]))

    loss = 0
    for scale in range(scale_count):
        truth = targets[scale]
        raw = predictions[scale]

        # Channel 4 is the objectness flag, channels 5+ are the class targets.
        object_mask = truth[..., 4:5]
        true_class = truth[..., 5:]

        scale_anchors = anchors[anchor_mask[scale]]
        num_sub_anchors = len(scale_anchors)
        anchors_tensor = K.reshape(K.constant(scale_anchors),
                                   [1, 1, 1, num_sub_anchors, 2])

        # Build the grid of cell indices: last axis holds (x index, y index).
        grid_shape = K.shape(raw)[1:3]
        row_index = K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1])
        col_index = K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1])
        grid_y = K.tile(row_index, [1, grid_shape[1], 1, 1])
        grid_x = K.tile(col_index, [grid_shape[0], 1, 1, 1])
        grid = K.cast(K.concatenate([grid_x, grid_y]), K.dtype(raw))

        feats = K.reshape(raw, [-1, grid_shape[0], grid_shape[1],
                                num_sub_anchors, num_classes + 5])
        feats_dtype = K.dtype(feats)

        # Box-centre loss: decode the prediction, encode the target back to
        # per-cell offsets, and compare with cross-entropy on the raw logits.
        pred_xy = ((K.sigmoid(feats[..., :2]) + grid)
                   / K.cast(grid_shape[::-1], feats_dtype))
        true_xy = truth[..., :2] * grid_shapes[scale][::-1] - grid
        # Larger boxes get a smaller weight.
        box_loss_scale = 2 - truth[..., 2:3] * truth[..., 3:4]
        xy_loss = (object_mask * box_loss_scale
                   * K.binary_crossentropy(true_xy, feats[..., 0:2],
                                           from_logits=True))

        # Box-size loss: squared error in log-space relative to the anchors.
        pred_wh = (K.exp(feats[..., 2:4]) * anchors_tensor
                   / K.cast(input_shape[::-1], feats_dtype))
        true_wh = K.log(truth[..., 2:4] / scale_anchors * input_shape[::-1])
        # Keep the encoded size only where an object exists.
        true_wh = K.switch(object_mask, true_wh, K.zeros_like(true_wh))
        wh_loss = (object_mask * box_loss_scale * 0.5
                   * K.square(true_wh - feats[..., 2:4]))

        pred_box = K.concatenate([pred_xy, pred_wh])

        # Per-sample mask that is 1 where the best IoU between a predicted box
        # and any ground-truth box of that sample is below the threshold.
        ignore_mask = tf.TensorArray(target_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # Ground-truth boxes (first four channels) of sample b.
            true_box = tf.boolean_mask(truth[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = calc_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < iou_ignore_limit, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body,
                                       [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # Objectness loss: cells with an object always count, cells without
        # an object count only where the ignore mask is 1.
        objectness_bce = K.binary_crossentropy(object_mask, feats[..., 4:5],
                                               from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)

        class_loss = object_mask * K.binary_crossentropy(
            true_class, feats[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / batch_size
        wh_loss = K.sum(wh_loss) / batch_size
        confidence_loss = K.sum(confidence_loss) / batch_size
        class_loss = K.sum(class_loss) / batch_size
        loss += xy_loss + wh_loss + confidence_loss + class_loss

    return loss
def samplingfun(z_mean, z_log_var):
    """Sample ``z_mean + exp(z_log_var / 2) * noise`` with standard-normal noise.

    The noise has shape ``(batch, latent_dim)`` where ``batch`` is the dynamic
    first dimension of ``z_mean`` and ``latent_dim`` is a module-level name.
    """
    batch = K.shape(z_mean)[0]
    noise = K.random_normal(shape=(batch, latent_dim), mean=0., stddev=1.0)
    std_dev = K.exp(z_log_var / 2)
    return z_mean + std_dev * noise
def call(self, prob):
    """Draw a relaxed categorical sample from ``prob`` using Gumbel noise.

    Gumbel(0, 1) noise with the shape of ``prob`` is added to ``log(prob)``,
    the sum is divided by ``self.temp`` and normalised over the last axis.

    Args:
        prob: Tensor of probabilities; the last axis is the one normalised.

    Returns:
        Tensor with the same shape as ``prob`` whose last axis sums to 1.
    """
    # NOTE(review): `tfp` presumably aliases tensorflow_probability's
    # distributions module -- confirm against the file's imports.
    gumbel_dist = tfp.Gumbel(0, 1)
    gumbel_sample = gumbel_dist.sample(K.shape(prob))
    logits = (K.log(prob) + gumbel_sample) / self.temp
    # softmax(x) == exp(x) / sum(exp(x)) over the last axis, but it is
    # evaluated in a numerically stable way, so a small temperature no longer
    # overflows exp() and produces inf/NaN.
    return K.softmax(logits)
def _infer_network_outputs(*, sess, restored_model, num_of_anchors, anchors,
                           orig_image_width, orig_image_height,
                           model_image_width, model_image_height, img_np,
                           verbose):
    """Decode every output head of the model and evaluate them in ``sess``.

    For each output head the raw tensor is reshaped per anchor, decoded into
    box centres/sizes and per-class scores (objectness * class probability),
    and the boxes are passed through ``get_corrected_boxes``. The per-head
    results are concatenated along axis 0 and run in the session with
    ``img_np`` fed to the model input and the learning phase set to 0.

    Args:
        sess: Session used to evaluate the tensors.
        restored_model: Model whose ``output`` heads are decoded.
        num_of_anchors: Number of anchors per grid cell.
        anchors: Indexable by head index; multiplied into the decoded sizes.
        orig_image_width, orig_image_height: Forwarded to
            ``get_corrected_boxes`` as ``orig_image_shape``.
        model_image_width, model_image_height: Forwarded to
            ``get_corrected_boxes`` as ``model_image_shape``.
        img_np: Value fed to ``restored_model.input``.
        verbose: When truthy, print how long graph construction and the
            session run took.

    Returns:
        Tuple ``(boxes, out_boxes_classes)`` as returned by ``sess.run``.
    """
    start = time.time()
    head_boxes = []
    head_scores = []

    for head_idx, head in enumerate(restored_model.output):
        head_shape = K.shape(head)
        num_rows = head_shape[1]
        num_cols = head_shape[2]

        # NOTE(review): the reshape lists the column count before the row
        # count while the grid is built as (shape[1], shape[2]); this only
        # coincides for square heads -- confirm it is intended.
        head_feats = K.reshape(head, [
            -1, num_cols, num_rows, num_of_anchors,
            NUM_OF_BOX_PARAMS + NUM_OF_CLASSES
        ])
        feats_dtype = K.dtype(head_feats)

        grid = K.cast(construct_grid(head_shape[1], head_shape[2]),
                      dtype=feats_dtype)
        grid_size = K.cast([num_cols, num_rows], dtype=feats_dtype)

        boxes_xy = (K.sigmoid(head_feats[..., :2]) + grid) / grid_size
        boxes_wh = K.exp(head_feats[..., 2:4]) * anchors[head_idx]
        objectness = K.sigmoid(head_feats[..., 4:5])
        class_probs = K.sigmoid(head_feats[..., 5:])
        detected_class_prob = objectness * class_probs

        head_boxes.append(
            get_corrected_boxes(
                box_width=boxes_wh[..., 0:1],
                box_height=boxes_wh[..., 1:2],
                box_x=boxes_xy[..., 0:1],
                box_y=boxes_xy[..., 1:2],
                orig_image_shape=(orig_image_width, orig_image_height),
                model_image_shape=(model_image_width, model_image_height)))
        head_scores.append(
            K.reshape(detected_class_prob, [-1, NUM_OF_CLASSES]))

    prob_class = K.concatenate(head_scores, axis=0)
    boxes = K.concatenate(head_boxes, axis=0)
    out_tensors = [boxes, prob_class]

    if verbose:
        print(f'Took {time.time() - start} seconds to construct network.')

    start = time.time()
    feed = {restored_model.input: img_np, K.learning_phase(): 0}
    sess_out = sess.run(out_tensors, feed_dict=feed)

    if verbose:
        print(
            f'Took {time.time() - start} seconds to infer outputs in session.')

    boxes, out_boxes_classes = sess_out
    return boxes, out_boxes_classes
def gaussian_sample(mu, log_var):
    """Return ``mu + exp(0.5 * log_var) * noise``.

    ``noise`` is drawn with ``K.random_normal`` using the dynamic shape of
    ``mu``.
    """
    noise = K.random_normal(shape=K.shape(mu))
    std_dev = K.exp(0.5 * log_var)
    return mu + std_dev * noise
def yolo_loss(args,
              anchors,
              num_classes,
              rescore_confidence=False,
              print_loss=False):
    """YOLO localization loss function.

    Parameters
    ----------
    args : tuple
        ``(yolo_output, true_boxes, detectors_mask, matching_true_boxes)``:

        yolo_output : tensor
            Final convolutional layer features.
        true_boxes : tensor
            Ground truth boxes tensor with shape [batch, num_true_boxes, 5]
            containing box x_center, y_center, width, height, and class.
        detectors_mask : array
            0/1 mask for detector positions where there is a matching ground
            truth.
        matching_true_boxes : array
            Corresponding ground truth boxes for positive detector positions.
            Already adjusted for conv height and width.
    anchors : tensor
        Anchor boxes for model.
    num_classes : int
        Number of object classes.
    rescore_confidence : bool, default=False
        If true then set confidence target to IOU of best predicted box with
        the closest matching ground truth box.
    print_loss : bool, default=False
        If True then use a tf.Print() to print the loss components.

    Returns
    -------
    total_loss : tensor
        0.5 * (confidence + classification + coordinate) squared-error sums
        over the whole minibatch.
    """
    (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args
    num_anchors = len(anchors)
    object_scale = 5
    no_object_scale = 1
    class_scale = 1
    coordinates_scale = 1
    # yolo_head returns (confidence, xy, wh, class_probs) in this file, so the
    # unpacking must follow that order; the previous (xy, wh, confidence, ...)
    # order silently swapped the confidence and box tensors.
    pred_confidence, pred_xy, pred_wh, pred_class_prob = yolo_head(
        yolo_output, anchors, num_classes)

    # Unadjusted box predictions for loss.
    # TODO: Remove extra computation shared with yolo_head.
    yolo_output_shape = K.shape(yolo_output)
    feats = K.reshape(yolo_output, [
        -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors,
        num_classes + 5
    ])
    pred_boxes = K.concatenate(
        (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1)

    # TODO: Adjust predictions by image width/height for non-square images?
    # IOUs may be off due to different aspect ratio.

    # Expand pred x,y,w,h to allow comparison with ground truth.
    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    pred_xy = K.expand_dims(pred_xy, 4)
    pred_wh = K.expand_dims(pred_wh, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    true_boxes_shape = K.shape(true_boxes)

    # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params
    true_boxes = K.reshape(true_boxes, [
        true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2]
    ])
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Find IOU of each predicted box with each ground truth box.
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    intersect_mins = K.maximum(pred_mins, true_mins)
    intersect_maxes = K.minimum(pred_maxes, true_maxes)
    intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = intersect_areas / union_areas

    # Best IOUs for each location.
    best_ious = K.max(iou_scores, axis=4)  # Best IOU scores.
    best_ious = K.expand_dims(best_ious)

    # A detector has found an object if IOU > thresh for some true box.
    object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious))

    # TODO: Darknet region training includes extra coordinate loss for early
    # training steps to encourage predictions to match anchor priors.

    # Determine confidence weights from object and no_object weights.
    # NOTE: YOLO does not use binary cross-entropy here.
    no_object_weights = (no_object_scale * (1 - object_detections) *
                         (1 - detectors_mask))
    no_objects_loss = no_object_weights * K.square(-pred_confidence)

    if rescore_confidence:
        objects_loss = (object_scale * detectors_mask *
                        K.square(best_ious - pred_confidence))
    else:
        objects_loss = (object_scale * detectors_mask *
                        K.square(1 - pred_confidence))
    confidence_loss = objects_loss + no_objects_loss

    # Classification loss for matching detections.
    # NOTE: YOLO does not use categorical cross-entropy loss here.
    matching_classes = K.cast(matching_true_boxes[..., 4], 'int32')
    matching_classes = K.one_hot(matching_classes, num_classes)
    classification_loss = (class_scale * detectors_mask *
                           K.square(matching_classes - pred_class_prob))

    # Coordinate loss for matching detection boxes.
    matching_boxes = matching_true_boxes[..., 0:4]
    coordinates_loss = (coordinates_scale * detectors_mask *
                        K.square(matching_boxes - pred_boxes))

    confidence_loss_sum = K.sum(confidence_loss)
    classification_loss_sum = K.sum(classification_loss)
    coordinates_loss_sum = K.sum(coordinates_loss)
    total_loss = 0.5 * (
        confidence_loss_sum + classification_loss_sum + coordinates_loss_sum)
    if print_loss:
        total_loss = tf.Print(
            total_loss, [
                total_loss, confidence_loss_sum, classification_loss_sum,
                coordinates_loss_sum
            ],
            message='yolo_loss, conf_loss, class_loss, box_coord_loss:')

    return total_loss
def divisible_temporal_padding(x, n):
    """Right-pad ``x`` along axis 1 so that its length is divisible by ``n``.

    No padding is added when the length is already a multiple of ``n``.
    """
    remainder = K.shape(x)[1] % n
    missing = n - remainder
    # Pad by the missing amount only when there is a remainder.
    pad_right = K.switch(remainder > 0, missing, 0)
    return K.temporal_padding(x, (0, pad_right))
def yolo_head(feats, anchors, num_classes):
    """Convert final layer features to bounding box parameters.

    Parameters
    ----------
    feats : tensor
        Final convolutional layer features (channels last is assumed when
        reading the spatial dimensions).
    anchors : array-like
        Anchor box widths and heights.
    num_classes : int
        Number of target classes.

    Returns
    -------
    box_confidence : tensor
        Sigmoid of channel 4: estimate of whether each box holds an object.
    box_xy : tensor
        x, y box predictions adjusted by spatial location in conv layer.
    box_wh : tensor
        w, h box predictions adjusted by anchors and conv spatial resolution.
    box_class_probs : tensor
        Softmax over channels 5+: class distribution for each box.
    """
    num_anchors = len(anchors)
    # Anchors broadcast as batch, height, width, num_anchors, box_params.
    anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2])

    # Dynamic spatial dims so the head works for fully convolutional models.
    spatial_dims = K.shape(feats)[1:3]
    grid_height = spatial_dims[0]
    grid_width = spatial_dims[1]
    feats_dtype = K.dtype(feats)

    # In YOLO the height index is the inner most iteration.
    height_index = K.tile(K.arange(0, stop=grid_height), [grid_width])
    # Tile + transpose + flatten stands in for repeat_elements, which does
    # not support dynamic repeat counts.
    width_index = K.tile(K.expand_dims(K.arange(0, stop=grid_width), 0),
                         [grid_height, 1])
    width_index = K.flatten(K.transpose(width_index))

    cell_index = K.transpose(K.stack([height_index, width_index]))
    cell_index = K.reshape(cell_index, [1, grid_height, grid_width, 1, 2])
    cell_index = K.cast(cell_index, feats_dtype)

    anchor_feats = K.reshape(
        feats, [-1, grid_height, grid_width, num_anchors, num_classes + 5])
    conv_dims = K.cast(K.reshape(spatial_dims, [1, 1, 1, 1, 2]), feats_dtype)

    box_confidence = K.sigmoid(anchor_feats[..., 4:5])
    raw_xy = K.sigmoid(anchor_feats[..., :2])
    raw_wh = K.exp(anchor_feats[..., 2:4])
    box_class_probs = K.softmax(anchor_feats[..., 5:])

    # Adjust predictions to each spatial grid point and anchor size.
    # Note: YOLO iterates over height index before width index.
    box_xy = (raw_xy + cell_index) / conv_dims
    box_wh = raw_wh * anchors_tensor / conv_dims

    return box_confidence, box_xy, box_wh, box_class_probs
def yolo3_loss(args, anchors, num_classes, ignore_thresh=.5,
               label_smoothing=0, use_focal_loss=False,
               use_focal_obj_loss=False, use_softmax_loss=False,
               use_giou_loss=False, use_diou_loss=False):
    '''Build the YOLOv3 loss tensors.

    Parameters
    ----------
    args: list holding the yolo output tensors followed by the y_true tensors
        (``len(anchors) // 3`` of each)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object
        confidence loss
    label_smoothing: when truthy, class targets go through _smooth_labels
    use_focal_loss: use focal loss for the classification term
    use_focal_obj_loss: use sigmoid focal loss for the objectness term
    use_softmax_loss: treat class outputs as softmax instead of sigmoid
    use_giou_loss: use GIoU as the location loss
    use_diou_loss: use DIoU as the location loss (only if GIoU is off)

    Returns
    -------
    (loss, total_location_loss, total_confidence_loss, total_class_loss)
    where loss has a trailing axis added with K.expand_dims.

    '''
    layer_count = len(anchors) // 3  # default setting
    yolo_outputs = args[:layer_count]
    y_true = args[layer_count:]
    if layer_count == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [0, 1, 2]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(layer_count)
    ]

    loss = 0
    total_location_loss = 0
    total_confidence_loss = 0
    total_class_loss = 0
    batch = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    batch_f = K.cast(batch, K.dtype(yolo_outputs[0]))

    for layer in range(layer_count):
        truth = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]

        object_mask = truth[..., 4:5]
        true_class_probs = truth[..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        grid, raw_pred, pred_xy, pred_wh = yolo3_head(
            yolo_outputs[layer], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = truth[..., :2] * grid_shapes[layer][..., ::-1] - grid
        raw_true_wh = K.log(truth[..., 2:4] / layer_anchors
                            * input_shape[..., ::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - truth[..., 2:3] * truth[..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(truth[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *_: b < batch, loop_body,
                                       [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        objectness_logits = raw_pred[..., 4:5]
        class_logits = raw_pred[..., 5:]

        if use_focal_obj_loss:
            # Focal loss for objectness confidence
            confidence_loss = sigmoid_focal_loss(object_mask,
                                                 objectness_logits)
        else:
            objectness_bce = K.binary_crossentropy(
                object_mask, objectness_logits, from_logits=True)
            confidence_loss = (
                object_mask * objectness_bce
                + (1 - object_mask) * objectness_bce * ignore_mask)

        if use_focal_loss:
            # Focal loss for classification score
            if use_softmax_loss:
                class_loss = softmax_focal_loss(true_class_probs,
                                                class_logits)
            else:
                class_loss = sigmoid_focal_loss(true_class_probs,
                                                class_logits)
        elif use_softmax_loss:
            # use softmax style classification output
            class_loss = object_mask * K.expand_dims(
                K.categorical_crossentropy(
                    true_class_probs, class_logits, from_logits=True),
                axis=-1)
        else:
            # use sigmoid style classification output
            class_loss = object_mask * K.binary_crossentropy(
                true_class_probs, class_logits, from_logits=True)

        if use_giou_loss:
            # Calculate GIoU loss as location loss
            raw_true_box = truth[..., 0:4]
            giou = box_giou(pred_box, raw_true_box)
            giou_loss = object_mask * box_loss_scale * (1 - giou)
            location_loss = K.sum(giou_loss) / batch_f
        elif use_diou_loss:
            # Calculate DIoU loss as location loss
            raw_true_box = truth[..., 0:4]
            diou = box_diou(pred_box, raw_true_box)
            diou_loss = object_mask * box_loss_scale * (1 - diou)
            location_loss = K.sum(diou_loss) / batch_f
        else:
            # Standard YOLOv3 location loss
            # K.binary_crossentropy is helpful to avoid exp overflow.
            xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
                raw_true_xy, raw_pred[..., 0:2], from_logits=True)
            wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
                raw_true_wh - raw_pred[..., 2:4])
            xy_loss = K.sum(xy_loss) / batch_f
            wh_loss = K.sum(wh_loss) / batch_f
            location_loss = xy_loss + wh_loss

        confidence_loss = K.sum(confidence_loss) / batch_f
        class_loss = K.sum(class_loss) / batch_f

        loss += location_loss + confidence_loss + class_loss
        total_location_loss += location_loss
        total_confidence_loss += confidence_loss
        total_class_loss += class_loss

    # Fit for tf 2.0.0 loss shape
    loss = K.expand_dims(loss, axis=-1)

    return loss, total_location_loss, total_confidence_loss, total_class_loss
def yolo_loss(args, anchors, num_classes, ignore_threshold=0.5,
              print_loss=False, normalize=True):
    """Compute the YOLOv3 loss over all detection layers.

    Args:
        args: List holding the yolo output tensors followed by the y_true
            tensors (``len(anchors) // 3`` of each).
        anchors: Array of anchor (w, h) pairs; indexed with the per-layer
            anchor mask.
        num_classes: Number of object classes.
        ignore_threshold: No-object cells whose best IoU with any true box
            is below this value contribute to the confidence loss.
        print_loss: When true, print the running loss and the current layer's
            loss components with ``tf.print``.
        normalize: When true divide the summed loss by the number of positive
            samples (at least 1 per layer), otherwise by the batch size.

    Returns:
        The normalized loss tensor.
    """
    num_layers = len(anchors) // 3
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0]))
        for l in range(num_layers)
    ]

    loss = 0
    num_pos = 0
    m = K.shape(yolo_outputs[0])[0]  # Batch size
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        # Objectness flag and class targets of this layer.
        object_mask = y_true[l][..., 4:5]
        true_class_probs = y_true[l][..., 5:]

        # Decode the predictions: grid, raw logits, xy, wh.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l],
            anchors=anchors[anchors_mask[l]],
            num_classes=num_classes,
            input_shape=input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Per-sample mask: 1 where the best IoU is below the threshold.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_threshold, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body,
                                       [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Encode true bounding boxes.
        # grid_shapes holds (height, width) while the box centre is (x, y),
        # so the shape must be reversed -- exactly as input_shape is reversed
        # for the width/height on the next line. The previous `[:]` slice left
        # it unreversed, which is wrong for non-square grids.
        raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchors_mask[l]]
                            * input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))

        # The bigger the true box, the smaller its weight.
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss)
        wh_loss = K.sum(wh_loss)
        confidence_loss = K.sum(confidence_loss)
        class_loss = K.sum(class_loss)

        # Count positive samples (at least one per layer).
        num_pos += tf.maximum(K.sum(K.cast(object_mask, tf.float32)), 1)
        loss += xy_loss + wh_loss + confidence_loss + class_loss

        if print_loss:
            # tf.print takes the values positionally, has no `message`
            # keyword, and returns an op (None in eager mode) rather than the
            # tensor, so its result must not replace `loss`.
            print_op = tf.print(
                "loss: ", loss, xy_loss, wh_loss, confidence_loss,
                class_loss, tf.shape(ignore_mask), summarize=100)
            if print_op is not None:
                # In graph mode tie the print to the loss so it executes.
                with tf.control_dependencies([print_op]):
                    loss = tf.identity(loss)

    if normalize:
        loss = loss / num_pos
    else:
        loss = loss / mf

    return loss
def init_qstar_0(self, m, batch_index, batch_num):
    """Initialize the q0 with zeros.

    Returns a zero tensor of shape ``(B, 1, 2 * self.channels)`` where ``B``
    is the first dimension of ``batch_num``. ``m`` and ``batch_index`` are
    not used.
    """
    leading_dim = ksb.shape(batch_num)[0]
    return tf.zeros((leading_dim, 1, 2 * self.channels))
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    '''Build the yolo_loss tensor.

    Parameters
    ----------
    args: list holding the yolo output tensors followed by the y_true tensors
        (``len(anchors) // 3`` of each)
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object
        confidence loss
    print_loss: when true, wrap the running loss in tf.Print

    Returns
    -------
    loss: tensor, sum over layers of the xy, wh, confidence and class losses,
        each divided by the batch size

    '''
    layer_count = len(anchors) // 3  # default setting
    yolo_outputs = args[:layer_count]
    y_true = args[layer_count:]
    if layer_count == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]

    true_dtype = K.dtype(y_true[0])
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, true_dtype)
    grid_shapes = [
        K.cast(K.shape(yolo_outputs[idx])[1:3], true_dtype)
        for idx in range(layer_count)
    ]

    loss = 0
    batch = K.shape(yolo_outputs[0])[0]  # batch size, tensor
    batch_f = K.cast(batch, K.dtype(yolo_outputs[0]))

    for layer in range(layer_count):
        truth = y_true[layer]
        layer_anchors = anchors[anchor_mask[layer]]

        object_mask = truth[..., 4:5]
        true_class_probs = truth[..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[layer], layer_anchors, num_classes, input_shape,
            calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        raw_true_xy = truth[..., :2] * grid_shapes[layer][::-1] - grid
        raw_true_wh = K.log(truth[..., 2:4] / layer_anchors
                            * input_shape[::-1])
        # avoid log(0)=-inf
        raw_true_wh = K.switch(object_mask, raw_true_wh,
                               K.zeros_like(raw_true_wh))
        box_loss_scale = 2 - truth[..., 2:3] * truth[..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(true_dtype, size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(truth[b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        _, ignore_mask = tf.while_loop(lambda b, *_: b < batch, loop_body,
                                       [0, ignore_mask])
        ignore_mask = K.expand_dims(ignore_mask.stack(), -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(
            raw_true_wh - raw_pred[..., 2:4])
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)
        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / batch_f
        wh_loss = K.sum(wh_loss) / batch_f
        confidence_loss = K.sum(confidence_loss) / batch_f
        class_loss = K.sum(class_loss) / batch_f
        loss += xy_loss + wh_loss + confidence_loss + class_loss
        if print_loss:
            loss = tf.Print(loss, [
                loss, xy_loss, wh_loss, confidence_loss, class_loss,
                K.sum(ignore_mask)
            ], message='loss: ')
    return loss
def yolo_loss(args, anchors, num_classes, ignore_thresh=0.5,
              label_smoothing=0.1, print_loss=False):
    """Compute the YOLO loss with a CIoU location term.

    Args:
        args: ``[*model_outputs, *y_true]``; the first ``len(anchors) // 3``
            entries are the raw feature-layer outputs and the rest are the
            matching ground-truth tensors.
        anchors: Array of anchor (w, h) pairs, indexed with the per-layer
            anchor mask.
        num_classes: Number of object classes.
        ignore_thresh: No-object cells whose best IoU with any true box is
            below this value are treated as negatives in the confidence loss.
        label_smoothing: When truthy, class targets go through
            ``_smooth_labels`` with this value.
        print_loss: Accepted for interface compatibility; not used.

    Returns:
        Loss tensor with a trailing axis added: the sum over layers of the
        location (CIoU), confidence and class losses, each divided by the
        batch size.
    """
    # One feature layer per three anchors.
    num_layers = len(anchors) // 3

    # Split predictions from ground truth.
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]

    # Anchor indices assigned to each feature layer.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]
                   ] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    # Input size is derived as 32x the first feature layer's grid.
    input_shape = K.cast(
        K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))

    loss = 0

    # m is the (dynamic) batch size.
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        # Cells of this layer that contain an object (channel 4).
        object_mask = y_true[l][..., 4:5]
        # Class targets of those cells (channels 5+).
        true_class_probs = y_true[l][..., 5:]
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs,
                                              label_smoothing)

        # Decode the layer output: grid, raw logits, decoded xy and wh.
        grid, raw_pred, pred_xy, pred_wh = yolo_head(
            yolo_outputs[l], anchors[anchor_mask[l]], num_classes,
            input_shape, calc_loss=True)

        # Decoded predicted boxes (xy followed by wh).
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Build the negative-sample mask, one entry per image.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1,
                                     dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # All ground-truth boxes that exist in image b.
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4],
                                       object_mask_bool[b, ..., 0])
            # IoU of every predicted box with every ground-truth box.
            iou = box_iou(pred_box[b], true_box)
            # Best IoU per predicted box.
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(
                b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # Iterate over every image in the batch.
        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body,
                                       [0, ignore_mask])

        # Stack the per-image masks and add a trailing axis.
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Calculate ciou loss as location loss
        raw_true_box = y_true[l][..., 0:4]
        ciou = box_ciou(pred_box, raw_true_box)
        ciou_loss = object_mask * box_loss_scale * (1 - ciou)
        ciou_loss = K.sum(ciou_loss) / mf
        location_loss = ciou_loss

        # Cells with an object: cross-entropy of the confidence against 1.
        # Cells without an object count as negatives only when
        # best_iou < ignore_thresh, which limits the number of negatives.
        objectness_bce = K.binary_crossentropy(
            object_mask, raw_pred[..., 4:5], from_logits=True)
        confidence_loss = (object_mask * objectness_bce
                           + (1 - object_mask) * objectness_bce * ignore_mask)

        class_loss = object_mask * K.binary_crossentropy(
            true_class_probs, raw_pred[..., 5:], from_logits=True)

        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf

        # Accumulate all three terms. Previously only location_loss was added
        # (the full sum was commented out), so the confidence and class
        # losses were computed but never trained on, and ignore_thresh and
        # label_smoothing had no effect.
        loss += location_loss + confidence_loss + class_loss

    loss = K.expand_dims(loss, axis=-1)
    return loss
def build(self, mode, config):
    """Build the Mask R-CNN Keras model.

    mode: Either "training" or "inference". The inputs and outputs of
        the model differ accordingly.
    config: Configuration object. This method reads IMAGE_SHAPE,
        IMAGE_META_SIZE, USE_MINI_MASK, MINI_MASK_SHAPE, BACKBONE,
        TRAIN_BN, TOP_DOWN_PYRAMID_SIZE, BATCH_SIZE, RPN_ANCHOR_STRIDE,
        RPN_ANCHOR_RATIOS, RPN_NMS_THRESHOLD, POST_NMS_ROIS_TRAINING,
        POST_NMS_ROIS_INFERENCE, USE_RPN_ROIS, POOL_SIZE, MASK_POOL_SIZE,
        NUM_CLASSES, FPN_CLASSIF_FC_LAYERS_SIZE and GPU_COUNT from it.

    Returns: the model named 'mask_rcnn', wrapped in ParallelModel when
        config.GPU_COUNT > 1.

    Raises: AssertionError for an unknown mode; Exception when the image
        height or width is not a multiple of 2**6.
    """
    assert mode in ['training', 'inference']

    # Image size must be divisible by 2 multiple times.
    # The check compares true division by 2**6 (= 64) with its integer
    # truncation, so both sides must be exact multiples of 64.
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2 ** 6 != int(h / 2 ** 6) or w / 2 ** 6 != int(w / 2 ** 6):
        raise Exception("Image size must be dividable by 2 at least 6 times "
                        "to avoid fractions when downscaling and upscaling."
                        "For example, use 256, 320, 384, 448, 512, ... etc. ")

    # Inputs
    input_image = KL.Input(
        shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
    input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = KL.Input(
            shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
        input_rpn_bbox = KL.Input(
            shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(
            shape=[None], name="input_gt_class_ids", dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(
            shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
        # Normalize coordinates using the runtime height/width of input_image
        gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        # Mask height/width come from MINI_MASK_SHAPE when mini masks are
        # enabled, otherwise from IMAGE_SHAPE.
        if config.USE_MINI_MASK:
            input_gt_masks = KL.Input(
                shape=[config.MINI_MASK_SHAPE[0],
                       config.MINI_MASK_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
        else:
            input_gt_masks = KL.Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
    elif mode == "inference":
        # Anchors in normalized coordinates
        input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

    # Build the shared convolutional layers.
    # Bottom-up Layers
    # The backbone returns the last layer of each stage, 5 in total.
    # stage5=True is requested, so C5 is built and used below; the first
    # returned item is discarded.
    # config.BACKBONE may be a callable that builds the graph itself, or a
    # value that is forwarded to resnet_graph.
    if callable(config.BACKBONE):
        _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True,
                                            train_bn=config.TRAIN_BN)
    else:
        _, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
                                         stage5=True, train_bn=config.TRAIN_BN)
    # Top-down Layers
    # Each level adds the 2x-upsampled coarser level to a 1x1 conv of the
    # matching backbone stage.
    # TODO: add assert to verify feature map sizes match what's in config
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
    P4 = KL.Add(name="fpn_p4add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
    P3 = KL.Add(name="fpn_p3add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
    P2 = KL.Add(name="fpn_p2add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
    # Attach 3x3 conv to all P layers to get the final feature maps.
    P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
    P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
    P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
    # P6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from P5 with stride of 2.
    P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if mode == "training":
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
        # A hack to get around Keras's bad support for constants: the
        # Lambda ignores its input and returns the broadcast anchors
        # wrapped in a tf.Variable.
        anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
    else:
        # In inference the anchors are supplied by the caller as an input.
        anchors = input_anchors

    # RPN Model
    rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                          len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE)
    # Loop through pyramid layers, applying the same RPN model to each
    layer_outputs = []  # list of lists
    for p in rpn_feature_maps:
        layer_outputs.append(rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [KL.Concatenate(axis=1, name=n)(list(o))
               for o, n in zip(outputs, output_names)]

    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=config.RPN_NMS_THRESHOLD,
        name="ROI",
        config=config)([rpn_class, rpn_bbox, anchors])

    if mode == "training":
        # Class ID mask to mark class IDs supported by the dataset the image
        # came from.
        active_class_ids = KL.Lambda(
            lambda x: parse_image_meta_graph(x)["active_class_ids"]
        )(input_image_meta)

        if not config.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                  name="input_roi", dtype=np.int32)
            # Normalize coordinates
            target_rois = KL.Lambda(lambda x: norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            DetectionTargetLayer(config, name="proposal_targets")([
                target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        # Multiplying by 1 passes rois through unchanged under a named layer.
        # TODO: clean up (use tf.identity if necessary)
        output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        # Each loss is wrapped in a named Lambda layer and exposed as a
        # model output below.
        rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
            [input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
            [input_rpn_bbox, input_rpn_match, rpn_bbox])
        class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
            [target_class_ids, mrcnn_class_logits, active_class_ids])
        bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
            [target_bbox, target_class_ids, mrcnn_bbox])
        mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
            [target_mask, target_class_ids, mrcnn_mask])

        # Model
        inputs = [input_image, input_image_meta,
                  input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
        # The externally supplied ROIs become an extra model input.
        if not config.USE_RPN_ROIS:
            inputs.append(input_rois)
        outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                   mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                   rpn_rois, output_rois,
                   rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
        model = KM.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
        # normalized coordinates
        detections = DetectionLayer(config, name="mrcnn_detection")(
            [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

        # Create masks for detections
        # Only the first four values (the box coordinates) of each detection
        # are passed to the mask head.
        detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
        mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        model = KM.Model([input_image, input_image_meta, input_anchors],
                         [detections, mrcnn_class, mrcnn_bbox,
                          mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
                         name='mask_rcnn')

    # Add multi-GPU support.
    # Imported lazily so the dependency is only needed when GPU_COUNT > 1.
    if config.GPU_COUNT > 1:
        from .parallel_model import ParallelModel
        model = ParallelModel(model, config.GPU_COUNT)

    return model
def _smooth_labels():
    """Return label-smoothed targets.

    Takes no arguments: ``y_true``, ``y_pred`` and ``smoothing`` are read
    from the surrounding scope (presumably an enclosing loss function;
    verify against the caller).

    Each target is scaled by ``1 - smoothing`` and ``smoothing`` is spread
    evenly over the classes.
    """
    # The class count is the size of the LAST axis of y_true. Indexing axis 1
    # is only correct for rank-2 labels; -1 gives the same result there and
    # stays correct for higher-rank targets with classes on the last axis.
    num_classes = K.cast(K.shape(y_true)[-1], y_pred.dtype)
    return y_true * (1.0 - smoothing) + (smoothing / num_classes)
def call(self, inputs, training=None, mask=None):
    """Append normalized coordinate channels to ``inputs``.

    The behaviour depends on ``self.rank``:

    * rank 1: ``inputs`` is unpacked as (batch, dim, channels); one channel
      holding the position along ``dim`` is appended. ``data_format`` is
      not consulted in this branch.
    * rank 2: ``inputs`` is (batch, dim1, dim2, channels) after moving
      channels last if ``self.data_format == 'channels_first'``; two
      channels (position along dim1, then along dim2) are appended, plus a
      radius channel when ``self.use_radius`` is set. The result is
      permuted back for 'channels_first'.
    * rank 3: ``inputs`` is (batch, dim1, dim2, dim3, channels) after the
      same permutation; three channels are appended in the order
      dim1 position, dim2 position, dim3 position.

    Every coordinate channel is rescaled from indices ``0..size-1`` to
    ``[-1, 1]`` and uses the dtype of ``inputs`` so that concatenation
    works when that dtype differs from ``K.floatx()``.

    Args:
        inputs: Input tensor.
        training: Unused.
        mask: Unused.

    Returns:
        ``inputs`` with the coordinate channels concatenated on the channel
        axis.
    """
    input_shape = K.shape(inputs)
    # Use the input dtype for all coordinate channels. Casting to
    # K.floatx() breaks K.concatenate whenever inputs has another dtype.
    coord_dtype = K.dtype(inputs)

    def _normalize(indices, size):
        # Rescale integer indices 0..size-1 to [-1, 1].
        # NOTE: size == 1 divides by zero, as in the previous implementation.
        scaled = K.cast(indices, coord_dtype)
        scaled = scaled / K.cast(size - 1, coord_dtype)
        return (scaled * 2) - 1.

    if self.rank == 1:
        input_shape = [input_shape[i] for i in range(3)]
        batch_shape, dim, _ = input_shape

        # (batch, dim, 1): position index along dim.
        xx_range = K.tile(K.expand_dims(K.arange(0, dim), axis=0),
                          K.stack([batch_shape, 1]))
        xx_range = K.expand_dims(xx_range, axis=-1)
        xx_channels = _normalize(xx_range, dim)

        outputs = K.concatenate([inputs, xx_channels], axis=-1)

    if self.rank == 2:
        if self.data_format == 'channels_first':
            # Work in channels-last layout; undone before returning.
            inputs = K.permute_dimensions(inputs, [0, 2, 3, 1])
            input_shape = K.shape(inputs)

        input_shape = [input_shape[i] for i in range(4)]
        batch_shape, dim1, dim2, _ = input_shape

        # xx: (batch, dim1, dim2, 1), value = index along dim1.
        xx_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32')
        xx_ones = K.expand_dims(xx_ones, axis=-1)

        xx_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0),
                          K.stack([batch_shape, 1]))
        xx_range = K.expand_dims(xx_range, axis=1)

        xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1])
        xx_channels = K.expand_dims(xx_channels, axis=-1)
        xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3])

        # yy: (batch, dim1, dim2, 1), value = index along dim2.
        yy_ones = K.ones(K.stack([batch_shape, dim1]), dtype='int32')
        yy_ones = K.expand_dims(yy_ones, axis=1)

        yy_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0),
                          K.stack([batch_shape, 1]))
        yy_range = K.expand_dims(yy_range, axis=-1)

        yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1])
        yy_channels = K.expand_dims(yy_channels, axis=-1)
        yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3])

        xx_channels = _normalize(xx_channels, dim1)
        yy_channels = _normalize(yy_channels, dim2)

        outputs = K.concatenate([inputs, xx_channels, yy_channels], axis=-1)

        if self.use_radius:
            # Distance of each normalized coordinate from (0.5, 0.5).
            rr = K.sqrt(K.square(xx_channels - 0.5) +
                        K.square(yy_channels - 0.5))
            outputs = K.concatenate([outputs, rr], axis=-1)

        if self.data_format == 'channels_first':
            outputs = K.permute_dimensions(outputs, [0, 3, 1, 2])

    if self.rank == 3:
        if self.data_format == 'channels_first':
            # Work in channels-last layout; undone before returning.
            inputs = K.permute_dimensions(inputs, [0, 2, 3, 4, 1])
            input_shape = K.shape(inputs)

        input_shape = [input_shape[i] for i in range(5)]
        batch_shape, dim1, dim2, dim3, _ = input_shape

        # xx: index along dim2, built as (batch, dim2, dim3, 1) and then
        # repeated dim1 times on a new axis 1.
        xx_ones = K.ones(K.stack([batch_shape, dim3]), dtype='int32')
        xx_ones = K.expand_dims(xx_ones, axis=-1)

        xx_range = K.tile(K.expand_dims(K.arange(0, dim2), axis=0),
                          K.stack([batch_shape, 1]))
        xx_range = K.expand_dims(xx_range, axis=1)

        xx_channels = K.batch_dot(xx_ones, xx_range, axes=[2, 1])
        xx_channels = K.expand_dims(xx_channels, axis=-1)
        xx_channels = K.permute_dimensions(xx_channels, [0, 2, 1, 3])

        xx_channels = K.expand_dims(xx_channels, axis=1)
        xx_channels = K.tile(xx_channels, [1, dim1, 1, 1, 1])

        # yy: index along dim3, same construction as xx.
        yy_ones = K.ones(K.stack([batch_shape, dim2]), dtype='int32')
        yy_ones = K.expand_dims(yy_ones, axis=1)

        yy_range = K.tile(K.expand_dims(K.arange(0, dim3), axis=0),
                          K.stack([batch_shape, 1]))
        yy_range = K.expand_dims(yy_range, axis=-1)

        yy_channels = K.batch_dot(yy_range, yy_ones, axes=[2, 1])
        yy_channels = K.expand_dims(yy_channels, axis=-1)
        yy_channels = K.permute_dimensions(yy_channels, [0, 2, 1, 3])

        yy_channels = K.expand_dims(yy_channels, axis=1)
        yy_channels = K.tile(yy_channels, [1, dim1, 1, 1, 1])

        # zz: index along dim1, tiled over dim2 and dim3.
        zz_range = K.tile(K.expand_dims(K.arange(0, dim1), axis=0),
                          K.stack([batch_shape, 1]))
        zz_range = K.expand_dims(zz_range, axis=-1)
        zz_range = K.expand_dims(zz_range, axis=-1)

        zz_channels = K.tile(zz_range, [1, 1, dim2, dim3])
        zz_channels = K.expand_dims(zz_channels, axis=-1)

        xx_channels = _normalize(xx_channels, dim2)
        yy_channels = _normalize(yy_channels, dim3)
        zz_channels = _normalize(zz_channels, dim1)

        outputs = K.concatenate(
            [inputs, zz_channels, xx_channels, yy_channels], axis=-1)

        if self.data_format == 'channels_first':
            outputs = K.permute_dimensions(outputs, [0, 4, 1, 2, 3])

    return outputs