def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False):
        '''
        Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
        of size `feature_map_size == [feature_map_height, feature_map_width]`.
        Arguments:
            feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
                dimensions of the feature map for which to generate the anchor boxes.
            aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
                All list elements must be unique.
            this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
                as a fraction of the shorter side of the input image.
            this_steps (int/float or list/tuple, optional): The step sizes, i.e. the horizontal and vertical
                distances between anchor box center points. If `None`, they are computed from the image size
                and the feature map size.
            this_offsets (int/float or list/tuple, optional): The offsets, i.e. where the first anchor box
                center point lies relative to the top left corner of the image, as fractions of the step sizes.
                Defaults to 0.5 if `None`.
            diagnostics (bool, optional): If true, the following additional outputs will be returned:
                1) A list of the center point `x` and `y` coordinates for each spatial location.
                2) A list containing `(width, height)` for each box aspect ratio.
                3) A tuple containing `(step_height, step_width)`
                4) A tuple containing `(offset_height, offset_width)`
                This information can be useful to understand in just a few numbers what the generated grid of
                anchor boxes actually looks like, i.e. how large the different boxes are and how dense
                their spatial distribution is, in order to determine whether the box grid covers the input images
                appropriately and whether the box sizes are appropriate to fit the sizes of the objects
                to be detected.
        Returns:
            A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
            last dimension contains `(xmin, ymin, xmax, ymax)` for each anchor box in each cell of the feature map.
        '''
        # Compute box width and height for each aspect ratio.

        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                # Note: the `two_boxes_for_ar1` branch is disabled in this variant of the function,
                # since it takes no `next_scale` argument; `next_scale` below would be undefined if it ran.
                if False: # self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (this_steps is None):
            step_height = self.img_height / feature_map_size[0]
            step_width = self.img_width / feature_map_size[1]
        else:
            if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        #print(feature_map_size[0])
        cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:,:,:,[0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:,:,:,[0, 2]] = x_coords
            y_coords = boxes_tensor[:,:,:,[1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:,:,:,[1, 3]] = y_coords


        if diagnostics:
            return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
        else:
            return boxes_tensor
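A quick numeric sanity check of the width/height formula above (a hedged, illustrative sketch; the values are made up): scaling by `sqrt(ar)` and `1/sqrt(ar)` changes the aspect ratio while preserving the area of the `ar == 1` box.

import numpy as np

size, scale, ar = 300, 0.2, 2.0   # Illustrative values, not taken from the code above.
w = scale * size * np.sqrt(ar)    # ~84.85
h = scale * size / np.sqrt(ar)    # ~42.43
assert np.isclose(w * h, (scale * size) ** 2)  # Same area as the ar == 1 box.
assert np.isclose(w / h, ar)                   # The requested aspect ratio is preserved.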
Example No. 2
    def call(self, x, mask=None):
        '''
        Return an anchor box tensor based on the shape of the input tensor.
        The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.
        Note that this tensor does not participate in any graph computations at runtime. It is being created
        as a constant once during graph creation and is just being output along with the rest of the model output
        during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
        to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.
        Arguments:
            x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
                or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this
                layer must be the output of the localization predictor layer.
        '''

        # Compute box width and height for each aspect ratio
        # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        # We need the shape of the input tensor
        # if K.image_dim_ordering() == 'tf':
        if K.image_data_format() == 'channels_last':
            batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
        else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future
            batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape

        # Compute the grid of box center points. They are identical for all aspect ratios.

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        if (self.this_steps is None):
            step_height = self.img_height / feature_map_height
            step_width = self.img_width / feature_map_width
        else:
            if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2):
                step_height = self.this_steps[0]
                step_width = self.this_steps[1]
            elif isinstance(self.this_steps, (int, float)):
                step_height = self.this_steps
                step_width = self.this_steps
        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        if (self.this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2):
                offset_height = self.this_offsets[0]
                offset_width = self.this_offsets[1]
            elif isinstance(self.this_offsets, (int, float)):
                offset_height = self.this_offsets
                offset_width = self.this_offsets
        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
        cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`
        boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

        # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
        if self.clip_boxes:
            x_coords = boxes_tensor[:,:,:,[0, 2]]
            x_coords[x_coords >= self.img_width] = self.img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:,:,:,[0, 2]] = x_coords
            y_coords = boxes_tensor[:,:,:,[1, 3]]
            y_coords[y_coords >= self.img_height] = self.img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:,:,:,[1, 3]] = y_coords

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
        if self.coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
        elif self.coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
        # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
        variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances # Long live broadcasting
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)

        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along the batch dimension.
        # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
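A minimal usage sketch for the layer above (assumptions: the class is named `AnchorBoxes` and its constructor accepts the attributes referenced in `call`; the exact constructor signature is not shown in this snippet):

from keras.layers import Conv2D, Input
from keras.models import Model

inp = Input(shape=(300, 300, 3))
# A hypothetical localization predictor head: 4 boxes per cell, 4 offsets per box.
loc = Conv2D(4 * 4, (3, 3), padding='same', name='loc_pred')(inp)
# The anchor layer only reads the feature map shape of `loc`; constructor arguments are assumed.
anchors = AnchorBoxes(img_height=300, img_width=300,
                      this_scale=0.1, next_scale=0.2,
                      aspect_ratios=[0.5, 1.0, 2.0],
                      two_boxes_for_ar1=True)(loc)
model = Model(inputs=inp, outputs=anchors)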
Example No. 3
    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
                for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
                to the respective image, and the data for each ground truth bounding box has the format
                `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
                an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
            diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
                but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
                This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
                boxes.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
            ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
            the last axis are the box coordinates, the next four elements after that are just dummy elements, and
            the last four elements are the variances.
        '''

        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than or equal to `neg_iou_limit` will be a negative (background) box.

        y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
        n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
        class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size): # For each batch item...

            if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
            labels = ground_truth_labels[i].astype(np.float64) # The labels for this batch item

            # Check for degenerate ground truth bounding boxes before attempting any computations.
            if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
                raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
                                         "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
                                         "bounding boxes will lead to NaN errors during the training.")

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
                labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
            elif self.coords == 'minmax':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)] # The one-hot class IDs for the ground truth boxes of this batch item
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
            similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
            #        This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it.
            bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

            # Write the ground truth data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
            #         ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
            #         such ground truth box.

            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            #        an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            #        i.e. they will no longer be background boxes. These anchors are "too close" to a
            #        ground truth box to be valid background boxes.

            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        if self.coords == 'centroids':
            y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
        elif self.coords == 'corners':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
        elif self.coords == 'minmax':
            y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
            y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
            y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
            y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively

        if diagnostics:
            # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
            return y_encoded, y_matched_anchors
        else:
            return y_encoded
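A hypothetical usage sketch for the encoder above (the construction of the `SSDInputEncoder` instance is assumed, since only `__call__` is shown here):

import numpy as np

# Ground truth for a batch of one image: rows of (class_id, xmin, ymin, xmax, ymax).
ground_truth = [np.array([[1,  20,  30, 120, 200],
                          [3, 150,  60, 280, 190]])]
# `encoder` is an already constructed SSDInputEncoder instance (constructor arguments omitted).
y_encoded = encoder(ground_truth)
print(y_encoded.shape)  # (1, #boxes, #classes + 4 + 4 + 4)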
Example No. 4
# imgs.shape
# t_imgs=K.constant(imgs)
#%%
import cv2
import numpy as np

anchor_boxes=generate_anchor_boxes_for_layer(feature_map_size=(18,18),
                                        aspect_ratios=[1],
                                        this_scale=0.13,
                                        next_scale=0.1,
                                        this_steps=16,
                                        this_offsets=0.5,
                                        diagnostics=False,
                                        two_boxes_for_ar1=False,
                                        clip_boxes=True,
                                        normalize_coords=False,
                                        coords='centroids')
#%%
boxes_tensor = convert_coordinates(anchor_boxes, start_index=0, conversion='centroids2corners')
img=cv2.imread('./Test_Original_image.jpg')
imgs=np.array([img])
nth_bx=4
for j in range(1,18):
   if j%nth_bx==0:
      for i in range(1,18):
         if i%nth_bx==0 :
            x1,y1,x2,y2=boxes_tensor[j,i,0,:]
            draw_bbox(img,int(x1),int(y1),int(x2),int(y2))

cv2.imwrite('Output_4A_bx.jpg',img)



#%%
Example No. 5
def generate_anchor_boxes_for_layer(   feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps=None,
                                        this_offsets=None,
                                        diagnostics=False,
                                        two_boxes_for_ar1=False,
                                        clip_boxes=False,
                                        normalize_coords=False,
                                        coords='centroids'):
        img_height=300
        img_width=300
        size = min(img_height, img_width)
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if two_boxes_for_ar1:
                    box_height = box_width = np.sqrt(this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        if (this_steps is None):
            step_height = img_height / feature_map_size[0]
            step_width = img_width / feature_map_size[1]
        else:
            if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
                step_height = this_steps[0]
                step_width = this_steps[1]
            elif isinstance(this_steps, (int, float)):
                step_height = this_steps
                step_width = this_steps
        if (this_offsets is None):
            offset_height = 0.5
            offset_width = 0.5
        else:
            if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
                offset_height = this_offsets[0]
                offset_width = this_offsets[1]
            elif isinstance(this_offsets, (int, float)):
                offset_height = this_offsets
                offset_width = this_offsets
        cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down

        boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h

        boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')

        if clip_boxes:
            x_coords = boxes_tensor[:,:,:,[0, 2]]
            x_coords[x_coords >= img_width] = img_width - 1
            x_coords[x_coords < 0] = 0
            boxes_tensor[:,:,:,[0, 2]] = x_coords
            y_coords = boxes_tensor[:,:,:,[1, 3]]
            y_coords[y_coords >= img_height] = img_height - 1
            y_coords[y_coords < 0] = 0
            boxes_tensor[:,:,:,[1, 3]] = y_coords

        if normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= img_width
            boxes_tensor[:, :, :, [1, 3]] /= img_height

        if coords == 'centroids':
            # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
        elif coords == 'minmax':
            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
            boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
        else:
            return boxes_tensor
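A short sketch of the `diagnostics=True` path of the standalone function above; it returns the grid summary described in the docstring of Example No. 1 (parameter values here are illustrative):

boxes, (cy, cx), wh_list, steps, offsets = generate_anchor_boxes_for_layer(
    feature_map_size=(18, 18),
    aspect_ratios=[0.5, 1.0, 2.0],
    this_scale=0.2,
    next_scale=0.34,
    two_boxes_for_ar1=True,
    diagnostics=True)
print(wh_list)  # One (width, height) pair per box aspect ratio.
print(steps)    # (step_height, step_width): spacing of the center-point grid.
print(offsets)  # (offset_height, offset_width): fractional position of the first center points.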
Example No. 6
def decode_detections_debug(y_pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_height=None,
                            img_width=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(
                img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    # Slice out the classes and the four offsets, throw away the anchor coordinates and variances,
    # resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`.
    y_pred_decoded_raw = np.copy(y_pred[:, :, :-8])

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]] + y_pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        else:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]] * y_pred[:, :, [-4, -3]] + y_pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:, :, [-4, -2]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:, :, [-3, -1]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1],
                                    y_pred_decoded_raw.shape[2] + 1))  # Expand the last axis by one.
    y_pred_decoded_raw2[:, :, 1:] = y_pred_decoded_raw
    # Put the box indices as the first element for each box via broadcasting.
    y_pred_decoded_raw2[:, :, 0] = np.arange(y_pred_decoded_raw.shape[1])
    y_pred_decoded_raw = y_pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    # The number of classes is the length of the last axis minus the four box coordinates and minus the index.
    n_classes = y_pred_decoded_raw.shape[-1] - 5

    y_pred_decoded = []  # Store the final predictions in this list.
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`.
        pred = []  # Store the final predictions for this batch item here.
        for class_id in range(1, n_classes):  # For each class except the background class (which has class ID 0)...
            # ...keep only the box index, the confidences for that class, and the box coordinates,
            # making this an array of shape `[n_boxes, 6]`...
            single_class = batch_item[:, [0, class_id + 1, -4, -3, -2, -1]]
            # ...and keep only those boxes with a confidence above the set threshold.
            threshold_met = single_class[single_class[:, 1] > confidence_thresh]
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                # ...perform NMS on them.
                maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners',
                                           border_pixels=border_pixels)
                # Expand the last dimension by one element to have room for the class ID.
                # This is now an array of shape `[n_boxes, 7]`.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))
                maxima_output[:, 0] = maxima[:, 0]  # Write the box index to the first column...
                maxima_output[:, 1] = class_id  # ...write the class ID to the second column...
                maxima_output[:, 2:] = maxima[:, 1:]  # ...and write the rest of the maxima data to the other columns...
                pred.append(maxima_output)  # ...and append the maxima for this class to the list for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred:  # If any predictions are left after confidence-thresholding (mirrors `decode_detections` below)...
            pred = np.concatenate(pred, axis=0)
            if pred.shape[0] > top_k:  # If we have more than `top_k` results left at this point...
                # ...get the indices of the `top_k` highest-score maxima...
                top_k_indices = np.argpartition(pred[:, 2], kth=pred.shape[0] - top_k, axis=0)[pred.shape[0] - top_k:]
                pred = pred[top_k_indices]  # ...and keep only those entries of `pred`.
        else:
            pred = np.array(pred)  # Even if empty, `pred` must become a Numpy array.
        # Now that we're done, append the array of final predictions for this batch item to the output list.
        y_pred_decoded.append(pred)

    return y_pred_decoded
Example No. 7
def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    '''
    Converts the model output into a format that contains only the positive (non-background)
    box predictions, i.e. the same format that `SSDInputEncoder` takes as input.

    After decoding, two processing steps are applied for each class:
    1. Filter out low-confidence boxes according to `confidence_thresh`; 2. non-maximum suppression.
    Once all classes have been processed, the results are concatenated and the `top_k`
    highest-confidence boxes constitute the final result for each image.
    This implementation follows the same logic as the original Caffe implementation. For a more
    optimized implementation, see `decode_detections_fast()`, which operates on all classes at
    once instead of one class at a time.

    Arguments:
        y_pred (array): The SSD model output, expected to be a Numpy array of shape
            `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of boxes
            the model predicts per image. The last axis contains
            `[one-hot encoded classes, 4 predicted relative box coordinates, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence
            a box must have in order to be kept for the non-maximum suppression step. A smaller value
            means more boxes enter the non-maximum suppression step.
        iou_threshold (float, optional): A float in [0,1]. All boxes whose IoU with the
            highest-confidence box is greater than `iou_threshold` are filtered out.
        top_k (int, optional): The number of highest-confidence boxes to keep after the
            non-maximum suppression step.
        input_coords (str, optional): The coordinate format that the model outputs. Can be 'centroids'
            `(cx, cy, w, h)`, 'minmax' `(xmin, xmax, ymin, ymax)`, or 'corners' `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates
            (values in [0,1]) and you want to convert them to absolute coordinates. Set to `False` if
            the model outputs relative coordinates and you do not want to convert them, or if the model
            outputs absolute coordinates. If set to `True`, `img_height` and `img_width` must also be set.
        img_height (int, optional): The height of the input images. Required if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Required if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(
                img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    # Slice out the classes and the four offsets, throw away the anchor coordinates and variances,
    # resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`.
    y_pred_decoded_raw = np.copy(y_pred[:, :, :-8])

    if input_coords == 'centroids':
        # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor),
        # and likewise for h.
        y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]])
        # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred).
        y_pred_decoded_raw[:, :, [-2, -1]] *= y_pred[:, :, [-6, -5]]
        # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred),
        # and likewise for cy.
        y_pred_decoded_raw[:, :, [-4, -3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]
        # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred).
        y_pred_decoded_raw[:, :, [-4, -3]] += y_pred[:, :, [-8, -7]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:, :, [-4, -2]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:, :, [-3, -1]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 3: Apply confidence thresholding and non-maximum suppression per class

    # The number of classes is the length of the last axis minus the four box coordinates.
    n_classes = y_pred_decoded_raw.shape[-1] - 4

    y_pred_decoded = []  # Store the final predictions in this list.
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`.
        pred = []  # Store the final predictions for this batch item here.
        for class_id in range(1, n_classes):  # For each class except the background class (which has class ID 0)...
            # ...keep only the confidences for that class plus the box coordinates,
            # making this an array of shape `[n_boxes, 5]`...
            single_class = batch_item[:, [class_id, -4, -3, -2, -1]]
            # ...and keep only those boxes with a confidence above the set threshold.
            threshold_met = single_class[single_class[:, 0] > confidence_thresh]
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                # ...perform NMS on them.
                maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners',
                                     border_pixels=border_pixels)
                # Expand the last dimension by one element to have room for the class ID.
                # This is now an array of shape `[n_boxes, 6]`.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))
                maxima_output[:, 0] = class_id  # Write the class ID to the first column...
                maxima_output[:, 1:] = maxima  # ...and write the maxima to the other columns...
                pred.append(maxima_output)  # ...and append the maxima for this class to the list for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred:  # If there are any predictions left after confidence-thresholding...
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k:  # If we have more than `top_k` results left at this point...
                # ...get the indices of the `top_k` highest-score maxima...
                top_k_indices = np.argpartition(pred[:, 1], kth=pred.shape[0] - top_k, axis=0)[pred.shape[0] - top_k:]
                pred = pred[top_k_indices]  # ...and keep only those entries of `pred`.
        else:
            pred = np.array(pred)  # Even if empty, `pred` must become a Numpy array.
        # Now that we're done, append the array of final predictions for this batch item to the output list.
        y_pred_decoded.append(pred)

    return y_pred_decoded
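A hypothetical end-to-end sketch for `decode_detections` (the trained `model`, the `batch_images` array, and the 300x300 input size are assumptions for illustration):

y_pred = model.predict(batch_images)  # Shape (batch_size, #boxes, #classes + 4 + 4 + 4).
y_decoded = decode_detections(y_pred,
                              confidence_thresh=0.5,
                              iou_threshold=0.45,
                              top_k=200,
                              input_coords='centroids',
                              normalize_coords=True,
                              img_height=300,
                              img_width=300)
for class_id, confidence, xmin, ymin, xmax, ymax in y_decoded[0]:
    print(int(class_id), confidence, xmin, ymin, xmax, ymax)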
Example No. 8
def decode_detections(y_pred,
                      confidence_thresh=0.01,
                      iou_threshold=0.45,
                      top_k=200,
                      input_coords='centroids',
                      normalize_coords=True,
                      img_height=None,
                      img_width=None,
                      border_pixels='half'):
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, "
            "the decoder needs the image size in order to decode the predictions, but `img_height == {}` "
            "and `img_width == {}`".format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    # Slice out the classes and the four offsets; shape `[batch, n_boxes, n_classes + 4 coordinates]`.
    y_pred_decoded_raw = np.copy(y_pred[:, :, :-8])

    if input_coords == 'centroids':
        # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor),
        # and likewise for h.
        y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]])
        # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred).
        y_pred_decoded_raw[:, :, [-2, -1]] *= y_pred[:, :, [-6, -5]]
        # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred),
        # and likewise for cy.
        y_pred_decoded_raw[:, :, [-4, -3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]
        # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred).
        y_pred_decoded_raw[:, :, [-4, -3]] += y_pred[:, :, [-8, -7]]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor)
        # for all four coordinates, where 'size' refers to w or h, respectively.
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]
        # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), and likewise for xmax.
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)
        # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), and likewise for ymax.
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)
        # delta(pred) + anchor == pred for all four coordinates.
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'."
        )

    # 2: If the model outputs normalized box coordinates, convert them back to absolute coordinates

    if normalize_coords:
        y_pred_decoded_raw[:, :, [-4, -2]] *= img_width   # Convert xmin, xmax back to absolute coordinates.
        y_pred_decoded_raw[:, :, [-3, -1]] *= img_height  # Convert ymin, ymax back to absolute coordinates.

    # 3: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 4  # The number of classes

    y_pred_decoded = []  # Store the final predictions in this list.
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`.
        pred = []  # Store the final predictions for this batch item here.
        for class_id in range(1, n_classes):
            single_class = batch_item[:, [class_id, -4, -3, -2, -1]]
            threshold_met = single_class[single_class[:, 0] > confidence_thresh]
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms(threshold_met,
                                     iou_threshold=iou_threshold,
                                     #coords='corners',
                                     border_pixels=border_pixels)  # ...perform NMS on them.
                # Expand the last dimension by one element for the class ID; shape `[n_boxes, 6]`.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))
                maxima_output[:, 0] = class_id
                maxima_output[:, 1:] = maxima
                pred.append(maxima_output)
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores.
        if pred:
            pred = np.concatenate(pred, axis=0)
            if top_k != 'all' and pred.shape[0] > top_k:  # If more than `top_k` results are left, filter them...
                # ...get the indices of the `top_k` highest-score maxima...
                top_k_indices = np.argpartition(pred[:, 1], kth=pred.shape[0] - top_k,
                                                axis=0)[pred.shape[0] - top_k:]
                pred = pred[top_k_indices]
        else:
            pred = np.array(pred)
        y_pred_decoded.append(pred)

    return y_pred_decoded
Example No. 9
def decode_detections_fast(y_pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_height=None,
                           img_width=None,
                           border_pixels='half'):
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(
                img_height, img_width))

    # 1: Convert the classes from one-hot encoding to their class ID
    # Slice out the four offset predictions plus two elements whereto we'll write
    # the class IDs and confidences in the next step.
    y_pred_converted = np.copy(y_pred[:, :, -14:-8])
    # The indices of the highest confidence values in the one-hot class vectors are the class IDs.
    y_pred_converted[:, :, 0] = np.argmax(y_pred[:, :, :-12], axis=-1)
    y_pred_converted[:, :, 1] = np.amax(y_pred[:, :, :-12], axis=-1)  # Store the confidence values themselves, too.

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        y_pred_converted[:, :, [4, 5]] = np.exp(y_pred_converted[:, :, [4, 5]] * y_pred[:, :, [-2, -1]])  # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_converted[:, :, [4, 5]] *= y_pred[:, :, [-6, -5]]  # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_converted[:, :, [2, 3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]  # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_converted[:, :, [2, 3]] += y_pred[:, :, [-8, -7]]  # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_converted[:, :, 2:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [4, 5]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :, 2:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_converted[:, :, 2:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 4]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [3, 5]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :, 2:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        y_pred_converted[:, :, [2, 4]] *= img_width  # Convert xmin, xmax back to absolute coordinates
        y_pred_converted[:, :, [3, 5]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    y_pred_decoded = []
    for batch_item in y_pred_converted:  # For each image in the batch...
        boxes = batch_item[np.nonzero(batch_item[:, 0])]  # ...get all boxes that don't belong to the background class,...
        boxes = boxes[boxes[:, 1] >= confidence_thresh]  # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        if iou_threshold:  # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners',
                                 border_pixels=border_pixels)  # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[0] > top_k:  # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0)[boxes.shape[0] - top_k:]  # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices]  # ...and keep only those boxes...
        y_pred_decoded.append(boxes)  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
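
Step 1 of `decode_detections_fast()` above collapses each box's one-hot class vector into a class ID (`np.argmax`) and a confidence (`np.amax`). A minimal sketch of that collapse on a toy array with two boxes and three classes, where class 0 is the background class (toy numbers, assumed layout):

import numpy as np

class_probs = np.array([[0.8, 0.1, 0.1],   # box 0: background wins
                        [0.2, 0.1, 0.7]])  # box 1: class 2 wins
class_ids = np.argmax(class_probs, axis=-1)    # -> [0 2]
confidences = np.amax(class_probs, axis=-1)    # -> [0.8 0.7]
# Boxes whose best class is the background (ID 0) are later dropped
# with `np.nonzero(class_ids)`, as in step 4 of the decoder.
print(np.nonzero(class_ids)[0])  # -> [1]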
Example No. 10
def decode_detections_debug(y_pred,
                            confidence_thresh=0.01,
                            iou_threshold=0.45,
                            top_k=200,
                            input_coords='centroids',
                            normalize_coords=True,
                            img_height=None,
                            img_width=None,
                            variance_encoded_in_target=False,
                            border_pixels='half'):
    '''
    This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
    predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.

    That is, in addition to the usual data, each predicted box has the internal index of that box within
    the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
    box prediction; in particular, it allows you to know which predictor layer made a given prediction.
    This can be useful for debugging.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
            positive class in order to be considered for the non-maximum suppression stage for the respective class.
            A lower value will result in a larger part of the selection process being done by the non-maximum suppression
            stage, while a larger value will result in a larger part of the selection process happening in the confidence
            thresholding stage.
        iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
            with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
            to the box score.
        top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
            non-maximum suppression stage.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
        a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`"
            .format(img_height, img_width))

    # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates

    y_pred_decoded_raw = np.copy(y_pred[:, :, :-8])  # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`

    if input_coords == 'centroids':
        if variance_encoded_in_target:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]] + y_pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        else:
            # Decode the predicted box center x and y coordinates.
            y_pred_decoded_raw[:, :, [-4, -3]] = y_pred_decoded_raw[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]] * y_pred[:, :, [-4, -3]] + y_pred[:, :, [-8, -7]]
            # Decode the predicted box width and height.
            y_pred_decoded_raw[:, :, [-2, -1]] = np.exp(y_pred_decoded_raw[:, :, [-2, -1]] * y_pred[:, :, [-2, -1]]) * y_pred[:, :, [-6, -5]]
        y_pred_decoded_raw = convert_coordinates(
            y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-2, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw,
                                                 start_index=-4,
                                                 conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_decoded_raw[:, :, -4:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_decoded_raw[:, :, [-4, -2]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_decoded_raw[:, :, [-3, -1]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_decoded_raw[:, :, -4:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'."
        )

    # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that

    if normalize_coords:
        y_pred_decoded_raw[:, :, [-4, -2]] *= img_width   # Convert xmin, xmax back to absolute coordinates
        y_pred_decoded_raw[:, :, [-3, -1]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 3: For each batch item, prepend each box's internal index to its coordinates.

    y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1],
                                    y_pred_decoded_raw.shape[2] + 1))  # Expand the last axis by one.
    y_pred_decoded_raw2[:, :, 1:] = y_pred_decoded_raw
    y_pred_decoded_raw2[:, :, 0] = np.arange(y_pred_decoded_raw.shape[1])  # Put the box indices as the first element for each box via broadcasting.
    y_pred_decoded_raw = y_pred_decoded_raw2

    # 4: Apply confidence thresholding and non-maximum suppression per class

    n_classes = y_pred_decoded_raw.shape[-1] - 5  # The number of classes is the length of the last axis minus the four box coordinates and minus the index

    y_pred_decoded = []  # Store the final predictions in this list
    for batch_item in y_pred_decoded_raw:  # `batch_item` has shape `[n_boxes, 1 index + n_classes + 4 coords]`
        pred = []  # Store the final predictions for this batch item here
        for class_id in range(1, n_classes):  # For each class except the background class (which has class ID 0)...
            single_class = batch_item[:, [0, class_id + 1, -4, -3, -2, -1]]  # ...keep only the box index plus the confidences for that class, making this an array of shape `[n_boxes, 6]`, and...
            threshold_met = single_class[single_class[:, 1] > confidence_thresh]  # ...keep only those boxes with a confidence above the set threshold.
            if threshold_met.shape[0] > 0:  # If any boxes made the threshold...
                maxima = _greedy_nms_debug(
                    threshold_met,
                    iou_threshold=iou_threshold,
                    coords='corners',
                    border_pixels=border_pixels)  # ...perform NMS on them.
                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1))  # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
                maxima_output[:, 0] = maxima[:, 0]  # Write the box index to the first column...
                maxima_output[:, 1] = class_id  # ...and write the class ID to the second column...
                maxima_output[:, 2:] = maxima[:, 1:]  # ...and write the rest of the maxima data to the other columns...
                pred.append(maxima_output)  # ...and append the maxima for this class to the list of maxima for this batch item.
        # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
        if pred:  # Guard against the case in which no box met the confidence threshold for any class.
            pred = np.concatenate(pred, axis=0)
            if pred.shape[0] > top_k:  # If we have more than `top_k` results left at this point (otherwise there is nothing to filter)...
                top_k_indices = np.argpartition(pred[:, 2], kth=pred.shape[0] - top_k, axis=0)[pred.shape[0] - top_k:]  # ...get the indices of the `top_k` highest-score maxima...
                pred = pred[top_k_indices]  # ...and keep only those entries of `pred`...
        else:
            pred = np.array(pred)
        y_pred_decoded.append(pred)  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
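
Step 3 of `decode_detections_debug()` above prepends each box's internal index in a single broadcast assignment: `np.arange(n_boxes)` has shape `(n_boxes,)` and broadcasts over the batch axis. A minimal standalone sketch of that trick (toy shapes):

import numpy as np

batch, n_boxes, n_last = 2, 4, 3
raw = np.random.rand(batch, n_boxes, n_last)
expanded = np.zeros((batch, n_boxes, n_last + 1))
expanded[:, :, 1:] = raw                # Copy the original data into columns 1 and up.
expanded[:, :, 0] = np.arange(n_boxes)  # Broadcasts the same 0..n_boxes-1 index column into every batch item.
print(expanded[0, :, 0])  # [0. 1. 2. 3.]
print(expanded[1, :, 0])  # [0. 1. 2. 3.]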
Example No. 11
def decode_detections_fast(y_pred,
                           confidence_thresh=0.5,
                           iou_threshold=0.45,
                           top_k='all',
                           input_coords='centroids',
                           normalize_coords=True,
                           img_height=None,
                           img_width=None,
                           border_pixels='half'):
    '''
    Convert model prediction output back to a format that contains only the positive box predictions
    (i.e. the same format that `encode_y()` takes as input).

    Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.

    Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
    For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
    all boxes for which the highest confidence is the background class. This results in less work for the subsequent
    non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
    their highest confidence is for the background class. It is much more efficient than the procedure of the original
    implementation, but the results may also differ.

    Arguments:
        y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
            of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
            boxes predicted by the model per image and the last axis contains
            `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
        confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
            class required for a given box to be considered a positive prediction. A lower value will result
            in better recall, while a higher value will result in better precision. Do not use this parameter with the
            goal to combat the inevitably many duplicates that an SSD will produce, the subsequent non-maximum suppression
            stage will take care of those.
        iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
            performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
            all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
            from the set of predictions, where 'maximal' refers to the box score.
        top_k (int, optional): Either 'all' or an integer specifying the number of highest scoring predictions to be kept
            for each batch item after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
        input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
            for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
            `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
        normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
            and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
            relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
            Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
            coordinates. Requires `img_height` and `img_width` if set to `True`.
        img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
        img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
            to the boxes. If 'exclude', the border pixels do not belong to the boxes.
            If 'half', then one of each of the two horizontal and vertical borders belongs
            to the boxes, but not the other.

    Returns:
        A python list of length `batch_size` where each list element represents the predicted boxes
        for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
    '''
    if normalize_coords and ((img_height is None) or (img_width is None)):
        raise ValueError(
            "If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`"
            .format(img_height, img_width))

    # 1: Convert the classes from one-hot encoding to their class ID
    y_pred_converted = np.copy(y_pred[:, :, -14:-8])  # Slice out the four offset predictions plus two elements whereto we'll write the class IDs and confidences in the next step
    y_pred_converted[:, :, 0] = np.argmax(y_pred[:, :, :-12], axis=-1)  # The indices of the highest confidence values in the one-hot class vectors are the class IDs
    y_pred_converted[:, :, 1] = np.amax(y_pred[:, :, :-12], axis=-1)  # Store the confidence values themselves, too

    # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
    if input_coords == 'centroids':
        y_pred_converted[:, :, [4, 5]] = np.exp(y_pred_converted[:, :, [4, 5]] * y_pred[:, :, [-2, -1]])  # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
        y_pred_converted[:, :, [4, 5]] *= y_pred[:, :, [-6, -5]]  # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
        y_pred_converted[:, :, [2, 3]] *= y_pred[:, :, [-4, -3]] * y_pred[:, :, [-6, -5]]  # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
        y_pred_converted[:, :, [2, 3]] += y_pred[:, :, [-8, -7]]  # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
        y_pred_converted = convert_coordinates(y_pred_converted,
                                               start_index=-4,
                                               conversion='centroids2corners')
    elif input_coords == 'minmax':
        y_pred_converted[:, :, 2:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 3]] *= np.expand_dims(y_pred[:, :, -7] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [4, 5]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -6], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :, 2:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
        y_pred_converted = convert_coordinates(y_pred_converted,
                                               start_index=-4,
                                               conversion='minmax2corners')
    elif input_coords == 'corners':
        y_pred_converted[:, :, 2:] *= y_pred[:, :, -4:]  # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
        y_pred_converted[:, :, [2, 4]] *= np.expand_dims(y_pred[:, :, -6] - y_pred[:, :, -8], axis=-1)  # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
        y_pred_converted[:, :, [3, 5]] *= np.expand_dims(y_pred[:, :, -5] - y_pred[:, :, -7], axis=-1)  # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
        y_pred_converted[:, :, 2:] += y_pred[:, :, -8:-4]  # delta(pred) + anchor == pred for all four coordinates
    else:
        raise ValueError(
            "Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'."
        )

    # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
    if normalize_coords:
        y_pred_converted[:, :, [2, 4]] *= img_width   # Convert xmin, xmax back to absolute coordinates
        y_pred_converted[:, :, [3, 5]] *= img_height  # Convert ymin, ymax back to absolute coordinates

    # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
    y_pred_decoded = []
    for batch_item in y_pred_converted:  # For each image in the batch...
        boxes = batch_item[np.nonzero(batch_item[:, 0])]  # ...get all boxes that don't belong to the background class,...
        boxes = boxes[boxes[:, 1] >= confidence_thresh]  # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
        if iou_threshold:  # ...if an IoU threshold is set...
            boxes = _greedy_nms2(boxes,
                                 iou_threshold=iou_threshold,
                                 coords='corners',
                                 border_pixels=border_pixels
                                 )  # ...perform NMS on the remaining boxes.
        if top_k != 'all' and boxes.shape[0] > top_k:  # If we have more than `top_k` results left at this point...
            top_k_indices = np.argpartition(boxes[:, 1], kth=boxes.shape[0] - top_k, axis=0)[boxes.shape[0] - top_k:]  # ...get the indices of the `top_k` highest-scoring boxes...
            boxes = boxes[top_k_indices]  # ...and keep only those boxes...
        y_pred_decoded.append(boxes)  # ...and now that we're done, append the array of final predictions for this batch item to the output list

    return y_pred_decoded
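
The 'centroids' branches above invert the training-time SSD offset encoding. The following self-contained sketch encodes a single ground-truth box against a single anchor and then applies the same decoding arithmetic to recover it; the box, anchor, and variance values are illustrative only:

import numpy as np

anchor = np.array([0.5, 0.5, 0.4, 0.3])    # cx, cy, w, h
gt = np.array([0.55, 0.45, 0.5, 0.25])     # the box we pretend the network predicts
variances = np.array([0.1, 0.1, 0.2, 0.2])

# Encode: the offsets a model would be trained to regress.
enc = np.array([(gt[0] - anchor[0]) / anchor[2] / variances[0],
                (gt[1] - anchor[1]) / anchor[3] / variances[1],
                np.log(gt[2] / anchor[2]) / variances[2],
                np.log(gt[3] / anchor[3]) / variances[3]])

# Decode: the inverse, mirroring the 'centroids' branch of the decoders above.
dec = np.array([enc[0] * variances[0] * anchor[2] + anchor[0],
                enc[1] * variances[1] * anchor[3] + anchor[1],
                np.exp(enc[2] * variances[2]) * anchor[2],
                np.exp(enc[3] * variances[3]) * anchor[3]])

assert np.allclose(dec, gt)  # The round trip recovers the original box exactly.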
Example No. 12
    def call(self, x, mask=None):
        '''
        Return an anchor box tensor based on the shape of the input tensor.
        The logic implemented here is identical to that in the module `ssd_box_encode_decode_utils.py`.
        Note that this tensor does not participate in any graph computation at runtime. It is created
        once as a constant during graph creation and is output at runtime along with the rest of the
        model's output. Hence, all of the logic is implemented as Numpy array operations, and it
        suffices to convert the final Numpy array into a Keras tensor before it is output.
        Arguments:
            x (tensor): A 4D tensor of shape `(batch, height, width, channels)`.
                The input for this layer must be the output of the localization predictor layer.
        '''
        # Compute box width and height for each aspect ratio. The shorter side of the image will be
        # used to compute `w` and `h` using `scale` and `aspect_ratios`.
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios.
        wh_list = []
        for ar in self.aspect_ratios:
            if (ar == 1):
                # Compute the regular anchor box for aspect ratio 1.
                box_height = box_width = self.this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    # Compute one slightly larger version using the geometric mean of this scale value and the next.
                    box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_height = self.this_scale * size / np.sqrt(ar)
                box_width = self.this_scale * size * np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)

        # We need the shape of the input tensor.
        batch_size, feature_map_height, feature_map_width, feature_map_channels = x.shape

        # Compute the grid of box center points. It is identical for all aspect ratios.
        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        step_height = self.this_steps
        step_width = self.this_steps

        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        offset_height = self.this_offsets
        offset_width = self.this_offsets

        # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + int(feature_map_height) - 1) * step_height,
                         int(feature_map_height))
        cx = np.linspace(offset_width * step_width,
                         (offset_width + int(feature_map_width) - 1) * step_width,
                         int(feature_map_width))
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)  # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1)  # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`.
        boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        # If `normalize_coords` is enabled, normalize the coordinates to be within [0, 1]
        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='corners2centroids',
                                           border_pixels='half')

        # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same
        # shape as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
        variances_tensor = np.zeros_like(boxes_tensor)  # Shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        variances_tensor += self.variances
        # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
        boxes_tensor = np.concatenate((boxes_tensor, variances_tensor),
                                      axis=-1)

        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along that dimension.
        # The result is a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`.
        boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
        boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'),
                              (K.shape(x)[0], 1, 1, 1, 1))

        return boxes_tensor
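
The center-point grid that `call()` above builds with `np.linspace` and `np.meshgrid` can be looked at in isolation. A minimal sketch for a hypothetical 3x4 feature map, a step of 8 pixels, and an offset of half a cell (all values illustrative):

import numpy as np

feature_map_height, feature_map_width = 3, 4
step_height = step_width = 8.0
offset = 0.5  # the first center sits half a step away from the image border

cy = np.linspace(offset * step_height,
                 (offset + feature_map_height - 1) * step_height,
                 feature_map_height)  # [ 4. 12. 20.]
cx = np.linspace(offset * step_width,
                 (offset + feature_map_width - 1) * step_width,
                 feature_map_width)   # [ 4. 12. 20. 28.]
cx_grid, cy_grid = np.meshgrid(cx, cy)  # Both have shape (3, 4): one (cx, cy) pair per cell.
print(cx_grid)
print(cy_grid)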
Example No. 13
    def generate_anchor_boxes_for_layer(self,
                                        feature_map_size,
                                        aspect_ratios,
                                        this_scale,
                                        next_scale,
                                        this_steps,
                                        this_offsets,
                                        diagnostics=False):
        size = min(self.img_height, self.img_width)
        # Compute the box widths and heights for all aspect ratios
        wh_list = []
        for ar in aspect_ratios:
            if (ar == 1):
                box_height = box_width = this_scale * size
                wh_list.append((box_width, box_height))
                if self.two_boxes_for_ar1:
                    box_height = box_width = np.sqrt(
                        this_scale * next_scale) * size
                    wh_list.append((box_width, box_height))
            else:
                box_width = this_scale * size * np.sqrt(ar)
                box_height = this_scale * size / np.sqrt(ar)
                wh_list.append((box_width, box_height))
        wh_list = np.array(wh_list)
        n_boxes = len(wh_list)

        # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
        step_height = this_steps
        step_width = this_steps

        # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
        offset_height = this_offsets
        offset_width = this_offsets

        # Compute the grid of anchor box center points.
        cy = np.linspace(offset_height * step_height,
                         (offset_height + feature_map_size[0] - 1) *
                         step_height, feature_map_size[0])
        cx = np.linspace(offset_width * step_width,
                         (offset_width + feature_map_size[1] - 1) * step_width,
                         feature_map_size[1])
        cx_grid, cy_grid = np.meshgrid(cx, cy)
        cx_grid = np.expand_dims(cx_grid, -1)  # This is necessary for np.tile() to do what we want further down
        cy_grid = np.expand_dims(cy_grid, -1)  # This is necessary for np.tile() to do what we want further down

        # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
        # where the last dimension will contain `(cx, cy, w, h)`.
        boxes_tensor = np.zeros(
            (feature_map_size[0], feature_map_size[1], n_boxes, 4))

        boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes))  # Set cx
        boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes))  # Set cy
        boxes_tensor[:, :, :, 2] = wh_list[:, 0]  # Set w
        boxes_tensor[:, :, :, 3] = wh_list[:, 1]  # Set h

        # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='centroids2corners')

        if self.normalize_coords:
            boxes_tensor[:, :, :, [0, 2]] /= self.img_width
            boxes_tensor[:, :, :, [1, 3]] /= self.img_height

        # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
        boxes_tensor = convert_coordinates(boxes_tensor,
                                           start_index=0,
                                           conversion='corners2centroids',
                                           border_pixels='half')

        if diagnostics:
            return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
        else:
            return boxes_tensor
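
Both anchor box generators above round-trip through `convert_coordinates()`, which is imported from elsewhere in the project and not shown in this listing. As a point of reference, here is a hypothetical re-implementation of just the 'centroids2corners' direction, assuming it performs the standard arithmetic:

import numpy as np

def centroids2corners(boxes):
    '''Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)` along the last axis.'''
    corners = np.empty_like(boxes)
    corners[..., 0] = boxes[..., 0] - boxes[..., 2] / 2.0  # xmin = cx - w/2
    corners[..., 1] = boxes[..., 1] - boxes[..., 3] / 2.0  # ymin = cy - h/2
    corners[..., 2] = boxes[..., 0] + boxes[..., 2] / 2.0  # xmax = cx + w/2
    corners[..., 3] = boxes[..., 1] + boxes[..., 3] / 2.0  # ymax = cy + h/2
    return corners

print(centroids2corners(np.array([[10.0, 10.0, 4.0, 2.0]])))  # [[ 8.  9. 12. 11.]]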
Example No. 14
    def __call__(self, ground_truth_labels, diagnostics=False):
        # Mapping to define which indices represent which coordinate in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        y_encoded = self.generate_encoding_template(batch_size=batch_size,
                                                    diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################
        y_encoded[:, :, self.background_id] = 1  # All boxes are background by default.
        class_vectors = np.eye(self.n_classes)  # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size):
            if ground_truth_labels[i].size == 0: continue
            labels = ground_truth_labels[i].astype(np.float64)

            # Normalize the box coordinates if necessary.
            if self.normalize_coords:
                labels[:, [ymin, ymax]] /= self.img_height
                labels[:, [xmin, xmax]] /= self.img_width

            # Maybe convert the box coordinate format.
            labels = convert_coordinates(labels,
                                         start_index=xmin,
                                         conversion='corners2centroids')

            classes_one_hot = class_vectors[labels[:, class_id].astype(int)]
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin, ymin, xmax, ymax]]], axis=-1)

            similarities = iou(labels[:, [xmin, ymin, xmax, ymax]],
                               y_encoded[i, :, -12:-8])

            # For each ground truth box, get the anchor box that matches it best.
            bipartite_matches = match_bipartite_greedy(
                weight_matrix=similarities)

            # Write the ground truth box data to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchors to zero so they won't be matched again.
            similarities[:, bipartite_matches] = 0

            if self.matching_type == 'multi':  # Perform multi-matching.
                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities,
                                      threshold=self.pos_iou_threshold)

                # Write the ground truth box data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchors to zero so they won't be matched again.
                similarities[:, matches[1]] = 0

            # Unmatched anchor boxes that still overlap some ground truth box by at least `neg_iou_limit`
            # are made neutral: they are neither background nor positive examples.
            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert the box coordinates to anchor box offsets.
        ##################################################################################
        y_encoded[:, :, [-12, -11]] -= y_encoded[:, :, [-8, -7]]  # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
        y_encoded[:, :, [-12, -11]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:, :, [-4, -3]]  # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
        y_encoded[:, :, [-10, -9]] /= y_encoded[:, :, [-6, -5]]  # w(gt) / w(anchor), h(gt) / h(anchor)
        y_encoded[:, :, [-10, -9]] = np.log(y_encoded[:, :, [-10, -9]]) / y_encoded[:, :, [-2, -1]]  # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)

        if diagnostics:
            y_matched_anchors = np.copy(y_encoded)
            y_matched_anchors[:, :, -12:-8] = 0
            return y_encoded, y_matched_anchors
        else:
            return y_encoded  # Shape: `[batch_size, total_boxes, [one-hot class label, 4 ground truth box offsets, 4 anchor box coordinates, 4 variances]]`
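
`match_bipartite_greedy()` above is imported from elsewhere in the project. A minimal sketch of greedy bipartite matching on a toy IoU matrix, under the assumption that it has the usual semantics (rows are ground truth boxes, columns are anchors, and each ground truth box claims the globally best remaining anchor):

import numpy as np

def greedy_bipartite_match(weight_matrix):
    '''Return, for each ground truth box (row), the index of its matched anchor (column).'''
    weights = np.copy(weight_matrix).astype(float)
    num_gt = weights.shape[0]
    matches = np.zeros(num_gt, dtype=int)
    for _ in range(num_gt):
        # Pick the overall best remaining (ground truth, anchor) pair...
        gt_idx, anchor_idx = np.unravel_index(np.argmax(weights), weights.shape)
        matches[gt_idx] = anchor_idx
        # ...then remove that row and column from further consideration.
        weights[gt_idx, :] = -1.0
        weights[:, anchor_idx] = -1.0
    return matches

iou_matrix = np.array([[0.9, 0.2, 0.1],
                       [0.8, 0.3, 0.6]])
print(greedy_bipartite_match(iou_matrix))  # [0 2]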