def _build(self, image, gt_boxes=None, is_training=False):

        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)

        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(
            tf.expand_dims(image, 0), is_training=is_training
        )

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors, self._config.model.rpn,
            debug=self._debug, seed=self._seed
        )
        if self._with_rcnn:
            self._rcnn = RCNN(
                self._num_classes, self._config.model.rcnn,
                debug=self._debug, seed=self._seed
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(
            conv_feature_map, 'conv_feature_map', 'reduced'
        )

        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map, image_shape, all_anchors,
            gt_boxes=gt_boxes, is_training=is_training
        )

        prediction_dict = {
            'rpn_prediction': rpn_prediction,
        }

        if self._debug:
            prediction_dict['image'] = image
            prediction_dict['image_shape'] = image_shape
            prediction_dict['all_anchors'] = all_anchors
            prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference
            )
            if gt_boxes is not None:
                prediction_dict['gt_boxes'] = gt_boxes
            prediction_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction['proposals'])
            classification_pred = self._rcnn(
                conv_feature_map, proposals,
                image_shape, self.base_network,
                gt_boxes=gt_boxes, is_training=is_training
            )

            prediction_dict['classification_prediction'] = classification_pred

        return prediction_dict
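
The `_generate_anchors` call above is not shown in this example. A minimal NumPy sketch of the usual shifted-anchor construction it performs (the function name and the 16-pixel feature stride are assumptions, not taken from the original):

import numpy as np

def generate_shifted_anchors(anchor_reference, feature_height,
                             feature_width, stride=16):
    """Tile the reference anchors over every feature-map location.

    Returns an array of shape
    (feature_height * feature_width * num_anchors, 4), encoded as
    (x1, y1, x2, y2) in image coordinates.
    """
    shift_x = np.arange(feature_width) * stride
    shift_y = np.arange(feature_height) * stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # One (x1, y1, x2, y2) shift per feature-map location.
    shifts = np.stack(
        [shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel()],
        axis=1,
    )
    # Broadcast (num_locations, 1, 4) + (1, num_anchors, 4).
    all_anchors = (shifts[:, np.newaxis, :]
                   + anchor_reference[np.newaxis, :, :])
    return all_anchors.reshape([-1, 4])
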
Example #2
def clip_gradients_by_norm(grads_and_vars, add_to_summary=True):
    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(grad, 'grad/{}'.format(var.name[:-2]),
                                   'full')
                variable_summaries(tf.abs(grad),
                                   'grad/abs/{}'.format(var.name[:-2]), 'full')

    # Clip by norm. Grad can be null when not training some modules.
    with tf.name_scope('clip_gradients_by_norm'):
        grads_and_vars = [(tf.check_numerics(tf.clip_by_norm(gv[0], 10.),
                                             'Invalid gradient'),
                           gv[1]) if gv[0] is not None else gv
                          for gv in grads_and_vars]

    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(grad,
                                   'clipped_grad/{}'.format(var.name[:-2]),
                                   'full')
                variable_summaries(tf.abs(grad),
                                   'clipped_grad/abs/{}'.format(var.name[:-2]),
                                   'full')

    return grads_and_vars
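
A hedged usage sketch for the function above (TF 1.x graph mode; the stand-in loss is an assumption so the snippet is self-contained):

import tensorflow as tf

# Minimal stand-in loss so the sketch can be run end to end.
w = tf.get_variable('w', shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w - 1.0))

optimizer = tf.train.MomentumOptimizer(learning_rate=1e-3, momentum=0.9)
grads_and_vars = optimizer.compute_gradients(loss)

# Clip the gradients before applying them (summaries off, since
# variable_summaries is an external helper).
grads_and_vars = clip_gradients_by_norm(grads_and_vars, add_to_summary=False)

train_op = optimizer.apply_gradients(
    grads_and_vars, global_step=tf.train.get_or_create_global_step()
)
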
Example #3
def clip_gradients_by_norm(grads_and_vars, add_to_summary=False):
    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(grad, "grad/{}".format(var.name[:-2]), "full")
                variable_summaries(
                    tf.abs(grad), "grad/abs/{}".format(var.name[:-2]), "full"
                )

    # Clip by norm. Grad can be null when not training some modules.
    with tf.name_scope("clip_gradients_by_norm"):
        grads_and_vars = [
            (tf.check_numerics(tf.clip_by_norm(gv[0], 10.0), "Invalid gradient"), gv[1])
            if gv[0] is not None
            else gv
            for gv in grads_and_vars
        ]

    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(
                    grad, "clipped_grad/{}".format(var.name[:-2]), "full"
                )
                variable_summaries(
                    tf.abs(grad), "clipped_grad/{}".format(var.name[:-2]), "full"
                )

    return grads_and_vars
Example #4
def clip_gradients_by_norm(grads_and_vars, add_to_summary=True):
    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(
                    grad, 'grad/{}'.format(var.name[:-2]), 'full'
                )
                variable_summaries(
                    tf.abs(grad), 'grad/abs/{}'.format(var.name[:-2]), 'full'
                )

    # Clip by norm. Grad can be null when not training some modules.
    with tf.name_scope('clip_gradients_by_norm'):
        grads_and_vars = [
            (
                tf.check_numerics(
                    tf.clip_by_norm(gv[0], 10.),
                    'Invalid gradient'
                ), gv[1]
            )
            if gv[0] is not None else gv
            for gv in grads_and_vars
        ]

    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(
                    grad, 'clipped_grad/{}'.format(var.name[:-2]), 'full'
                )
                variable_summaries(
                    tf.abs(grad),
                    'clipped_grad/abs/{}'.format(var.name[:-2]),
                    'full'
                )

    return grads_and_vars
Example #5
    def _build(self, conv_feature_map, proposals, im_shape, base_network,
               gt_boxes=None, is_training=False):
        """
        Classifies & refines proposals based on the pooled feature map.

        Args:
            conv_feature_map: The feature map of the image, extracted
                using the pretrained network.
                Shape: (num_proposals, pool_height, pool_width, 512).
            proposals: A Tensor with the bounding boxes proposed by the RPN.
                Shape: (total_num_proposals, 4).
                Encoding: (x1, y1, x2, y2).
            im_shape: A Tensor with the shape of the image in the form of
                (image_height, image_width).
            gt_boxes (optional): A Tensor with the ground truth boxes of the
                image.
                Shape: (total_num_gt, 5).
                Encoding: (x1, y1, x2, y2, label).
            is_training (optional): A boolean indicating whether the module
                is being used for training or for inference.

        Returns:
            prediction_dict: a dict with the object predictions.
                It should have the keys:
                objects:
                labels:
                probs:

                rcnn:
                target:

        """
        self._instantiate_layers()

        prediction_dict = {'_debug': {}}

        if gt_boxes is not None:
            proposals_target, bbox_offsets_target = self._rcnn_target(
                proposals, gt_boxes)

            if is_training:
                with tf.name_scope('prepare_batch'):
                    # We flatten to set shape, but it is already a flat Tensor.
                    in_batch_proposals = tf.reshape(
                        tf.greater_equal(proposals_target, 0), [-1]
                    )
                    proposals = tf.boolean_mask(
                        proposals, in_batch_proposals)
                    bbox_offsets_target = tf.boolean_mask(
                        bbox_offsets_target, in_batch_proposals)
                    proposals_target = tf.boolean_mask(
                        proposals_target, in_batch_proposals)

            prediction_dict['target'] = {
                'cls': proposals_target,
                'bbox_offsets': bbox_offsets_target,
            }

        roi_prediction = self._roi_pool(proposals, conv_feature_map, im_shape)

        if self._debug:
            # Save raw roi prediction in debug mode.
            prediction_dict['_debug']['roi'] = roi_prediction

        pooled_features = roi_prediction['roi_pool']
        features = base_network._build_tail(
            pooled_features, is_training=is_training
        )

        if self._use_mean:
            # We avg our height and width dimensions for a more
            # "memory-friendly" Tensor.
            features = tf.reduce_mean(features, [1, 2])

        # We treat num proposals as batch number so that when flattening we
        # get a (num_proposals, flatten_pooled_feature_map_size) Tensor.
        flatten_features = tf.contrib.layers.flatten(features)
        net = tf.identity(flatten_features)

        if is_training:
            net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        if self._debug:
            prediction_dict['_debug']['flatten_net'] = net

        # After flattening we are left with a Tensor of shape
        # (num_proposals, pool_height * pool_width * 512).
        # The first dimension works as batch size when applied to snt.Linear.
        for i, layer in enumerate(self._layers):
            # Through FC layer.
            net = layer(net)

            # Apply activation and dropout.
            variable_summaries(
                net, 'fc_{}_preactivationout'.format(i), 'reduced'
            )
            net = self._activation(net)
            if self._debug:
                prediction_dict['_debug']['layer_{}_out'.format(i)] = net

            variable_summaries(net, 'fc_{}_out'.format(i), 'reduced')
            if is_training:
                net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        cls_score = self._classifier_layer(net)
        cls_prob = tf.nn.softmax(cls_score, axis=1)
        bbox_offsets = self._bbox_layer(net)

        prediction_dict['rcnn'] = {
            'cls_score': cls_score,
            'cls_prob': cls_prob,
            'bbox_offsets': bbox_offsets,
        }

        # Get the final object proposals based on the probability, the
        # offsets and the original proposals.
        proposals_pred = self._rcnn_proposal(
            proposals, bbox_offsets, cls_prob, im_shape)

        # objects, objects_labels, and objects_labels_prob are the only keys
        # that matter for drawing objects.
        prediction_dict['objects'] = proposals_pred['objects']
        prediction_dict['labels'] = proposals_pred['proposal_label']
        prediction_dict['probs'] = proposals_pred['proposal_label_prob']

        prediction_dict["without_filter_dict"] = proposals_pred["without_filter_dict"]

        if self._debug:
            prediction_dict['_debug']['proposal'] = proposals_pred

        # Calculate summaries for results
        variable_summaries(cls_prob, 'cls_prob', 'reduced')
        variable_summaries(bbox_offsets, 'bbox_offsets', 'reduced')

        if self._debug:
            variable_summaries(pooled_features, 'pooled_features', 'full')
            layer_summaries(self._classifier_layer, 'full')
            layer_summaries(self._bbox_layer, 'full')

        return prediction_dict
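
`self._roi_pool` is not shown in this example. Implementations in this style commonly approximate RoI pooling with `tf.image.crop_and_resize`; a minimal sketch under that assumption (the helper name and the 2x oversample-then-max-pool detail are illustrative, not taken from the original):

import tensorflow as tf

def roi_pool(proposals, conv_feature_map, im_shape, pooled_size=7):
    """Crop each proposal from the feature map and resize to a fixed grid.

    proposals: (num_proposals, 4) in (x1, y1, x2, y2) image coordinates.
    conv_feature_map: (1, feature_height, feature_width, depth).
    im_shape: (image_height, image_width).
    """
    x1, y1, x2, y2 = tf.unstack(proposals, axis=1)
    im_shape = tf.cast(im_shape, tf.float32)
    # crop_and_resize expects normalized (y1, x1, y2, x2) boxes.
    boxes = tf.stack(
        [y1 / im_shape[0], x1 / im_shape[1],
         y2 / im_shape[0], x2 / im_shape[1]],
        axis=1,
    )
    box_indices = tf.zeros([tf.shape(proposals)[0]], dtype=tf.int32)
    crops = tf.image.crop_and_resize(
        conv_feature_map, boxes, box_indices,
        crop_size=[pooled_size * 2, pooled_size * 2],
    )
    # Max-pool the 2x oversampled crop down to (pooled_size, pooled_size).
    return {'roi_pool': tf.nn.max_pool(
        crops, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')}
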
Example #6
    def _build(self, image, gt_boxes=None, is_training=False):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the module is used for
                training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2)
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.

        # Set rank and last dimension before using base network
        # TODO: Why does it lose information when using a queue?
        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(tf.expand_dims(image, 0),
                                             is_training=is_training)

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors,
            self._config.model.rpn,
            debug=self._debug,
            seed=self._seed,
        )
        if self._with_rcnn:
            # The RCNN submodule which classifies RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(
                self._num_classes,
                self._config.model.rcnn,
                debug=self._debug,
                seed=self._seed,
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(conv_feature_map, "conv_feature_map", "reduced")

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map,
            image_shape,
            all_anchors,
            gt_boxes=gt_boxes,
            is_training=is_training,
        )

        prediction_dict = {
            "rpn_prediction": rpn_prediction,
        }

        if self._debug:
            prediction_dict["image"] = image
            prediction_dict["image_shape"] = image_shape
            prediction_dict["all_anchors"] = all_anchors
            prediction_dict["anchor_reference"] = tf.convert_to_tensor(
                self._anchor_reference)
            if gt_boxes is not None:
                prediction_dict["gt_boxes"] = gt_boxes
            prediction_dict["conv_feature_map"] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction["proposals"])
            classification_pred = self._rcnn(
                conv_feature_map,
                proposals,
                image_shape,
                self.base_network,
                gt_boxes=gt_boxes,
                is_training=is_training,
            )

            prediction_dict["classification_prediction"] = classification_pred

        return prediction_dict
Example #7
    def _build(self, image, gt_boxes=None, is_training=False):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the module is used for
                training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2)
        """

        #### Use variable_scope to split BodyDetector and PartDetector.

        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.

        # Set rank and last dimension before using base network
        # TODO: Why does it lose information when using a queue?
        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(
            tf.expand_dims(image, 0), is_training=is_training
        )

        C4 = conv_feature_map
        with tf.variable_scope("C5"):
            C5 = self.iter_unify_layer(C4, is_training=is_training)
            #C5 = self.unify_layer(C4, is_training=is_training)

        with tf.variable_scope("Head_body_part"):
            Head_body_part = self.iter_unify_layer(C5, is_training=is_training)
            #Head_body_part = self.unify_layer(C5, is_training=is_training)

        with tf.variable_scope("Head_hf_part"):
            Head_hf_part = self.iter_unify_layer(C5, is_training=is_training)
            #Head_hf_part = self.unify_layer(C5, is_training=is_training)

        with tf.variable_scope("Head_hf_part_conv"):
            Head_hf_part_conv = self.iter_unify_layer(
                Head_hf_part, is_training=is_training
            )

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors, self._config.model.rpn,
            debug=self._debug, seed=self._seed
        )

        if self._with_rcnn:
            # The RCNN submodule which classifies RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(
                self._num_classes, self._config.model.rcnn,
                debug=self._debug, seed=self._seed,
                name="__rcnn__1"
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(
            conv_feature_map, 'conv_feature_map', 'reduced'
        )

        # Generate anchors for the image based on the anchor reference.
        all_anchors_1 = self._generate_anchors(tf.shape(conv_feature_map))

        rpn_1_prediction = self._rpn(
            conv_feature_map, image_shape, all_anchors_1,
            gt_boxes=gt_boxes, is_training=is_training
        )

        prediction_1_dict = {
            'rpn_prediction': rpn_1_prediction,
        }

        if self._debug:
            prediction_1_dict['image'] = image
            prediction_1_dict['image_shape'] = image_shape
            prediction_1_dict['all_anchors'] = all_anchors_1
            prediction_1_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference
            )
            if gt_boxes is not None:
                prediction_1_dict['gt_boxes'] = gt_boxes
            prediction_1_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_1_prediction['proposals'])

            rpn_1_proposals = proposals

            classification_pred = self._rcnn(
                Head_body_part, proposals,
                image_shape, self.base_network,
                gt_boxes=gt_boxes, is_training=is_training
            )

            #### retrieve req from classification_pred
            without_filter_dict = classification_pred["without_filter_dict"]

            objects_1_all = without_filter_dict["objects"]
            labels_1_all = without_filter_dict["proposal_label"]
            probs_1_all = without_filter_dict["proposal_label_prob"]

            objects_1 = classification_pred["objects"]
            labels_1 = classification_pred["labels"]
            probs_1 = classification_pred["probs"]

            prediction_1_dict['objects'] = objects_1
            prediction_1_dict['labels'] = labels_1
            prediction_1_dict['probs'] = probs_1

            # Rank boxes so main-part labels come first, ties broken by
            # probability (see the NumPy illustration after this example).
            main_part_mask = tf.cast(
                1 - tf.sign(tf.abs(labels_1_all - self._main_part_label)),
                dtype=tf.float32)
            top_indices = tf.nn.top_k(
                main_part_mask + probs_1_all,
                k=tf.shape(labels_1_all)[0]).indices

            objects_1_sorted = tf.gather(objects_1_all, top_indices)
            filter_num = tf.minimum(tf.shape(objects_1_sorted)[0], 7)

            objects_1_filtered = tf.slice(
                objects_1_sorted, begin=[0, 0], size=[filter_num, 4])
            #### Expand with the label: [?, 4] -> [?, 5]
            main_part_label_col = tf.fill(
                [tf.shape(objects_1_filtered)[0], 1],
                value=tf.convert_to_tensor(self._main_part_label, tf.float32))
            objects_1_filtered = tf.concat(
                [objects_1_filtered, main_part_label_col], axis=-1)

            prediction_1_dict['classification_prediction'] = classification_pred

            if gt_boxes is not None:
                body_feature_ground_truth = self.generate_PartDetector_features(
                    input_image=image, input_feature=Head_hf_part,
                    gt_boxes=gt_boxes, only_main_part_boxes=False
                )
                # Non-main-part ground truth plus the filtered main-part
                # predictions.
                non_main_part_gt = tf.gather(gt_boxes, tf.reshape(
                    tf.where(tf.not_equal(gt_boxes[:, -1],
                                          self._main_part_label)), [-1]))
                body_feature_pred = self.generate_PartDetector_features(
                    input_image=image, input_feature=Head_hf_part,
                    gt_boxes=tf.concat([non_main_part_gt, objects_1_filtered],
                                       axis=0),
                    only_main_part_boxes=False
                )
            else:
                body_feature_ground_truth = None
                body_feature_pred = self.generate_PartDetector_features(
                    input_image=image, input_feature=Head_hf_part,
                    gt_boxes=objects_1_filtered,
                    only_main_part_boxes=True
                )

            #### Used as a fake placeholder.
            if gt_boxes is not None:
                body_feature_pred = tf.reshape(
                    body_feature_pred,
                    [-1, tf.shape(body_feature_ground_truth)[-1]])
            else:
                body_feature_pred = tf.reshape(body_feature_pred, [-1, 147461])

            #### Unstack along the first dim and "map-reduce" it through the
            #### modified Faster R-CNN; the per-feature "decoder" handles
            #### label remapping of the input ground truth.
            (fixed_sliced_size,
             PartDetector_feature_stacked) = self.padding_and_slice_PartDetector_features(
                body_pred_feature=body_feature_pred,
                body_ground_truth_feature=body_feature_ground_truth)
            PartDetector_feature_stacked = tf.slice(
                PartDetector_feature_stacked,
                begin=[0, 0], size=[fixed_sliced_size, -1])

            if gt_boxes is not None:
                # During training, shuffle and keep a single feature.
                PartDetector_feature_stacked = tf.gather(
                    PartDetector_feature_stacked,
                    tf.random_shuffle(tf.range(fixed_sliced_size)))
                PartDetector_feature_stacked = tf.reshape(
                    PartDetector_feature_stacked, [fixed_sliced_size, -1])
                PartDetector_feature_unstacked = [
                    PartDetector_feature_stacked[0, ...]]
            else:
                PartDetector_feature_unstacked = tf.unstack(
                    PartDetector_feature_stacked, axis=0)
            partdetector_dict_list = []

            for single_partdetector_feature in PartDetector_feature_unstacked:
                # only_main_part_boxes is constant within each branch.
                if gt_boxes is not None:
                    (main_part_ori_bbox, cropped_feature,
                     cropped_bboxes) = self.decode_single_unstacked_feature(
                        input_feature=single_partdetector_feature,
                        only_main_part_boxes=False)
                else:
                    main_part_ori_bbox, cropped_feature = \
                        self.decode_single_unstacked_feature(
                            input_feature=single_partdetector_feature,
                            only_main_part_boxes=True)
                    cropped_bboxes = None

                x1, y1, x2, y2, _ = tf.split(main_part_ori_bbox, 5)
                x1, y1, x2, y2 = [tf.cast(tf.reshape(t, []), tf.int32)
                                  for t in (x1, y1, x2, y2)]

                cropped_image = tf.image.crop_to_bounding_box(
                    image=image, offset_height=y1, offset_width=x1,
                    target_height=y2 - y1 + 1, target_width=x2 - x1 + 1)
                cropped_feature = tf.expand_dims(cropped_feature, 0)

                input_feature = Head_hf_part_conv
                image_h, image_w = tf.split(tf.shape(image)[0:2],
                                            num_or_size_splits=2)
                feature_h, feature_w = tf.split(tf.shape(input_feature)[1:3],
                                                num_or_size_splits=2)

                t4 = [x1, y1, x2, y2]
                # Map the image-space box to feature-map coordinates.
                def _scale(coord, image_dim, feature_dim):
                    ratio = (tf.cast(coord, tf.float32)
                             / tf.cast(image_dim, tf.float32)
                             * tf.cast(feature_dim, tf.float32))
                    return tf.reshape(tf.cast(ratio, tf.int32), [])

                Head_hf_part_conv = tf.slice(
                    input_feature,
                    begin=[0,
                           _scale(t4[1], image_h, feature_h),
                           _scale(t4[0], image_w, feature_w),
                           0],
                    size=[-1,
                          _scale(t4[3] - t4[1], image_h, feature_h),
                          _scale(t4[2] - t4[0], image_w, feature_w),
                          256]
                )

                #### Head_hf_part_conv without cropping, to test the
                #### efficiency.
                partdetector_dict = self.partdetetor(
                    conv_feature_map=cropped_feature,
                    Head_hf_part_conv=Head_hf_part_conv,
                    image=cropped_image,
                    gt_boxes=cropped_bboxes,
                    is_training=is_training)

                partdetector_dict["main_info"] = {
                    "image": image,
                    "main_part_ori_bbox": main_part_ori_bbox
                }

                partdetector_dict_list.append(partdetector_dict)

            return [prediction_1_dict] + partdetector_dict_list
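
The ranking trick near the top of this example adds 1.0 to the score of every box whose label equals `self._main_part_label`, so main-part boxes always outrank the rest regardless of probability. A tiny runnable NumPy illustration (the label value 5 is arbitrary):

import numpy as np

labels = np.array([2., 5., 5., 3.])      # assume 5 is the main-part label
probs = np.array([0.9, 0.4, 0.7, 0.8])

main_part_mask = 1 - np.sign(np.abs(labels - 5.))  # 1 where label == 5
ranking_score = main_part_mask + probs
order = np.argsort(-ranking_score)
print(order)  # -> [2 1 0 3]: main-part boxes first, then by probability
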
Example #8
    def _build(self, image, gt_boxes=None, is_training=True):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(1, height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the module is used for
                training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2)
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.
        conv_feature_map = self.base_network(image, is_training=is_training)

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(self._num_anchors,
                        self._config.rpn,
                        debug=self._debug,
                        seed=self._seed)
        if self._with_rcnn:
            # The RCNN submodule which classifies RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(self._num_classes,
                              self._config.rcnn,
                              debug=self._debug,
                              seed=self._seed)

        image_shape = tf.shape(image)[1:3]

        variable_summaries(conv_feature_map, 'conv_feature_map', ['rpn'])

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(conv_feature_map,
                                   image_shape,
                                   all_anchors,
                                   gt_boxes=gt_boxes)

        prediction_dict = {
            'rpn_prediction': rpn_prediction,
        }

        if self._debug:
            prediction_dict['image'] = image
            prediction_dict['image_shape'] = image_shape
            prediction_dict['all_anchors'] = all_anchors
            prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference)
            prediction_dict['gt_boxes'] = gt_boxes
            prediction_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            classification_pred = self._rcnn(conv_feature_map,
                                             rpn_prediction['proposals'],
                                             image_shape,
                                             gt_boxes=gt_boxes,
                                             is_training=is_training)

            prediction_dict['classification_prediction'] = classification_pred

        return prediction_dict
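
Note that, unlike the earlier examples, this variant takes an already batched image of shape `(1, height, width, 3)`, which is why it slices the spatial shape with `[1:3]` instead of `[0:2]`. A small sketch of the difference (the placeholder shapes are illustrative):

import tensorflow as tf

batched = tf.placeholder(tf.float32, [1, None, None, 3])
unbatched = tf.placeholder(tf.float32, [None, None, 3])

batched_hw = tf.shape(batched)[1:3]      # as in this example
unbatched_hw = tf.shape(unbatched)[0:2]  # as in the unbatched examples above
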
Example #9
File: rpn.py  Project: czbiohub/luminoth
    def _build(self,
               conv_feature_map,
               im_shape,
               all_anchors,
               gt_boxes=None,
               is_training=False):
        """Builds the RPN model subgraph.

        Args:
            conv_feature_map: A Tensor with the output of some pretrained
                network. Its dimensions should be
                `[1, feature_map_height, feature_map_width, depth]` where depth
                is 512 for the default layer in VGG and 1024 for the default
                layer in ResNet.
            im_shape: A Tensor with the shape of the original image.
            all_anchors: A Tensor with all the anchor bounding boxes. Its shape
                should be
                [feature_map_height * feature_map_width * total_anchors, 4]
            gt_boxes: A Tensor with the ground-truth boxes for the image.
                Its dimensions should be `[total_gt_boxes, 5]`, and it should
                consist of [x1, y1, x2, y2, label], being (x1, y1) -> top left
                point, and (x2, y2) -> bottom right point of the bounding box.

        Returns:
            prediction_dict: A dict with the following keys:
                proposals: A Tensor with a variable number of proposals for
                    objects on the image.
                scores: A Tensor with an "objectness" probability for each
                    proposal. The score should be the output of the softmax for
                    object.

                If training is True, then some more Tensors are added to the
                prediction dictionary to be used for calculating the loss.

                rpn_cls_prob: A Tensor with the probability of being
                    background and foreground for each anchor.
                rpn_cls_score: A Tensor with the cls score of being background
                    and foreground for each anchor (the input for the softmax).
                rpn_bbox_pred: A Tensor with the bounding box regression for
                    each anchor.
                rpn_cls_target: A Tensor with the target for each of the
                    anchors. The shape is [num_anchors,].
                rpn_bbox_target: A Tensor with the bbox target for each of
                    the anchors. Anchors that are ignored for the target
                    still get a bbox target, filled with zeroes.
        """
        # We start with a common conv layer applied to the feature map.
        self._instantiate_layers()
        self._proposal = RPNProposal(self._num_anchors,
                                     self._config.proposals,
                                     debug=self._debug)
        self._anchor_target = RPNTarget(self._num_anchors,
                                        self._config.target,
                                        seed=self._seed)

        prediction_dict = {}

        # Get the RPN feature using a simple conv net. Activation function
        # can be set to empty.
        rpn_conv_feature = self._rpn(conv_feature_map)
        rpn_feature = self._rpn_activation(rpn_conv_feature)

        # Then we apply separate conv layers for classification and regression.
        rpn_cls_score_original = self._rpn_cls(rpn_feature)
        rpn_bbox_pred_original = self._rpn_bbox(rpn_feature)
        # rpn_cls_score_original has shape (1, H, W, num_anchors * 2)
        # rpn_bbox_pred_original has shape (1, H, W, num_anchors * 4)
        # where H, W are height and width of the pretrained feature map.

        # Convert (flatten) `rpn_cls_score_original` which has two scalars per
        # anchor per location to be able to apply softmax.
        rpn_cls_score = tf.reshape(rpn_cls_score_original, [-1, 2])
        # Now that `rpn_cls_score` has shape (H * W * num_anchors, 2), we apply
        # softmax to the last dim.
        rpn_cls_prob = tf.nn.softmax(rpn_cls_score)

        prediction_dict["rpn_cls_prob"] = rpn_cls_prob
        prediction_dict["rpn_cls_score"] = rpn_cls_score

        # Flatten bounding box delta prediction for easy manipulation.
        # We end up with `rpn_bbox_pred` having shape (H * W * num_anchors, 4).
        rpn_bbox_pred = tf.reshape(rpn_bbox_pred_original, [-1, 4])

        prediction_dict["rpn_bbox_pred"] = rpn_bbox_pred

        # We have to convert bbox deltas to usable bounding boxes and remove
        # redundant ones using Non Maximum Suppression (NMS).
        proposal_prediction = self._proposal(rpn_cls_prob, rpn_bbox_pred,
                                             all_anchors, im_shape)

        prediction_dict["proposals"] = proposal_prediction["proposals"]
        prediction_dict["scores"] = proposal_prediction["scores"]

        if self._debug:
            prediction_dict["proposal_prediction"] = proposal_prediction

        if gt_boxes is not None:
            # When training we use a separate module to calculate the target
            # values we want to output.
            (rpn_cls_target, rpn_bbox_target,
             rpn_max_overlap) = self._anchor_target(all_anchors, gt_boxes,
                                                    im_shape)

            prediction_dict["rpn_cls_target"] = rpn_cls_target
            prediction_dict["rpn_bbox_target"] = rpn_bbox_target

            if self._debug:
                prediction_dict["rpn_max_overlap"] = rpn_max_overlap
                variable_summaries(rpn_bbox_target, "rpn_bbox_target", "full")

        # Variable summaries.
        variable_summaries(prediction_dict["scores"], "rpn_scores", "reduced")
        variable_summaries(rpn_cls_prob, "rpn_cls_prob", "reduced")
        variable_summaries(rpn_bbox_pred, "rpn_bbox_pred", "reduced")

        if self._debug:
            variable_summaries(rpn_feature, "rpn_feature", "full")
            variable_summaries(rpn_cls_score_original,
                               "rpn_cls_score_original", "full")
            variable_summaries(rpn_bbox_pred_original,
                               "rpn_bbox_pred_original", "full")

            # Layer summaries.
            layer_summaries(self._rpn, "full")
            layer_summaries(self._rpn_cls, "full")
            layer_summaries(self._rpn_bbox, "full")

        return prediction_dict
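
`RPNProposal` is not shown here. Conceptually it decodes the bbox deltas against the anchors, clips the boxes to the image, and filters them with NMS. A minimal sketch of that pipeline (the function name is an assumption, and the real module also does score ordering and top-N filtering):

import tensorflow as tf

def decode_and_nms(rpn_cls_prob, rpn_bbox_pred, all_anchors, im_shape,
                   max_proposals=300, nms_threshold=0.7):
    """Turn anchor deltas into scored, NMS-filtered proposals."""
    # Anchor widths, heights and centers, from (x1, y1, x2, y2).
    wa = all_anchors[:, 2] - all_anchors[:, 0] + 1.0
    ha = all_anchors[:, 3] - all_anchors[:, 1] + 1.0
    cxa = all_anchors[:, 0] + wa / 2.0
    cya = all_anchors[:, 1] + ha / 2.0

    # Standard Faster R-CNN decoding of the predicted deltas.
    dx, dy, dw, dh = tf.unstack(rpn_bbox_pred, axis=1)
    cx = dx * wa + cxa
    cy = dy * ha + cya
    w = tf.exp(dw) * wa
    h = tf.exp(dh) * ha

    # Back to corners, clipped to the image.
    im_shape = tf.cast(im_shape, tf.float32)
    x1 = tf.clip_by_value(cx - w / 2.0, 0.0, im_shape[1] - 1.0)
    y1 = tf.clip_by_value(cy - h / 2.0, 0.0, im_shape[0] - 1.0)
    x2 = tf.clip_by_value(cx + w / 2.0, 0.0, im_shape[1] - 1.0)
    y2 = tf.clip_by_value(cy + h / 2.0, 0.0, im_shape[0] - 1.0)

    proposals = tf.stack([x1, y1, x2, y2], axis=1)
    scores = rpn_cls_prob[:, 1]  # Foreground ("object") probability.

    # NMS expects (y1, x1, y2, x2) boxes.
    keep = tf.image.non_max_suppression(
        tf.stack([y1, x1, y2, x2], axis=1), scores,
        max_output_size=max_proposals, iou_threshold=nms_threshold)
    return {'proposals': tf.gather(proposals, keep),
            'scores': tf.gather(scores, keep)}
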
Example #10
File: rcnn.py  Project: lartpang/luminoth
    def _build(self,
               conv_feature_map,
               proposals,
               im_shape,
               base_network,
               gt_boxes=None,
               is_training=False):
        """
        Classifies & refines proposals based on the pooled feature map.

        Args:
            conv_feature_map: The feature map of the image, extracted
                using the pretrained network.
                Shape: (num_proposals, pool_height, pool_width, 512).
                The convolutional feature map; with VGG this is 14x14x512.
            proposals: A Tensor with the bounding boxes proposed by the RPN.
                Shape: (total_num_proposals, 4).
                Encoding: (x1, y1, x2, y2).
            im_shape: A Tensor with the shape of the image in the form of
                (image_height, image_width).
            gt_boxes (optional): A Tensor with the ground truth boxes of the
                image.
                Shape: (total_num_gt, 5).
                Encoding: (x1, y1, x2, y2, label).
            is_training (optional): A boolean indicating whether the module
                is being used for training or for inference.

        Returns:
            prediction_dict: a dict with the object predictions.
                It should have the keys:
                objects:
                labels:
                probs:

                rcnn:
                target:

        """
        self._instantiate_layers()

        prediction_dict = {'_debug': {}}

        # Ground truth is available.
        if gt_boxes is not None:
            # Get the proposal targets and the bbox offset targets.
            proposals_target, bbox_offsets_target = self._rcnn_target(
                proposals, gt_boxes)

            if is_training:
                with tf.name_scope('prepare_batch'):
                    # We flatten to set shape, but it is already a flat Tensor.
                    # Boolean mask that is True for proposals whose target
                    # is >= 0; each proposal maps to a single class.
                    in_batch_proposals = tf.reshape(
                        tf.greater_equal(proposals_target, 0), [-1])
                    # Keep only the in-batch proposals.
                    proposals = tf.boolean_mask(proposals, in_batch_proposals)
                    # Keep their ground-truth bbox offsets/scales relative to
                    # the anchors.
                    bbox_offsets_target = tf.boolean_mask(
                        bbox_offsets_target, in_batch_proposals)
                    # Keep their ground-truth class targets.
                    proposals_target = tf.boolean_mask(proposals_target,
                                                       in_batch_proposals)

            prediction_dict['target'] = {
                'cls': proposals_target,
                'bbox_offsets': bbox_offsets_target,
            }

        # RoI-pool the feature map using the proposals.
        roi_prediction = self._roi_pool(proposals, conv_feature_map, im_shape)

        if self._debug:
            # Save raw roi prediction in debug mode.
            prediction_dict['_debug']['roi'] = roi_prediction

        pooled_features = roi_prediction['roi_pool']
        features = base_network._build_tail(pooled_features,
                                            is_training=is_training)

        # Global average pooling over the H and W dimensions before the last
        # fully connected layers.
        if self._use_mean:
            # We avg our height and width dimensions for a more
            # "memory-friendly" Tensor.
            features = tf.reduce_mean(features, [1, 2])

        # We treat num proposals as batch number so that when flattening we
        # get a (num_proposals, flatten_pooled_feature_map_size) Tensor.
        # flatten keeps the batch dimension, yielding a 2-D Tensor.
        flatten_features = tf.contrib.layers.flatten(features)
        net = tf.identity(flatten_features)

        # During training, apply dropout after RoI pooling.
        if is_training:
            net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        if self._debug:
            prediction_dict['_debug']['flatten_net'] = net

        # After flattening we are left with a Tensor of shape
        # (num_proposals, pool_height * pool_width * 512).
        # The first dimension works as batch size when applied to snt.Linear.
        # Build the stack of RCNN fully connected layers.
        for i, layer in enumerate(self._layers):
            # Through FC layer.
            net = layer(net)

            # Apply activation and dropout.
            variable_summaries(net, 'fc_{}_preactivationout'.format(i),
                               'reduced')
            # The activation is relu6: min(max(features, 0), 6)
            net = self._activation(net)
            if self._debug:
                prediction_dict['_debug']['layer_{}_out'.format(i)] = net

            variable_summaries(net, 'fc_{}_out'.format(i), 'reduced')
            if is_training:
                net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        # Softmax classification branch.
        cls_score = self._classifier_layer(net)
        cls_prob = tf.nn.softmax(cls_score, axis=1)
        # Box regression branch.
        bbox_offsets = self._bbox_layer(net)

        prediction_dict['rcnn'] = {
            'cls_score': cls_score,
            'cls_prob': cls_prob,
            'bbox_offsets': bbox_offsets,
        }

        # Get the final object proposals based on the probability, the
        # offsets and the original proposals.
        proposals_pred = self._rcnn_proposal(proposals, bbox_offsets, cls_prob,
                                             im_shape)

        # objects, objects_labels, and objects_labels_prob are the only keys
        # that matter for drawing objects.
        prediction_dict['objects'] = proposals_pred['objects']
        prediction_dict['labels'] = proposals_pred['proposal_label']
        prediction_dict['probs'] = proposals_pred['proposal_label_prob']

        if self._debug:
            prediction_dict['_debug']['proposal'] = proposals_pred

        # Calculate summaries for results
        variable_summaries(cls_prob, 'cls_prob', 'reduced')
        variable_summaries(bbox_offsets, 'bbox_offsets', 'reduced')

        if self._debug:
            variable_summaries(pooled_features, 'pooled_features', 'full')
            layer_summaries(self._classifier_layer, 'full')
            layer_summaries(self._bbox_layer, 'full')

        return prediction_dict
Example #11
    def _build(self, conv_feature_map, proposals, im_shape, base_network,
               gt_boxes=None, is_training=False):
        """
        Classifies & refines proposals based on the pooled feature map.

        Args:
            conv_feature_map: The feature map of the image, extracted
                using the pretrained network.
                Shape: (num_proposals, pool_height, pool_width, 512).
            proposals: A Tensor with the bounding boxes proposed by the RPN.
                Shape: (total_num_proposals, 4).
                Encoding: (x1, y1, x2, y2).
            im_shape: A Tensor with the shape of the image in the form of
                (image_height, image_width).
            gt_boxes (optional): A Tensor with the ground truth boxes of the
                image.
                Shape: (total_num_gt, 5).
                Encoding: (x1, y1, x2, y2, label).
            is_training (optional): A boolean indicating whether the module
                is being used for training or for inference.

        Returns:
            prediction_dict: a dict with the object predictions.
                It should have the keys:
                objects:
                labels:
                probs:

                rcnn:
                target:

        """
        self._instantiate_layers()

        prediction_dict = {'_debug': {}}

        if gt_boxes is not None:
            proposals_target, bbox_offsets_target = self._rcnn_target(
                proposals, gt_boxes)

            if is_training:
                with tf.name_scope('prepare_batch'):
                    # We flatten to set shape, but it is already a flat Tensor.
                    in_batch_proposals = tf.reshape(
                        tf.greater_equal(proposals_target, 0), [-1]
                    )
                    proposals = tf.boolean_mask(
                        proposals, in_batch_proposals)
                    bbox_offsets_target = tf.boolean_mask(
                        bbox_offsets_target, in_batch_proposals)
                    proposals_target = tf.boolean_mask(
                        proposals_target, in_batch_proposals)

            prediction_dict['target'] = {
                'cls': proposals_target,
                'bbox_offsets': bbox_offsets_target,
            }

        roi_prediction = self._roi_pool(proposals, conv_feature_map, im_shape)

        if self._debug:
            # Save raw roi prediction in debug mode.
            prediction_dict['_debug']['roi'] = roi_prediction

        pooled_features = roi_prediction['roi_pool']
        features = base_network._build_tail(
            pooled_features, is_training=is_training
        )

        if self._use_mean:
            # We avg our height and width dimensions for a more
            # "memory-friendly" Tensor.
            features = tf.reduce_mean(features, [1, 2])

        # We treat num proposals as batch number so that when flattening we
        # get a (num_proposals, flatten_pooled_feature_map_size) Tensor.
        flatten_features = tf.contrib.layers.flatten(features)
        net = tf.identity(flatten_features)

        if is_training:
            net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        if self._debug:
            prediction_dict['_debug']['flatten_net'] = net

        # After flattening we are left with a Tensor of shape
        # (num_proposals, pool_height * pool_width * 512).
        # The first dimension works as batch size when applied to snt.Linear.
        for i, layer in enumerate(self._layers):
            # Through FC layer.
            net = layer(net)

            # Apply activation and dropout.
            variable_summaries(
                net, 'fc_{}_preactivationout'.format(i), 'reduced'
            )
            net = self._activation(net)
            if self._debug:
                prediction_dict['_debug']['layer_{}_out'.format(i)] = net

            variable_summaries(net, 'fc_{}_out'.format(i), 'reduced')
            if is_training:
                net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        cls_score = self._classifier_layer(net)
        cls_prob = tf.nn.softmax(cls_score, axis=1)
        bbox_offsets = self._bbox_layer(net)

        prediction_dict['rcnn'] = {
            'cls_score': cls_score,
            'cls_prob': cls_prob,
            'bbox_offsets': bbox_offsets,
        }

        # Get the final object proposals based on the probability, the
        # offsets and the original proposals.
        proposals_pred = self._rcnn_proposal(
            proposals, bbox_offsets, cls_prob, im_shape)

        # objects, objects_labels, and objects_labels_prob are the only keys
        # that matter for drawing objects.
        prediction_dict['objects'] = proposals_pred['objects']
        prediction_dict['labels'] = proposals_pred['proposal_label']
        prediction_dict['probs'] = proposals_pred['proposal_label_prob']

        if self._debug:
            prediction_dict['_debug']['proposal'] = proposals_pred

        # Calculate summaries for results
        variable_summaries(cls_prob, 'cls_prob', 'reduced')
        variable_summaries(bbox_offsets, 'bbox_offsets', 'reduced')

        if self._debug:
            variable_summaries(pooled_features, 'pooled_features', 'full')
            layer_summaries(self._classifier_layer, 'full')
            layer_summaries(self._bbox_layer, 'full')

        return prediction_dict
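
The `prepare_batch` masking in these RCNN examples relies on the target encoding implied by the `tf.greater_equal(proposals_target, 0)` mask: negative targets (conventionally -1) mark proposals left out of the minibatch, 0 is background, and positive values are class ids. A tiny runnable NumPy illustration of the same masking:

import numpy as np

proposals_target = np.array([-1, 0, 3, -1, 1])
proposals = np.arange(20).reshape(5, 4)

in_batch = proposals_target >= 0
print(proposals[in_batch])         # rows 1, 2 and 4 survive
print(proposals_target[in_batch])  # -> [0 3 1]
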
Example #12
File: rpn.py  Project: lartpang/luminoth
    def _build(self, conv_feature_map, im_shape, all_anchors,
               gt_boxes=None, is_training=False):
        """Builds the RPN model subgraph.

        Args:
            conv_feature_map: A Tensor with the output of some pretrained
                network. Its dimensions should be
                `[1, feature_map_height, feature_map_width, depth]` where depth
                is 512 for the default layer in VGG and 1024 for the default
                layer in ResNet.
                With VGG, this is the conv5-3 feature map, with 512 channels.
            im_shape: A Tensor with the shape of the original image.
                The original image size, likely needed to determine the
                downscaling ratio.
            all_anchors: A Tensor with all the anchor bounding boxes. Its shape
                should be
                [feature_map_height * feature_map_width * total_anchors, 4]
            gt_boxes: A Tensor with the ground-truth boxes for the image.
                Its dimensions should be `[total_gt_boxes, 5]`, and it should
                consist of [x1, y1, x2, y2, label], being (x1, y1) -> top left
                point, and (x2, y2) -> bottom right point of the bounding box.

        Returns:
            prediction_dict: A dict with the following keys:
                proposals: A Tensor with a variable number of proposals for
                    objects on the image.
                scores: A Tensor with a "objectness" probability for each
                    proposal. The score should be the output of the softmax for
                    object.

                If training is True, then some more Tensors are added to the
                prediction dictionary to be used for calculating the loss.
                (These values are required when calculating the loss.)

                rpn_cls_prob: A Tensor with the probability of being
                    background and foreground for each anchor.
                    (Less important than the raw scores below.)
                rpn_cls_score: A Tensor with the cls score of being background
                    and foreground for each anchor (the input for the softmax).
                rpn_bbox_pred: A Tensor with the bounding box regression for
                    each anchor.
                rpn_cls_target: A Tensor with the target for each of the
                    anchors. The shape is [num_anchors,].
                    (Annotator's note: it is unclear what 'target' refers to
                    here.)
                rpn_bbox_target: A Tensor with the bbox target for each of
                    the anchors. Anchors that are ignored for the target
                    still get a bbox target, filled with zeroes.

        """
        # We start with a common conv layer applied to the feature map.
        # Then the RPN splits into its regression and classification branches.
        self._instantiate_layers()
        # Converts anchors plus RPN predictions into object proposals on the
        # original image.
        self._proposal = RPNProposal(
            self._num_anchors, self._config.proposals, debug=self._debug
        )

        # Tuple of the tensors of:
        #     labels: (1, 0, -1) for each anchor.
        #         Shape (num_anchors, 1)
        #     bbox_targets: 4d bbox targets as specified by paper.
        #         Shape (num_anchors, 4)
        #         Ground-truth offsets/scales relative to in-image anchors.
        #     max_overlaps: Max IoU overlap with ground truth boxes.
        #         Shape (num_anchors, 1)
        #         (Per anchor: the ground-truth box with the largest IoU.)
        self._anchor_target = RPNTarget(
            self._num_anchors, self._config.target, seed=self._seed
        )

        prediction_dict = {}

        # Get the RPN feature using a simple conv net. Activation function
        # can be set to empty.
        # Output of the first 3x3 conv, followed by a ReLU activation.
        rpn_conv_feature = self._rpn(conv_feature_map)
        rpn_feature = self._rpn_activation(rpn_conv_feature)

        # Then we apply separate conv layers for classification and regression.
        # Class scores and bbox predictions each come from a single 1x1 conv.
        rpn_cls_score_original = self._rpn_cls(rpn_feature)
        rpn_bbox_pred_original = self._rpn_bbox(rpn_feature)
        # rpn_cls_score_original has shape (1, H, W, num_anchors * 2)
        # rpn_bbox_pred_original has shape (1, H, W, num_anchors * 4)
        # where H, W are height and width of the pretrained feature map.
        # The 3x3 (padding=1) and 1x1 convs preserve H and W, which must stay
        # aligned with the original feature map.

        # Convert (flatten) `rpn_cls_score_original` which has two scalars per
        # anchor per location to be able to apply softmax.
        # Each anchor keeps its two values, foreground and background
        # (object / non-object).
        rpn_cls_score = tf.reshape(rpn_cls_score_original, [-1, 2])
        # Now that `rpn_cls_score` has shape (H * W * num_anchors, 2), we apply
        # softmax to the last dim.
        rpn_cls_prob = tf.nn.softmax(rpn_cls_score)

        # Store the results.
        prediction_dict['rpn_cls_prob'] = rpn_cls_prob
        prediction_dict['rpn_cls_score'] = rpn_cls_score

        # As above, flatten the per-anchor predictions.
        # Flatten bounding box delta prediction for easy manipulation.
        # We end up with `rpn_bbox_pred` having shape (H * W * num_anchors, 4).
        rpn_bbox_pred = tf.reshape(rpn_bbox_pred_original, [-1, 4])

        # Store the results.
        prediction_dict['rpn_bbox_pred'] = rpn_bbox_pred

        # We have to convert bbox deltas to usable bounding boxes and remove
        # redundant ones using Non Maximum Suppression (NMS).
        # All of this happens inside self._proposal; its output is the
        # adjusted set of surviving proposals.
        proposal_prediction = self._proposal(
            rpn_cls_prob, rpn_bbox_pred, all_anchors, im_shape)

        # Store the results.
        prediction_dict['proposals'] = proposal_prediction['proposals']
        prediction_dict['scores'] = proposal_prediction['scores']

        if self._debug:
            prediction_dict['proposal_prediction'] = proposal_prediction

        if gt_boxes is not None:
            # When training we use a separate module to calculate the target
            # values we want to output.
            # (That is, how the ground-truth boxes map onto the anchors.)
            # labels, bbox_targets, max_overlap
            (rpn_cls_target, rpn_bbox_target,
             rpn_max_overlap) = self._anchor_target(
                all_anchors, gt_boxes, im_shape
            )

            # Store the results.
            prediction_dict['rpn_cls_target'] = rpn_cls_target
            prediction_dict['rpn_bbox_target'] = rpn_bbox_target

            # Q: what does rpn_max_overlap represent?
            # A: for each anchor, its max IoU overlap with the ground-truth
            #    boxes.
            if self._debug:
                prediction_dict['rpn_max_overlap'] = rpn_max_overlap
                variable_summaries(rpn_bbox_target, 'rpn_bbox_target', 'full')

        # Variable summaries, used for visualization.
        variable_summaries(prediction_dict['scores'], 'rpn_scores', 'reduced')
        variable_summaries(rpn_cls_prob, 'rpn_cls_prob', 'reduced')
        variable_summaries(rpn_bbox_pred, 'rpn_bbox_pred', 'reduced')

        if self._debug:
            variable_summaries(rpn_feature, 'rpn_feature', 'full')
            variable_summaries(
               rpn_cls_score_original, 'rpn_cls_score_original', 'full')
            variable_summaries(
               rpn_bbox_pred_original, 'rpn_bbox_pred_original', 'full')

            # Layer summaries.
            layer_summaries(self._rpn, 'full')
            layer_summaries(self._rpn_cls, 'full')
            layer_summaries(self._rpn_bbox, 'full')

        return prediction_dict
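
On the annotator's question about `rpn_cls_target`: the target is the 1/0/-1 label that RPNTarget assigns to each anchor from its IoU with the ground truth. A minimal NumPy sketch of the usual assignment rule (the 0.7/0.3 thresholds are the common Faster R-CNN defaults, an assumption here; the real module also subsamples anchors to a fixed minibatch):

import numpy as np

def rpn_cls_targets(ious, fg_threshold=0.7, bg_threshold=0.3):
    """Label anchors from a (num_anchors, num_gt) IoU matrix.

    1 = foreground, 0 = background, -1 = ignored by the loss.
    """
    max_iou = ious.max(axis=1)
    labels = np.full(ious.shape[0], -1, dtype=np.int32)
    labels[max_iou < bg_threshold] = 0
    labels[max_iou >= fg_threshold] = 1
    # The best-matching anchor for each gt box is always foreground.
    labels[ious.argmax(axis=0)] = 1
    return labels
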
Example #13
        def valid_conclusion(gt_boxes):
            if gt_boxes is not None:
                gt_boxes = tf.cast(gt_boxes, tf.float32)
            # A Tensor with the feature map for the image,
            # its shape should be `(feature_height, feature_width, 512)`.
            # The shape depends on the pretrained network in use.

            # Set rank and last dimension before using base network
            # TODO: Why does it lose information when using a queue?
            image.set_shape((None, None, 3))

            # The RPN submodule which generates proposals of objects.
            self._rpn = RPN(
                self._num_anchors, self._config.model.rpn,
                debug=self._debug, seed=self._seed
            )
            if self._with_rcnn:
                # The RCNN submodule which classifies RPN's proposals and
                # classifies them as background or a specific class.
                self._rcnn = RCNN(
                    self._num_classes, self._config.model.rcnn,
                    debug=self._debug, seed=self._seed
                )

            image_shape = tf.shape(image)[0:2]

            variable_summaries(
                conv_feature_map, 'conv_feature_map', 'reduced'
            )

            # Generate anchors for the image based on the anchor reference.
            all_anchors = self._generate_anchors(tf.shape(conv_feature_map))


            rpn_prediction = self._rpn(
                conv_feature_map, image_shape, all_anchors,
                gt_boxes=gt_boxes, is_training=is_training
            )

            prediction_dict["debug"] = (image, gt_boxes)

            prediction_dict["rpn_prediction"] = rpn_prediction

            if self._debug:
                prediction_dict['image'] = image
                prediction_dict['image_shape'] = image_shape
                prediction_dict['all_anchors'] = all_anchors
                prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                    self._anchor_reference
                )
                if gt_boxes is not None:
                    prediction_dict['gt_boxes'] = gt_boxes
                prediction_dict['conv_feature_map'] = conv_feature_map

            if self._with_rcnn:
                proposals = tf.stop_gradient(rpn_prediction['proposals'])

                classification_pred = self._rcnn(
                    conv_feature_map, proposals,
                    image_shape, self.base_network,
                    gt_boxes=gt_boxes, is_training=is_training
                )

                prediction_dict['classification_prediction'] = classification_pred

            return prediction_dict
예제 #14
0
    def _build(self, conv_feature_map, im_shape, all_anchors,
               gt_boxes=None, is_training=False):
        """Builds the RPN model subgraph.

        Args:
            conv_feature_map: A Tensor with the output of some pretrained
                network. Its dimensions should be
                `[1, feature_map_height, feature_map_width, depth]` where depth
                is 512 for the default layer in VGG and 1024 for the default
                layer in ResNet.
            im_shape: A Tensor with the shape of the original image.
            all_anchors: A Tensor with all the anchor bounding boxes. Its shape
                should be
                [feature_map_height * feature_map_width * total_anchors, 4]
            gt_boxes: A Tensor with the ground-truth boxes for the image.
                Its dimensions should be `[total_gt_boxes, 5]`, each row being
                `[x1, y1, x2, y2, label]`, where (x1, y1) is the top-left and
                (x2, y2) the bottom-right corner of the bounding box.

        Returns:
            prediction_dict: A dict with the following keys:
                proposals: A Tensor with a variable number of proposals for
                    objects on the image.
                scores: A Tensor with an "objectness" probability for each
                    proposal: the softmax output for the object class.

                If training is True, then some more Tensors are added to the
                prediction dictionary to be used for calculating the loss.

                rpn_cls_prob: A Tensor with the probability of being
                    background and foreground for each anchor.
                rpn_cls_score: A Tensor with the cls score of being background
                    and foreground for each anchor (the input for the softmax).
                rpn_bbox_pred: A Tensor with the bounding box regression for
                    each anchor.
                rpn_cls_target: A Tensor with the target for each of the
                    anchors. The shape is [num_anchors,].
                rpn_bbox_target: A Tensor with the bbox regression target for
                    each of the anchors. Ignored anchors still get a bbox
                    target; in that case it is filled with zeros.
        """
        # We start with a common conv layer applied to the feature map.
        self._instantiate_layers()
        self._proposal = RPNProposal(
            self._num_anchors, self._config.proposals, debug=self._debug
        )
        self._anchor_target = RPNTarget(
            self._num_anchors, self._config.target, seed=self._seed
        )

        prediction_dict = {}

        # Get the RPN feature using a simple conv net. The activation
        # function is configurable and can be left empty.
        rpn_conv_feature = self._rpn(conv_feature_map)
        rpn_feature = self._rpn_activation(rpn_conv_feature)
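        # In the original Faster R-CNN this first layer is a 3x3 convolution
        # followed by a ReLU; here both the kernel size and the activation
        # come from the config, so treat that only as the typical setup.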

        # Then we apply separate conv layers for classification and regression.
        rpn_cls_score_original = self._rpn_cls(rpn_feature)
        rpn_bbox_pred_original = self._rpn_bbox(rpn_feature)
        # rpn_cls_score_original has shape (1, H, W, num_anchors * 2)
        # rpn_bbox_pred_original has shape (1, H, W, num_anchors * 4)
        # where H, W are height and width of the pretrained feature map.

        # Flatten `rpn_cls_score_original`, which has two scalars per anchor
        # per location, so that we can apply softmax.
        rpn_cls_score = tf.reshape(rpn_cls_score_original, [-1, 2])
        # Now that `rpn_cls_score` has shape (H * W * num_anchors, 2), we apply
        # softmax to the last dim.
        rpn_cls_prob = tf.nn.softmax(rpn_cls_score)
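        # Note that the reshape relies on the last axis holding the two
        # scores of each anchor contiguously, so row i of `rpn_cls_score`
        # corresponds to anchor i in the same (H, W, anchor) ordering used
        # for `all_anchors`.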

        prediction_dict['rpn_cls_prob'] = rpn_cls_prob
        prediction_dict['rpn_cls_score'] = rpn_cls_score

        # Flatten bounding box delta prediction for easy manipulation.
        # We end up with `rpn_bbox_pred` having shape (H * W * num_anchors, 4).
        rpn_bbox_pred = tf.reshape(rpn_bbox_pred_original, [-1, 4])

        prediction_dict['rpn_bbox_pred'] = rpn_bbox_pred

        # We have to convert bbox deltas to usable bounding boxes and remove
        # redundant ones using Non Maximum Suppression (NMS).
        proposal_prediction = self._proposal(
            rpn_cls_prob, rpn_bbox_pred, all_anchors, im_shape)

        prediction_dict['proposals'] = proposal_prediction['proposals']
        prediction_dict['scores'] = proposal_prediction['scores']

        if self._debug:
            prediction_dict['proposal_prediction'] = proposal_prediction

        if gt_boxes is not None:
            # When training we use a separate module to calculate the target
            # values we want to output.
            (rpn_cls_target, rpn_bbox_target,
             rpn_max_overlap) = self._anchor_target(
                all_anchors, gt_boxes, im_shape
            )

            prediction_dict['rpn_cls_target'] = rpn_cls_target
            prediction_dict['rpn_bbox_target'] = rpn_bbox_target

            if self._debug:
                prediction_dict['rpn_max_overlap'] = rpn_max_overlap
                variable_summaries(rpn_bbox_target, 'rpn_bbox_target', 'full')

        # Variable summaries.
        variable_summaries(prediction_dict['scores'], 'rpn_scores', 'reduced')
        variable_summaries(rpn_cls_prob, 'rpn_cls_prob', 'reduced')
        variable_summaries(rpn_bbox_pred, 'rpn_bbox_pred', 'reduced')

        if self._debug:
            variable_summaries(rpn_feature, 'rpn_feature', 'full')
            variable_summaries(
                rpn_cls_score_original, 'rpn_cls_score_original', 'full')
            variable_summaries(
                rpn_bbox_pred_original, 'rpn_bbox_pred_original', 'full')

            # Layer summaries.
            layer_summaries(self._rpn, 'full')
            layer_summaries(self._rpn_cls, 'full')
            layer_summaries(self._rpn_bbox, 'full')

        return prediction_dict
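Since the flatten-then-softmax step above is where shape bugs usually creep
in, here is a minimal, self-contained sketch of it (hypothetical sizes and a
TF1-style session; this is not part of the model code):

import numpy as np
import tensorflow as tf

# Hypothetical sizes, for illustration only.
H, W, num_anchors = 2, 3, 3

# A fake cls score map as produced by the 1x1 conv: two scores
# (background, foreground) per anchor per spatial position.
rpn_cls_score_original = tf.constant(
    np.random.randn(1, H, W, num_anchors * 2), dtype=tf.float32
)

# Flatten to (H * W * num_anchors, 2) so the softmax runs over the
# background/foreground pair of every anchor.
rpn_cls_score = tf.reshape(rpn_cls_score_original, [-1, 2])
rpn_cls_prob = tf.nn.softmax(rpn_cls_score)

with tf.Session() as sess:
    probs = sess.run(rpn_cls_prob)

print(probs.shape)        # (18, 2) == (H * W * num_anchors, 2)
print(probs.sum(axis=1))  # every row sums to (approximately) 1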