Example #1
        def f(batched_inputs, c2_inputs, c2_results):
            image_sizes = [[int(im[0]), int(im[1])]
                           for im in c2_inputs["im_info"]]
            detector_results = assemble_rcnn_outputs_by_name(
                image_sizes, c2_results, force_mask_on=True)
            sem_seg_results = c2_results["sem_seg"]

            # copied from meta_arch/panoptic_fpn.py ...
            processed_results = []
            for sem_seg_result, detector_result, input_per_image, image_size in zip(
                    sem_seg_results, detector_results, batched_inputs,
                    image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size,
                                                height, width)
                detector_r = detector_postprocess(detector_result, height,
                                                  width)

                processed_results.append({
                    "sem_seg": sem_seg_r,
                    "instances": detector_r
                })

                if combine_on:
                    panoptic_r = combine_semantic_and_instance_outputs(
                        detector_r,
                        sem_seg_r.argmax(dim=0),
                        combine_overlap_threshold,
                        combine_stuff_area_limit,
                        combine_instances_confidence_threshold,
                    )
                    processed_results[-1]["panoptic_seg"] = panoptic_r
            return processed_results
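
The dicts built above follow detectron2's PanopticFPN output convention. As a hedged sketch (assuming combine_semantic_and_instance_outputs returns an (id_map, segments_info) pair, as it does in detectron2), a caller could consume the results like this:

# Hypothetical consumer of processed_results; the "panoptic_seg" unpacking
# assumes the detectron2 (id_map, segments_info) convention.
for res in processed_results:
    inst = res["instances"]                      # Instances: pred_boxes, scores, pred_classes, ...
    stuff_labels = res["sem_seg"].argmax(dim=0)  # (H, W) per-pixel semantic class ids
    if "panoptic_seg" in res:
        id_map, segments_info = res["panoptic_seg"]
        for seg in segments_info:
            print(seg["id"], seg["category_id"], seg.get("isthing", False))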
Example #2
File: fcos.py Project: zyg11/DeFCN
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta, box_center = self.head(features)
        shifts = self.shift_generator(features)

        if self.training:
            # remove gt_instances with ignore label
            gt_instances = [
                inst[inst.gt_classes >= 0] for inst in gt_instances
            ]
            gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
                shifts, gt_instances)
            return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                               box_cls, box_delta, box_center)
        else:
            results = self.inference(box_cls, box_delta, box_center, shifts,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
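
The docstring above fixes the input contract: a list of per-image dicts, each with a (C, H, W) image tensor and an optional output resolution. A minimal inference sketch, assuming `model` is an already built and weight-loaded instance of this detector (the names and sizes here are illustrative, not part of the repository):

import torch

model.eval()  # hypothetical model instance, constructed and loaded elsewhere
batched_inputs = [{
    "image": torch.rand(3, 800, 1216) * 255,  # (C, H, W), no batch dimension
    "height": 480,                            # desired output resolution
    "width": 640,
}]
with torch.no_grad():
    outputs = model(batched_inputs)           # eval mode -> list of {"instances": Instances}
inst = outputs[0]["instances"]
print(inst.pred_boxes, inst.scores, inst.pred_classes)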
Example #3
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta = self.decoder(self.encoder(features[0]))
        anchors = self.anchor_generator(features)
        # Transpose the Hi*Wi*A dimension to the middle:
        pred_logits = [permute_to_N_HWA_K(box_cls, self.num_classes)]
        pred_anchor_deltas = [permute_to_N_HWA_K(box_delta, 4)]

        if self.training:
            indices = self.get_ground_truth(anchors, pred_anchor_deltas,
                                            gt_instances)
            losses = self.losses(indices, gt_instances, anchors, pred_logits,
                                 pred_anchor_deltas)
            return losses
        else:
            results = self.inference([box_cls], [box_delta], anchors,
                                     images.image_sizes)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
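
For reference, permute_to_N_HWA_K used above is the standard detectron2/cvpods helper that moves the anchor dimension next to the spatial ones; a self-contained sketch (modeled on detectron2's RetinaNet code, so treat it as an approximation of this project's helper):

import torch

def permute_to_N_HWA_K(tensor, K):
    """Reshape a prediction map from (N, A*K, H, W) to (N, H*W*A, K)."""
    assert tensor.dim() == 4, tensor.shape
    N, _, H, W = tensor.shape
    tensor = tensor.view(N, -1, K, H, W)    # (N, A, K, H, W)
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (N, H, W, A, K)
    return tensor.reshape(N, -1, K)         # (N, H*W*A, K)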
Example #4
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
                At inference time, the predicted boxes are returned instead.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x['instances'].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                "WARNING",
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x['targets'].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        cls_outs, pts_outs_init, pts_outs_refine = self.head(features)
        center_pts = self.shift_generator(features)

        if self.training:
            return self.losses(center_pts, cls_outs, pts_outs_init,
                               pts_outs_refine, gt_instances)
        else:
            results = self.inference(center_pts, cls_outs, pts_outs_init,
                                     pts_outs_refine, images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
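
Every inference branch in these examples ends with detector_postprocess, which rescales predictions from the resized network input back to the requested output resolution. A simplified, box-only sketch of that rescaling (loosely following detectron2's postprocessing module; the real helper also handles masks and keypoints, and the import path is an assumption since the project may ship its own structures):

from detectron2.structures import Instances  # or the project's own Instances

def rescale_boxes_only(results, output_height, output_width):
    # Sketch only: rescale predicted boxes from results.image_size
    # (the resized image) to the requested (output_height, output_width).
    scale_x = output_width / results.image_size[1]
    scale_y = output_height / results.image_size[0]
    out = Instances((output_height, output_width), **results.get_fields())
    out.pred_boxes.scale(scale_x, scale_y)
    out.pred_boxes.clip(out.image_size)
    return out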
Example #5
    def _inference_for_ms_test(self, batched_inputs):
        """
        Function used for multi-scale testing; it will be refactored in the future.
        It takes the same input as the `forward` function.
        """
        assert not self.training, "inference mode with training=True"
        assert len(batched_inputs) == 1, "inference image number > 1"
        images = self.preprocess_image(batched_inputs)

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta = self.head(features)
        shifts = self.shift_generator(features)

        results = self.inference(box_cls, box_delta, shifts, images)
        for results_per_image, input_per_image, image_size in zip(
                results, batched_inputs, images.image_sizes):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            processed_results = detector_postprocess(results_per_image, height,
                                                     width)
        return processed_results
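
Note that the contract here differs from forward(): exactly one input dict is allowed, and the return value is the post-processed Instances object itself rather than a list of dicts. A hypothetical call (model and the input values are illustrative):

import torch

single_input = {"image": torch.rand(3, 800, 1216) * 255, "height": 480, "width": 640}
model.eval()  # hypothetical model instance
with torch.no_grad():
    instances = model._inference_for_ms_test([single_input])
print(instances.pred_boxes, instances.scores)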
Example #6
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            results (list[dict] or dict[str: Tensor]): the output from the model.
                During training, it returns a dict[str: Tensor] that contains the losses.
                During testing, it returns a list of dicts, one per image, each with an
                "instances" key holding an Instances object with fields such as
                `pred_boxes`, `scores` and `pred_classes`.

        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        shifts = self.shift_generator(features)
        (box_cls, box_delta, box_center, bd_box_cls, bd_box_delta,
         bd_based_box) = self.head(features, shifts)

        if self.training:
            (gt_classes, gt_shifts_reg_deltas, gt_centerness,
             gt_border_classes,
             gt_border_shifts_deltas) = self.get_ground_truth(
                 shifts, gt_instances, bd_based_box)
            return self.losses(
                gt_classes,
                gt_shifts_reg_deltas,
                gt_centerness,
                gt_border_classes,
                gt_border_shifts_deltas,
                box_cls,
                box_delta,
                box_center,
                bd_box_cls,
                bd_box_delta,
            )
        else:
            results = self.inference(box_cls, box_center, bd_box_cls,
                                     bd_box_delta, bd_based_box,
                                     images.image_sizes)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #7
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]
        box_cls, box_delta, box_center = self.head(features)
        shifts = self.shift_generator(features)

        if self.training:
            gt_classes, gt_shifts_reg_deltas, gt_centerness = self.get_ground_truth(
                shifts, gt_instances)
            return self.losses(gt_classes, gt_shifts_reg_deltas, gt_centerness,
                               box_cls, box_delta, box_center)
        else:
            if self.is_dynamic_head:
                soft_cost, used_cost, full_cost = get_module_running_cost(self)
                if self.head_complexity_buffer is None:
                    self.head_complexity_buffer = [used_cost]
                else:
                    self.head_complexity_buffer.append(used_cost)
                avg_cost = float(
                    sum(self.head_complexity_buffer) /
                    len(self.head_complexity_buffer))
                max_cost = float(max(self.head_complexity_buffer))
                min_cost = float(min(self.head_complexity_buffer))
                # costs are divided by 2**30 (~1.07e9), so the printed "GFLOPs"
                # are binary giga-FLOPs rather than exact multiples of 1e9
                print("Head Complexity, CUR: {:.3f} GFLOPs, AVG: {:.3f} GFLOPs, "
                      "MAX: {:.3f} GFLOPs, MIN: {:.3f} GFLOPs".format(
                          float(used_cost) / 2**30, avg_cost / 2**30,
                          max_cost / 2**30, min_cost / 2**30))

            results = self.inference(box_cls, box_delta, box_center, shifts,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
Example #8
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """

        images, labels = self.preprocess_image(batched_inputs, self.training)

        # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255
        # self.visualize_data(batched_inputs[0])

        x = images.tensor
        img_size = x.shape[-2:]

        def _branch(_embedding, _in):
            # run the layers of one YOLO head sequentially; the activation after
            # the 5th layer (index 4) is kept as the branch feature fed to the
            # next, higher-resolution branch
            out_branch = None
            for i, e in enumerate(_embedding):
                _in = e(_in)
                if i == 4:
                    out_branch = _in
            return _in, out_branch

        #  backbone
        # x2, x1, x0 = self.backbone(x)
        out_features = self.backbone(x)
        features = [out_features[f] for f in self.in_features]
        [x2, x1, x0] = features
        #  yolo branch 0
        out0, out0_branch = _branch(self.out0, x0)
        #  yolo branch 1
        x1_in = self.out1_cbl(out0_branch)
        x1_in = self.out1_upsample(x1_in)
        x1_in = torch.cat([x1_in, x1], 1)
        out1, out1_branch = _branch(self.out1, x1_in)
        #  yolo branch 2
        x2_in = self.out2_cbl(out1_branch)
        x2_in = self.out2_upsample(x2_in)
        x2_in = torch.cat([x2_in, x2], 1)
        out2, out2_branch = _branch(self.out2, x2_in)

        outputs = [out0, out1, out2]

        if self.training:
            losses = [
                loss_evaluator(out, labels, img_size)
                for out, loss_evaluator in zip(outputs, self.loss_evaluators)
            ]
            keys = [
                "loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls"
            ]
            losses_dict = {}
            for key in keys:
                losses_dict[key] = sum([loss[key] for loss in losses])
            return losses_dict
        else:
            predictions_list = [
                loss_evaluator(out, labels, img_size)
                for out, loss_evaluator in zip(outputs, self.loss_evaluators)
            ]

            predictions = torch.cat(predictions_list, 1)
            detections = postprocess(predictions,
                                     self.num_classes,
                                     self.conf_threshold,
                                     self.nms_threshold,
                                     nms_type=self.nms_type)

            results = []
            for idx, out in enumerate(detections):
                if out is None:
                    out = x.new_zeros((0, 7))
                # image_size = images.image_sizes[idx]
                image_size = img_size
                result = Instances(image_size)
                result.pred_boxes = Boxes(out[:, :4])
                result.scores = out[:, 5] * out[:, 4]
                result.pred_classes = out[:, -1]
                results.append(result)

            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})

            return processed_results
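
The score and class assignments in the detection loop assume a specific column layout in the rows returned by postprocess. The sketch below spells out the common YOLOv3 post-NMS convention the code appears to follow; the column meanings and the import are assumptions to verify against the project's own postprocess() and structures:

from detectron2.structures import Boxes, Instances  # or the project's equivalent

def detections_to_instances(out, image_size):
    # Assumed layout of each row in `out` (shape (num_det, 7)):
    #   out[:, 0:4] -> x1, y1, x2, y2 box corners in input-image coordinates
    #   out[:, 4]   -> objectness confidence
    #   out[:, 5]   -> class confidence
    #   out[:, 6]   -> predicted class index
    result = Instances(image_size)
    result.pred_boxes = Boxes(out[:, :4])
    result.scores = out[:, 5] * out[:, 4]   # final score = class conf * objectness
    result.pred_classes = out[:, 6].long()  # integer class ids
    return result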
Example #9
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                    See :meth:`postprocess` for details.
        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a tensor storing the loss. Used during training only.
        """
        images = self.preprocess_image(batched_inputs)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        # vgg feature maps: ['Conv4_3', 'Conv7']
        features = self.backbone(images.tensor)
        features = [features[f] for f in self.in_features]

        # feature map: Conv4_3
        # Conv4_3 has a different feature scale compared to the other layers, so we
        # use the L2 normalization technique to scale the feature norm at each
        # location in the feature map to 20 and learn the scale during
        # back-propagation.
        features[0] = self.l2norm(features[0])

        # Conv7
        x = features[-1]
        # compute feature maps: conv8_2, conv9_2, conv10_2, and conv11_2
        for idx, extra_layer in enumerate(self.extra_layers):
            x = F.relu(extra_layer(x), inplace=True)
            if idx % 2 == 1:
                features.append(x)

        conf_pred, loc_pred = self.head(features)

        if self.training:
            gt_conf, gt_default_boxes_deltas = self.get_ground_truth(
                self.default_boxes, gt_instances)
            return self.losses(gt_conf, gt_default_boxes_deltas, conf_pred,
                               loc_pred)
        else:
            results = self.inference(conf_pred, loc_pred, self.default_boxes,
                                     images)
            processed_results = []
            for results_per_image, input_per_image, image_size in zip(
                    results, batched_inputs, images.image_sizes):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = detector_postprocess(results_per_image, height, width)
                processed_results.append({"instances": r})
            return processed_results
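
The self.l2norm call above implements the classic SSD trick the comments describe: L2-normalize the Conv4_3 feature at every spatial location and rescale it with a learnable per-channel weight initialized to 20. A self-contained sketch of such a layer (the class name and details are illustrative, not necessarily the project's own module):

import torch
import torch.nn as nn

class L2Norm(nn.Module):
    """SSD-style L2 normalization with a learnable per-channel scale."""

    def __init__(self, n_channels, scale=20.0, eps=1e-10):
        super().__init__()
        self.weight = nn.Parameter(torch.full((n_channels,), float(scale)))
        self.eps = eps

    def forward(self, x):
        # normalize across channels at each spatial location, then rescale
        x = x / (x.norm(p=2, dim=1, keepdim=True) + self.eps)
        return self.weight.view(1, -1, 1, 1) * x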