예제 #1
0
    def __call__(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: PackedCseAnnotations,
        interpolator: BilinearInterpolationHelper,
        embedder: nn.Module,
    ) -> Dict[int, torch.Tensor]:
        """
        Produces losses for estimated embeddings given annotated vertices.
        Embeddings for all the vertices of a mesh are computed by the embedder.
        Embeddings for observed pixels are estimated by a predictor.
        Losses are computed as cross-entropy for squared distances between
        observed vertex embeddings and all mesh vertex embeddings given
        ground truth vertex IDs.

        Args:
            proposals_with_gt (list of Instances): detections with associated
                ground truth data; each item corresponds to instances detected
                on 1 image; the number of items corresponds to the number of
                images in a batch
            densepose_predictor_outputs: an object of a dataclass that contains predictor
                outputs with estimated values; assumed to have the following attributes:
                * embedding - embedding estimates, tensor of shape [N, D, S, S], where
                  N = number of instances (= sum N_i, where N_i is the number of
                      instances on image i)
                  D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
                  S = output size (width and height)
            packed_annotations (PackedCseAnnotations): contains various data useful
                for loss computation, each data is packed into a single tensor
            interpolator (BilinearInterpolationHelper): bilinear interpolation helper
            embedder (nn.Module): module that computes vertex embeddings for different meshes
        Return:
            dict(int -> tensor): losses for different mesh IDs
        """
        losses = {}
        for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():  # pyre-ignore[16]
            mesh_id = mesh_id_tensor.item()
            mesh_name = MeshCatalog.get_mesh_name(mesh_id)
            # valid points are those that fall into estimated bbox
            # and correspond to the current mesh
            j_valid = interpolator.j_valid * (  # pyre-ignore[16]
                packed_annotations.vertex_mesh_ids_gt == mesh_id
            )
            # extract estimated embeddings for valid points
            # -> tensor [J, D]
            vertex_embeddings_i = normalize_embeddings(
                interpolator.extract_at_points(
                    densepose_predictor_outputs.embedding,
                    slice_fine_segm=slice(None),
                    w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
                    w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
                    w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
                    w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
                )[j_valid, :]
            )
            # extract vertex ids for valid points
            # -> tensor [J]
            vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
            # embeddings for all mesh vertices
            # -> tensor [K, D]
            mesh_vertex_embeddings = embedder(mesh_name)
            # unnormalized scores for valid points
            # -> tensor [J, K]
            scores = squared_euclidean_distance_matrix(
                vertex_embeddings_i, mesh_vertex_embeddings
            ) / (-self.embdist_gauss_sigma)
            losses[mesh_name] = F.cross_entropy(scores, vertex_indices_i, ignore_index=-1)

        for mesh_name in embedder.mesh_names:  # pyre-ignore[16]
            if mesh_name not in losses:
                losses[mesh_name] = self.fake_value(
                    densepose_predictor_outputs, embedder, mesh_name
                )
        return losses
예제 #2
0
    def forward(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: PackedCseAnnotations,
        embedder: nn.Module,
    ):
        """
        Args:
            proposals_with_gt (list of Instances): detections with associated
                ground truth data; each item corresponds to instances detected
                on 1 image; the number of items corresponds to the number of
                images in a batch
            densepose_predictor_outputs: an object of a dataclass that contains predictor
                outputs with estimated values; assumed to have the following attributes:
                * embedding - embedding estimates, tensor of shape [N, D, S, S], where
                  N = number of instances (= sum N_i, where N_i is the number of
                      instances on image i)
                  D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
                  S = output size (width and height)
            packed_annotations (PackedCseAnnotations): contains various data useful
                for loss computation, each data is packed into a single tensor
            embedder (nn.Module): module that computes vertex embeddings for different meshes
        """
        pix_embeds = densepose_predictor_outputs.embedding
        if self.pixel_dists.device != pix_embeds.device:
            # should normally be done only once
            self.pixel_dists = self.pixel_dists.to(device=pix_embeds.device)
        with torch.no_grad():
            mask_loss_data = extract_data_for_mask_loss_from_matches(
                proposals_with_gt, densepose_predictor_outputs.coarse_segm)
        # GT masks - tensor of shape [N, S, S] of int64
        masks_gt = mask_loss_data.masks_gt.long()  # pyre-ignore[16]
        assert len(pix_embeds) == len(masks_gt), (
            f"Number of instances with embeddings {len(pix_embeds)} != "
            f"number of instances with GT masks {len(masks_gt)}")
        losses = []
        mesh_names = (
            self.shape_names if self.use_all_meshes_not_gt_only else [
                MeshCatalog.get_mesh_name(mesh_id.item())
                for mesh_id in packed_annotations.vertex_mesh_ids_gt.unique(
                )  # pyre-ignore[16]
            ])
        for pixel_embeddings, mask_gt in zip(pix_embeds, masks_gt):
            # pixel_embeddings [D, S, S]
            # mask_gt [S, S]
            for mesh_name in mesh_names:
                mesh_vertex_embeddings = embedder(mesh_name)
                # pixel indices [M]
                pixel_indices_flattened = _sample_fg_pixels_randperm(
                    mask_gt, self.num_pixels_to_sample)
                # pixel distances [M, M]
                pixel_dists = self.pixel_dists.to(
                    pixel_embeddings.device)[torch.meshgrid(
                        pixel_indices_flattened, pixel_indices_flattened)]
                # pixel embeddings [M, D]
                pixel_embeddings_sampled = normalize_embeddings(
                    pixel_embeddings.reshape(
                        (self.embed_size, -1))[:, pixel_indices_flattened].T)
                # pixel-vertex similarity [M, K]
                sim_matrix = pixel_embeddings_sampled.mm(  # pyre-ignore[16]
                    mesh_vertex_embeddings.T)
                c_pix_vertex = F.softmax(sim_matrix /
                                         self.temperature_pix_to_vertex,
                                         dim=1)
                c_vertex_pix = F.softmax(sim_matrix.T /
                                         self.temperature_vertex_to_pix,
                                         dim=1)
                c_cycle = c_pix_vertex.mm(c_vertex_pix)
                loss_cycle = torch.norm(pixel_dists * c_cycle, p=self.norm_p)
                losses.append(loss_cycle)

        if len(losses) == 0:
            return pix_embeds.sum() * 0
        return torch.stack(losses, dim=0).mean()