import torch
from detectron2.structures.image_list import ImageList

from transformers.testing_utils import torch_device


def prepare_layoutlmv2_batch_inputs():
    # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
    # fmt: off
    input_ids = torch.tensor(
        [[
            101, 1019, 1014, 1016, 1037, 12849, 4747, 1004, 14246, 2278, 5439,
            4524, 5002, 2930, 2193, 2930, 4341, 3208, 1005, 1055, 2171, 2848,
            11300, 3531, 102
        ],
         [
             101, 4070, 4034, 7020, 1024, 3058, 1015, 1013, 2861, 1013, 6070,
             19274, 2772, 6205, 27814, 16147, 16147, 4343, 2047, 10283, 10969,
             14389, 1012, 2338, 102
         ]],
        device=torch_device)  # noqa: E231
    bbox = torch.tensor(
        [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287],
          [419, 115, 437, 129], [961, 885, 992, 912], [256, 38, 330, 58],
          [256, 38, 330, 58], [336, 42, 353, 57], [360, 39, 401, 56],
          [360, 39, 401, 56], [411, 39, 471, 59], [479, 41, 528, 59],
          [533, 39, 630, 60], [67, 113, 134, 131], [141, 115, 209, 132],
          [68, 149, 133, 166], [141, 149, 187, 164], [195, 148, 287, 165],
          [195, 148, 287, 165], [195, 148, 287, 165], [295, 148, 349, 165],
          [441, 149, 492, 166], [497, 149, 546, 164], [64, 201, 125, 218],
          [1000, 1000, 1000, 1000]],
         [[0, 0, 0, 0], [662, 150, 754, 166], [665, 199, 742, 211],
          [519, 213, 554, 228], [519, 213, 554, 228], [134, 433, 187, 454],
          [130, 467, 204, 480], [130, 467, 204, 480], [130, 467, 204, 480],
          [130, 467, 204, 480], [130, 467, 204, 480], [314, 469, 376, 482],
          [504, 684, 582, 706], [941, 825, 973, 900], [941, 825, 973, 900],
          [941, 825, 973, 900], [941, 825, 973, 900], [610, 749, 652, 765],
          [130, 659, 168, 672], [176, 657, 237, 672], [238, 657, 312, 672],
          [443, 653, 628, 672], [443, 653, 628, 672], [716, 301, 825, 317],
          [1000, 1000, 1000, 1000]]],
        device=torch_device)  # noqa: E231
    image = ImageList(torch.randn((2, 3, 224, 224)),
                      image_sizes=[(224, 224), (224, 224)])  # noqa: E231
    attention_mask = torch.tensor(
        [
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ],
        device=torch_device)  # noqa: E231
    token_type_ids = torch.tensor(
        [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ],
        device=torch_device)  # noqa: E231
    # fmt: on

    return input_ids, bbox, image, attention_mask, token_type_ids
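

# A minimal usage sketch, not part of the original snippet: the batch prepared above
# would typically be fed to a pretrained LayoutLMv2Model. This assumes transformers
# with detectron2 support is installed and the "microsoft/layoutlmv2-base-uncased"
# checkpoint is available; the function name below is hypothetical.
def run_layoutlmv2_forward_pass_example():
    from transformers import LayoutLMv2Model

    model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased").to(torch_device)
    model.eval()

    input_ids, bbox, image, attention_mask, token_type_ids = prepare_layoutlmv2_batch_inputs()

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            bbox=bbox,
            image=image.to(torch_device),
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
    # the sequence output covers the text tokens plus the appended visual tokens
    return outputs.last_hidden_state.shape
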
Example #2
    def inference(
        self,
        images: torch.Tensor,
        image_shapes: torch.Tensor,
        gt_boxes: torch.Tensor = None,
        proposals: torch.Tensor = None,
        scales_yx: torch.Tensor = None,
        **kwargs,
    ):
        # run images through backbone
        features = self.backbone(images)

        image_list = ImageList(images, image_shapes)

        # generate proposals if none are available
        if proposals is None:
            proposal_boxes, _ = self.proposal_generator(image_list, features, gt_boxes)
        else:
            # use the externally supplied proposals (expected in the same format
            # as the proposal generator's output)
            proposal_boxes = proposals

        # keep only the Boxes field of the first image's proposals
        proposal_boxes = [proposal_boxes[0].get_fields()["proposal_boxes"]]

        # pool ROI features from the backbone's res4 feature map
        feature_pooled = self.forward_for_roi_head([features["res4"]], proposal_boxes)

        # number of proposals per image, used to split the pooled features back up
        preds_per_image = [p.size(0) for p in [proposal_boxes[0].tensor]]

        roi_features = feature_pooled.split(preds_per_image, dim=0)

        return roi_features
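
    # Hypothetical usage sketch (the method name and the square padded-batch assumption
    # are not from the original code): given a batch of images padded to a common size,
    # per-image ROI features could be pulled out of `inference` along these lines.
    def extract_roi_features_example(self, images: torch.Tensor):
        # one (height, width) entry per image; all images share the padded size here
        image_shapes = torch.tensor(
            [[images.shape[-2], images.shape[-1]]] * images.shape[0],
            device=images.device,
        )
        # proposals are generated internally because none are supplied
        return self.inference(images, image_shapes)
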
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox)
        # Ensure that each bbox is legal, i.e. x1 >= x0 and y1 >= y0
        for i in range(bbox.shape[0]):
            for j in range(bbox.shape[1]):
                if bbox[i, j, 3] < bbox[i, j, 1]:
                    t = bbox[i, j, 3]
                    bbox[i, j, 3] = bbox[i, j, 1]
                    bbox[i, j, 1] = t
                if bbox[i, j, 2] < bbox[i, j, 0]:
                    t = bbox[i, j, 2]
                    bbox[i, j, 2] = bbox[i, j, 0]
                    bbox[i, j, 0] = t

        image = ImageList(
            torch.zeros(self.batch_size, self.num_channels, self.image_size, self.image_size, device=torch_device),
            self.image_size,
        )

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask([self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)

        config = LayoutLMv2Config(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            is_decoder=False,
            initializer_range=self.initializer_range,
            image_feature_pool_shape=self.image_feature_pool_shape,
            coordinate_size=self.coordinate_size,
            shape_size=self.shape_size,
        )

        # use smaller resnet backbone to make tests faster
        config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18
        config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64
        config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1

        return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
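
    # Sketch of how the prepared config and inputs are typically consumed, not part of
    # the original snippet: build a LayoutLMv2Model from the config and check output
    # shapes. Assumes LayoutLMv2Model and torch_device are in scope and that
    # `self.parent` is the enclosing unittest.TestCase, as in the usual tester pattern.
    # LayoutLMv2 appends visual tokens, so the expected sequence length is the text
    # length plus the flattened image feature pool.
    def create_and_check_model(
        self, config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
    ):
        model = LayoutLMv2Model(config=config)
        model.to(torch_device)
        model.eval()

        result = model(input_ids, bbox=bbox, image=image, attention_mask=input_mask, token_type_ids=token_type_ids)

        expected_seq_len = self.seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1]
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size))
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))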