def prepare_layoutlmv2_batch_inputs():
    # Here we prepare a batch of 2 sequences to test a LayoutLMv2 forward pass on:
    # fmt: off
    input_ids = torch.tensor(
        [
            [101, 1019, 1014, 1016, 1037, 12849, 4747, 1004, 14246, 2278, 5439, 4524, 5002, 2930, 2193, 2930, 4341, 3208, 1005, 1055, 2171, 2848, 11300, 3531, 102],
            [101, 4070, 4034, 7020, 1024, 3058, 1015, 1013, 2861, 1013, 6070, 19274, 2772, 6205, 27814, 16147, 16147, 4343, 2047, 10283, 10969, 14389, 1012, 2338, 102],
        ],
        device=torch_device,
    )  # noqa: E231
    bbox = torch.tensor(
        [
            [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [419, 115, 437, 129], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [360, 39, 401, 56], [360, 39, 401, 56], [411, 39, 471, 59], [479, 41, 528, 59], [533, 39, 630, 60], [67, 113, 134, 131], [141, 115, 209, 132], [68, 149, 133, 166], [141, 149, 187, 164], [195, 148, 287, 165], [195, 148, 287, 165], [195, 148, 287, 165], [295, 148, 349, 165], [441, 149, 492, 166], [497, 149, 546, 164], [64, 201, 125, 218], [1000, 1000, 1000, 1000]],
            [[0, 0, 0, 0], [662, 150, 754, 166], [665, 199, 742, 211], [519, 213, 554, 228], [519, 213, 554, 228], [134, 433, 187, 454], [130, 467, 204, 480], [130, 467, 204, 480], [130, 467, 204, 480], [130, 467, 204, 480], [130, 467, 204, 480], [314, 469, 376, 482], [504, 684, 582, 706], [941, 825, 973, 900], [941, 825, 973, 900], [941, 825, 973, 900], [941, 825, 973, 900], [610, 749, 652, 765], [130, 659, 168, 672], [176, 657, 237, 672], [238, 657, 312, 672], [443, 653, 628, 672], [443, 653, 628, 672], [716, 301, 825, 317], [1000, 1000, 1000, 1000]],
        ],
        device=torch_device,
    )  # noqa: E231
    image = ImageList(torch.randn((2, 3, 224, 224)), image_sizes=[(224, 224), (224, 224)])  # noqa: E231
    attention_mask = torch.tensor(
        [
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        ],
        device=torch_device,
    )  # noqa: E231
    token_type_ids = torch.tensor(
        [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ],
        device=torch_device,
    )  # noqa: E231
    # fmt: on

    return input_ids, bbox, image, attention_mask, token_type_ids
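
# A minimal usage sketch for the batch prepared above, assuming `transformers`
# (with detectron2 installed) and the public "microsoft/layoutlmv2-base-uncased"
# checkpoint; the function name and checkpoint choice are illustrative, not part
# of the test suite itself.
def run_layoutlmv2_forward_pass_sketch():
    from transformers import LayoutLMv2Model

    model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
    model.to(torch_device)
    model.eval()

    input_ids, bbox, image, attention_mask, token_type_ids = prepare_layoutlmv2_batch_inputs()
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            bbox=bbox,
            image=image,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    # LayoutLMv2 appends visual tokens after the text tokens, so the sequence
    # dimension of the output is seq_length + pool_height * pool_width
    # (25 + 7 * 7 = 74 for the base config's 7x7 image feature pool).
    return outputs.last_hidden_state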
def inference(
    self,
    images: torch.Tensor,
    image_shapes: torch.Tensor,
    gt_boxes: torch.Tensor = None,
    proposals: torch.Tensor = None,
    scales_yx: torch.Tensor = None,
    **kwargs,
):
    # run images through backbone to obtain the feature maps
    features = self.backbone(images)
    image_list = ImageList(images, image_shapes)

    # generate proposals if none are available
    if proposals is None:
        proposal_boxes, _ = self.proposal_generator(image_list, features, gt_boxes)
    else:
        # use the provided proposals (the original referenced the not-yet-defined
        # `proposal_boxes` here, which raised an UnboundLocalError)
        proposal_boxes = [proposals[0].get_fields()["proposal_boxes"]]

    # pool RoI features from the res4 feature map for each proposal box
    feature_pooled = self.forward_for_roi_head([features["res4"]], proposal_boxes)

    # split the pooled features back into one chunk per image
    preds_per_image = [p.tensor.size(0) for p in proposal_boxes]
    roi_features = feature_pooled.split(preds_per_image, dim=0)

    return roi_features
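
# A hedged usage sketch for `inference` above. `model` stands in for whichever
# FRCNN-style module hosts the method (it must provide `backbone`,
# `proposal_generator`, and `forward_for_roi_head`); the batch size and the
# 800x800 input resolution are illustrative assumptions, not requirements.
def extract_roi_features_sketch(model):
    batch_size = 2
    images = torch.randn(batch_size, 3, 800, 800, device=torch_device)
    # per-image (height, width) pairs, as expected by detectron2's ImageList
    image_shapes = [(800, 800)] * batch_size

    with torch.no_grad():
        # proposals=None, so the region proposal network generates the boxes
        roi_features = model.inference(images, image_shapes)

    # one feature tensor per image, shaped (num_proposals, feature_dim)
    for feats in roi_features:
        print(feats.shape)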
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    bbox = ids_tensor([self.batch_size, self.seq_length, 4], self.range_bbox)
    # Ensure that each bbox is legal, i.e. x1 <= x2 and y1 <= y2
    for i in range(bbox.shape[0]):
        for j in range(bbox.shape[1]):
            if bbox[i, j, 3] < bbox[i, j, 1]:
                t = bbox[i, j, 3]
                bbox[i, j, 3] = bbox[i, j, 1]
                bbox[i, j, 1] = t
            if bbox[i, j, 2] < bbox[i, j, 0]:
                t = bbox[i, j, 2]
                bbox[i, j, 2] = bbox[i, j, 0]
                bbox[i, j, 0] = t

    image = ImageList(
        torch.zeros(self.batch_size, self.num_channels, self.image_size, self.image_size, device=torch_device),
        self.image_size,
    )

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)

    config = LayoutLMv2Config(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
        image_feature_pool_shape=self.image_feature_pool_shape,
        coordinate_size=self.coordinate_size,
        shape_size=self.shape_size,
    )

    # use smaller resnet backbone to make tests faster
    config.detectron2_config_args["MODEL.RESNETS.DEPTH"] = 18
    config.detectron2_config_args["MODEL.RESNETS.RES2_OUT_CHANNELS"] = 64
    config.detectron2_config_args["MODEL.RESNETS.NUM_GROUPS"] = 1

    return config, input_ids, bbox, image, token_type_ids, input_mask, sequence_labels, token_labels
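
# A sketch of how the tuple returned by `prepare_config_and_inputs` is
# typically consumed in a model-tester method. The method name and the exact
# assertion are illustrative assumptions modeled on the usual transformers
# test pattern, not a verbatim copy of the test suite.
def create_and_check_model_sketch(self):
    (
        config,
        input_ids,
        bbox,
        image,
        token_type_ids,
        input_mask,
        sequence_labels,
        token_labels,
    ) = self.prepare_config_and_inputs()

    model = LayoutLMv2Model(config=config)
    model.to(torch_device)
    model.eval()

    result = model(input_ids, bbox=bbox, image=image, attention_mask=input_mask, token_type_ids=token_type_ids)

    # text tokens plus pooled visual tokens make up the final sequence length
    expected_seq_len = self.seq_length + self.image_feature_pool_shape[0] * self.image_feature_pool_shape[1]
    assert result.last_hidden_state.shape == (self.batch_size, expected_seq_len, self.hidden_size)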