def test_call_pytorch_with_coco_detection_annotations(self):
    # prepare image and target
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
        target = json.loads(f.read())

    target = {"image_id": 39769, "annotations": target}

    # encode them
    feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
    encoding = feature_extractor(images=image, annotations=target, return_tensors="pt")

    # verify pixel values
    expected_shape = torch.Size([1, 3, 800, 1066])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
    assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)

    # verify area
    expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438])
    assert torch.allclose(encoding["labels"][0]["area"], expected_area)

    # verify boxes
    expected_boxes_shape = torch.Size([6, 4])
    self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
    expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215])
    assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)

    # verify image_id
    expected_image_id = torch.tensor([39769])
    assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)

    # verify is_crowd
    expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
    assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)

    # verify class_labels
    expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17])
    assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)

    # verify orig_size
    expected_orig_size = torch.tensor([480, 640])
    assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)

    # verify size
    expected_size = torch.tensor([800, 1066])
    assert torch.allclose(encoding["labels"][0]["size"], expected_size)
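# For reference, each entry in the "annotations" list above is expected to follow the
# standard COCO object-detection format; the values in this example are invented, not
# taken from the fixture file.
example_target = {
    "image_id": 39769,
    "annotations": [
        {
            "id": 1,                             # annotation id
            "image_id": 39769,                   # image this annotation belongs to
            "category_id": 17,                   # COCO category id (17 = cat)
            "bbox": [10.0, 20.0, 200.0, 150.0],  # [top-left x, top-left y, width, height]
            "area": 30000.0,                     # object area in pixels
            "iscrowd": 0,                        # 1 marks RLE-encoded crowd regions
        }
    ],
}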
def test_call_pytorch_with_coco_panoptic_annotations(self):
    # prepare image, target and masks_path
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f:
        target = json.loads(f.read())

    target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target}

    masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic")

    # encode them
    # TODO replace by .from_pretrained facebook/detr-resnet-50-panoptic
    feature_extractor = DetrFeatureExtractor(format="coco_panoptic")
    encoding = feature_extractor(images=image, annotations=target, masks_path=masks_path, return_tensors="pt")

    # verify pixel values
    expected_shape = torch.Size([1, 3, 800, 1066])
    self.assertEqual(encoding["pixel_values"].shape, expected_shape)

    expected_slice = torch.tensor([0.2796, 0.3138, 0.3481])
    assert torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)

    # verify area
    expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147])
    assert torch.allclose(encoding["labels"][0]["area"], expected_area)

    # verify boxes
    expected_boxes_shape = torch.Size([6, 4])
    self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape)
    expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625])
    assert torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)

    # verify image_id
    expected_image_id = torch.tensor([39769])
    assert torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)

    # verify is_crowd
    expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0])
    assert torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)

    # verify class_labels
    expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93])
    assert torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)

    # verify masks
    expected_masks_sum = 822338
    self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum)

    # verify orig_size
    expected_orig_size = torch.tensor([480, 640])
    assert torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)

    # verify size
    expected_size = torch.tensor([800, 1066])
    assert torch.allclose(encoding["labels"][0]["size"], expected_size)
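# Analogously, each "segments_info" entry follows the COCO panoptic format, and
# "file_name" names a PNG under masks_path whose pixel colors encode segment ids;
# the values below are invented, not taken from the fixture file.
example_target = {
    "file_name": "000000039769.png",
    "image_id": 39769,
    "segments_info": [
        {
            "id": 3226956,               # segment id, decodable to an RGB color in the mask
            "category_id": 17,           # COCO panoptic category id (17 = cat)
            "bbox": [10, 20, 200, 150],  # [top-left x, top-left y, width, height]
            "area": 30000,
            "iscrowd": 0,
        }
    ],
}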
def convert_detr_checkpoint(model_name, pytorch_dump_folder_path):
    """
    Copy/paste/tweak model's weights to our DETR structure.
    """

    # load default config
    config = DetrConfig()
    # set backbone and dilation attributes
    if "resnet101" in model_name:
        config.backbone = "resnet101"
    if "dc5" in model_name:
        config.dilation = True
    is_panoptic = "panoptic" in model_name
    if is_panoptic:
        config.num_labels = 250
    else:
        config.num_labels = 91
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # load feature extractor
    format = "coco_panoptic" if is_panoptic else "coco_detection"
    feature_extractor = DetrFeatureExtractor(format=format)

    # prepare image
    img = prepare_img()
    encoding = feature_extractor(images=img, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    logger.info(f"Converting model {model_name}...")

    # load original model from torch hub
    detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval()
    state_dict = detr.state_dict()
    # rename keys
    for src, dest in rename_keys:
        if is_panoptic:
            src = "detr." + src
        rename_key(state_dict, src, dest)
    state_dict = rename_backbone_keys(state_dict)
    # query, key and value matrices need special treatment
    read_in_q_k_v(state_dict, is_panoptic=is_panoptic)
    # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them
    prefix = "detr.model." if is_panoptic else "model."
    for key in state_dict.copy().keys():
        if is_panoptic:
            if (
                key.startswith("detr")
                and not key.startswith("class_labels_classifier")
                and not key.startswith("bbox_predictor")
            ):
                val = state_dict.pop(key)
                state_dict["detr.model" + key[4:]] = val
            elif "class_labels_classifier" in key or "bbox_predictor" in key:
                val = state_dict.pop(key)
                state_dict["detr." + key] = val
            elif key.startswith("bbox_attention") or key.startswith("mask_head"):
                continue
            else:
                val = state_dict.pop(key)
                state_dict[prefix + key] = val
        else:
            if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"):
                val = state_dict.pop(key)
                state_dict[prefix + key] = val
    # finally, create HuggingFace model and load state dict
    model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config)
    model.load_state_dict(state_dict)
    model.eval()
    # verify our conversion
    original_outputs = detr(pixel_values)
    outputs = model(pixel_values)
    assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4)
    assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4)
    if is_panoptic:
        assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4)

    # Save model and feature extractor
    logger.info(f"Saving PyTorch model and feature extractor to {pytorch_dump_folder_path}...")
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
    feature_extractor.save_pretrained(pytorch_dump_folder_path)
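# A minimal sketch of how convert_detr_checkpoint might be wired up as a CLI entry
# point; the flag names and defaults here are assumptions, not necessarily the
# script's actual interface.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="detr_resnet50",
        type=str,
        help="Name of the DETR model on torch hub, e.g. detr_resnet50 or detr_resnet50_panoptic.",
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the folder where the converted PyTorch model will be saved.",
    )
    args = parser.parse_args()
    convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path)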
def default_feature_extractor(self):
    return DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50") if is_vision_available() else None
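# For context, a hypothetical test consuming the fixture above; the method name is
# invented, but the expected shape matches the detection test earlier in this file.
def test_feature_extractor_fixture(self):
    feature_extractor = self.default_feature_extractor
    self.assertIsNotNone(feature_extractor)
    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
    encoding = feature_extractor(images=image, return_tensors="pt")
    self.assertEqual(encoding["pixel_values"].shape, torch.Size([1, 3, 800, 1066]))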
def plot_save(pil_img, prob, boxes, image_name):
    # crop and save every 'person' detection with confidence above 0.95
    for i, (p, (xmin, ymin, xmax, ymax)) in enumerate(zip(prob, boxes.tolist())):
        cl = p.argmax()
        if model.config.id2label[cl.item()] == 'person':
            if p[cl] > 0.95:
                im = pil_img.crop((xmin, ymin, xmax, ymax))
                im.save(save_path + str(i) + '_' + image_name + '.jpg')


# `image_list` and `save_path` are assumed to be defined earlier in the script;
# load the feature extractor and model once, outside the loop
feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')

for image_path in image_list:
    image_name = image_path.split('/')[-1]
    image = Image.open(image_path)
    # url = "http://images.cocodataset.org/train2014/COCO_train2014_000000384029.jpg"
    # image = Image.open(requests.get(url, stream=True).raw)

    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = model(**inputs)

    # draw class names and bounding boxes
    # colors for visualization
    COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
              [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

    # keep only predictions of queries with 0.9+ confidence (excluding no-object class)
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > 0.9
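    # --- a sketch of the missing final step, continuing the loop body above ---
    # DETR's pred_boxes are normalized (cx, cy, w, h); the kept queries must be
    # converted to absolute (xmin, ymin, xmax, ymax) pixel coordinates before
    # plot_save can crop with them (all names below come from this snippet)
    cx, cy, w, h = outputs.pred_boxes[0, keep].detach().unbind(-1)
    xyxy = torch.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], dim=-1)
    img_w, img_h = image.size  # PIL's Image.size is (width, height)
    boxes_scaled = xyxy * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    plot_save(image, probas[keep], boxes_scaled, image_name)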