def get_hidden_outputs(self, batched_inputs):
    # complete image
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.inpaint_net.size_divisibility)

    # triplet input maps:
    # erased regions
    masks = [x["mask"].to(self.device) for x in batched_inputs]
    masks = ImageList.from_tensors(masks, self.inpaint_net.size_divisibility)
    # mask the input image with masks
    erased_ims = images.tensor * (1. - masks.tensor)
    # ones map
    ones_ims = [torch.ones_like(x["mask"].to(self.device)) for x in batched_inputs]
    ones_ims = ImageList.from_tensors(ones_ims, self.inpaint_net.size_divisibility)
    # the conv layers use zero padding; the ones map indicates the image boundary

    # generation process
    input_tensor = torch.cat([erased_ims, ones_ims.tensor, masks.tensor], dim=1)
    all_hidden_outputs = self.inpaint_net.get_hidden_outputs(input_tensor, masks.tensor)
    return all_hidden_outputs
def _forward(self, batched_inputs):
    image_path = [x['file_name'] for x in batched_inputs]
    if self.training:
        flips = [x['flip'] for x in batched_inputs]
    else:
        flips = None

    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    proposals = None
    if "proposals" in batched_inputs[0]:
        proposals = [x["proposals"].to(self.device) for x in batched_inputs]
    proposal_losses = {}

    if "sem_seg" in batched_inputs[0]:
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg, self.backbone.size_divisibility,
            self.sem_seg_head.ignore_value).tensor
    else:
        gt_sem_seg = None
    sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)

    if "integral_sem_seg" in batched_inputs[0] and self.training:
        gt_integral_sem_seg = [
            x["integral_sem_seg"].to(self.device) for x in batched_inputs
        ]
    else:
        gt_integral_sem_seg = None

    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    if self.proposal_generator:
        proposals, proposal_losses = self.proposal_generator(
            images, features, gt_instances, gt_integral_sem_seg)
    else:
        proposal_losses = {}

    if "instances" in batched_inputs[0]:
        if hasattr(self.roi_heads.box_predictor, 'add_pseudo_label'):
            gt_instances = self.roi_heads.box_predictor.add_pseudo_label(
                gt_instances, image_path, flips)

    losses = {}
    if self.training:
        losses.update(sem_seg_losses)
        losses.update(proposal_losses)
    return images, features, proposals, gt_instances, gt_integral_sem_seg, sem_seg_results, losses
def forward(self, batched_inputs):
    if not self.training and not self.visualize_path:
        return self.single_test(batched_inputs)

    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    features = self.backbone(images.tensor)

    if "instances" in batched_inputs[0]:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
    elif "targets" in batched_inputs[0]:
        log_first_n(
            logging.WARN,
            "'targets' in the model inputs is now renamed to 'instances'!",
            n=10)
        gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
    else:
        gt_instances = None

    if "sem_seg" in batched_inputs[0]:
        gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
        gt_sem_seg = ImageList.from_tensors(
            gt_sem_seg, self.backbone.size_divisibility,
            self.refinement_head.ignore_value).tensor
    else:
        gt_sem_seg = None

    proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
    edge_map, head_losses, proposals = self.refinement_head(
        features, proposals, (gt_sem_seg, [gt_instances, images.image_sizes]))

    # In training, the proposals are not useful at all in plain RPN models, but they are here.
    # This makes RPN-only models about 5% slower.
    if self.training:
        proposal_losses.update(head_losses)
        return proposal_losses

    processed_results = []
    for per_edge_map, results_per_image, input_per_image, image_size in zip(
            edge_map, proposals, batched_inputs, images.image_sizes):
        height = input_per_image.get("height", image_size[0])
        width = input_per_image.get("width", image_size[1])
        edge_map_r = edge_map_postprocess(per_edge_map, image_size)
        # post-process each image's own proposals (the original indexed proposals[0] for every image)
        instance_r = detector_postprocess(results_per_image, height, width)
        processed_results.append({"instances": instance_r, "edge_map": edge_map_r})
    return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "sem_seg": semantic segmentation ground truth * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model (may be different from input resolution), used in inference. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "sem_seg" whose value is a Tensor that represents the per-pixel segmentation prediced by the head. The prediction has shape KxHxW that represents the logits of each class for each pixel. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors( images, self.backbone.size_divisibility, padding_constraints=self.backbone.padding_constraints, ) features = self.backbone(images.tensor) if "sem_seg" in batched_inputs[0]: targets = [x["sem_seg"].to(self.device) for x in batched_inputs] targets = ImageList.from_tensors( targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value, self.backbone.padding_constraints, ).tensor else: targets = None results, losses = self.sem_seg_head(features, targets) if self.training: return losses processed_results = [] for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = sem_seg_postprocess(result, image_size, height, width) processed_results.append({"sem_seg": r}) return processed_results
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: image: Tensor, image in (C, H, W) format. sem_seg: semantic segmentation ground truth Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "sem_seg" whose value is a Tensor of the output resolution that represents the per-pixel segmentation prediction. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [self.normalizer(x) for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) features = self.backbone(images.tensor) if "contours" in batched_inputs[0]: contours = ImageList.from_tensors( [x["contours"].gt_contours.to(self.device).tensor for x in batched_inputs], self.backbone.size_divisibility ).tensor segmasks = ImageList.from_tensors( [x["contours"].gt_segmasks.to(self.device).tensor for x in batched_inputs], self.backbone.size_divisibility ).tensor else: contours = None segmasks = None if "instances" in batched_inputs[0]: objmask = [x["instances"].gt_masks.to(self.device).tensor for x in batched_inputs] classes = [x["instances"].gt_classes.to(self.device) for x in batched_inputs] else: objmask = None classes = None results, losses = self.sem_seg_head(features, segmasks, contours, objmask, classes) if self.training: return losses processed_results = [] for segmap, contour, emb, input_per_image, image_size in zip(results[0], results[1], results[2], batched_inputs, images.image_sizes): height = input_per_image.get("height") width = input_per_image.get("width") #TODO: translate semantic segmentations and contour maps into detection bounding boxes r = seg_det_postprocess(segmap, contour, emb, image_size, height, width) processed_results.append({"instances": r}) #, "segmap": segmap, "contour": contour, "emb": emb return processed_results
def forward(self, batched_inputs): """ Args: Same as in :class:`GeneralizedRCNN.forward` Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "proposals" whose value is a :class:`Instances` with keys "proposal_boxes" and "objectness_logits". """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) features = self.backbone(images.tensor) if "instances" in batched_inputs[0]: gt_instances = [ x["instances"].to(self.device) for x in batched_inputs ] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10) gt_instances = [ x["targets"].to(self.device) for x in batched_inputs ] else: gt_instances = None masks = { key: ImageList.from_tensors([x[key] for x in batched_inputs], self.backbone.size_divisibility) for key in self.masks } proposals, proposal_losses = self.proposal_generator( images, features, gt_instances, **masks) # In training, the proposals are not useful at all but we generate them anyway. # This makes RPN-only models about 5% slower. if self.training: return proposal_losses processed_results = [] for results_per_image, input_per_image, image_size in zip( proposals, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) processed_results.append({"proposals": r}) return processed_results
def preprocess_images(self, batched_inputs):
    """
    Normalize, pad and batch the input image pairs.
    """
    pre_images = [x["pre_image"].to(self.device) for x in batched_inputs]
    pre_images = [self.normalizer(x) for x in pre_images]
    pre_images = ImageList.from_tensors(pre_images, self.backbone.size_divisibility)

    post_images = [x["post_image"].to(self.device) for x in batched_inputs]
    post_images = [self.normalizer(x) for x in post_images]
    post_images = ImageList.from_tensors(post_images, self.backbone.size_divisibility)
    return pre_images, post_images
def setup(file):
    # get cfg
    cfg = get_cfg()
    cfg.merge_from_file(file)
    cfg.SOLVER.IMS_PER_BATCH = 2

    # get data loader iter
    data_loader = build_detection_train_loader(cfg)
    data_loader_iter = iter(data_loader)
    batched_inputs = next(data_loader_iter)

    # build anchors
    backbone = build_backbone(cfg).to(device)
    images = [x["image"].to(device) for x in batched_inputs]
    images = ImageList.from_tensors(images, backbone.size_divisibility)
    features = backbone(images.tensor.float())
    input_shape = backbone.output_shape()
    in_features = cfg.MODEL.RPN.IN_FEATURES
    anchor_generator = build_anchor_generator(
        cfg, [input_shape[f] for f in in_features])
    anchors = anchor_generator([features[f] for f in in_features])
    anchors = Boxes.cat(anchors).to(device)

    # build matcher
    raw_matcher = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS,
                          allow_low_quality_matches=True)
    matcher = TopKMatcher(cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, 9)
    return cfg, data_loader_iter, anchors, matcher, raw_matcher
def test_rpn_proposals_inf(self):
    # fill some predicted logits with inf; find_top_rpn_proposals should handle this without crashing
    N, Hi, Wi, A = 3, 3, 3, 3
    proposals = [torch.rand(N, Hi * Wi * A, 4)]
    pred_logits = [torch.rand(N, Hi * Wi * A)]
    pred_logits[0][1][3:5].fill_(float("inf"))
    images = ImageList.from_tensors([torch.rand(3, 10, 10)] * 3)
    find_top_rpn_proposals(proposals, pred_logits, images, 0.5, 1000, 1000, 0, False)
def preprocess_image(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]): """ Normalize, pad and batch the input images. """ # Print some things for testing purposes ''' test_x_b = batched_inputs[0] print('first line:') print(test_x_b["image"].to(self.device)) ''' images = [x["image"].to(self.device) for x in batched_inputs] ''' test_x = images[0] print('second line:') print(np.shape(test_x)) print('third line:') print(np.shape(self.pixel_mean)) print('fourth line:') print(np.shape(self.pixel_std)) ''' images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) return images
def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device):
    """
    See get_caffe2_inputs() below.
    """
    assert all(isinstance(x, dict) for x in batched_inputs)
    assert all(x["image"].dim() == 3 for x in batched_inputs)

    images = [x["image"] for x in batched_inputs]
    images = ImageList.from_tensors(images, size_divisibility)

    im_info = []
    for input_per_image, image_size in zip(batched_inputs, images.image_sizes):
        target_height = input_per_image.get("height", image_size[0])
        target_width = input_per_image.get("width", image_size[1])  # noqa
        # NOTE: The scale inside im_info is kept as convention and for providing
        # post-processing information if further processing is needed. For
        # current Caffe2 model definitions that don't include post-processing inside
        # the model, this number is not used.
        # NOTE: There can be a slight difference between width and height
        # scales; using a single number can result in numerical differences
        # compared with D2's post-processing.
        scale = target_height / image_size[0]
        im_info.append([image_size[0], image_size[1], scale])
    im_info = torch.Tensor(im_info)

    return images.tensor.to(device), im_info.to(device)
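# Illustrative call of convert_batched_inputs_to_c2_format (hypothetical sizes,
# not from the original source), to show what im_info carries per image.
import torch

batched_inputs = [{"image": torch.zeros(3, 480, 640), "height": 800, "width": 1067}]
tensor, im_info = convert_batched_inputs_to_c2_format(batched_inputs, 32, "cpu")
# tensor:  (1, 3, 480, 640) batched image tensor (480 and 640 are already multiples of 32)
# im_info: tensor([[480., 640., 1.6667]])  -> [input height, input width, target_height / input_height]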
def preprocess(self, images):
    processed_images = []
    for image in images:
        height, width = image.shape[:2]
        print("height=", height, " width=", width)
        image = image.to(device=self.device, non_blocking=True)
        image = image.permute(2, 0, 1).type(torch.float)

        origin_ratio = width / height
        cfg_ratio = self.cfg.INPUT.MAX_SIZE_TEST / self.cfg.INPUT.MIN_SIZE_TEST
        if cfg_ratio > origin_ratio:
            target_height = self.cfg.INPUT.MIN_SIZE_TEST
            target_width = int(round(target_height * origin_ratio))
        else:
            target_width = self.cfg.INPUT.MAX_SIZE_TEST
            target_height = int(round(target_width / origin_ratio))
        target_shape = (target_height, target_width)

        image = F.interpolate(image.unsqueeze(0), target_shape,
                              mode='bilinear', align_corners=False)
        image = (image.squeeze(0) - self.predictor.model.pixel_mean) / \
            self.predictor.model.pixel_std
        processed_images.append(image)

    images = ImageList.from_tensors(
        processed_images, self.predictor.model.backbone.size_divisibility)
    return images
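# Worked example of the resize rule above (illustrative numbers, assuming the
# common detectron2 defaults MIN_SIZE_TEST = 800 and MAX_SIZE_TEST = 1333):
#   a 480x640 image has origin_ratio = 640/480 ≈ 1.333 and cfg_ratio = 1333/800 ≈ 1.666,
#   so cfg_ratio > origin_ratio and the shorter side is fixed:
#   target_height = 800, target_width = round(800 * 1.333) ≈ 1067.
#   A very wide 400x1200 image hits the else branch instead:
#   origin_ratio = 3.0, so target_width = 1333 and target_height = round(1333 / 3.0) ≈ 444.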
def preprocess_seg(self, batched_inputs):
    images = [x["segment_annotation"].to(self.device) for x in batched_inputs]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    return images
def preprocess_semseg_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["sem_seg"] for x in batched_inputs]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    return images
def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. """ images = [self.normalizer(x.to(self.device)) for x in batched_inputs] images = ImageList.from_tensors(images, 2) return images
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper` . Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: image: Tensor, image in (C, H, W) format. sem_seg: semantic segmentation ground truth Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: Each dict is the output for one input image. The dict contains one key "sem_seg" whose value is a Tensor of the output resolution that represents the per-pixel segmentation prediction. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [self.normalizer(x) for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) features = self.backbone(images.tensor) if "sem_seg" in batched_inputs[0]: targets = [x["sem_seg"].to(self.device) for x in batched_inputs] targets = ImageList.from_tensors( targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value).tensor else: targets = None results, losses = self.sem_seg_head(features, targets) if self.training: return losses processed_results = [] for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): height = input_per_image.get("height") width = input_per_image.get("width") r = sem_seg_postprocess(result, image_size, height, width) processed_results.append({"sem_seg": r}) return processed_results
def preprocess_batchedimages(self, batched_inputs):
    """
    Preprocess a batch: normalize, resize, pad -> uniform batch
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.backbone.size_divisibility)
    return images
def preprocess_flow(self, batched_inputs):
    """
    Normalize, pad and batch the target flow.
    """
    flows = [x["flow_map"].to(self.device) for x in batched_inputs]
    flows = [x / self.flow_div for x in flows]
    flows = ImageList.from_tensors(flows).tensor
    return flows
def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [self.normalizer(x) for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) return images
def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.encoder.size_divisibility) return images
def preprocess_image(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]): """ Normalize, pad and batch the input images. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] images = ImageList.from_tensors(images, self.backbone.size_divisibility) return images
def forward(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]):
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor

    metas = []
    rescale = {"height" in x for x in batched_inputs}
    if len(rescale) != 1:
        raise ValueError("Some inputs have original height/width, but some don't!")
    rescale = list(rescale)[0]

    output_shapes = []
    for input in batched_inputs:
        meta = {}
        c, h, w = input["image"].shape
        meta["img_shape"] = meta["ori_shape"] = (h, w, c)
        if rescale:
            scale_factor = np.sqrt(h * w / (input["height"] * input["width"]))
            ori_shape = (input["height"], input["width"])
            output_shapes.append(ori_shape)
            meta["ori_shape"] = ori_shape + (c,)
        else:
            scale_factor = 1.0
            output_shapes.append((h, w))
        meta["scale_factor"] = scale_factor
        meta["flip"] = False
        padh, padw = images.shape[-2:]
        meta["pad_shape"] = (padh, padw, c)
        metas.append(meta)

    if self.training:
        gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        if gt_instances[0].has("gt_masks"):
            from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks

            def convert_mask(m, shape):
                # mmdet mask format
                if isinstance(m, BitMasks):
                    return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
                else:
                    return mm_PolygonMasks(m.polygons, shape[0], shape[1])

            gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
        else:
            gt_masks = None
        losses_and_metrics = self.detector.forward_train(
            images,
            metas,
            [x.gt_boxes.tensor for x in gt_instances],
            [x.gt_classes for x in gt_instances],
            gt_masks=gt_masks,
        )
        return _parse_losses(losses_and_metrics)
    else:
        results = self.detector.simple_test(images, metas, rescale=rescale)
        results = [
            {"instances": _convert_mmdet_result(r, shape)}
            for r, shape in zip(results, output_shapes)
        ]
        return results
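# Example of one mmdetection-style `metas` entry built above (hypothetical sizes,
# assuming size_divisibility = 32): a network input of shape (3, 800, 1067) whose
# original image was 480x640 gives
#   scale_factor = sqrt(800 * 1067 / (480 * 640)) ≈ 1.667
#   meta = {
#       "img_shape":    (800, 1067, 3),
#       "ori_shape":    (480, 640, 3),
#       "scale_factor": 1.667,
#       "flip":         False,
#       "pad_shape":    (800, 1088, 3),   # padded batch size: 1067 rounded up to 1088
#   }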
def _preprocess_image(self, batched_inputs):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    labels = torch.LongTensor([x["label"] for x in batched_inputs]).to(self.device)
    images = ImageList.from_tensors(images)
    return images, labels
def preprocess_image(self, batched_inputs):
    # all models from detectron2 preprocess the images the same way
    # this could change in the future; fingers crossed
    # reference: https://github.com/facebookresearch/detectron2/tree/master/detectron2/modeling/meta_arch
    # last checked: 23.11.20
    images = [x["image"].to(self.model.device) for x in batched_inputs]
    images = [(x - self.model.pixel_mean) / self.model.pixel_std for x in images]
    images = ImageList.from_tensors(images, self.model.backbone.size_divisibility)
    return images
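# Minimal numeric sketch of what the wrapped preprocessing does (assuming the stock
# detectron2 COCO configuration; these are the library defaults, not values taken
# from the original snippet):
#   pixel_mean = [103.530, 116.280, 123.675]   # per-channel means for BGR input
#   pixel_std  = [1.0, 1.0, 1.0]               # the default ResNet configs divide by 1
# so preprocessing is effectively mean subtraction on BGR images, followed by padding
# every image in the batch to a common size divisible by backbone.size_divisibility
# (typically 32 for FPN backbones).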
def forward(self, batched_inputs): """ Args: batched_inputs: a list, batched outputs of :class:`DatasetMapper`. Each item in the list contains the inputs for one image. For now, each item in the list is a dict that contains: * "image": Tensor, image in (C, H, W) format. * "instances": Instances * "sem_seg": semantic segmentation ground truth. * Other information that's included in the original dicts, such as: "height", "width" (int): the output resolution of the model, used in inference. See :meth:`postprocess` for details. Returns: list[dict]: each dict has the results for one image. The dict contains the following keys: * "instances": see :meth:`GeneralizedRCNN.forward` for its format. * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format. * "panoptic_seg": See the return value of :func:`combine_semantic_and_instance_outputs` for its format. """ if not self.training: return self.inference(batched_inputs) images = self.preprocess_image(batched_inputs) features = self.backbone(images.tensor) assert "sem_seg" in batched_inputs[0] gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs] gt_sem_seg = ImageList.from_tensors( gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value ).tensor sem_seg_results, sem_seg_losses, feat, seg_score = self.sem_seg_head(features, gt_sem_seg) # del sem_seg_results # gt_instances = [x["instances"].to(self.device) for x in batched_inputs] proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) detector_results, detector_losses, box_features = self.roi_heads( images, features, proposals, gt_instances ) ############################# # Graph op ############################# instance, sem_seg_results, losses_graph = self.graph_connection( box_features, detector_results, features, feat, seg_score, gt_sem_seg, ) losses = sem_seg_losses losses.update(proposal_losses) losses.update(detector_losses) losses.update(losses_graph) return losses
def preprocess_image(self, batched_inputs: Tuple[Dict[str, Tensor]]):
    '''
    Normalize and batch the input images.
    '''
    images = [x["image"].to(self.device) for x in batched_inputs]
    # convert uint8 images to float in [0, 1]; keep tensors that are already float
    # (the original filtered out already-float images, which would drop them from the batch)
    images = [x.float().div(255) if x.dtype != torch.float else x for x in images]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    images = ImageList.from_tensors(images)
    return images
def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. """ images = [x.to(self.device) for x in batched_inputs] norms = [self.normalizer(x) for x in images] size = (norms[0].shape[1], norms[0].shape[2]) images = ImageList.from_tensors(norms, self.backbone.size_divisibility) return images, size
def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. """ images = [x["image"].to(self.device) for x in batched_inputs] images = [(x - self.pixel_mean) / self.pixel_std for x in images] if self.dynamic: images = ImageList.from_tensors(images, self.backbone.size_divisibility) else: if self.training: min_size = self.input.MIN_SIZE_TRAIN max_size = self.input.MAX_SIZE_TRAIN else: min_size = self.input.MIN_SIZE_TEST max_size = self.input.MAX_SIZE_TEST min_size = min_size[0] if isinstance(min_size, tuple) else min_size images = ImageList.from_tensors(images, self.backbone.size_divisibility, max_height=min_size, max_width=max_size) return images
def preprocess_image(self, batched_inputs, norm=True):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    if norm:
        images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, 512)
    images = images.to(self.device)
    return images
def forward(self, batched_inputs):
    # complete image
    images = [x["image"].to(self.device) for x in batched_inputs]
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, self.inpaint_net.size_divisibility)

    # triplet input maps:
    # erased regions
    masks = [x["mask"].to(self.device) for x in batched_inputs]
    masks = ImageList.from_tensors(masks, self.inpaint_net.size_divisibility)
    # mask the input image with masks
    erased_ims = images.tensor * (1. - masks.tensor)
    # ones map
    ones_ims = [torch.ones_like(x["mask"].to(self.device)) for x in batched_inputs]
    ones_ims = ImageList.from_tensors(ones_ims, self.inpaint_net.size_divisibility)
    # the conv layers use zero padding; the ones map indicates the image boundary

    # generation process
    input_tensor = torch.cat([erased_ims, ones_ims.tensor, masks.tensor], dim=1)
    coarse_inp, fine_inp, offset_flow = self.inpaint_net(input_tensor, masks.tensor)
    # offset_flow is used for visualization

    if self.training:
        raise NotImplementedError
    else:
        processed_results = []
        inpainted_im = erased_ims * (1. - masks.tensor) + fine_inp * masks.tensor
        for result, input_per_image, image_size in zip(
                inpainted_im, batched_inputs, images.image_sizes):
            height = input_per_image.get("height")
            width = input_per_image.get("width")
            # abuse the semantic segmentation postprocess; it basically just resizes
            r = sem_seg_postprocess(result, image_size, height, width)
            processed_results.append({"inpainted": r})
        return processed_results
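# Shape sketch for the triplet input built above (illustrative sizes): with a batch
# of B three-channel images and single-channel masks padded to (H, W),
#   erased_ims:       (B, 3, H, W)  image with the masked region zeroed out
#   ones_ims.tensor:  (B, 1, H, W)  all-ones map marking valid image area
#   masks.tensor:     (B, 1, H, W)  1 inside the region to inpaint
# so input_tensor = torch.cat([...], dim=1) has 3 + 1 + 1 = 5 channels, and the final
# composite keeps the original pixels outside the mask and the fine network's
# prediction inside it.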