def _prepare_test(self, index, img, label):
  # This returns cpu tensors.
  #   Image: 3D with channels last, float32, in range [0, 1] (normally done
  #     by ToTensor).
  #   Label map: 2D, flat int64, [0 ... self.gt_k - 1]
  # label is passed in canonical [0 ... 181] indexing

  assert (img.shape[:2] == label.shape)
  img = img.astype(np.float32)
  label = label.astype(np.int32)

  # shrink original images, for memory purposes, otherwise no point
  if self.pre_scale_all:
    assert (self.pre_scale_factor < 1.)
    img = cv2.resize(img, dsize=None, fx=self.pre_scale_factor,
                     fy=self.pre_scale_factor,
                     interpolation=cv2.INTER_LINEAR)
    label = cv2.resize(label, dsize=None, fx=self.pre_scale_factor,
                       fy=self.pre_scale_factor,
                       interpolation=cv2.INTER_NEAREST)

  # centre crop to input sz
  img, _ = pad_and_or_crop(img, self.input_sz, mode="centre")
  label, _ = pad_and_or_crop(label, self.input_sz, mode="centre")

  # finish
  if not self.no_sobel:
    img = custom_greyscale_numpy(img, include_rgb=self.include_rgb)

  img = img.astype(np.float32) / 255.
  img = torch.from_numpy(img).permute(2, 0, 1)

  if RENDER_DATA:
    render(label, mode="label", name=("test_data_label_pre_%d" % index))

  # convert to coarse if required, reindex to [0, gt_k - 1], and get mask
  label, mask = self._filter_label(label)

  # mask if required
  if self.mask_input:
    masked = 1 - mask
    img[:, masked] = 0

  if RENDER_DATA:
    render(img, mode="image", name=("test_data_img_%d" % index))
    render(label, mode="label", name=("test_data_label_post_%d" % index))
    render(mask, mode="mask", name=("test_data_mask_%d" % index))

  # dataloader must return tensors (conversion forced in their code anyway)
  return img, torch.from_numpy(label), torch.from_numpy(
    mask.astype(np.uint8))
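# Illustrative sketch (not part of the original file): one plausible way a
# dataset __getitem__ could dispatch to the prepare methods above and below.
# The attribute names (self.purpose, self.images, self.labels) and the idea
# that images are pre-loaded numpy arrays are assumptions for illustration.
def __getitem__(self, index):
  image = self.images[index]  # assumed: H x W x C uint8 numpy array
  if self.purpose == "train":
    return self._prepare_train(index, image)
  else:
    assert (self.purpose == "test")
    return self._prepare_test(index, image, self.labels[index])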
def _prepare_train(self, index, img):
  # This returns gpu tensors.
  # label is passed in canonical [0 ... 181] indexing

  img = img.astype(np.float32)

  # shrink original images, for memory purposes
  # or enlarge
  if self.pre_scale_all:
    img = cv2.resize(img, dsize=None, fx=self.pre_scale_factor,
                     fy=self.pre_scale_factor,
                     interpolation=cv2.INTER_LINEAR)

  # basic augmentation transforms for both img1 and img2
  if self.use_random_scale:
    # bilinear interp requires float img
    scale_factor = (np.random.rand() * (self.scale_max - self.scale_min)) + \
                   self.scale_min
    img = cv2.resize(img, dsize=None, fx=scale_factor, fy=scale_factor,
                     interpolation=cv2.INTER_LINEAR)

  # random crop to input sz
  img, coords = pad_and_or_crop(img, self.input_sz, mode="random")

  # make img2 different from img1 (img)

  # tf_mat can be:
  # *A, from img2 to img1 (will be applied to img2's heatmap) -> img1 space
  #   input img1 tf: *tf.functional or pil.image
  #   input mask tf: *none
  #   output heatmap: *tf.functional (parallel), inverse of what is used
  #     for inputs, create inverse of this tf in [-1, 1] format
  # B, from img1 to img2 (will be applied to img1's heatmap) -> img2 space
  #   input img1 tf: pil.image
  #   input mask tf: pil.image (discrete)
  #   output heatmap: tf.functional, create copy of this tf in [-1, 1] format

  # tf.functional tf_mat: translation is opposite to what we'd expect (+ve 1
  #   is shift half towards left)
  # but rotation is correct (-sin in top right = counter clockwise)
  # flip is [[-1, 0, 0], [0, 1, 0], [0, 0, 1]]

  # img2 = flip(affine1_to_2(img1))
  # => img1_space = affine1_to_2^-1(flip^-1(img2_space))
  #               = affine2_to_1(flip^-1(img2_space))
  # so tf_mat_img2_to_1 = affine2_to_1 * flip^-1 (order matters as not diag)
  # flip^-1 = flip

  # no need to tf label, as we're doing option A, mask needed in img1 space

  # converting to PIL does not change underlying np datatype it seems

  # images are RGBIR. We don't want to jitter or greyscale the IR part
  img_ir = img[:, :, 3]
  img = img[:, :, :3]

  img1 = Image.fromarray(img.astype(np.uint8))

  # (img2) do jitter, no tf_mat change
  img2 = self.jitter_tf(img1)  # not in place, new memory
  img1 = np.array(img1)
  img2 = np.array(img2)

  # channels still last
  if not self.no_sobel:
    img1 = custom_greyscale_numpy(img1, include_rgb=self.include_rgb)
    img2 = custom_greyscale_numpy(img2, include_rgb=self.include_rgb)

  img1 = img1.astype(np.float32) / 255.
  img2 = img2.astype(np.float32) / 255.

  # concatenate IR back on before spatial warps
  # may be concatenating onto just greyscale image
  # grey/RGB underneath IR
  img_ir = img_ir.astype(np.float32) / 255.
  img1 = np.concatenate([img1, np.expand_dims(img_ir, axis=2)], axis=2)
  img2 = np.concatenate([img2, np.expand_dims(img_ir, axis=2)], axis=2)

  # convert both to channel-first tensor format
  # make them all cuda tensors now, except label, for optimality
  img1 = torch.from_numpy(img1).permute(2, 0, 1).cuda()
  img2 = torch.from_numpy(img2).permute(2, 0, 1).cuda()

  # (img2) do affine if nec, tf_mat changes
  if self.use_random_affine:
    affine_kwargs = {"min_rot": self.aff_min_rot, "max_rot": self.aff_max_rot,
                     "min_shear": self.aff_min_shear,
                     "max_shear": self.aff_max_shear,
                     "min_scale": self.aff_min_scale,
                     "max_scale": self.aff_max_scale}
    img2, affine1_to_2, affine2_to_1 = random_affine(img2,
                                                     **affine_kwargs)  # tensors
  else:
    affine2_to_1 = torch.zeros([2, 3]).to(torch.float32).cuda()  # identity
    affine2_to_1[0, 0] = 1
    affine2_to_1[1, 1] = 1

  # (img2) do random flip, tf_mat changes
  if np.random.rand() > self.flip_p:
    img2 = torch.flip(img2, dims=[2])  # horizontal, along width

    # applied affine, then flip, new = flip * affine * coord
    # (flip * affine)^-1 is just flip^-1 * affine^-1.
    #   No order swap, unlike functions...
    # hence top row is negated
    affine2_to_1[0, :] *= -1.

  # uint8 tensor as masks should be binary, also for consistency,
  # but converted to float32 in main loop because is used
  # multiplicatively in loss
  mask_img1 = torch.ones(self.input_sz, self.input_sz).to(torch.uint8).cuda()

  if RENDER_DATA:
    render(img1, mode="image", name=("train_data_img1_%d" % index))
    render(img2, mode="image", name=("train_data_img2_%d" % index))
    render(affine2_to_1, mode="matrix",
           name=("train_data_affine2to1_%d" % index))
    render(mask_img1, mode="mask", name=("train_data_mask_%d" % index))

  return img1, img2, affine2_to_1, mask_img1
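# The 2x3 affine2_to_1 matrices above are expressed in torch's normalised
# [-1, 1] grid convention. A minimal sketch of how such a matrix can be
# applied to a batch of heatmaps to bring img2's predictions back into img1's
# frame, which is conceptually what perform_affine_tf does; the real helper
# may differ in padding, dtype handling and align_corners behaviour
# (assumptions here).
import torch
import torch.nn.functional as F

def apply_affine_to_heatmaps(heatmaps, tf_mats):
  # heatmaps: (n, k, h, w) float tensor; tf_mats: (n, 2, 3) float tensor
  grid = F.affine_grid(tf_mats, heatmaps.shape, align_corners=False)
  return F.grid_sample(heatmaps, grid, align_corners=False)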
def _prepare_test(self, index, img, label):
  # This returns cpu tensors.
  #   Image: 3D with channels last, float32, in range [0, 1] (normally done
  #     by ToTensor).
  #   Label map: 2D, flat int64, [0 ... self.gt_k - 1]
  # label is passed in canonical [0 ... 181] indexing

  assert (label is not None)
  assert (img.shape[:2] == label.shape)
  img = img.astype(np.float32)
  label = label.astype(np.int32)

  # shrink original images, for memory purposes, or magnify
  if self.pre_scale_all:
    img = cv2.resize(img, dsize=None, fx=self.pre_scale_factor,
                     fy=self.pre_scale_factor,
                     interpolation=cv2.INTER_LINEAR)
    label = cv2.resize(label, dsize=None, fx=self.pre_scale_factor,
                       fy=self.pre_scale_factor,
                       interpolation=cv2.INTER_NEAREST)

  # centre crop to input sz
  img, _ = pad_and_or_crop(img, self.input_sz, mode="centre")
  label, _ = pad_and_or_crop(label, self.input_sz, mode="centre")

  img_ir = img[:, :, 3]
  img = img[:, :, :3]

  # finish
  # may be concatenating onto just greyscale image
  if not self.no_sobel:
    img = custom_greyscale_numpy(img, include_rgb=self.include_rgb)

  img = img.astype(np.float32) / 255.
  img_ir = img_ir.astype(np.float32) / 255.
  img = np.concatenate([img, np.expand_dims(img_ir, axis=2)],
                       axis=2)  # grey/RGB under IR
  img = torch.from_numpy(img).permute(2, 0, 1)

  if RENDER_DATA:
    render(label, mode="label", name=("test_data_label_pre_%d" % index))

  # convert to coarse if required, reindex to [0, gt_k - 1], and get mask
  label = self._filter_label(label)
  mask = torch.ones(self.input_sz, self.input_sz).to(torch.uint8)

  if RENDER_DATA:
    render(img, mode="image", name=("test_data_img_%d" % index))
    render(label, mode="label", name=("test_data_label_post_%d" % index))
    render(mask, mode="mask", name=("test_data_mask_%d" % index))

  # dataloader must return tensors (conversion forced in their code anyway)
  return img, torch.from_numpy(label), mask
def _prepare_train_single(self, index, img):
  # Returns one image only, i.e. without transformed second image.
  # Used for standard CNN training (baselines).
  # This returns gpu tensors.
  # label is passed in canonical [0 ... 181] indexing

  img = img.astype(np.float32)

  # shrink original images, for memory purposes
  # or enlarge
  if self.pre_scale_all:
    img = cv2.resize(img, dsize=None, fx=self.pre_scale_factor,
                     fy=self.pre_scale_factor,
                     interpolation=cv2.INTER_LINEAR)

  # basic augmentation transforms
  if self.use_random_scale:
    # bilinear interp requires float img
    scale_factor = (np.random.rand() * (self.scale_max - self.scale_min)) + \
                   self.scale_min
    img = cv2.resize(img, dsize=None, fx=scale_factor, fy=scale_factor,
                     interpolation=cv2.INTER_LINEAR)

  # random crop to input sz
  img, coords = pad_and_or_crop(img, self.input_sz, mode="random")

  # converting to PIL does not change underlying np datatype it seems

  # images are RGBIR. We don't want to jitter or greyscale the IR part
  img_ir = img[:, :, 3]
  img = img[:, :, :3]

  img1 = Image.fromarray(img.astype(np.uint8))
  img1 = self.jitter_tf(img1)  # not in place, new memory
  img1 = np.array(img1)

  # channels still last
  if not self.no_sobel:
    img1 = custom_greyscale_numpy(img1, include_rgb=self.include_rgb)

  img1 = img1.astype(np.float32) / 255.

  # concatenate IR back on before spatial warps
  # may be concatenating onto just greyscale image
  # grey/RGB underneath IR
  img_ir = img_ir.astype(np.float32) / 255.
  img1 = np.concatenate([img1, np.expand_dims(img_ir, axis=2)], axis=2)

  # convert to channel-first tensor format
  # make it a cuda tensor now, except label, for optimality
  img1 = torch.from_numpy(img1).permute(2, 0, 1).cuda()

  if self.use_random_affine:
    affine_kwargs = {"min_rot": self.aff_min_rot, "max_rot": self.aff_max_rot,
                     "min_shear": self.aff_min_shear,
                     "max_shear": self.aff_max_shear,
                     "min_scale": self.aff_min_scale,
                     "max_scale": self.aff_max_scale}
    img1, _, _ = random_affine(img1, **affine_kwargs)  # tensors

  # do random flip
  if np.random.rand() > self.flip_p:
    img1 = torch.flip(img1, dims=[2])  # horizontal, along width

  # uint8 tensor as masks should be binary, also for consistency,
  # but converted to float32 in main loop because is used
  # multiplicatively in loss
  mask_img1 = torch.ones(self.input_sz, self.input_sz).to(torch.uint8).cuda()

  if RENDER_DATA:
    render(img1, mode="image", name=("train_data_img1_%d" % index))
    render(mask_img1, mode="mask", name=("train_data_mask_%d" % index))

  return img1, mask_img1
def _prepare_train_single(self, index, img, label):
  # Returns one image only, i.e. without transformed second image.
  # Used for standard CNN training (baselines).
  # This returns gpu tensors.
  # label is passed in canonical [0 ... 181] indexing

  assert (img.shape[:2] == label.shape)
  img = img.astype(np.float32)
  label = label.astype(np.int32)

  # shrink original images, for memory purposes, otherwise no point
  if self.pre_scale_all:
    assert (self.pre_scale_factor < 1.)
    img = cv2.resize(img, dsize=None, fx=self.pre_scale_factor,
                     fy=self.pre_scale_factor,
                     interpolation=cv2.INTER_LINEAR)
    label = cv2.resize(label, dsize=None, fx=self.pre_scale_factor,
                       fy=self.pre_scale_factor,
                       interpolation=cv2.INTER_NEAREST)

  if self.use_random_scale:
    # bilinear interp requires float img
    scale_factor = (np.random.rand() * (self.scale_max - self.scale_min)) + \
                   self.scale_min
    img = cv2.resize(img, dsize=None, fx=scale_factor, fy=scale_factor,
                     interpolation=cv2.INTER_LINEAR)
    label = cv2.resize(label, dsize=None, fx=scale_factor, fy=scale_factor,
                       interpolation=cv2.INTER_NEAREST)

  # random crop to input sz
  img, coords = pad_and_or_crop(img, self.input_sz, mode="random")
  label, _ = pad_and_or_crop(label, self.input_sz, mode="fixed",
                             coords=coords)

  _, mask_img1 = self._filter_label(label)

  # uint8 tensor as masks should be binary, also for consistency with
  # prepare_train, but converted to float32 in main loop because is used
  # multiplicatively in loss
  mask_img1 = torch.from_numpy(mask_img1.astype(np.uint8)).cuda()

  # converting to PIL does not change underlying np datatype it seems
  img1 = Image.fromarray(img.astype(np.uint8))
  img1 = self.jitter_tf(img1)  # not in place, new memory
  img1 = np.array(img1)

  # channels still last
  if not self.no_sobel:
    img1 = custom_greyscale_numpy(img1, include_rgb=self.include_rgb)

  img1 = img1.astype(np.float32) / 255.

  # convert to channel-first tensor format
  # make it a cuda tensor now, except label, for optimality
  img1 = torch.from_numpy(img1).permute(2, 0, 1).cuda()

  # mask if required
  if self.mask_input:
    masked = 1 - mask_img1
    img1[:, masked] = 0

  if self.use_random_affine:
    affine_kwargs = {"min_rot": self.aff_min_rot, "max_rot": self.aff_max_rot,
                     "min_shear": self.aff_min_shear,
                     "max_shear": self.aff_max_shear,
                     "min_scale": self.aff_min_scale,
                     "max_scale": self.aff_max_scale}
    img1, _, _ = random_affine(img1, **affine_kwargs)  # tensors

  if np.random.rand() > self.flip_p:
    img1 = torch.flip(img1, dims=[2])  # horizontal, along width

  if RENDER_DATA:
    render(img1, mode="image", name=("train_data_img1_%d" % index))
    render(mask_img1, mode="mask", name=("train_data_mask_%d" % index))

  return img1, mask_img1
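# The comments above note that mask_img1 is "used multiplicatively in loss".
# A hedged sketch of what that could look like for the baseline path: a
# per-pixel loss weighted by the mask so that ignored pixels contribute
# nothing. Cross-entropy against some target and the variable names are
# assumptions for illustration, not the repo's actual training loop.
import torch
import torch.nn.functional as F

def masked_per_pixel_loss(logits, target, mask):
  # logits: (n, k, h, w); target: (n, h, w) int64; mask: (n, h, w) uint8
  per_pixel = F.cross_entropy(logits, target, reduction="none")  # (n, h, w)
  mask = mask.to(torch.float32)
  return (per_pixel * mask).sum() / mask.sum().clamp(min=1.)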
def IID_segmentation_loss_uncollapsed(x1_outs, x2_outs, all_affine2_to_1=None,
                                      all_mask_img1=None, lamb=1.0,
                                      half_T_side_dense=None,
                                      half_T_side_sparse_min=None,
                                      half_T_side_sparse_max=None):
  assert (x1_outs.requires_grad)
  assert (x2_outs.requires_grad)
  assert (not all_affine2_to_1.requires_grad)
  assert (not all_mask_img1.requires_grad)

  assert (x1_outs.shape == x2_outs.shape)

  # bring x2 back into x1's spatial frame
  x2_outs_inv = perform_affine_tf(x2_outs, all_affine2_to_1)

  if (half_T_side_sparse_min != 0) or (half_T_side_sparse_max != 0):
    x2_outs_inv = random_translation_multiple(
      x2_outs_inv, half_side_min=half_T_side_sparse_min,
      half_side_max=half_T_side_sparse_max)

  if RENDER:
    # indices added to each name by render()
    render(x1_outs, mode="image_as_feat", name="invert_img1_")
    render(x2_outs, mode="image_as_feat", name="invert_img2_pre_")
    render(x2_outs_inv, mode="image_as_feat", name="invert_img2_post_")
    render(all_mask_img1, mode="mask", name="invert_mask_")

  # zero out all irrelevant patches
  bn, k, h, w = x1_outs.shape
  all_mask_img1 = all_mask_img1.view(bn, 1, h, w)  # mult, already float32
  x1_outs = x1_outs * all_mask_img1  # broadcasts
  x2_outs_inv = x2_outs_inv * all_mask_img1

  # sum over everything except classes, by convolving x1_outs with x2_outs_inv
  # which is symmetric, so doesn't matter which one is the filter
  x1_outs = x1_outs.permute(1, 0, 2, 3).contiguous()  # k, ni, h, w
  x2_outs_inv = x2_outs_inv.permute(1, 0, 2, 3).contiguous()  # k, ni, h, w

  # k, k, 2 * half_T_side_dense + 1, 2 * half_T_side_dense + 1
  p_i_j = F.conv2d(x1_outs, weight=x2_outs_inv,
                   padding=(half_T_side_dense, half_T_side_dense))

  # do expectation over each shift location in the T_side_dense *
  # T_side_dense box
  T_side_dense = half_T_side_dense * 2 + 1

  # T x T x k x k
  p_i_j = p_i_j.permute(2, 3, 0, 1)
  p_i_j = p_i_j / p_i_j.sum(dim=3, keepdim=True).sum(dim=2,
                                                     keepdim=True)  # norm

  # symmetrise, transpose the k x k part
  p_i_j = (p_i_j + p_i_j.permute(0, 1, 3, 2)) / 2.0

  # T x T x k x k
  p_i_mat = p_i_j.sum(dim=2, keepdim=True).repeat(1, 1, k, 1)
  p_j_mat = p_i_j.sum(dim=3, keepdim=True).repeat(1, 1, 1, k)

  # for log stability; tiny values cancelled out by mult with p_i_j anyway
  p_i_j[(p_i_j < EPS).data] = EPS
  p_i_mat[(p_i_mat < EPS).data] = EPS
  p_j_mat[(p_j_mat < EPS).data] = EPS

  # maximise information
  loss = (-p_i_j * (torch.log(p_i_j) - lamb * torch.log(p_i_mat) -
                    lamb * torch.log(p_j_mat))).sum() / \
         (T_side_dense * T_side_dense)

  # for analysis only
  loss_no_lamb = (-p_i_j * (torch.log(p_i_j) - torch.log(p_i_mat) -
                            torch.log(p_j_mat))).sum() / \
                 (T_side_dense * T_side_dense)

  return loss, loss_no_lamb
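# Sanity sketch (not part of the repo) of the conv2d trick used in both loss
# functions: with half_T_side_dense = 0, convolving x1 with x2 as the filter,
# after moving classes into the batch/out_channels dimensions, is exactly the
# sum over batch and pixels of the outer product of the two per-pixel class
# distributions. Shapes below are arbitrary.
import torch
import torch.nn.functional as F

n, k, h, w = 2, 3, 5, 5
x1 = torch.softmax(torch.randn(n, k, h, w), dim=1)
x2 = torch.softmax(torch.randn(n, k, h, w), dim=1)

# conv trick: classes become the "batch" and "out_channels" dimensions
p_conv = F.conv2d(x1.permute(1, 0, 2, 3), weight=x2.permute(1, 0, 2, 3))
p_conv = p_conv.squeeze(-1).squeeze(-1)  # k, k

# explicit sum of per-pixel outer products, for comparison
p_explicit = torch.einsum("nihw,njhw->ij", x1, x2)

assert torch.allclose(p_conv, p_explicit, atol=1e-5)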
def IID_segmentation_loss(x1_outs, x2_outs, all_affine2_to_1=None,
                          all_mask_img1=None, lamb=1.0,
                          half_T_side_dense=None,
                          half_T_side_sparse_min=None,
                          half_T_side_sparse_max=None):
  assert (x1_outs.requires_grad)
  assert (x2_outs.requires_grad)
  assert (not all_affine2_to_1.requires_grad)
  assert (not all_mask_img1.requires_grad)

  assert (x1_outs.shape == x2_outs.shape)

  # bring x2 back into x1's spatial frame
  x2_outs_inv = perform_affine_tf(x2_outs, all_affine2_to_1)

  if (half_T_side_sparse_min != 0) or (half_T_side_sparse_max != 0):
    x2_outs_inv = random_translation_multiple(
      x2_outs_inv, half_side_min=half_T_side_sparse_min,
      half_side_max=half_T_side_sparse_max)

  if RENDER:
    # indices added to each name by render()
    render(x1_outs, mode="image_as_feat", name="invert_img1_")
    render(x2_outs, mode="image_as_feat", name="invert_img2_pre_")
    render(x2_outs_inv, mode="image_as_feat", name="invert_img2_post_")
    render(all_mask_img1, mode="mask", name="invert_mask_")

  # zero out all irrelevant patches
  bn, k, h, w = x1_outs.shape
  all_mask_img1 = all_mask_img1.view(bn, 1, h, w)  # mult, already float32
  x1_outs = x1_outs * all_mask_img1  # broadcasts
  x2_outs_inv = x2_outs_inv * all_mask_img1

  # sum over everything except classes, by convolving x1_outs with x2_outs_inv
  # which is symmetric, so doesn't matter which one is the filter
  x1_outs = x1_outs.permute(1, 0, 2, 3).contiguous()  # k, ni, h, w
  x2_outs_inv = x2_outs_inv.permute(1, 0, 2, 3).contiguous()  # k, ni, h, w

  # k, k, 2 * half_T_side_dense + 1, 2 * half_T_side_dense + 1
  p_i_j = F.conv2d(x1_outs, weight=x2_outs_inv,
                   padding=(half_T_side_dense, half_T_side_dense))
  p_i_j = p_i_j.sum(dim=2, keepdim=False).sum(dim=2, keepdim=False)  # k, k

  # normalise, use sum, not bn * h * w * T_side * T_side, because we use a
  # mask; also, some pixels did not have a completely unmasked box
  # neighbourhood, but it's fine - just less samples from that pixel
  current_norm = float(p_i_j.sum())
  p_i_j = p_i_j / current_norm

  # symmetrise
  p_i_j = (p_i_j + p_i_j.t()) / 2.

  # compute marginals
  p_i_mat = p_i_j.sum(dim=1).unsqueeze(1)  # k, 1
  p_j_mat = p_i_j.sum(dim=0).unsqueeze(0)  # 1, k

  # for log stability; tiny values cancelled out by mult with p_i_j anyway
  p_i_j[(p_i_j < EPS).data] = EPS
  p_i_mat[(p_i_mat < EPS).data] = EPS
  p_j_mat[(p_j_mat < EPS).data] = EPS

  # maximise information
  loss = (-p_i_j * (torch.log(p_i_j) - lamb * torch.log(p_i_mat) -
                    lamb * torch.log(p_j_mat))).sum()

  # for analysis only
  loss_no_lamb = (-p_i_j * (torch.log(p_i_j) - torch.log(p_i_mat) -
                            torch.log(p_j_mat))).sum()

  return loss, loss_no_lamb
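# Hedged usage sketch: how a training step might combine _prepare_train's
# outputs with the loss above. The names net and optimiser, the assumption
# that the network outputs softmaxed (n, k, h, w) heads, and the chosen
# hyperparameter values are illustrative, not the repo's training script.
import torch

def train_step(net, optimiser, img1, img2, affine2_to_1, mask_img1,
               lamb=1.0, half_T_side_dense=0):
  x1_outs = net(img1)  # assumed: (n, k, h, w), softmaxed over k
  x2_outs = net(img2)
  loss, loss_no_lamb = IID_segmentation_loss(
    x1_outs, x2_outs, all_affine2_to_1=affine2_to_1,
    all_mask_img1=mask_img1.to(torch.float32), lamb=lamb,
    half_T_side_dense=half_T_side_dense,
    half_T_side_sparse_min=0, half_T_side_sparse_max=0)
  optimiser.zero_grad()
  loss.backward()
  optimiser.step()
  return loss.item(), loss_no_lamb.item()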