# NOTE: these functions were collected from several modules of the project;
# the imports below are a best-effort reconstruction covering what the code
# references here (adjust to the actual file layout).
import os
import pickle
import random
import tarfile
import time

import cv2
import imageio
import numpy as np
import scipy.io.wavfile
import torch
import torch.nn as nn
import wget
from PIL import Image, ImageDraw
from scipy.ndimage import label
from scipy.ndimage import morphology as morph
from torchvision import transforms

import active_learning
import losses
import segmenter
import utils


def vectorized_negatives_loss(image_outputs, audio_outputs, negatives_output,
                              nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair
    using the specific (edited) negative, in a vectorized way. Returns the list
    of per-sample losses instead of their aggregate.
    """
    num_units = image_outputs.size(1)
    output_loss = []
    n = image_outputs.size(0)
    first = 0
    last = num_units
    for i in range(n):
        nF = nframes[i]
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i][first:last],
                                   audio_outputs[i][first:last, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i][first:last],
                                   audio_outputs[i][first:last, :, 0:nF], symfun))
        I2A_simdif = margin + Aimpsim - anchorsim
        output_loss.append(I2A_simdif)
    return output_loss

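
# The losses in this file all score an (image, audio) pair through
# utils.compute_matchmap and utils.matchmap_sim, which are defined elsewhere.
# As a point of reference, here is a minimal sketch of what those helpers
# compute, under the shape conventions used at the call sites above (image
# features D x H x W, audio features D x 1 x T). The *_sketch names are
# hypothetical; the real utils versions also take the `symfun` argument to
# select the reduction, which this sketch ignores.
def compute_matchmap_sketch(image_feat, audio_feat, symfun=None):
    D, H, W = image_feat.shape
    T = audio_feat.shape[-1]
    # Dot product between every image location and every audio frame:
    # matchmap[h, w, t] = <image_feat[:, h, w], audio_feat[:, 0, t]>
    return torch.einsum('dhw,dt->hwt', image_feat, audio_feat.reshape(D, T))


def matchmap_sim_sketch(matchmap):
    # One common reduction ("max over space, mean over time"): take the best
    # image location for each audio frame, then average over frames
    return matchmap.max(0)[0].max(0)[0].mean()
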
def combined_random_sampled_margin_rank_loss(image_outputs, audio_outputs,
                                             negatives_output, nframes,
                                             margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair
    using both a random negative from the batch of positive images and the
    specific (edited) negative for the image. For each sample, the higher of
    the two image-to-audio losses is used.
    """
    n = image_outputs.size(0)
    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        # Sample impostor indices different from the anchor
        I_imp_ind = i
        A_imp_ind = i
        while I_imp_ind == i:
            I_imp_ind = np.random.randint(0, n)
        while A_imp_ind == i:
            A_imp_ind = np.random.randint(0, n)
        nF = nframes[i]
        nFimp = nframes[A_imp_ind]
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :, 0:nFimp], symfun))
        # Similarity of the anchor audio to the edited (semantic) negative
        # image. The anchor term is the same anchorsim as above (the original
        # recomputed it with identical arguments).
        Aimpsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        I2A_simdif_neg = margin + Aimpsim_neg - anchorsim
        A2I_simdif = margin + Iimpsim - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif
        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + torch.max(I2A_simdif, I2A_simdif_neg)
    loss = loss / n
    return loss

def segment_images_iter(self):
    images = {}
    audios = {}
    counter_images = 0
    for batch_id, (image_input, audio_input, _, nframes, path,
                   image_raw) in enumerate(self.dataloader):
        v_init = self.z[int(path[0])]
        z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])
        for k in range(audio_input.size(0)):
            z_img[k, :] = self.z[int(path[k])]
        image_input = self.generator.generate_images(z_img, intervention=None)
        image_input = utils.transform(image_input)
        # non_blocking replaces the old async=True (a keyword since Python 3.7)
        audio_input = audio_input.cuda(non_blocking=True)
        model_output = self.model(image_input, audio_input, [])
        image_output = model_output[0]
        audio_output = model_output[1]
        pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
        nframes = nframes.div(pooling_ratio)
        # Compute the matchmap to detect where there are important concepts
        # that we want to cluster (this time in the image)
        for i in range(image_input.shape[0]):
            nF = nframes[i]
            matchmap_i = utils.compute_matchmap(image_output[i],
                                                audio_output[i][:, :, 0:nF])
            matchmap_i_mean = matchmap_i.mean(2).view(-1)
            # Move to CPU before converting to NumPy
            indexes = np.where((matchmap_i_mean > 0.9 * matchmap_i_mean.max()).cpu())[0]
            features_im = image_output[i].view(image_output.shape[1], -1)[..., indexes].cpu().numpy()
            product = np.matmul(self.centroids, features_im)
            # For each selected superpixel in the image, find the top 5 concepts
            seg_image = {}
            for j, index in enumerate(indexes):
                clust = np.argsort(-product[:, j])[:5]
                seg_image[index] = clust
            images[path[i]] = seg_image
            # Same for the audio, for testing purposes
            matchmap_i_max = matchmap_i.max(1)[0].max(0)[0]
            indexes = np.where((matchmap_i_max > 0.9 * matchmap_i_max.max()).cpu())[0]
            features_au = audio_output[i].view(audio_output.shape[1], -1)[..., indexes].cpu().numpy()
            product = np.matmul(self.centroids, features_au)
            # For each selected audio frame, find the top 5 concepts
            seg_audio = {}
            for j, index in enumerate(indexes):
                clust = np.argsort(-product[:, j])[:5]
                seg_audio[index + 20] = clust
            audios[path[i]] = seg_audio
            counter_images += 1
            if counter_images >= self.num_images_segment:
                return images, audios
    return images, audios

def sampled_margin_rank_loss(image_outputs, audio_outputs, nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair.
    The impostor image/caption is randomly sampled from the minibatch.
    """
    n = image_outputs.size(0)
    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        # Sample impostor indices different from the anchor
        I_imp_ind = i
        A_imp_ind = i
        while I_imp_ind == i:
            I_imp_ind = np.random.randint(0, n)
        while A_imp_ind == i:
            A_imp_ind = np.random.randint(0, n)
        nF = nframes[i]
        nFimp = nframes[A_imp_ind]
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :, 0:nFimp], symfun))
        A2I_simdif = margin + Iimpsim - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif
        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + I2A_simdif
    loss = loss / n
    return loss

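
# A minimal smoke test for sampled_margin_rank_loss with random tensors
# standing in for network outputs. The shapes mirror the call sites in
# train_epoch below; the embedding size, spatial size, and frame counts are
# made-up values, and symfun=None assumes utils.compute_matchmap tolerates it.
def _smoke_test_sampled_margin_rank_loss():
    torch.manual_seed(0)
    image_feats = torch.randn(4, 512, 7, 7)    # N x D x H x W
    audio_feats = torch.randn(4, 512, 1, 64)   # N x D x 1 x T
    nframes = torch.tensor([64, 48, 64, 32])   # valid audio frames per caption
    loss = sampled_margin_rank_loss(image_feats, audio_feats, nframes,
                                    margin=1.0, symfun=None)
    return loss  # scalar tensor, averaged over the batch
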
def segment_batch(self, tensor_images, downsample=1):
    '''
    Returns a multilabel segmentation for the given batch of (RGB [-1...1])
    images. Each pixel of the result is a torch.long indicating a predicted
    class number. Multiple classes can be predicted for the same pixel: the
    output shape is (n, multipred, y, x), where multipred is 3, 5, or 6,
    depending on how many different labels can be predicted for each pixel
    (i.e., on whether subdivision is being used). If downsample is specified,
    the output y and x dimensions are downsampled from the original image.
    '''
    output_images, _, _ = self.model(tensor_images, None, [])  # N x 512 x W x H
    output_seg = []
    for i in range(len(output_images)):
        c_trans = torch.transpose(self.clusters, 0, 1)
        c_trans = c_trans[:, None, :]  # 512 x 1 x T
        clust_mean = self.mean_clust.view(-1, 1, 1)
        clust_mean = clust_mean.expand(output_images[i].size(0),
                                       output_images[i].size(1),
                                       output_images[i].size(2))
        std_clust = self.std_clust.view(-1, 1, 1)
        std_clust = std_clust.expand(output_images[i].size(0),
                                     output_images[i].size(1),
                                     output_images[i].size(2))
        im_normalized = (output_images[i] - clust_mean) / (std_clust + 1e-8)
        matchmap = utils.compute_matchmap(im_normalized, c_trans)  # H x W x N_clusters
        matchmap = matchmap.permute(2, 0, 1)  # N_clusters x H x W
        matchmap = torch.nn.functional.interpolate(matchmap[None, :, :, :],
                                                   size=(64, 64),
                                                   mode='bilinear')[0]
        # Zero out activations not above self.threshold, then saturate: the
        # double negation sets every value >= 0.1 to exactly 1
        matchmap = nn.Threshold(self.threshold, 0)(matchmap)
        matchmap = -nn.Threshold(-0.1, -1)(-matchmap)
        seg = torch.zeros(self.clusters.size(0), matchmap.size(1),
                          matchmap.size(2)).long().cuda()
        for c in range(self.clusters.size(0)):
            seg[c, :, :] = ((c + 1) * matchmap[c, :, :]).long()
        output_seg.append(seg)
    output_seg = torch.stack(output_seg)
    return output_seg

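
# Hypothetical usage of segment_batch. `seg_model` stands for whatever object
# carries this method, with self.clusters (N_clusters x 512), self.mean_clust
# and self.std_clust (512-dim), and self.threshold already populated:
#
#     images = torch.rand(2, 3, 128, 128).cuda() * 2 - 1   # RGB in [-1, 1]
#     seg = seg_model.segment_batch(images)
#     print(seg.shape)  # (2, N_clusters, 64, 64); entries are 0 or c + 1
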
def predict_single_class(self, tensor_images, classnum, downsample=1):
    '''
    Given a batch of images (RGB, normalized to [-1...1]) and a specific
    segmentation class number, returns a differentiable ([0..1]) prediction
    score for the class at every pixel of the input image, normalized to sum
    to 1 per image. Does not work on subdivided labels.
    '''
    output_images, _, _ = self.model(tensor_images, None, [])  # N x 512 x W x H
    output_seg = []
    for i in range(len(output_images)):
        c_trans = torch.transpose(self.clusters, 0, 1)
        c_trans = c_trans[:, None, :]  # 512 x 1 x T
        clust_mean = self.mean_clust.view(-1, 1, 1)
        clust_mean = clust_mean.expand(output_images[i].size(0),
                                       output_images[i].size(1),
                                       output_images[i].size(2))
        std_clust = self.std_clust.view(-1, 1, 1)
        std_clust = std_clust.expand(output_images[i].size(0),
                                     output_images[i].size(1),
                                     output_images[i].size(2))
        im_normalized = (output_images[i] - clust_mean) / (std_clust + 1e-8)
        # Matchmap against the single selected cluster: H x W x 1
        matchmap = utils.compute_matchmap(im_normalized,
                                          c_trans[:, :, classnum:classnum + 1])
        # The cluster dimension has size 1 here, so index 0 (the original
        # indexed classnum - 1, which is only valid for classnum == 1)
        matchmap = matchmap[:, :, 0]
        matchmap = nn.Threshold(self.threshold, 0)(matchmap)
        matchmap = matchmap / (torch.sum(matchmap) + 1e-8)
        output_seg.append(matchmap)
    output_seg = torch.stack(output_seg)
    return output_seg

def hard_negative_loss(image_outputs, audio_outputs, nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair
    using the hardest sample from the batch of positive images.
    """
    n = image_outputs.size(0)
    # Build the full N x N similarity matrix (no gradients needed) to find the
    # hardest impostors
    with torch.no_grad():
        N = image_outputs.size(0)
        similarity_loss = torch.zeros(N, N, requires_grad=False).type(
            image_outputs.data.type())
        D = image_outputs.size(1)
        H = image_outputs.size(2)
        W = image_outputs.size(3)
        T = audio_outputs.size(3)
        image_outputs_hard = image_outputs.detach()
        audio_outputs_hard = audio_outputs.detach()
        image_outputs_hard = image_outputs_hard.view(N, 1, D, H, W).expand(
            N, N, D, H, W).contiguous().view(-1, D, H, W)
        audio_outputs_hard = audio_outputs_hard.view(1, N, D, 1, T).expand(
            N, N, D, 1, T).contiguous().view(-1, D, 1, T)
        match_hard = utils.compute_matchmap_vectorized(
            image_outputs_hard, audio_outputs_hard).view(N, N, H, W, T)
        match_hard, _ = match_hard.max(3)
        match_hard, _ = match_hard.max(2)
        for i in range(N):
            similarity_loss[:, i] = match_hard[:, i, 0:nframes[i]].mean(1)
    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        _, rank_image = torch.topk(similarity_loss[:, i], 3)
        _, rank_audio = torch.topk(similarity_loss[i, :], 3)
        I_imp_ind = rank_image[0]
        A_imp_ind = rank_audio[0]
        # Skip the anchor itself if it ranks first
        if I_imp_ind == i:
            I_imp_ind = rank_image[1]
        if A_imp_ind == i:
            A_imp_ind = rank_audio[1]
        nF = nframes[i]
        if A_imp_ind < nframes.size(0):
            nFimp = nframes[A_imp_ind]
        else:
            nFimp = 16
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :, 0:nFimp], symfun))
        A2I_simdif = margin + Iimpsim - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif
        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + I2A_simdif
    loss = loss / n
    return loss

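
# The expand().contiguous() block in hard_negative_loss materializes all N^2
# image/audio pairs before scoring them. An equivalent sketch of the same
# N x N similarity matrix using einsum broadcasting (pairwise_matchmap_sim is
# a hypothetical helper, not part of the original code):
def pairwise_matchmap_sim(image_outputs, audio_outputs, nframes):
    N, D, H, W = image_outputs.shape
    T = audio_outputs.shape[-1]
    # sims[i, j, h, w, t] = <image_outputs[i, :, h, w], audio_outputs[j, :, 0, t]>
    sims = torch.einsum('idhw,jdt->ijhwt', image_outputs,
                        audio_outputs.reshape(N, D, T))
    sims = sims.amax(dim=(2, 3))  # max over image locations -> N x N x T
    sim_matrix = torch.zeros(N, N, device=image_outputs.device)
    for j in range(N):
        # Mean over the valid audio frames of caption j, as above
        sim_matrix[:, j] = sims[:, j, :nframes[j]].mean(1)
    return sim_matrix
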
def combined_random_hard_negative_loss(image_outputs, audio_outputs,
                                       negatives_output, nframes, margin, symfun):
    """
    Computes the triplet margin ranking loss for each anchor image/caption pair
    using both the hardest negative in the batch of positive images and the
    specific (edited) negative for the image. Returns the higher of the two
    losses for each sample.
    """
    n = image_outputs.size(0)
    # Build the full N x N similarity matrix (no gradients needed) to find the
    # hardest impostors
    with torch.no_grad():
        N = image_outputs.size(0)
        similarity_loss = torch.zeros(N, N, requires_grad=False).type(
            image_outputs.data.type())
        D = image_outputs.size(1)
        H = image_outputs.size(2)
        W = image_outputs.size(3)
        T = audio_outputs.size(3)
        image_outputs_hard = image_outputs.detach()
        audio_outputs_hard = audio_outputs.detach()
        image_outputs_hard = image_outputs_hard.view(N, 1, D, H, W).expand(
            N, N, D, H, W).contiguous().view(-1, D, H, W)
        audio_outputs_hard = audio_outputs_hard.view(1, N, D, 1, T).expand(
            N, N, D, 1, T).contiguous().view(-1, D, 1, T)
        match_hard = utils.compute_matchmap_vectorized(
            image_outputs_hard, audio_outputs_hard).view(N, N, H, W, T)
        match_hard, _ = match_hard.max(3)
        match_hard, _ = match_hard.max(2)
        for i in range(N):
            similarity_loss[:, i] = match_hard[:, i, 0:nframes[i]].mean(1)
    loss = torch.zeros(1, requires_grad=True).type(image_outputs.data.type())
    for i in range(n):
        if n >= 2:
            _, rank_image = torch.topk(similarity_loss[:, i], 2)
            _, rank_audio = torch.topk(similarity_loss[i, :], 2)
            I_imp_ind = rank_image[0]
            A_imp_ind = rank_audio[0]
            if I_imp_ind == i:
                I_imp_ind = rank_image[1]
            if A_imp_ind == i:
                A_imp_ind = rank_audio[1]
            I_imp_ind = max(min(image_outputs.size(0) - 1, I_imp_ind), 0)
            A_imp_ind = max(min(image_outputs.size(0) - 1, A_imp_ind), 0)
        else:
            I_imp_ind = 0
            A_imp_ind = 0
        nF = nframes[i]
        if A_imp_ind < nframes.size(0):
            nFimp = nframes[A_imp_ind]
        else:
            nFimp = 16
        anchorsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Iimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[I_imp_ind],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        Aimpsim = utils.matchmap_sim(
            utils.compute_matchmap(image_outputs[i],
                                   audio_outputs[A_imp_ind][:, :, 0:nFimp], symfun))
        A2I_simdif = margin + Iimpsim - anchorsim
        # The anchor term of the edited-negative loss is the same anchorsim as
        # above (the original recomputed it with identical arguments)
        Aimpsim_neg = utils.matchmap_sim(
            utils.compute_matchmap(negatives_output[i],
                                   audio_outputs[i][:, :, 0:nF], symfun))
        I2A_simdif_neg = margin + Aimpsim_neg - anchorsim
        if (A2I_simdif.data > 0).all():
            loss = loss + A2I_simdif
        I2A_simdif = margin + Aimpsim - anchorsim
        if (I2A_simdif.data > 0).all():
            loss = loss + torch.max(I2A_simdif, I2A_simdif_neg)
    loss = loss / n
    return loss

def train_epoch(self, epoch):
    """
    Train one epoch. It consists of 5 steps:
      Step 1: Compute the output of the positive image
      Step 2: Compute the mask for the positive image features
      Step 3: Generate the negative image from this mask
      Step 4: Compute the output of this negative
      Step 5: Compute all the losses
    After that, do the backpropagation and weight updates.
    """
    if not self.args.use_cpu:
        torch.cuda.synchronize()
    batch_time = utils.AverageMeter()
    data_time = utils.AverageMeter()
    losses_meter = utils.AverageMeter()

    # Switch to train mode
    self.model.train()

    end = time.time()
    N_examples = self.loaders['train'].dataset.__len__()
    loss_list_total = {'loss_regular': 0, 'loss_neg': 0, 'loss_hardneg': 0,
                       'loss_total': 0}
    for batch_id, (image_input, audio_input, neg_images, nframes, path,
                   image_raw) in enumerate(self.loaders['train']):
        loss_list = {'loss_regular': 0, 'loss_neg': 0, 'loss_hardneg': 0,
                     'loss_total': 0}

        # Measure data loading time
        data_time.update(time.time() - end)
        if not self.args.use_cpu:
            audio_input = audio_input.cuda(non_blocking=True)
        if not self.args.loading_image:
            # In case the audio is inside a subfolder
            path_ints = [p.split('/')[-1] for p in path]
            v_init = self.z[int(path_ints[0])]
            z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0])
            for k in range(image_input.size(0)):
                z_img[k, :] = self.z[int(path_ints[k])]
            image_input = self.generator.generate_images(z_img, intervention=None)
            image_input = utils.transform(image_input).detach()
        else:
            image_input = image_input.cuda()
            neg_images = neg_images.cuda()

        # STEP 1: Compute output of the positive
        model_output = self.model(image_input, audio_input, [])
        image_output = model_output[0]
        audio_output = model_output[1]
        neg_images = []
        pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
        nframes.div_(pooling_ratio)

        binary_mask_0 = None
        # Only do steps 2-4 if we want to train with semantic negatives
        if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
            # STEP 2: Compute mask from image features
            limits = np.zeros((image_input.size(0), 2))
            for i in range(image_input.size(0)):
                pos_image = image_input[i, :, :, :]
                nF = nframes[i]
                matchmap = utils.compute_matchmap(image_output[i],
                                                  audio_output[i][:, :, :nF])
                matchmap = matchmap.data.cpu().numpy().copy()
                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / (matchmap.max() + 1e-10)
                matchmap_image = matchmap.max(axis=0)
                threshold = 0.95
                # Unravel the argmax into (t, h, w); this assumes a square
                # spatial matchmap (H == W)
                ind_max = np.argmax(matchmap)
                ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]
                limits[i, 0] = ind_t
                limits[i, 1] = ind_t + 1
                if self.clustering:
                    if self.args.active_learning and 'active' in path[i]:
                        neg_img = active_learning.get_negatives(self, path_ints[i])
                    else:
                        v = (image_output[i][:, ind_h, ind_w] - self.mean_clust.cuda()) / \
                            (self.std_clust.cuda() + 1e-8)
                        normalized_clusters = np.matmul(
                            self.clusters.cpu(), v.detach().cpu().numpy().transpose())
                        sorted_val = -np.sort(-normalized_clusters[:])
                        sorted_val = np.clip(sorted_val, 0, 4)
                        if np.sum(sorted_val) <= 0:
                            print("None of the clusters was close to the image feature. "
                                  "If this happens regularly, it probably means they were "
                                  "low-quality clusters. Did you pretrain with a regular "
                                  "loss before clustering?")
                        prob_samples = sorted_val / np.sum(sorted_val)
                        sorted_id = np.argsort(-normalized_clusters[:])
                        cluster_id = sorted_id[0]
                        norm = 0
                        threshold_random = 0.95
                        # The number of units to be ablated grows if we cannot
                        # generate a good (changed) negative. The following
                        # numbers are the starting number of units to change.
                        num_units_dict = {'layer2': 30, 'layer3': 30,
                                          'layer4': 140, 'layer5': 30,
                                          'layer6': 30}
                        threshold_heatmap = threshold
                        count = 0
                        binary_mask_eval = matchmap_image > (threshold_heatmap * matchmap_image.max())
                        binary_mask_eval = utils.geodesic_dilation(binary_mask_eval, (ind_h, ind_w))
                        binary_mask_eval = cv2.resize(binary_mask_eval, (128, 128))
                        bmask = torch.Tensor(binary_mask_eval).cuda()
                        bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                        while norm < threshold_random:
                            with torch.no_grad():
                                binary_mask = matchmap_image > (threshold_heatmap * matchmap_image.max())
                                binary_mask = utils.geodesic_dilation(binary_mask, (ind_h, ind_w))
                                if binary_mask_0 is None:
                                    binary_mask_0 = cv2.resize(binary_mask, (224, 224))
                                # STEP 3: Generate new image
                                z_img = self.z[int(path_ints[i])]
                                z_img = z_img[np.newaxis, :]
                                _ = self.generator.generate_images(z_img)
                                intervention = {}
                                for layer_n in self.layer_list_all:
                                    units_ids = self.layers_units[layer_n][cluster_id][:num_units_dict[layer_n]]
                                    layer_size = self.layers_dict[layer_n]['size']
                                    layer_dim = self.layers_dict[layer_n]['depth']
                                    ablation, replacement = self.get_ablation_replacement(
                                        params=[layer_dim, units_ids], option='specific')
                                    ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
                                    ablation_final = np.tile(ablation_final,
                                                             (layer_dim, 1, 1)).astype(np.float32)
                                    ablation_final = torch.cuda.FloatTensor(ablation_final)
                                    ablation_final = ablation.view(
                                        layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
                                    intervention[layer_n] = (ablation_final, replacement)
                                neg_img = self.generator.generate_images(
                                    z_img, intervention=intervention).detach()
                                neg_img_t = utils.transform(neg_img).detach()
                                # How different is the negative from the
                                # positive inside the mask, relative to the
                                # positive's norm there?
                                norm = (neg_img_t[0, :, :, :] - pos_image.detach())
                                norm = norm * bmask
                                norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0)
                                norm_normalized = norm / torch.norm(
                                    torch.norm(torch.norm(pos_image.detach() * bmask, dim=2), dim=1), dim=0)
                                norm = norm_normalized.item()
                                for layer_n in self.layer_list_all:
                                    # Increase the number of units to change
                                    num_units_dict[layer_n] = num_units_dict[layer_n] + 40
                                threshold_heatmap = threshold_heatmap - 0.1
                                threshold_random = threshold_random - 0.05
                                cluster_id = np.random.choice(sorted_id, size=1, p=prob_samples)[0]
                                count = count + 1
                else:  # random edited negatives
                    binary_mask = matchmap_image > (threshold * matchmap_image.max())
                    binary_mask = utils.geodesic_dilation(binary_mask, (ind_h, ind_w))
                    if binary_mask_0 is None:
                        binary_mask_0 = cv2.resize(binary_mask, (224, 224))
                    norm = 0
                    threshold_random = 0.95
                    p = 0.4
                    while norm < threshold_random:
                        with torch.no_grad():
                            intervention = {}
                            for layer_n in self.layer_list_all:
                                layer_size = self.layers_dict[layer_n]['size']
                                layer_dim = self.layers_dict[layer_n]['depth']
                                # Note: p is incremented below, but the
                                # ablation probability here is fixed at 0.5
                                # (cf. test_recall, which passes p)
                                ablation, replacement = self.get_ablation_replacement(
                                    params=[layer_dim, True, 0.5], option='random')
                                ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
                                ablation_final = np.tile(ablation_final,
                                                         (layer_dim, 1, 1)).astype(np.float32)
                                ablation_final = torch.cuda.FloatTensor(ablation_final)
                                ablation_final = ablation.view(
                                    layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
                                intervention[layer_n] = (ablation_final, replacement)
                            # STEP 3: Generate new image
                            z_img = self.z[int(path_ints[i])]
                            z_img = z_img[np.newaxis, :].detach()
                            neg_img = self.generator.generate_images(
                                z_img, intervention=intervention).detach()
                            neg_img_t = utils.transform(neg_img).detach()
                            binary_mask = cv2.resize(binary_mask, (128, 128))
                            bmask = torch.Tensor(binary_mask).cuda()
                            bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                            norm = (neg_img_t[0, :, :, :] - pos_image.detach())
                            norm = norm * bmask
                            norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0)
                            norm_normalized = norm / torch.norm(
                                torch.norm(torch.norm(pos_image.detach() * bmask, dim=2), dim=1), dim=0)
                            norm = norm_normalized.item()
                            if random.random() > 0.2:
                                p = p + 0.05
                            else:
                                threshold_random = threshold_random - 0.01
                neg_images.append(neg_img)
            neg_images = torch.cat(neg_images)
            neg_images_t = utils.transform(neg_images)

            # STEP 4: Compute output of the negative
            image_output_neg, _, _ = self.model(neg_images_t, None, [])

        # STEP 5: Compute losses
        if self.args.active_learning:
            image_output, image_output_neg = active_learning.switch_pos_neg(
                self, image_input, image_output, image_output_neg, path)
        if self.loss_type == 'regular':
            loss = losses.sampled_margin_rank_loss(image_output, audio_output,
                                                   nframes, self.margin,
                                                   self.args.symfun)
            loss_list['loss_regular'] = loss.item()
            loss_list['loss_total'] = loss.item()
        elif self.loss_type == 'negatives_edited':  # train with semantic negatives
            loss_regular = losses.sampled_margin_rank_loss(
                image_output, audio_output, nframes, self.margin, self.args.symfun)
            loss_neg = losses.negatives_loss(image_output, audio_output,
                                             image_output_neg, nframes,
                                             self.margin, self.args.symfun)
            loss = loss_regular + loss_neg
            loss_list['loss_regular'] = loss_regular.item()
            loss_list['loss_neg'] = loss_neg.item()
            loss_list['loss_total'] = loss.item()
        elif self.loss_type == 'negatives_hard':  # train with hard negatives
            loss_regular = losses.sampled_margin_rank_loss(
                image_output, audio_output, nframes, self.margin, self.args.symfun)
            loss_neg = losses.hard_negative_loss(image_output, audio_output,
                                                 nframes, self.margin,
                                                 self.args.symfun)
            loss = loss_regular + loss_neg
            loss_list['loss_regular'] = loss_regular.item()
            loss_list['loss_neg'] = loss_neg.item()
            loss_list['loss_total'] = loss.item()
        elif self.loss_type == 'negatives_both':  # combine hard negatives with semantic negatives
            loss_hardneg = losses.combined_random_hard_negative_loss(
                image_output, audio_output, image_output_neg, nframes,
                self.margin, self.args.symfun)
            loss_regular = losses.sampled_margin_rank_loss(
                image_output, audio_output, nframes, self.margin, self.args.symfun)
            loss_regular = torch.clamp(loss_regular, min=0, max=5)
            loss_hardneg = torch.clamp(loss_hardneg, min=0, max=5)
            loss = loss_regular + loss_hardneg
            loss_list['loss_regular'] = loss_regular.item()
            loss_list['loss_hardneg'] = loss_hardneg.item()
            loss_list['loss_total'] = loss.item()
        else:
            raise Exception(f'The loss function {self.loss_type} is not implemented.')

        last_sample = N_examples * epoch + batch_id * self.args.batch_size + image_input.size(0)

        # Record loss
        losses_meter.update(loss.item(), image_input.size(0))

        # Backward pass and update
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Print results
        if (batch_id + 1) % self.args.print_freq == 0:
            for name in loss_list:
                loss_list_total[name] += loss_list[name]
            for name in loss_list:
                loss_list_total[name] = loss_list_total[name] / self.args.print_freq
            for loss_name in loss_list:
                self.args.writer.add_scalar(f'losses/{loss_name}',
                                            loss_list_total[loss_name],
                                            last_sample)
            print(f'Epoch: [{epoch}][{batch_id+1}/{len(self.loaders["train"])}]\t'
                  f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  f'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  f'Loss {losses_meter.val:.4f} ({losses_meter.avg:.4f})\t',
                  flush=True)
            image_raw = self.unorm(image_input[0].data.cpu())
            self.args.writer.add_image('positive', image_raw, last_sample)
            if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
                image_raw_neg = self.unorm(neg_images[0].data.cpu())
                image_neg = image_raw_neg / torch.max(image_raw_neg)
                self.args.writer.add_image('negative', image_neg, last_sample)
                self.args.writer.add_image(
                    'Images/region',
                    255 * np.array([binary_mask_0, binary_mask_0, binary_mask_0]).swapaxes(0, 1).swapaxes(1, 2),
                    last_sample)
            loss_list_total = {k: 0 for k, v in loss_list_total.items()}
        else:
            for loss_name in loss_list:
                loss_list_total[loss_name] += loss_list[loss_name]

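
# The pooling_ratio bookkeeping in train_epoch converts caption lengths from
# input spectrogram frames to audio-embedding frames. A worked example with
# made-up sizes: a 1 x 1 x 40 x 2048 log-mel input and a 1 x 512 x 1 x 64
# audio embedding give pooling_ratio = 2048 / 64 = 32, so a caption spanning
# 1600 spectrogram frames covers 50 embedding frames. Note that the in-place
# nframes.div_() above assumes older PyTorch integer-division semantics;
# recent versions need an explicit rounding mode:
def _pooling_ratio_example():
    audio_input = torch.zeros(1, 1, 40, 2048)
    audio_output = torch.zeros(1, 512, 1, 64)
    pooling_ratio = round(audio_input.size(3) / audio_output.size(3))  # 32
    nframes = torch.tensor([1600])
    nframes = torch.div(nframes, pooling_ratio, rounding_mode='floor')
    return pooling_ratio, nframes  # (32, tensor([50]))
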
def get_datapoints(self):
    """
    Compute datapoints
    :return: datapoints and path names identifying all the datapoints
    """
    names_audio = []
    names_image = []
    finish = False
    dim = self.model_dim
    datapoints_image = np.zeros((self.max_datapoints, dim))
    datapoints_audio = np.zeros((self.max_datapoints, dim))
    datapoints_mul = np.zeros((self.max_datapoints, dim))
    current_datapoints_image = 0
    current_datapoints_audio = 0
    current_datapoints_mul = 0
    finish_image = False
    finish_audio = False
    for batch_id, (image_input, audio_input, _, nframes, path,
                   image_raw) in enumerate(self.dataloader):
        if finish:
            break
        # In case the audio is inside a subfolder
        path_ints = [p.split('/')[-1] for p in path]
        v_init = self.z[int(path_ints[0])]
        z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])
        for k in range(audio_input.size(0)):
            z_img[k, :] = self.z[int(path_ints[k])]
        image_input = self.generator.generate_images(z_img, intervention=None)
        image_input = utils.transform(image_input)
        image_input = image_input.cuda(non_blocking=True)
        audio_input = audio_input.cuda(non_blocking=True)
        model_output = self.model(image_input, audio_input, [])
        image_output = model_output[0]
        audio_output = model_output[1]
        pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
        nframes.div_(pooling_ratio)
        # Compute the matchmap to detect where there are important concepts
        # that we want to cluster
        for i in range(image_input.shape[0]):
            nF = nframes[i]
            matchmap_i = utils.compute_matchmap(image_output[i],
                                                audio_output[i][:, :, 0:nF])
            matchmap = matchmap_i.data.cpu().numpy().copy()
            matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
            matchmap = matchmap / matchmap.max()
            ind_max = np.argmax(matchmap)
            ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
            ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
            ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]
            d_audio = audio_output[i][:, 0, ind_t].view(-1)
            d_image = image_output[i][:, ind_h, ind_w].view(-1)
            d_all = d_audio * d_image
            datapoints_mul[current_datapoints_mul:current_datapoints_mul + 1] = d_all.cpu().numpy()
            current_datapoints_mul = current_datapoints_mul + 1

            # Image datapoints: one per connected component of high activation
            matchmap_i_mean = matchmap_i.mean(2).view(-1)
            structure = np.ones(3, dtype=int)
            labeled, ncomponents = label(
                matchmap_i_mean.cpu() > 0.5 * matchmap_i_mean.max().cpu(), structure)
            # Integer dtype so these can be used as indices below
            indexes = np.zeros(ncomponents, dtype=int)
            for n in range(ncomponents):
                indexes[n] = np.array(np.where(labeled == n + 1)).mean().round().astype(int)
            num_datapoints = len(indexes)
            if current_datapoints_image + num_datapoints > self.max_datapoints:
                num_datapoints = self.max_datapoints - current_datapoints_image
            datapoints_i = image_output[i].view(image_output.shape[1], -1)[:, indexes[:num_datapoints]]
            if num_datapoints > 0:
                datapoints_image[current_datapoints_image:current_datapoints_image + num_datapoints] = \
                    datapoints_i.transpose(1, 0).cpu().numpy()
                names_i = []
                for index in indexes[:num_datapoints]:
                    names_i.append((path[i], index))
                names_image[current_datapoints_image:current_datapoints_image + num_datapoints] = names_i
                current_datapoints_image += num_datapoints
            if current_datapoints_image >= self.max_datapoints:
                finish_image = True

            # Audio datapoints, same procedure over the temporal dimension
            matchmap_i_max, _ = matchmap_i.max(1)
            matchmap_i_max, _ = matchmap_i_max.max(0)
            structure = np.ones(3, dtype=int)
            labeled, ncomponents = label(
                matchmap_i_max.cpu() > 0.5 * matchmap_i_max.max().cpu(), structure)
            indexes = np.zeros(ncomponents, dtype=int)
            for n in range(ncomponents):
                indexes[n] = np.array(np.where(labeled == n + 1)).mean().round().astype(int)
            num_datapoints = len(indexes)
            if current_datapoints_audio + num_datapoints > self.max_datapoints:
                num_datapoints = self.max_datapoints - current_datapoints_audio
            if num_datapoints > 0:
                datapoints_i = audio_output[i][..., indexes[:num_datapoints]]. \
                    view(audio_output.shape[1], num_datapoints)
                datapoints_audio[current_datapoints_audio:current_datapoints_audio + num_datapoints] = \
                    datapoints_i.transpose(1, 0).cpu().numpy()
                names_i = []
                for index in indexes[:num_datapoints]:
                    names_i.append((path[i], index))
                names_audio[current_datapoints_audio:current_datapoints_audio + num_datapoints] = names_i
                current_datapoints_audio += num_datapoints
            if current_datapoints_audio >= self.max_datapoints:
                finish_audio = True
            if finish_image and finish_audio:
                finish = True
    if current_datapoints_image < self.max_datapoints:
        datapoints_image = datapoints_image[:current_datapoints_image]
    if current_datapoints_audio < self.max_datapoints:
        datapoints_audio = datapoints_audio[:current_datapoints_audio]
    if current_datapoints_mul < self.max_datapoints:
        datapoints_mul = datapoints_mul[:current_datapoints_mul]
    self.datapoints_audio = datapoints_audio
    self.datapoints_image = datapoints_image
    self.datapoints_mul = datapoints_mul
    self.names_im = names_image
    self.names = names_audio
    return datapoints_image, names_image

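
# get_datapoints picks one representative index per connected blob of high
# matchmap activation. A self-contained example of the same scipy.ndimage.label
# pattern on a toy 1-D activation (the values are made up):
def _component_centers_example():
    act = np.array([0.1, 0.9, 1.0, 0.8, 0.1, 0.1, 0.7, 0.75, 0.1])
    mask = act > 0.5 * act.max()  # two blobs survive the threshold
    labeled, ncomponents = label(mask, structure=np.ones(3, dtype=int))
    centers = [int(np.array(np.where(labeled == n + 1)).mean().round())
               for n in range(ncomponents)]
    return ncomponents, centers  # (2, [2, 6]): one center index per blob
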
def repeated_attributes(trainer):
    """
    Here we check if the model is able to determine which images contain a
    specific attribute mentioned in the audio. It is the experiment reported
    in Table 1 in the paper.
    For each attribute, find 500 images with the attribute and 500 without
    (with the segmenter). We use the same list of images (for each attribute)
    for all the compared checkpoints. The audios with the repeated attributes
    are also always the same.
    """
    if not os.path.isdir(os.path.join(trainer.args.path_repeated_attributes, 'repetition_audios')):
        path_tar = os.path.join(trainer.args.path_repeated_attributes, 'repeated_attributes.tar.gz')
        wget.download('http://wednesday.csail.mit.edu/gaze/ganclevr/files/repetition_audios.tar.gz',
                      out=path_tar)
        tf = tarfile.open(path_tar)
        tf.extractall(trainer.args.path_repeated_attributes)
        os.remove(path_tar)

    num_elements_each = 500
    path_paths = os.path.join(trainer.args.results, 'repeated_attributes',
                              f'paths_{trainer.args.name_dataset}.pkl')
    list_attributes = ['RUBBER', 'METAL', 'CUBE', 'SPHERE', 'CYLINDER', 'LARGE', 'SMALL',
                       'GRAY', 'RED', 'BLUE', 'GREEN', 'BROWN', 'PURPLE', 'CYAN', 'YELLOW']

    # First step: get paths of images to test for the specific dataset
    print('Obtaining samples to compare')
    if not os.path.isfile(path_paths):
        j = 0
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        segment = segmenter.GroundTruthSegmenter(trainer.args.path_model_segmenter)
        counter_no_attribute = {word_attribute: 0 for word_attribute in list_attributes}
        counter_attribute = {word_attribute: 0 for word_attribute in list_attributes}
        # The key is the path, and the values are the attributes. This way, if a
        # path (image) contains more than one attribute, we can share the
        # forward pass (much faster)
        paths_attributes = {}
        while ((np.array(list(counter_no_attribute.values())) < num_elements_each).any() or
               (np.array(list(counter_attribute.values())) < num_elements_each).any()) and \
                j < len(trainer.loaders['test'].dataset):
            p = trainer.loaders['test'].dataset.paths[j]
            j += 1
            raw_image = trainer.loaders['test'].dataset.load_image_raw(path=f'{p}')
            L = segment.get_pred(normalize(torch.tensor(raw_image).cuda().permute(2, 0, 1).float() / 255),
                                 return_L=True)
            # Decode the packed label image: L = (B << 16) | (G << 8) | R
            B = L >> 16
            G = (L - (B << 16)) >> 8
            R = L - (B << 16) - (G << 8)
            pred_size = B >> 4
            pred_shape = G >> 4
            pred_material = G - (pred_shape << 4)
            pred_color = R
            segmentation_keys = {'CUBE': [pred_shape, 1], 'SPHERE': [pred_shape, 2],
                                 'CYLINDER': [pred_shape, 3],
                                 'RUBBER': [pred_material, 1], 'METAL': [pred_material, 2],
                                 'LARGE': [pred_size, 1], 'SMALL': [pred_size, 2],
                                 'GRAY': [pred_color, 1], 'RED': [pred_color, 2],
                                 'BLUE': [pred_color, 3], 'GREEN': [pred_color, 4],
                                 'BROWN': [pred_color, 5], 'PURPLE': [pred_color, 6],
                                 'CYAN': [pred_color, 7], 'YELLOW': [pred_color, 8]}
            exists = {}
            for word_attribute in list_attributes:
                no_size = word_attribute not in ['LARGE', 'SMALL']
                prob_exists = (segmentation_keys[word_attribute][0] ==
                               segmentation_keys[word_attribute][1]).sum()
                if prob_exists > 100 and prob_exists < (700 if no_size else 20000):  # otherwise it can be noise
                    exists[word_attribute] = 1
                elif prob_exists < 10:  # to make sure it is not there (10 pixels is almost nothing)
                    exists[word_attribute] = -1
                else:
                    exists[word_attribute] = 0
                # Check if the attribute is in the image
                if exists[word_attribute] == 1 and counter_attribute[word_attribute] < num_elements_each:
                    counter_attribute[word_attribute] += 1
                    if p in paths_attributes:
                        paths_attributes[p].append([word_attribute, exists[word_attribute]])
                    else:
                        paths_attributes[p] = [[word_attribute, exists[word_attribute]]]
                elif exists[word_attribute] == -1 and counter_no_attribute[word_attribute] < num_elements_each:
                    counter_no_attribute[word_attribute] += 1
                    if p in paths_attributes:
                        paths_attributes[p].append([word_attribute, exists[word_attribute]])
                    else:
                        paths_attributes[p] = [[word_attribute, exists[word_attribute]]]
        os.makedirs(os.path.join(trainer.args.results, 'repeated_attributes'), exist_ok=True)
        with open(path_paths, 'wb') as f:
            pickle.dump(paths_attributes, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(path_paths, 'rb') as f:
            paths_attributes = pickle.load(f)

    # Second step: compute matching values for each image and audio in the list
    print('Computing matching values')
    synthetic = 'synth' in trainer.args.name_dataset
    results_checkpoint = {word_attribute: [] for word_attribute in list_attributes}

    # Load the audio of all attributes and store it in a dict
    audio_features = {}
    for word_attribute in list_attributes:
        # Load the audio of the word
        path_audio = os.path.join(trainer.args.path_repeated_attributes, 'repetition_audios',
                                  f'{word_attribute}_{"synthetic" if synthetic else "amt"}.wav')
        audio, nframes = trainer.loaders['test'].dataset.load_mel_spectrogram(path='', path_audio=path_audio)
        audio = audio.unsqueeze(0).unsqueeze(0).cuda()
        with torch.no_grad():
            audio_feat = trainer.model._modules['module'].model_audio.audio_model(audio)
        audio_features[word_attribute] = audio_feat

    # Load images and compute matchmaps
    for path, attributes in paths_attributes.items():
        with torch.no_grad():
            image = trainer.loaders['test'].dataset.load_image(path=path).unsqueeze(0).cuda()
            image_output = trainer.model._modules['module'].model_image.image_model(image)
            for word_attribute, ex in attributes:
                matchmap = utils.compute_matchmap(image_output[0],
                                                  audio_features[word_attribute][0])  # all frames
                matchmap_max_h, _ = matchmap.max(0)
                matchmap_max_hw, _ = matchmap_max_h.max(0)
                matchmap_max_hw = matchmap_max_hw[4:-4]  # cut beginning and end
                value1 = matchmap_max_hw.mean()
                value2, _ = matchmap_max_hw.max(0)
                results_checkpoint[word_attribute].append(
                    [ex == 1, value1.cpu().numpy(), value2.cpu().numpy()])

    # Third step: compute the final experiment value
    print('Computing final experiment value')
    diff_ = 0
    n_pairs_total = 0
    shape = [0, 0]
    color = [0, 0]
    size = [0, 0]
    material = [0, 0]
    for attribute, values in results_checkpoint.items():
        a = np.array(values)
        pos = a[np.where(a[:, 0] == 1)][:, 1]
        neg = a[np.where(a[:, 0] == 0)][:, 1]
        n_pairs = np.minimum(pos.shape[0], neg.shape[0])
        diff = (pos[:n_pairs] > neg[:n_pairs]).sum()
        n_pairs_total += n_pairs
        diff_ += diff
        if attribute in ['CUBE', 'SPHERE', 'CYLINDER']:
            shape[0] += diff
            shape[1] += n_pairs
        elif attribute in ['RUBBER', 'METAL']:
            material[0] += diff
            material[1] += n_pairs
        elif attribute in ['LARGE', 'SMALL']:
            size[0] += diff
            size[1] += n_pairs
        elif attribute in ['GRAY', 'RED', 'BLUE', 'GREEN', 'BROWN', 'CYAN', 'PURPLE', 'YELLOW']:
            color[0] += diff
            color[1] += n_pairs
    print('')
    print(f'Color: {color[0]/color[1]:0.03f}')
    print(f'Material: {material[0]/material[1]:0.03f}')
    print(f'Size: {size[0]/size[1]:0.03f}')
    print(f'Shape: {shape[0]/shape[1]:0.03f}')
    print(f'Mean: {(shape[0]/shape[1] + color[0]/color[1] + size[0]/size[1] + material[0]/material[1])/4:0.03f}')

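
# The segmenter used in repeated_attributes returns a single packed label
# image L with size/shape/material/color in bit fields: L = (B << 16) |
# (G << 8) | R. A worked example of the decoding above, for one pixel whose
# fields correspond to SMALL / CUBE / METAL / BLUE in the segmentation_keys
# table:
def _decode_packed_label_example():
    L = (0x21 << 16) | (0x12 << 8) | 3
    B = L >> 16                            # 0x21
    G = (L - (B << 16)) >> 8               # 0x12
    R = L - (B << 16) - (G << 8)           # 3
    pred_size = B >> 4                     # 2 -> SMALL (high nibble of B)
    pred_shape = G >> 4                    # 1 -> CUBE
    pred_material = G - (pred_shape << 4)  # 2 -> METAL
    pred_color = R                         # 3 -> BLUE
    return pred_size, pred_shape, pred_material, pred_color
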
def test_recall(trainer):
    """
    This experiment computes one positive and num_fakes corresponding negatives
    with the GAN, and reports the recall of the system in distinguishing the
    positive from the negatives. Similar to test_recall_selected, but the
    negatives are selected online instead of using the pre-selected (and
    better) negatives.
    """
    number_recall = 200
    num_fakes = 9
    if not trainer.args.use_cpu:
        torch.cuda.synchronize()
    recall1_meter = utils.AverageMeter()
    recall5_meter = utils.AverageMeter()

    # Switch to evaluate mode
    trainer.model.eval()

    with torch.no_grad():
        for i, (image_input, audio_input, negatives, nframes, path, _) in enumerate(trainer.loaders['test']):
            if i % 50 == 0:
                print(f'Starting batch {i}')
            if i * image_input.size(0) > number_recall:
                break
            for j in range(image_input.size(0)):
                score_vector = torch.FloatTensor(num_fakes + 1)
                if not trainer.args.loading_image:
                    v_init = trainer.z[0]
                    z_img = torch.FloatTensor(1, v_init.shape[0])
                    z_img[0, :] = trainer.z[int(path[j])]
                    image_input = trainer.generator.generate_images(z_img, intervention=None)
                    image_input = utils.transform(image_input)
                else:
                    image_input = image_input.cuda()
                pos_image = image_input[0, :, :, :]
                # Note: image_output[0]/audio_output[0] are used below, which
                # assumes a test batch size of 1
                model_output = trainer.model(image_input, audio_input, [])
                image_output = model_output[0]
                audio_output = model_output[1]
                nF = nframes[j]
                matchmap = utils.compute_matchmap(image_output[0], audio_output[0][:, :, :nF])
                real_score = utils.matchmap_sim(matchmap)
                score_vector[0] = real_score
                matchmap = matchmap.data.cpu().numpy().copy()
                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / matchmap.max()
                matchmap_image = matchmap.max(axis=0)
                threshold = 0.95
                ind_max = np.argmax(matchmap)
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]
                for fake_id in range(num_fakes):
                    binary_mask = matchmap_image > (threshold * matchmap_image.max())
                    binary_mask = utils.geodesic_dilation(binary_mask, (ind_h, ind_w))
                    norm = 0
                    threshold_random = 0.95
                    p = 0.4
                    while norm < threshold_random:
                        with torch.no_grad():
                            intervention = {}
                            for layer_n in trainer.layer_list_all:
                                layer_size = trainer.layers_dict[layer_n]['size']
                                layer_dim = trainer.layers_dict[layer_n]['depth']
                                ablation, replacement = trainer.get_ablation_replacement(
                                    params=[layer_dim, True, p], option='random')
                                ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
                                ablation_final = np.tile(ablation_final,
                                                         (layer_dim, 1, 1)).astype(np.float32)
                                ablation_final = torch.cuda.FloatTensor(ablation_final)
                                ablation_final = ablation.view(
                                    layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
                                intervention[layer_n] = (ablation_final, replacement)
                            z_img = trainer.z[int(path[j])]
                            z_img = z_img[np.newaxis, :].detach()
                            neg_img = trainer.generator.generate_images(
                                z_img, intervention=intervention).detach()
                            neg_img_t = utils.transform(neg_img).detach()
                            binary_mask = cv2.resize(binary_mask, (128, 128))
                            bmask = torch.Tensor(binary_mask).cuda()
                            bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                            norm = (neg_img_t[0, :, :, :] - pos_image[:, :, :].detach())
                            norm = norm * bmask
                            norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0)
                            norm_normalized = norm / torch.norm(
                                torch.norm(torch.norm(pos_image[:, :, :].detach() * bmask, dim=2), dim=1), dim=0)
                            norm = norm_normalized.item()
                            if random.random() > 0.2:
                                p = p + 0.05
                            else:
                                threshold_random = threshold_random - 0.01
                    model_output = trainer.model(neg_img_t, audio_input, [])
                    image_output = model_output[0]
                    audio_output = model_output[1]
                    score_vector[1 + fake_id] = utils.matchmap_sim(
                        utils.compute_matchmap(image_output[0], audio_output[0][:, :, :nF]))
                # Rank the positive among all num_fakes + 1 candidates
                _, ids = score_vector.topk(10)
                ids = ids.cpu().numpy()
                ids = np.where(ids == 0)[0]
                A_foundind = ids[0]
                if A_foundind == 0:
                    recall1_meter.update(1)
                else:
                    recall1_meter.update(0)
                if A_foundind < 5:
                    recall5_meter.update(1)
                else:
                    recall5_meter.update(0)
    print(f'Recall 1: {recall1_meter.avg}')
    print(f'Recall 5: {recall5_meter.avg}')
    return recall1_meter.avg

def create_videos(trainer):
    """
    Create videos for visualization. A video is generated for each sample, so
    cancel when you have enough videos. For this experiment, you need to have
    the images downloaded.
    """
    if not trainer.args.use_cpu:
        torch.cuda.synchronize()

    # Switch to evaluate mode
    trainer.model.eval()

    len_audio = 20.48  # Only if target_spec_length = 2048
    folder_name = os.path.join(trainer.args.results, 'results_video', trainer.args.name_checkpoint)
    os.makedirs(folder_name, exist_ok=True)
    with torch.no_grad():
        for i, (image_input, audio_input, negatives, nframes, path, _) in enumerate(trainer.loaders['test']):
            v_init = trainer.z[int(path[0])]
            z_img = torch.FloatTensor(audio_input.size(0), v_init.shape[0])
            for k in range(audio_input.size(0)):
                z_img[k, :] = trainer.z[int(path[k])]
            if not trainer.args.loading_image:
                image_input = trainer.generator.generate_images(z_img, intervention=None)
                image_input = utils.transform(image_input).detach()

            # Compute output
            model_output = trainer.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]
            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes.div_(pooling_ratio)
            fps = audio_output.size(3) / len_audio
            for bs in range(image_output.size(0)):
                try:
                    target_writer = imageio.get_writer(
                        folder_name + f'/output_video_{path[bs]}.mp4', fps=fps)
                    matchmap = utils.compute_matchmap(
                        image_output[bs],
                        audio_output[bs][:, :, 0:nframes[bs]]).data.cpu().numpy().copy()
                    wav = trainer.loaders['test'].dataset.load_audio_raw(path=path[bs])
                    # Note: this writes WAV data under an .mp3 extension;
                    # ffmpeg reads it by content, not by extension
                    scipy.io.wavfile.write(folder_name + f'/output_audio_{path[bs]}.mp3',
                                           44100, wav.astype(np.int16))
                    matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                    matchmap = matchmap / matchmap.sum()
                    matchmap_l, matchmap_h, matchmap_w = matchmap.shape
                    # Find the lowest threshold that keeps almost no mass
                    k_ranges = utils.frange(np.max(matchmap) / 100, np.max(matchmap),
                                            np.max(matchmap) / 100)
                    for k in k_ranges:
                        binary_mask = matchmap > k
                        map_temp = np.multiply(matchmap, binary_mask)
                        if np.sum(map_temp) < 0.1:
                            break
                    smoothing_factor = 1
                    struct_element = [[[True]]] * smoothing_factor
                    binary_mask = morph.binary_dilation(binary_mask, struct_element)  # Temporal smoothing
                    matchmap = np.multiply(matchmap, binary_mask)
                    matchmap = (matchmap - np.min(matchmap)) / (np.max(matchmap) - np.min(matchmap))
                    image = trainer.loaders['test'].dataset.load_image_raw(path=path[bs])
                    for t in range(matchmap_l):
                        mask_resize = np.array(
                            [cv2.resize(binary_mask[t, :, :].astype(float),
                                        (image.shape[1], image.shape[0]))] * 3).transpose(1, 2, 0)
                        map_t = cv2.resize(matchmap[t, :, :], (image.shape[1], image.shape[0]))
                        map_t = 1 - map_t
                        map_t = (255 * map_t).astype(np.uint8)
                        map_t = cv2.applyColorMap(map_t, cv2.COLORMAP_JET)
                        im_final = np.multiply((0.3 * image + 0.7 * map_t), mask_resize) + \
                            np.multiply(image, 1 - mask_resize)
                        target_writer.append_data(im_final)
                    target_writer.close()
                    # -y means overwrite
                    os.system('ffmpeg -y -i ' + folder_name + f'/output_video_{path[bs]}.mp4 -i ' +
                              folder_name + f'/output_audio_{path[bs]}.mp3 -vf scale=1200:1200 '
                              '-shortest -strict -2 -c:v libx264 ' +
                              folder_name + f'/video_{path[bs]}.mp4')
                except KeyboardInterrupt:
                    print('You decided to finish!')
                finally:
                    # Remove temporary files
                    try:
                        os.remove(folder_name + f'/output_video_{path[bs]}.mp4')
                    except OSError:
                        pass
                    try:
                        os.remove(folder_name + f'/output_audio_{path[bs]}.mp3')
                    except OSError:
                        pass
    return False

def generate_active_learning(trainer):
    """
    Generate active learning samples, selecting the positive/negative pairs in
    which the model has the highest error.
    The saved information is:
    - Current clusters. They are necessary to generate the same negative (as they select the mask)
    - jpg negative images. Only needed to get the captions. Not needed for training (as they will be GAN-generated)
    - Information to generate the negative images (masks and units, and the associated paths)
    This information is saved in {args.active_learning_path}/{trainer.args.name_checkpoint}_{str(time.time())}/
    When training with these images, it is recommended to start from the same checkpoint used for obtaining them.
    After this, the next steps before running the system again with the new samples are:
    - Collecting captions of the active learning samples (the negatives; the positives already have captions)
    - Adding the new collected samples to the dataset. They can be added in a separate folder; modifying only the
      name_list_{}.txt files is enough. Note that the noise ID (and thus the name of the file) will already exist
      (for the positive one), so save the new ones in an "active" subfolder.
    """
    assert len(trainer.layer_list_all) == 1, \
        'Active learning is only implemented for single-layer ablations'
    trainer.clusterer.save_results = True
    clus, mean_clust, std_clust, _ = trainer.clusterer.create_clusters(iteration=0)
    trainer.clusters = torch.FloatTensor(clus).cuda()
    trainer.mean_clust = torch.FloatTensor(mean_clust)
    trainer.std_clust = torch.FloatTensor(std_clust)
    trainer.cluster_counts = 1 / trainer.clusters.max(1)[0]
    trainer.clusters_unit = trainer.cluster_counts.view(trainer.clusters.size(0), 1).expand_as(trainer.clusters) * \
        trainer.clusters
    trainer.clusterer.name_with_images_clusters()
    trainer.clusterer.name_clusters()
    trainer.optimize_neurons()

    if not trainer.args.use_cpu:
        torch.cuda.synchronize()
    data_time = utils.AverageMeter()

    # Switch to eval mode
    trainer.model.eval()

    active_learning_name = os.path.join(
        trainer.args.active_learning_path,
        f'{trainer.args.name_checkpoint}_{str(time.time())}')

    end = time.time()
    all_loss = []
    all_hmap = []
    all_hmap_eval = []
    all_units = []
    all_paths = []
    for batch_id, (image_input, audio_input, neg_images, nframes, path, image_raw) in \
            enumerate(trainer.loaders['train']):
        print(batch_id)
        # Measure data loading time
        data_time.update(time.time() - end)
        if not trainer.args.use_cpu:
            audio_input = audio_input.cuda(non_blocking=True)
        if not trainer.args.loading_image:
            if trainer.args.active_learning:
                path_ints = [p.split('/')[-1] for p in path]
            else:
                path_ints = path
            v_init = trainer.z[int(path_ints[0])]
            z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0])
            for k in range(image_input.size(0)):
                z_img[k, :] = trainer.z[int(path_ints[k])]
            image_input = trainer.generator.generate_images(z_img, intervention=None)
            image_input = utils.transform(image_input).detach()
        else:
            image_input = image_input.cuda()
            neg_images = neg_images.cuda()
        model_output = trainer.model(image_input, audio_input, [])
        image_output = model_output[0]
        audio_output = model_output[1]
        neg_images = []
        pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
        nframes.div_(pooling_ratio)
        binary_mask_0 = None
        if trainer.loss_type == 'negatives_edited' or trainer.loss_type == 'negatives_both':
            limits = np.zeros((image_input.size(0), 2))
            for i in range(image_input.size(0)):
                pos_image = image_input[i, :, :, :]
                nF = nframes[i]
                matchmap = utils.compute_matchmap(image_output[i], audio_output[i][:, :, :nF])
                positive_score = utils.matchmap_sim(matchmap).detach()
                matchmap = matchmap.data.cpu().numpy().copy()
                matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                matchmap = matchmap / (matchmap.max() + 1e-10)
                matchmap_image = matchmap.max(axis=0)
                threshold = 0.95
                ind_max = np.argmax(matchmap)
                ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) // matchmap.shape[1]
                ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])) % matchmap.shape[1]
                limits[i, 0] = ind_t
                limits[i, 1] = ind_t + 1
                v = (image_output[i][:, ind_h, ind_w] - trainer.mean_clust.cuda()) / \
                    (trainer.std_clust.cuda() + 1e-8)
                normalized_clusters = np.matmul(trainer.clusters.cpu(),
                                                v.detach().cpu().numpy().transpose())
                sorted_val = -np.sort(-normalized_clusters[:])
                sorted_val = np.clip(sorted_val, 0, 4)
                prob_samples = sorted_val / np.sum(sorted_val)
                sorted_id = np.argsort(-normalized_clusters[:])
                cluster_id = sorted_id[0]
                norm = 0
                threshold_random = 0.95
                # The number of units to be ablated grows if we cannot generate
                # a good (changed) negative. The following numbers are the
                # starting number of units to change.
                num_units_dict = {'layer2': 30, 'layer3': 30, 'layer4': 140,
                                  'layer5': 30, 'layer6': 30}
                threshold_heatmap = threshold
                count = 0
                binary_mask_eval = matchmap_image > (threshold_heatmap * matchmap_image.max())
                binary_mask_eval = utils.geodesic_dilation(binary_mask_eval, (ind_h, ind_w))
                binary_mask_eval = cv2.resize(binary_mask_eval, (128, 128))
                all_hmap_eval.append(binary_mask_eval)
                bmask = torch.Tensor(binary_mask_eval).cuda()
                bmask = bmask.view(1, 128, 128).expand(3, 128, 128)
                all_paths.append(path_ints[i])
                while norm < threshold_random:
                    with torch.no_grad():
                        binary_mask = matchmap_image > (threshold_heatmap * matchmap_image.max())
                        binary_mask = utils.geodesic_dilation(binary_mask, (ind_h, ind_w))
                        if binary_mask_0 is None:
                            binary_mask_0 = cv2.resize(binary_mask, (224, 224))
                        z_img = trainer.z[int(path_ints[i])]
                        z_img = z_img[np.newaxis, :]
                        _ = trainer.generator.generate_images(z_img)
                        intervention = {}
                        for layer_n in trainer.layer_list_all:  # This will only be one layer
                            units_ids = trainer.layers_units[layer_n][cluster_id][:num_units_dict[layer_n]]
                            layer_size = trainer.layers_dict[layer_n]['size']
                            layer_dim = trainer.layers_dict[layer_n]['depth']
                            ablation, replacement = trainer.get_ablation_replacement(
                                params=[layer_dim, units_ids], option='specific')
                            ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
                            ablation_final = np.tile(ablation_final,
                                                     (layer_dim, 1, 1)).astype(np.float32)
                            ablation_final = torch.cuda.FloatTensor(ablation_final)
                            ablation_final = ablation.view(
                                layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
                            intervention[layer_n] = (ablation_final, replacement)
                        neg_img = trainer.generator.generate_images(
                            z_img, intervention=intervention).detach()
                        neg_img_t = utils.transform(neg_img).detach()
                        binary_mask = cv2.resize(binary_mask, (128, 128))
                        norm = (neg_img_t[0, :, :, :] - pos_image.detach())
                        norm = norm * bmask
                        norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0)
                        norm_normalized = norm / torch.norm(
                            torch.norm(torch.norm(pos_image.detach() * bmask, dim=2), dim=1), dim=0)
                        norm = norm_normalized.item()
                        for layer_n in trainer.layer_list_all:
                            # Increase the number of units to change
                            num_units_dict[layer_n] = num_units_dict[layer_n] + 40
                        threshold_heatmap = threshold_heatmap - 0.1
                        threshold_random = threshold_random - 0.05
                        cluster_id = np.random.choice(sorted_id, size=1, p=prob_samples)[0]
                        count = count + 1
                neg_images.append(neg_img.detach())
                all_hmap.append(binary_mask)
                all_units.append(units_ids)
            neg_images = torch.cat(neg_images)
            neg_images_t = utils.transform(neg_images)
            image_output_neg, _, _ = trainer.model(neg_images_t, None, [])
            loss_list = trainer.vectorized_negatives_loss(
                image_output, audio_output, image_output_neg, nframes)
            loss_list = [x.detach() for x in loss_list]
            all_loss.extend(loss_list)

    # Keep the samples with the highest loss
    all_loss = [x.view(1, -1) for x in all_loss]
    all_loss = torch.cat(all_loss).view(-1, 1)
    _, ind = all_loss.topk(3000, 0)
    ind = [x.item() for x in ind]
    a_units = [all_units[i] for i in ind]
    a_paths = [all_paths[i] for i in ind]
    a_hmaps = [all_hmap[i] for i in ind]
    a_hmaps_eval = [all_hmap_eval[i] for i in ind]
    # Create the output directory before saving (the original saved without
    # creating the base directory first)
    os.makedirs(active_learning_name, exist_ok=True)
    torch.save(a_units, os.path.join(active_learning_name, 'units.pth'))
    torch.save(a_paths, os.path.join(active_learning_name, 'a_paths.pth'))
    torch.save(a_hmaps, os.path.join(active_learning_name, 'a_hmaps.pth'))
    torch.save(a_hmaps_eval, os.path.join(active_learning_name, 'a_hmaps_eval.pth'))
    os.makedirs(os.path.join(active_learning_name, 'images'), exist_ok=True)
    os.makedirs(os.path.join(active_learning_name, 'hm'), exist_ok=True)
    for j in range(len(a_units)):
        path = a_paths[j]
        units_ids = a_units[j]
        binary_mask = a_hmaps[j]
        layer_n = trainer.layer_list_all[0]
        layer_size = trainer.layers_dict[layer_n]['size']
        layer_dim = trainer.layers_dict[layer_n]['depth']
        intervention = {}
        ablation, replacement = trainer.get_ablation_replacement(
            params=[layer_dim, units_ids], option='specific')
        ablation_final = cv2.resize(binary_mask, (layer_size, layer_size))
        ablation_final = np.tile(ablation_final, (layer_dim, 1, 1)).astype(np.float32)
        ablation_final = torch.cuda.FloatTensor(ablation_final)
        ablation_final = ablation.view(layer_dim, 1, 1).expand_as(ablation_final) * ablation_final
        intervention[layer_n] = (ablation_final, replacement)
        z_img = trainer.z[int(path)]
        z_img = z_img[np.newaxis, :]
        neg_img = trainer.generator.generate_images(z_img, intervention=intervention).detach()
        neg_im = neg_img[0, :, :, :].cpu().numpy().transpose(1, 2, 0).astype(np.uint8)
        neg_im = Image.fromarray(neg_im, 'RGB')
        draw = ImageDraw.Draw(neg_im)
        # Draw a red bounding box around the edited region
        hm = a_hmaps_eval[j]
        rows = np.any(hm, axis=1)
        cols = np.any(hm, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        draw.rectangle(((cmin, rmin), (cmax, rmax)), outline='red')
        neg_im.save(os.path.join(active_learning_name, 'images', f'{j}_hn.jpg'))
        binary_mask_eval = cv2.resize(a_hmaps_eval[j], (128, 128))
        mask_im = (binary_mask_eval * 255).astype(np.uint8).reshape((128, 128, 1))
        mask_im = np.concatenate((mask_im, mask_im, mask_im), axis=2)
        mask_im = Image.fromarray(mask_im, 'RGB')
        mask_im.save(os.path.join(active_learning_name, 'hm', f'{j}_hn.jpg'))

def name_with_images_clusters(self):
    """
    Find representatives for the clusters, now with images. Simply for
    visualization purposes.
    :return: list of dicts. For each cluster, num_images_per_cluster
             key-value pairs in the dict {path: mask}
    """
    if not self.load['name_final']:  # We use the same flag
        assert self.centroids is not None
        assert self.names_im is not None
        assert self.datapoints_image is not None
        datapoints = self.datapoints_image
        values = np.matmul(self.centroids.astype(float), datapoints.transpose(1, 0))
        num_images_per_cluster = 100
        return_vector = []
        audio_output = torch.FloatTensor(self.centroids).cuda()
        audio_output = audio_output.transpose(1, 0).view(
            self.centroids.shape[1], 1, self.centroids.shape[0])
        with torch.no_grad():
            for c in range(self.num_clusters):
                dict_c = {}
                max_images_indexes = np.argsort(-values[c])
                for i in range(num_images_per_cluster):
                    count = 0
                    im_index = max_images_indexes[i]
                    path, index = self.names_im[im_index]
                    # Skip images already used as representatives for this cluster
                    while path in dict_c:
                        count = count + 1
                        im_index = max_images_indexes[i + count]
                        path, index = self.names_im[im_index]
                    path_ints = path.split('/')[-1]  # in case the audio is inside a subfolder
                    v_init = self.z[int(path_ints)]
                    z_img = torch.FloatTensor(1, v_init.shape[0])
                    z_img[0, :] = v_init
                    image_input = self.generator.generate_images(z_img, intervention=None)
                    image_input = utils.transform(image_input)
                    model_output = self.model(image_input, None, [])
                    image_output = model_output[0]
                    mask = utils.compute_matchmap(image_output[0],
                                                  audio_output).cpu().numpy()[:, :, c]
                    # Lower the threshold until the mask covers at least 20% of the map
                    th = 0.64
                    binary_mask = mask > mask.max() * th
                    per = binary_mask.astype(float).sum() / 64.0
                    while per < 0.2:
                        binary_mask = mask > mask.max() * th
                        per = binary_mask.astype(float).sum() / 64.0
                        th = th - 0.02
                    dict_c[path] = binary_mask
                return_vector.append(dict_c)
        if self.save_results:
            torch.save(return_vector, os.path.join(self.path_store, 'names_images.pth.tar'))
    else:
        return_vector = torch.load(os.path.join(self.path_store, 'names_images.pth.tar'))
    self.names_images = return_vector
    return return_vector