def __init__(self, layer, param_name="weight", dim=0, power=2):
    """Wrap ``layer`` and split one of its parameters into ``_v`` and ``_g``.

    The attribute ``param_name`` is deleted from ``layer``. Two parameters
    are registered on this wrapper instead: ``<param_name>_v`` (filled with
    the original value) and ``<param_name>_g`` (filled with the result of
    ``norm_except`` on ``_v``). Finally ``param_name`` is set on the layer
    again, this time to the value returned by ``compute_weight``.

    Args:
        layer: object that currently owns the attribute ``param_name``.
        param_name: name of the attribute to reparameterize.
        dim: forwarded to ``norm_except`` and ``compute_weight``.
        power: forwarded to ``norm_except`` and ``compute_weight``.
    """
    super(WeightNormWrapper, self).__init__()
    self.param_name = param_name
    self.dim = dim
    self.power = power
    self.layer = layer

    v_name = param_name + "_v"
    g_name = param_name + "_g"

    # This runs once at construction time, so the copy is done with framework
    # ops; numpy would work as well.
    source_weight = getattr(layer, param_name)
    v_param = self.create_parameter(
        shape=source_weight.shape, dtype=source_weight.dtype)
    self.add_parameter(v_name, v_param)
    with dg.no_grad():
        F.assign(source_weight, getattr(self, v_name))
    delattr(layer, param_name)

    initial_g = norm_except(getattr(self, v_name), self.dim, self.power)
    g_param = self.create_parameter(
        shape=initial_g.shape, dtype=initial_g.dtype)
    self.add_parameter(g_name, g_param)
    with dg.no_grad():
        F.assign(initial_g, getattr(self, g_name))

    # Give the wrapped layer a usable weight right away.
    recomposed = compute_weight(
        getattr(self, v_name), getattr(self, g_name), self.dim, self.power)
    setattr(self.layer, self.param_name, recomposed)

    # NOTE(review): the attribute name is misspelled ("weigth"); it is kept
    # as-is because code outside this block may read it.
    self.weigth_norm_applied = True
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    """Run the model on one sentence and return the vocoder output as numpy.

    Args:
        args: object with a ``vocoder`` attribute; the value "griffin-lim"
            selects the numpy code path of ``vocoder``.
        config: mapping providing "decoder_layers", "backward_step" and
            "forward_step".
        model: callable invoked with the encoded text under ``no_grad``.
        vocoder: callable turning the refined model output into a waveform.
        sentence: text to synthesize.
        monotonic_layers: indices of decoder layers whose
            ``force_monotonic_attention`` flag is set to True.

    Returns:
        The waveform produced by ``vocoder`` as a numpy array.
    """
    print("[synthesize] {}".format(sentence))

    token_ids = en.text_to_sequence(sentence, p=1.0)
    token_batch = np.expand_dims(np.array(token_ids, dtype="int64"), 0)
    length_batch = np.array([token_batch.size], dtype=np.int64)
    text_seqs = dg.to_variable(token_batch)
    text_lengths = dg.to_variable(length_batch)

    # One flag per decoder layer; only the requested layers are switched on.
    monotonic_flags = [False] * config["decoder_layers"]
    for layer_idx in monotonic_layers:
        monotonic_flags[layer_idx] = True

    with dg.no_grad():
        decoded, refined, attentions = model(
            text_seqs,
            text_lengths,
            speakers=None,
            force_monotonic_attention=monotonic_flags,
            window=(config["backward_step"], config["forward_step"]))

        if args.vocoder == "griffin-lim":
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            waveform = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = waveform.numpy()[0]
    return wav_np
def make_animation(source_image, driving_video, generator, kp_detector,
                   relative=True, adapt_movement_scale=True):
    """Generate one predicted frame per frame of ``driving_video``.

    Args:
        source_image: array that is given a leading axis and transposed with
            axes (0, 3, 1, 2) before being fed to the networks.
        driving_video: sequence of frames; stacked, given a leading axis and
            transposed with axes (0, 4, 1, 2, 3).
        generator: callable returning a dict with a 'prediction' entry.
        kp_detector: callable producing keypoints for a frame.
        relative: passed to ``normalize_kp`` as both ``use_relative_movement``
            and ``use_relative_jacobian``.
        adapt_movement_scale: passed through to ``normalize_kp``.

    Returns:
        List with one numpy array per driving frame (the first batch item of
        each prediction, transposed with axes [0, 2, 3, 1]).
    """
    frames = []
    with dygraph.no_grad():
        source_np = np.transpose(source_image[np.newaxis], (0, 3, 1, 2))
        source = dygraph.to_variable(source_np.astype(np.float32))

        driving_np = np.transpose(
            np.array(driving_video)[np.newaxis], (0, 4, 1, 2, 3))
        driving = dygraph.to_variable(driving_np.astype(np.float32))

        kp_source = kp_detector(source)
        # Keypoints of the first driving frame serve as the reference.
        kp_driving_initial = kp_detector(driving[:, :, 0])

        for frame_idx in tqdm(range(driving.shape[2])):
            kp_driving = kp_detector(driving[:, :, frame_idx])
            kp_norm = normalize_kp(
                kp_source=kp_source,
                kp_driving=kp_driving,
                kp_driving_initial=kp_driving_initial,
                use_relative_movement=relative,
                use_relative_jacobian=relative,
                adapt_movement_scale=adapt_movement_scale)
            out = generator(source, kp_source=kp_source, kp_driving=kp_norm)
            prediction = np.transpose(out['prediction'].numpy(), [0, 2, 3, 1])
            frames.append(prediction[0])
    return frames
def __call__(self, oriImg):
    """Average the model heatmaps over several scales and post-process them.

    Args:
        oriImg: image array whose shape unpacks into (height, width, _).

    Returns:
        Whatever ``self.postprocessor`` returns for the averaged array of
        shape (22, height, width).
    """
    height, width, _ = oriImg.shape
    scale_search = [0.5, 1.0, 1.5, 2.0]
    boxsize = 368
    stride = 8
    pad_value = 128
    scales = [ratio * boxsize / height for ratio in scale_search]

    heatmap_avg = np.zeros((22, height, width))
    for scale in scales:
        resized = cv2.resize(
            oriImg, (0, 0), fx=scale, fy=scale,
            interpolation=cv2.INTER_CUBIC)
        padded, pad = padRightDownCorner(resized, stride, pad_value)

        # Add a batch axis, move it first, and rescale pixel values.
        batch = np.transpose(
            np.float32(padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256. - 0.5
        batch = np.ascontiguousarray(batch)

        data = dg.to_variable(batch)
        with dg.no_grad():
            output = self.hand_model(data)[-1]

        heatmap = output.numpy()[0].transpose((1, 2, 0))  # [h, w, c]
        # Undo the network stride, drop the padding, go back to input size.
        heatmap = cv2.resize(
            heatmap, (0, 0), fx=stride, fy=stride,
            interpolation=cv2.INTER_CUBIC)
        heatmap = heatmap[:padded.shape[0] - pad[2],
                          :padded.shape[1] - pad[3], :]
        heatmap = cv2.resize(
            heatmap, (width, height), interpolation=cv2.INTER_CUBIC)
        heatmap = heatmap.transpose((2, 0, 1))  # [c, h, w]

        heatmap_avg += heatmap / len(scales)
    return self.postprocessor(heatmap_avg)
def apply(module: dg.Layer, name, dim):
    """Replace parameter ``name`` of ``module`` by ``name_g`` and ``name_v``.

    A ``WeightNorm`` hook is registered as forward pre-hook so that the
    attribute ``name`` is recomputed before every forward call.

    Args:
        module: layer owning the parameter.
        name: parameter name.
        dim: forwarded to ``norm_except_dim``; ``None`` is replaced by -1.

    Returns:
        The ``WeightNorm`` instance that was registered.

    Raises:
        RuntimeError: if a ``WeightNorm`` hook for ``name`` already exists.
    """
    for existing in module._forward_pre_hooks.values():
        if isinstance(existing, WeightNorm) and existing.name == name:
            raise RuntimeError("Cannot register two weight_norm hooks on "
                               "the same parameter {}".format(name))

    if dim is None:
        dim = -1
    hook = WeightNorm(name, dim)

    # Take the original weight out of the parameter list.
    weight = getattr(module, name)
    del module._parameters[name]

    # Register g and v as the new parameters; w is expressed as g/||v|| * v.
    g_init = norm_except_dim(weight, dim)
    v_param = module.create_parameter(weight.shape, dtype=weight.dtype)
    module.add_parameter(name + "_v", v_param)
    g_param = module.create_parameter(g_init.shape, dtype=g_init.dtype)
    module.add_parameter(name + "_g", g_param)
    with dg.no_grad():
        F.assign(weight, v_param)
        F.assign(g_init, g_param)
    setattr(module, name, hook.compute_weight(module))

    # The weight is recomputed before every forward().
    module.register_forward_pre_hook(hook)
    return hook
def get_inception_mean_cov(data_loader, key_real, key_fake, generator,
                           sample_size, preprocess, is_video=False,
                           few_shot_video=False):
    """Compute mean and covariance of the extracted activations.

    This function always computes the statistics; it does not read or write
    any cache file itself.

    Args:
        data_loader: forwarded to the activation extractor.
        key_real: forwarded to the activation extractor.
        key_fake: forwarded to the activation extractor.
        generator: forwarded to the activation extractor.
        sample_size: forwarded to the activation extractor.
        preprocess: forwarded to the activation extractor.
        is_video: use ``get_video_activations`` instead of
            ``get_activations``.
        few_shot_video: only forwarded on the video path.

    Returns:
        Tuple ``(m, s)`` with ``m = np.mean(y, axis=0)`` and
        ``s = np.cov(y, rowvar=False)`` of the activations ``y``.
    """
    print("Extract mean and covariance.")
    # Only statistics of the activations are needed, so gradient tracking is
    # disabled on both paths (previously only the video path was covered).
    with dg.no_grad():
        if is_video:
            y = get_video_activations(data_loader, key_real, key_fake,
                                      generator, sample_size, preprocess,
                                      few_shot_video)
        else:
            y = get_activations(data_loader, key_real, key_fake, generator,
                                sample_size, preprocess)
    m = np.mean(y, axis=0)
    s = np.cov(y, rowvar=False)
    return m, s
def evaluate(model, criterion, dataset, visualizer, output_dir, args):
    """Run ``model`` over ``dataset`` without gradients and collect metrics.

    Args:
        model: network; switched to eval mode before the loop.
        criterion: callable returning a dict of losses; its ``weight_dict``
            selects and weights the entries summed into the logged loss.
        dataset: iterable of ``(samples, targets)`` pairs.
        visualizer: object whose ``plot_results`` is called periodically.
        output_dir: unused here; kept for interface compatibility.
        args: passed to ``utils.MetricLogger``; ``args.batch_size`` divides
            the summed loss.

    Returns:
        Dict mapping each meter name to its ``global_avg``.
    """
    header = "Test"
    print_freq = 10
    visualize_freq = 100 * print_freq

    with dg.no_grad():
        model.eval()
        metric_logger = utils.MetricLogger(args, delimiter=" ")
        metric_logger.add_meter(
            "class_error",
            utils.SmoothedValue(window_size=1, fmt="{value:.2f}"))

        batches = metric_logger.log_every(dataset, print_freq, header)
        for batch_idx, (samples, targets) in enumerate(batches):
            outputs = model(samples)
            loss_dict = criterion(outputs, targets)
            weight_dict = criterion.weight_dict
            losses = sum(loss_dict[k] * weight_dict[k]
                         for k in loss_dict.keys() if k in weight_dict)
            losses = losses / args.batch_size

            metric_logger.update(loss=losses.numpy(), **loss_dict)
            metric_logger.update(class_error=loss_dict["class_error"])

            # Plot the first batch and then every ``visualize_freq`` batches.
            # The previous test ``visualize_freq % count == 0`` had its
            # operands swapped and fired on every divisor of visualize_freq.
            if batch_idx % visualize_freq == 0:
                visualizer.plot_results(samples, outputs, targets)

    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
def power_iteration(W, u_, update=True, eps=1e-12):
    """Run one power-iteration step for every vector in ``u_``.

    Args:
        W: matrix multiplied with the vectors.
        u_: sequence of vectors; overwritten in place when ``update`` is set.
        update: whether to write the new ``u`` back into ``u_[i]``.
        eps: passed to the normalization calls.

    Returns:
        Tuple ``(svs, us, vs)`` of lists with one entry per vector in ``u_``.
    """
    # NOTE(review): the call signatures below (l2_normalize(..., eps=...),
    # squeeze without axes, Variable.t()) mirror the torch original; confirm
    # against the fluid API in use.
    us, vs, svs = [], [], []
    for i, u in enumerate(u_):
        # The vector updates are done without gradient tracking.
        with dg.no_grad():
            v = fluid.layers.matmul(u, W)
            # Gram-Schmidt against the vectors found so far, then normalize.
            v = fluid.layers.l2_normalize(gram_schmidt(v, vs), eps=eps)
            vs.append(v)

            u = fluid.layers.matmul(v, W.t())
            u = fluid.layers.l2_normalize(gram_schmidt(u, us), eps=eps)
            us.append(u)

            if update:
                u_[i][:] = u

        # NOTE(review): the singular value is assumed to be computed outside
        # the no_grad scope, as in the torch original - confirm.
        projected = fluid.layers.matmul(fluid.layers.matmul(v, W.t()), u.t())
        svs.append(fluid.layers.squeeze(projected))
    return svs, us, vs
def gen_frames(self, data, use_model_average=False):
    """Run the generator over ``self.sequence_length`` time steps.

    Args:
        data: input passed to ``self.get_data_t`` at every step.
        use_model_average: accepted but not used by this method.

    Returns:
        Tuple ``(first_output, last_output, all_info)`` where ``all_info``
        holds the per-step 'inputs' and 'outputs' lists.
    """
    generator = self.net_G
    latest_output = None
    previous_data = None
    history = {'inputs': [], 'outputs': []}

    for t in range(self.sequence_length):
        # Data for the current time frame.
        frame_data = self.get_data_t(data, latest_output, previous_data, t)
        previous_data = frame_data

        with dg.no_grad():
            latest_output = generator(frame_data)

        frame_data, latest_output = self.post_process(
            frame_data, latest_output)

        if t == 0:
            # Kept separately so callers can visualize the first frame.
            first_output = latest_output

        history['inputs'].append(frame_data)
        history['outputs'].append(latest_output)

    return first_output, latest_output, history
def std_gen_interpolate(batch_size=8, seed=None, out_path='data/out',
                        levels=None, interpolate_mode=0):
    """Render an interpolation between random samples as GIF and MP4.

    ``batch_size`` random latent/class pairs are drawn; consecutive pairs
    (wrapping around to the first) are blended in 40 steps each. Only the
    inputs named in ``levels`` are blended; every other input stays at the
    value of the first sample.

    Args:
        batch_size: number of key samples.
        seed: if given, ``rds.rng`` is replaced by a ``RandomState(seed)``.
        out_path: output prefix; ``.gif`` and ``.mp4`` are appended. Frames
            are staged under ``/tmp/<out_path>.dir`` and removed afterwards.
        levels: ';'-separated names of the inputs to interpolate; defaults
            to all of them.
        interpolate_mode: 1 applies a smoothstep to the blend factor.

    Raises:
        KeyError: if ``levels`` contains a name that is not a known level.
    """
    import shlex
    import shutil

    default_levels = ("y;z0;z11;z12;z21;z22;z31;z32;z41;z42;z51;z52;z61;z62")
    if levels is None:
        levels = default_levels
    default_levels = default_levels.split(';')

    img_save_dir = os.path.join('/tmp', out_path + '.dir')
    shutil.rmtree(img_save_dir, ignore_errors=True)
    os.makedirs(img_save_dir, exist_ok=True)

    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False
        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random
        G = model_cache.G

        x_np = rds.rng.randn(batch_size, 140).astype('float32')
        y_np = rds.rng.randint(0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(x_np)
        y_cls = dg.to_variable(y_np)
        y_hot = layers.one_hot(layers.unsqueeze(y_cls, [1]), depth=1000)
        y_embed = G.embed_y(y_hot)

        # Append the first sample so the last segment wraps around.
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)

        levels = levels.split(';')

        # Explicit tables replace the former writes through locals(), which
        # are not supported inside functions (and are discarded on 3.13+).
        full_inputs = {}     # level name -> values for all key samples
        current_inputs = {}  # level name -> value fed to G for this frame
        for level in default_levels:
            if len(level) == 1:
                full_inputs[level] = y_embed
                current_inputs[level] = y_embed[:1]
            else:
                # The digit after 'z' selects a 20-wide slice of the latent.
                idx = int(level[1]) * 20
                full_inputs[level] = x[:, idx:idx + 20]
                current_inputs[level] = x[:1, idx:idx + 20]

        imgs = []
        for i in range(batch_size):
            for j in range(40):
                alpha = j / 40
                if interpolate_mode == 1:
                    alpha = alpha**2 * (3 - 2 * alpha)
                for level in levels:
                    values = full_inputs[level]
                    current_inputs[level] = (
                        (1 - alpha) * values[i:i + 1]
                        + alpha * values[i + 1:i + 2])
                inputs = [current_inputs[level]
                          for level in default_levels[1:]]
                img_pd = G(inputs, current_inputs[default_levels[0]], True)
                img = np.uint8(
                    img_pd.numpy().clip(0, 1) * 255)[0].transpose([1, 2, 0])
                imgs.append(Image.fromarray(img))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')

    for i, img in enumerate(imgs):
        img.save(os.path.join(img_save_dir, str(i).zfill(5) + '.png'))
    imgs[0].save(out_path + '.gif', save_all=True, append_images=imgs[1:],
                 duration=40, loop=0)

    out_path = out_path + '.mp4'
    # Paths are quoted so spaces or shell metacharacters cannot break the
    # command; os.system is kept so a missing ffmpeg stays non-fatal.
    frame_pattern = shlex.quote(f'{img_save_dir}/%05d.png')
    os.system(
        f'ffmpeg -r 40 -i {frame_pattern} -hide_banner -loglevel warning '
        f'-nostats -c:v libx264 -crf 23 -y {shlex.quote(out_path)}')
    shutil.rmtree(img_save_dir, ignore_errors=True)
def save_image(self, path, data):
    """Generate frames for ``data`` and write a visualization grid to ``path``.

    For sequences longer than one frame, each tile stacks the first
    generated frame on top of the last one.

    Args:
        path: output image path; its directory is created if missing.
        data: dict with 'ref_image', 'tgt_label' and 'tgt_image' entries,
            also passed to ``self.gen_frames``.
    """
    self.net_G.eval()
    self.net_G_output = None
    with dg.no_grad():
        first_output, last_output, _ = self.gen_frames(data)

    def collect_tiles(data, net_G_output, return_first_frame=True,
                      for_model_average=False):
        # Index 0 picks the first frame, -1 the last one.
        frame_idx = 0 if return_first_frame else -1
        tiles = [
            tensor2im(data['ref_image'][:, frame_idx]),
            self.visualize_label(data['tgt_label'][:, frame_idx]),
            tensor2im(data['tgt_image'][:, frame_idx]),
            tensor2im(net_G_output['fake_images']),
            tensor2im(net_G_output['fake_raw_images']),
        ]
        # Warped-image, flow-map and occlusion-mask tiles are disabled.
        return tiles

    first_tiles = collect_tiles(data, first_output)
    if self.sequence_length > 1:
        last_tiles = collect_tiles(
            data, last_output, return_first_frame=False)
        # Top row of each tile: first generated frame; bottom row: last.
        vis_images = []
        for imgs_first, imgs_last in zip(first_tiles, last_tiles):
            if imgs_first is None:
                continue
            stacked = [np.vstack((im_first, im_last))
                       for im_first, im_last in zip(imgs_first, imgs_last)]
            vis_images.append(stacked)
    else:
        vis_images = first_tiles

    columns = [np.vstack(im) for im in vis_images if im is not None]
    image_grid = np.hstack(columns)

    print("Save output images to {}".format(path))
    os.makedirs(os.path.dirname(path), exist_ok=True)
    imageio.imwrite(path, image_grid)
def remove(self, module):
    """Undo the reparameterization of ``self.name`` on ``module``.

    The current weight is computed, the ``_g`` and ``_v`` parameters are
    dropped, and a plain parameter named ``self.name`` holding that weight
    is registered again.
    """
    current_weight = self.compute_weight(module)

    delattr(module, self.name)
    for suffix in ('_g', '_v'):
        del module._parameters[self.name + suffix]

    restored = module.create_parameter(
        current_weight.shape, dtype=current_weight.dtype)
    module.add_parameter(self.name, restored)
    with dg.no_grad():
        F.assign(current_weight, restored)
def test_single(self, data, output_dir=None, inference_args=None,
                return_fake_image=True):
    """Run the generator on one sample and optionally save the result.

    Args:
        data: input dict; an optional 'img_name' entry names the output file.
        output_dir: where to write the image. When None nothing is saved and
            the raw generator output is returned.
        inference_args: optional object; a truthy ``save_fake_only``
            attribute selects the face-mask output path.
        return_fake_image: return only the image grid instead of the
            ``(generator output, image grid)`` pair.

    Returns:
        The generator output (``output_dir`` is None), the image grid, or
        the pair of both, as described above.
    """
    generator = self.net_G
    generator.eval()

    data_t = self.get_data_t(data, self.net_G_output, self.data_prev, 0)
    if self.is_inference or self.sequence_length > 1:
        self.data_prev = data_t

    with dg.no_grad():
        self.net_G_output = generator(data_t)

    if output_dir is None:
        return self.net_G_output

    if getattr(inference_args, 'save_fake_only', False):
        ys, ye, xs, xe = get_face_bbox_for_output(
            None, data_t['label'][0:1], crop_smaller=0)
        image_grid = tensor2im(self.net_G_output['fake_images'])[0]
        grid_h, grid_w, _ = image_grid.shape
        # NOTE(review): hard-coded absolute path; also confirm the resize
        # argument order matches the [ys:ye, xs:xe] slice for non-square
        # boxes.
        face_mask = Image.open(
            '/home/aistudio/vid2vid/test/images/face.png').resize(
                (ye - ys, xe - xs))
        mask = np.zeros((grid_h, grid_w, 3)).astype("uint8")
        mask[ys:ye, xs:xe, :] = np.array(face_mask)[:, :, :3]
        # Blank the pixels covered by the mask, then paste the mask in.
        image_grid[mask != 0] = 0
        image_grid += mask
    else:
        vis_images = self.get_test_output_images(data)
        columns = [np.vstack(im) for im in vis_images if im is not None]
        image_grid = np.hstack(columns)

    if 'img_name' in data:
        save_name = data['img_name'].split('.')[0] + '.jpg'
    else:
        save_name = "%04d.jpg" % self.t
    output_filename = os.path.join(output_dir, save_name)
    os.makedirs(output_dir, exist_ok=True)
    imageio.imwrite(output_filename, image_grid)
    self.t += 1

    if return_fake_image:
        return image_grid
    return self.net_G_output, image_grid
def forward(self, ten_first, ten_second):
    """Estimate flow between two inputs at a resolution divisible by 32.

    Both inputs are resized so height and width are multiples of 32, the
    network runs without gradients, and the flow is resized back with its
    two channels rescaled by the width and height ratios.
    """
    orig_h, orig_w = ten_first.shape[2:]
    # Round each side up to the next multiple of 32.
    net_h = int(math.floor(math.ceil(orig_h / 32.0) * 32.0))
    net_w = int(math.floor(math.ceil(orig_w / 32.0) * 32.0))

    ten_first = L.image_resize(ten_first, (net_h, net_w))
    ten_second = L.image_resize(ten_second, (net_h, net_w))

    with dg.no_grad():
        flow = self.network(ten_first, ten_second)
        flow = L.image_resize(flow, (orig_h, orig_w))
        # Channel 0 is scaled by the width ratio, channel 1 by the height
        # ratio.
        flow[:, 0, :, :] *= float(orig_w) / float(net_w)
        flow[:, 1, :, :] *= float(orig_h) / float(net_h)
    return flow
def forward(self, x, mask_in=None):
    """Partial convolution of ``x`` with an optional validity mask.

    The mask statistics (``self.update_mask`` and ``self.mask_ratio``) are
    recomputed when a mask is supplied or the input shape changed since the
    last call; otherwise the cached values are reused.

    Args:
        x: 4-D input.
        mask_in: optional mask multiplied into ``x`` before the convolution.

    Returns:
        The output, or ``(output, self.update_mask)`` when
        ``self.return_mask`` is set.
    """
    assert len(x.shape) == 4

    input_shape = tuple(x.shape)
    if mask_in is not None or self.last_size != input_shape:
        self.last_size = input_shape
        with dg.no_grad():
            if self.weight_maskUpdater.dtype != x.dtype:
                self.weight_maskUpdater = self.weight_maskUpdater.astype(
                    x.dtype)

            if mask_in is not None:
                mask = mask_in
            elif self.multi_channel:
                # No mask given: treat every position as valid.
                mask = L.ones(x.shape, dtype=x.dtype)
            else:
                mask = L.ones((1, 1, x.shape[2], x.shape[3]), dtype=x.dtype)

            self.update_mask = nn.functional.conv2d(
                mask,
                self.weight_maskUpdater,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=1)

            # For mixed precision training, eps from 1e-8 ~ 1e-6
            eps = 1e-6
            self.mask_ratio = self.slide_winsize / (self.update_mask + eps)
            self.update_mask = L.clamp(self.update_mask, 0, 1)
            self.mask_ratio = self.mask_ratio * self.update_mask

    conv_input = x * mask if mask_in is not None else x
    raw_out = super(PartialConv2D, self).forward(conv_input)

    if self.bias is None:
        output = raw_out * self.mask_ratio
    else:
        # Rescale only the bias-free part of the response.
        bias_view = L.reshape(self.bias, (1, self.out_channels, 1, 1))
        output = (raw_out - bias_view) * self.mask_ratio + bias_view
        output = output * self.update_mask

    if self.return_mask:
        return output, self.update_mask
    return output
def forward(self, sen_q, seg_q, sen_k, seg_k):
    """Compute the contrastive loss for a batch of query/key inputs.

    Args:
        sen_q: first input of ``self.encoder_q`` (query side).
        seg_q: second input of ``self.encoder_q`` (query side).
        sen_k: first input of ``self.encoder_k`` (key side).
        seg_k: second input of ``self.encoder_k`` (key side).

    Returns:
        Mean softmax cross-entropy over the logits ``[l_pos, l_neg] / T``,
        where the positive (index 0) is the target class for every row.
    """
    # Query features.
    q = self.encoder_q(sen_q, seg_q)  # queries: N
    q = norm(q, dim=1)

    # Key features; no gradient flows to the key encoder.
    with D.no_grad():
        self._momentum_update_key_encoder()
        k = self.encoder_k(sen_k, seg_k)  # keys: NxC
        k = norm(k, dim=1)

    # Positive logits: one per row, from the matching query/key pair.
    l_pos = L.unsqueeze(
        L.reduce_sum(L.elementwise_mul(q, k), dim=1), axes=[-1])
    # Negative logits: NxK, against the queue.
    l_neg = L.matmul(q, self.queue.detach())

    # logits: Nx(1+K), scaled by the temperature.
    logits = L.concat([l_pos, l_neg], axis=-1)
    logits /= self.T

    # The positive sits at index 0 of every row, hence all-zero labels.
    labels = L.zeros([logits.shape[0]], dtype='int64')

    self._dequeue_and_enqueue(k)

    # ``labels`` was just built as a 1-D tensor, so the former
    # ``is not None`` / rank checks were always true; reshape directly.
    labels = L.reshape(labels, [-1, 1])
    loss = L.softmax_with_cross_entropy(logits, labels)
    loss = L.reduce_mean(loss)
    return loss
def loss_cardinality(self, outputs, targets, indices, num_boxes):
    """Compute the cardinality error for logging.

    This is the absolute error between the number of predicted non-empty
    boxes and the number of target boxes, per image. It is not a real loss
    and does not propagate gradients.

    Args:
        outputs: dict with "pred_logits" of shape
            [bs, num_queries, num_classes].
        targets: list of dicts, one per image, each with a "labels" entry.
        indices: unused here.
        num_boxes: unused here.

    Returns:
        Dict with the single key "cardinality_error".
    """
    with dg.no_grad():
        pred_logits = outputs["pred_logits"]
        tgt_lengths = dg.to_variable(
            [len(v["labels"]) for v in targets]).astype("float32")

        # A query counts as an object unless its argmax is the last class
        # ("no-object").
        no_object_class = pred_logits.shape[-1] - 1
        is_object = (L.argmax(pred_logits, -1) != no_object_class).astype(
            "float32")
        # Sum over the query axis only, giving one count per image to match
        # tgt_lengths. Without ``dim`` the whole batch was summed into one
        # total.
        card_pred = L.reduce_sum(is_object, dim=1)

        card_err = F.loss.l1_loss(card_pred, tgt_lengths)
        losses = {"cardinality_error": card_err}
    return losses
def W_(self):
    """Return ``self.weight`` divided by its first estimated singular value.

    The weight is flattened to 2-D (and transposed if ``self.transpose``),
    then ``self.num_itrs`` power iterations are run. In training mode the
    estimated singular values are also stored into ``self.sv``.
    """
    weight_2d = self.weight.view(self.weight.size(0), -1)
    if self.transpose:
        weight_2d = weight_2d.t()

    for _ in range(self.num_itrs):
        svs, _us, _vs = power_iteration(
            weight_2d, self.u, update=self.training, eps=self.eps)

    if self.training:
        # Done under no_grad(), otherwise memory leaks.
        with dg.no_grad():
            for idx, sv in enumerate(svs):
                self.sv[idx][:] = sv

    return self.weight / svs[0]
def std_gen(batch_size=8, seed=None):
    """Generate ``batch_size`` images from random latents and classes.

    Args:
        batch_size: number of images.
        seed: if given, ``rds.rng`` is replaced by a ``RandomState(seed)``.

    Returns:
        List of PIL images, one per sample.
    """
    with dg.no_grad():
        model_cache.train_mode = False
        model_cache.initialized = False

        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random

        generator = model_cache.G
        latent_np = rds.rng.randn(batch_size, 140).astype('float32')
        class_np = rds.rng.randint(
            0, 1000, size=[batch_size]).astype('int64')
        latent = dg.to_variable(latent_np)
        class_ids = dg.to_variable(class_np)
        class_one_hot = layers.one_hot(
            layers.unsqueeze(class_ids, [1]), depth=1000)

        generated = generator(latent, class_one_hot)
        # Clip to [0, 1] and convert to 8-bit.
        pixels = np.uint8(generated.numpy().clip(0, 1) * 255)

    # Each sample is transposed with axes [1, 2, 0] for PIL.
    return [Image.fromarray(sample.transpose([1, 2, 0])) for sample in pixels]
def compute_fid(fid_path, data_loader, net_G, key_real='tgt_image',
                key_fake='fake_images', sample_size=None, preprocess=None,
                is_video=False, few_shot_video=False):
    """Compute the FID score between real and generated data.

    Args:
        fid_path (str): Location of the file used for the fake statistics.
            The real statistics use 'real_mean_cov.npz' in the same
            directory.
        data_loader (obj): Data loader object.
        net_G (obj): Generator used for the fake statistics; ``None`` is
            passed instead when computing the real statistics.
        key_real (str): Dictionary key for the real data.
        key_fake (str): Dictionary key for the fake data.
        sample_size (int or tuple): How many samples to use.
        preprocess (func): Preprocess function applied to the data.
        is_video (bool): Whether video sequences are handled.
        few_shot_video (bool): If True, uses few-shot video synthesis.

    Returns:
        The value returned by ``calculate_frechet_distance``.
    """
    print("Computing FID.")
    shared_args = (sample_size, preprocess, is_video, few_shot_video)
    with dg.no_grad():
        # Statistics of the generated data.
        fake_mean, fake_cov = load_or_compute_stats(
            fid_path, data_loader, key_real, key_fake, net_G, *shared_args)

        # Statistics of the ground truth, stored next to ``fid_path``.
        real_stats_path = os.path.join(
            os.path.dirname(fid_path), 'real_mean_cov.npz')
        real_mean, real_cov = load_or_compute_stats(
            real_stats_path, data_loader, key_real, key_fake, None,
            *shared_args)

        fid = calculate_frechet_distance(
            real_mean, real_cov, fake_mean, fake_cov)
    return fid
def std_enc_with_D(path='miku.png', steps=2000, lr=4e-3, levels=(0, 3),
                   weights=(100, 1)):
    """Optimize latents so the generator output matches an image.

    The image is center-cropped to a square, resized to 256x256 and
    compared with the generator output both directly and through the
    discriminator features.

    Args:
        path: path of the target image.
        steps: number of optimization steps.
        lr: learning rate of the Adam optimizer.
        levels: indices into the feature list; index 0 is the image itself,
            later indices are the features returned by the discriminator.
        weights: loss weight for each entry of ``levels`` (paired by
            position). Both defaults are tuples so no mutable object is
            shared between calls; any sequence is accepted.

    Returns:
        Tuple of (reconstructed PIL image, resized 256x256 target image).
    """
    model_cache.train_mode = False
    model_cache.initialized = False

    # Center-crop to a square, then resize to the working resolution.
    img = Image.open(path)
    w, h = img.size
    min_size = min(w, h)
    x0 = (w - min_size) // 2
    y0 = (h - min_size) // 2
    x1 = x0 + min_size
    y1 = y0 + min_size
    img = img.crop([x0, y0, x1, y1]).convert('RGB')
    img = _img = img.resize([256, 256], Image.BILINEAR)
    img = np.asarray(img) / 255.0
    img = dg.to_variable(img.transpose(2, 0, 1).astype('float32')[None, ...])

    m_latent = Latents()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=lr, parameter_list=m_latent.parameters())

    for i in range(steps):
        z, class_emb = m_latent()
        out = model_cache.G(z, class_emb, input_class_emb=True)

        # The target features need no gradient; the generated ones do.
        with dg.no_grad():
            _, real_features = model_cache.D(img)
        real_features = [img] + real_features
        _, fake_features = model_cache.D(out)
        fake_features = [out] + fake_features

        loss = 0
        for idx, weight in zip(levels, weights):
            r, f = real_features[idx], fake_features[idx]
            loss = loss + weight * layers.mean((f - r)**2)

        loss.backward()
        optimizer.minimize(loss)
        optimizer.clear_gradients()

        stdout.write(f'loss: {loss.numpy().mean()} {i+1}/{steps}\r')
        stdout.flush()
    print('')

    out = np.uint8(out.numpy()[0].transpose(1, 2, 0).clip(0, 1) * 255)
    return Image.fromarray(out), _img
def step(self, data):
    """Run one discriminator and one generator pass per time step.

    When ``self.cfg.trainer.reuse_gen_output`` is true, a single generator
    forward pass is shared by both updates; otherwise the discriminator
    pass uses a no-grad forward and the generator runs again for its own
    update. Outputs whose 'fake_images_source' is 'pretrained' are skipped
    by both updates.

    Args:
        data: input passed to ``self.get_data_t`` for every time step.
    """
    # Sharing the generator output saves time but uses a bit more memory.
    share_output = getattr(self.cfg.trainer, 'reuse_gen_output', False)

    past_frames = [None, None]
    net_G_output = None
    previous_data = None
    for t in range(self.sequence_length):
        data_t = self.get_data_t(data, net_G_output, previous_data, t)
        previous_data = data_t

        # ---- Discriminator update ----
        if share_output:
            net_G_output = self.net_G(data_t)
        else:
            with dg.no_grad():
                net_G_output = self.net_G(data_t)
        data_t, net_G_output = self.post_process(data_t, net_G_output)

        # Outputs without a source tag were generated in training.
        if 'fake_images_source' not in net_G_output:
            net_G_output['fake_images_source'] = 'in_training'
        if net_G_output['fake_images_source'] != 'pretrained':
            net_D_output, _ = self.net_D(
                data_t, detach(net_G_output), past_frames)
            self.get_dis_losses(net_D_output)

        # ---- Generator update ----
        if not share_output:
            net_G_output = self.net_G(data_t)
            data_t, net_G_output = self.post_process(data_t, net_G_output)

        if 'fake_images_source' not in net_G_output:
            net_G_output['fake_images_source'] = 'in_training'
        if net_G_output['fake_images_source'] != 'pretrained':
            net_D_output, past_frames = self.net_D(
                data_t, net_G_output, past_frames)
            self.get_gen_losses(data_t, net_G_output, net_D_output)
def renorm_gen_interpolate(batch_size=8, seed=None, out_path='data/out.gif'):
    """Render an interpolation between random samples as an animated GIF.

    The generator is first run once on the whole batch with
    ``model_cache.train_mode`` enabled, then switched back before the
    interpolation frames are produced.

    Args:
        batch_size: number of key samples; consecutive ones (wrapping around
            to the first) are blended in 40 steps each.
        seed: if given, ``rds.rng`` is replaced by a ``RandomState(seed)``.
        out_path: path of the GIF to write.

    Returns:
        The saved GIF, reopened with ``Image.open``.
    """
    steps_per_pair = 40
    with dg.no_grad():
        model_cache.train_mode = True
        model_cache.initialized = True

        if seed is not None:
            rds.rng = np.random.RandomState(seed)
        elif rds.rng is None:
            rds.rng = np.random

        G = model_cache.G
        latent_np = rds.rng.randn(batch_size, 140).astype('float32')
        class_np = rds.rng.randint(
            0, 1000, size=[batch_size]).astype('int64')
        x = dg.to_variable(latent_np)
        class_ids = dg.to_variable(class_np)
        class_one_hot = layers.one_hot(
            layers.unsqueeze(class_ids, [1]), depth=1000)
        y_embed = G.embed_y(class_one_hot)

        # One pass in train mode on the key samples; the result is unused.
        G(x, y_embed, True)
        model_cache.train_mode = False
        model_cache.initialized = True

        # Append the first sample so the last segment wraps around.
        x = layers.concat([x, x[:1]], 0)
        y_embed = layers.concat([y_embed, y_embed[:1]], 0)

        frames = []
        for i in range(batch_size):
            for j in range(steps_per_pair):
                alpha = j / (40 - 1)
                blended_x = (1 - alpha) * x[i:i + 1] + alpha * x[i + 1:i + 2]
                blended_y = ((1 - alpha) * y_embed[i:i + 1]
                             + alpha * y_embed[i + 1:i + 2])
                generated = G(blended_x, blended_y, True)
                pixels = np.uint8(generated.numpy().clip(0, 1) * 255)[0]
                frames.append(Image.fromarray(pixels.transpose([1, 2, 0])))
                stdout.write(f'{i*40+j+1}/{40*batch_size}\r')
                stdout.flush()
        print('')

        frames[0].save(out_path, save_all=True, append_images=frames[1:],
                       duration=40, loop=0)
        return Image.open(out_path)
def forward(self, outputs, targets):
    """Match predictions to targets by minimizing a combined cost.

    Params:
        outputs: dict containing at least
            "pred_logits": tensor [batch_size, num_queries, num_classes]
                with the classification logits
            "pred_boxes": tensor [batch_size, num_queries, 4] with the
                predicted box coordinates
        targets: list of dicts (len(targets) == batch_size), each with
            "labels": tensor [num_target_boxes] of class labels
            "boxes": tensor [num_target_boxes, 4] of box coordinates

    Returns:
        A list of size batch_size of tuples (index_i, index_j) where
            - index_i holds the indices of the selected predictions
            - index_j holds the indices of the matching targets
        For each batch element:
            len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
    """
    with dg.no_grad():
        batch_size, num_queries, num_classes = outputs["pred_logits"].shape

        # Flatten batch and query axes so all costs are computed at once.
        flat_logits = L.reshape(outputs["pred_logits"], [-1, num_classes])
        out_prob = L.softmax(flat_logits, axis=-1)
        out_bbox = L.reshape(outputs["pred_boxes"], [-1, 4])

        # Concatenate the targets of all images.
        tgt_ids = L.concat([v["labels"] for v in targets]).astype("int64")
        tgt_bbox = L.concat([v["boxes"] for v in targets]).astype("float32")

        # Classification cost: -proba[target class]. The constant 1 of
        # "1 - proba" does not change the matching and is left out.
        cost_class = dg.to_variable(
            -out_prob.numpy()[:, tgt_ids.numpy()])

        # L1 cost between every predicted box and every target box.
        num_all_target_boxes = tgt_bbox.shape[0]
        tiled_out_bbox = L.expand(
            L.unsqueeze(out_bbox, [1]), [1, num_all_target_boxes, 1])
        tiled_tgt_bbox = L.expand(
            L.unsqueeze(tgt_bbox, [0]), [batch_size * num_queries, 1, 1])
        cost_bbox = F.loss.l1_loss(
            tiled_out_bbox, tiled_tgt_bbox, reduction='none')
        cost_bbox = L.reduce_mean(cost_bbox, -1)

        # GIoU cost between boxes.
        cost_giou = -generalied_box_iou(
            box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))

        # Final cost matrix: [batch_size, num_queries, num_all_target_boxes]
        C = (self.cost_bbox * cost_bbox + self.cost_class * cost_class
             + self.cost_giou * cost_giou)
        C = L.reshape(C, [batch_size, num_queries, -1])

        # Split the target axis per image and solve each assignment.
        sizes = [len(v["boxes"]) for v in targets]
        indices = []
        for image_idx, chunk in enumerate(L.split(C, sizes, dim=-1)):
            indices.append(linear_sum_assignment(chunk[image_idx].numpy()))

        return [(dg.to_variable(rows.astype("int64")),
                 dg.to_variable(cols.astype("int64")))
                for rows, cols in indices]
def __call__(self, mel):
    """Synthesize audio from ``mel`` without gradient tracking.

    The model is switched to eval mode for the synthesis and back to train
    mode afterwards, even if ``synthesize`` raises.

    Args:
        mel: input forwarded to ``self.model.synthesize``.

    Returns:
        The value returned by ``self.model.synthesize``.
    """
    with dg.no_grad():
        self.model.eval()
        try:
            audio = self.model.synthesize(mel)
        finally:
            # Restore train mode on failure too, so an exception does not
            # leave the model stuck in eval mode.
            self.model.train()
    return audio
def apply_optimize(self, loss, startup_program, params_grads):
    """Apply the base optimizer step, then decay the selected parameters.

    Every parameter whose name does not match ``self.pat`` is scaled in
    place by ``1 - self.wd * self.current_step_lr()``.

    Args:
        loss: forwarded to the parent ``apply_optimize``.
        startup_program: forwarded to the parent ``apply_optimize``.
        params_grads: iterable of ``(parameter, gradient)`` pairs.
    """
    super(AdamW, self).apply_optimize(loss, startup_program, params_grads)
    for param, _grad in params_grads:
        # Names matching the pattern are excluded from weight decay.
        if self.pat.match(param.name):
            continue
        with D.no_grad():
            decayed = param * (1. - self.wd * self.current_step_lr())
            L.assign(decayed, param)