def do_train(self, paths, dataset, optimiser, epochs, batch_size, step, lr=1e-4, valid_index=None, use_half=False):
    """Train the model on mel-conditioned coarse/fine 8-bit targets.

    Runs `epochs` passes over `dataset`, optimising an NLL loss over the
    coarse and fine output heads, checkpointing after every epoch and
    generating validation samples every ~50k steps.

    Args:
        paths: project Paths object providing checkpoint/model/step paths.
        dataset: Dataset yielding (mels, coarse, fine, coarse_f, fine_f).
        optimiser: torch optimiser over this model's parameters; wrapped in
            apex FP16_Optimizer when `use_half` is set.
        epochs, batch_size: training schedule.
        step: global step counter to resume from; returned implicitly via
            np.save(paths.step_path(), step) each epoch.
        lr: learning rate written into every param group.
        valid_index: indices used by do_generate for validation output
            (default: empty list).
        use_half: enable apex fp16 training.
    """
    # NOTE: was `valid_index=[]` — mutable default arguments are shared
    # across calls; use a None sentinel instead (behaviour unchanged).
    if valid_index is None:
        valid_index = []
    if use_half:
        import apex
        optimiser = apex.fp16_utils.FP16_Optimizer(optimiser, dynamic_loss_scale=True)
    for p in optimiser.param_groups:
        p['lr'] = lr
    criterion = nn.NLLLoss().cuda()
    k = 0          # step count in thousands (for display / history saves)
    saved_k = 0    # last k at which a history checkpoint was written
    # win_length / hop_length are module-level globals — TODO confirm they
    # are defined where this file is assembled.
    print(win_length, hop_length, win_length / hop_length)
    for e in range(epochs):
        # The hard-coded 16 presumably equals win_length // hop_length
        # (frames of conditioning per window) — verify against the STFT
        # parameters if those change.
        trn_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate(0, 16, 0, batch),
            batch_size=batch_size,
            num_workers=2,
            shuffle=True,
            pin_memory=True)
        start = time.time()
        running_loss_c = 0.
        running_loss_f = 0.
        iters = len(trn_loader)
        for i, (mels, coarse, fine, coarse_f, fine_f) in enumerate(trn_loader):
            mels, coarse, fine, coarse_f, fine_f = mels.cuda(
            ), coarse.cuda(), fine.cuda(), coarse_f.cuda(), fine_f.cuda()
            # Trim one hop of padding from each side of the sample streams.
            coarse, fine, coarse_f, fine_f = [
                t[:, hop_length:1 - hop_length]
                for t in [coarse, fine, coarse_f, fine_f]
            ]
            if use_half:
                mels = mels.half()
                coarse_f = coarse_f.half()
                fine_f = fine_f.half()
            # Input at time t: (coarse_t, fine_t, coarse_{t+1}) — the fine
            # head is conditioned on the coarse sample it must accompany.
            x = torch.cat([
                coarse_f[:, :-1].unsqueeze(-1),
                fine_f[:, :-1].unsqueeze(-1),
                coarse_f[:, 1:].unsqueeze(-1)
            ], dim=2)
            p_c, p_f, _h_n = self(x, mels)
            loss_c = criterion(p_c.transpose(1, 2).float(), coarse[:, 1:])
            loss_f = criterion(p_f.transpose(1, 2).float(), fine[:, 1:])
            loss = loss_c + loss_f
            optimiser.zero_grad()
            if use_half:
                # FP16_Optimizer owns the backward pass (loss scaling).
                optimiser.backward(loss)
            else:
                loss.backward()
            optimiser.step()
            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            self.after_update()
            speed = (i + 1) / (time.time() - start)
            avg_loss_c = running_loss_c / (i + 1)
            avg_loss_f = running_loss_f / (i + 1)
            step += 1
            k = step // 1000
            logger.status(
                f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} -- Speed: {speed:#.4} steps/sec -- Step: {k}k '
            )
        # End-of-epoch checkpoint: weights + global step counter.
        os.makedirs(paths.checkpoint_dir, exist_ok=True)
        torch.save(self.state_dict(), paths.model_path())
        np.save(paths.step_path(), step)
        logger.log_current_status()
        logger.log(
            f' <saved>; w[0][0] = {self.wavernn.gru.weight_ih_l0[0][0]}')
        # Every 50k steps, keep a historical checkpoint and render samples.
        if k > saved_k + 50:
            torch.save(self.state_dict(), paths.model_hist_path(step))
            saved_k = k
            self.do_generate(paths, step, dataset.path, valid_index, use_half=use_half)
def do_train(self, paths, dataset, optimiser, epochs, batch_size, step, lr=1e-4, valid_index=None, use_half=False, do_clip=False):
    """Train the multispeaker VQ-VAE model on raw 16-bit waveforms.

    Minimises coarse+fine NLL plus the VQ commitment penalty and a warmed-up
    encoder penalty. Optionally applies noise augmentation (`self.noise_x`
    shifts the encoder input, `self.noise_y` perturbs amplitude) and manual
    gradient clipping with crash-out on pathological gradients.

    Args:
        paths: project Paths object for checkpoint locations.
        dataset: Dataset yielding (speaker, wave16) pairs.
        optimiser: torch optimiser; wrapped by apex FP16_Optimizer if
            `use_half`.
        epochs, batch_size, step, lr: training schedule / resume step.
        valid_index: indices forwarded to do_generate (default: empty list).
        use_half: fp16 training via apex.
        do_clip: enable the manual gradient-magnitude clipping pass
            (not implemented under fp16 — raises RuntimeError).

    Raises:
        RuntimeError: if `do_clip` is combined with `use_half`, or if a
            gradient magnitude exceeds 100000 (model dumped to bad_model.pyt).
    """
    # NOTE: was `valid_index=[]` — avoid mutable default arguments.
    if valid_index is None:
        valid_index = []
    if use_half:
        import apex
        optimiser = apex.fp16_utils.FP16_Optimizer(optimiser, dynamic_loss_scale=True)
    for p in optimiser.param_groups:
        p['lr'] = lr
    criterion = nn.NLLLoss().cuda()
    k = 0
    saved_k = 0
    # Padding bookkeeping: encoder and decoder consume different amounts of
    # left context; noise_x reserves 127 extra samples on the right so the
    # encoder input can be randomly shifted by up to +/-128.
    pad_left = self.pad_left()
    pad_left_encoder = self.pad_left_encoder()
    pad_left_decoder = self.pad_left_decoder()
    if self.noise_x:
        extra_pad_right = 127
    else:
        extra_pad_right = 0
    pad_right = self.pad_right() + extra_pad_right
    window = 16 * self.total_scale()
    logger.log(
        f'pad_left={pad_left_encoder}|{pad_left_decoder}, pad_right={pad_right}, total_scale={self.total_scale()}'
    )
    for e in range(epochs):
        trn_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate_multispeaker_samples(
                pad_left, window, pad_right, batch),
            batch_size=batch_size,
            num_workers=2,
            shuffle=True,
            pin_memory=True)
        start = time.time()
        running_loss_c = 0.
        running_loss_f = 0.
        running_loss_vq = 0.
        running_loss_vqc = 0.
        running_entropy = 0.
        running_max_grad = 0.
        running_max_grad_name = ""
        iters = len(trn_loader)
        for i, (speaker, wave16) in enumerate(trn_loader):
            speaker = speaker.cuda()
            wave16 = wave16.cuda()
            # Split signed 16-bit samples into unsigned coarse/fine bytes,
            # then rescale each byte to [-1, 1] for the network input.
            coarse = (wave16 + 2**15) // 256
            fine = (wave16 + 2**15) % 256
            coarse_f = coarse.float() / 127.5 - 1.
            fine_f = fine.float() / 127.5 - 1.
            total_f = (wave16.float() + 0.5) / 32767.5
            if self.noise_y:
                # Random per-utterance gain (log-normal) plus small additive
                # noise on the encoder's view of the waveform.
                noisy_f = total_f * (
                    0.02 * torch.randn(total_f.size(0), 1).cuda()
                ).exp() + 0.003 * torch.randn_like(total_f)
            else:
                noisy_f = total_f
            if use_half:
                coarse_f = coarse_f.half()
                fine_f = fine_f.half()
                noisy_f = noisy_f.half()
            # Decoder input at t: (coarse_t, fine_t, coarse_{t+1}).
            x = torch.cat([
                coarse_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                fine_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
                coarse_f[:, pad_left - pad_left_decoder + 1:1 - pad_right].unsqueeze(-1),
            ], dim=2)
            y_coarse = coarse[:, pad_left + 1:1 - pad_right]
            y_fine = fine[:, pad_left + 1:1 - pad_right]
            if self.noise_x:
                # Randomly translate the input to the encoder to encourage
                # translational invariance
                total_len = coarse_f.size(1)
                translated = []
                for j in range(coarse_f.size(0)):
                    shift = random.randrange(256) - 128
                    translated.append(
                        noisy_f[j, pad_left - pad_left_encoder + shift:total_len - extra_pad_right + shift])
                translated = torch.stack(translated, dim=0)
            else:
                translated = noisy_f[:, pad_left - pad_left_encoder:]
            p_cf, vq_pen, encoder_pen, entropy = self(
                speaker, x, translated)
            p_c, p_f = p_cf
            loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
            loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
            # Ramp the encoder penalty in between 1k and 2k steps
            # (0.001 -> 0.01), so the VQ codebook settles first.
            encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
            loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen
            optimiser.zero_grad()
            if use_half:
                optimiser.backward(loss)
                if do_clip:
                    raise RuntimeError(
                        "clipping in half precision is not implemented yet"
                    )
            else:
                loss.backward()
                if do_clip:
                    # Manual clipping: find the largest per-parameter
                    # gradient magnitude, rescale everything if it exceeds
                    # 100, zero it if it is absurd, and abort past 100000.
                    max_grad = 0
                    max_grad_name = ""
                    for name, param in self.named_parameters():
                        if param.grad is not None:
                            param_max_grad = param.grad.data.abs().max()
                            if param_max_grad > max_grad:
                                max_grad = param_max_grad
                                max_grad_name = name
                            if 1000000 < param_max_grad:
                                logger.log(
                                    f'Very large gradient at {name}: {param_max_grad}'
                                )
                    if 100 < max_grad:
                        for param in self.parameters():
                            if param.grad is not None:
                                if 1000000 < max_grad:
                                    param.grad.data.zero_()
                                else:
                                    param.grad.data.mul_(100 / max_grad)
                    if running_max_grad < max_grad:
                        running_max_grad = max_grad
                        running_max_grad_name = max_grad_name
                    if 100000 < max_grad:
                        torch.save(self.state_dict(), "bad_model.pyt")
                        raise RuntimeError(
                            "Aborting due to crazy gradient (model saved to bad_model.pyt)"
                        )
            optimiser.step()
            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            running_loss_vq += vq_pen.item()
            running_loss_vqc += encoder_pen.item()
            # NOTE(review): entropy is accumulated without .item() unlike the
            # losses — presumably the forward pass already returns a Python
            # float; confirm against the model's forward().
            running_entropy += entropy
            self.after_update()
            speed = (i + 1) / (time.time() - start)
            avg_loss_c = running_loss_c / (i + 1)
            avg_loss_f = running_loss_f / (i + 1)
            avg_loss_vq = running_loss_vq / (i + 1)
            avg_loss_vqc = running_loss_vqc / (i + 1)
            avg_entropy = running_entropy / (i + 1)
            step += 1
            k = step // 1000
            logger.status(
                f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} vq={avg_loss_vq:#.4} vqc={avg_loss_vqc:#.4} -- Entropy: {avg_entropy:#.4} -- Grad: {running_max_grad:#.1} {running_max_grad_name} Speed: {speed:#.4} steps/sec -- Step: {k}k '
            )
        # End-of-epoch checkpoint: weights + global step counter.
        os.makedirs(paths.checkpoint_dir, exist_ok=True)
        torch.save(self.state_dict(), paths.model_path())
        np.save(paths.step_path(), step)
        logger.log_current_status()
        logger.log(
            f' <saved>; w[0][0] = {self.overtone.wavernn.gru.weight_ih_l0[0][0]}'
        )
        # Every 50k steps, keep a historical checkpoint and render samples.
        if k > saved_k + 50:
            torch.save(self.state_dict(), paths.model_hist_path(step))
            saved_k = k
            self.do_generate(paths, step, dataset.path, valid_index)
def do_train(self, paths, dataset, optimiser, epochs, batch_size, step, lr=1e-4, valid_index=None, use_half=False):
    """Train the overtone model on unconditioned raw 16-bit waveforms.

    Minimises coarse+fine NLL with inf-norm gradient clipping at 1,
    checkpointing after every epoch and generating samples every ~50k steps.

    Args:
        paths: project Paths object for checkpoint locations.
        dataset: Dataset yielding wave16 sample windows.
        optimiser: torch optimiser; wrapped by apex FP16_Optimizer if
            `use_half`.
        epochs, batch_size, step, lr: training schedule / resume step.
        valid_index: indices forwarded to do_generate (default: empty list).
        use_half: fp16 training via apex.
    """
    # NOTE: was `valid_index=[]` — avoid mutable default arguments.
    if valid_index is None:
        valid_index = []
    if use_half:
        import apex
        optimiser = apex.fp16_utils.FP16_Optimizer(optimiser, dynamic_loss_scale=True)
    for p in optimiser.param_groups:
        p['lr'] = lr
    criterion = nn.NLLLoss().cuda()
    k = 0
    saved_k = 0
    pad_left = self.overtone.pad()
    time_span = 16 * 64
    for e in range(epochs):
        trn_loader = DataLoader(
            dataset,
            collate_fn=lambda batch: env.collate_samples(
                pad_left, time_span, 1, batch),
            batch_size=batch_size,
            num_workers=2,
            shuffle=True,
            pin_memory=True)
        start = time.time()
        running_loss_c = 0.
        running_loss_f = 0.
        max_grad = 0.
        max_grad_name = ""
        iters = len(trn_loader)
        for i, wave16 in enumerate(trn_loader):
            wave16 = wave16.to(self.DEVICE)
            # Split signed 16-bit samples into coarse/fine bytes and rescale
            # each byte to [-1, 1] for the network input.
            coarse = (wave16 + 2**15) // 256
            fine = (wave16 + 2**15) % 256
            coarse_f = coarse.float() / 127.5 - 1.
            fine_f = fine.float() / 127.5 - 1.
            if use_half:
                coarse_f = coarse_f.half()
                fine_f = fine_f.half()
            # Input at time t: (coarse_t, fine_t, coarse_{t+1}).
            x = torch.cat([
                coarse_f[:, :-1].unsqueeze(-1),
                fine_f[:, :-1].unsqueeze(-1),
                coarse_f[:, 1:].unsqueeze(-1),
            ], dim=2)
            y_coarse = coarse[:, pad_left + 1:]
            y_fine = fine[:, pad_left + 1:]
            p_c, p_f = self(x)
            loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
            loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
            loss = loss_c + loss_f
            optimiser.zero_grad()
            if use_half:
                optimiser.backward(loss)
            else:
                loss.backward()
            # Track the largest gradient magnitude for the status line.
            # FIX: skip parameters with no gradient — `param.grad` is None
            # for any parameter unused in this loss, and dereferencing it
            # crashed here (the multispeaker variant already guards this).
            for name, param in self.named_parameters():
                if param.grad is not None:
                    param_max_grad = param.grad.data.abs().max()
                    if param_max_grad > max_grad:
                        max_grad = param_max_grad
                        max_grad_name = name
            nn.utils.clip_grad_norm_(self.parameters(), 1, 'inf')
            optimiser.step()
            running_loss_c += loss_c.item()
            running_loss_f += loss_f.item()
            self.after_update()
            speed = (i + 1) / (time.time() - start)
            avg_loss_c = running_loss_c / (i + 1)
            avg_loss_f = running_loss_f / (i + 1)
            step += 1
            k = step // 1000
            logger.status(
                f'Epoch: {e+1}/{epochs} -- Batch: {i+1}/{iters} -- Loss: c={avg_loss_c:#.4} f={avg_loss_f:#.4} -- Grad: {max_grad:#.1} {max_grad_name} Speed: {speed:#.4} steps/sec -- Step: {k}k '
            )
        # End-of-epoch checkpoint. FIX: ensure the checkpoint directory
        # exists first, as the other training variants do.
        os.makedirs(paths.checkpoint_dir, exist_ok=True)
        torch.save(self.state_dict(), paths.model_path())
        np.save(paths.step_path(), step)
        logger.log_current_status()
        logger.log(
            f' <saved>; w[0][0] = {self.overtone.wavernn.gru.weight_ih_l0[0][0]}'
        )
        # Every 50k steps, keep a historical checkpoint and render samples.
        if k > saved_k + 50:
            torch.save(self.state_dict(), paths.model_hist_path(step))
            saved_k = k
            self.do_generate(paths, step, dataset.path, valid_index)
            logger.log('done generation')
def generate(self, cond, global_cond, n=None, seq_len=None, verbose=False, use_half=False):
    """Autoregressively generate waveforms with the hierarchical decoder.

    The model runs at four time scales (conv0/rnn2 every 4 samples,
    conv1/rnn1 every 16, conv2/rnn0 every 64, wavernn every sample);
    intermediate activations are kept in small rolling buffers (c0, c1,
    coarse) that are shifted left by 4 as new entries are written on the
    right, so each coarser layer is recomputed only when its stride elapses.

    Args:
        cond: local conditioning, indexed once per 64 samples; may be None.
        global_cond: global conditioning vector (e.g. speaker); may be None.
        n: batch size (defaults to cond.size(0)).
        seq_len: number of samples to generate
            (defaults to (cond frames - cond_pad) * 64).
        verbose: log sampled categories for the first 100 of every 10000
            steps.
        use_half: run the sampling loop in fp16.

    Returns:
        Tensor of shape (n, seq_len) with samples in [-1, 1].
    """
    start = time.time()
    if n is None:
        n = cond.size(0)
    if seq_len is None:
        seq_len = (cond.size(1) - self.cond_pad) * 64
    # Empty template tensor fixing device/dtype for all new_zeros() below.
    if use_half:
        std_tensor = torch.tensor([]).cuda().half()
    else:
        std_tensor = torch.tensor([]).cuda()
    # Warmup
    # Prime the rolling buffers and RNN hidden states by running the full
    # stack once on zero input plus the conditioning pad region.
    c0 = self.conv0(std_tensor.new_zeros(n, 10, 1), global_cond).repeat(1, 10, 1)
    c1 = self.conv1(c0, global_cond).repeat(1, 10, 1)
    c2 = self.conv2(c1, global_cond)
    if cond is None:
        pad_cond = None
    else:
        pad_cond = cond[:, :self.cond_pad]
    #logger.log(f'pad_cond: {pad_cond.size()}')
    r0, h0 = self.rnn0(torch.cat(filter_none([c2.repeat(1, 85, 1), pad_cond]), dim=2), global_cond)
    r1, h1 = self.rnn1(torch.cat([c1.repeat(1, 9, 1)[:, :84], r0], dim=2), global_cond)
    r2, h2 = self.rnn2(torch.cat([c0.repeat(1, 8, 1), r1], dim=2), global_cond)
    if global_cond is not None:
        global_cond_1 = global_cond.unsqueeze(1).expand(-1, r2.size(1), -1)
    else:
        global_cond_1 = None
    # [2] selects the wavernn hidden state from its returned tuple —
    # only the warmed-up state is kept. TODO confirm index against wavernn.
    h3 = self.wavernn(std_tensor.new_zeros(n, 64, 3), torch.cat(filter_none([r2, global_cond_1]), dim=2))[2]
    # Create cells
    # Single-step cell versions of the sequence modules for the sample loop.
    cell0 = self.rnn0.to_cell()
    cell1 = self.rnn1.to_cell()
    cell2 = self.rnn2.to_cell()
    wcell = self.wavernn.to_cell()
    # Main loop!
    # `coarse` is a rolling window of recent coarse samples fed to conv0;
    # c_val/f_val hold the previous coarse/fine sample values in [-1, 1].
    coarse = std_tensor.new_zeros(n, 10, 1)
    c_val = std_tensor.new_zeros(n)
    f_val = std_tensor.new_zeros(n)
    zero = std_tensor.new_zeros(n)
    output = []
    for t in range(seq_len):
        #logger.log(f't = {t}')
        # t0/t1/t2 are phase counters at stride 4/16/64; ct* index the
        # write slot from the right end of each rolling buffer.
        t0 = t % 4
        ct0 = (-t) % 4
        if t0 == 0:
            # Every 4 samples: refresh conv0 output and shift its buffer.
            t1 = (t // 4) % 4
            ct1 = ((-t) // 4) % 4
            #logger.log(f'written to c0[{-ct1-1}]')
            c0[:, -ct1-1].copy_(self.conv0(coarse, global_cond).squeeze(1))
            coarse[:, :-4].copy_(coarse[:, 4:])
            if t1 == 0:
                # Every 16 samples: refresh conv1 output, shift c0.
                t2 = (t // 16) % 4
                ct2 = ((-t) // 16) % 4
                #logger.log('read c0')
                #logger.log(f'written to c1[{-ct2-1}]')
                c1[:, -ct2-1].copy_(self.conv1(c0, global_cond).squeeze(1))
                c0[:, :-4].copy_(c0[:, 4:])
                if t2 == 0:
                    # Every 64 samples: refresh conv2, shift c1, and step
                    # the top-level RNN with the next conditioning frame.
                    #logger.log('read c1')
                    #logger.log('written to c2')
                    c2 = self.conv2(c1, global_cond).squeeze(1)
                    c1[:, :-4].copy_(c1[:, 4:])
                    #logger.log('read c2')
                    #logger.log('written to r0')
                    if cond is None:
                        inp0 = c2
                    else:
                        inp0 = torch.cat([c2, cond[:, t // 64 + self.cond_pad]], dim=1)
                    r0, h0 = cell0(inp0, global_cond, h0)
                #logger.log(f'read r0[{t2}]')
                #logger.log(f'written to r1')
                #logger.log(f'c1: {c1.size()} r0: {r0.size()}')
                r1, h1 = cell1(torch.cat([c1[:, -ct2-1], r0[:, t2]], dim=1), global_cond, h1)
            #logger.log(f'read r1[{t1}]')
            #logger.log(f'written to r2')
            #logger.log(f'c0: {c0.size()} r1: {r1.size()}')
            r2, h2 = cell2(torch.cat([c0[:, -ct1-1], r1[:, t1]], dim=1), global_cond, h2)
        #logger.log(f'read r2[{t0}]')
        # Per-sample step: condition the WaveRNN cell on the current r2
        # frame, sample the coarse byte, then the fine byte given it.
        wcond = torch.cat(filter_none([r2[:, t0], global_cond]), dim=1)
        x = torch.stack([c_val, f_val, zero], dim=1)
        o_c = wcell.forward_c(x, wcond, None, None, h3)
        c_cat = utils.nn.sample_softmax(o_c).float()
        c_val_new = (c_cat / 127.5 - 1.0).to(std_tensor)
        x = torch.stack([c_val, f_val, c_val_new], dim=1)
        o_f, h3 = wcell.forward_f(x, wcond, None, None, h3)
        f_cat = utils.nn.sample_softmax(o_f).float()
        f_val = (f_cat / 127.5 - 1.0).to(std_tensor)
        c_val = c_val_new
        # Recombine the two bytes into a sample in [-1, 1].
        sample = (c_cat * 256 + f_cat) / 32767.5 - 1.0
        # Append the new coarse sample to conv0's rolling input window.
        coarse[:, 6+t0].copy_(c_val.unsqueeze(1))
        if verbose and t % 10000 < 100:
            logger.log(f'c={c_cat[0]} f={f_cat[0]} sample={sample[0]}')
        output.append(sample)
        if t % 100 == 0:
            speed = int((t + 1) / (time.time() - start))
            logger.status(f'{t+1}/{seq_len} -- Speed: {speed} samples/sec')
    return torch.stack(output, dim=1)
def generate(self, feat, aux1=None, aux2=None, aux3=None, deterministic=False, use_half=False, verbose=False, seq_len=None, batch_size=None):
    """Autoregressively sample a batch of waveforms from the WaveRNN cell.

    Each step samples (or argmaxes) the coarse byte given the previous
    sample pair, then the fine byte given that new coarse byte, and
    recombines them into one 16-bit sample scaled to [-1, 1].

    Args:
        feat: per-step conditioning features (batch, T, dim); may be None.
        aux1, aux2, aux3: optional per-step auxiliary conditioning streams.
        deterministic: take argmax instead of sampling the posterior.
        use_half: run the cell inputs in fp16.
        verbose: log sampled categories for the first 100 of every 10000
            steps.
        seq_len: number of samples (defaults to feat.size(1)).
        batch_size: batch size (defaults to feat.size(0)).

    Returns:
        Tensor of shape (batch_size, seq_len) with samples in [-1, 1].
    """
    t_start = time.time()
    if seq_len is None:
        seq_len = feat.size(1)
    if batch_size is None:
        batch_size = feat.size(0)

    hidden = torch.zeros(batch_size, self.rnn_dims).cuda()
    if use_half:
        hidden = hidden.half()
    coarse_val = torch.zeros(batch_size).cuda()
    fine_val = torch.zeros(batch_size).cuda()
    zeros = torch.zeros(batch_size).cuda()
    cell = self.to_cell()
    samples = []

    def pick_category(logits):
        # Greedy argmax or a draw from the categorical posterior.
        if deterministic:
            return torch.argmax(logits, dim=1).to(torch.float32)
        posterior = F.softmax(logits.float(), dim=1)
        return torch.distributions.Categorical(posterior).sample().float()

    for i in range(seq_len):
        # Slice the current frame out of each conditioning stream.
        m_t = None if feat is None else feat[:, i, :]
        a1_t = None if aux1 is None else aux1[:, i, :]
        a2_t = None if aux2 is None else aux2[:, i, :]
        a3_t = None if aux3 is None else aux3[:, i, :]

        # Coarse byte first, conditioned on the previous sample pair.
        x = torch.stack([coarse_val, fine_val, zeros], dim=1)
        if use_half:
            x = x.half()
        c_cat = pick_category(cell.forward_c(x, m_t, a1_t, a2_t, hidden))
        new_coarse = c_cat / 127.5 - 1.0

        # Fine byte next, conditioned additionally on the new coarse value.
        x = torch.stack([coarse_val, fine_val, new_coarse], dim=1)
        if use_half:
            x = x.half()
        o_f, hidden = cell.forward_f(x, m_t, a1_t, a3_t, hidden)
        f_cat = pick_category(o_f)
        fine_val = f_cat / 127.5 - 1.0
        coarse_val = new_coarse

        # Recombine the two bytes into one sample in [-1, 1].
        sample = (c_cat * 256 + f_cat) / 32767.5 - 1.0
        if verbose and i % 10000 < 100:
            logger.log(f'c={c_cat[0]} f={f_cat[0]} sample={sample[0]}')
        samples.append(sample)
        if i % 100 == 0:
            speed = int((i + 1) / (time.time() - t_start))
            logger.status(f'{i+1}/{seq_len} -- Speed: {speed} samples/sec')
    return torch.stack(samples, dim=1)