def save_states(global_step, writer, y_hat, y, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (B, C, T)
    y_hat = y_hat.squeeze(-1)
    # (B, T)
    y_hat = F.softmax(y_hat, dim=1).max(1)[1]

    # (T,)
    y_hat = y_hat[idx].data.cpu().long().numpy()
    y = y[idx].view(-1).data.cpu().long().numpy()

    y_hat = P.inv_mulaw_quantize(y_hat)
    y = P.inv_mulaw_quantize(y)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
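A note on the `librosa.output.write_wav` calls used here and in several later snippets: that API was removed in librosa 0.8.0. A minimal drop-in replacement using the `soundfile` package (which later examples, such as the `wavenet_hparams` variant below, already use) might look like the sketch that follows; the helper name `write_wav` is ours, not part of either library.

import soundfile as sf

def write_wav(path, y, sr):
    # y: float waveform in [-1, 1]; sr: sample rate in Hz.
    # Equivalent to the removed librosa.output.write_wav for float input.
    sf.write(path, y, samplerate=sr)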
def save_states(global_step, writer, y_hat, student_hat, y, input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        if hparams.use_gaussian:
            y_hat = y_hat.transpose(1, 2)
            y_hat = sample_from_gaussian(y_hat, log_scale_min=hparams.log_scale_min)
        else:
            y_hat = sample_from_discretized_mix_logistic(
                y_hat, log_scale_min=hparams.log_scale_min)

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()
        student_hat = student_hat[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)
            student_hat = P.inv_mulaw(student_hat, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0
    student_hat[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_teacher.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_student.wav".format(global_step))
    librosa.output.write_wav(path, student_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
def save_states(global_step, writer, y_hat, y, input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().item()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(wavenet_hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, wavenet_hparams.quantize_channels - 1)
        y = P.inv_mulaw_quantize(y, wavenet_hparams.quantize_channels - 1)
    else:
        # (B, T)
        if wavenet_hparams.output_distribution == "Logistic":
            y_hat = sample_from_discretized_mix_logistic(
                y_hat, log_scale_min=wavenet_hparams.log_scale_min)
        elif wavenet_hparams.output_distribution == "Normal":
            y_hat = sample_from_mix_gaussian(
                y_hat, log_scale_min=wavenet_hparams.log_scale_min)
        else:
            assert False

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(wavenet_hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, wavenet_hparams.quantize_channels)
            y = P.inv_mulaw(y, wavenet_hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "intermediate", "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_predicted.wav".format(global_step))
    # librosa.output.write_wav(path, y_hat, sr=wavenet_hparams.sample_rate)
    sf.write(path, y_hat, samplerate=wavenet_hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    # librosa.output.write_wav(path, y, sr=wavenet_hparams.sample_rate)
    sf.write(path, y, samplerate=wavenet_hparams.sample_rate)
def save_states(global_step, writer, y_hat, y, y_student, scale_tot,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        scale = y_hat[:, 1:, :]
        teacher_log_scale = scale.data.cpu().numpy()
        student_log_scale = torch.log(scale_tot).data.cpu().numpy()
        writer.add_histogram('log_teacher_scale', teacher_log_scale, global_step)
        writer.add_histogram('log_student_scale', student_log_scale, global_step)
        y_hat = sample_from_discretized_gaussian(
            y_hat, log_scale_min=hparams.log_scale_min)

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0
    y_student = y_student[idx].view(-1).data.cpu().numpy()
    y_student[length:] = 0

    # Save audio
    audio_dir = join(checkpoint_dir, "audio")
    os.makedirs(audio_dir, exist_ok=True)
    path = join(audio_dir, "step{:09d}_teacher_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_student_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_student, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y, sr=hparams.sample_rate)
    path = join(audio_dir, "step{:09d}.jpg".format(global_step))
    save_waveplot(path, y_teacher=y_hat, y_student=y_student, y_target=y,
                  writer=writer, global_step=global_step)
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir):
    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().long().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    initial_value = P.mulaw_quantize(0)
    print("Initial value:", initial_value)

    # (C,)
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=256).astype(np.float32)
    initial_input = Variable(
        torch.from_numpy(initial_input), volatile=True).view(1, 1, 256)
    initial_input = initial_input.cuda() if use_cuda else initial_input
    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)
    y_target = P.inv_mulaw_quantize(y_target)

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def test_mulaw():
    # Check corner cases
    assert P.mulaw_quantize(-1.0, 2) == 0
    assert P.mulaw_quantize(-0.5, 2) == 0
    assert P.mulaw_quantize(-0.001, 2) == 0
    assert P.mulaw_quantize(0.0, 2) == 1
    assert P.mulaw_quantize(0.0001, 2) == 1
    assert P.mulaw_quantize(0.5, 2) == 1
    assert P.mulaw_quantize(0.99999, 2) == 1
    assert P.mulaw_quantize(1.0, 2) == 2

    np.random.seed(1234)

    # forward/backward correctness
    for mu in [128, 256, 512]:
        for x in np.random.rand(100):
            y = P.mulaw(x, mu)
            assert y >= 0 and y <= 1
            x_hat = P.inv_mulaw(y, mu)
            assert np.allclose(x, x_hat)

    # forward/backward correctness for quantize
    for mu in [128, 256, 512]:
        for x, y in [(-1.0, 0), (0.0, mu // 2), (0.99999, mu - 1)]:
            y_hat = P.mulaw_quantize(x, mu)
            err = np.abs(x - P.inv_mulaw_quantize(y_hat, mu))
            print(y, y_hat, err)
            assert np.allclose(y, y_hat)
            # have small quantize error
            assert err <= 0.1

    # ndarray input
    for mu in [128, 256, 512]:
        x = np.random.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))

    # torch array input
    from warnings import warn
    import torch
    torch.manual_seed(1234)
    for mu in [128, 256, 512]:
        x = torch.rand(10)
        y = P.mulaw(x, mu)
        x_hat = P.inv_mulaw(y, mu)
        assert np.allclose(x, x_hat)
        P.inv_mulaw_quantize(P.mulaw_quantize(x))
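For reference, the `P.mulaw` / `P.inv_mulaw` pair exercised above implements standard mu-law companding (ITU-T G.711), and the quantize variants bin the companded value into integers. A minimal NumPy sketch of the round trip, written here for illustration and consistent with the corner cases in the test (it is an assumption that `P` follows exactly this floor-based binning):

import numpy as np

def mulaw(x, mu=256):
    # F(x) = sign(x) * ln(1 + mu*|x|) / ln(1 + mu), maps [-1, 1] -> [-1, 1]
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def inv_mulaw(y, mu=256):
    # Inverse: x = sign(y) * ((1 + mu)**|y| - 1) / mu
    return np.sign(y) * (1.0 / mu) * ((1.0 + mu) ** np.abs(y) - 1.0)

def mulaw_quantize(x, mu=256):
    # Map the companded value from [-1, 1] to integer bins 0..mu
    y = mulaw(x, mu)
    return ((y + 1) / 2 * mu).astype(np.int64)

def inv_mulaw_quantize(y, mu=256):
    # Undo the binning, then expand
    return inv_mulaw(2 * y.astype(np.float64) / mu - 1, mu)

x = np.linspace(-0.99, 0.99, 11)
assert np.allclose(x, inv_mulaw(mulaw(x)))
assert np.abs(x - inv_mulaw_quantize(mulaw_quantize(x))).max() < 0.1

Because the quantizer's top bin is mu itself (mulaw_quantize(1.0, 2) == 2 above), code that wants exactly `quantize_channels` classes passes `quantize_channels - 1` as mu, which is why both conventions appear throughout this collection.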
def save_log(sess, step, model, plot_dir, audio_dir, hp):
    predicts, targets = sess.run([model.log_outputs, model.targets])

    y_hat = P.inv_mulaw_quantize(predicts[0], hp.quantize_channels)
    y = P.inv_mulaw_quantize(targets[0], hp.quantize_channels)

    pred_wav_path = os.path.join(audio_dir, 'step-{}-pred.wav'.format(step))
    target_wav_path = os.path.join(audio_dir, 'step-{}-real.wav'.format(step))
    plot_path = os.path.join(plot_dir, 'step-{}-waveplot.png'.format(step))

    # Save audio
    librosa.output.write_wav(pred_wav_path, y_hat, sr=hp.sample_rate)
    librosa.output.write_wav(target_wav_path, y, sr=hp.sample_rate)

    # Save figure
    waveplot(plot_path, y_hat, y, hparams)
def synthesize(self, sess, n_samples, lc, gc):
    sess.run(tf.variables_initializer(self.var_q))

    if self.net.scalar_input:
        seeds = [0]
    else:
        seeds = [128]
    seeds = [seeds]
    seeds = np.repeat(seeds, self.batch_size, axis=0)
    generated = [seeds]

    if type(n_samples) == list:
        n_sample = max(n_samples)
    else:
        n_sample = n_samples

    for j in tqdm(range(n_sample)):
        sample = generated[-1]
        current_lc = lc[:, j, :]

        # Generation phase
        feed_dict = {
            self.sample_placeholder: sample,
            self.lc_placeholder: current_lc,
            self.gen_num: j
        }
        if self.gc_placeholder is not None:
            feed_dict.update({self.gc_placeholder: gc})
        prob, _layers = sess.run(
            [self.next_sample_prob, self.layers_out], feed_dict=feed_dict)

        # Update phase
        feed_dict = {
            self.initial: _layers[0],
            self.others: np.array(_layers[1:]),
            self.gen_num: j
        }
        sess.run(self.update_q_ops, feed_dict=feed_dict)

        if self.net.scalar_input:
            generated_sample = prob
        else:
            # TODO: random choice
            generated_sample = np.argmax(prob, axis=-1)
        generated.append(generated_sample)

    result = np.hstack(generated)
    if not self.net.scalar_input:
        result = P.inv_mulaw_quantize(
            result.astype(np.int16), self.net.quantization_channels)
    if type(n_samples) == list:
        result = [x[:n_samples[i]] for i, x in enumerate(result)]
    return result
def generate(self, sess, n_samples, lc, gc):
    sess.run(tf.variables_initializer(self.var_q))
    receptive_field = self.vocoder.net.receptive_field

    if self.vocoder.net.scalar_input:
        seeds = [0]
    else:
        seeds = [128]
    seeds = [seeds]
    seeds = np.repeat(seeds, self.batch_size, axis=0)
    # generated = []
    generated = [seeds]

    # for j in tqdm(range(receptive_field + n_samples)):
    #     if j < receptive_field:
    #         sample = seeds
    #         current_lc = np.zeros((self.batch_size, hparams.num_mels))
    #     else:
    #         sample = generated[-1]
    #         current_lc = lc[:, j - receptive_field, :]
    for j in tqdm(range(n_samples)):
        sample = generated[-1]
        current_lc = lc[:, j, :]

        # Generation phase
        feed_dict = {
            self.sample_placeholder: sample,
            self.lc_placeholder: current_lc,
            self.gen_num: j}
        if self.gc_placeholder is not None:
            feed_dict.update({self.gc_placeholder: gc})
        prob, _layers = sess.run(
            [self.next_sample_prob, self.layers_out], feed_dict=feed_dict)

        # Update phase
        feed_dict = {
            self.initial: _layers[0],
            self.others: np.array(_layers[1:]),
            self.gen_num: j}
        sess.run(self.update_q_ops, feed_dict=feed_dict)

        if self.vocoder.net.scalar_input:
            generated_sample = prob
        else:
            # TODO: random choice
            generated_sample = np.argmax(prob, axis=-1)
        generated.append(generated_sample)

    # result = np.hstack(generated)[:, receptive_field:]
    result = np.hstack(generated)
    if not self.vocoder.net.scalar_input:
        result = P.inv_mulaw_quantize(
            result.astype(np.int16), self.vocoder.net.quantization_channels)
    return result
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None,
                  writing_dir=None):
    from train import sanity_check
    sanity_check(model, c, g)
    # assert c is not None
    if c is not None:
        B = c.shape[0]
    else:
        B = 1  # c.shape[0]
    model.eval()
    if fast:
        model.make_generation_fast_()

    # Transform data to GPU
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    if hparams.upsample_conditional_features and length is None:
        length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size()

    with torch.no_grad():
        y_hat = model.incremental_forward(
            c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    y_hat_sample = y_hat.max(1)[1].view(B, -1).float()
    cross_entropy = model.binary_softmax_loss(y_hat_sample.unsqueeze(1), c)

    # Write the output
    with open(join(writing_dir, "info.json"), "w") as f:
        data = {"0.244": float(cross_entropy.detach().cpu().numpy())}
        json.dump(data, f, indent=4)

    if is_mulaw_quantize(hparams.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_linear_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = inv_linear_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = y_hat.view(B, -1).cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1)
    else:
        y_hat = y_hat.view(B, -1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        for i in range(B):
            y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i])

    if hparams.global_gain_scale > 0:
        for i in range(B):
            y_hat[i] /= hparams.global_gain_scale

    return y_hat
def batch_wavegen(model, c=None, g=None, fast=True, tqdm=tqdm, length=None):
    from train import sanity_check
    sanity_check(model, c, g)
    # assert c is not None
    if c is not None:
        B = c.shape[0]
    else:
        B = 1  # c.shape[0]
    model.eval()
    if fast:
        model.make_generation_fast_()

    # Transform data to GPU
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    if hparams.upsample_conditional_features and length is None:
        length = (c.shape[-1] - hparams.cin_pad * 2) * audio.get_hop_size()

    with torch.no_grad():
        y_hat = model.incremental_forward(
            c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_linear_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(B, -1).float().cpu().data.numpy()
        for i in range(B):
            y_hat[i] = inv_linear_quantize(y_hat[i], hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = y_hat.view(B, -1).cpu().data.numpy()
        for i in range(B):
            y_hat[i] = P.inv_mulaw(y_hat[i], hparams.quantize_channels - 1)
    else:
        y_hat = y_hat.view(B, -1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        for i in range(B):
            y_hat[i] = getattr(audio, hparams.postprocess)(y_hat[i])

    if hparams.global_gain_scale > 0:
        for i in range(B):
            y_hat[i] /= hparams.global_gain_scale

    return y_hat
def test_mulaw_real():
    fs, x = wavfile.read(example_audio_file())
    x = (x / 32768.0).astype(np.float32)
    mu = 256
    y = P.mulaw_quantize(x, mu)
    assert y.min() >= 0 and y.max() < mu
    # np.int was removed in NumPy 1.24; check for an integer dtype instead
    assert np.issubdtype(y.dtype, np.integer)

    x = P.inv_mulaw_quantize(y, mu) * 32768
    assert x.dtype == np.float32
    x = x.astype(np.int16)
def batch_wavegen(hparam, net, c_input=None, g_input=None, tqdm_=None, is_numpy=True):
    """Generate audio."""
    assert c_input is not None
    B = c_input.shape[0]
    net.set_train(False)

    if hparam.upsample_conditional_features:
        length = (c_input.shape[-1] - hparam.cin_pad * 2) * audio.get_hop_size()
    else:
        # already duplicated
        length = c_input.shape[-1]

    y_hat = net.incremental_forward(
        c=c_input, g=g_input, T=length, tqdm=tqdm_, softmax=True, quantize=True,
        log_scale_min=hparam.log_scale_min, is_numpy=is_numpy)

    if is_mulaw_quantize(hparam.input_type):
        # needs to be float since mulaw_inv returns in range of [-1, 1]
        y_hat = np.reshape(np.argmax(y_hat, 1), (B, -1))
        y_hat = y_hat.astype(np.float32)
        for k in range(B):
            y_hat[k] = P.inv_mulaw_quantize(y_hat[k], hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        y_hat = np.reshape(y_hat, (B, -1))
        for k in range(B):
            y_hat[k] = P.inv_mulaw(y_hat[k], hparam.quantize_channels - 1)
    else:
        y_hat = np.reshape(y_hat, (B, -1))

    if hparam.postprocess is not None and hparam.postprocess not in ["", "none"]:
        for k in range(B):
            y_hat[k] = getattr(audio, hparam.postprocess)(y_hat[k])

    if hparam.global_gain_scale > 0:
        for k in range(B):
            y_hat[k] /= hparam.global_gain_scale
    return y_hat
def save_ref_audio(hparam, ref, length, target_wav_path_):
    """Save reference audio."""
    if is_mulaw_quantize(hparam.input_type):
        ref = np.reshape(np.argmax(ref, 0), (-1))[:length]
        ref = ref.astype(np.float32)
    else:
        ref = np.reshape(ref, (-1))[:length]

    if is_mulaw_quantize(hparam.input_type):
        ref = P.inv_mulaw_quantize(ref, hparam.quantize_channels - 1)
    elif is_mulaw(hparam.input_type):
        ref = P.inv_mulaw(ref, hparam.quantize_channels - 1)

    if hparam.postprocess is not None and hparam.postprocess not in ["", "none"]:
        ref = getattr(audio, hparam.postprocess)(ref)

    if hparam.global_gain_scale > 0:
        ref /= hparam.global_gain_scale

    ref = np.clip(ref, -1.0, 1.0)
    wavfile.write(target_wav_path_, hparam.sample_rate, to_int16(ref))
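This function (and a later snippet that writes `to_int16(gen)`) relies on a `to_int16` helper that is not shown in this collection. A plausible minimal version, assuming the input is a float waveform already clipped to [-1, 1] as above; the exact implementation in the source project may differ:

import numpy as np

def to_int16(x):
    # Hypothetical helper: scale a float waveform in [-1, 1] to the
    # int16 range expected by scipy.io.wavfile.write.
    if x.dtype == np.int16:
        return x
    return (x * 32767).astype(np.int16)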
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant",
                       constant_values=0)

        # shape adjust
        p = p.reshape(1, 1, -1)

    # (T,)
    if mulaw:
        x = P.mulaw_quantize(x)
        x_org = P.inv_mulaw_quantize(x)
        # (C, T)
        x = to_categorical(x, num_classes=256).T
        # (1, C, T)
        x = x.reshape(1, 256, -1).astype(np.float32)
    else:
        x_org = x
        x = x.reshape(1, 1, -1)

    if returns_power:
        return x, x_org, p

    return x, x_org
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Multiple waveforms can be generated in a single batch.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray or list): Conditional features, of shape T x C
        g (scalar or list): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray or list : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    # Prepare local conditioning
    batch_size = 1
    output_should_be_list = False
    if c is None:
        assert length is not None
    else:
        if type(c) == list:
            output_should_be_list = True
            c = [_to_numpy(x) for x in c]
            for x in c:
                if x.ndim != 2:
                    raise RuntimeError(
                        "Expected 2-dim shape (T, {}) for the conditional feature, "
                        "but {} was actually given.".format(hparams.cin_channels, x.shape))
                assert x.ndim == 2
            batch_size = len(c)
            batch = np.zeros([batch_size, max([x.shape[0] for x in c]), c[0].shape[1]])
            for i in range(batch_size):
                batch[i, :c[i].shape[0], :] = c[i][:, :]
            upsample_factor = audio.get_hop_size()
            # length_list : used to cut silence when batch_size > 1
            length_list = [x.shape[0] * upsample_factor for x in c]
            length = max(length_list)
            if not hparams.upsample_conditional_features:
                batch = np.repeat(batch, upsample_factor, axis=1)
            # B x C x T
            c = torch.FloatTensor(np.transpose(batch, [0, 2, 1]))
        else:
            c = _to_numpy(c)
            # (Tc, D)
            if c.ndim != 2:
                raise RuntimeError(
                    "Expected 2-dim shape (T, {}) for the conditional feature, "
                    "but {} was actually given.".format(hparams.cin_channels, c.shape))
            assert c.ndim == 2
            Tc = c.shape[0]
            upsample_factor = audio.get_hop_size()
            # Overwrite length according to feature size
            length = Tc * upsample_factor
            # (Tc, D) -> (Tc', D)
            # Repeat features before feeding it to the network
            if not hparams.upsample_conditional_features:
                c = np.repeat(c, upsample_factor, axis=0)
            # B x C x T
            c = torch.FloatTensor(c.T).unsqueeze(0)

    # Prepare initial_input
    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0
    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.repeat(batch_size, 1, 1)

    # Prepare global conditioning
    if type(g) == list:
        g = [_to_numpy(x) for x in g]
        g = torch.LongTensor(g)
    elif g is not None:
        g = _to_numpy(g)
        g = torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
            quantize=True, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(batch_size, -1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(batch_size, -1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(batch_size, -1).cpu().data.numpy()

    if output_should_be_list:
        return [y_hat[i, :length_list[i]] for i in range(batch_size)]
    else:
        return y_hat[0, :]
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        if c.ndim != 2:
            raise RuntimeError(
                "Expected 2-dim shape (T, {}) for the conditional feature, "
                "but {} was actually given.".format(hparams.cin_channels, c.shape))
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = torch.FloatTensor(c.T).unsqueeze(0)

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)

    g = None if g is None else torch.LongTensor([g])

    # Transform data to GPU
    initial_input = initial_input.to(device)
    g = None if g is None else g.to(device)
    c = None if c is None else c.to(device)

    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
            quantize=True, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    if hparams.postprocess is not None and hparams.postprocess not in ["", "none"]:
        y_hat = getattr(audio, hparams.postprocess)(y_hat)

    if hparams.global_gain_scale > 0:
        y_hat /= hparams.global_gain_scale

    return y_hat
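A hedged usage sketch for this `wavegen` variant. The checkpoint path and feature file below are placeholders, and `build_model`, `device`, `hparams`, and `sf` are assumed to come from the surrounding project, as in the other snippets here:

import numpy as np
import torch

model = build_model().to(device)
checkpoint = torch.load("checkpoints/checkpoint_latest.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])

# (T, num_mels) conditional features produced by the project's preprocessing
c = np.load("dump/some-utterance-feats.npy")

# length is derived from c (T * hop_size); returns a 1-D float waveform
waveform = wavegen(model, c=c, fast=True)
sf.write("generated.wav", waveform, hparams.sample_rate)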
def save_states(global_step, writer, y_hat, y, y_student, input_lengths,
                mu=None, checkpoint_dir=None):
    """
    :param global_step:
    :param writer:
    :param y_hat: distribution parameters output by the teacher
    :param y: target
    :param y_student: student output
    :param input_lengths:
    :param mu: student mu
    :param checkpoint_dir:
    :return:
    """
    print("Save intermediate states at step {}".format(global_step))
    idx = np.random.randint(0, len(y_hat))
    length = input_lengths[idx].data.cpu().numpy()
    if mu is not None:
        mu = mu[idx]

    # (B, C, T)
    if y_hat.dim() == 4:
        y_hat = y_hat.squeeze(-1)

    if is_mulaw_quantize(hparams.input_type):
        # (B, T)
        y_hat = F.softmax(y_hat, dim=1).max(1)[1]

        # (T,)
        y_hat = y_hat[idx].data.cpu().long().numpy()
        y = y[idx].view(-1).data.cpu().long().numpy()

        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y = P.inv_mulaw_quantize(y, hparams.quantize_channels)
    else:
        # (B, T)
        y_hat = sample_from_discretized_mix_logistic(
            y_hat, log_scale_min=hparams.log_scale_min)

        # (T,)
        y_hat = y_hat[idx].view(-1).data.cpu().numpy()
        y = y[idx].view(-1).data.cpu().numpy()

        if is_mulaw(hparams.input_type):
            y_hat = P.inv_mulaw(y_hat, hparams.quantize_channels)
            y = P.inv_mulaw(y, hparams.quantize_channels)

    # Mask by length
    y_hat[length:] = 0
    y[length:] = 0
    y_student = y_student.data.cpu().numpy()
    y_student = y_student[idx].reshape(y_student.shape[-1])
    mu = to_numpy(mu)

    # Save audio every 1000 steps
    audio_dir = join(checkpoint_dir, "audio")
    if global_step % 1000 == 0:
        os.makedirs(audio_dir, exist_ok=True)
        path = join(audio_dir, "step{:09d}_teacher.wav".format(global_step))
        librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
        path = join(audio_dir, "step{:09d}_target.wav".format(global_step))
        librosa.output.write_wav(path, y, sr=hparams.sample_rate)
        path = join(audio_dir, "step{:09d}_student.wav".format(global_step))
        librosa.output.write_wav(path, y_student, sr=hparams.sample_rate)

    # Save waveform plots every 200 steps
    if global_step % 200 == 0:
        path = join(audio_dir, "wave_step{:09d}.png".format(global_step))
        save_waveplot(path, y_student=y_student, y_target=y, y_teacher=y_hat,
                      student_mu=mu)
def main():
    args = get_arguments()
    if args.hparams is not None:
        hparams.parse(args.hparams)
    if not hparams.gc_enable:
        hparams.global_cardinality = None
        hparams.global_channel = None
    print(hparams_debug_string())

    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=False,
        gpu_options=tf.GPUOptions(allow_growth=True)))

    net = WaveNetModel(
        batch_size=1,
        dilations=hparams.dilations,
        filter_width=hparams.filter_width,
        residual_channels=hparams.residual_channels,
        dilation_channels=hparams.dilation_channels,
        skip_channels=hparams.skip_channels,
        quantization_channels=hparams.quantization_channels,
        use_biases=hparams.use_biases,
        scalar_input=hparams.scalar_input,
        initial_filter_width=hparams.initial_filter_width,
        local_condition_channel=hparams.num_mels,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_factor=hparams.upsample_factor,
        global_cardinality=hparams.global_cardinality,
        global_channel=hparams.global_channel)

    samples = tf.placeholder(tf.int32)
    local_ph = tf.placeholder(tf.float32, shape=(1, hparams.num_mels))

    sess.run(tf.global_variables_initializer())
    variables_to_restore = {
        var.name[:-2]: var for var in tf.global_variables()
        if not ('state_buffer' in var.name or 'pointer' in var.name)}
    saver = tf.train.Saver(variables_to_restore)

    print('Restoring model from {}'.format(args.checkpoint))
    saver.restore(sess, args.checkpoint)

    tmp_global_condition = None
    upsample_factor = audio.get_hop_size()

    generate_list = []
    with open(args.eval_txt, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            if line is not None:
                line = line.strip().split('|')
                npy_path = os.path.join(hparams.NPY_DATAROOT, line[1])
                tmp_local_condition = np.load(npy_path).astype(np.float32)
                if len(line) == 5:
                    tmp_global_condition = int(line[4])
                if hparams.global_channel is None:
                    tmp_global_condition = None
                generate_list.append(
                    (tmp_local_condition, tmp_global_condition, line[1]))

    for local_condition, global_condition, npy_path in generate_list:
        wav_id = npy_path.split('-mel')[0]
        wav_out_path = "wav/{}_gen.wav".format(wav_id)

        if not hparams.upsample_conditional_features:
            local_condition = np.repeat(local_condition, upsample_factor, axis=0)
        else:
            local_condition = np.expand_dims(local_condition, 0)
            local_condition = net.create_upsample(local_condition)
            local_condition = tf.squeeze(local_condition, [0]).eval(session=sess)

        next_sample = net.predict_proba_incremental(samples, local_ph,
                                                    global_condition)
        sess.run(net.init_ops)

        quantization_channels = hparams.quantization_channels
        # Silence with a single random sample at the end.
        waveform = [quantization_channels / 2] * (net.receptive_field - 1)
        waveform.append(np.random.randint(quantization_channels))

        sample_len = local_condition.shape[0]
        for step in tqdm(range(0, sample_len)):
            outputs = [next_sample]
            outputs.extend(net.push_ops)
            window = waveform[-1]

            # Run the WaveNet to predict the next sample.
            prediction = sess.run(
                outputs,
                feed_dict={samples: window,
                           local_ph: local_condition[step:step + 1, :]})[0]

            # Scale prediction distribution using temperature.
            np.seterr(divide='ignore')
            scaled_prediction = np.log(prediction) / args.temperature
            scaled_prediction = (scaled_prediction -
                                 np.logaddexp.reduce(scaled_prediction))
            scaled_prediction = np.exp(scaled_prediction)
            np.seterr(divide='warn')

            # print(quantization_channels, scaled_prediction)
            sample = np.random.choice(
                np.arange(quantization_channels), p=scaled_prediction)
            waveform.append(sample)

            # If we have partial writing, save the result so far.
            if (wav_out_path and args.save_every
                    and (step + 1) % args.save_every == 0):
                out = P.inv_mulaw_quantize(np.array(waveform),
                                           quantization_channels)
                write_wav(out, hparams.sample_rate, wav_out_path)

        # Introduce a newline to clear the carriage return from the progress.
        print()

        # Save the result as a wav file.
        if wav_out_path:
            out = P.inv_mulaw_quantize(
                np.array(waveform).astype(np.int16), quantization_channels)
            # out = P.inv_mulaw_quantize(np.asarray(waveform), quantization_channels)
            write_wav(out, hparams.sample_rate, wav_out_path)

    print('Finished generating.')
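The temperature trick in the sampling loop above is just a renormalized softmax over log-probabilities: p_i' = p_i^(1/t) / sum_j p_j^(1/t), sharpening the distribution for t < 1 and flattening it for t > 1. A standalone sketch of the same computation, for clarity (illustrative only; the function name is ours):

import numpy as np

def apply_temperature(prob, temperature):
    # Compute p^(1/t) and renormalize, working in log space for stability
    scaled = np.log(prob) / temperature
    scaled = scaled - np.logaddexp.reduce(scaled)
    return np.exp(scaled)

p = np.array([0.1, 0.2, 0.7])
assert np.allclose(apply_temperature(p, 1.0), p)  # t = 1 is a no-op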
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (N, D)
        assert c.ndim == 2
        # (T, D)
        if not hparams.upsample_conditional_features:
            upsample_factor = audio.get_hop_size()
            c = np.repeat(c, upsample_factor, axis=0)
        length = c.shape[0]

        # B x C x T
        c = c.T.reshape(1, -1, length)
        c = Variable(torch.FloatTensor(c))

    if initial_value is None:
        initial_value = P.mulaw_quantize(0)  # dummy silence

    assert initial_value >= 0 and initial_value < 256
    initial_input = np_utils.to_categorical(
        initial_value, num_classes=256).astype(np.float32)
    initial_input = Variable(torch.from_numpy(initial_input)).view(1, 1, 256)

    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True)
    y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
    y_hat = P.inv_mulaw_quantize(y_hat)

    return y_hat
    # Generate
    waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value,
                       fast=True, tqdm=_tqdm)

    # save
    librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate)
    librosa.output.write_wav(target_wav_path, P.inv_mulaw_quantize(x),
                             sr=hparams.sample_rate)

    # log
    if output_html:
        print("""
<audio controls="controls" >
<source src="/{}/audio/{}/{}" autoplay/>
Your browser does not support the audio element.
</audio>
""".format(hparams.name, dst_dir_name, basename(dst_wav_path)))

print("Finished! Check out {} for generated audio samples.".format(dst_dir))
sys.exit(0)
        target_wav_path = join(dst_dir, "{}_{}{}_target.wav".format(
            idx, checkpoint_name, file_name_suffix))
    else:
        dst_wav_path = join(dst_dir, "speaker{}_{}_{}{}_predicted.wav".format(
            g, idx, checkpoint_name, file_name_suffix))
        target_wav_path = join(dst_dir, "speaker{}_{}_{}{}_target.wav".format(
            g, idx, checkpoint_name, file_name_suffix))

    # Generate
    waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value,
                       fast=True, tqdm=_tqdm)

    # save
    librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate)
    if is_mulaw_quantize(hparams.input_type):
        x = P.inv_mulaw_quantize(x, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        x = P.inv_mulaw(x, hparams.quantize_channels)
    librosa.output.write_wav(target_wav_path, x, sr=hparams.sample_rate)

    # log
    if output_html:
        print("""
<audio controls="controls" >
<source src="/{}/audio/{}/{}" autoplay/>
Your browser does not support the audio element.
</audio>
""".format(hparams.name, dst_dir_name, basename(dst_wav_path)))

print("Finished! Check out {} for generated audio samples.".format(dst_dir))
del tee
def test_incremental_forward_correctness():
    import librosa.display
    from matplotlib import pyplot as plt

    model = build_compact_model().to(device)

    checkpoint_path = join(dirname(__file__), "..",
                           "foobar/checkpoint_step000058000.pth")
    if exists(checkpoint_path):
        print("Loading from:", checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])

    sr = 4000
    x, x_org = _test_data(sr=sr, N=3000)
    x = torch.from_numpy(x).contiguous().to(device)

    model.eval()

    # Batch forward
    y_offline = model(x, softmax=True)

    # Test from zero start
    y_online = model.incremental_forward(
        initial_input=None, T=100, tqdm=tqdm, softmax=True)

    # Incremental forward with forced teaching
    y_online = model.incremental_forward(
        test_inputs=x, tqdm=tqdm, softmax=True, quantize=False)

    # (1 x C x T)
    c = (y_offline - y_online).abs()
    print(c.mean(), c.max())

    try:
        assert np.allclose(y_offline.cpu().data.numpy(),
                           y_online.cpu().data.numpy(), atol=1e-4)
    except Exception:
        from warnings import warn
        warn("oops! must be a bug!")

    # (1, T, C)
    xt = x.transpose(1, 2).contiguous()

    initial_input = xt[:, 0, :].unsqueeze(1).contiguous()
    print(initial_input.size())
    print("Initial value:", initial_input.view(-1).max(0)[1])

    # With zero start
    zerostart = True
    if zerostart:
        y_inference = model.incremental_forward(
            initial_input=initial_input, T=xt.size(1), tqdm=tqdm,
            softmax=True, quantize=True)
    else:
        # Feed a few samples as test_inputs and then generate auto-regressively
        N = 1000
        y_inference = model.incremental_forward(
            initial_input=None, test_inputs=xt[:, :N, :], T=xt.size(1),
            tqdm=tqdm, softmax=True, quantize=True)

    # Waveforms
    # (T,)
    y_offline = y_offline.max(1)[1].view(-1)
    y_online = y_online.max(1)[1].view(-1)
    y_inference = y_inference.max(1)[1].view(-1)

    y_offline = P.inv_mulaw_quantize(y_offline.cpu().data.long().numpy())
    y_online = P.inv_mulaw_quantize(y_online.cpu().data.long().numpy())
    y_inference = P.inv_mulaw_quantize(y_inference.cpu().data.long().numpy())

    plt.figure(figsize=(16, 10))
    plt.subplot(4, 1, 1)
    librosa.display.waveplot(x_org, sr=sr)
    plt.subplot(4, 1, 2)
    librosa.display.waveplot(y_offline, sr=sr)
    plt.subplot(4, 1, 3)
    librosa.display.waveplot(y_online, sr=sr)
    plt.subplot(4, 1, 4)
    librosa.display.waveplot(y_inference, sr=sr)
    plt.show()

    save_audio = False
    if save_audio:
        librosa.output.write_wav("target.wav", x_org, sr=sr)
        librosa.output.write_wav("online.wav", y_online, sr=sr)
        librosa.output.write_wav("inference.wav", y_inference, sr=sr)
"-feats", "") # Paths if g is None: dst_wav_path = join(dst_dir, "{}_gen.wav".format(name)) target_wav_path = join(dst_dir, "{}_ref.wav".format(name)) else: dst_wav_path = join(dst_dir, "speaker{}_{}_gen.wav".format(g, name)) target_wav_path = join(dst_dir, "speaker{}_{}_ref.wav".format(g, name)) # save if has_ref_file: if is_mulaw_quantize(hparams.input_type): ref = P.inv_mulaw_quantize(ref, hparams.quantize_channels - 1) elif is_mulaw(hparams.input_type): ref = P.inv_mulaw(ref, hparams.quantize_channels - 1) if hparams.postprocess is not None and hparams.postprocess not in [ "", "none" ]: ref = getattr(audio, hparams.postprocess)(ref) if hparams.global_gain_scale > 0: ref /= hparams.global_gain_scale # clip (just in case) gen = np.clip(gen, -1.0, 1.0) if has_ref_file: ref = np.clip(ref, -1.0, 1.0) wavfile.write(dst_wav_path, hparams.sample_rate, to_int16(gen))
def main(args):
    model = ModelWrapper()
    model.eval()

    if args["--downsample_interval"] is None:
        raise ValueError("Must specify downsample fraction with --downsample_interval")
    downsample_interval = int(args["--downsample_interval"])

    receptive_field = model.receptive_field

    # Change the output dir if you want
    writing_dir = args["<output-dir>"]
    os.makedirs(writing_dir, exist_ok=True)
    print("writing dir: {}".format(writing_dir))

    # Load up a sample
    x_original = librosa.core.load(args["<input-file>"],
                                   sr=hparams.sample_rate, mono=True)[0]

    # Hacky way to allow processing some or all of the file
    global SAMPLE_SIZE
    if SAMPLE_SIZE == -1:
        SAMPLE_SIZE = x_original.shape[0]
    x_original = x_original[:SAMPLE_SIZE]

    # Normalize to reduce encoding artifacts
    x_original /= abs(x_original).max()
    sf.write(os.path.join(writing_dir, "x_original.wav"),
             x_original, hparams.sample_rate)

    # Cut the sampling rate
    x_modified = x_original[::downsample_interval]
    x_modified_out = librosa.core.resample(
        x_modified, int(hparams.sample_rate / downsample_interval),
        hparams.sample_rate)
    sf.write(join(writing_dir, "x_modified.wav"),
             x_modified_out, hparams.sample_rate)
    x_modified = P.mulaw_quantize(x_modified, hparams.quantize_channels - 1)

    # Update constraint mask for super resolution. Masked spots don't update
    mask = np.ones_like(x_original)
    mask[::downsample_interval] = 0
    mask = torch.Tensor(mask).unsqueeze(0).to(device)

    # Initialize with noise for the samples we need to fill in, or x_original
    # for the samples we are allowed to use
    noise = np.random.uniform(0, 256, size=x_original.shape)
    mask_np = mask[0].detach().cpu().numpy()
    x = (P.mulaw_quantize(x_original, hparams.quantize_channels - 1) *
         (1 - mask_np) + noise * mask_np)
    x = torch.FloatTensor(x).unsqueeze(0).to(device)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7, 54.3, 42.9, 34.0, 26.8, 21.2, 16.8, 13.3,
              10.5, 8.29, 6.55, 5.18, 4.1, 3.24, 2.56, 1.6, 1.0, 0.625,
              0.39, 0.244, 0.15, 0.1]

    for idx, sigma in enumerate(sigmas):
        # Make sure each sample is updated on average N_STEPS times
        n_steps_sgld = int((SAMPLE_SIZE / (SGLD_WINDOW * BATCH_SIZE)) * N_STEPS)
        print("Number of SGLD steps {}".format(n_steps_sgld))

        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], CHECKPOINTS[sigma],
                               "checkpoint_latest_ema.pth")
        model.load_checkpoint(checkpoint_path)
        parmodel = torch.nn.DataParallel(model)
        parmodel.to(device)

        eta = .05 * (sigma ** 2)

        for i in range(n_steps_sgld):
            # need to get a good sampling of the beginning/end (boundary effects)
            # to understand this: think about how often we would update
            # x[receptive_field] (the first point) if we only sampled
            # U(receptive_field, x0.shape - receptive_field - SGLD_WINDOW)
            j = np.random.randint(-SGLD_WINDOW, x.shape[1], BATCH_SIZE)
            j = np.maximum(j, 0)
            j = np.minimum(j, x.shape[1] - SGLD_WINDOW)

            patches = []
            for k in range(BATCH_SIZE):
                patches.append(x[:, j[k]:j[k] + SGLD_WINDOW])
            patches = torch.stack(patches, axis=0)

            # Forward pass
            log_prob, prediction = parmodel(patches, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, patches)[0]
            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(
                0, 1, size=x_update.shape, device=device)
            x_update += epsilon

            with torch.no_grad():
                for k in range(BATCH_SIZE):
                    x_update[k] *= mask[:, j[k]:j[k] + SGLD_WINDOW]
                    x[:, j[k]:j[k] + SGLD_WINDOW] += x_update[k]

            if (not i % 20) or (i == (n_steps_sgld - 1)):
                # debugging
                print("--------------")
                print('sigma = {}'.format(sigma))
                print('eta = {}'.format(eta))
                print("i {}".format(i))
                print("Max sample {}".format(abs(x).max()))
                print('Mean sample logpx: {}'.format(
                    log_prob / (BATCH_SIZE * SGLD_WINDOW)))
                print("Max gradient update: {}".format(eta * abs(grad).max()))

        t0 = time.time()
        out = P.inv_mulaw_quantize(x[0].detach().cpu().numpy(),
                                   hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write(os.path.join(writing_dir, "out_{}.wav".format(sigma)),
                 out, hparams.sample_rate)
def main(args):
    model = ModelWrapper()
    model.eval()
    receptive_field = model.receptive_field

    hparams.max_time_steps = SAMPLE_SIZE
    test_data_loader = get_data_loader(args["<dump-root>"], collate_fn)

    # Change the output dir if you want
    writing_dir = args["<output-dir>"]
    if not exists(writing_dir):
        os.makedirs(writing_dir)
    print("writing dir: {}".format(writing_dir))

    (x_original, y, c, g, input_lengths) = next(iter(test_data_loader))
    c = c.to(device)
    sanity_check(model.model, c, g)

    # Write inputs
    x_original_out = P.inv_mulaw_quantize(x_original, hparams.quantize_channels - 1)
    sf.write(join(writing_dir, "original.wav"), x_original_out[0, 0],
             hparams.sample_rate)

    # Initialize with noise
    x = torch.FloatTensor(np.random.uniform(
        0, 256, size=(1, x_original.shape[-1] + 1))).to(device)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7, 42.9, 26.8, 16.8, 10.5, 6.55, 4.1, 2.56,
              1.6, 1.0, 0.625, 0.39, 0.244, 0.1]

    t0 = time.time()
    for idx, sigma in enumerate(sigmas):
        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], CHECKPOINTS[sigma],
                               "checkpoint_latest_ema.pth")
        model.load_checkpoint(checkpoint_path)
        parmodel = torch.nn.DataParallel(model)
        parmodel.to(device)

        eta = .1 * (sigma ** 2)

        # Make sure each sample is updated on average N_STEPS times
        n_steps_sgld = int((SAMPLE_SIZE / (SGLD_WINDOW * BATCH_SIZE)) * N_STEPS)
        print("Number of SGLD steps {}".format(n_steps_sgld))

        for i in range(n_steps_sgld):
            # Sample a random chunk of the spectrogram, accounting for padding.
            # need to get a good sampling of the beginning/end (boundary effects)
            # to understand this: think about how often we would update x[0]
            # (the first point) if we only sampled
            # U(0, c.shape - receptive_field - SGLD_WINDOW)
            j = np.random.randint(
                hparams.cin_pad - SGLD_WINDOW // hparams.hop_size,
                c.shape[-1] - hparams.cin_pad, BATCH_SIZE)
            j = np.maximum(j, hparams.cin_pad)
            j = np.minimum(j, c.shape[-1] - hparams.cin_pad -
                           (SGLD_WINDOW // hparams.hop_size))

            # Get the corresponding start of the waveform
            x_start = (j - hparams.cin_pad) * hparams.hop_size

            patches_c = []
            patches_x = []
            for k in range(BATCH_SIZE):
                patches_c.append(
                    c[0, :, j[k] - hparams.cin_pad:
                      j[k] + hparams.cin_pad + (SGLD_WINDOW // hparams.hop_size)])
                patches_x.append(x[:, x_start[k]:x_start[k] + SGLD_WINDOW + 1])
            patches_c = torch.stack(patches_c, axis=0)
            patches_x = torch.stack(patches_x, axis=0)

            # Forward pass
            log_prob, prediction0 = parmodel(patches_x, c=patches_c, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, patches_x)[0]
            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(
                0, 1, size=x_update.shape, device=device)
            x_update += epsilon

            with torch.no_grad():
                for k in range(BATCH_SIZE):
                    x[:, x_start[k]:x_start[k] + SGLD_WINDOW + 1] += x_update[k]

            if (not i % 20) or (i == (n_steps_sgld - 1)):
                # debugging
                print("--------------")
                print("i {}".format(i))
                print("Max sample {}".format(abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / x.shape[-1]))
                print("Max gradient update: {}".format(eta * abs(grad).max()))

        out = P.inv_mulaw_quantize(x[0, 1:].detach().cpu().numpy(),
                                   hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write(join(writing_dir, "out_{}.wav".format(sigma)),
                 out, hparams.sample_rate)

    final_time = time.time()
    with open(join(writing_dir, "info.json"), "w") as f:
        json.dump({"time": float(final_time - t0)}, f, indent=4)
def main(args):
    model = build_model().to(device)
    model.eval()
    receptive_field = model.receptive_field

    test_data_loader = get_data_loader(args["<dump-root>"], collate_fn)
    (x, y, c, g, input_lengths) = next(iter(test_data_loader))
    # cin_pad = hparams.cin_pad
    # if cin_pad > 0:
    #     c = F.pad(c, pad=(cin_pad, cin_pad), mode="replicate")
    c = c.to(device)

    sanity_check(model, c, g)

    # Write inputs
    x_original_out = inv_linear_quantize(x, hparams.quantize_channels - 1)
    x_original_out = P.inv_mulaw_quantize(x, hparams.quantize_channels - 1)
    sf.write("x_original.wav", x_original_out[0, 0], hparams.sample_rate)

    # Initialize with noise
    x = torch.FloatTensor(np.random.uniform(
        -512, 700, size=(1, x.shape[-1] + 1))).to(device)
    # x = F.pad(x, (receptive_field, 0), "constant", 127)
    x.requires_grad = True

    sigmas = [175.9, 110., 68.7, 42.9, 26.8, 16.8, 10.5, 6.55, 4.1, 2.56,
              1.6, 1.0, 0.625, 0.39, 0.1]
    start_sigma = 256.
    end_sigma = 0.1

    for idx, sigma in enumerate(sigmas):
        n_steps = 200

        # Bump down a model
        checkpoint_path = join(args["<checkpoint>"], checkpoints[sigma],
                               "checkpoint_latest.pth")
        print("Load checkpoint0 from {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])

        eta = .02 * (sigma ** 2)
        gamma = 15 * (1.0 / sigma) ** 2

        for i in range(n_steps):
            # Seed with noised up GT, good for unconditional generation
            # x0[0, :receptive_field] = torch.FloatTensor(x0_original[:receptive_field] + np.random.normal(0, sigma, x0_original[:receptive_field].shape)).to(device)
            # x1[0, :receptive_field] = torch.FloatTensor(x1_original[:receptive_field] + np.random.normal(0, sigma, x1_original[:receptive_field].shape)).to(device)

            # Seed with noised up silence
            # x0[0, :receptive_field] = torch.FloatTensor(np.random.normal(127, sigma, x0_original[:receptive_field].shape)).to(device)
            # x1[0, :receptive_field] = torch.FloatTensor(np.random.normal(127, sigma, x1_original[:receptive_field].shape)).to(device)

            # Forward pass
            log_prob, prediction = model.smoothed_loss(x, c=c, sigma=sigma)
            log_prob = torch.sum(log_prob)
            grad = torch.autograd.grad(log_prob, x)[0]
            x_update = eta * grad

            # Langevin step
            epsilon = np.sqrt(2 * eta) * torch.normal(
                0, 1, size=(1, x.shape[-1]), device=device)
            x_update += epsilon

            with torch.no_grad():
                x += x_update

            if (not i % 20) or (i == (n_steps - 1)):
                # debugging
                print("--------------")
                print('sigma = {}'.format(sigma))
                print('eta = {}'.format(eta))
                print("i {}".format(i))
                print("Max sample {}".format(abs(x).max()))
                print('Mean sample logpx: {}'.format(log_prob / x.shape[-1]))
                print("Max gradient update: {}".format(eta * abs(grad).max()))

        out = P.inv_mulaw_quantize(x[0, 1:].detach().cpu().numpy(),
                                   hparams.quantize_channels - 1)
        # out = inv_linear_quantize(x[0].detach().cpu().numpy(), hparams.quantize_channels - 1)
        out = np.clip(out, -1, 1)
        sf.write("out_{}.wav".format(sigma), out, hparams.sample_rate)
def eval_model(global_step, writer, model, y, c, g, input_lengths, eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(model, ema)

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().numpy()[0]

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0
    print("Initial value:", initial_value)

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1).fill_(initial_value))
    initial_input = initial_input.cuda() if use_cuda else initial_input

    # Run the model in fast eval mode
    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
        tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)
def wavegen(model, length=None, c=None, g=None, initial_value=None,
            fast=False, tqdm=tqdm):
    """Generate waveform samples by WaveNet.

    Args:
        model (nn.Module) : WaveNet decoder
        length (int): Time steps to generate. If conditional features are given,
          then this is determined by the feature size.
        c (numpy.ndarray): Conditional features, of shape T x C
        g (scalar): Speaker ID
        initial_value (int) : initial_value for the WaveNet decoder.
        fast (Bool): Whether to remove weight normalization or not.
        tqdm (lambda): tqdm

    Returns:
        numpy.ndarray : Generated waveform samples
    """
    from train import sanity_check
    sanity_check(model, c, g)

    c = _to_numpy(c)
    g = _to_numpy(g)

    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    if c is None:
        assert length is not None
    else:
        # (Tc, D)
        assert c.ndim == 2
        Tc = c.shape[0]
        upsample_factor = audio.get_hop_size()
        # Overwrite length according to feature size
        length = Tc * upsample_factor
        # (Tc, D) -> (Tc', D)
        # Repeat features before feeding it to the network
        if not hparams.upsample_conditional_features:
            c = np.repeat(c, upsample_factor, axis=0)

        # B x C x T
        c = Variable(torch.FloatTensor(c.T).unsqueeze(0))

    if initial_value is None:
        if is_mulaw_quantize(hparams.input_type):
            initial_value = P.mulaw_quantize(0, hparams.quantize_channels)
        else:
            initial_value = 0.0

    if is_mulaw_quantize(hparams.input_type):
        assert initial_value >= 0 and initial_value < hparams.quantize_channels
        initial_input = np_utils.to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = Variable(torch.from_numpy(initial_input)).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = Variable(torch.zeros(1, 1, 1)).fill_(initial_value)

    g = None if g is None else Variable(torch.LongTensor([g]))
    if use_cuda:
        initial_input = initial_input.cuda()
        g = None if g is None else g.cuda()
        c = None if c is None else c.cuda()

    y_hat = model.incremental_forward(
        initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True,
        quantize=True, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(
            y_hat.view(-1).cpu().data.numpy(), hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    return y_hat
def eval_model(global_step, writer, device, model, y, c, g, input_lengths,
               eval_dir, ema=None):
    if ema is not None:
        print("Using averaged model for evaluation")
        model = clone_as_averaged_model(device, model, ema)
        model.make_generation_fast_()

    model.eval()
    idx = np.random.randint(0, len(y))
    length = input_lengths[idx].data.cpu().item()

    # (T,)
    y_target = y[idx].view(-1).data.cpu().numpy()[:length]

    if c is not None:
        if hparams.upsample_conditional_features:
            c = c[idx, :, :length // audio.get_hop_size() +
                  hparams.cin_pad * 2].unsqueeze(0)
        else:
            c = c[idx, :, :length].unsqueeze(0)
        assert c.dim() == 3
        print("Shape of local conditioning features: {}".format(c.size()))
    if g is not None:
        # TODO: test
        g = g[idx]
        print("Shape of global conditioning features: {}".format(g.size()))

    # Dummy silence
    if is_mulaw_quantize(hparams.input_type):
        initial_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        initial_value = P.mulaw(0.0, hparams.quantize_channels)
    else:
        initial_value = 0.0

    # (C,)
    if is_mulaw_quantize(hparams.input_type):
        initial_input = to_categorical(
            initial_value, num_classes=hparams.quantize_channels).astype(np.float32)
        initial_input = torch.from_numpy(initial_input).view(
            1, 1, hparams.quantize_channels)
    else:
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value)
    initial_input = initial_input.to(device)

    # Run the model in fast eval mode
    with torch.no_grad():
        y_hat = model.incremental_forward(
            initial_input, c=c, g=g, T=length, softmax=True, quantize=True,
            tqdm=tqdm, log_scale_min=hparams.log_scale_min)

    if is_mulaw_quantize(hparams.input_type):
        y_hat = y_hat.max(1)[1].view(-1).long().cpu().data.numpy()
        y_hat = P.inv_mulaw_quantize(y_hat, hparams.quantize_channels - 1)
        y_target = P.inv_mulaw_quantize(y_target, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        y_hat = P.inv_mulaw(y_hat.view(-1).cpu().data.numpy(),
                            hparams.quantize_channels)
        y_target = P.inv_mulaw(y_target, hparams.quantize_channels)
    else:
        y_hat = y_hat.view(-1).cpu().data.numpy()

    # Save audio
    os.makedirs(eval_dir, exist_ok=True)
    path = join(eval_dir, "step{:09d}_predicted.wav".format(global_step))
    librosa.output.write_wav(path, y_hat, sr=hparams.sample_rate)
    path = join(eval_dir, "step{:09d}_target.wav".format(global_step))
    librosa.output.write_wav(path, y_target, sr=hparams.sample_rate)

    # save figure
    path = join(eval_dir, "step{:09d}_waveplots.png".format(global_step))
    save_waveplot(path, y_hat, y_target)

    # add audio and figures to tensorboard
    writer.add_audio('target_audio', y_target, global_step, hparams.sample_rate)
    writer.add_audio('generated_audio', y_hat, global_step, hparams.sample_rate)