def discrete_collate(batch):
    """Build a training batch of randomly cropped, aligned mel/waveform windows.

    Collate function for discrete waveform targets (e.g. 9-bit or mu-law
    quantized audio). Each batch item is indexed as ``item[0]`` (sliced along
    its last axis with a frame window) and ``item[1]`` (sliced along its first
    axis with the matching sample window).

    Returns:
        A tuple ``(x_input, mels, y_coarse)``:
        ``x_input`` is the first ``hp.seq_len`` samples of every window mapped
        to floats (linear rescaling for ``'bits'``, ``inv_mulaw_quantize`` for
        ``'mulaw'``), ``mels`` is the float tensor of stacked frame windows and
        ``y_coarse`` is the integer window shifted by one sample (the
        next-sample targets).
    """
    frame_pad = 2
    frames_per_window = hp.seq_len // hp.hop_size + 2 * frame_pad

    # One random start frame per item, drawn in batch order.
    frame_limits = [
        item[0].shape[-1] - (frames_per_window + 2 * frame_pad)
        for item in batch
    ]
    frame_starts = [np.random.randint(0, limit) for limit in frame_limits]
    # The audio window begins `frame_pad` frames after the mel window does.
    sample_starts = [(start + frame_pad) * hp.hop_size for start in frame_starts]

    mel_windows = [
        item[0][:, start:start + frames_per_window]
        for item, start in zip(batch, frame_starts)
    ]
    # One extra sample so that inputs and shifted targets both have seq_len steps.
    wav_windows = [
        item[1][start:start + hp.seq_len + 1]
        for item, start in zip(batch, sample_starts)
    ]

    mels = torch.FloatTensor(np.stack(mel_windows).astype(np.float32))
    coarse = torch.LongTensor(np.stack(wav_windows).astype(np.int64))

    if hp.input_type == 'bits':
        # Map integer classes in [0, 2**bits - 1] onto [-1, 1].
        x_input = 2 * coarse[:, :hp.seq_len].float() / (2**hp.bits - 1.) - 1.
    elif hp.input_type == 'mulaw':
        x_input = inv_mulaw_quantize(coarse[:, :hp.seq_len],
                                     hp.mulaw_quantize_channels)

    y_coarse = coarse[:, 1:]
    return x_input, mels, y_coarse
def body(time, current_inputs, final_outputs, current_input_buffers,
         current_c_buffers):
    """Run one autoregressive sampling step of the generation loop.

    Feeds the current input window through every layer in ``self.fft_layers``
    (threading the per-layer input/condition buffers), samples one class from
    the softmax posterior, writes the decoded sample into ``final_outputs`` at
    index ``time`` and appends the next input step to the window.

    Relies on ``c``, ``test_inputs`` and ``self`` from the enclosing scope.

    Returns:
        ``(next_time, next_inputs, final_outputs, new_input_buffers,
        new_c_buffers)`` in the same order as the arguments.
    """
    # Conditioning slice for this step; stays None when no condition is given.
    step_c = None if c is None else c[:, time:time + 1, :]

    hidden = current_inputs
    new_input_buffers, new_c_buffers = [], []
    per_layer = zip(self.fft_layers, current_input_buffers, current_c_buffers)
    for fft_layer, in_buffer, c_buffer in per_layer:
        hidden, in_buffer_next, c_buffer_next = fft_layer.incremental_forward(
            inputs=hidden,
            c=step_c,
            input_buffers=in_buffer,
            c_buffers=c_buffer,
        )
        new_input_buffers.append(in_buffer_next)
        new_c_buffers.append(c_buffer_next)

    logits = self.out_layer(hidden)
    posterior = tf.nn.softmax(tf.reshape(logits, [1, -1]), axis=-1)

    # Sampling is delegated to numpy through py_func (one draw, with
    # replacement, weighted by the flattened posterior) rather than using
    # tf.distributions.Categorical or a greedy argmax.
    class_ids = np.arange(self.hp.quantize_channels)
    sample = tf.py_func(np.random.choice,
                        [class_ids, 1, True, tf.reshape(posterior, [-1])],
                        tf.int64)
    sample = tf.reshape(sample, [-1])

    decode_sample = utils.inv_mulaw_quantize(sample, self.hp.quantize_channels)
    final_outputs = final_outputs.write(time, decode_sample)

    if utils.is_mulaw_quantize(self.hp.input_type):
        fed_back = tf.one_hot(tf.cast(sample, tf.int32),
                              self.hp.quantize_channels)
    else:
        fed_back = decode_sample

    next_time = time + 1
    # Drop the oldest step; the new step is appended below.
    history = current_inputs[:, 1:, :]

    # When test inputs are supplied they replace the model's own sample.
    source = fed_back if test_inputs is None else test_inputs[:, next_time]
    next_sample = tf.reshape(source, [1, 1, self.in_channels])

    next_inputs = tf.concat([history, tf.cast(next_sample, tf.float32)],
                            axis=1)
    return next_time, next_inputs, final_outputs, new_input_buffers, new_c_buffers
def synthesize(mel_sp, save_path, weight_path):
    """Synthesize audio from ``mel_sp`` with a WaveNet and save it as a wav.

    Args:
        mel_sp: conditioning features for one utterance; a leading batch axis
            is added before synthesis.
        save_path: destination passed to ``save_wav``.
        weight_path: weights file loaded into the freshly built model.
    """
    model = WaveNet(hparams.num_mels, hparams.upsample_scales)
    model.load_weights(weight_path)

    batched_mel = tf.expand_dims(mel_sp, axis=0)
    # Drop the singleton axes, then decode the model output back to a waveform.
    quantized = np.squeeze(model.synthesis(batched_mel))
    waveform = inv_mulaw_quantize(quantized)

    save_wav(waveform, save_path, hparams.sampling_rate)
def discrete_collate(batch):
    """Collate function used for discrete wav output (9-bit, mulaw-discrete, etc.).

    For every batch item a random mel window of ``mel_win`` frames is cropped
    from ``item[0]`` (last axis) together with the matching audio window of
    ``hp.wav_seq_factor * hp.seq_len + 1`` samples from ``item[1]``.

    If some audio windows come out shorter than the others (e.g. a truncated
    wav file), each short item is replaced, mel and audio together, by a copy
    of the nearest preceding full-length item, wrapping around the batch.

    Returns:
        ``(x_input, mels, y_coarse)``: float model inputs built from the first
        ``hp.wav_seq_factor * hp.seq_len`` samples of each window, the stacked
        mel windows as a float tensor, and the integer targets shifted by one
        sample.

    Raises:
        ValueError: if ``hp.input_type`` is neither ``'bits'`` nor ``'mulaw'``.
    """
    pad = 2
    mel_win = hp.seq_len // hp.hop_size + 2 * pad
    wav_win = hp.wav_seq_factor * hp.seq_len

    max_offsets = [x[0].shape[-1] - (mel_win + 2 * pad) for x in batch]
    mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
    sig_offsets = [(offset + pad) * hp.hop_size for offset in mel_offsets]

    mels = [x[0][:, mel_offsets[i]:mel_offsets[i] + mel_win]
            for i, x in enumerate(batch)]
    # One extra sample so inputs and shifted targets both span wav_win steps.
    coarse = [x[1][sig_offsets[i]:sig_offsets[i] + wav_win + 1]
              for i, x in enumerate(batch)]

    mels = np.stack(mels).astype(np.float32)

    try:
        coarse = np.stack(coarse).astype(np.int64)
    except ValueError:
        # np.stack raises ValueError when the windows differ in length.
        full_len = max(len(c) for c in coarse)
        short = {n for n, c in enumerate(coarse) if len(c) != full_len}
        n_items = len(coarse)
        for bad in sorted(short):
            # Walk backwards (with wrap-around) to the nearest full-length
            # item. At least one item has length full_len, so this terminates.
            donor = (bad - 1) % n_items
            while donor in short:
                donor = (donor - 1) % n_items
            coarse[bad] = coarse[donor].copy()
            mels[bad] = mels[donor].copy()
        coarse = np.stack(coarse).astype(np.int64)

    mels = torch.FloatTensor(mels)
    coarse = torch.LongTensor(coarse)

    if hp.input_type == 'bits':
        # Map integer classes in [0, 2**bits - 1] onto [-1, 1].
        x_input = 2 * coarse[:, :wav_win].float() / (2**hp.bits - 1.) - 1.
    elif hp.input_type == 'mulaw':
        x_input = inv_mulaw_quantize(coarse[:, :wav_win],
                                     hp.mulaw_quantize_channels)
    else:
        raise ValueError(
            "discrete_collate supports hp.input_type 'bits' or 'mulaw', "
            "got {!r}".format(hp.input_type))

    y_coarse = coarse[:, 1:]
    return x_input, mels, y_coarse
def batch_generate(self, mels):
    """Autoregressively generate audio for a batch of mel segments.

    The model is switched to eval mode for generation and back to train mode
    before returning. All tensors are created on the current CUDA device.

    Args:
        mels: 3-D array of shape [batch_size x 80 x mel_length].

    Returns:
        1-D numpy array holding the generated segments concatenated in batch
        order (segment 0 first, then segment 1, ...).

    Raises:
        AssertionError: if ``mels`` is not 3-D.
        ValueError: if ``hp.input_type`` is not one of ``'raw'``,
            ``'mixture'``, ``'bits'`` or ``'mulaw'``.
    """
    # Validate before touching the model so a bad call leaves its mode alone.
    assert len(mels.shape) == 3, "mels should have shape [batch_size x 80 x mel_length]"
    if hp.input_type not in ('raw', 'mixture', 'bits', 'mulaw'):
        raise ValueError(
            "unsupported hp.input_type: {!r}".format(hp.input_type))

    self.eval()
    output = []
    rnn1 = self.get_gru_cell(self.rnn1)
    rnn2 = self.get_gru_cell(self.rnn2)
    b_size = mels.shape[0]

    with torch.no_grad():
        # Previous sample and both GRU states start at zero.
        x = torch.zeros(b_size, 1).cuda()
        h1 = torch.zeros(b_size, self.rnn_dims).cuda()
        h2 = torch.zeros(b_size, self.rnn_dims).cuda()

        mels = torch.FloatTensor(mels).cuda()
        mels, aux = self.upsample(mels)

        # Split the auxiliary features into four equal slices along the last
        # axis, one per stage of the network.
        aux_idx = [self.aux_dims * i for i in range(5)]
        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]

        seq_len = mels.size(1)

        for i in tqdm(range(seq_len)):
            m_t = mels[:, i, :]
            a1_t = a1[:, i, :]
            a2_t = a2[:, i, :]
            a3_t = a3[:, i, :]
            a4_t = a4[:, i, :]

            x = torch.cat([x, m_t, a1_t], dim=1)
            x = self.I(x)
            h1 = rnn1(x, h1)

            x = x + h1
            inp = torch.cat([x, a2_t], dim=1)
            h2 = rnn2(inp, h2)

            x = x + h2
            x = torch.cat([x, a3_t], dim=1)
            x = F.relu(self.fc1(x))

            x = torch.cat([x, a4_t], dim=1)
            x = F.relu(self.fc2(x))
            x = self.fc3(x)

            if hp.input_type == 'raw':
                sample = sample_from_beta_dist(x.unsqueeze(0))
            elif hp.input_type == 'mixture':
                sample = sample_from_discretized_mix_logistic(
                    x.unsqueeze(-1), hp.log_scale_min)
            elif hp.input_type == 'bits':
                posterior = F.softmax(x, dim=1).view(b_size, -1)
                distrib = torch.distributions.Categorical(posterior)
                # Map the sampled class index onto [-1, 1].
                sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
            elif hp.input_type == 'mulaw':
                posterior = F.softmax(x, dim=1).view(b_size, -1)
                distrib = torch.distributions.Categorical(posterior)
                # Exactly one draw per step (no debug print / extra sample).
                sample = inv_mulaw_quantize(distrib.sample(),
                                            hp.mulaw_quantize_channels, True)

            output.append(sample.view(-1))
            # The sample just produced is the next step's input.
            x = sample.view(b_size, 1)

    output = torch.stack(output).cpu().numpy()
    self.train()

    # output is [seq_len x batch_size]; transpose and flatten so the segments
    # are laid out one after another in a single wav of size
    # [batch_size * seq_len].
    assert output.shape[1] == b_size
    output = (output.swapaxes(1, 0)).reshape(-1)
    return output
def generate(self, mels, DEVICE="cuda"):
    """Generate audio sample by sample for a single mel input.

    The model is put in eval mode while generating and returned to train mode
    afterwards.

    Args:
        mels: mel features for one utterance; a batch axis of size 1 is added.
        DEVICE: device on which the state and input tensors are created.

    Returns:
        Numpy array of the stacked per-step samples.
    """
    self.eval()
    generated = []
    cell_1 = self.get_gru_cell(self.rnn1)
    cell_2 = self.get_gru_cell(self.rnn2)

    with torch.no_grad():
        # Previous sample and both recurrent states start from zero.
        x = torch.zeros(1, 1).to(DEVICE)
        state_1 = torch.zeros(1, self.rnn_dims).to(DEVICE)
        state_2 = torch.zeros(1, self.rnn_dims).to(DEVICE)

        mels = torch.FloatTensor(mels).to(DEVICE).unsqueeze(0)
        mels, aux = self.upsample(mels)

        # Four consecutive aux_dims-wide slices of the auxiliary features.
        width = self.aux_dims
        aux_parts = [aux[:, :, width * k:width * (k + 1)] for k in range(4)]

        n_steps = mels.size(1)
        for step in tqdm(range(n_steps)):
            m_t = mels[:, step, :]
            a1_t, a2_t, a3_t, a4_t = [part[:, step, :] for part in aux_parts]

            x = self.I(torch.cat([x, m_t, a1_t], dim=1))
            state_1 = cell_1(x, state_1)
            x = x + state_1

            state_2 = cell_2(torch.cat([x, a2_t], dim=1), state_2)
            x = x + state_2

            x = F.relu(self.fc1(torch.cat([x, a3_t], dim=1)))
            x = F.relu(self.fc2(torch.cat([x, a4_t], dim=1)))
            x = self.fc3(x)

            if hp.input_type == 'raw':
                if hp.distribution == 'beta':
                    sample = sample_from_beta_dist(x.unsqueeze(0))
                elif hp.distribution == 'gaussian':
                    sample = sample_from_gaussian(x.unsqueeze(0))
            elif hp.input_type == 'mixture':
                sample = sample_from_discretized_mix_logistic(
                    x.unsqueeze(-1), hp.log_scale_min)
            elif hp.input_type == 'bits':
                probs = F.softmax(x, dim=1).view(-1)
                drawn = torch.distributions.Categorical(probs).sample()
                # Rescale the class index onto [-1, 1].
                sample = 2 * drawn.float() / (self.n_classes - 1.) - 1.
            elif hp.input_type == 'mulaw':
                probs = F.softmax(x, dim=1).view(-1)
                drawn = torch.distributions.Categorical(probs).sample()
                sample = inv_mulaw_quantize(drawn, hp.mulaw_quantize_channels,
                                            True)

            generated.append(sample.view(-1))
            # Feed the new sample back in as the next step's input.
            x = torch.FloatTensor([[sample]]).to(DEVICE)

    output = torch.stack(generated).cpu().numpy()
    self.train()
    return output
def generate(self, mels, target=11000, overlap=550, batched=True):
    """Generate audio for one mel input, optionally as overlapping folds.

    With ``batched=True`` the upsampled features are folded into overlapping
    segments (``fold_with_overlap``), generated in parallel as a batch and
    recombined with ``xfade_and_unfold``. Otherwise the single sequence is
    generated directly and its first (only) row is returned.

    The model is in eval mode while generating and is returned to train mode
    at the end. Tensors are created on the current CUDA device.

    Args:
        mels: mel features for one utterance; a batch axis of size 1 is added.
        target: passed to ``fold_with_overlap`` / ``xfade_and_unfold``.
        overlap: passed to ``fold_with_overlap`` / ``xfade_and_unfold``.
        batched: whether to fold the sequence into a batch of segments.

    Returns:
        Numpy array with the generated audio.
    """
    self.eval()
    steps_out = []
    gru_cell = self.get_gru_cell(self.rnn1)

    with torch.no_grad():
        mels = torch.FloatTensor(mels).cuda().unsqueeze(0)
        # pad_tensor is applied to the transposed layout, then undone for
        # upsampling.
        mels = self.pad_tensor(mels.transpose(1, 2), pad=hp.pad, side='both')
        mels, aux = self.upsample(mels.transpose(1, 2))

        if batched:
            mels = self.fold_with_overlap(mels, target, overlap)
            aux = self.fold_with_overlap(aux, target, overlap)

        b_size, seq_len, _ = mels.size()

        hidden = torch.zeros(b_size, self.rnn_dims).cuda()
        x = torch.zeros(b_size, 1).cuda()

        # First and second aux_dims-wide slices of the auxiliary features.
        width = self.aux_dims
        aux_first = aux[:, :, 0:width]
        aux_second = aux[:, :, width:2 * width]

        for step in range(seq_len):
            m_t = mels[:, step, :]
            a1_t = aux_first[:, step, :]
            a2_t = aux_second[:, step, :]

            # The last channel of the first aux slice is dropped here.
            x = self.I(torch.cat([x, m_t, a1_t[:, :-1]], dim=1))
            hidden = gru_cell(x, hidden)
            x = x + hidden

            x = F.relu(self.fc1(torch.cat([x, a2_t], dim=1)))
            # fc2 is intentionally not applied on this path.
            x = self.fc3(x)

            if hp.input_type == 'raw':
                sample = sample_from_beta_dist(x.unsqueeze(0)).view(-1)
            elif hp.input_type == 'mixture':
                sample = sample_from_discretized_mix_logistic(
                    x.unsqueeze(-1), hp.log_scale_min)
            elif hp.input_type == 'bits':
                drawn = torch.distributions.Categorical(
                    F.softmax(x, dim=1)).sample()
                # Rescale the class index onto [-1, 1].
                sample = 2 * drawn.float() / (self.n_classes - 1.) - 1.
            elif hp.input_type == 'mulaw':
                drawn = torch.distributions.Categorical(
                    F.softmax(x, dim=1)).sample()
                sample = inv_mulaw_quantize(drawn, hp.mulaw_quantize_channels,
                                            True)

            steps_out.append(sample)
            # The new sample becomes the next step's input column.
            x = sample.unsqueeze(-1)

    # Stack steps, then swap so the batch axis comes first.
    output = torch.stack(steps_out).transpose(0, 1).cpu().numpy()

    if batched:
        output = self.xfade_and_unfold(output, target, overlap)
    else:
        output = output[0]

    self.train()
    return output
def incremental_forward(self, c=None, g=None, test_inputs=None, targets=None):
    """Build the sample-by-sample generation graph with a ``tf.while_loop``.

    Nothing is returned: the stacked generated samples are stored on
    ``self.eval_outputs`` and the decoded ``targets`` (or ``None``) on
    ``self.eval_targets``.

    Args:
        c: local conditioning features. They are upsampled when
            ``self.upsample_conv`` is set and left-padded with
            ``self.receptive_filed`` zero steps.
            NOTE(review): ``synthesis_length`` is read from ``c``
            unconditionally, so this path appears to require ``c`` despite
            the ``None`` default - confirm.
        g: global conditioning; not supported.
        test_inputs: optional inputs fed to the next step instead of the
            model's own samples.
        targets: optional quantized targets, decoded with
            ``utils.inv_mulaw_quantize`` for later comparison.

    Raises:
        NotImplementedError: if ``g`` is given.
    """
    if g is not None:
        raise NotImplementedError("global condition is not added now!")
    # use the zero as inputs
    inputs = tf.zeros([1, 1], dtype=tf.float32)
    if utils.is_mulaw_quantize(self.hp.input_type):
        # Quantize the zero sample and one-hot encode it over the channels.
        inputs = utils.mulaw_quantize(inputs, self.hp.quantize_channels)
        inputs = tf.one_hot(tf.cast(inputs, tf.int32),
                            self.hp.quantize_channels)
    else:
        inputs = tf.expand_dims(inputs, axis=-1)
    # check whether need to upsample condition
    if c is not None and self.upsample_conv is not None:
        c = tf.expand_dims(c, axis=-1)  # [B T cin_channels 1]
        for transposed_conv in self.upsample_conv:
            c = transposed_conv(c)
        c = tf.squeeze(c, axis=-1)  # [B new_T cin_channels]
    # apply zero padding to condition
    if c is not None:
        c_shape = tf.shape(c)
        padding_c = tf.zeros(
            [c_shape[0], self.receptive_filed, c_shape[-1]])
        # Zeros are prepended along the time axis.
        c = tf.concat([padding_c, c], axis=1)
        # create c_buffers
        # One zero buffer per layer index i = n_layers .. 1, of length
        # 2**i // 2 + 1.
        c_buffers = [
            tf.zeros([1, 2**i // 2 + 1, self.hp.cin_channels])
            for i in range(self.hp.n_layers, 0, -1)
        ]
    # The loop runs once per time step of the (padded) condition.
    synthesis_length = tf.shape(c)[1]
    initial_time = tf.constant(0, dtype=tf.int32)
    # Grows dynamically; one decoded sample is written per step.
    initial_outputs_ta = tf.TensorArray(dtype=tf.float32,
                                        size=0,
                                        dynamic_size=True)
    # Input buffers mirror the c_buffers lengths (2**i // 2 + 1 for
    # i = n_layers .. 1), each passed through self._convert_type.
    input_buffers = [
        self._convert_type(tf.zeros([1, 2**self.hp.n_layers // 2 + 1]))
    ]
    for i in range(self.hp.n_layers - 1, 0, -1):
        input_buffers.append(
            self._convert_type(tf.zeros([1, 2**i // 2 + 1])))

    def condition(time, unused_initial_input, unused_final_outputs,
                  unused_input_buffers, unused_c_buffers):
        # Continue while there are condition steps left.
        return tf.less(time, synthesis_length)

    def body(time, current_inputs, final_outputs, current_input_buffers,
             current_c_buffers):
        # we need shift condition by one
        current_c = c[:, time:time + 1, :] if c is not None else None
        current_outputs = current_inputs
        new_input_buffers = []
        new_c_buffers = []
        # Run the step through every layer, collecting the updated buffers.
        for layer, current_input_buffer, current_c_buffer in zip(
                self.fft_layers, current_input_buffers, current_c_buffers):
            current_outputs, out_input_buffer, out_c_buffer = layer.incremental_forward(
                inputs=current_outputs,
                c=current_c,
                input_buffers=current_input_buffer,
                c_buffers=current_c_buffer,
            )
            new_input_buffers.append(out_input_buffer)
            new_c_buffers.append(out_c_buffer)
        current_outputs = self.out_layer(current_outputs)
        posterior = tf.nn.softmax(tf.reshape(current_outputs, [1, -1]),
                                  axis=-1)
        # dist = tf.distributions.Categorical(probs=posterior)
        # sample = tf.cast(dist.sample(), tf.int32)
        # Draw one class index with numpy, weighted by the posterior.
        sample = tf.py_func(np.random.choice, [
            np.arange(self.hp.quantize_channels), 1, True,
            tf.reshape(posterior, [-1])
        ], tf.int64)
        sample = tf.reshape(sample, [-1])
        # sample = tf.argmax(posterior, axis=-1)
        decode_sample = utils.inv_mulaw_quantize(sample,
                                                 self.hp.quantize_channels)
        final_outputs = final_outputs.write(time, decode_sample)
        # The fed-back value is one-hot for mu-law quantized input, otherwise
        # the decoded sample.
        if utils.is_mulaw_quantize(self.hp.input_type):
            next_sample = tf.one_hot(tf.cast(sample, tf.int32),
                                     self.hp.quantize_channels)
        else:
            next_sample = decode_sample
        next_time = time + 1
        # Drop the oldest step of the input window.
        next_inputs = current_inputs[:, 1:, :]
        # Supplied test inputs override the sampled value.
        if test_inputs is not None:
            next_sample = tf.reshape(test_inputs[:, next_time],
                                     [1, 1, self.in_channels])
        else:
            next_sample = tf.reshape(next_sample,
                                     [1, 1, self.in_channels])
        next_inputs = tf.concat(
            [next_inputs, tf.cast(next_sample, tf.float32)], axis=1)
        return next_time, next_inputs, final_outputs, new_input_buffers, new_c_buffers

    result = tf.while_loop(condition,
                           body,
                           loop_vars=[
                               initial_time, inputs, initial_outputs_ta,
                               input_buffers, c_buffers
                           ],
                           parallel_iterations=32,
                           swap_memory=True)
    # Index 2 of the loop variables is the output TensorArray.
    outputs_ta = result[2]
    outputs = outputs_ta.stack()
    self.eval_outputs = outputs
    self.eval_targets = utils.inv_mulaw_quantize(
        targets, self.hp.quantize_channels) if targets is not None else None