# Module-level imports assumed by this method:
import time
import numpy as np
import torch
import torch.nn.functional as F
# hp, sample_from_discretized_mix_logistic, decode_mu_law and de_emphasis
# are provided elsewhere in the repo.


def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
    # mu-law decoding only applies in 'RAW' (softmax-over-classes) mode
    mu_law = mu_law if self.mode == 'RAW' else False
    # Callback signature: (i, seq_len, b_size, gen_rate)
    progress_callback = progress_callback or self.gen_display
    self.eval()
    output = []
    start = time.time()
    # Convert the GRU layers to GRUCells so the decoder can be stepped
    # one sample at a time
    rnn1 = self.get_gru_cell(self.rnn1)
    rnn2 = self.get_gru_cell(self.rnn2)

    with torch.no_grad():
        mels = mels.cuda()
        # T frames spaced hop_length apart span (T - 1) * hop_length samples
        wave_len = (mels.size(-1) - 1) * self.hop_length
        # Pad the mel spectrogram on both sides: [1, T + 2*pad, D]
        mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side='both')
        # Upsample to sample rate: mels [1, L, D], aux [1, L, res_out]
        mels, aux = self.upsample(mels.transpose(1, 2))

        if batched:
            # Fold the long sequence into F overlapping folds,
            # e.g. [F, 800 + 8000 + 800, D]
            mels = self.fold_with_overlap(mels, target, overlap)
            aux = self.fold_with_overlap(aux, target, overlap)

        b_size, seq_len, _ = mels.size()

        # GRUCell hidden states: [F, Drnn]
        # (a full GRU would expect [1, F, Drnn])
        h1 = torch.zeros(b_size, self.rnn_dims).cuda()
        h2 = torch.zeros(b_size, self.rnn_dims).cuda()
        # Previous sample, fed back autoregressively: [F, 1]
        x = torch.zeros(b_size, 1).cuda()

        # Split the residual features into 4 chunks of aux_dims channels:
        # [F, seq_len, res_out] -> 4 * [F, seq_len, res_out / 4]
        d = self.aux_dims
        aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

        for i in range(seq_len):
            m_t = mels[:, i, :]                                       # [F, D]
            a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)  # 4 * [F, d]

            # [F, 1 + D + d] -> [F, Drnn]
            x = torch.cat([x, m_t, a1_t], dim=1)
            x = self.I(x)

            # First GRU cell with a residual connection: [F, Drnn]
            h1 = rnn1(x, h1)
            x = x + h1

            # Second GRU cell with a residual connection: [F, Drnn]
            inp = torch.cat([x, a2_t], dim=1)
            h2 = rnn2(inp, h2)
            x = x + h2

            # [F, Drnn + d] -> [F, Dfc]
            x = torch.cat([x, a3_t], dim=1)
            x = F.relu(self.fc1(x))
            # [F, Dfc + d] -> [F, Dfc]
            x = torch.cat([x, a4_t], dim=1)
            x = F.relu(self.fc2(x))

            # [F, Dfc] -> [F, n_classes]
            logits = self.fc3(x)

            if self.mode == 'MOL':
                sample = sample_from_discretized_mix_logistic(
                    logits.unsqueeze(0).transpose(1, 2))
                output.append(sample.view(-1))
                # x = torch.FloatTensor([[sample]]).cuda()
                x = sample.transpose(0, 1).cuda()
            elif self.mode == 'RAW':
                posterior = F.softmax(logits, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                # Map the sampled class index from [0, n_classes) to (-1, 1)
                sample = 2 * distrib.sample().float() / (self.n_classes - 1.) - 1.
                output.append(sample)
                # [F, 1], input to the next step
                x = sample.unsqueeze(-1)
            else:
                raise RuntimeError(f"Unknown model mode value - {self.mode}")

            if i % 100 == 0:
                # Generation rate in kHz of audio produced across the batch
                gen_rate = (i + 1) / (time.time() - start) * b_size / 1000
                progress_callback(i, seq_len, b_size, gen_rate)

    # seq_len * [F] -> [F, seq_len]
    output = torch.stack(output).transpose(0, 1)
    output = output.cpu().numpy()
    output = output.astype(np.float64)

    if batched:
        # Cross-fade and concatenate the folds back into one waveform
        output = self.xfade_and_unfold(output, target, overlap)
    else:
        output = output[0]

    if mu_law:
        output = decode_mu_law(output, self.n_classes, False)
    if hp.apply_preemphasis:
        output = de_emphasis(output)

    # Fade out at the end to avoid the signal cutting off abruptly
    fade_out = np.linspace(1, 0, 20 * self.hop_length)
    output = output[:wave_len]
    output[-20 * self.hop_length:] *= fade_out

    self.train()
    return output
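# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the `get_gru_cell` helper called above.
# Its body is not shown in this file; this assumes the usual trick of copying
# a single-layer nn.GRU's weights into an nn.GRUCell, which lets the decoder
# run one timestep at a time during autoregressive sampling.
def get_gru_cell(gru):
    gru_cell = torch.nn.GRUCell(gru.input_size, gru.hidden_size)
    gru_cell.weight_hh.data = gru.weight_hh_l0.data
    gru_cell.weight_ih.data = gru.weight_ih_l0.data
    gru_cell.bias_hh.data = gru.bias_hh_l0.data
    gru_cell.bias_ih.data = gru.bias_ih_l0.data
    return gru_cell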
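# ---------------------------------------------------------------------------
# Usage sketch (not part of the source): how `generate` is typically invoked.
# The fold sizes mirror the [800 + 8000 + 800] shape noted in the comments
# above; treat them as illustrative defaults, not values mandated by the repo.
def synthesize(model, mel, target=8000, overlap=800, batched=True, mu_law=True):
    # mel: [1, n_mels, T] spectrogram tensor; returns a float64 numpy
    # waveform of length (T - 1) * hop_length.
    return model.generate(mel, batched, target, overlap, mu_law)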