import torch
from tqdm import tqdm

# Repo-local helpers assumed in scope: process_blizzard (text frontend),
# sample_gmm (GMM sampling), and the MelNet tier modules used below.


def sample(self, condition):
    x = None
    seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(self.args.timestep // self.t_div)):
        # Append one empty frame, then fill it in mel-bin by mel-bin.
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(self.n_mels // self.f_div)):
            torch.cuda.synchronize()
            if self.infer_hp.conditional:
                mu, std, pi, _ = self.tiers[1](x, seq, input_lengths,
                                               audio_lengths)
            else:
                mu, std, pi = self.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    ## Tier 2~N ##
    # Each upper tier predicts the complementary bins/frames, which are
    # interleaved with the current output to grow its resolution.
    for tier in tqdm(range(2, self.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        mu, std, pi = self.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        x = self.tierutil.interleave(x, temp, tier + 1)

    return x
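
# `sample_gmm` is a repo-local helper used throughout but not shown here.
# A minimal sketch of its assumed behavior follows: it supposes mu and std
# have shape (B, n_mels, T, K) and that pi holds unnormalized mixture logits
# over the K components; the repo's actual implementation may differ.
def sample_gmm_sketch(mu, std, pi):
    # Pick one mixture component per (batch, mel, time) bin...
    idx = torch.distributions.Categorical(logits=pi).sample().unsqueeze(-1)
    # ...then draw a sample from that component's Gaussian.
    mu_sel = mu.gather(-1, idx).squeeze(-1)
    std_sel = std.gather(-1, idx).squeeze(-1)
    return mu_sel + std_sel * torch.randn_like(mu_sel)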

def sample_model_with_breakdown(model, condition):
    # Same tier-by-tier sampling as sample() above, but also records each
    # tier's (source, generated) pair so intermediate outputs can be
    # inspected offline.
    x = None
    seq = torch.from_numpy(process_blizzard(condition)).long().unsqueeze(0)
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()
    breakdown = {}

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(model.n_mels // model.f_div)):
            torch.cuda.synchronize()
            if model.infer_hp.conditional:
                mu, std, pi, _ = model.tiers[1](x, seq, input_lengths,
                                                audio_lengths)
            else:
                mu, std, pi = model.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    # For tier 1, the source and the generated output are the same tensor.
    breakdown[1] = (x.clone()[0].cpu().detach().numpy(),
                    x.clone()[0].cpu().detach().numpy())

    ## Tier 2~N ##
    for tier in tqdm(range(2, model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        breakdown[tier] = (x.clone()[0].cpu().detach().numpy(),
                           temp.clone()[0].cpu().detach().numpy())
        x = model.tierutil.interleave(x, temp, tier + 1)

    return breakdown, x
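
# Illustrative driver for the function above: dump each tier's
# (source, generated) pair for offline inspection. `model` is assumed to be
# a loaded MelNet instance; the file names are arbitrary placeholders.
import numpy as np

breakdown, mel = sample_model_with_breakdown(model, 'some conditioning text')
for tier, (tier_source, tier_target) in breakdown.items():
    np.save('tier_%d_source.npy' % tier, tier_source)
    np.save('tier_%d_target.npy' % tier, tier_target)
np.save('final_mel.npy', mel[0].cpu().detach().numpy())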

def run_inference_on_tier(source, tier, text, timestep):
    # Returns a tuple (next_tier, inference):
    #   inference is the conditional inference on the current tier;
    #   next_tier interleaves that inference with the input, forming the
    #   source for the next tier.
    args = parse_inference_args([
        '-c', 'config/blizzard_compressed_experiments.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'hw_blizzard_compressed',
        '-i', text
    ])
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    inference_hp = HParam(args.infer_config)

    assert timestep % t_div[hp.model.tier] == 0, \
        "timestep should be divisible by %d, got %d" % (
            t_div[hp.model.tier], timestep)

    model = MelNet(hp, args, inference_hp).cuda()
    model.load_tiers()
    model.eval()

    audio_lengths = torch.LongTensor([0]).cuda()
    if tier > 1:
        for t in tqdm(range(model.args.timestep // model.t_div)):
            audio_lengths += 1

    x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()
    mu, std, pi = model.tiers[tier](x, audio_lengths)
    temp = sample_gmm(mu, std, pi)
    next_tier = model.tierutil.interleave(x, temp, tier + 1)

    return next_tier[0].cpu().detach().numpy(), temp[0].cpu().detach().numpy()
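
# Illustrative chaining of the function above: re-run the upper tiers one at
# a time, feeding each tier's interleaved output into the next. `breakdown`
# is assumed to come from sample_model_with_breakdown; N_TIERS, TEXT, and
# TIMESTEP are placeholders.
source = breakdown[2][0]
for tier in range(2, N_TIERS + 1):
    source, inference = run_inference_on_tier(source, tier, TEXT, TIMESTEP)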

def sample(self, condition):
    # Primed / unconditional variant: `condition` here is an existing mel
    # chunk used to prime generation, not a text sequence.
    x = None
    if condition is not None:
        # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
        x = condition
    else:
        seq = torch.LongTensor([[0]])
    # input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    if x is not None:
        audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
    else:
        audio_lengths = torch.LongTensor([0]).cuda()

    ## Tier 1 ##
    tqdm.write('Tier 1')
    if self.args.timestep == 0:
        # Nothing to generate: run tier 1 once over the primed input.
        mu, std, pi = self.tiers[1](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        return temp

    for t in tqdm(range(self.args.timestep // self.t_div)):
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(self.n_mels // self.f_div)):
            torch.cuda.synchronize()
            if self.infer_hp.conditional:
                # Text-conditional sampling is disabled in this variant.
                # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                break
            else:
                mu, std, pi = self.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            # Write into the newest frame, which sits at the end of the
            # (possibly primed) sequence rather than at index t.
            new_idx = audio_lengths.item() - 1
            x[:, m, new_idx] = temp[:, m, new_idx]

    ## Tier 2~N ##
    for tier in tqdm(range(2, self.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        mu, std, pi = self.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        x = self.tierutil.interleave(x, temp, tier + 1)

    return x
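
# Illustrative priming call for the variant above: continue an existing
# (1, n_mels // f_div, T0) mel chunk for args.timestep further frames.
# The .npy path and `model` are placeholders.
import numpy as np

prime = torch.from_numpy(np.load('prime_mel.npy')).float().unsqueeze(0).cuda()
mel = model.sample(prime)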

def sample_dependence(self, condition, label, dependence_length):
    # Like sample() above, but conditions tier 1 on a class label and
    # truncates the autoregressive context to the last `dependence_length`
    # frames, caching the tier's hidden state across mel bins.
    x = None
    if condition is not None:
        # seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
        x = condition
    else:
        seq = torch.LongTensor([[0]])
    if x is not None:
        audio_lengths = torch.LongTensor([x.size()[-1]]).cuda()
    else:
        audio_lengths = torch.LongTensor([0]).cuda()

    for t in tqdm(range(self.args.timestep // self.t_div)):
        # audio_lengths += 1
        if x is None:
            x = torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, self.n_mels // self.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(self.n_mels // self.f_div)):
            torch.cuda.synchronize()
            if self.infer_hp.conditional:
                # mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                break
            else:
                class_label = torch.tensor(label, dtype=torch.long) \
                    if isinstance(label, int) else torch.LongTensor(label)
                if m == 0:
                    # First mel bin of the frame: run over the truncated
                    # window and save the recurrent hidden state.
                    mu, std, pi, h_t, h_c = self.tiers[1](
                        x[:, :, -dependence_length:], audio_lengths,
                        class_label.cuda(non_blocking=True).unsqueeze(0),
                        save_hidden=True, hidden_t=None, hidden_c=None)
                else:
                    # Later bins reuse the cached hidden state.
                    mu, std, pi = self.tiers[1](
                        x[:, :, -dependence_length:], audio_lengths,
                        class_label.cuda(non_blocking=True).unsqueeze(0),
                        save_hidden=False, hidden_t=h_t, hidden_c=h_c)
                temp = sample_gmm(mu, std, pi)
                new_idx = audio_lengths.item() - 1
                x[:, m, -1] = temp[:, m, new_idx]

    return x
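
# Illustrative call for the truncated-dependence variant above: extend the
# same primed chunk with a 64-frame receptive window, conditioned on class
# id 3 (label and window size are arbitrary placeholders).
mel = model.sample_dependence(prime, label=3, dependence_length=64)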

def run_inference(source, timestep, tier_to_breakdown):
    # First load in the model.
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', SENTENCE
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    audio_lengths = torch.LongTensor([0]).cuda()
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1

    ## Tier 2~N ##
    x = torch.unsqueeze(torch.from_numpy(source), 0).cuda()
    for tier in tqdm(range(model.hp.model.tier + 1 - TESTING_TIERS,
                           model.hp.model.tier + 1)):
        tqdm.write('Tier %d' % tier)
        # Save the recorded ground truth for this tier alongside the
        # model's own inference so the two can be compared image by image.
        actual_source = tier_to_breakdown[tier][0]
        actual_target = tier_to_breakdown[tier][1]
        actual_interleaved = tier_to_breakdown[tier + 1][0]
        save_image(x[0].cpu().detach().numpy(), 'tier_%d_inference_source' % tier)
        save_image(actual_source, 'tier_%d_actual_source' % tier)
        save_image(actual_target, 'tier_%d_actual_target' % tier)
        mu, std, pi = model.tiers[tier](x, audio_lengths)
        temp = sample_gmm(mu, std, pi)
        save_image(temp[0].cpu().detach().numpy(), 'tier_%d_inference_target' % tier)
        x = model.tierutil.interleave(x, temp, tier + 1)
        save_image(x[0].cpu().detach().numpy(), 'tier_%d_inference_interleaved' % tier)
        save_image(actual_interleaved, 'tier_%d_actual_interleaved' % tier)

    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]
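
# `save_image` above is a repo-local helper. A minimal matplotlib stand-in
# with the same assumed interface (2-D numpy array, filename stem) is
# sketched below; the repo's actual helper may render differently.
import matplotlib.pyplot as plt

def save_image(mel, name):
    plt.figure(figsize=(10, 4))
    plt.imshow(mel, origin='lower', aspect='auto')  # mel bins on the y-axis
    plt.title(name)
    plt.colorbar()
    plt.savefig('%s.png' % name)
    plt.close()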

def run_inference(sentence, timestep, tier_to_breakdown):
    # Tier-1-only variant: generate the coarsest spectrogram from text and
    # compare it against the recorded ground truth.
    hp = HParam('./config/blizzard_alldata_v5.yaml')
    infer_hp = HParam('./config/inference.yaml')
    args = parse_inference_args([
        '-c', 'config/blizzard_alldata_v5.yaml',
        '-p', 'config/inference.yaml',
        '-t', str(timestep),
        '-n', 'test_tiers',
        '-i', sentence
    ])
    model = MelNet(hp, args, infer_hp).cuda()
    model.load_tiers()
    model.eval()

    x = None
    seq = torch.from_numpy(process_blizzard(sentence)).long().unsqueeze(0)
    input_lengths = torch.LongTensor([seq[0].shape[0]]).cuda()
    audio_lengths = torch.LongTensor([0]).cuda()
    actual_target = tier_to_breakdown[1][1]

    ## Tier 1 ##
    tqdm.write('Tier 1')
    for t in tqdm(range(model.args.timestep // model.t_div)):
        audio_lengths += 1
        if x is None:
            x = torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()
        else:
            x = torch.cat(
                [x, torch.zeros((1, model.n_mels // model.f_div, 1)).cuda()],
                dim=-1)
        for m in tqdm(range(model.n_mels // model.f_div)):
            torch.cuda.synchronize()
            if model.infer_hp.conditional:
                mu, std, pi, _ = model.tiers[1](x, seq, input_lengths,
                                                audio_lengths)
            else:
                mu, std, pi = model.tiers[1](x, audio_lengths)
            temp = sample_gmm(mu, std, pi)
            x[:, m, t] = temp[:, m, t]

    save_image(x[0].cpu().detach().numpy(), 'tier_1_inference_target')
    save_image(actual_target, 'tier_1_actual_target')

    reconstructed_mel_tensor = x.cpu().detach().numpy()
    return reconstructed_mel_tensor[0]