def test_fastspeech_gpu_trainable_and_decodable(teacher_type, model_dict): # make args idim, odim = 10, 25 model_args = make_feedforward_transformer_args(**model_dict) # setup batch ilens = [10, 5] olens = [20, 15] device = torch.device('cuda') batch = prepare_inputs(idim, odim, ilens, olens, model_args["spk_embed_dim"], device=device) # define teacher model and save it if teacher_type == "transformer": teacher_model_args = make_transformer_args(**model_dict) teacher_model = Transformer(idim, odim, Namespace(**teacher_model_args)) elif teacher_type == "tacotron2": teacher_model_args = make_taco2_args(**model_dict) teacher_model = Tacotron2(idim, odim, Namespace(**teacher_model_args)) else: raise ValueError() tmpdir = tempfile.mkdtemp(prefix="tmp_", dir="/tmp") torch.save(teacher_model.state_dict(), tmpdir + "/model.dummy.best") with open(tmpdir + "/model.json", 'wb') as f: f.write( json.dumps((idim, odim, teacher_model_args), indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8')) # define model model_args["teacher_model"] = tmpdir + "/model.dummy.best" model = FeedForwardTransformer(idim, odim, Namespace(**model_args)) model.to(device) optimizer = torch.optim.Adam(model.parameters()) # trainable loss = model(**batch).mean() optimizer.zero_grad() loss.backward() optimizer.step() # decodable model.eval() with torch.no_grad(): if model_args["spk_embed_dim"] is None: spemb = None else: spemb = batch["spembs"][0] model.inference(batch["xs"][0][:batch["ilens"][0]], None, spemb=spemb) model.calculate_all_attentions(**batch) # remove tmpdir if os.path.exists(tmpdir): shutil.rmtree(tmpdir)
def test_transformer_multi_gpu_trainable(model_dict): # make args model_args = make_transformer_args(**model_dict) # setup batch idim = 5 odim = 10 ilens = [10, 5] olens = [20, 15] device = torch.device("cuda") batch = prepare_inputs(idim, odim, ilens, olens, model_args["spk_embed_dim"], device=device) # define model ngpu = 2 device_ids = list(range(ngpu)) model = Transformer(idim, odim, Namespace(**model_args)) model = torch.nn.DataParallel(model, device_ids) model.to(device) optimizer = torch.optim.Adam(model.parameters()) # trainable loss = model(**batch).mean() optimizer.zero_grad() loss.backward() optimizer.step() # check gradient of ScaledPositionalEncoding if model.module.use_scaled_pos_enc: assert model.module.encoder.embed[1].alpha.grad is not None assert model.module.decoder.embed[1].alpha.grad is not None
def test_initialization(model_dict): # make args idim, odim = 10, 25 teacher_model_args = make_transformer_args(**model_dict) model_args = make_feedforward_transformer_args(**model_dict) # define teacher model and save it teacher_model = Transformer(idim, odim, Namespace(**teacher_model_args)) tmpdir = tempfile.mkdtemp(prefix="tmp_", dir="/tmp") torch.save(teacher_model.state_dict(), tmpdir + "/model.dummy.best") with open(tmpdir + "/model.json", 'wb') as f: f.write( json.dumps((idim, odim, teacher_model_args), indent=4, ensure_ascii=False, sort_keys=True).encode('utf_8')) # define model model_args["teacher_model"] = tmpdir + "/model.dummy.best" model = FeedForwardTransformer(idim, odim, Namespace(**model_args)) # check initialization if model_args["transferred_encoder_module"] == "all": for p1, p2 in zip(model.encoder.parameters(), model.teacher.encoder.parameters()): np.testing.assert_array_equal(p1.data.cpu().numpy(), p2.data.cpu().numpy()) else: np.testing.assert_array_equal( model.encoder.embed[0].weight.data.cpu().numpy(), model.teacher.encoder.embed[0].weight.data.cpu().numpy()) # remove tmpdir if os.path.exists(tmpdir): shutil.rmtree(tmpdir)
def test_transformer_gpu_trainable(model_dict): # make args model_args = make_transformer_args(**model_dict) idim = 5 odim = 10 ilens = [10, 5] olens = [20, 15] device = torch.device('cuda') batch = prepare_inputs(idim, odim, ilens, olens, device=device) # define model model = Transformer(idim, odim, Namespace(**model_args)) model.to(device) optimizer = torch.optim.Adam(model.parameters()) # trainable loss = model(**batch).mean() optimizer.zero_grad() loss.backward() optimizer.step() # check gradient of ScaledPositionalEncoding if model.use_scaled_pos_enc: assert model.encoder.embed[1].alpha.grad is not None assert model.decoder.embed[1].alpha.grad is not None
def test_fastspeech_multi_gpu_trainable(teacher_type, model_dict): # make args idim, odim = 10, 25 model_args = make_feedforward_transformer_args(**model_dict) # setup batch ilens = [10, 5] olens = [20, 15] device = torch.device("cuda") batch = prepare_inputs(idim, odim, ilens, olens, model_args["spk_embed_dim"], device=device) # define teacher model and save it if teacher_type == "transformer": teacher_model_args = make_transformer_args(**model_dict) teacher_model = Transformer(idim, odim, Namespace(**teacher_model_args)) elif teacher_type == "tacotron2": teacher_model_args = make_taco2_args(**model_dict) teacher_model = Tacotron2(idim, odim, Namespace(**teacher_model_args)) else: raise ValueError() tmpdir = tempfile.mkdtemp(prefix="tmp_", dir="/tmp") torch.save(teacher_model.state_dict(), tmpdir + "/model.dummy.best") with open(tmpdir + "/model.json", "wb") as f: f.write( json.dumps( (idim, odim, teacher_model_args), indent=4, ensure_ascii=False, sort_keys=True, ).encode("utf_8")) # define model ngpu = 2 device_ids = list(range(ngpu)) model_args["teacher_model"] = tmpdir + "/model.dummy.best" model = FeedForwardTransformer(idim, odim, Namespace(**model_args)) model = torch.nn.DataParallel(model, device_ids) model.to(device) optimizer = torch.optim.Adam(model.parameters()) # trainable loss = model(**batch).mean() optimizer.zero_grad() loss.backward() optimizer.step() # remove tmpdir if os.path.exists(tmpdir): shutil.rmtree(tmpdir)
def test_attention_masking(model_dict): # make args model_args = make_transformer_args(**model_dict) # setup batch idim = 5 odim = 10 ilens = [10, 5] olens = [20, 15] batch = prepare_inputs(idim, odim, ilens, olens) # define model model = Transformer(idim, odim, Namespace(**model_args)) # test encoder self-attention xs = model.encoder.embed(batch["xs"]) xs[1, ilens[1]:] = float("nan") x_masks = model._source_mask(batch["ilens"]) a = model.encoder.encoders[0].self_attn a(xs, xs, xs, x_masks) aws = a.attn.detach().numpy() assert not np.isnan(aws).any() for aw, ilen in zip(aws, batch["ilens"]): np.testing.assert_almost_equal(aw[:, :ilen, :ilen].sum(), float(aw.shape[0] * ilen), decimal=4) assert aw[:, ilen:, ilen:].sum() == 0.0 # test encoder-decoder attention ys = model.decoder.embed(batch["ys"]) ys[1, olens[1]:] = float("nan") xy_masks = model._source_to_target_mask(batch["ilens"], batch["olens"]) a = model.decoder.decoders[0].src_attn a(ys, xs, xs, xy_masks) aws = a.attn.detach().numpy() assert not np.isnan(aws).any() for aw, ilen, olen in zip(aws, batch["ilens"], batch["olens"]): np.testing.assert_almost_equal(aw[:, :olen, :ilen].sum(), float(aw.shape[0] * olen), decimal=4) assert aw[:, olen:, ilen:].sum() == 0.0 # test decoder self-attention ys = model.decoder.embed(batch["ys"]) ys[1, olens[1]:] = float("nan") y_masks = model._target_mask(batch["olens"]) a = model.decoder.decoders[0].self_attn a(ys, ys, ys, y_masks) aws = a.attn.detach().numpy() assert not np.isnan(aws).any() for aw, olen in zip(aws, batch["olens"]): np.testing.assert_almost_equal(aw[:, :olen, :olen].sum(), float(aw.shape[0] * olen), decimal=4) assert aw[:, olen:, olen:].sum() == 0.0
def test_transformer_trainable_and_decodable(model_dict): # make args model_args = make_transformer_args(**model_dict) inference_args = make_inference_args() # setup batch idim = 5 odim = 10 ilens = [10, 5] olens = [20, 15] batch = prepare_inputs(idim, odim, ilens, olens) # define model model = Transformer(idim, odim, Namespace(**model_args)) optimizer = torch.optim.Adam(model.parameters()) # trainable loss = model(**batch).mean() optimizer.zero_grad() loss.backward() optimizer.step() # check gradient of ScaledPositionalEncoding if model.use_scaled_pos_enc: assert model.encoder.embed[1].alpha.grad is not None assert model.decoder.embed[1].alpha.grad is not None # decodable model.eval() with torch.no_grad(): model.inference(batch["xs"][0][:batch["ilens"][0]], Namespace(**inference_args)) model.calculate_all_attentions(**batch)
def _load_teacher_model(self, model_path): # get teacher model config idim, odim, args = get_model_conf(model_path) # assert dimension is the same between teacher and studnet assert idim == self.idim assert odim == self.odim assert args.reduction_factor == self.reduction_factor # load teacher model model = Transformer(idim, odim, args) torch_load(model_path, model) # freeze teacher model parameters for p in model.parameters(): p.requires_grad = False return model
def test_duration_calculator(): # define duration calculator idim, odim = 10, 25 teacher_model_args = make_transformer_args() teacher = Transformer(idim, odim, Namespace(**teacher_model_args)) duration_calculator = DurationCalculator(teacher) # setup batch ilens = [10, 5, 3] olens = [20, 15, 10] batch = prepare_inputs(idim, odim, ilens, olens) # calculate durations ds = duration_calculator(batch["xs"], batch["ilens"], batch["ys"], batch["olens"]) np.testing.assert_array_equal(ds.sum(dim=-1).cpu().numpy(), batch["olens"].cpu().numpy())
def test_forward_and_inference_are_equal(model_dict): # make args model_args = make_transformer_args(dprenet_dropout_rate=0.0, **model_dict) # setup batch idim = 5 odim = 10 ilens = [10] olens = [20] batch = prepare_inputs(idim, odim, ilens, olens) xs = batch["xs"] ilens = batch["ilens"] ys = batch["ys"] olens = batch["olens"] # define model model = Transformer(idim, odim, Namespace(**model_args)) model.eval() # TODO(kan-bayashi): update following ugly part with torch.no_grad(): # --------- forward calculation --------- x_masks = model._source_mask(ilens) hs_fp, h_masks = model.encoder(xs, x_masks) if model.reduction_factor > 1: ys_in = ys[:, model.reduction_factor - 1::model.reduction_factor] olens_in = olens.new( [olen // model.reduction_factor for olen in olens]) else: ys_in, olens_in = ys, olens ys_in = model._add_first_frame_and_remove_last_frame(ys_in) y_masks = model._target_mask(olens_in) zs, _ = model.decoder(ys_in, y_masks, hs_fp, h_masks) before_outs = model.feat_out(zs).view(zs.size(0), -1, model.odim) logits = model.prob_out(zs).view(zs.size(0), -1) after_outs = before_outs + model.postnet(before_outs.transpose( 1, 2)).transpose(1, 2) # --------- forward calculation --------- # --------- inference calculation --------- hs_ir, _ = model.encoder(xs, None) maxlen = ys_in.shape[1] minlen = ys_in.shape[1] idx = 0 # this is the inferene calculation but we use groundtruth to check the behavior ys_in_ = ys_in[0, idx].view(1, 1, model.odim) np.testing.assert_array_equal( ys_in_.new_zeros(1, 1, model.odim).detach().cpu().numpy(), ys_in_.detach().cpu().numpy(), ) outs, probs = [], [] while True: idx += 1 y_masks = subsequent_mask(idx).unsqueeze(0) z = model.decoder.forward_one_step(ys_in_, y_masks, hs_ir)[0] # (B, idx, adim) outs += [model.feat_out(z).view(1, -1, model.odim)] # [(1, r, odim), ...] probs += [torch.sigmoid(model.prob_out(z))[0]] # [(r), ...] if idx >= maxlen: if idx < minlen: continue outs = torch.cat(outs, dim=1).transpose( 1, 2) # (1, L, odim) -> (1, odim, L) if model.postnet is not None: outs = outs + model.postnet(outs) # (1, odim, L) outs = outs.transpose(2, 1).squeeze(0) # (L, odim) probs = torch.cat(probs, dim=0) break ys_in_ = torch.cat((ys_in_, ys_in[0, idx].view(1, 1, model.odim)), dim=1) # (1, idx + 1, odim) # --------- inference calculation --------- # check both are equal np.testing.assert_array_almost_equal( hs_fp.detach().cpu().numpy(), hs_ir.detach().cpu().numpy(), ) np.testing.assert_array_almost_equal( after_outs.squeeze(0).detach().cpu().numpy(), outs.detach().cpu().numpy(), ) np.testing.assert_array_almost_equal( torch.sigmoid(logits.squeeze(0)).detach().cpu().numpy(), probs.detach().cpu().numpy(), )