Example #1
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
          iteration, filepath))
    # Rebuild a clean Flowtron from the global model_config and copy the
    # weights in, so the saved object carries no distributed wrappers
    model_for_saving = Flowtron(**model_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath, _use_new_zipfile_serialization=False)
def infer(flowtron_path, waveglow_path, output_dir, text, speaker_id, n_frames,
          sigma, gate_threshold, seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # load waveglow
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # load flowtron
    model = Flowtron(**model_config).cuda()
    state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict']
    model.load_state_dict(state_dict)
    model.eval()
    print("Loaded checkpoint '{}')".format(flowtron_path))

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()
    text = trainset.get_text(text).cuda()
    speaker_vecs = speaker_vecs[None]
    text = text[None]

    with torch.no_grad():
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        mels, attentions = model.infer(residual,
                                       speaker_vecs,
                                       text,
                                       gate_threshold=gate_threshold)

    for k in range(len(attentions)):
        attention = torch.cat(attentions[k]).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        axes[0].imshow(mels[0].cpu().numpy(), origin='lower', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(),
                       origin='lower',
                       aspect='auto')
        fig.savefig(
            os.path.join(
                output_dir,
                'sid{}_sigma{}_attnlayer{}.png'.format(speaker_id, sigma, k)))
        plt.close("all")

    with torch.no_grad():
        audio = waveglow.infer(mels.half(), sigma=0.8).float()

    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    print(audio.shape)

    write(
        os.path.join(output_dir, 'sid{}_sigma{}.wav'.format(speaker_id,
                                                            sigma)),
        data_config['sampling_rate'], audio)
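
A minimal invocation sketch for this `infer`, assuming the globals `model_config` and `data_config` have been parsed from the repository's JSON config; the paths, text, and speaker id below are placeholders:

if __name__ == "__main__":
    # Hypothetical paths and parameters; adjust to your own checkpoints.
    infer(flowtron_path='models/flowtron_ljs.pt',
          waveglow_path='models/waveglow_256channels_universal_v5.pt',
          output_dir='results', text='Hello world.', speaker_id=0,
          n_frames=400, sigma=0.5, gate_threshold=0.5, seed=1234)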
Example #4
def setup():
    # Parse configs. Globals are simpler in this case
    with open("flowtron/infer.json") as f:
        data = f.read()

    global config
    config = json.loads(data)

    global data_config
    data_config = config["data_config"]
    global model_config
    model_config = config["model_config"]

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    global flowtron
    global waveglow
    global trainset
    
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    encoder.load_model(encoder_weights)

    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    # Load waveglow
    waveglow_path = ("flowtron/tacotron2/waveglow/saved_models/"
                     "waveglow_256channels_universal_v5.pt")
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()
    
    # Load flowtron
    flowtron = Flowtron(**model_config).cuda()
    state_dict = torch.load("flowtron/saved_models/pretrained.pt",
                            map_location='cpu')['model'].state_dict()
    flowtron.load_state_dict(state_dict)
    flowtron.eval()

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
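
A sketch of what might follow `setup()`, mirroring the inference pattern of the other examples; the text, speaker id, and frame count are placeholder assumptions:

# Hypothetical follow-up; relies on the globals populated by setup().
setup()
speaker_vecs = trainset.get_speaker_id(0).cuda()[None]
text_enc = trainset.get_text("Hello world.").cuda()[None]
with torch.no_grad():
    residual = torch.cuda.FloatTensor(1, 80, 400).normal_() * 0.5
    mels, attentions = flowtron.infer(residual, speaker_vecs, text_enc)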
Example #5
def load_models(flowtron_path, waveglow_path):
    # load waveglow
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # load flowtron
    try:
        model = Flowtron(**model_config).cuda()
        state_dict = torch.load(flowtron_path,
                                map_location='cpu')['state_dict']
        model.load_state_dict(state_dict)
    except KeyError:
        model = torch.load(flowtron_path)['model']

    model.eval()
    print("Loaded model '{}')".format(flowtron_path))

    return model, waveglow
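
A minimal usage sketch, assuming the global `model_config` is already defined and that the checkpoint paths are placeholders:

model, waveglow = load_models('models/flowtron_ljs.pt',
                              'models/waveglow_256channels_universal_v5.pt')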
Example #6
def train(n_gpus,
          rank,
          output_directory,
          epochs,
          optim_algo,
          learning_rate,
          weight_decay,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          checkpoint_path,
          ignore_layers,
          include_layers,
          finetune_layers,
          warmstart_checkpoint_path,
          with_tensorboard,
          grad_clip_val,
          fp16_run,
          tensorboard_path=None):
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = tensorboard_path
        if tensorboard_path is None:
            tboard_out_path = os.path.join(output_directory, "logs/run1")
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = (mel.cuda(), speaker_vecs.cuda(),
                                       text.cuda())
            in_lens, out_lens, gate_target = (in_lens.cuda(), out_lens.cuda(),
                                              gate_target.cuda())
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None
            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)

                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data,
                                                  n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_val)

            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss,
                                  iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss,
                                  iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if iteration % iters_per_checkpoint == 0:
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll,
                                              val_loss_gate, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
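
A sketch of how this `train()` might be launched, assuming `train_config` holds the matching keyword arguments parsed from the JSON config (just as the surrounding examples assume for `data_config` and `model_config`):

# Hypothetical single-GPU launch; train_config is an assumption.
if __name__ == "__main__":
    train(n_gpus=1, rank=0, **train_config)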
Example #7
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = (mel.cuda(), speaker_vecs.cuda(),
                                       text.cuda())
            in_lens, out_lens, gate_target = (in_lens.cuda(), out_lens.cuda(),
                                              gate_target.cuda())

            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if (iteration % iters_per_checkpoint == 0):
                val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
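
Compared with Example #6, this variant predates native PyTorch mixed precision: it relies on NVIDIA apex (`amp.initialize` / `amp.scale_loss`) instead of the `torch.cuda.amp` `GradScaler`/`autocast` pair, and it omits gradient clipping and the attention prior.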
Example #8
def infer(flowtron_path, waveglow_path, text, speaker_id, n_frames, sigma,
          seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # load waveglow
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # load flowtron
    model = Flowtron(**model_config).cuda()
    cpt_dict = torch.load(flowtron_path)
    if 'model' in cpt_dict:
        dummy_dict = cpt_dict['model'].state_dict()
    else:
        dummy_dict = cpt_dict['state_dict']
    model.load_state_dict(dummy_dict)
    model.eval()

    print("Loaded checkpoint '{}')".format(flowtron_path))

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))

    tic_prep = time.time()

    str_text = text
    num_char = len(str_text)
    num_word = len(str_text.split())

    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()
    text = trainset.get_text(text).cuda()

    speaker_vecs = speaker_vecs[None]
    text = text[None]
    toc_prep = time.time()

    # Warm-up pass so the timed run below measures steady-state Flowtron inference

    with torch.no_grad():
        tic_warmup = time.time()
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        mels, attentions = model.infer(residual, speaker_vecs, text)
        toc_warmup = time.time()

    tic_flowtron = time.time()
    with torch.no_grad(), torch.autograd.profiler.emit_nvtx():  # profiling
        tic_residual = time.time()
        residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
        toc_residual = time.time()
        profiler.start()  # profiling
        mels, attentions = model.infer(residual, speaker_vecs, text)
        profiler.stop()  # profiling
        toc_flowtron = time.time()

    for k in range(len(attentions)):
        attention = torch.cat(attentions[k]).cpu().numpy()
        fig, axes = plt.subplots(1, 2, figsize=(16, 4))
        axes[0].imshow(mels[0].cpu().numpy(), origin='lower', aspect='auto')
        axes[1].imshow(attention[:, 0].transpose(),
                       origin='lower',
                       aspect='auto')
        fig.savefig('sid{}_sigma{}_attnlayer{}.png'.format(
            speaker_id, sigma, k))
        plt.close("all")

    tic_waveglow = time.time()
    with torch.no_grad():
        audio = waveglow.infer(mels.half(), sigma=0.8).float()
    toc_waveglow = time.time()

    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()

    len_audio = len(audio)
    dur_audio = len_audio / 22050  # assumes a 22050 Hz sampling rate
    num_frames = int(len_audio / 256)  # assumes a 256-sample hop length

    dur_prep = toc_prep - tic_prep
    dur_residual = toc_residual - tic_residual
    dur_flowtron_in = toc_flowtron - toc_residual
    dur_warmup = toc_warmup - tic_warmup
    dur_flowtron_out = toc_flowtron - tic_residual
    dur_waveglow = toc_waveglow - tic_waveglow
    dur_total = dur_prep + dur_flowtron_out + dur_waveglow

    RTF = dur_audio / dur_total  # >1 means faster than real time

    str_text = "\n text : " + str_text
    str_num = "\n text {:d} char {:d} words  ".format(num_char, num_word)
    str_audio = "\n generated audio : {:d} samples  {:2.3f} sec  with  {:d} mel frames ".format(
        len_audio, dur_audio, num_frames)
    str_perf = "\n total time {:2.3f} = text prep {:2.3f} + flowtron{:2.3f} + wg {:2.3f}  ".format(
        dur_total, dur_prep, dur_flowtron_out, dur_waveglow)
    str_flow = "\n total flowtron {:2.3f} = residual cal {:2.3f} + flowtron {:2.3f}  ".format(
        dur_flowtron_out, dur_residual, dur_flowtron_in)
    str_rtf = "\n RTF is {:2.3f} x  with warm up {:2.3f} ".format(
        RTF, dur_warmup)

    print(str_text, str_num, str_audio, str_perf, str_flow, str_rtf)

    write("sid{}_sigma{}.wav".format(speaker_id, sigma),
          data_config['sampling_rate'], audio)
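
Note that RTF here is defined as audio duration divided by end-to-end synthesis time, so values above 1 mean faster than real time: for example, 9.2 s of generated audio produced in 4.6 s of compute gives an RTF of 2.0.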
Example #9
class AudioGeneratorFlowtron:
    models = {
        'flowtron': 'flowtron_model.pt',
    }

    waveglow = {'default': 'waveglow_256channels_universal_v5.pt'}

    def __init__(self):
        self.config_path = 'flowtron/config.json'
        self.models_path = os.getcwd() + '/models/'
        self.training_files_path = os.getcwd() + '/filelists/dataset_train.txt'
        with open(self.config_path) as f:
            data = f.read()
        self.config = json.loads(data)
        self.config['model_config']['n_speakers'] = 41
        self.lambd = 0.001
        self.sigma = 0.85
        self.waveglow_sigma = 1
        self.n_frames = 1800
        self.aggregation_type = 'batch'

        self.model = Flowtron(**self.config['model_config']).cuda()
        flowtron_path = self.models_path + self.models['flowtron']
        waveglow_path = self.models_path + self.waveglow['default']

        checkpoint = torch.load(flowtron_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            state_dict = checkpoint['state_dict']
        else:
            state_dict = checkpoint['model'].state_dict()
        self.model.load_state_dict(state_dict, strict=False)
        self.model.eval()

        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda().eval()

        self.z_baseline = torch.cuda.FloatTensor(
            1, 80, self.n_frames).normal_() * self.sigma

        ignore_keys = ['training_files', 'validation_files']
        self.trainset = Data(
            self.training_files_path,
            **dict((k, v) for k, v in self.config['data_config'].items()
                   if k not in ignore_keys))

    def generate(self, text: str, speaker: int):
        speaker_vecs = self.trainset.get_speaker_id(speaker).cuda()
        speaker_vecs = speaker_vecs[None]
        text = self.trainset.get_text(text).cuda()
        text = text[None]

        with torch.no_grad():
            mel_baseline = self.model.infer(self.z_baseline, speaker_vecs,
                                            text)[0]

        with torch.no_grad():
            audio_base = self.waveglow.infer(mel_baseline,
                                             sigma=self.waveglow_sigma)

        audio = audio_base[0].data.cpu().numpy()
        return audio

    def prepare_dataset(self, dataset_path):
        dataset = Data(
            dataset_path,
            **dict((k, v) for k, v in self.config['data_config'].items()
                   if k not in ['training_files', 'validation_files']))
        return dataset