Example #1
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    result = np.exp(mel_output_postnet.numpy())
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    basis = librosa.filters.mel(sr=cfg['audio']['sr'],
                                n_fft=cfg['audio']['n_fft'],
                                n_mels=cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    # synthesize with ClariNet
    wav_clarinet = synthesis_with_clarinet(args.config_clarinet,
                                           args.checkpoint_clarinet, result,
                                           place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(args.output, 'samples', 'clarinet.wav'),
          cfg['audio']['sr'], wav_clarinet)

    # synthesize with Griffin-Lim
    wav = librosa.core.griffinlim(spec**cfg['audio']['power'],
                                  hop_length=cfg['audio']['hop_length'],
                                  win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    write(os.path.join(args.output, 'samples', 'griffin-lim.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
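Note: the mel-to-linear inversion plus Griffin-Lim step in the middle of this example can be reproduced standalone. The sketch below uses placeholder values (22050 Hz, 1024-point FFT), not the ones read from args.config:

import numpy as np
import librosa

sr, n_fft, n_mels, hop, win, power = 22050, 1024, 80, 256, 1024, 1.4
mel_out = np.abs(np.random.randn(n_mels, 100))  # placeholder (n_mels, T) magnitude mel

basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
inv_basis = np.linalg.pinv(basis)              # pseudo-inverse of the mel filter bank
spec = np.maximum(1e-10, inv_basis @ mel_out)  # approximate linear spectrogram
wav = librosa.griffinlim(spec ** power, hop_length=hop, win_length=win)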
Example #2
def main(input_files,
         model_dir,
         output_dir,
         batch_size,
         implementation,
         data_config,
         audio_config,
         preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)

    audio_processor = AudioProcessor(audio_config)
    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
            print("Range of {}.wav after deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
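utils.mu_law_decode_numpy is not shown above; assuming it performs the standard inverse mu-law companding, a minimal NumPy reference could look like this:

import numpy as np

def mu_law_decode(encoded, quantization_channels=256):
    # Map integer codes [0, Q-1] back to floats in [-1, 1] (inverse mu-law).
    mu = quantization_channels - 1
    signal = 2.0 * (encoded.astype(np.float64) / mu) - 1.0
    return np.sign(signal) * ((1.0 + mu) ** np.abs(signal) - 1.0) / mu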
Example #3
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # synthesize with Griffin-Lim
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        raise ValueError(
            'vocoder error, we only support griffin-lim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(args.output, 'samples', args.vocoder + '.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
Example #4
def tb_train2():
    import torchvision.utils as vutils
    import torchvision.models as models
    from torchvision import datasets

    resnet18 = models.resnet18(False)
    writer = SummaryWriter()
    sample_rate = 44100
    freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]

    for n_iter in range(100):
        dummy_s1 = torch.rand(1)
        dummy_s2 = torch.rand(1)
        # data grouping by `slash`
        writer.add_scalar('data/scalar1', dummy_s1[0], n_iter)
        writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)

        writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter),
                                                 'xcosx': n_iter * np.cos(n_iter),
                                                 'arctanx': np.arctan(n_iter)}, n_iter)

        dummy_img = torch.rand(32, 3, 64, 64)  # output from network
        if n_iter % 10 == 0:
            x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
            writer.add_image('Image', x, n_iter)

            dummy_audio = torch.zeros(sample_rate * 2)
            for i in range(dummy_audio.size(0)):
                # amplitude of sound should be in [-1, 1]
                dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
            writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate)

            writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)

            for name, param in resnet18.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)

            # needs tensorboard 0.4RC or later
            writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter)

    dataset = datasets.MNIST('mnist', train=False, download=True)
    images = dataset.test_data[:100].float()
    label = dataset.test_labels[:100]

    features = images.view(100, 784)
    writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))

    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
Example #5
def test_dataset_loader():
    from tqdm import tqdm
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler
    from jukebox.utils.audio_utils import audio_preprocess, audio_postprocess
    from jukebox.hparams import setup_hparams
    from jukebox.data.files_dataset import FilesAudioDataset
    hps = setup_hparams("teeny", {})
    hps.sr = 22050  # 44100
    hps.hop_length = 512
    hps.labels = False
    hps.channels = 2
    hps.aug_shift = False
    hps.bs = 2
    hps.nworkers = 2  # Getting 20 it/s with 2 workers, 10 it/s with 1 worker
    print(hps)
    dataset = hps.dataset
    root = hps.root
    from tensorboardX import SummaryWriter
    sr = {22050: '22k', 44100: '44k', 48000: '48k'}[hps.sr]
    writer = SummaryWriter(f'{root}/{dataset}/logs/{sr}/logs')
    dataset = FilesAudioDataset(hps)
    print("Length of dataset", len(dataset))

    # Torch Loader
    collate_fn = lambda batch: t.stack([t.from_numpy(b) for b in batch], 0)
    sampler = DistributedSampler(dataset)
    train_loader = DataLoader(dataset,
                              batch_size=hps.bs,
                              num_workers=hps.nworkers,
                              pin_memory=False,
                              sampler=sampler,
                              drop_last=True,
                              collate_fn=collate_fn)

    dist.barrier()
    sampler.set_epoch(0)
    for i, x in enumerate(tqdm(train_loader)):
        x = x.to('cuda', non_blocking=True)
        for j, aud in enumerate(x):
            writer.add_audio('in_' + str(i * hps.bs + j), aud, 1, hps.sr)
        print("Wrote in")
        x = audio_preprocess(x, hps)
        x = audio_postprocess(x, hps)
        for j, aud in enumerate(x):
            writer.add_audio('out_' + str(i * hps.bs + j), aud, 1, hps.sr)
        print("Wrote out")
        dist.barrier()
        break
Example #6
class Logger(object):
    def __init__(self, logdir='./log'):
        self.writer = SummaryWriter(logdir)

    def scalar_summary(self, tag, value, step):
        self.writer.add_scalar(tag, value, step)

    def scalars_summary(self, tag, dictionary, step):
        self.writer.add_scalars(tag, dictionary, step)

    def text_summary(self, tag, value, step):
        self.writer.add_text(tag, value, step)

    def audio_summary(self, tag, value, step, sr):
        self.writer.add_audio(tag, value, step, sample_rate=sr)
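A minimal usage sketch for this thin wrapper (the 440 Hz tone is a placeholder signal; add_audio expects amplitudes in [-1, 1]):

import numpy as np

logger = Logger('./log')
sr = 22050
tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
for step in range(5):
    logger.scalar_summary('train/loss', 1.0 / (step + 1), step)
logger.audio_summary('train/tone', tone, 0, sr)
logger.writer.close()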
Example #7
class MetricCounter:
    def __init__(self, exp_name):
        self.writer = SummaryWriter(logdir=exp_name)
        self.metrics = defaultdict(list)
        self.best_metric = float('inf')

    def clear(self):
        self.metrics = defaultdict(list)

    def add_losses(self, linear, mel, total):
        for name, value in zip(("linear_loss", "mel_loss", "total_loss"),
                               (linear, mel, total)):
            self.metrics[name].append(value)

    def loss_message(self):
        metrics = ((k, np.mean(self.metrics[k]))
                   for k in ("linear_loss", "mel_loss", "total_loss"))
        return '; '.join(map(lambda x: x[0] + '=' + '%.5f' % x[1], metrics))

    def write_to_tensorboard(self, epoch_num, validation=False, epoch=False):
        scalar_prefix = 'Validation' if validation else 'Train'
        epoch_prefix = "Epoch" if epoch else "Iter"
        for k in ("linear_loss", "mel_loss", "total_loss"):
            self.writer.add_scalar(tag=(scalar_prefix + epoch_prefix + '_' +
                                        k),
                                   scalar_value=np.mean(self.metrics[k]),
                                   global_step=epoch_num)

    def write_audio_to_tensorboard(self,
                                   exp_name,
                                   outputs,
                                   step_num,
                                   sample_rate,
                                   validation=False):
        self.writer.add_audio(exp_name,
                              outputs,
                              step_num,
                              sample_rate=sample_rate)

    def update_best_model(self):
        cur_metric = np.mean(self.metrics['total_loss'])
        if cur_metric < self.best_metric:
            self.best_metric = cur_metric
            return True
        else:
            return False
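A hypothetical driving loop for MetricCounter (the loss values are placeholders):

counter = MetricCounter('runs/exp1')
for epoch in range(3):
    counter.clear()
    for step in range(10):
        counter.add_losses(linear=0.5, mel=0.3, total=0.8)
    print(counter.loss_message())
    counter.write_to_tensorboard(epoch, validation=False, epoch=True)
    if counter.update_best_model():
        print('new best total_loss')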
Example #8
class Logger(object):
    def __init__(self, run_dir, **kwargs):
        self.writer = SummaryWriter(run_dir, **kwargs)
        self.async_executor = ThreadPoolExecutor(max_workers=4)
        self.futures = dict()

    def add_scalar(self, name, scalar, global_step):
        self.writer.add_scalar(name, scalar, global_step)

    def add_audio(self, name, audio, global_step, sr=22050):
        self.writer.add_audio(name, audio, global_step, sample_rate=sr)

    def add_image(self, name, image, global_step):
        self.writer.add_image(name, image, global_step)

    def add_async(self, fn, cb, *args, **kwargs):
        future = self.async_executor.submit(fn, *args, **kwargs)
        self.futures[future] = cb

    def process_async(self):
        done = list(filter(lambda future: future.done(), self.futures))

        for future in done:
            cb = self.futures[future]
            try:
                res = future.result()
            except TimeoutError:
                print('TimeoutError, no need to be too upset')
            else:
                del self.futures[future]
                cb(res)

    def close(self):
        self.async_executor.shutdown(wait=True)
        self.process_async()
        self.writer.close()

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        self.close()
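A minimal usage sketch for this asynchronous logger: offload a slow artifact computation to the executor and attach a callback that logs the result (render_waveform is a hypothetical stand-in for expensive synthesis):

import time
import numpy as np

def render_waveform(seconds, sr=22050):
    time.sleep(1.0)  # stand-in for an expensive synthesis step
    t = np.arange(int(seconds * sr)) / sr
    return np.sin(2 * np.pi * 440 * t).astype(np.float32)

with Logger('./runs/async-demo') as logger:
    logger.add_async(render_waveform, lambda wav: logger.add_audio('tone', wav, 0), 2.0)
    for step in range(100):
        logger.add_scalar('loss', 1.0 / (step + 1), step)
        logger.process_async()  # drain any finished futures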
Example #9
class Logger(object):
    def __init__(self):
        self.summary_writer = None

    def setup(self, *args, **kwargs):
        if self.summary_writer is not None:
            raise RuntimeError("set_up can only be called once")
        self.summary_writer = SummaryWriter(*args, **kwargs)

    def add_audio(self, *args, **kwargs):
        self.ensure_ready()
        self.summary_writer.add_audio(*args, **kwargs)

    def add_embedding(self, *args, **kwargs):
        self.ensure_ready()
        self.summary_writer.add_embedding(*args, **kwargs)

    def ensure_ready(self):
        if self.summary_writer is None:
            raise RuntimeError("set_up has not been run")
Example #10
class Logger(object):
    def __init__(self, log_dir):
        self.log_dir = log_dir
        self.writer = SummaryWriter(log_dir)

    def log(self, n_iter, report, log_type="scalar", sr=None, text=False):
        if log_type not in LOG_TYPE:
            raise ValueError("Wrong data type for logger.")

        if log_type == "scalar":
            if text:
                self._print_scalars(n_iter, report)
            for k, v in report.items():
                self.writer.add_scalar("scalar/{}".format(k), v, n_iter)
        elif log_type == "audio":
            if sr is None:
                raise ValueError("Sample rate is required for saving audio data.")
            for k, v in report.items():
                self.writer.add_audio(k, v, n_iter, sample_rate=sr)
        elif log_type == "image":
            for k, v in report.items():
                self.writer.add_image(k, v, n_iter)

    def _print_scalars(self, n_iter, report):
        print("---------------------------")
        print("n_iter : {}".format(n_iter))
        for k, v in report.items():
            print("{} : {:.4f}".format(k, v))
        print("---------------------------")

    def write(self):
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.writer.export_scalars_to_json(
            os.path.join(self.log_dir, "log.json"))

    def flush(self):
        self.writer.file_writer.flush()

    def close(self):
        self.writer.close()
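A hypothetical usage sketch; LOG_TYPE is assumed to be the module-level collection of allowed type names that Logger.log checks against:

import numpy as np

LOG_TYPE = ('scalar', 'audio', 'image')

logger = Logger('./log')
logger.log(10, {'loss': 0.25, 'lr': 1e-3}, log_type='scalar', text=True)
tone = np.sin(2 * np.pi * 440 * np.arange(22050) / 22050).astype(np.float32)
logger.log(10, {'tone': tone}, log_type='audio', sr=22050)
logger.close()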
Example #11
    writer.add_scalars(
        'data/scalar_group', {
            "xsinx": n_iter * np.sin(n_iter),
            "xcosx": n_iter * np.cos(n_iter),
            "arctanx": np.arctan(n_iter)
        }, n_iter)
    x = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(x, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)
        x = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            x[i] = np.cos(
                freqs[n_iter // 10] * np.pi * float(i) /
                float(sample_rate))  # sound amplitude should be in [-1, 1]
        writer.add_audio('myAudio', x, n_iter, sample_rate=sample_rate)
        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
        for name, param in vgg16.named_parameters():
            writer.add_histogram(name,
                                 param.clone().cpu().data.numpy(), n_iter)
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                            np.random.rand(100),
                            n_iter)  # needs tensorboard 0.4RC or later
dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))

# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")
Example #12
def train_and_evaluate(dataset, hparams, logdir, checkpoint=None):
    log.info("Fetch model...")
    model = util.fetch_model(hparams)
    log.info("Fetch dataloader...")
    dataloader = util.fetch_dataloader(dataset, model, hparams)
    log.info("Fetch optimizer...")
    optimizer = util.fetch_optimizer(model, hparams)

    global_step = 0
    best_metric = 0.0
    best_model = model

    writer = SummaryWriter(logdir)

    # load model or resume from checkpoint if possible
    if checkpoint:
        state = util.load_checkpoint(checkpoint)
        if hparams.resume:
            log.info(
                'Resuming training from checkpoint: {}'.format(checkpoint))
            best_metric = state['best_metric']
            global_step = state['global_step']
            optimizer.load_state_dict(state['optim_dict'])
        log.info('Loading model from checkpoint: {}'.format(checkpoint))
        model.load_state_dict(state['state_dict'])

    log.info("Start training...")
    run_tic = time.time()
    for epoch in range(hparams.num_epochs):

        log.info("Epoch {}/{}".format(epoch + 1, hparams.num_epochs))

        global_step = train(dataloader['train'], model, optimizer, global_step,
                            hparams, writer)

        metric = val(dataloader['val'], model, global_step, hparams, writer)

        is_best = False
        if metric >= best_metric:
            log.info('Found new best! Metric: {}'.format(metric))
            is_best = True
            best_metric = metric
            best_model = model

        # Save weights
        log.info('Saving checkpoint at global step {}'.format(global_step))
        util.save_checkpoint(
            {
                'global_step': global_step,
                'best_metric': best_metric,
                'metric': metric,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=logdir)

    run_tac = time.time()

    log.info("Generating a sample sound with the best model...")
    gen_tic = time.time()
    audio = best_model.generate(hparams.sample_size)
    gen_tac = time.time()
    # write audio to the tensorboard
    log.info("{} epochs with batchsize:{}, total time passed:{} ".format(
        hparams.num_epochs, hparams.batch_size, run_tac - run_tic))
    log.info("Sample size: {}, generation time: {}".format(
        hparams.sample_size, gen_tac - gen_tic))
    writer.add_audio('final/wav', audio, global_step, hparams.sample_rate)
            "xcosx": n_iter * np.cos(n_iter),
            "arctanx": np.arctan(n_iter)
        }, n_iter)
    x = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(x, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)  # Tensor
        # writer.add_image('astronaut', skimage.data.astronaut(), n_iter) # numpy
        # writer.add_image('imread',
        # skimage.io.imread('screenshots/audio.png'), n_iter) # numpy
        x = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            # sound amplitude should be in [-1, 1]
            x[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) /
                          float(sample_rate))
        writer.add_audio('myAudio', x, n_iter)
        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
        writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter)
        for name, param in resnet18.named_parameters():
            if 'bn' not in name:
                writer.add_histogram(name, param, n_iter)
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                            np.random.rand(100),
                            n_iter)  # needs tensorboard 0.4RC or later
        writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts,
                                false_positive_counts, true_negative_counts,
                                false_negative_counts, precision, recall,
                                n_iter)
# export scalar data to JSON for external processing
writer.export_scalars_to_json(
    "./all_scalars.json")  # run from the PyTorch_Tutorial/Code directory
Example #14
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, waveglow_config, dist_config,
          data_config, train_config, **kwargs):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    iteration_start = iteration
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=train_config.get('dataloader_num_workers', 8),
        shuffle=train_config.get('dataloader_shuffle', True),
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=train_config.get('dataloader_pin_memory', False),
        drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(os.path.dirname(output_directory),
                                            'tensorboard'),
                               filename_suffix='.tensorboard')

    with open(Path(output_directory).parent.joinpath('metadata', 'train.txt'),
              'wt',
              encoding='utf8') as fout:
        for line in trainset.audio_files:
            fpath = os.path.abspath(line)
            fout.write(f'{fpath}\n')

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(
                tqdm(train_loader, desc=f"Epoch-{epoch}", ncols=100)):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            # print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0) or (iteration
                                                           == iteration_start):
                if rank == 0:
                    checkpoint_path = "{}/waveglow-{:06d}.pt".format(
                        output_directory, iteration)
                    save_checkpoint(model,
                                    optimizer,
                                    learning_rate,
                                    iteration,
                                    checkpoint_path,
                                    waveglow_config=waveglow_config)

                    info_path = os.path.join(output_directory, 'info.yml')
                    checkpoint_info = {
                        'name': os.path.basename(checkpoint_path),
                        'iteration': iteration,
                        'loss': reduced_loss
                    }
                    keep_n_checkpoints(info_path, checkpoint_info, 5)

                    if with_tensorboard:
                        # outputs[0].shape: torch.Size([1, 8, 1000])
                        with torch.no_grad():
                            d = model.infer(mel.data[0].unsqueeze(0),
                                            sigma=sigma)
                            d = d.cpu().squeeze()
                            pred_audio = (d - d.min()) * 1.98 / (
                                d.max() - d.min()) - 0.99

                            logger.add_audio(
                                "generated/iteration-{}.wav".format(iteration),
                                pred_audio,
                                iteration,
                                sample_rate=trainset.sampling_rate,
                            )

                            true_audio = audio.data[0].squeeze()
                            logger.add_audio(
                                "original/iteration-{}.wav".format(iteration),
                                true_audio,
                                iteration,
                                sample_rate=trainset.sampling_rate,
                            )

                            # inspect the spectrogram for an intuitive view of the generated speech
                            mel_output = trainset.get_mel(pred_audio.cpu())
                            logger.add_image(
                                "generated/iteration-{}.png".format(iteration),
                                plot_spectrogram_to_numpy(
                                    mel_output.data.cpu().numpy()),
                                iteration,
                                dataformats='HWC')

                            mel_input = mel.data[0]
                            logger.add_image(
                                "original/iteration-{}.png".format(iteration),
                                plot_spectrogram_to_numpy(
                                    mel_input.data.cpu().numpy()),
                                iteration,
                                dataformats='HWC')

            iteration += 1
Example #15
class Trainer:
    """
    Generalized training helper class.

    This class removes repetitive boilerplate from a typical training pipeline.
    Most models share a similar overall training loop; what differs between
    them is mainly the forward pass. To adapt this class to a new case, an
    engineer only needs to override the forward function.

    Args:
        model: a main model to be saved and to be forwarded
        optimizer: optimizer module
        train_dataset: dataset on train phase
        valid_dataset: dataset on validation phase
        max_step: maximum iteration step
        valid_max_step: maximum iteration steps on each validation time.
        save_interval: save and validate interval (in iteration)
        log_interval: log interval (in iteration)
        save_dir: base directory to save checkpoints and logs
        save_prefix: a prefix to categorize each experiment
        grad_clip: scalars to clamp gradients
        grad_norm: maximum norm of gradients to be clipped
        pretrained_path: specific file path of checkpoint
        sr: sampling rate
        scheduler: learning rate scheduler

    Examples::
        class MyTrainer(Trainer):

            def forward(self, input: torch.tensor, target: torch.tensor, is_logging: bool):
                # forward model
                out = self.model(input)

                # calc your own loss
                loss = calc_loss(out, target)

                # build meta for logging
                meta = {
                    'loss': (loss.item(), LogType.SCALAR),
                    'out': (out[0], LogType.PLOT)
                }
                return loss, meta
    """
    def __init__(self,
                 model: nn.Module,
                 optimizer: torch.optim.Optimizer,
                 train_dataset,
                 valid_dataset,
                 max_step: int,
                 valid_max_step: int,
                 save_interval: int,
                 log_interval: int,
                 save_dir: str,
                 save_prefix: str = 'save',
                 grad_clip: float = 0.0,
                 grad_norm: float = 0.0,
                 pretrained_path: str = None,
                 sr: int = None,
                 scheduler: torch.optim.lr_scheduler._LRScheduler = None):

        # save project info
        self.pretrained_trained = pretrained_path

        # model
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler

        # log how many parameters in the model
        n_params = sum(p.numel() for p in self.model.parameters()
                       if p.requires_grad)
        log('Model {} was loaded. Total {} params.'.format(
            self.model.__class__.__name__, n_params))

        # adopt repeating function on datasets
        self.train_dataset = self.repeat(train_dataset)
        self.valid_dataset = self.repeat(valid_dataset)

        # save parameters
        self.step = 0
        if sr:
            self.sr = sr
        else:
            self.sr = SAMPLE_RATE
        self.max_step = max_step
        self.save_interval = save_interval
        self.log_interval = log_interval
        self.save_dir = save_dir
        self.save_prefix = save_prefix
        self.grad_clip = grad_clip
        self.grad_norm = grad_norm
        self.valid_max_step = valid_max_step

        # make dirs
        self.log_dir = os.path.join(save_dir, 'logs', self.save_prefix)
        self.model_dir = os.path.join(save_dir, 'models')
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)

        self.writer = SummaryWriter(log_dir=self.log_dir, flush_secs=10)

        # load previous checkpoint
        # set seed
        self.seed = None
        self.load()

        if not self.seed:
            self.seed = np.random.randint(np.iinfo(np.int32).max)
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
            torch.cuda.manual_seed(self.seed)

        # load pretrained model
        if self.step == 0 and pretrained_path:
            self.load_pretrained_model()

        # valid loss
        self.best_valid_loss = np.finfo(np.float32).max
        self.cur_best_valid_loss = self.best_valid_loss
        self.save_valid_loss = np.finfo(np.float32).max

    @abc.abstractmethod
    def forward(self,
                *inputs,
                is_logging: bool = False) -> Tuple[torch.Tensor, Dict]:
        """
        :param inputs: Loaded Data Points from Speech Loader
        :param is_logging: log or not
        :return: Loss Tensor, Log Dictionary
        """
        raise NotImplementedError

    def run(self) -> float:
        try:
            # training loop
            for i in range(self.step + 1, self.max_step + 1):

                # update step
                self.step = i

                # logging
                if i % self.save_interval == 1:
                    log('------------- TRAIN step : %d -------------' % i)

                # do training step
                if self.scheduler is not None:
                    self.scheduler.step(i)
                self.model.train()
                self.train(i)

                # save model
                if i % self.save_interval == 0:
                    log('------------- VALID step : %d -------------' % i)
                    # valid
                    self.model.eval()
                    self.validate(i)
                    # save model checkpoint file
                    self.save(i)

        except KeyboardInterrupt:
            log('Training was cancelled!')

        return self.best_valid_loss

    def clip_grad(self):
        if self.grad_clip:
            for p in self.model.parameters():
                if p.grad is not None:
                    p.grad = p.grad.clamp(-self.grad_clip, self.grad_clip)
        if self.grad_norm:
            torch.nn.utils.clip_grad_norm_(
                [p for p in self.model.parameters() if p.requires_grad],
                self.grad_norm)

    def train(self, step: int) -> None:

        # update model
        self.optimizer.zero_grad()

        # flag for logging
        log_flag = step % self.log_interval == 0

        # forward model
        loss, meta = self.forward(*to_device(next(self.train_dataset)),
                                  log_flag)

        # check for NaN loss
        if loss != loss:
            log('step {}: NaN loss encountered, skipping update'.format(step))
            return

        loss.backward()
        self.clip_grad()
        self.optimizer.step()

        # logging
        if log_flag:
            # console logging
            self.console_log('train', meta, step)
            # tensorboard logging
            self.tensorboard_log('train', meta, step)

    def validate(self, step: int):

        loss = 0.
        stat = defaultdict(float)

        for i in range(self.valid_max_step):

            # forward model
            with torch.no_grad():
                batch_loss, meta = self.forward(
                    *to_device(next(self.valid_dataset)), True)
                loss += batch_loss

            # update stat
            for key, (value, log_type) in meta.items():
                if log_type == LogType.SCALAR:
                    stat[key] += value

            # console logging of this step
            if (i + 1) % self.log_interval == 0:
                self.console_log('valid', meta, i + 1)

        meta_non_scalar = {
            key: (value, log_type)
            for key, (value, log_type) in meta.items()
            if not log_type == LogType.SCALAR
        }
        self.tensorboard_log('valid', meta_non_scalar, step)

        # averaging stat
        loss /= self.valid_max_step
        for key in stat.keys():
            stat[key] = stat[key] / self.valid_max_step

        # update best valid loss
        if loss < self.best_valid_loss:
            self.best_valid_loss = loss

        # console logging of total stat
        msg = 'step {} / total stat'.format(step)
        for key, value in sorted(stat.items()):
            msg += '\t{}: {:.6f}'.format(key, value)
        log(msg)

        # tensor board logging of scalar stat
        for key, value in stat.items():
            self.writer.add_scalar('valid/{}'.format(key),
                                   value,
                                   global_step=step)

    @property
    def save_name(self):
        if isinstance(self.model, nn.parallel.DataParallel):
            module = self.model.module
        else:
            module = self.model
        return self.save_prefix + '/' + module.__class__.__name__

    def load(self, load_optim: bool = True):
        # make name
        save_name = self.save_name

        # save path
        save_path = os.path.join(self.model_dir, save_name)

        # get latest file
        check_files = glob.glob(os.path.join(save_path, '*'))
        if check_files:
            # load latest state dict
            latest_file = max(check_files, key=os.path.getctime)
            state_dict = torch.load(latest_file)
            if 'seed' in state_dict:
                self.seed = state_dict['seed']
            # load model
            if isinstance(self.model, nn.DataParallel):
                self.model.module.load_state_dict(
                    get_loadable_checkpoint(state_dict['model']))
            else:
                self.model.load_state_dict(
                    get_loadable_checkpoint(state_dict['model']))
            if load_optim:
                self.optimizer.load_state_dict(state_dict['optim'])
            if self.scheduler is not None:
                self.scheduler.load_state_dict(state_dict['scheduler'])
            self.step = state_dict['step']
            log('checkpoint \'{}\' is loaded. previous step={}'.format(
                latest_file, self.step))
        else:
            log('No checkpoint found in {}; skipping network load.'.format(
                save_path))

    def save(self, step: int):

        # state dict
        state_dict = get_loadable_checkpoint(self.model.state_dict())

        # train
        state_dict = {
            'step': step,
            'model': state_dict,
            'optim': self.optimizer.state_dict(),
            'pretrained_step': step,
            'seed': self.seed
        }
        if self.scheduler is not None:
            state_dict.update({'scheduler': self.scheduler.state_dict()})

        # save for training
        save_name = self.save_name

        save_path = os.path.join(self.model_dir, save_name)
        os.makedirs(save_path, exist_ok=True)
        torch.save(state_dict,
                   os.path.join(save_path, 'step_{:06d}.chkpt'.format(step)))

        # save best
        if self.best_valid_loss != self.cur_best_valid_loss:
            save_path = os.path.join(self.model_dir, save_name + '.best.chkpt')
            torch.save(state_dict, save_path)
            self.cur_best_valid_loss = self.best_valid_loss

        # logging
        log('step %d / saved model.' % step)

    def load_pretrained_model(self):
        assert os.path.exists(
            self.pretrained_trained), 'Pretrained model path does not exist!'
        self.model.load_state_dict(
            get_loadable_checkpoint(
                torch.load(self.pretrained_trained)['model']))

    def console_log(self, tag: str, meta: Dict[str, Any], step: int):
        # console logging
        msg = '{}\t{:06d} it'.format(tag, step)
        for key, (value, log_type) in sorted(meta.items()):
            if log_type == LogType.SCALAR:
                msg += '\t{}: {:.6f}'.format(key, value)
        log(msg)

    def tensorboard_log(self, tag: str, meta: Dict[str, Any], step: int):
        for key, (value, log_type) in meta.items():
            if log_type == LogType.IMAGE:
                self.writer.add_image('{}/{}'.format(tag, key),
                                      imshow_to_buf(to_numpy(value)),
                                      global_step=step)
            elif log_type == LogType.AUDIO:
                self.writer.add_audio('{}/{}'.format(tag, key),
                                      to_numpy(value),
                                      global_step=step,
                                      sample_rate=self.sr)
            elif log_type == LogType.SCALAR:
                self.writer.add_scalar('{}/{}'.format(tag, key),
                                       value,
                                       global_step=step)
            elif log_type == LogType.PLOT:
                self.writer.add_image('{}/{}'.format(tag, key),
                                      plot_to_buf(to_numpy(value)),
                                      global_step=step)

    @staticmethod
    def repeat(iterable):
        while True:
            for x in iterable:
                yield x
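The repeat helper above is what lets train() and validate() call next() on the datasets indefinitely; a self-contained sketch of the same pattern with a plain DataLoader:

import torch
from torch.utils.data import DataLoader, TensorDataset

def repeat(iterable):
    while True:
        for x in iterable:
            yield x

loader = DataLoader(TensorDataset(torch.randn(10, 4)), batch_size=2)
stream = repeat(loader)
for _ in range(8):           # more steps than one pass over the loader
    (batch,) = next(stream)  # the generator re-enters the loader automatically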
Example #16
    writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)

    writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter),
                                             'xcosx': n_iter * np.cos(n_iter),
                                             'arctanx': np.arctan(n_iter)}, n_iter)

    dummy_img = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)

        dummy_audio = torch.zeros(sample_rate * 2)
        for i in range(dummy_audio.size(0)):
            # amplitude of sound should be in [-1, 1]
            dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
        writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate)

        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)

        for name, param in resnet18.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)

        # needs tensorboard 0.4RC or later
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter)

dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]

features = images.view(100, 784)
writer.add_embedding(features, metadata=label, label_img=images.unsqueeze(1))
Example #17
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          num_workers=2):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # HACK: setup separate training and eval sets
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======

    print("Creating dataloaders with " + str(num_workers) + " workers")
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=(eval_sampler is None),
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))

    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()

                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))

                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()

                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                optimizer.step()

                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()

                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)

                iteration += 1
                train_pbar.update(1)

        # Eval
        model.eval()
        torch.cuda.empty_cache()

        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None

                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)

            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)

            # log audio samples to tensorboard
            tensorboard_audio_generated = model.infer(tensorboard_mel)
            for i in range(0, 5):
                ta = tensorboard_audio[i].cpu().numpy()
                tag = tensorboard_audio_generated[i].cpu().numpy()
                logger_eval.add_audio("sample " + str(i) + "/orig",
                                      ta,
                                      epoch,
                                      sample_rate=data_config['sampling_rate'])
                logger_eval.add_audio("sample " + str(i) + "/gen",
                                      tag,
                                      epoch,
                                      sample_rate=data_config['sampling_rate'])
            logger_eval.flush()
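get_gpu_stats is not defined in this snippet; a plausible implementation using pynvml (an assumption, not this repository's actual helper) could look like:

import pynvml

def get_gpu_stats(device_index=0):
    # Returns (memory used in MiB, GPU utilization in percent) for one device.
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    pynvml.nvmlShutdown()
    return mem.used / (1024 ** 2), util.gpu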
Example #18
                    "Generative adversarial network or variational auto-encoder.",
                    "Please call Stella.",
                    "Some have accepted this as a miracle without any physical explanation.",
                ]
                for idx, sent in enumerate(sentences):
                    wav, attn = eval_model(
                        dv3, sent, replace_pronounciation_prob, min_level_db,
                        ref_level_db, power, n_iter, win_length, hop_length,
                        preemphasis)
                    wav_path = os.path.join(
                        state_dir, "waveform",
                        "eval_sample_{:09d}.wav".format(global_step))
                    sf.write(wav_path, wav, sample_rate)
                    writer.add_audio(
                        "eval_sample_{}".format(idx),
                        wav,
                        global_step,
                        sample_rate=sample_rate)
                    attn_path = os.path.join(
                        state_dir, "alignments",
                        "eval_sample_attn_{:09d}.png".format(global_step))
                    plot_alignment(attn, attn_path)
                    writer.add_image(
                        "eval_sample_attn{}".format(idx),
                        cm.viridis(attn),
                        global_step,
                        dataformats="HWC")

            # save checkpoint
            if global_step % save_interval == 0:
                io.save_parameters(ckpt_dir, global_step, dv3, optim)
Example #19
def train(args, hp, hp_str, logger, vocoder):
    os.makedirs(os.path.join(hp.train.chkpt_dir, args.name), exist_ok=True)
    os.makedirs(os.path.join(args.outdir, args.name), exist_ok=True)
    os.makedirs(os.path.join(args.outdir, args.name, "assets"), exist_ok=True)
    device = torch.device("cuda" if hp.train.ngpu > 0 else "cpu")

    dataloader = loader.get_tts_dataset(hp.data.data_dir, hp.train.batch_size,
                                        hp)
    validloader = loader.get_tts_dataset(hp.data.data_dir, 1, hp, True)

    idim = len(valid_symbols)
    odim = hp.audio.num_mels
    model = fastspeech.FeedForwardTransformer(idim, odim, hp)
    # set torch device
    model = model.to(device)
    print("Model is loaded ...")
    githash = get_commit_hash()
    if args.checkpoint_path is not None:
        if os.path.exists(args.checkpoint_path):
            logger.info("Resuming from checkpoint: %s" % args.checkpoint_path)
            checkpoint = torch.load(args.checkpoint_path)
            model.load_state_dict(checkpoint["model"])
            optimizer = get_std_opt(
                model,
                hp.model.adim,
                hp.model.transformer_warmup_steps,
                hp.model.transformer_lr,
            )
            optimizer.load_state_dict(checkpoint["optim"])
            global_step = checkpoint["step"]

            if hp_str != checkpoint["hp_str"]:
                logger.warning(
                    "New hparams differ from the checkpoint; using the new ones.")

            if githash != checkpoint["githash"]:
                logger.warning(
                    "Code might be different: git hash is different.")
                logger.warning("%s -> %s" % (checkpoint["githash"], githash))

        else:
            print("Checkpoint does not exixts")
            global_step = 0
            return None
    else:
        print("New Training")
        global_step = 0
        optimizer = get_std_opt(
            model,
            hp.model.adim,
            hp.model.transformer_warmup_steps,
            hp.model.transformer_lr,
        )

    print("Batch Size :", hp.train.batch_size)

    num_params(model)

    os.makedirs(os.path.join(hp.train.log_dir, args.name), exist_ok=True)
    writer = SummaryWriter(os.path.join(hp.train.log_dir, args.name))
    model.train()
    forward_count = 0
    # print(model)
    for epoch in range(hp.train.epochs):
        start = time.time()
        running_loss = 0
        j = 0

        pbar = tqdm.tqdm(dataloader, desc="Loading train data")
        for data in pbar:
            global_step += 1
            x, input_length, y, _, out_length, _, dur, e, p = data
            # x : [batch, num_char], input_length : [batch], y : [batch, T_in, num_mel]
            # stop_token : [batch, T_in], out_length : [batch]

            loss, report_dict = model(
                x.cuda(),
                input_length.cuda(),
                y.cuda(),
                out_length.cuda(),
                dur.cuda(),
                e.cuda(),
                p.cuda(),
            )
            loss = loss.mean() / hp.train.accum_grad
            running_loss += loss.item()

            loss.backward()

            # update parameters
            forward_count += 1
            j = j + 1
            if forward_count != hp.train.accum_grad:
                continue
            forward_count = 0
            step = global_step

            # compute the gradient norm to check if it is normal or not
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       hp.train.grad_clip)
            logging.debug("grad norm={}".format(grad_norm))
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.step()
            optimizer.zero_grad()

            if step % hp.train.summary_interval == 0:
                pbar.set_description(
                    "Average Loss %.04f Loss %.04f | step %d" %
                    (running_loss / j, loss.item(), step))

                for r in report_dict:
                    for k, v in r.items():
                        if k is not None and v is not None:
                            if "cupy" in str(type(v)):
                                v = v.get()
                            if "cupy" in str(type(k)):
                                k = k.get()
                            writer.add_scalar("main/{}".format(k), v, step)

            if step % hp.train.validation_step == 0:

                for valid in validloader:
                    x_, input_length_, y_, _, out_length_, ids_, dur_, e_, p_ = valid
                    model.eval()
                    with torch.no_grad():
                        loss_, report_dict_ = model(
                            x_.cuda(),
                            input_length_.cuda(),
                            y_.cuda(),
                            out_length_.cuda(),
                            dur_.cuda(),
                            e_.cuda(),
                            p_.cuda(),
                        )

                        mels_ = model.inference(x_[-1].cuda())  # [T, num_mel]

                    model.train()
                    for r in report_dict_:
                        for k, v in r.items():
                            if k is not None and v is not None:
                                if "cupy" in str(type(v)):
                                    v = v.get()
                                if "cupy" in str(type(k)):
                                    k = k.get()
                                writer.add_scalar("validation/{}".format(k), v,
                                                  step)

                    mels_ = mels_.T  # Out: [num_mels, T]
                    writer.add_image(
                        "melspectrogram_target_{}".format(ids_[-1]),
                        plot_spectrogram_to_numpy(
                            y_[-1].T.data.cpu().numpy()[:, :out_length_[-1]]),
                        step,
                        dataformats="HWC",
                    )
                    writer.add_image(
                        "melspectrogram_prediction_{}".format(ids_[-1]),
                        plot_spectrogram_to_numpy(mels_.data.cpu().numpy()),
                        step,
                        dataformats="HWC",
                    )

                    # print(mels.unsqueeze(0).shape)

                    audio = generate_audio(
                        mels_.unsqueeze(0), vocoder
                    )  # selecting the last data point to match mel generated above
                    audio = audio.cpu().float().numpy()
                    audio = audio / (audio.max() - audio.min())  # scale peak-to-peak range to 1

                    writer.add_audio(
                        tag="generated_audio_{}".format(ids_[-1]),
                        snd_tensor=torch.Tensor(audio),
                        global_step=step,
                        sample_rate=hp.audio.sample_rate,
                    )

                    _, target = read_wav_np(
                        hp.data.wav_dir + f"{ids_[-1]}.wav",
                        sample_rate=hp.audio.sample_rate,
                    )

                    writer.add_audio(
                        tag=" target_audio_{}".format(ids_[-1]),
                        snd_tensor=torch.Tensor(target),
                        global_step=step,
                        sample_rate=hp.audio.sample_rate,
                    )

            if step % hp.train.save_interval == 0:
                avg_p, avg_e, avg_d = evaluate(hp, validloader, model)
                writer.add_scalar("evaluation/Pitch Loss", avg_p, step)
                writer.add_scalar("evaluation/Energy Loss", avg_e, step)
                writer.add_scalar("evaluation/Dur Loss", avg_d, step)
                save_path = os.path.join(
                    hp.train.chkpt_dir,
                    args.name,
                    "{}_fastspeech_{}_{}k_steps.pyt".format(
                        args.name, githash, step // 1000),
                )

                torch.save(
                    {
                        "model": model.state_dict(),
                        "optim": optimizer.state_dict(),
                        "step": step,
                        "hp_str": hp_str,
                        "githash": githash,
                    },
                    save_path,
                )
                logger.info("Saved checkpoint to: %s" % save_path)
        print("Time taken for epoch {} is {} sec\n".format(
            epoch + 1, int(time.time() - start)))
    writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)

    writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter),
                                             'xcosx': n_iter * np.cos(n_iter),
                                             'arctanx': np.arctan(n_iter)}, n_iter)

    dummy_img = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter) # later entries overwrite earlier ones

        dummy_audio = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            # amplitude of sound should be in [-1, 1]
            dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
        writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate) # later entries overwrite earlier ones

        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) # later entries did not overwrite earlier ones, why?
        # so text entries are never overwritten???

        for name, param in resnet18.named_parameters():
            writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
            # parameters from every iteration are all saved, because the tags never repeat!
            # a distribution view is also created by default

        # needs tensorboard 0.4RC or later
        writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter)
        # it seems later entries do not overwrite earlier ones here either, why?
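
A note on the overwrite questions above, as a minimal sketch (assuming tensorboardX or torch.utils.tensorboard): event entries are keyed by (tag, global_step), so re-logging a tag at a new step appends a point to that tag's series rather than destroying history; only writing the same tag at the same step replaces an entry in the UI. The log directory below is hypothetical.

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/tag_semantics_demo')  # hypothetical log dir
for step in range(3):
    # three points on one curve, not three overwrites
    writer.add_scalar('demo/loss', 1.0 / (step + 1), step)
    # one text entry per step, all of them kept
    writer.add_text('demo/note', 'logged at step %d' % step, step)
writer.close()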

dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
class PPTS_Solver(Solver):
    def __init__(self, config, args, mode='train'):
        super(PPTS_Solver, self).__init__(config, args)

        self.phn_hat_dir = config['path']['ppr']['output_dir']
        self.phn_dim = config['text']['phn_dim']
        self.n_fft = config['audio']['n_fft']

        self.lr = config['model']['ppts']['lr']
        self.optimizer_type = config['model']['ppts']['type']
        self.betas = [float(x) for x in config['model']['ppts']['betas'].split(',')]
        self.weight_decay = config['model']['ppts']['weight_decay']

        self.spk_id = args.spk_id
        if self.spk_id in ('', None):
            print("[Error] A spk_id must be given to init a PPTS solver")
            exit()

        self.mode = mode
        self.model, self.criterion = self.build_model()
        if mode == 'train':
            self.optimizer = self.build_optimizer()
            self.train_loader = self.get_dataset(self.train_meta_path)
            self.eval_loader = self.get_dataset(self.eval_meta_path)
            self.log_dir = os.path.join(config['path']['ppts']['log_dir'], self.spk_id)
            self.writer = SummaryWriter(self.log_dir)
        elif mode == 'test':
            self.test_loader = self.get_dataset(self.test_meta_path)

        self.save_dir = os.path.join(config['path']['ppts']['save_dir'], self.spk_id)
        self.ppts_output_dir = os.path.join(config['path']['ppts']['output_dir'], self.spk_id)

        # attempt to load or set gs and epoch to 0
        self.load_ckpt()

    def get_dataset(self, meta_path):
        dataset = PPTS_VCTKDataset(
            feat_dir=self.feat_dir,
            meta_path=meta_path,
            dict_path=self.dict_path,
            phn_hat_dir=self.phn_hat_dir,
            spk_id=self.spk_id,
            mode=self.mode
        )
        dataloader = DataLoader(
            dataset, batch_size=self.batch_size,
            shuffle=(self.mode == 'train'),
            num_workers=self.num_workers,
            collate_fn=dataset._collate_fn, pin_memory=True
        )
        return dataloader

    def build_model(self):
        ppts = PPTS(
            input_dim=self.phn_dim, output_dim=(self.n_fft//2)+1,
            dropout_rate=0.5, prenet_hidden_dims=[256, 128], K=16,
            conv1d_bank_hidden_dim=128, conv1d_projections_hidden_dim=256,
            gru_dim=256
        )
        ppts = ppts.to(self.device)
        criterion = torch.nn.L1Loss()
        return ppts, criterion

    def build_optimizer(self):
        optimizer = getattr(torch.optim, self.optimizer_type)
        optimizer = optimizer(
            self.model.parameters(), lr=self.lr,
            betas=self.betas, weight_decay=self.weight_decay
        )
        return optimizer

    def save_ckpt(self):
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        checkpoint_path = os.path.join(
            self.save_dir, "model.ckpt-{}.pt".format(self.global_step)
        )
        torch.save({
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "global_step": self.global_step,
            "epoch": self.epoch
        }, checkpoint_path)
        print("Checkpoint model.ckpt-{}.pt saved.".format(self.global_step))
        with open(os.path.join(self.save_dir, "checkpoint"), "w") as f:
            f.write("model.ckpt-{}".format(self.global_step))
        return

    def load_ckpt(self):
        checkpoint_list = os.path.join(self.save_dir, 'checkpoint')
        if os.path.exists(checkpoint_list):
            checkpoint_filename = open(checkpoint_list).readline().strip()
            checkpoint_path = os.path.join(self.save_dir, "{}.pt".format(checkpoint_filename))
            if self.use_gpu:
                checkpoint = torch.load(checkpoint_path)
            else:
                checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
            self.model.load_state_dict(checkpoint['model'])
            if self.mode == 'train':
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.global_step = checkpoint['global_step']
            self.epoch = checkpoint['epoch']
            print("Checkpoint model.ckpt-{}.pt loaded.".format(self.global_step))
        else:
            self.global_step = 0
            self.epoch = 0
            print("Start training with new parameters.")
        return

    def train(self):
        epoch_loss = 0.0
        self.model.train()
        for idx, (_, phn_hat_batch, mag_batch) in enumerate(self.train_loader):
            phn_hat_batch, mag_batch = phn_hat_batch.to(self.device), mag_batch.to(self.device)

            # Forward
            self.optimizer.zero_grad()
            mag_hat = self.model(phn_hat_batch)
            #loss = self.criterion(mag_hat, mag_batch)
            loss = 0.5 * self.criterion(mag_hat, mag_batch) + \
                   0.5 * self.criterion(mag_hat[:,:,:200], mag_batch[:,:,:200])
            epoch_loss += loss.item()

            # Logging
            # The number of batches per epoch is small, so we only log at the epoch level
            '''
            if self.global_step % self.log_interval == 0:
                print(
                    '[GS=%3d, epoch=%d, idx=%3d] loss: %.6f' % \
                    (self.global_step+1, self.epoch+1, idx+1, loss.item())
                )
            if self.global_step % self.summ_interval == 0:
                self.writer.add_scalar('train/training_loss', loss.item(), self.global_step)
            '''

            # Backward
            loss.backward()
            self.optimizer.step()

            # Saving or not
            self.global_step += 1
            if self.global_step % self.ckpt_interval == 0:
                self.save_ckpt()

        epoch_loss /= (idx+1)
        print('[epoch %d] training_loss: %.6f' % (self.epoch, epoch_loss))
        self.writer.add_scalar('train/epoch_training_loss', epoch_loss, self.epoch)
        self.writer.add_image('train/phn_hat',
            torch.t(phn_hat_batch[0]).detach().cpu().numpy(),
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'train/mag_gt', torch.t(mag_batch[0]).detach().cpu().numpy()[::-1,:],
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'train/mag_hat', torch.t(mag_hat[0]).detach().cpu().numpy()[::-1,:],
            self.epoch, dataformats='HW'
        )
        self.writer.add_audio(
            'train/audio_gt', self.ap.inv_spectrogram(mag_batch[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.writer.add_audio(
            'train/audio_hat', self.ap.inv_spectrogram(mag_hat[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.epoch += 1

        return

    def eval(self):
        eval_loss = 0.0
        self.model.eval()
        with torch.no_grad():
            for idx, (_, phn_hat_batch, mag_batch) in enumerate(self.eval_loader):
                phn_hat_batch, mag_batch = phn_hat_batch.to(self.device), mag_batch.to(self.device)
                mag_hat = self.model(phn_hat_batch)
                loss = self.criterion(mag_hat, mag_batch)
                eval_loss += loss.item()

                if idx % 100 == 0:  # idx starts at 0, so this breaks after the first batch
                    break

        eval_loss /= (idx+1)
        print('[eval %d] eval_loss: %.6f' % (self.epoch, eval_loss))

        self.writer.add_scalar('eval/eval_loss', eval_loss, self.epoch)
        self.writer.add_image(
            'eval/mag_gt', torch.t(mag_batch[0]).detach().cpu().numpy()[::-1,:],
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'eval/mag_hat', torch.t(mag_hat[0]).detach().cpu().numpy()[::-1,:],
            self.epoch, dataformats='HW'
        )
        self.writer.add_audio(
            'eval/audio_gt', self.ap.inv_spectrogram(mag_batch[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.writer.add_audio(
            'eval/audio_hat', self.ap.inv_spectrogram(mag_hat[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )

        return
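
The solver above reconstructs waveforms from predicted magnitude spectrograms via self.ap.inv_spectrogram. As a rough sketch of what such an inverse typically involves (assuming librosa; the real AudioProcessor may also undo normalization and pre-emphasis, and the parameter values here are illustrative):

import librosa

def invert_magnitude(mag, n_fft=1024, hop_length=256, power=1.5, n_iter=60):
    """Griffin-Lim phase estimation for a linear magnitude spectrogram.

    mag: array of shape [n_fft // 2 + 1, T]; raising it to `power`
    sharpens the spectrogram before inversion, as many TTS pipelines do.
    """
    return librosa.griffinlim(mag ** power, n_iter=n_iter,
                              hop_length=hop_length, win_length=n_fft)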
Exemplo n.º 22
0
class Runner():
    ''' Handler for the complete pre-training process of upstream models '''
    def __init__(self, args, config, dataloader, ckpdir):

        self.device = torch.device('cuda') if (
            args.gpu and torch.cuda.is_available()) else torch.device('cpu')
        if torch.cuda.is_available(): print('[Runner] - CUDA is available!')
        self.model_kept = []
        self.global_step = 1
        self.log = SummaryWriter(ckpdir)

        self.args = args
        self.config = config
        self.dataloader = dataloader
        self.ckpdir = ckpdir

        # optimizer
        self.learning_rate = float(config['optimizer']['learning_rate'])
        self.warmup_proportion = config['optimizer']['warmup_proportion']
        self.gradient_accumulation_steps = config['optimizer'][
            'gradient_accumulation_steps']
        self.gradient_clipping = config['optimizer']['gradient_clipping']

        # Training details
        self.apex = config['runner']['apex']
        self.total_steps = config['runner']['total_steps']
        self.log_step = config['runner']['log_step']
        self.save_step = config['runner']['save_step']
        self.duo_feature = config['runner']['duo_feature']
        self.max_keep = config['runner']['max_keep']

        # model
        self.transformer_config = config['transformer']
        self.dr = config['transformer']['downsample_rate']
        self.dual_transformer = config['transformer'].get('dual_transformer', False)
        self.wave_transformer = config['transformer'].get('wave_transformer', False)
        if 'online' in config:
            print(f'[Runner] - Using features extracted on-the-fly')
            self.input_dim, self.output_dim = [
                feat.size(-1)
                for feat in self.dataloader.dataset.preprocessor()
            ]
        else:
            if self.wave_transformer:
                raise ValueError(
                    'Wave transformer needs to be run with online feature extraction!'
                )
            print(f'[Runner] - Using features pre-extracted and saved')
            self.input_dim = self.transformer_config['input_dim']
            self.output_dim = 1025 if self.duo_feature else None  # output dim is the same as input dim if not using duo features

    def set_model(self):
        # build the Transformer model with speech prediction head
        if self.dual_transformer:
            print('[Runner] - Initializing Dual Transformer model...')
            model_config = DualTransformerConfig(self.config)
            self.model = DualTransformerForMaskedAcousticModel(
                model_config, self.input_dim, self.output_dim).to(self.device)
        else:
            if self.wave_transformer:
                print('[Runner] - Initializing Wave Transformer model...')
            else:
                print('[Runner] - Initializing Transformer model...')
            model_config = TransformerConfig(self.config)
            self.model = TransformerForMaskedAcousticModel(
                model_config, self.input_dim, self.output_dim).to(self.device)
        self.model.train()

        if self.args.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
            print('[Runner] - Multi-GPU training Enabled: ' +
                  str(torch.cuda.device_count()))
        print('[Runner] - Number of parameters: ' + str(
            sum(p.numel()
                for p in self.model.parameters() if p.requires_grad)))

        # Setup optimizer
        param_optimizer = list(self.model.named_parameters())

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]

        if 'type' not in self.config['optimizer']:
            self.config['optimizer']['type'] = 'adam'
        print('[Runner] - Optimizer: ' +
              ('apex Fused Adam' if self.apex
               else str(self.config['optimizer']['type'])))
        if self.apex:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if self.config['optimizer']['loss_scale'] == 0:
                self.optimizer = FP16_Optimizer(optimizer,
                                                dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(
                    optimizer,
                    static_loss_scale=self.config['optimizer']['loss_scale'])
            self.warmup_linear = WarmupLinearSchedule(
                warmup=self.warmup_proportion, t_total=self.total_steps)
        elif self.config['optimizer']['type'] == 'adam':
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      warmup=self.warmup_proportion,
                                      t_total=self.total_steps,
                                      schedule='warmup_linear')
        elif self.config['optimizer']['type'] in ('lamb', 'adamW'):
            self.optimizer = Lamb(
                optimizer_grouped_parameters,
                lr=self.learning_rate,
                warmup=self.warmup_proportion,
                t_total=self.total_steps,
                schedule='warmup_linear',
                adam=(self.config['optimizer']['type'] == 'adamW'),
                correct_bias=(self.config['optimizer']['type'] == 'adamW'))
        else:
            raise NotImplementedError()

    def save_model(self, name='states', to_path=None):
        if self.dual_transformer:
            all_states = {
                'SpecHead':
                self.model.SpecHead.state_dict() if not self.args.multi_gpu
                else self.model.module.SpecHead.state_dict()
            }
            if hasattr(self.model, 'SpecTransformer'):
                all_states['SpecTransformer'] = (
                    self.model.SpecTransformer.state_dict()
                    if not self.args.multi_gpu
                    else self.model.module.SpecTransformer.state_dict())
            if hasattr(self.model, 'SPE'):
                all_states['SPE'] = (self.model.SPE if not self.args.multi_gpu
                                     else self.model.module.SPE)
            if hasattr(self.model, 'PhoneticTransformer'):
                all_states['PhoneticTransformer'] = (
                    self.model.PhoneticTransformer.Transformer.state_dict()
                    if not self.args.multi_gpu
                    else self.model.module.PhoneticTransformer.Transformer.state_dict())
            if hasattr(self.model.PhoneticTransformer, 'PhoneRecognizer'):
                all_states['PhoneticLayer'] = (
                    self.model.PhoneticTransformer.PhoneRecognizer.state_dict()
                    if not self.args.multi_gpu
                    else self.model.module.PhoneticTransformer.PhoneRecognizer.state_dict())
            if hasattr(self.model, 'SpeakerTransformer'):
                all_states['SpeakerTransformer'] = (
                    self.model.SpeakerTransformer.Transformer.state_dict()
                    if not self.args.multi_gpu
                    else self.model.module.SpeakerTransformer.Transformer.state_dict())
            if hasattr(self.model.SpeakerTransformer, 'SpeakerRecognizer'):
                all_states['SpeakerLayer'] = (
                    self.model.SpeakerTransformer.SpeakerRecognizer.state_dict()
                    if not self.args.multi_gpu
                    else self.model.module.SpeakerTransformer.SpeakerRecognizer.state_dict())
        else:
            all_states = {
                'SpecHead':
                self.model.SpecHead.state_dict() if not self.args.multi_gpu
                else self.model.module.SpecHead.state_dict(),
                'Transformer':
                self.model.Transformer.state_dict() if not self.args.multi_gpu
                else self.model.module.Transformer.state_dict(),
            }

        all_states['Optimizer'] = self.optimizer.state_dict()
        all_states['Global_step'] = self.global_step
        all_states['Settings'] = {'Config': self.config, 'Paras': self.args}

        if to_path is None:
            new_model_path = '{}/{}-{}.ckpt'.format(self.ckpdir, name,
                                                    self.global_step)
        else:
            new_model_path = to_path

        torch.save(all_states, new_model_path)
        self.model_kept.append(new_model_path)

        if len(self.model_kept) >= self.max_keep:
            os.remove(self.model_kept[0])
            self.model_kept.pop(0)

    def up_sample_frames(self, spec, return_first=False):
        if len(spec.shape) != 3:
            spec = spec.unsqueeze(0)
            assert (len(spec.shape) == 3
                    ), 'Input should have acoustic feature of shape BxTxD'
        # spec shape: [batch_size, sequence_length // downsample_rate, output_dim * downsample_rate]
        spec_flatten = spec.view(spec.shape[0], spec.shape[1] * self.dr,
                                 spec.shape[2] // self.dr)
        if return_first: return spec_flatten[0]
        return spec_flatten  # spec_flatten shape: [batch_size, sequence_length * downsample_rate, output_dim // downsample_rate]

    def down_sample_frames(self, spec):
        left_over = spec.shape[1] % self.dr
        if left_over != 0: spec = spec[:, :-left_over, :]
        spec_stacked = spec.view(spec.shape[0], spec.shape[1] // self.dr,
                                 spec.shape[2] * self.dr)
        return spec_stacked

    def process_data(self, spec):
        """Process training data for the masked acoustic model"""
        with torch.no_grad():

            assert (
                len(spec) == 5
            ), 'dataloader should return (spec_masked, pos_enc, mask_label, attn_mask, spec_stacked)'
            # Unpack and un-bucket: bucketing gives the acoustic features shape [1, B, T, D]
            spec_masked = spec[0].squeeze(0)
            pos_enc = spec[1].squeeze(0)
            mask_label = spec[2].squeeze(0)
            attn_mask = spec[3].squeeze(0)
            spec_stacked = spec[4].squeeze(0)

            spec_masked = spec_masked.to(device=self.device)
            if pos_enc.dim() == 3:
                # pos_enc: (batch_size, seq_len, hidden_size)
                # GPU memory needs (batch_size * seq_len * hidden_size)
                pos_enc = pos_enc.float().to(device=self.device)
            elif pos_enc.dim() == 2:
                # pos_enc: (seq_len, hidden_size)
                # GPU memory only needs (seq_len * hidden_size) even after expansion
                pos_enc = pos_enc.float().to(device=self.device).expand(
                    spec_masked.size(0), *pos_enc.size())
            mask_label = mask_label.bool().to(device=self.device)
            attn_mask = attn_mask.float().to(device=self.device)
            spec_stacked = spec_stacked.to(device=self.device)

        return spec_masked, pos_enc, mask_label, attn_mask, spec_stacked  # (x, pos_enc, mask_label, attention_mask, y)

    def process_dual_data(self, spec):
        """Process training data for the dual masked acoustic model"""
        with torch.no_grad():

            assert (
                len(spec) == 6
            ), 'dataloader should return (time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked)'
            # Unpack and un-bucket: bucketing gives the acoustic features shape [1, B, T, D]
            time_masked = spec[0].squeeze(0)
            freq_masked = spec[1].squeeze(0)
            pos_enc = spec[2].squeeze(0)
            mask_label = spec[3].squeeze(0)
            attn_mask = spec[4].squeeze(0)
            spec_stacked = spec[5].squeeze(0)

            time_masked = time_masked.to(device=self.device)
            freq_masked = freq_masked.to(device=self.device)
            if pos_enc.dim() == 3:
                # pos_enc: (batch_size, seq_len, hidden_size)
                # GPU memory needs (batch_size * seq_len * hidden_size)
                pos_enc = pos_enc.float().to(device=self.device)
            elif pos_enc.dim() == 2:
                # pos_enc: (seq_len, hidden_size)
                # GPU memory only needs (seq_len * hidden_size) even after expansion
                pos_enc = pos_enc.float().to(device=self.device).expand(
                    time_masked.size(0), *pos_enc.size())
            mask_label = mask_label.bool().to(device=self.device)
            attn_mask = attn_mask.float().to(device=self.device)
            spec_stacked = spec_stacked.to(device=self.device)

        return time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked  # (x, pos_enc, mask_label, attention_mask, y)

    def train(self):
        ''' Self-Supervised Pre-Training of Transformer Model'''

        pbar = tqdm(total=self.total_steps)
        while self.global_step <= self.total_steps:

            progress = tqdm(self.dataloader, desc="Iteration")

            step = 0
            loss_val = 0
            for batch in progress:
                batch_is_valid, *batch = batch
                try:
                    if self.global_step > self.total_steps: break
                    if not batch_is_valid: continue
                    step += 1

                    if self.dual_transformer:
                        time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked = self.process_dual_data(
                            batch)
                        loss, pred_spec = self.model(time_masked, freq_masked,
                                                     pos_enc, mask_label,
                                                     attn_mask, spec_stacked)
                    else:
                        spec_masked, pos_enc, mask_label, attn_mask, spec_stacked = self.process_data(
                            batch)
                        loss, pred_spec = self.model(spec_masked, pos_enc,
                                                     mask_label, attn_mask,
                                                     spec_stacked)

                    # Accumulate Loss
                    if self.gradient_accumulation_steps > 1:
                        loss = loss / self.gradient_accumulation_steps
                    if self.apex and self.args.multi_gpu:
                        raise NotImplementedError
                    elif self.apex:
                        self.optimizer.backward(loss)
                    elif self.args.multi_gpu:
                        loss = loss.sum()
                        loss.backward()
                    else:
                        loss.backward()
                    loss_val += loss.item()

                    # Update
                    if (step + 1) % self.gradient_accumulation_steps == 0:
                        if self.apex:
                            # modify learning rate with the special warm-up BERT uses;
                            # if config.apex is False, BertAdam is used and handles this automatically
                            lr_this_step = self.learning_rate * self.warmup_linear.get_lr(
                                self.global_step, self.warmup_proportion)
                            for param_group in self.optimizer.param_groups:
                                param_group['lr'] = lr_this_step

                        # Step
                        grad_norm = torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.gradient_clipping)
                        if math.isnan(grad_norm):
                            print(
                                '[Runner] - Error : grad norm is NaN @ step ' +
                                str(self.global_step))
                        else:
                            self.optimizer.step()
                        self.optimizer.zero_grad()

                        if self.global_step % self.log_step == 0:
                            # Log
                            self.log.add_scalar('lr',
                                                self.optimizer.get_lr()[0],
                                                self.global_step)
                            self.log.add_scalar('loss', (loss_val),
                                                self.global_step)
                            self.log.add_scalar('gradient norm', grad_norm,
                                                self.global_step)
                            progress.set_description("Loss %.4f" % (loss_val))

                        if self.global_step % self.save_step == 0:
                            self.save_model('states')

                            # tensorboard log
                            if self.dual_transformer: spec_masked = time_masked
                            spec_list = [spec_masked, pred_spec, spec_stacked]
                            name_list = ['mask_spec', 'pred_spec', 'true_spec']
                            if self.dual_transformer:
                                spec_list.insert(1, freq_masked)
                                name_list.insert(1, 'mask_freq')
                                name_list[0] = 'mask_time'

                            for i in range(len(spec_list)):
                                if i == 0 and self.wave_transformer:
                                    self.log.add_audio(
                                        name_list[0],
                                        spec_list[0][0].data.cpu().numpy(),
                                        self.global_step,
                                        self.config['online']['sample_rate'])
                                    continue
                                spec = self.up_sample_frames(spec_list[i][0],
                                                             return_first=True)
                                spec = plot_spectrogram_to_numpy(
                                    spec.data.cpu().numpy())
                                self.log.add_image(name_list[i], spec,
                                                   self.global_step)

                        loss_val = 0
                        pbar.update(1)
                        self.global_step += 1

                except RuntimeError as e:
                    if 'CUDA out of memory' in str(e):
                        print('CUDA out of memory at step: ', self.global_step)
                        torch.cuda.empty_cache()
                        self.optimizer.zero_grad()
                    else:
                        raise

        pbar.close()
        self.log.close()
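
The up_sample_frames/down_sample_frames helpers above are pure reshapes that trade sequence length against feature width. A minimal standalone sketch of that round trip (function names are illustrative):

import torch

def stack_frames(spec: torch.Tensor, dr: int) -> torch.Tensor:
    """[B, T, D] -> [B, T // dr, D * dr]; trailing frames that do not
    fill a full group of `dr` are dropped first."""
    left_over = spec.shape[1] % dr
    if left_over:
        spec = spec[:, :-left_over, :]
    return spec.reshape(spec.shape[0], spec.shape[1] // dr, spec.shape[2] * dr)

def unstack_frames(spec: torch.Tensor, dr: int) -> torch.Tensor:
    """Inverse of stack_frames: [B, T, D] -> [B, T * dr, D // dr]."""
    return spec.reshape(spec.shape[0], spec.shape[1] * dr, spec.shape[2] // dr)

x = torch.randn(2, 10, 8)
assert unstack_frames(stack_frames(x, 2), 2).shape == (2, 10, 8)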
Exemplo n.º 23
0
def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()

        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()
        # init input
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        pbar = tqdm(range(args.max_len))
        for i in pbar:
            dec_slf_mask = get_triu_tensor(
                mel_input.numpy(), mel_input.numpy()).astype(np.float32)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

        mag_pred = model_vocoder(postnet_pred)

        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        wav = _ljspeech_processor.inv_spectrogram(
            fluid.layers.transpose(
                fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_enc):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_enc_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_dec):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_dec_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        if not os.path.exists(args.sample_path):
            os.mkdir(args.sample_path)
        write(
            os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
            wav)
    writer.close()
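
Both TransformerTTS examples drive the decoder autoregressively: the mel buffer starts as a single zero frame and grows by the model's last predicted frame each step. A framework-agnostic sketch of that loop, with step_fn standing in for the model call (names and sizes are illustrative):

import numpy as np

def greedy_decode(step_fn, num_mels=80, max_len=100):
    mel = np.zeros((1, 1, num_mels), dtype=np.float32)
    for _ in range(max_len):
        next_frame = step_fn(mel)                        # [1, 1, num_mels]
        mel = np.concatenate([mel, next_frame], axis=1)  # append last prediction
    return mel

# dummy model that emits a random frame, just to show the shapes
mel = greedy_decode(lambda m: np.random.rand(1, 1, 80).astype(np.float32), max_len=5)
assert mel.shape == (1, 6, 80)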
Exemplo n.º 24
0
class TensorboardLogger(object):
    def __init__(self, log_dir, model_name):
        self.model_name = model_name
        self.writer = SummaryWriter(log_dir)
        self.train_stats = {}
        self.eval_stats = {}

    def tb_model_weights(self, model, step):
        layer_num = 1
        for name, param in model.named_parameters():
            if param.numel() == 1:
                self.writer.add_scalar(
                    "layer{}-{}/value".format(layer_num, name), param.max(),
                    step)
            else:
                self.writer.add_scalar(
                    "layer{}-{}/max".format(layer_num, name), param.max(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/min".format(layer_num, name), param.min(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/mean".format(layer_num, name), param.mean(),
                    step)
                self.writer.add_scalar(
                    "layer{}-{}/std".format(layer_num, name), param.std(),
                    step)
                self.writer.add_histogram(
                    "layer{}-{}/param".format(layer_num, name), param, step)
                self.writer.add_histogram(
                    "layer{}-{}/grad".format(layer_num, name), param.grad,
                    step)
            layer_num += 1

    def dict_to_tb_scalar(self, scope_name, stats, step):
        for key, value in stats.items():
            self.writer.add_scalar('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_figure(self, scope_name, figures, step):
        for key, value in figures.items():
            self.writer.add_figure('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key),
                                      value,
                                      step,
                                      sample_rate=sample_rate)
            except Exception:
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats,
                               step)

    def tb_train_epoch_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats,
                               step)

    def tb_train_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures,
                               step)

    def tb_train_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step,
                               sample_rate)

    def tb_eval_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step)

    def tb_eval_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step)

    def tb_eval_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step,
                               sample_rate)

    def tb_test_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step,
                               sample_rate)

    def tb_test_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step)

    def tb_add_text(self, title, text, step):
        self.writer.add_text(title, text, step)
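
A hypothetical way to drive the logger above; the run directory, model name, and values are illustrative, and the audio values are 1-D numpy arrays as add_audio expects:

import numpy as np

logger = TensorboardLogger('runs/tts_experiment', 'Tacotron')
logger.tb_train_iter_stats(100, {'loss': 0.42, 'grad_norm': 1.3})
logger.tb_train_audios(100, {'sample': np.zeros(22050)}, sample_rate=22050)
logger.tb_add_text('config', 'batch_size=32, lr=1e-3', 0)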
Exemplo n.º 25
0
    def do_summary(self, m_summary, sess, itr):

        valid_path = self.valid_path
        clean_speech = self.clean_speech
        clean_speech = utils.identity_trans(clean_speech)

        noisy_speech = self.noisy_speech
        noisy_speech = utils.identity_trans(noisy_speech)

        temp_dir = self.temp_dir
        name = self.name
        logs_dir = self.logs_dir

        writer = SummaryWriter(log_dir=self.logs_dir + '/summary')

        summary_dr = dr.DataReader(temp_dir, '', valid_path["norm_path"], dist_num=config.dist_num, is_training=False,
                                   is_shuffle=False)
        pred = []

        while True:

            summary_inputs, summary_labels = summary_dr.next_batch(config.batch_size)

            feed_dict = {m_summary.inputs: summary_inputs, m_summary.labels: summary_labels, m_summary.keep_prob: 1.0}

            pred_temp = sess.run(m_summary.pred, feed_dict=feed_dict)

            pred.append(pred_temp)

            if summary_dr.file_change_checker():
                phase = summary_dr.phase[0]

                lpsd = np.expand_dims(
                    np.reshape(np.concatenate(pred, axis=0), [-1, config.freq_size])[0:phase.shape[0], :],
                    axis=2)

                mean, std = summary_dr.norm_process(valid_path["norm_path"] + '/norm_noisy.mat')

                lpsd = np.squeeze((lpsd * std * 1.18) + mean)  # denorm

                recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)), np.transpose(phase, (1, 0)),
                                               win_size=config.win_size, win_step=config.win_step, fs=config.fs)

                # plt.plot(recon_speech)
                # plt.show()
                # lab = np.reshape(np.asarray(lab), [-1, 1])
                summary_dr.reader_initialize()
                break

        # write summary

        if itr == config.summary_step:
            writer.close()
            self.noisy_measure = utils.se_eval(clean_speech,
                                          np.squeeze(noisy_speech), float(config.fs))
            summary_fname = tf.summary.text(name + '_filename', tf.convert_to_tensor(self.noisy_dir))

            if name == 'train':

                config_str = "<br>sampling frequency: %d</br>" \
                             "<br>window step: %d ms</br>" \
                             "<br>window size: %d ms</br>" \
                             "<br>fft size: %d</br>" \
                             "<br>learning rate: %f</br><br>learning rate decay: %.4f</br><br>learning" \
                             " rate decay frequency: %.4d</br>" \
                             "<br>dropout rate: %.4f</br><br>max epoch:" \
                             " %.4e</br><br>batch size: %d</br><br>model type: %s</br>"\
                             % (config.fs, (config.win_step/config.fs*1000), (config.win_size/config.fs*1000),
                                config.nfft, config.lr, config.lrDecayRate, config.lrDecayFreq, config.keep_prob,
                                config.max_epoch, config.batch_size, config.mode)

                summary_config = tf.summary.text(name + '_configuration', tf.convert_to_tensor(config_str))

                code_list = []
                read_flag = False

                with open('./lib/trnmodel.py', 'r') as f:
                    while True:
                        line = f.readline()
                        if "def inference(self, inputs):" in line:
                            read_flag = True

                        if "return fm" in line:
                            code_list.append('<br>' + line.replace('\n', '') + '</br>')
                            break

                        if read_flag:
                            code_list.append('<br>' + line.replace('\n', '') + '</br>')

                code_list = "<pre>" + "".join(code_list) + "</pre>"

                summary_model = tf.summary.text('train_model', tf.convert_to_tensor(code_list))

                summary_op = tf.summary.merge([summary_fname, summary_config, summary_model])
            else:
                summary_op = tf.summary.merge([summary_fname])

            with tf.Session() as sess:
                summary_writer = tf.summary.FileWriter(logs_dir + '/summary/text')
                text = sess.run(summary_op)
                summary_writer.add_summary(text, 1)
            summary_writer.close()

            writer = SummaryWriter(log_dir=logs_dir + '/summary')

            writer.add_audio(name + '_audio_ref' + '/clean', clean_speech
                             /np.max(np.abs(clean_speech)), itr,
                             sample_rate=config.fs)
            writer.add_audio(name + '_audio_ref' + '/noisy', noisy_speech
                             /np.max(np.abs(noisy_speech)), itr,
                             sample_rate=config.fs)
            clean_S = get_spectrogram(clean_speech)
            noisy_S = get_spectrogram(noisy_speech)

            writer.add_image(name + '_spectrogram_ref' + '/clean', clean_S, itr)  # image_shape = (C, H, W)
            writer.add_image(name + '_spectrogram_ref' + '/noisy', noisy_S, itr)  # image_shape = (C, H, W)

        enhanced_measure = utils.se_eval(clean_speech, recon_speech, float(config.fs))
        writer.add_scalars(name + '_speech_quality' + '/pesq', {'enhanced': enhanced_measure['pesq'],
                                                                'ref': self.noisy_measure['pesq']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/stoi', {'enhanced': enhanced_measure['stoi'],
                                                                'ref': self.noisy_measure['stoi']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/lsd', {'enhanced': enhanced_measure['lsd'],
                                                               'ref': self.noisy_measure['lsd']}, itr)
        writer.add_scalars(name + '_speech_quality' + '/ssnr', {'enhanced': enhanced_measure['ssnr'],
                                                                'ref': self.noisy_measure['ssnr']}, itr)

        writer.add_audio(name + '_audio_enhanced' + '/enhanced', recon_speech/np.max(np.abs(recon_speech)),
                         itr, sample_rate=config.fs)
        enhanced_S = get_spectrogram(recon_speech)
        writer.add_image(name + '_spectrogram_enhanced' + '/enhanced', enhanced_S, itr)  # image_shape = (C, H, W)
        writer.close()
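
The grouped writes above rely on add_scalars, which overlays several named curves under one parent tag so enhanced and reference metrics share a chart. A minimal standalone sketch (log directory and numbers are made up):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/quality_demo')  # hypothetical log dir
for itr, (enh, ref) in enumerate([(2.8, 1.9), (3.0, 1.9), (3.1, 1.9)]):
    # one chart with two overlaid curves: 'enhanced' and 'ref'
    writer.add_scalars('speech_quality/pesq', {'enhanced': enh, 'ref': ref}, itr)
writer.close()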
Exemplo n.º 26
0
        # save sampled audio at the beginning of each epoch
        if i == 0:
            fake_speech = generator(fixed_test_noise, z)
            fake_speech_data = fake_speech.data.cpu().numpy()  # convert to numpy array
            fake_speech_data = emph.de_emphasis(fake_speech_data,
                                                emph_coeff=0.95)

            for idx in range(4):  # select four samples
                generated_sample = fake_speech_data[idx]
                gen_fname = test_noise_filenames[idx]
                filepath = os.path.join(
                    gen_data_path, '{}_e{}.wav'.format(gen_fname, epoch + 1))
                # write to file
                wavfile.write(filepath, sample_rate, generated_sample.T)
                # write for tensorboard log
                tbwriter.add_audio(gen_fname, generated_sample.T, total_steps,
                                   sample_rate)

        # increment total steps
        total_steps += 1

    # save the model parameters for each epoch
    g_path = os.path.join(models_path, 'generator-{}.pkl'.format(epoch + 1))
    d_path = os.path.join(models_path,
                          'discriminator-{}.pkl'.format(epoch + 1))
    torch.save(generator.state_dict(), g_path)
    torch.save(discriminator.state_dict(), d_path)
tbwriter.close()
print('Finished Training!')
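
The emph.de_emphasis call above is project-specific, but a standard de-emphasis filter, i.e. the inverse of y[n] = x[n] - a*x[n-1] pre-emphasis, can be sketched with scipy (the coefficient 0.95 matches the call above):

import numpy as np
from scipy.signal import lfilter

def pre_emphasis(x: np.ndarray, coeff: float = 0.95) -> np.ndarray:
    # y[n] = x[n] - coeff * x[n-1]
    return np.append(x[0], x[1:] - coeff * x[:-1])

def de_emphasis(x: np.ndarray, coeff: float = 0.95) -> np.ndarray:
    # inverse IIR filter: y[n] = x[n] + coeff * y[n-1]
    return lfilter([1.0], [1.0, -coeff], x)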
Exemplo n.º 27
0
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()
    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)

    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesis with cbhg
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
          cfg['audio']['sr'], wav)

    # synthesis with griffin-lim
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])

    write(os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
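
Both TransformerTTS examples convert attention matrices to RGBA images with a matplotlib colormap before logging. A minimal sketch of that conversion (the function name is illustrative):

import numpy as np
from matplotlib import cm

def attention_to_image(attn: np.ndarray) -> np.ndarray:
    """Map a [T_dec, T_enc] attention matrix with values in [0, 1] to an
    HWC uint8 RGBA image for writer.add_image(..., dataformats='HWC')."""
    return np.uint8(cm.viridis(attn) * 255)

img = attention_to_image(np.random.rand(50, 60))
assert img.shape == (50, 60, 4)  # viridis returns RGBA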
Exemplo n.º 28
0
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed, checkpoint_path, log_dir, ema_decay=0.9999):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
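    # The EMA object keeps a shadow copy of every trainable weight; the
    # training loop below nudges each shadow toward its live value after each
    # update (typically shadow = decay * shadow + (1 - decay) * param), and
    # the shadow weights are persisted alongside the checkpoint.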

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model,
                                                                      optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1, pin_memory=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                reduced_loss = loss.data[0]
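            # loss.data[0] is the pre-0.4 PyTorch idiom for extracting a
            # Python scalar; on current PyTorch this would be loss.item().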
            loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate, iteration,
                                    checkpoint_path, ema, wavenet_config)
            if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    model_eval = nv_wavenet.NVWaveNet(**(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid/predicted_audio_{}".format(j),
                                         predicted_audio,
                                         iteration,
                                         22050)
                        audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio,
                                         iteration,
                                         22050)
                        if low_memory:
                            torch.cuda.empty_cache()
            iteration += 1
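The loop above assumes an ExponentialMovingAverage helper exposing register, update, and a shadow dict of averaged parameters. A minimal sketch compatible with those call sites (the project's actual class may differ):

class ExponentialMovingAverage:
    """Minimal EMA of parameter tensors, keyed by parameter name."""

    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # Start the running average from a copy of the current value.
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow <- decay * shadow + (1 - decay) * value
        self.shadow[name] = (self.decay * self.shadow[name]
                             + (1.0 - self.decay) * value)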
Example No. 29
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    this_dir = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(this_dir, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(sig, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        best_loss = float('inf')
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                               patience=c.lr_patience, verbose=True)
    epoch_time = 0
    for epoch in range(start_epoch, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) // c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables (pre-0.4 PyTorch API); the linear
            # target must not be volatile, since volatile propagates through
            # the graph and would silently disable backprop.
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            input_lengths_var = torch.autograd.Variable(
                torch.LongTensor(sorted_lengths))
            if use_cuda:
                input_lengths_var = input_lengths_var.cuda()
            mel_output, linear_output, alignments = model.forward(
                text_input_var, mel_spec_var, input_lengths=input_lengths_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            # previous formulation, kept for reference:
            # linear_loss = torch.abs(linear_output - linear_spec_var)
            # linear_loss = 0.5 * torch.mean(linear_loss) \
            #     + 0.5 * torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                  linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter + 1,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0]),
                                   ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:

                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except Exception:
                    print("\n > Error adding audio signal to TensorBoard!")
                    print(audio_signal.max())
                    print(audio_signal.min())


        # average linear loss over the epoch
        avg_epoch_loss = (progbar.sum_values['linear_loss'][0]
                          / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
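lr_decay is not defined in this snippet. Tacotron trainers commonly use a Noam-style warmup-then-decay schedule, so a plausible sketch under that assumption (the warmup length of 4000 steps is illustrative) is:

def lr_decay(init_lr, global_step, warmup_steps=4000.0):
    # Linear warmup for the first warmup_steps, then inverse
    # square-root decay, as in the Transformer/Tacotron recipes.
    step = global_step + 1.0
    return init_lr * warmup_steps ** 0.5 * min(step * warmup_steps ** -1.5,
                                               step ** -0.5)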
Example No. 30
def collect_to_tfevents(
    input_dir: Path,
    output_dir: Optional[Path],
    filename_suffix: str,
    audio_tag_format: str,
    diff_tag: str,
    iteration_format: str,
    remove_exist: bool,
    expected_wave_dir: Optional[Path],
):
    if output_dir is None:
        output_dir = input_dir

    if remove_exist:
        for p in output_dir.glob(f"*tfevents*{filename_suffix}"):
            p.unlink()

    flag_calc_diff = expected_wave_dir is not None

    summary_writer = SummaryWriter(logdir=str(output_dir),
                                   filename_suffix=filename_suffix)

    diffs: DefaultDict[int, List[float]] = defaultdict(list)
    for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums),
                  desc=input_dir.stem):
        if p.is_dir():
            continue

        if "tfevents" in p.name:
            continue

        rp = p.relative_to(input_dir)
        iteration = int(iteration_format.format(p=p, rp=rp))

        # audio
        if p.suffix in [".wav"]:
            wave, sr = librosa.load(str(p), sr=None)
            summary_writer.add_audio(
                tag=audio_tag_format.format(p=p, rp=rp),
                snd_tensor=wave,
                sample_rate=sr,
                global_step=iteration,
            )

        # diff
        if flag_calc_diff and p.name.endswith("_woc.wav"):
            wave_id = p.name[:-len("_woc.wav")]
            expected = expected_wave_dir.joinpath(f"{wave_id}.wav")

            diff = calc_mcd(path1=expected, path2=p)
            diffs[iteration].append(diff)

    if flag_calc_diff:
        for iteration, values in sorted(diffs.items()):
            summary_writer.add_scalar(
                tag=diff_tag,
                scalar_value=numpy.mean(values),
                global_step=iteration,
            )

    summary_writer.close()
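The helper _to_nums is used above only as a sort key and is not shown. A typical natural-sort implementation, assuming iteration numbers are embedded in the file names, would be:

import re
from pathlib import Path
from typing import Tuple

def _to_nums(path: Path) -> Tuple[int, ...]:
    # Natural-sort key: extract every digit run so "step_100.wav" sorts
    # after "step_20.wav" instead of lexicographically.
    return tuple(int(s) for s in re.findall(r"\d+", str(path)))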