Example #1
    def __init__(self, n_tiers: int, layers: List[int], hidden_size: int,
                 gmm_size: int, freq: int):
        """
        Args:
            n_tiers (int): number of tiers the model is composed of
            layers (List[int]): list with the layers of every tier
            hidden_size (int): parameter for the hidden state of the Delayed Stack Layers
                               and other sizes
            gmm_size (int): number of mixture components of the GMM
            freq (int): size of the frequency axis of the spectrogram to generate. See note in the
                        documentation of the file.
        """
        super(MelNet, self).__init__()

        self.n_tiers = n_tiers
        self.layers = layers
        self.hidden_size = hidden_size
        self.gmm_size = gmm_size

        # Roughly half of the tiers split the frequency axis in half, so freq must be at
        # least 2 ** (n_tiers / 2) to survive all the splits
        assert freq >= 2 ** (self.n_tiers / 2), "Size of frequency axis is too small to be " \
                                                "generated with the number of tiers " \
                                                "of this model"
        self.freq = freq

        self.tiers = nn.ModuleList([
            Tier1(
                tier=1,
                n_layers=layers[0],
                hidden_size=hidden_size,
                gmm_size=gmm_size,
                # Calculate size of FREQ dimension for this tier
                freq=tierutil.get_size_freqdim_of_tier(
                    n_mels=self.freq, n_tiers=self.n_tiers, tier=1))
        ] + [
            Tier(
                tier=tier_idx,
                n_layers=layers[tier_idx],
                hidden_size=hidden_size,
                gmm_size=gmm_size,
                # Calculate size of FREQ dimension for this tier
                freq=tierutil.get_size_freqdim_of_tier(
                    n_mels=self.freq, n_tiers=self.n_tiers, tier=tier_idx + 1))
            for tier_idx in range(1, n_tiers)
        ])
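For reference, a minimal construction sketch. The import path and every hyperparameter value below are illustrative assumptions, not the repository's defaults:

from src.model import MelNet  # hypothetical import path

model = MelNet(n_tiers=6,
               layers=[12, 5, 4, 3, 2, 2],  # one entry per tier
               hidden_size=512,
               gmm_size=10,
               freq=256)  # must satisfy freq >= 2 ** (n_tiers / 2)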
Example #2
    def sample(self, hp: HParams, synthesisp: HParams, timestamp: str, logger: logging.Logger,
               n_samples: int, length: int) -> torch.Tensor:
        """
        Generates n_samples of audio of the given length.

        Args:
            hp (HParams): parameters. Parameters needed are hp.device.
            synthesisp (HParams): parameters for performing the synthesis. Parameters needed are
                                  synthesisp.output_path to save the spectrogram generated at
                                  each tier.
            timestamp (str): information that completely identifies this run (synthesis).
            logger (logging.Logger): logger used to report the progress of the synthesis.
            n_samples (int): number of samples to generate.
            length (int): length of the samples to generate (in timesteps).

        Returns:
            spectrograms (torch.Tensor): samples of audio in spectrogram representation.
                                   Shape: [B=n_samples, FREQ=self.freq, FRAMES=length].
        """
        assert length >= 2 ** (
                self.n_tiers / 2), "Length is too short to be generated with the " \
                                   "number of tiers of this model."

        # Initially, the spectrogram (x) to generate does not exist.
        x = None

        # --- TIER 1 ----
        # The spectrogram is generated autoregressively, frame by frame (one timestep at a time).
        logger.info(f"Starting Tier 1/{self.n_tiers}")
        freq_of_tier1 = tierutil.get_size_freqdim_of_tier(n_mels=self.freq, n_tiers=self.n_tiers,
                                                          tier=1)
        length_of_tier1 = tierutil.get_size_timedim_of_tier(timesteps=length, n_tiers=self.n_tiers,
                                                            tier=1)
        for i in range(0, length_of_tier1):
            logger.info(f"Tier 1/{self.n_tiers} - Frame {i}/{length_of_tier1}")
            if x is None:
                # If the spectrogram has not been initialized, we initialize it to an initial
                # frame of all zeros
                x = torch.zeros((n_samples, freq_of_tier1, 1), device=hp.device)
            else:
                # If the spectrogram has already been initialized, we have already computed some
                # frames. We concatenate a new frame initialized to all zeros which will be replaced
                # pixel by pixel by the new values
                # We change the shape from [B, FREQ, FRAMES] to [B, FREQ, FRAMES+1] by adding a new
                # frame
                x = torch.cat(
                    [x, torch.zeros((n_samples, freq_of_tier1, 1), device=hp.device)], dim=-1)

            # Inside a frame, the spectrogram is generated autoregressively, frequency by
            # frequency
            for j in range(0, freq_of_tier1):
                # we generate the parameters for the whole spectrogram (across all samples)
                mu_hat, std_hat, pi_hat = self.tiers[0](x)
                # with the parameters we sample the values of the next spectrogram
                # (across all samples)
                new_spectrogram = sample_gmm_batch(mu_hat, std_hat, pi_hat)
                # but we only use the value of the new pixel currently being generated
                # (across all samples) since the spectrogram is generated autoregressively
                x[:, j, i] = new_spectrogram[:, j, i]

        # Save spectrogram generated at tier1
        torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1.pt")

        # --- TIER >1 ---
        for tier_idx in range(2, self.n_tiers + 1):
            temp_x = None  # temporary spectrogram that will be generated by this tier
            # The spectrogram is generated autoregressively, frame by frame (one timestep at
            # a time).
            logger.info(f"Starting Tier {tier_idx}/{self.n_tiers}")
            freq_of_tierX = tierutil.get_size_freqdim_of_tier(n_mels=self.freq,
                                                              n_tiers=self.n_tiers,
                                                              tier=tier_idx)
            length_of_tierX = tierutil.get_size_timedim_of_tier(timesteps=length,
                                                                n_tiers=self.n_tiers,
                                                                tier=tier_idx)
            for i in range(0, length_of_tierX):
                logger.info(f"Tier {tier_idx}/{self.n_tiers} - Frame {i}/{length_of_tierX}")
                if temp_x is None:
                    # If the spectrogram of this tier has not been initialized, we initialize
                    # it to an initial frame of all zeros
                    temp_x = torch.zeros((n_samples, freq_of_tierX, 1), device=hp.device)
                else:
                    # If the spectrogram of this tier has already been initialized, we have already
                    # computed some frames. We concatenate a new frame initialized to all zeros
                    # which will be replaced pixel by pixel by the new values
                    # We change shape from [B, FREQ, FRAMES] to [B, FREQ, FRAMES+1] by adding a new
                    # frame
                    temp_x = torch.cat(
                        [temp_x, torch.zeros((n_samples, freq_of_tierX, 1), device=hp.device)],
                        dim=-1)

                # Inside a frame, the spectrogram is generated autoregressively, frequency by
                # frequency
                for j in range(0, freq_of_tierX):
                    # we generate the parameters for the whole spectrogram (across all samples)
                    mu_hat, std_hat, pi_hat = self.tiers[tier_idx - 1](temp_x, x)
                    # with the parameters we sample the values of the next spectrogram
                    # (across all samples)
                    new_spectrogram = sample_gmm_batch(mu_hat, std_hat, pi_hat)
                    # but we only use the value of the new pixel currently being generated
                    # (across all samples) since the spectrogram is generated autoregressively
                    temp_x[:, j, i] = new_spectrogram[:, j, i]

            # After generating the spectrogram of this tier, we interleave it to put it together
            # with the spectrogram generated by previous tiers. In the next iteration, this will
            # be the input to condition the next tier
            x = tierutil.interleave(temp_x, x, tier_idx)
            x = x.to(hp.device)
            # Save the spectrogram generated at this tier and the interleaved result so far
            torch.save(temp_x, f"{synthesisp.output_path}/{timestamp}_tier{tier_idx}.pt")
            torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1-tier{tier_idx}.pt")

        return x
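The helper sample_gmm_batch used above is not shown in these examples. A minimal sketch of what it might do, assuming mu_hat, std_hat and pi_hat have shape [B, FREQ, FRAMES, K] with K the number of mixture components (the repository's actual implementation may differ):

import torch

def sample_gmm_batch(mu, std, pi):
    # Pick one mixture component per pixel according to the mixture weights pi
    component = torch.distributions.Categorical(probs=pi).sample()  # [B, FREQ, FRAMES]
    component = component.unsqueeze(-1)                             # [B, FREQ, FRAMES, 1]
    # Gather the mean and std of the chosen component for every pixel
    mu_sel = torch.gather(mu, dim=-1, index=component).squeeze(-1)
    std_sel = torch.gather(std, dim=-1, index=component).squeeze(-1)
    # Sample element-wise from the selected Gaussians
    return torch.normal(mu_sel, std_sel)  # [B, FREQ, FRAMES]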
Example #3
def train_tier(args: argparse.Namespace, hp: HParams, tier: int,
               extension_architecture: str, timestamp: str,
               tensorboardwriter: TensorboardWriter,
               logger: logging.Logger) -> None:
    """
    Trains one tier of MelNet.

    Args:
        args (argparse.Namespace): parameters to set up the training. At least, args must contain:
                                   args = {"path_config": ...,
                                           "tier": ...,
                                           "checkpoint_path": ...}
        hp (HParams): hyperparameters for the model and other parameters (training, dataset, ...)
        tier (int): number of the tier to train.
        extension_architecture (str): information about the network's architecture of this run
                                      (training) to identify the logs and weights of the model.
        timestamp (str): information that completely identifies this run (training).
        tensorboardwriter (TensorboardWriter): to log information about training to tensorboard.
        logger (logging.Logger): to log general information about the training of the model.
    """
    logger.info(f"Start training of tier {tier}/{hp.network.n_tiers}")

    # Setup the data ready to be consumed
    train_dataloader, test_dataloader, num_samples = get_dataloader(hp)

    # Setup tier
    # Calculate size of FREQ dimension for this tier
    tier_freq = tierutil.get_size_freqdim_of_tier(n_mels=hp.audio.mel_channels,
                                                  n_tiers=hp.network.n_tiers,
                                                  tier=tier)

    if tier == 1:
        model = Tier1(tier=tier,
                      n_layers=hp.network.layers[tier - 1],
                      hidden_size=hp.network.hidden_size,
                      gmm_size=hp.network.gmm_size,
                      freq=tier_freq)
    else:
        model = Tier(tier=tier,
                     n_layers=hp.network.layers[tier - 1],
                     hidden_size=hp.network.hidden_size,
                     gmm_size=hp.network.gmm_size,
                     freq=tier_freq)
    model = model.to(hp.device)
    model.train()
    parameters = model.parameters()

    # Setup loss criterion and optimizer
    criterion = GMMLoss()
    optimizer = torch.optim.RMSprop(params=parameters,
                                    lr=hp.training.lr,
                                    momentum=hp.training.momentum)

    # Check if training has to be resumed from previous checkpoint
    if args.checkpoint_path is not None:
        model, optimizer = resume_training(args, hp, tier, model, optimizer,
                                           logger)
    else:
        logger.info(
            f"Starting new training on dataset {hp.data.dataset} with configuration file "
            f"name {hp.name}")

    # Train the tier
    total_iterations = 0
    loss_logging = 0  # accumulated loss between logging iterations
    loss_save = 0  # accumulated loss between saving iterations
    prev_loss_onesample = 1e8  # used to compare between saving iterations and decide whether or not
    # to save the model

    gradients = []

    for epoch in range(hp.training.epochs):
        logger.info(f"Epoch: {epoch}/{hp.training.epochs} - Starting")
        for i, (waveform, utterance) in enumerate(train_dataloader):

            # 1.1 Transform waveform input to melspectrogram and apply preprocessing to normalize
            waveform = waveform.to(device=hp.device, non_blocking=True)
            spectrogram = transforms.wave_to_melspectrogram(waveform, hp)
            spectrogram = audio_normalizing.preprocessing(spectrogram, hp)
            # 1.2 Get input and output from the original spectrogram for this tier
            input_spectrogram, output_spectrogram = tierutil.split(
                spectrogram=spectrogram, tier=tier, n_tiers=hp.network.n_tiers)
            length_spectrogram = input_spectrogram.size(2)
            # if the item is too long, skip it and move on to the next one
            if length_spectrogram > 1000:
                continue

            # 2. Compute the model output
            if tier == 1:
                # generation is unconditional so there is only one input
                mu_hat, std_hat, pi_hat = model(spectrogram=input_spectrogram)
            else:
                # generation is conditional on the spectrogram generated by previous tiers
                mu_hat, std_hat, pi_hat = model(
                    spectrogram=output_spectrogram,
                    spectrogram_prev_tier=input_spectrogram)
            # gpumemory.stat_cuda("Forward")
            # 3. Calculate the loss
            loss = criterion(mu=mu_hat,
                             std=std_hat,
                             pi=pi_hat,
                             target=output_spectrogram)
            # gpumemory.stat_cuda("Loss")
            del spectrogram
            del mu_hat, std_hat, pi_hat

            # 3.1 Check if loss has exploded
            if torch.isnan(loss) or torch.isinf(loss):
                error_msg = f"Loss exploded at Epoch: {epoch}/{hp.training.epochs} - " \
                            f"Iteration: {i * hp.training.batch_size}/{num_samples}"
                logger.error(error_msg)
                raise Exception(error_msg)

            # 4. Compute gradients
            loss_cpu = loss.item()
            loss = loss / hp.training.accumulation_steps
            loss.backward()

            # 5. Perform backpropagation (using gradient accumulation so the effective batch
            # size is the same as in the paper)
            if (total_iterations + 1) % (hp.training.accumulation_steps /
                                         hp.training.batch_size) == 0:

                gradients.append(gradient_norm(model))
                avg_gradient = sum(gradients) / len(gradients)
                logger.info(f"Gradient norm: {gradients[-1]} - "
                            f"Avg gradient: {avg_gradient}")
                torch.nn.utils.clip_grad_norm_(parameters, 2200)
                optimizer.step()
                model.zero_grad()

            # 6. Logging and saving model
            loss_oneframe = loss_cpu / (length_spectrogram *
                                        hp.training.batch_size)
            loss_logging += loss_oneframe  # accumulated loss between logging iterations
            loss_save += loss_oneframe  # accumulated loss between saving iterations

            # 6.1 Save model (if it is better than the previous best checkpoint)
            if (total_iterations + 1) % hp.training.save_iterations == 0:
                # Calculate average loss of one sample of a batch
                loss_onesample = loss_save / hp.training.save_iterations
                # if loss_onesample of these iterations is lower, the tier is better and we save it
                if loss_onesample <= prev_loss_onesample:
                    path = f"{hp.training.dir_chkpt}/tier{tier}_{timestamp}_loss{loss_onesample:.2f}.pt"
                    torch.save(obj={
                        'dataset': hp.data.dataset,
                        'tier_idx': tier,
                        'hp': hp,
                        'epoch': epoch,
                        'iterations': i,
                        'total_iterations': total_iterations,
                        'tier': model.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, f=path)
                    logger.info(f"Model saved to: {path}")
                    prev_loss_onesample = loss_onesample
                loss_save = 0

            # 6.2 Logging
            if (total_iterations + 1) % hp.logging.log_iterations == 0:
                # Calculate average loss of one sample of a batch
                loss_onesample = loss_logging / hp.logging.log_iterations
                tensorboardwriter.log_training(hp, loss_onesample,
                                               total_iterations)
                logger.info(
                    f"Epoch: {epoch}/{hp.training.epochs} - "
                    f"Iteration: {i * hp.training.batch_size}/{num_samples} - "
                    f"Loss: {loss_onesample:.4f}")
                loss_logging = 0

            # 6.3 Evaluate
            if (total_iterations + 1) % hp.training.evaluation_iterations == 0:
                evaluation(hp, tier, test_dataloader, model, criterion, logger)
            total_iterations += 1

        # After finishing the epoch: save the model, hyperparameters and a final evaluation
        path = f"{hp.training.dir_chkpt}/tier{tier}_{timestamp}_epoch{epoch}_final.pt"
        torch.save(obj={
            'dataset': hp.data.dataset,
            'tier_idx': tier,
            'hp': hp,
            'epoch': epoch,
            # a final evaluation is run here and its result stored under 'iterations'
            'iterations': evaluation(hp, tier, test_dataloader, model, criterion, logger),
            'total_iterations': total_iterations,
            'tier': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }, f=path)
        logger.info(f"Model saved to: {path}")
        tensorboardwriter.log_end_training(hp=hp, loss=-1)
        logger.info("Finished training")
Example #4
    def sample(self, hp: HParams, synthesisp: HParams, timestamp: str,
               logger: logging.Logger, n_samples: int,
               length: int) -> torch.Tensor:
        """
        Generates n_samples of audio of the given length.

        Args:
            hp (HParams): parameters. Parameters needed are hp.device.
            synthesisp (HParams): parameters for performing the synthesis. Parameters needed are
                                  synthesisp.output_path to save the spectrogram generated at
                                  each tier.
            timestamp (str): information that completely identifies this run (synthesis).
            logger (logging.Logger): logger used to report the progress of the synthesis.
            n_samples (int): number of samples to generate.
            length (int): length of the samples to generate (in timesteps).

        Returns:
            spectrograms (torch.Tensor): samples of audio in spectrogram representation.
                                   Shape: [B=n_samples, FREQ=self.freq, FRAMES=length].
        """
        assert length >= 2 ** (
                self.n_tiers / 2), "Length is too short to be generated with the " \
                                   "number of tiers of this model."

        # Initially, the spectrogram (x) to generate does not exist.
        x = None

        # Load a spectrogram from the dataset
        from src.utils.training_batch import get_dataloader
        from src.dataprocessing import transforms as T
        from src.dataprocessing.audio_normalizing import preprocessing
        dataloader, _, _ = get_dataloader(hp)

        wave = None
        for i, (waveform, utterance) in enumerate(dataloader):
            if "building" in utterance[0]:
                print(utterance[0])
            if utterance[0] == "One building, Market Hall, was unavailable for November 22.":
                wave = waveform
                break

        if wave is None:
            logger.info("wave not found")
            return

        # dataiter = iter(dataloader)
        # wave, utterance = next(dataiter)
        waveform = wave.to(device=hp.device, non_blocking=True)
        spectrogram = T.wave_to_melspectrogram(waveform, hp)
        spectrogram = preprocessing(spectrogram, hp)

        # Split the spectrogram to get the spectrogram that would be the output of the first tier
        input_spectrogram, output_spectrogram = tierutil.split(
            spectrogram=spectrogram, tier=1, n_tiers=hp.network.n_tiers)

        # Use the spectrogram from the dataset as the output of the first tier
        x = output_spectrogram
        length = spectrogram.size(2)

        # Save the spectrogram used as the output of tier 1
        torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1.pt")

        # --- TIER >1 ---
        for tier_idx in range(2, self.n_tiers + 1):
            temp_x = None  # temporary spectrogram that will be generated by this tier
            # The spectrogram is generated autoregressively, frame by frame (one timestep at
            # a time).
            logger.info(f"Starting Tier {tier_idx}/{self.n_tiers}")
            freq_of_tierX = tierutil.get_size_freqdim_of_tier(
                n_mels=self.freq, n_tiers=self.n_tiers, tier=tier_idx)
            length_of_tierX = tierutil.get_size_timedim_of_tier(
                timesteps=length, n_tiers=self.n_tiers, tier=tier_idx)
            print("Shape of original spectrogram: ", spectrogram.size())
            print("Shape of spectrogram_prev_tier (x): ", x.size())
            print("Freq_of_tierX: ", freq_of_tierX)
            print("Length_of_tierX: ", length_of_tierX)
            # clamp to the length of the conditioning spectrogram in case it is shorter
            length_of_tierX = min(length_of_tierX, x.size(2))
            x = x[:, :, :length_of_tierX]
            for i in range(0, length_of_tierX):
                logger.info(f"Tier {tier_idx}/{self.n_tiers} - Frame {i}/{length_of_tierX}")
                if temp_x is None:
                    # If the spectrogram of this tier has not been initialized, we initialize
                    # it to an initial frame of all zeros
                    temp_x = torch.zeros((n_samples, freq_of_tierX, 1), device=hp.device)
                else:
                    # If the spectrogram of this tier has already been initialized, we have already
                    # computed some frames. We concatenate a new frame initialized to all zeros
                    # which will be replaced pixel by pixel by the new values
                    # We change shape from [B, FREQ, FRAMES] to [B, FREQ, FRAMES+1] by adding a new
                    # frame
                    temp_x = torch.cat(
                        [temp_x, torch.zeros((n_samples, freq_of_tierX, 1), device=hp.device)],
                        dim=-1)

                # Inside a frame, the spectrogram is generated autoregressively, frequency by
                # frequency
                for j in range(0, freq_of_tierX):
                    # we generate the parameters for the whole spectrogram (across all samples)
                    mu_hat, std_hat, pi_hat = self.tiers[tier_idx - 1](temp_x, x)
                    # with the parameters we sample the values of the next spectrogram
                    # (across all samples)
                    new_spectrogram = sample_gmm_batch(mu_hat, std_hat, pi_hat)
                    # but we only use the value of the new pixel currently being generated
                    # (across all samples) since the spectrogram is generated autoregressively
                    temp_x[:, j, i] = new_spectrogram[:, j, i]

            # After generating the spectrogram of this tier, we interleave it to put it together
            # with the spectrogram generated by previous tiers. In the next iteration, this will
            # be the input to condition the next tier
            x = tierutil.interleave(temp_x, x, tier_idx)
            x = x.to(hp.device)
            # Save the spectrogram generated at this tier and the interleaved result so far
            torch.save(temp_x, f"{synthesisp.output_path}/{timestamp}_tier{tier_idx}.pt")
            torch.save(x, f"{synthesisp.output_path}/{timestamp}_tier1-tier{tier_idx}.pt")

        return x
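tierutil.interleave merges the spectrogram produced by the current tier with the one accumulated from the previous tiers; the axis it interleaves along depends on tier_idx. A hypothetical sketch of the frequency-axis case only, assuming equal input shapes and that the previous tiers' rows land at even indices (the repository's implementation chooses the axis and ordering itself and may differ):

import torch

def interleave_freq(x_new: torch.Tensor, x_prev: torch.Tensor) -> torch.Tensor:
    # Both inputs: [B, FREQ, FRAMES]; output: [B, 2 * FREQ, FRAMES]
    B, F, T = x_prev.shape
    out = x_prev.new_zeros((B, 2 * F, T))
    out[:, 0::2, :] = x_prev  # rows from earlier tiers at even indices
    out[:, 1::2, :] = x_new   # rows generated by this tier at odd indices
    return out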