Example #1
def main(mel_files, model_filename, output_dir, batch_size, implementation):
    mel_files = utils.files_to_list(mel_files)
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))

    for files in chunker(mel_files, batch_size):
        mels = []
        for file_path in files:
            print(file_path)
            mel = torch.load(file_path)
            mel = utils.to_gpu(mel)
            mels.append(torch.unsqueeze(mel, 0))
        cond_input = model.get_cond_input(torch.cat(mels, 0))
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            # Decode μ-law bins back to floats in [-1, 1].
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            # Scale once into the 16-bit PCM range; scaling twice would
            # overflow int16.
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example #2
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))
        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example #3
def main(input_files,
         model_dir,
         output_dir,
         batch_size,
         implementation,
         data_config,
         audio_config,
         preload_mels=False):
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)

    audio_processor = AudioProcessor(audio_config)
    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            # Entries are (mel_path, ...) tuples only when mels are preloaded.
            path = file_path[0] if preload_mels else file_path
            file_name = os.path.splitext(os.path.basename(path))[0]
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
            print("Range of {}.wav after deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
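Example #3 undoes pre-emphasis after decoding. The AudioProcessor.deemphasis call above operates on batched tensors; for reference, a scalar SciPy sketch of the same inverse filter (the helper below is an assumption; the coefficient name mirrors preemphasis_coef from Example #5):

import numpy as np
from scipy.signal import lfilter

def deemphasis(audio, coef=0.97):
    # Pre-emphasis is the FIR filter y[n] = x[n] - coef * x[n-1];
    # de-emphasis inverts it with the IIR filter y[n] = x[n] + coef * y[n-1].
    return lfilter([1.0], [1.0, -coef], audio)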
Example #4
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)

        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate
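Example #4 only stores the dataset parameters; the companion __getitem__ is not part of the excerpt. A hedged sketch of the usual pattern this constructor implies, cropping a random segment and μ-law-quantizing it (this method body is an assumption, not the repo's code):

    def __getitem__(self, index):
        # Load one file, crop (or zero-pad) to segment_length, then quantize.
        audio, sampling_rate = utils.load_wav_to_torch(self.audio_files[index])
        if audio.size(0) >= self.segment_length:
            start = random.randint(0, audio.size(0) - self.segment_length)
            audio = audio[start:start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)))
        return utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                   self.mu_quantization)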
Example #5
    def __init__(self,
                 audio_files,
                 mu_quantization,
                 no_chunks,
                 audio_config,
                 segment_length,
                 use_tf=False,
                 use_lws=True,
                 load_mel=False,
                 verbose=False):

        audio_files = utils.files_to_list(audio_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)

        if not load_mel:
            if use_tf:
                audio_processor_cls = AudioProcessor
            elif use_lws:
                audio_processor_cls = LwsAudioProcessor
            else:
                raise ValueError(
                    "Mel spectrum can be calculated only with tf or lws!")
            self.audio_processor = audio_processor_cls(audio_config)

        self.mu_quantization = mu_quantization
        self.segment_length = segment_length

        audio_params = AudioProcessor._load_params(audio_config)
        if verbose:
            print("Audio params:")
            pprint(audio_params)
        self.audio_params = audio_params
        self.window_size = audio_params["window_size"]
        self.preemphasis_coeff = audio_params["preemphasis_coef"]
        self.apply_preemphasis = audio_params["apply_preemphasis"]
        self.window_step = audio_params["window_step"]
        self.sample_rate = audio_params["sample_rate"]
        self.mel_segment_length = int(
            np.ceil((segment_length - self.window_size) / self.window_step))
        self.num_mels = audio_params["num_mel_bins"]
        self.use_tf = use_tf
        self.load_mel = load_mel
        self.no_chunks = no_chunks
        self.use_lws = use_lws
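The mel_segment_length arithmetic in Example #5 converts a waveform segment length into a mel frame count. A quick worked check with illustrative values (these numbers are not from the excerpt: 1 s of 16 kHz audio, a 50 ms window, a 12.5 ms hop):

import numpy as np

segment_length, window_size, window_step = 16000, 800, 200
mel_segment_length = int(np.ceil((segment_length - window_size) / window_step))
print(mel_segment_length)  # 76 frames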
Example #6
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)
        mel_fmax = None if mel_fmax == -1 else mel_fmax

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)

        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate
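Example #6's constructor forwards the STFT parameters straight to TacotronSTFT, treating -1 as a sentinel for an unbounded mel_fmax. A hypothetical instantiation, assuming the class is the Mel2SampOnehot variant used in Example #7 (every value below is an illustrative assumption):

dataset_config = {
    "training_files": "train_files.txt",
    "segment_length": 16000,
    "mu_quantization": 256,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "sampling_rate": 22050,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,  # pass -1 to disable the upper frequency bound
}
dataset = Mel2SampOnehot(**dataset_config)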
Example #7
    parser = argparse.ArgumentParser()
    # Long flag recovered from args.audio_list below; the short flag is assumed.
    parser.add_argument('-a',
                        '--audio_list',
                        required=True,
                        type=str,
                        help='File containing list of wavefiles')
    parser.add_argument('-o',
                        "--output_dir",
                        required=True,
                        type=str,
                        help='Directory to put Mel-Spectrogram Tensors')
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        help='JSON file for configuration')

    args = parser.parse_args()

    filepaths = utils.files_to_list(args.audio_list)

    # Make directory if it doesn't exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    # Parse config.  Only using data processing
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    data_config = config["data_config"]
    mel_factory = Mel2SampOnehot(**data_config)

    for filepath in filepaths:
        audio, sampling_rate = utils.load_wav_to_torch(filepath)
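        # Assumed continuation (the excerpt ends above): extract the
        # conditioning mel with get_mel, as in Example #3, and save one
        # tensor per input wav so inference can reload it with torch.load,
        # as in Example #1. This is a sketch, not the original code.
        mel = mel_factory.get_mel(audio)
        file_name = os.path.splitext(os.path.basename(filepath))[0]
        torch.save(mel, os.path.join(args.output_dir, file_name + ".pt"))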