Example #1
def get_data_loaders(dump_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps, audio.get_hop_size(), True)
    else:
        max_steps = None

    for phase in ["train_no_dev", "dev"]:
        train = phase == "train_no_dev"
        X = FileSourceDataset(
            RawAudioDataSource(join(dump_root, phase), speaker_id=speaker_id,
                               max_steps=max_steps, cin_pad=hparams.cin_pad,
                               hop_size=audio.get_hop_size()))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(join(dump_root, phase), speaker_id=speaker_id,
                                  max_steps=max_steps, cin_pad=hparams.cin_pad,
                                  hop_size=audio.get_hop_size()))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
            # make sure there are no sorting bugs; see https://github.com/r9y9/wavenet_vocoder/issues/130
            sampler_idx = np.asarray(sorted(int(s) for s in sampler))
            assert (sampler_idx == np.arange(len(sampler_idx), dtype=np.int64)).all()
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(
            dataset, batch_size=hparams.batch_size, drop_last=True,
            num_workers=hparams.num_workers, sampler=sampler, shuffle=shuffle,
            collate_fn=collate_fn, pin_memory=hparams.pin_memory)

        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
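All of the snippets on this page follow the same nnmnkwii pattern: subclass FileDataSource, then wrap it in FileSourceDataset for lazy per-file feature extraction. Here is a minimal self-contained sketch of that contract; MyNpySource and the .npy paths are hypothetical, while the FileDataSource/FileSourceDataset API is the one used throughout these examples.

import numpy as np
from nnmnkwii.datasets import FileDataSource, FileSourceDataset

class MyNpySource(FileDataSource):
    def collect_files(self):
        # Return the list of files the dataset should expose, one item per file.
        return ["utt0-feats.npy", "utt1-feats.npy"]

    def collect_features(self, path):
        # Called lazily, once per indexed item.
        return np.load(path)

X = FileSourceDataset(MyNpySource())
# len(X) == number of collected files; X[i] == collect_features(paths[i])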
Example #2
def get_data_loaders(config):
    data_loaders = {}
    for phase in ["train_no_dev", "dev"]:
        in_dir = to_absolute_path(config.data[phase].in_dir)
        out_dir = to_absolute_path(config.data[phase].out_dir)
        train = phase.startswith("train")
        in_feats = FileSourceDataset(NpyFileSource(in_dir))
        out_feats = FileSourceDataset(NpyFileSource(out_dir))

        in_feats = MemoryCacheDataset(in_feats, cache_size=10000)
        out_feats = MemoryCacheDataset(out_feats, cache_size=10000)

        dataset = Dataset(in_feats, out_feats)
        data_loaders[phase] = data_utils.DataLoader(
            dataset,
            batch_size=config.data.batch_size,
            collate_fn=collate_fn,
            pin_memory=config.data.pin_memory,
            num_workers=config.data.num_workers,
            shuffle=train)

        # Note: this iterates the entire loader once, purely to log batch shapes
        for x, y, l in data_loaders[phase]:
            logger.info(f"{x.shape}, {y.shape}, {l.shape}")

    return data_loaders
Example #3
def test_real_metrics():
    _, source = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(source)
    lengths = [len(x) for x in X]
    X = X.asarray()

    mgc = X[:, :, :source.mgc_dim // 3]
    lf0 = X[:, :, source.lf0_start_idx]
    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(np.int64)
    bap = X[:, :, source.bap_start_idx]

    mgc_tgt = mgc + 0.01
    lf0_tgt = lf0 + 0.01
    vuv_tgt = vuv.copy()
    bap_tgt = bap + 0.01

    mcd = metrics.melcd(mgc, mgc_tgt, lengths)
    bap_mcd = metrics.melcd(bap, bap_tgt, lengths)
    lf0_mse = metrics.lf0_mean_squared_error(lf0, vuv, lf0_tgt, vuv_tgt,
                                             lengths)
    vuv_err = metrics.vuv_error(vuv, vuv_tgt)
    assert mcd > 0
    assert bap_mcd > 0
    assert lf0_mse > 0
    assert vuv_err == 0.0
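For reference, the metrics exercised above live in nnmnkwii.metrics and also accept plain padded arrays plus per-utterance lengths; a small sketch with synthetic data (the shapes here are arbitrary):

import numpy as np
from nnmnkwii import metrics

a = np.random.rand(2, 100, 25).astype(np.float32)
b = a + 0.01
# Mel-cepstrum distortion in dB; positive whenever a != b
print(metrics.melcd(a, b, lengths=[100, 100]))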
Example #4
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0
    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root,
                               speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root,
                                  speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(dataset,
                                            batch_size=hparams.batch_size,
                                            num_workers=hparams.num_workers,
                                            sampler=sampler,
                                            shuffle=shuffle,
                                            collate_fn=collate_fn,
                                            pin_memory=hparams.pin_memory)

        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
Example #5
def __init__(self, data_root_dir=DATA_ROOT, train_flag=True, cond_sel='mfcc', cache_size=1000, transform=None):
    self.train_flag = train_flag
    self.cond_sel = cond_sel  # 'mfcc' or 'pyspec'
    self.cache_size = cache_size
    self.data_root_dir = data_root_dir

    if self.train_flag:
        self.X = FileSourceDataset(WavSource(data_root=data_root_dir, file_sel_range=[0, 1000]))
    else:
        self.X = FileSourceDataset(WavSource(data_root=data_root_dir, file_sel_range=[1000, 1132]))
        self.cache_size = 1

    self.utt_lengths = [len(utt) for utt in self.X]
    self.X_raw = MemoryCacheFramewiseDataset(self.X, self.utt_lengths, self.cache_size)
    self.utt_total_length = len(self.X_raw)

    self.sample_start, self.sample_end = list(), list()

    # This initializes self.sample_start and self.sample_end
    if self.train_flag:
        self.rand_flush()
    else:
        self.init_for_test()

    # Feature scaling factors. allow_pickle=True is required to load a
    # pickled dict with np.load on NumPy >= 1.16.3.
    scf = np.load(self.data_root_dir + '../processed_slt_arctic/scale_factors.npy', allow_pickle=True).item()
    self.pyspec_max = np.max(scf['pyworld_max'][64:64 + 513])  # 11.159795
    self.mfcc_mean = scf['melmfcc_mean'][128:128 + 25]
    self.mfcc_std = scf['melmfcc_std'][128:128 + 25]
Example #6
def test_jsut():
    DATA_DIR = join(expanduser("~"), "data", "jsut_ver1.1")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyTextDataSource(jsut.TranscriptionDataSource):
        def __init__(self, data_root, subsets):
            super(MyTextDataSource, self).__init__(data_root, subsets)

        def collect_features(self, text):
            return text

    data_source = MyTextDataSource(DATA_DIR, subsets=["basic5000"])
    X1 = FileSourceDataset(data_source)
    assert X1[0] == u"水をマレーシアから買わなくてはならないのです。"

    data_source = MyTextDataSource(DATA_DIR, subsets=["travel1000"])
    X2 = FileSourceDataset(data_source)
    assert X2[0] == u"あなたの荷物は、ロサンゼルスに残っています。"

    # Multiple subsets
    data_source = MyTextDataSource(DATA_DIR,
                                   subsets=["basic5000", "travel1000"])
    X3 = FileSourceDataset(data_source)
    assert X3[0] == u"水をマレーシアから買わなくてはならないのです。"
    assert len(X3) == len(X1) + len(X2)

    # All subsets
    data_source = MyTextDataSource(DATA_DIR, subsets=jsut.available_subsets)
    X = FileSourceDataset(data_source)
    # As of 2017/11/2, there were 30 missing wav files, so this should be 7696.
    assert len(X) == 7696

    class MyWavFileDataSource(jsut.WavFileDataSource):
        def __init__(self, data_root, subsets):
            super(MyWavFileDataSource, self).__init__(data_root, subsets)
            self.alpha = pysptk.util.mcepalpha(48000)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            assert fs == 48000
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    data_source = MyWavFileDataSource(DATA_DIR, subsets=["basic5000"])
    X = FileSourceDataset(data_source)
    print(X[0].shape)
Example #7
def test_vcc2016():
    DATA_DIR = join(expanduser("~"), "data", "vcc2016")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyFileDataSource(vcc2016.WavFileDataSource):
        def __init__(self, data_root, speakers, labelmap=None, max_files=2):
            super(MyFileDataSource, self).__init__(
                data_root, speakers, labelmap=labelmap, max_files=max_files)
            self.alpha = pysptk.util.mcepalpha(16000)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    max_files = 10
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1"], max_files=max_files)
    X = FileSourceDataset(data_source)
    assert len(X) == max_files
    print(X[0].shape)  # warmup collect_features path

    # Multi speakers
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=max_files)
    X = FileSourceDataset(data_source)
    assert len(X) == max_files

    # Speaker labels
    Y = data_source.labels
    assert np.all(Y[:max_files // 2] == 0)
    assert np.all(Y[max_files // 2:] == 1)

    # Custom speaker id
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=max_files,
        labelmap={"SF1": 1, "SF2": 0})
    X = FileSourceDataset(data_source)
    Y = data_source.labels
    assert np.all(Y[:max_files // 2] == 1)
    assert np.all(Y[max_files // 2:] == 0)

    # Use all data
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=None)
    X = FileSourceDataset(data_source)
    assert len(X) == 162 * 2
Пример #8
0
def test_dtw_frame_length_adjustment():
    _, X = example_file_data_sources_for_duration_model()
    X = FileSourceDataset(X)
    X_unaligned = X.asarray()
    # This should trigger frame length adjustment
    Y_unaligned = np.pad(X_unaligned, [(0, 0), (5, 0), (0, 0)],
                         mode="constant", constant_values=0)
    Y_unaligned = Y_unaligned[:, :-5, :]
    for aligner in [DTWAligner(), IterativeDTWAligner(
            n_iter=1, max_iter_gmm=1, n_components_gmm=1)]:
        X_aligned, Y_aligned = aligner.transform((X_unaligned, Y_unaligned))
        assert X_aligned.shape == Y_aligned.shape
Пример #9
def _get_small_datasets(padded=False, duration=False, padded_length=1000):
    if duration:
        X, Y = example_file_data_sources_for_duration_model()
    else:
        X, Y = example_file_data_sources_for_acoustic_model()
    if padded:
        X = PaddedFileSourceDataset(X, padded_length=padded_length)
        Y = PaddedFileSourceDataset(Y, padded_length=padded_length)
    else:
        X = FileSourceDataset(X)
        Y = FileSourceDataset(Y)
    return X, Y
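A hypothetical use of the helper above, showing the difference the padded flag makes: the plain datasets keep true per-utterance lengths, while the padded variants fix every utterance to padded_length.

X, Y = _get_small_datasets(padded=False)
print(X[0].shape)   # (T_0, D): true length of the first utterance
X, Y = _get_small_datasets(padded=True, padded_length=1000)
print(X[0].shape)   # (1000, D): zero-padded to padded_length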
Пример #10
def get_data_loaders(dump_root,
                     speaker_id,
                     hparams=None,
                     rank_id=None,
                     group_size=None):
    """create train dataset"""
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps,
                                     audio.get_hop_size(), True)
    else:
        max_steps = None

    X = FileSourceDataset(
        RawAudioDataSource(os.path.join(dump_root, 'train_no_dev'),
                           speaker_id=speaker_id,
                           max_steps=max_steps,
                           cin_pad=hparams.cin_pad,
                           hop_size=audio.get_hop_size()))

    if local_conditioning:
        Mel = FileSourceDataset(
            MelSpecDataSource(os.path.join(dump_root, 'train_no_dev'),
                              speaker_id=speaker_id,
                              max_steps=max_steps,
                              cin_pad=hparams.cin_pad,
                              hop_size=audio.get_hop_size()))
        assert len(X) == len(Mel)
        print("Local conditioning enabled. Shape of a sample: {}.".format(
            Mel[0].shape))
    else:
        Mel = None
    print("length of the dataset is {}".format(len(X)))
    length_x = np.array(X.file_data_source.lengths)
    dataset = DualDataset(X,
                          Mel,
                          length_x,
                          batch_size=hparams.batch_size,
                          hparams=hparams)
    sampler = DistributedSampler(dataset,
                                 rank_id,
                                 group_size,
                                 shuffle=True,
                                 seed=0)
    data_loaders = de.GeneratorDataset(
        dataset,
        ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"],
        sampler=sampler)

    return data_loaders
Пример #11
def test_ljspeech():
    DATA_DIR = join(expanduser("~"), "data", "LJSpeech-1.0")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyTextDataSource(ljspeech.TranscriptionDataSource):
        def __init__(self, data_root):
            super(MyTextDataSource, self).__init__(data_root)

        def collect_features(self, text):
            return text

    class MyNormalizedTextDataSource(ljspeech.NormalizedTranscriptionDataSource):
        def __init__(self, data_root):
            super(MyNormalizedTextDataSource, self).__init__(data_root)

        def collect_features(self, text):
            return text

    data_source = MyTextDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    assert X[1] == "in being comparatively modern."

    data_source = MyNormalizedTextDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    assert X[1] == "in being comparatively modern."

    class MyWavFileDataSource(ljspeech.WavFileDataSource):
        def __init__(self, data_root):
            super(MyWavFileDataSource, self).__init__(data_root)
            self.alpha = pysptk.util.mcepalpha(22050)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            assert fs == 22050
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    data_source = MyWavFileDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    print(X[0].shape)
Пример #12
def infer(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Modern_DBLSTM_1(args).to(device)

    mfcc_x_test = FileSourceDataset(MFCCSource(args.wav_dir))
    dataset_test = InferenceDataset(mfcc_x_test)
    test_loader = torch.utils.data.DataLoader(dataset_test,
                                              batch_size=1, shuffle=False,
                                              num_workers=4)

    model.load_state_dict(torch.load(args.model_name))
    model.eval()  # inference mode: disables dropout etc.

    for i, sample in enumerate(test_loader):
        inputs = sample['speech'].to(device)

        wav, filename = mfcc_x_test[i]

        filename_save = join(args.save_dir, split(filename)[1].split(".")[0])
        predicted = model(inputs).detach().cpu().numpy()

        plt.plot(predicted[0, :, :])
        plt.show()
        np.save(filename_save, predicted[0, :, :])
Пример #13
def test_meanvar():
    # Pick acoustic features for testing
    _, X = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]
    X_mean, X_var = P.meanvar(X)
    X_std = np.sqrt(X_var)
    assert np.isfinite(X_mean).all()
    assert np.isfinite(X_var).all()
    assert X_mean.shape[-1] == D
    assert X_var.shape[-1] == D

    _, X_std_hat = P.meanstd(X)
    assert np.allclose(X_std, X_std_hat)

    x = X[0]
    x_scaled = P.scale(x, X_mean, X_std)
    assert np.isfinite(x_scaled).all()

    # For padded dataset
    _, X = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get same results with padded features
    X_mean_hat, X_var_hat = P.meanvar(X, lengths)
    assert np.allclose(X_mean, X_mean_hat)
    assert np.allclose(X_var, X_var_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_scale(P.scale(x, X_mean, X_std), X_mean, X_std)
    assert np.allclose(x, x_hat, atol=1e-5)
Пример #14
def Dataloader(data_root, meta_text):

	# Input dataset definitions
	X = FileSourceDataset(TextDataSource(data_root, meta_text))
	Mel = FileSourceDataset(MelSpecDataSource(data_root, meta_text))
	Y = FileSourceDataset(LinearSpecDataSource(data_root, meta_text))

	# Dataset and Dataloader setup
	dataset = PyTorchDatasetWrapper(X, Mel, Y)
	data_loader = data.DataLoader(dataset, 
								  batch_size=config.batch_size,
								  num_workers=config.num_workers, 
								  shuffle=True,
								  collate_fn=collate_fn, 
								  pin_memory=config.pin_memory)
	return data_loader
Пример #15
def get_data_loader(data_dir, collate_fn):
    wav_paths = glob(join(data_dir, "*-wave.npy"))
    if len(wav_paths) != 0:
        X = FileSourceDataset(
            RawAudioDataSource(data_dir,
                               hop_size=audio.get_hop_size(),
                               max_steps=None,
                               cin_pad=hparams.cin_pad))
    else:
        X = None

    C = FileSourceDataset(
        MelSpecDataSource(data_dir,
                          hop_size=audio.get_hop_size(),
                          max_steps=None,
                          cin_pad=hparams.cin_pad))

    # C = None
    # No audio found:
    if X is None:
        assert len(C) > 0
        data_loader = data_utils.DataLoader(C,
                                            batch_size=hparams.batch_size,
                                            drop_last=False,
                                            num_workers=hparams.num_workers,
                                            sampler=None,
                                            shuffle=False,
                                            collate_fn=dummy_collate,
                                            pin_memory=hparams.pin_memory)
    else:
        assert len(X) == len(C)
        if C[0].shape[-1] != hparams.cin_channels:
            raise RuntimeError(
                "Invalid cin_channels {}. Expected to be {}.".format(
                    C[0].shape[-1], hparams.cin_channels))
        dataset = PyTorchDataset(X, C)

        data_loader = data_utils.DataLoader(dataset,
                                            batch_size=hparams.batch_size,
                                            drop_last=False,
                                            num_workers=0,
                                            sampler=None,
                                            shuffle=True,
                                            collate_fn=collate_fn,
                                            pin_memory=hparams.pin_memory)

    return data_loader
Пример #16
def initialize_training(checkpoint_path):

    # Input dataset definitions
    X = FileSourceDataset(TextDataSource())
    Mel = FileSourceDataset(MelSpecDataSource())
    Y = FileSourceDataset(LinearSpecDataSource())

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data.DataLoader(dataset,
                                  batch_size=config.batch_size,
                                  num_workers=config.num_workers,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=config.pin_memory)

    # Model
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     use_memory_mask=config.use_memory_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint
    if checkpoint_path is not None:
        print("Load checkpoint from: {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        try:
            global_step = checkpoint["global_step"]
            global_epoch = checkpoint["global_epoch"]
        except KeyError:
            print('Warning: global step and global epoch unable to restore!')
            sys.exit(1)

    return model, optimizer, data_loader
Пример #17
def test_empty_dataset():
    class EmptyDataSource(FileDataSource):
        def collect_files(self):
            return []

        def collect_features(self, path):
            pass
    X = FileSourceDataset(EmptyDataSource())

    def __test_outof_range(X):
        print(X[0])

    # Should raise IndexError
    yield raises(IndexError)(__test_outof_range), X
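The yield/raises idiom above is nose-specific; under pytest the same check reads as follows (a sketch, assuming pytest is available and EmptyDataSource is defined at module scope):

import pytest

def test_empty_dataset_pytest():
    X = FileSourceDataset(EmptyDataSource())
    with pytest.raises(IndexError):
        X[0]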
Пример #18
def test_minmax():
    # Pick linguistic features for testing
    X, _ = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]
    X_min, X_max = P.minmax(X)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    x = X[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Need to specify (min, max) or (scale_, min_)
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # Explicit scale_ and min_
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For padded dataset
    X, _ = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get same results with padded features
    X_min_hat, X_max_hat = P.minmax(X, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max)
    assert np.allclose(x, x_hat)

    x_hat = P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                               scale_=scale_,
                               min_=min_)
    assert np.allclose(x, x_hat)
Пример #19
def get_data_loader(hparam, data_dir):
    """
    test data loader
    """
    wav_paths = glob.glob(os.path.join(data_dir, "*-wave.npy"))
    if wav_paths:
        X = FileSourceDataset(RawAudioDataSource(data_dir,
                                                 hop_size=audio.get_hop_size(),
                                                 max_steps=None, cin_pad=hparam.cin_pad))
    else:
        X = None
    C = FileSourceDataset(MelSpecDataSource(data_dir,
                                            hop_size=audio.get_hop_size(),
                                            max_steps=None, cin_pad=hparam.cin_pad))

    length_x = np.array(C.file_data_source.lengths)
    if C[0].shape[-1] != hparam.cin_channels:
        raise RuntimeError("Invalid cin_channels {}. Expected to be {}.".format(
            C[0].shape[-1], hparam.cin_channels))

    dataset = DualDataset(X, C, length_x, batch_size=hparam.batch_size, hparams=hparam)

    data_loader = de.GeneratorDataset(dataset, ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"])

    return data_loader, dataset
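A hypothetical consumption sketch for the MindSpore loader above, assuming the standard mindspore.dataset iterator API; DualDataset already yields whole batches, so the dataset is iterated as-is:

loader, dataset = get_data_loader(hparam, data_dir)
for batch in loader.create_dict_iterator():
    print(batch["x_batch"].shape)  # keys match the column names above
    break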
Пример #20
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(config.pretty())

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()

            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows, model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
Пример #21
    # Prevents Windows-specific errors such as MemoryError
    # Also reduces the occurrence of the THAllocator.c 0x05 error in Windows builds of PyTorch
    if platform.system() == "Windows":
        print(
            " [!] Windows Detected - IF THAllocator.c 0x05 error occurs SET num_workers to 1"
        )

    assert hparams.name == "deepvoice3"
    print(hparams_debug_string())

    _frontend = getattr(frontend, hparams.frontend)

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Input dataset definitions
    X = FileSourceDataset(TextDataSource(data_root, speaker_id))
    Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
    Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))

    # Prepare sampler
    frame_lengths = Mel.file_data_source.frame_lengths
    sampler = PartialyRandomizedSimilarTimeLengthSampler(
        frame_lengths, batch_size=hparams.batch_size)

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data_utils.DataLoader(dataset,
                                        batch_size=hparams.batch_size,
                                        num_workers=hparams.num_workers,
                                        sampler=sampler,
                                        collate_fn=collate_fn,
Пример #22
    use_phone_alignment = args["--use_phone_alignment"]
    question_path = args["--question_path"]

    # Features required to train duration model
    # X -> Y
    # X: linguistic
    # Y: duration
    X_duration_source = LinguisticSource(
        add_frame_features=False,
        subphone_features=None,
        use_phone_alignment=use_phone_alignment,
        question_path=question_path)
    Y_duration_source = DurationFeatureSource(
        use_phone_alignment=use_phone_alignment)

    X_duration = FileSourceDataset(X_duration_source)
    Y_duration = FileSourceDataset(Y_duration_source)

    # Features required to train acoustic model
    # X -> Y
    # X: linguistic
    # Y: acoustic
    subphone_features = "full" if not use_phone_alignment else "coarse_coding"
    X_acoustic_source = LinguisticSource(
        add_frame_features=True,
        subphone_features=subphone_features,
        use_phone_alignment=use_phone_alignment,
        question_path=question_path)
    Y_acoustic_source = AcousticSource(use_phone_alignment=use_phone_alignment)
    X_acoustic = FileSourceDataset(X_acoustic_source)
    Y_acoustic = FileSourceDataset(Y_acoustic_source)
Пример #23
        #    output_mean, output_std = self.output_meanstd
        #    input_slice[:,1:25] = (input_slice[:,1:25] - input_mean[1:25])/input_std[1:25]
        #    output_slice[:,1:25] = (output_slice[:,1:25] - output_mean[1:25])/output_std[1:25]
        #    input_slice_normalised = input_slice
        #    output_slice_normalised = output_slice

        # Second index: selecting 24 MCEP features
        # Third index: randomly sampling 128 frames
        input_tensor = torch.FloatTensor(mcep_A_normalised)
        output_tensor = torch.FloatTensor(mcep_B_normalised)

        filename_A = list(self.input_file_source.dataset.collected_files[idx])
        filename_B = list(self.output_file_source.dataset.collected_files[idx])

        #other = OtherParameters(f0_A,f0_B,bap_A,bap_B)


        return (input_tensor, output_tensor, filename_A, filename_B)


if __name__ == '__main__':

    data_source = VCC2016DataSource("/home/boomkin/repos/Voice_Converter_CycleGAN/data", ["SF1"])
    something = FileSourceDataset(data_source)

    print(something.collected_files[15])
    print(something[0].shape)
# Doesn't provide acceleration
#class MyInt(int):

Example #24
 def __test_wrong_num_collected_files():
     X = FileSourceDataset(WrongNumberOfCollectedFilesDataSource())
     X[0]
Example #25
 def __test_wrong_num_args():
     X = FileSourceDataset(WrongNumberOfArgsDataSource())
     X[0]
Example #26
        return mgc.astype(np.float32)


if __name__ == "__main__":
    args = docopt(__doc__)
    print("Command line args:\n", args)
    DATA_ROOT = args["<DATA_ROOT>"]
    source_speaker = args["<source_speaker>"]
    target_speaker = args["<target_speaker>"]
    max_files = int(args["--max_files"])
    dst_dir = args["--dst_dir"]
    overwrite = args["--overwrite"]

    print(hparams_debug_string(hp))

    X_dataset = FileSourceDataset(MGCSource(DATA_ROOT, [source_speaker],
                                            max_files=max_files))
    Y_dataset = FileSourceDataset(MGCSource(DATA_ROOT, [target_speaker],
                                            max_files=max_files))

    skip_feature_extraction = exists(join(dst_dir, "X")) \
        and exists(join(dst_dir, "Y"))
    if overwrite:
        skip_feature_extraction = False
    if skip_feature_extraction:
        print("Features seems to be prepared, skipping feature extraction.")
        sys.exit(0)

    # Create dirs
    for speaker, name in [(source_speaker, "X"), (target_speaker, "Y")]:
        d = join(dst_dir, name)
        print("Destination dir for {}: {}".format(speaker, d))
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Vocab size
    phids = make_phids(DATA_ROOT + '/txt.done.data.tacotron.phseq.train')
    out_path = checkpoint_dir + '/ids.json'
    with open(out_path, 'w') as f:
        json.dump(phids, f)
    print(phids)
    print("Length of vocabulary: ", len(phids))
    phids = dict(phids)
    #sys.exit()

    # Input dataset definitions
    X = FileSourceDataset(
        PhoneDataSource(DATA_ROOT, phids,
                        "txt.done.data.tacotron.phseq.train"))
    Mel = FileSourceDataset(
        MelSpecDataSource(DATA_ROOT, "txt.done.data.tacotron.phseq.train"))
    Y = FileSourceDataset(
        LinearSpecDataSource(DATA_ROOT, "txt.done.data.tacotron.phseq.train"))

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data_utils.DataLoader(dataset,
                                        batch_size=hparams.batch_size,
                                        num_workers=hparams.num_workers,
                                        shuffle=True,
                                        collate_fn=collate_fn,
                                        pin_memory=hparams.pin_memory)
Example #28
    log_event_path = args["--log-event-path"]
    disable_slack = args["--disable-slack"]

    # Flags to update discriminator/generator or not
    update_d = w_d > 0
    update_g = not discriminator_warmup

    os.makedirs(checkpoint_dir, exist_ok=True)

    X = {"train": {}, "test": {}}
    Y = {"train": {}, "test": {}}
    utt_lengths = {"train": {}, "test": {}}

    for phase in ["train", "test"]:
        train = phase == "train"
        X[phase] = FileSourceDataset(
            NPYDataSource(inputs_dir, train=train, max_files=max_files))
        Y[phase] = FileSourceDataset(
            NPYDataSource(outputs_dir, train=train, max_files=max_files))
        # Assuming X and Y are time aligned.
        x_lengths = np.array([len(x) for x in X[phase]])
        y_lengths = np.array([len(y) for y in Y[phase]])
        assert np.allclose(x_lengths, y_lengths)
        utt_lengths[phase] = x_lengths
        print("Size of dataset for {}: {}".format(phase, len(X[phase])))

    # Collect stats for normalization (from training data).
    # If this becomes performance heavy (not now), this can be done in a
    # separate script.
    phase = "train"
    # TODO: ugly?
    if hp == hparams.vc:
Example #29
            D[t] = np.diag(self.covarYY[m]) - np.diag(self.covarYX[m]) / \
                np.diag(self.covarXX[m]) * np.diag(self.covarXY[m])

        # Once we have mean and variance over frames, then we can do MLPG
        return mlpg(E, D, self.windows)


source = CMUArcticSpectrumDataSource(data_root=DATA_ROOT,
                                     speakers=["ksp"],
                                     max_files=max_files)
target = CMUArcticSpectrumDataSource(data_root=DATA_ROOT,
                                     speakers=["slt"],
                                     max_files=max_files)

# Build dataset as 3D tensor (NxTxD)
X = FileSourceDataset(source).asarray(padded_length=1200)
Y = FileSourceDataset(target).asarray(padded_length=1200)

# Alignment
X, Y = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

# Drop 1st dimension
X, Y = X[:, :, 1:], Y[:, :, 1:]

static_dim = X.shape[-1]

X = apply_each2d_trim(delta_features, X, windows)
Y = apply_each2d_trim(delta_features, Y, windows)

# Joint features
XY = np.concatenate((X, Y), axis=-1).reshape(-1, X.shape[-1] * 2)
Example #30
    def __init__(self,
                 data_root_dir=None,
                 train_mode=False,
                 output_mode='melspec',
                 transform=None,
                 data_sel=None):

        self.wav_root_dir = data_root_dir + '/wavs/'
        self.train_mode = train_mode
        self.output_mode = output_mode
        self.transform = transform
        self.data_sel = data_sel

        self.max_len_text = MAX_LEN_TEXT
        self.max_len_melspec = MAX_LEN_MELSPEC
        self.max_len_spec = MAX_LEN_SPEC

        #        self.max_len_paired_text = MAX_LEN_PAIRED_TEXT # Max-lengths are required for z-padding
        #        self.max_len_paired_spec = MAX_LEN_PAIRED_SPEC
        #        self.max_len_paired_melspec = MAX_LEN_PAIRED_MELSPEC

        # Preparing Text:
        self.text_csv_path = data_root_dir + '/metadata.csv'
        self.reduce_punc_table = str.maketrans(
            string.ascii_uppercase, string.ascii_lowercase,
            '0123456789!#"$%&\\()*+/:;<=>?@[\\]^_`{|}~')
        self.chr2int_table = dict(
            zip(" ',-." + string.ascii_lowercase, np.arange(0, 31)))

        df = pd.read_csv(self.text_csv_path,
                         index_col=False,
                         sep='|',
                         header=None,
                         memory_map=True)  # memory_map: speed-up reading.
        nan_rows = df[df[2].isnull()].index.values
        df.iloc[nan_rows, 2] = df.iloc[nan_rows, 1]  # fixing dataset NaN value bugs...
        df = df.drop(1, axis=1)
        df.columns = ['file_id', 'text']
        df = df.drop(OMIT_DATA_ROWS, axis=0).reset_index(drop=True)  # omitting foreign-language rows

        if self.train_mode is True:
            self.file_ids = df.iloc[0:N_TRAIN, 0]  # file_ids: LJ**-**** (13,000)
            self.texts = df.iloc[0:N_TRAIN, 1]
            if self.data_sel is not None:
                self.file_ids = self.file_ids[self.data_sel].reset_index(drop=True)
                self.texts = self.texts[self.data_sel].reset_index(drop=True)
        else:
            self.file_ids = df.iloc[N_TRAIN:, 0].reset_index(drop=True)  # (100)
            self.texts = df.iloc[N_TRAIN:, 1].reset_index(drop=True)
            if self.data_sel is not None:
                self.file_ids = self.file_ids[self.data_sel].reset_index(drop=True)
                self.texts = self.texts[self.data_sel].reset_index(drop=True)

        # Preparing Audio:
        if self.train_mode is True:
            self.spec_features = MemoryCacheDataset(
                FileSourceDataset(
                    SpecSource(wav_data_root=self.wav_root_dir,
                               file_sel_range=[0, N_TRAIN],
                               output_mode=self.output_mode)),
                cache_size=len(self.file_ids))
        else:
            self.spec_features = MemoryCacheDataset(
                FileSourceDataset(
                    SpecSource(wav_data_root=self.wav_root_dir,
                               file_sel_range=[N_TRAIN, None],
                               output_mode=self.output_mode)),
                cache_size=len(self.file_ids))
        assert len(self.file_ids) == len(self.spec_features)

        #        # Pairing: Sort and divide by feature lengths, then concat small + large
        #        lengths       = np.load('mspec_length_train_13000.npy')
        #        sorted_by_len = np.argsort(lengths)
        #        n_org       = len(sorted_by_len)
        #        n_pairs     = int(n_org / 2)
        #        self.paired_items = list()
        #        for i in range(n_pairs):
        #            self.paired_items.append([sorted_by_len[i], sorted_by_len[n_org - 1 - i]])

Example #31
        return mgc.astype(np.float32)


if __name__ == "__main__":
    args = docopt(__doc__)
    print("Command line args:\n", args)
    DATA_ROOT = args["<DATA_ROOT>"]
    source_speaker = args["<source_speaker>"]
    target_speaker = args["<target_speaker>"]
    max_files = int(args["--max_files"])
    dst_dir = args["--dst_dir"]
    overwrite = args["--overwrite"]

    print(hparams_debug_string(hp))

    X_dataset = FileSourceDataset(
        MGCSource(DATA_ROOT, [source_speaker], max_files=max_files))
    Y_dataset = FileSourceDataset(
        MGCSource(DATA_ROOT, [target_speaker], max_files=max_files))

    skip_feature_extraction = exists(join(dst_dir, "X")) \
        and exists(join(dst_dir, "Y"))
    if overwrite:
        skip_feature_extraction = False
    if skip_feature_extraction:
        print("Features seems to be prepared, skipping feature extraction.")
        sys.exit(0)

    # Create dirs
    for speaker, name in [(source_speaker, "X"), (target_speaker, "Y")]:
        d = join(dst_dir, name)
        print("Destination dir for {}: {}".format(speaker, d))