def train():
    batch_size = 32
    n_epoch = 100
    n_mfcc = 60

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    model.build_graph()
    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model paramters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    speech_loader.create_batches()
    model.train_val(speech_loader.mfcc_tensor,
                    speech_loader.label_tensor,
                    ckpt_dir=TRAIN_DIR,
                    n_epoch=n_epoch,
                    val_rate=0.15)
Example #2
def main():
    model = WaveNet()
    checkpoint = torch.load(
        'runs/Oct09_11-24-52_K-00030-LIN/checkpoint_9000.pth')
    model.load_state_dict(checkpoint['model'])
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)

    # TODO: for now, experiment with batch size 1
    # TODO: when inferring on multiple utterances at once, their lengths must be padded
    filename = 'data/arctic_a0001.wav'
    audio, sampling_rate = load_wav_to_torch(filename)
    mel = get_mel(audio)
    mel.unsqueeze_(0)
    print(mel.shape)

    # Reshape to match the NVWaveNet input:
    # (channels, batch=1, num_layers, samples)
    cond_input = get_cond_input(mel, model)

    # Generate the waveform
    # The generated waveform is still mu-law encoded, so it has to be decoded back
    audio_data = wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
    print(audio_data.shape)
    print(audio_data.min(), audio_data.max())

    # wavenet.A is the number of mu-law quantization channels
    audio = mu_law_decode_numpy(audio_data[0].cpu().numpy(), wavenet.A)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    scipy.io.wavfile.write('gen.wav', 16000, wavdata)
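
# For reference: mu_law_decode_numpy above is expected to invert standard mu-law
# companding before the int16 conversion. A minimal sketch, assuming the usual
# formulation (the name and exact behaviour are assumptions, not this repo's code):
import numpy as np

def mu_law_decode_sketch(encoded, quantization_channels=256):
    mu = quantization_channels - 1
    # map integer codes [0, mu] back to the range [-1, 1]
    signal = 2.0 * (encoded.astype(np.float64) / mu) - 1.0
    # invert the mu-law compression: sign(y) * ((1 + mu)**|y| - 1) / mu
    return np.sign(signal) * (np.expm1(np.abs(signal) * np.log1p(mu)) / mu)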
Example #3
def speech_to_text():
    n_mfcc = 60
    batch_size = 1
    n_epoch = 100

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model paramters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    file_names = os.listdir(TEXT_DIR)
    file_list = [os.path.join(TEXT_DIR, file_name) for file_name in file_names]

    step = 0
    for file in file_list:
        step += 1
        mfcc_features = speech_loader.load_one_file(file)

        output = model.predict(mfcc_features)
        # transfer to word
        words = speech_loader.index2str(output[0])
        print("Input(%d): %s" % (step, file))
        print("Output(%d): %s" % (step, words))
Example #4
def create_network(batch_size, num_dilations, learning_rate):
    # model
    x = nn.Variable(shape=(batch_size, data_config.duration, 1))  # (B, T, 1)
    onehot = F.one_hot(x, shape=(data_config.q_bit_len, ))  # (B, T, C)
    wavenet_input = F.transpose(onehot, (0, 2, 1))  # (B, C, T)

    # speaker embedding
    s_emb = None

    net = WaveNet(num_dilations)
    wavenet_output = net(wavenet_input, s_emb)

    pred = F.transpose(wavenet_output, (0, 2, 1))

    # (B, T, 1)
    t = nn.Variable(shape=(batch_size, data_config.duration, 1))

    loss = F.mean(F.softmax_cross_entropy(pred, t))
    # loss.visit(PrintFunc())

    # Create Solver.
    solver = S.Adam(learning_rate)
    solver.set_parameters(nn.get_parameters())

    return x, t, loss, solver
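
# A minimal usage sketch of create_network: one training step on random dummy data.
# Assumes numpy plus the same nnabla imports/config (nn, data_config) as above; the
# batch size, number of dilations, and learning rate below are arbitrary choices.
import numpy as np

x, t, loss, solver = create_network(batch_size=4, num_dilations=10, learning_rate=1e-3)
x.d = np.random.randint(0, data_config.q_bit_len, size=x.shape)  # quantized input samples
t.d = np.random.randint(0, data_config.q_bit_len, size=t.shape)  # next-sample targets
solver.zero_grad()
loss.forward(clear_no_need_grad=True)
loss.backward(clear_buffer=True)
solver.update()
print(loss.d)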
Example #5
def speech_to_text(wav_files, labels_dict):
    n_mfcc = 60

    # load data

    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_training=False)

    wav_max_len = 673

    # load model
    model = WaveNet(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)

    saver = tf.train.Saver(tf.trainable_variables())

    test_wav = wav_files[:10]


    # word dict
    word_map = {value: key for key, value in speech_loader.wordmap.items()}
    print(word_map)

    with tf.Session() as sess:

        saver.restore(sess, tf.train.latest_checkpoint('../model'))

        # Build the CTC decoding ops once, outside the loop, so the graph does not
        # grow with every utterance.
        logits = tf.transpose(model.logit, perm=[1, 0, 2])
        decoded, log_probs = tf.nn.ctc_beam_search_decoder(logits, model.seq_len,
                                                           top_paths=1, merge_repeated=True)
        predict = tf.sparse_to_dense(decoded[0].indices,
                                     decoded[0].dense_shape,
                                     decoded[0].values) + 1

        for wav_path in test_wav:
            wav, sr = librosa.load(wav_path, mono=True)
            mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc),
                                               axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()

            # pad every utterance to the fixed maximum length
            # (a one-call numpy alternative is sketched after this function)
            while len(mfcc[0]) < wav_max_len:
                mfcc[0].append([0] * n_mfcc)

            # recognition
            output, probs = sess.run([predict, log_probs], feed_dict={model.input_data: mfcc})

            # result
            words = ''
            for i in range(len(output[0])):
                words += word_map.get(output[0][i], '')

            wav_name = os.path.basename(wav_path).split('.')[0]

            print('-------------------------------------------------------')
            print(f'Input: {wav_path}')
            print(f'Output: {words}')
            print(f'True result: {labels_dict[wav_name]}')
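
# Alternative to the per-utterance padding loop above: pad the (1, time, n_mfcc) MFCC
# array in a single numpy call. A sketch, assuming zero-padding along the time axis.
def pad_mfcc(mfcc_array, max_len):
    pad_len = max(0, max_len - mfcc_array.shape[1])
    return np.pad(mfcc_array, ((0, 0), (0, pad_len), (0, 0)), mode='constant')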
Example #6
def main():
    train_dataset = WaveNetDataset('train.list')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               num_workers=0,
                                               shuffle=True,
                                               batch_size=batch_size,
                                               pin_memory=False,
                                               drop_last=True)

    model = WaveNet().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    # training
    model.train()
    global_iters = 0
    for epoch in range(epochs):
        print('Epoch: {}'.format(epoch))
        for mel, audio in tqdm(train_loader,
                               total=len(train_loader),
                               desc='train'):
            mel = mel.to(device)
            audio = audio.to(device)
            optimizer.zero_grad()
            audio_pred = model(mel, audio)
            loss = criterion(audio_pred, audio)
            loss.backward()
            optimizer.step()

            if global_iters % logging_iters == 0:
                writer.add_scalar('train/loss', loss.item(), global_iters)

            if global_iters % checkpoint_iters == 0:
                checkpoint_path = os.path.join(
                    writer.logdir, 'checkpoint_{}.pth'.format(global_iters))
                save_checkpoint(model, optimizer, global_iters,
                                checkpoint_path)

            global_iters += 1
Example #7
def train():
    '''Train the WaveNet speech-to-text model on the THCHS-30 training set.'''

    batch_size = 8
    n_mfcc = 60
    n_epoch = 100

    source_file = '/home/ydf_micro/datasets/data_thchs30'
    speech_loader = SpeechLoader(os.path.join(source_file, 'train'),
                                 batch_size, n_mfcc)

    n_out = speech_loader.vocab_size

    # load model

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # sess.graph.finalize() # Graph is read-only after this statement

        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()
            for batch in range(speech_loader.n_batches):
                batch_start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {
                    model.input_data: batches_wav,
                    model.targets: batches_label
                }
                train_loss, _ = sess.run([model.cost, model.optimizer_op],
                                         feed_dict=feed)
                batch_end = time.time()
                print(
                    f'epoch: {epoch+1}/{n_epoch}, batch: {batch+1}/{speech_loader.n_batches}, '
                    f'loss: {train_loss:.2f}, time: {(batch_end-batch_start):.2f}s'
                )

            # save models
            if epoch % 5 == 0:
                saver.save(sess,
                           os.path.join(os.path.dirname(os.getcwd()), 'model',
                                        'speech.module'),
                           global_step=epoch)
Example #8
def create_model(hparams, ppg_dim_, fwh_dim, **kwargs):
    # resume from the most recent checkpoint if one exists (call load_recent_model once)
    recent = load_recent_model(hparams.model_save_path)
    if recent is not None:
        model_path, recent_epoch = recent
        with CustomObjectScope({'ZoneoutLSTMCell': ZoneoutLSTMCell}):
            mymodel = load_model(model_path)
        return mymodel, recent_epoch
    else:
        # if emotion is not used, drop the 4 emotion dimensions from ppg_dim
        if hparams.add_emotion is False:
            ppg_dim = ppg_dim_ - 4
        else:
            ppg_dim = ppg_dim_

        if hparams.network == 'BLSTM':
            return BRNN(-1, ppg_dim, fwh_dim, hparams).get_model(), 0
        elif hparams.network == 'WaveNet':
            return WaveNet(hparams.wavenet_input_time, fwh_dim,
                           ppg_dim * hparams.wavenet_context_size,
                           hparams).get_model(), 0
        elif hparams.network == 'Tacotron':
            Tacotron_model = TacotronDecoder(
                ppg_dim,
                fwh_dim,
                hparams,
                stateful=kwargs['stateful'],
                state_batch_size=kwargs['state_batch_size']).decode()
            if hparams.BLSTM_pretrain is True and hparams.Tacotron_encoder == 'BLSTM':
                BLSTM_model_path = load_best_model(hparams.BLSTM_pretrain_path,
                                                   hparams)
                with CustomObjectScope({'ZoneoutLSTMCell': ZoneoutLSTMCell}):
                    BLSTM_model = load_model(BLSTM_model_path)
                print(BLSTM_model.layers)
                for i in range(3):
                    Tacotron_model.get_layer(
                        'Encoder_BLSTM_' + str(i + 1)).set_weights(
                            BLSTM_model.layers[len(BLSTM_model.layers) - 4 +
                                               i].get_weights())
                    if hparams.BLSTM_finetune is False:
                        Tacotron_model.get_layer('Encoder_BLSTM_' +
                                                 str(i + 1)).trainable = False
            print(Tacotron_model.non_trainable_weights)
            return Tacotron_model, 0
        elif hparams.network == 'CNN':
            return CNN(ppg_dim, fwh_dim, hparams).get_model(), 0
        else:
            return None, 0
Example #9
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from model import WaveNet
from data import Piano

piano = Piano(data_dir='./Dataset_4s/', length=4)
training_data = DataLoader(piano, batch_size=2, shuffle=True)
model = WaveNet().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, 500, gamma=0.5)
recp_field = 1276

checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['state_dict'])
epoch_old = checkpoint['epoch']
optimizer.load_state_dict(checkpoint['optimizer'])
model.train()

for epoch in range(epoch_old, 10000):
    running_loss = 0.0
    for index, (data, target, _) in enumerate(training_data):
        data = Variable(data.type(torch.FloatTensor)).cuda()
        logits = model(data)
        logits = logits[:, :, :-1]
        y = target[:, :, recp_field:].squeeze(1).cuda()

        loss = F.cross_entropy(logits, y).cuda()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
Example #10
    if args.classifier:

        audio_data = AudioData()
        num_samples = audio_data.num_samples
        num_classes = audio_data.classes

        dilations = [
            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1, 2, 4, 8, 16, 32, 64, 128,
            256, 512
        ]

        network = WaveNet(num_samples,
                          num_classes,
                          dilations,
                          dilation_channels=32,
                          skip_channels=128,
                          output_channels=num_classes,
                          learning_rate=0.001)

        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            if args.logdir is not None and os.path.exists(args.logdir):
                checkpoint_state = tf.train.get_checkpoint_state(args.logdir)
                if checkpoint_state is not None:
                    try:
                        saver.restore(sess,
                                      checkpoint_state.model_checkpoint_path)
Example #11
    torch.backends.cudnn.deterministic=True
    torch.manual_seed(n)
    random.seed(n)
    np.random.seed(n)

if __name__ == '__main__':

    set_seed(42)

    featurizer = MelSpectrogram(MelSpectrogramConfig())

    mu_law = torchaudio.transforms.MuLawEncoding(quantization_channels=256)
    mu_law_dec = torchaudio.transforms.MuLawDecoding(quantization_channels=256)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = WaveNet(device).to(device)

    lr = 0.001
    optimizer = Adam(model.parameters(), lr=lr)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=1, factor=0.5)

    criterion = nn.CrossEntropyLoss()

    CROP = 12800
    # root = '/content/drive/MyDrive/DLA/hw5/LJSpeech-1.1/wavs/'

    root = 'LJSpeech-1.1/wavs/'

    wav_ids = [f for f in listdir(root) if isfile(join(root, f))]
Example #12
import torch
import torchaudio
import numpy as np
from model import WaveNet, MuLaw
from featurizer_spec import MelSpectrogram, MelSpectrogramConfig


mulaw = MuLaw()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# the featurizer has to be created after `device` is defined
featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device)

model = WaveNet()
model.load_state_dict(torch.load('wavenet_15.pth'))
model.to(device)
model.eval()

wav, _ = torchaudio.load("LJ033-0047.wav")
wav = wav.to(device)

with torch.no_grad():
    mels = featurizer(wav)
    prediction = model.inference(mels).squeeze()

result = mulaw.decode(prediction).cpu()
torchaudio.save('result.wav', result, 22050)
 
Example #13

def one_hot_encode(data, channels=256):
    data = data.numpy()
    one_hot = np.zeros((data.size, channels), dtype=float)
    one_hot[np.arange(data.size), data] = 1

    return one_hot
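
# Quick usage sketch of one_hot_encode; the assumed input is a 1-D LongTensor of
# mu-law codes in [0, channels), so each row of the result holds exactly one 1.0.
codes = torch.randint(0, 256, (5,))
encoded = one_hot_encode(codes)
assert encoded.shape == (5, 256) and encoded.sum() == 5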


seedloader = DataLoader(piano, batch_size=1, shuffle=True)

recp_field = 1276
sample_len = 4000 * 8

model = WaveNet().cuda()
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['state_dict'])

song = 6
with torch.no_grad():
    for seed, _, audio in seedloader:
        seed = seed[:, :, 500:recp_field + 500].float().cuda()
        #sample = Variable(sample.type(torch.FloatTensor)).cuda()
        output = seed
        for index in range(sample_len):
            #print(sample[:, :, -10:].argmax(1))
            new = model(seed)
            p = torch.distributions.categorical.Categorical(
                logits=new.squeeze())
            new_mag = p.sample()
Example #14
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from model import WaveNet
from data import Piano
import numpy as np

recp_field = 1276  # 5116 for (10, 5)

piano = Piano(data_dir='./Dataset_4s/', length=4)
training_data = DataLoader(piano, batch_size=2, shuffle=True)
model = WaveNet().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, 500, gamma=0.5)

for epoch in range(20000):
    running_loss = 0.0
    for index, (data, target, _) in enumerate(training_data):
        data = Variable(data.type(torch.FloatTensor)).cuda()
        logits = model(data)
        logits = logits[:, :, :-1]
        y = target[:, :, recp_field:].squeeze(1).cuda()

        loss = F.cross_entropy(logits, y).cuda()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
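
# The receptive-field constants used in these examples follow the usual WaveNet
# formula for kernel size 2: blocks * (2**dilation_depth - 1) + 1. A quick check;
# pairing (8, 5) with 1276 is inferred from the "# 5116 for (10, 5)" note above and
# the dilation_depth=8 used in a later example, not stated explicitly by the source.
def receptive_field(dilation_depth, n_blocks, kernel_size=2):
    # each block stacks dilations 1, 2, 4, ..., 2**(dilation_depth - 1)
    return n_blocks * (kernel_size - 1) * (2 ** dilation_depth - 1) + 1

assert receptive_field(8, 5) == 1276
assert receptive_field(10, 5) == 5116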
Example #15
def train():
    args = get_args()

    # Set context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in {}:{}".format(args.context, args.type_config))
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    data_iterator = data_iterator_librispeech(args.batch_size, args.data_dir)
    _data_source = data_iterator._data_source  # dirty hack...

    # model
    x = nn.Variable(
        shape=(args.batch_size, data_config.duration, 1))  # (B, T, 1)
    onehot = F.one_hot(x, shape=(data_config.q_bit_len, ))  # (B, T, C)
    wavenet_input = F.transpose(onehot, (0, 2, 1))  # (B, C, T)

    # speaker embedding
    if args.use_speaker_id:
        s_id = nn.Variable(shape=(args.batch_size, 1))
        with nn.parameter_scope("speaker_embedding"):
            s_emb = PF.embed(s_id, n_inputs=_data_source.n_speaker,
                             n_features=WavenetConfig.speaker_dims)
            s_emb = F.transpose(s_emb, (0, 2, 1))
    else:
        s_emb = None

    net = WaveNet()
    wavenet_output = net(wavenet_input, s_emb)

    pred = F.transpose(wavenet_output, (0, 2, 1))

    # (B, T, 1)
    t = nn.Variable(shape=(args.batch_size, data_config.duration, 1))

    loss = F.mean(F.softmax_cross_entropy(pred, t))

    # for generation
    prob = F.softmax(pred)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)

    # setup save env.
    audio_save_path = os.path.join(os.path.abspath(
        args.model_save_path), "audio_results")
    if audio_save_path and not os.path.exists(audio_save_path):
        os.makedirs(audio_save_path)

    # Training loop.
    for i in range(args.max_iter):
        # todo: validation

        x.d, _speaker, t.d = data_iterator.next()
        if args.use_speaker_id:
            s_id.d = _speaker.reshape(-1, 1)

        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.update()

        loss.data.cast(np.float32, ctx)
        monitor_loss.add(i, loss.d.copy())

        if i % args.model_save_interval == 0:
            prob.forward()
            audios = mu_law_decode(
                np.argmax(prob.d, axis=-1), quantize=data_config.q_bit_len)  # (B, T)
            save_audio(audios, i, audio_save_path)
Example #16
    print("residual_channels", args.residual_channels)
    print("dilation_channels", args.dilation_channels)
    print("skip_channels", args.skip_channels)
    print("end_channels", args.end_channels)

    device = torch.device('cpu')
    if torch.cuda.is_available() and args.use_cuda:
        device = torch.device('cuda')
    print("Device:", device)

    GLOBAL_CONDITIONING = False
    if (args.dataset == 'VCTK'):
        GLOBAL_CONDITIONING = True

    model = WaveNet(args.blocks, args.layers_per_block, GLOBAL_CONDITIONING, args.speakers,
                    output_channels=256, residual_channels=args.residual_channels,
                    dilation_channels=args.dilation_channels,
                    skip_channels=args.skip_channels, end_channels=args.end_channels)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    criterion = nn.CrossEntropyLoss()

    model.to(device)
    if args.load_path:
        state = torch.load(args.load_path, map_location='cpu')
        model.load_state_dict(state['model'])
        model.to(device)

        # temporary bug fix: recreate the optimizer before loading its saved state
        optimizer = optim.Adam(model.parameters(), lr=0.01)
        optimizer.load_state_dict(state['optimizer'])
Example #17
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from model import WaveNet, GenLSTM, DisLSTM
from data import Piano
import torchaudio
import numpy as np

recp_field = 1276
sample_len = 4000 * 8

netW = WaveNet(dilation_depth=8).cuda()
netG = GenLSTM(stride=500).double().cuda()
netD = DisLSTM().double().cuda()
optimizerD = optim.Adam(netD.parameters(), lr=1e-3)
optimizerG = optim.Adam(netG.parameters(), lr=1e-3)

piano = Piano(data_dir='./Dataset_4s', length=4, sample_rate=4000)
dataloader = DataLoader(piano, batch_size=5, shuffle=True)
seedloader = DataLoader(piano, batch_size=1, shuffle=True)

checkpoint = torch.load('checkpoint.pth')
netW.load_state_dict(checkpoint['state_dict'])

criterion = nn.BCELoss()


def wavenetGen(batch_size=5, sample_len=4, recp_field=1276):
    for i, (seed, _, _) in enumerate(seedloader):
Example #18
train_dataset = LJSpeechDataset(train)
test_dataset = LJSpeechDataset(test)

train_dataloader = DataLoader(train_dataset,
                              batch_size=6,
                              num_workers=8,
                              shuffle=False,
                              pin_memory=True)
test_dataloader = DataLoader(test_dataset,
                             batch_size=6,
                             num_workers=8,
                             pin_memory=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = WaveNet()

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device)
model.to(device)
wandb.watch(model, log="all")

N_EPOCHS = 14

for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train_model(model, train_dataloader, optimizer,
                                        criterion)
    test_loss, test_acc = evaluate(model, test_dataloader, criterion)

    wandb.log({
        'train_loss': train_loss,   # metric keys here are illustrative
        'train_acc': train_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
    })
Example #19
print(device)
BATCH_SIZE = 1

torch.backends.cudnn.deterministic = True
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

from dataset import load_dataset
dataloader_train, dataloader_val = load_dataset(featurizer, BATCH_SIZE)

from model import WaveNet

generator = WaveNet(n_mels=80, n_blocks=20).to(device)

from math import exp, log

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import wandb
run = wandb.init(project="DLA_HW5",
                 config={
                     "n_mels": 80,
                     "n_blocks": 20,
                     "learn_rate": 0.001,
                     "batch_size": 1
                 })
config = wandb.config
Example #20
X_train, X_test = train_test_split(list(df['id']), train_size=.9)

train_dataset = LJSpeech(X_train, train=True)
test_dataset = LJSpeech(X_test, train=False)

train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=8)
val_loader = DataLoader(test_dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=8)

model = WaveNet(device, MelSpectrogramConfig())

model = model.to(device)

learning_rate = 0.001

error = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=10,
                                               gamma=0.5)

for epoch in range(num_epochs):
    train(epoch, train_loader, model, device, optimizer, error)