def train():
    batch_size = 32
    n_epoch = 100
    n_mfcc = 60

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model parameters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    speech_loader.create_batches()
    model.train_val(speech_loader.mfcc_tensor, speech_loader.label_tensor,
                    ckpt_dir=TRAIN_DIR, n_epoch=n_epoch, val_rate=0.15)
def main():
    model = WaveNet()
    checkpoint = torch.load(
        'runs/Oct09_11-24-52_K-00030-LIN/checkpoint_9000.pth')
    model.load_state_dict(checkpoint['model'])
    weights = model.export_weights()
    wavenet = nv_wavenet.NVWaveNet(**weights)

    # TODO: run with batch size 1 for now
    # TODO: when inferring multiple utterances at once, the inputs
    #       need to be padded to a common length
    filename = 'data/arctic_a0001.wav'
    audio, sampling_rate = load_wav_to_torch(filename)
    mel = get_mel(audio)
    mel.unsqueeze_(0)
    print(mel.shape)

    # Reshape to match the NVWaveNet input layout:
    # (channels, batch=1, num_layers, samples)
    cond_input = get_cond_input(mel, model)

    # Generate the waveform.
    # The generated samples are still mu-law encoded and must be decoded.
    audio_data = wavenet.infer(cond_input, nv_wavenet.Impl.AUTO)
    print(audio_data.shape)
    print(audio_data.min(), audio_data.max())

    # wavenet.A is the number of mu-law quantization channels (mu_quantization)
    audio = mu_law_decode_numpy(audio_data[0].cpu().numpy(), wavenet.A)
    audio = MAX_WAV_VALUE * audio
    wavdata = audio.astype('int16')
    scipy.io.wavfile.write('gen.wav', 16000, wavdata)
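# For reference, a minimal sketch of the mu-law decoding step used above.
# mu_law_decode_numpy and its exact signature are assumed here (the helper is
# not shown in this listing); it maps quantized integer codes in
# [0, mu_quantization) back to floats in [-1, 1] via the standard mu-law
# expansion.
import numpy as np

def mu_law_decode_numpy(signal, mu_quantization=256):
    mu = mu_quantization - 1.0
    # rescale integer codes to [-1, 1]
    x = signal.astype(np.float32) / mu * 2.0 - 1.0
    # invert the mu-law companding curve
    magnitude = (1.0 + mu) ** np.abs(x) - 1.0
    return np.sign(x) * magnitude / mu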
def speech_to_text():
    n_mfcc = 60
    batch_size = 1
    n_epoch = 100

    speech_loader = SpeechLoader(batch_size=batch_size, n_mfcc=n_mfcc)
    n_out = speech_loader.vocab_size

    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)
    model.build_graph()

    chpt = tf.train.get_checkpoint_state(TRAIN_DIR)
    if chpt:
        print("restore model parameters from %s" % chpt.model_checkpoint_path)
        model.restore(chpt.model_checkpoint_path)
    else:
        print("init a new model.")
        model.init_sess()

    file_names = os.listdir(TEXT_DIR)
    file_list = [os.path.join(TEXT_DIR, file_name) for file_name in file_names]

    step = 0
    for file in file_list:
        step += 1
        mfcc_features = speech_loader.load_one_file(file)
        output = model.predict(mfcc_features)
        # convert index sequence back to words
        words = speech_loader.index2str(output[0])
        print("Input(%d): %s" % (step, file))
        print("Output(%d): %s" % (step, words))
def create_network(batch_size, num_dilations, learning_rate):
    # model
    x = nn.Variable(shape=(batch_size, data_config.duration, 1))  # (B, T, 1)
    onehot = F.one_hot(x, shape=(data_config.q_bit_len, ))        # (B, T, C)
    wavenet_input = F.transpose(onehot, (0, 2, 1))                # (B, C, T)

    # speaker embedding
    s_emb = None

    net = WaveNet(num_dilations)
    wavenet_output = net(wavenet_input, s_emb)
    pred = F.transpose(wavenet_output, (0, 2, 1))                 # (B, T, C)

    t = nn.Variable(shape=(batch_size, data_config.duration, 1))
    loss = F.mean(F.softmax_cross_entropy(pred, t))
    # loss.visit(PrintFunc())

    # Create Solver.
    solver = S.Adam(learning_rate)
    solver.set_parameters(nn.get_parameters())

    return x, t, loss, solver
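# Illustrative usage of create_network, mirroring the full training script
# further below. data_iterator and max_iter are placeholders assumed to exist;
# the iterator is assumed to yield (input, speaker, target) batches of
# mu-law quantized waveforms shaped (B, T, 1).
x, t, loss, solver = create_network(batch_size=4, num_dilations=10, learning_rate=1e-3)
for i in range(max_iter):
    x.d, _, t.d = data_iterator.next()
    solver.zero_grad()
    loss.forward(clear_no_need_grad=True)
    loss.backward(clear_buffer=True)
    solver.update()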
def speech_to_text(wav_files, labels_dict):
    n_mfcc = 60

    # load data
    speech_loader = SpeechLoader(n_mfcc=n_mfcc, is_training=False)
    wav_max_len = 673

    # load model
    model = WaveNet(speech_loader.vocab_size, n_mfcc=n_mfcc, is_training=False)
    saver = tf.train.Saver(tf.trainable_variables())

    test_wav = wav_files[:10]

    # word dict
    word_map = {value: key for key, value in speech_loader.wordmap.items()}
    print(word_map)

    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('../model'))
        for wav_path in test_wav:
            wav, sr = librosa.load(wav_path, mono=True)
            mfcc = np.transpose(np.expand_dims(
                librosa.feature.mfcc(wav, sr, n_mfcc=n_mfcc), axis=0), [0, 2, 1])
            mfcc = mfcc.tolist()
            # pad with zero frames up to the fixed input length
            while len(mfcc[0]) < wav_max_len:
                mfcc[0].append([0] * n_mfcc)

            # recognition
            decoded = tf.transpose(model.logit, perm=[1, 0, 2])
            decoded, probs = tf.nn.ctc_beam_search_decoder(
                decoded, model.seq_len, top_paths=1, merge_repeated=True)
            predict = tf.sparse_to_dense(decoded[0].indices,
                                         decoded[0].dense_shape,
                                         decoded[0].values) + 1
            output, probs = sess.run([predict, probs],
                                     feed_dict={model.input_data: mfcc})

            # result (skip indices that are missing from the word map)
            words = ''
            for i in range(len(output[0])):
                words += word_map.get(output[0][i], '')
            wav_name = os.path.basename(wav_path).split('.')[0]
            print('-------------------------------------------------------')
            print(f'Input: {wav_path}')
            print(f'Output: {words}')
            print(f'True result: {labels_dict[wav_name]}')
def main():
    train_dataset = WaveNetDataset('train.list')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               num_workers=0,
                                               shuffle=True,
                                               batch_size=batch_size,
                                               pin_memory=False,
                                               drop_last=True)

    model = WaveNet().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    # training
    model.train()
    global_iters = 0
    for epoch in range(epochs):
        print('Epoch: {}'.format(epoch))
        for mel, audio in tqdm(train_loader, total=len(train_loader), desc='train'):
            mel = mel.to(device)
            audio = audio.to(device)

            optimizer.zero_grad()
            audio_pred = model(mel, audio)
            loss = criterion(audio_pred, audio)
            loss.backward()
            optimizer.step()

            if global_iters % logging_iters == 0:
                writer.add_scalar('train/loss', loss.item(), global_iters)

            if global_iters % checkpoint_iters == 0:
                checkpoint_path = os.path.join(
                    writer.logdir, 'checkpoint_{}.pth'.format(global_iters))
                save_checkpoint(model, optimizer, global_iters, checkpoint_path)

            global_iters += 1
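# A minimal sketch of the save_checkpoint helper assumed above; the original
# implementation is not included in this listing. The 'model' key matches the
# key read back via torch.load(...)['model'] in the inference script earlier;
# the 'optimizer' and 'iteration' keys are assumptions.
def save_checkpoint(model, optimizer, iteration, filepath):
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'iteration': iteration}, filepath)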
def train():
    '''
    :return:
    '''
    batch_size = 8
    n_mfcc = 60
    n_epoch = 100
    source_file = '/home/ydf_micro/datasets/data_thchs30'

    speech_loader = SpeechLoader(os.path.join(source_file, 'train'), batch_size, n_mfcc)
    n_out = speech_loader.vocab_size

    # load model
    model = WaveNet(n_out, batch_size=batch_size, n_mfcc=n_mfcc)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # sess.graph.finalize()  # Graph is read-only after this statement
        for epoch in range(n_epoch):
            speech_loader.create_batches()  # random shuffle data
            speech_loader.reset_batch_pointer()
            for batch in range(speech_loader.n_batches):
                batch_start = time.time()
                batches_wav, batches_label = speech_loader.next_batch()
                feed = {model.input_data: batches_wav,
                        model.targets: batches_label}
                train_loss, _ = sess.run([model.cost, model.optimizer_op], feed_dict=feed)
                batch_end = time.time()
                print(f'epoch: {epoch+1}/{n_epoch}, '
                      f'batch: {batch+1}/{speech_loader.n_batches}, '
                      f'loss: {train_loss:.2f}, time: {(batch_end-batch_start):.2f}s')

            # save models
            if epoch % 5 == 0:
                saver.save(sess,
                           os.path.join(os.path.dirname(os.getcwd()), 'model', 'speech.module'),
                           global_step=epoch)
def create_model(hparams, ppg_dim_, fwh_dim, **kwargs):
    if load_recent_model(hparams.model_save_path) is not None:
        model_path, recent_epoch = load_recent_model(hparams.model_save_path)
        with CustomObjectScope({'ZoneoutLSTMCell': ZoneoutLSTMCell}):
            mymodel = load_model(model_path)
        return mymodel, recent_epoch
    else:
        # if emotion is not used, drop the 4 emotion dimensions from ppg_dim
        if hparams.add_emotion is False:
            ppg_dim = ppg_dim_ - 4
        else:
            ppg_dim = ppg_dim_

        if hparams.network == 'BLSTM':
            return BRNN(-1, ppg_dim, fwh_dim, hparams).get_model(), 0
        elif hparams.network == 'WaveNet':
            return WaveNet(hparams.wavenet_input_time, fwh_dim,
                           ppg_dim * hparams.wavenet_context_size, hparams).get_model(), 0
        elif hparams.network == 'Tacotron':
            Tacotron_model = TacotronDecoder(
                ppg_dim, fwh_dim, hparams,
                stateful=kwargs['stateful'],
                state_batch_size=kwargs['state_batch_size']).decode()
            if hparams.BLSTM_pretrain is True and hparams.Tacotron_encoder == 'BLSTM':
                BLSTM_model_path = load_best_model(hparams.BLSTM_pretrain_path, hparams)
                with CustomObjectScope({'ZoneoutLSTMCell': ZoneoutLSTMCell}):
                    BLSTM_model = load_model(BLSTM_model_path)
                print(BLSTM_model.layers)
                # copy the pretrained BLSTM weights into the Tacotron encoder
                for i in range(3):
                    Tacotron_model.get_layer('Encoder_BLSTM_' + str(i + 1)).set_weights(
                        BLSTM_model.layers[len(BLSTM_model.layers) - 4 + i].get_weights())
                    if hparams.BLSTM_finetune is False:
                        Tacotron_model.get_layer('Encoder_BLSTM_' + str(i + 1)).trainable = False
                print(Tacotron_model.non_trainable_weights)
            return Tacotron_model, 0
        elif hparams.network == 'CNN':
            return CNN(ppg_dim, fwh_dim, hparams).get_model(), 0
        else:
            return None, 0
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

from model import WaveNet
from data import Piano

piano = Piano(data_dir='./Dataset_4s/', length=4)
training_data = DataLoader(piano, batch_size=2, shuffle=True)

model = WaveNet().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, 500, gamma=0.5)
recp_field = 1276

# resume from the saved checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['state_dict'])
epoch_old = checkpoint['epoch']
optimizer.load_state_dict(checkpoint['optimizer'])

model.train()
for epoch in range(epoch_old, 10000):
    running_loss = 0.0
    for index, (data, target, _) in enumerate(training_data):
        data = Variable(data.type(torch.FloatTensor)).cuda()
        logits = model(data)
        logits = logits[:, :, :-1]
        y = target[:, :, recp_field:].squeeze(1).cuda()
        loss = F.cross_entropy(logits, y).cuda()
        optimizer.zero_grad()
if args.classifier:
    audio_data = AudioData()
    num_samples = audio_data.num_samples
    num_classes = audio_data.classes

    dilations = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
                 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

    network = WaveNet(num_samples,
                      num_classes,
                      dilations,
                      dilation_channels=32,
                      skip_channels=128,
                      output_channels=num_classes,
                      learning_rate=0.001)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if args.logdir is not None and os.path.exists(args.logdir):
            checkpoint_state = tf.train.get_checkpoint_state(args.logdir)
            if checkpoint_state is not None:
                try:
                    saver.restore(sess, checkpoint_state.model_checkpoint_path)
def set_seed(n):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(n)
    random.seed(n)
    np.random.seed(n)


if __name__ == '__main__':
    set_seed(42)

    featurizer = MelSpectrogram(MelSpectrogramConfig())
    mu_law = torchaudio.transforms.MuLawEncoding(quantization_channels=256)
    mu_law_dec = torchaudio.transforms.MuLawDecoding(quantization_channels=256)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = WaveNet(device).to(device)

    lr = 0.001
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True,
                                                           patience=1, factor=0.5)
    criterion = nn.CrossEntropyLoss()

    CROP = 12800
    # root = '/content/drive/MyDrive/DLA/hw5/LJSpeech-1.1/wavs/'
    root = 'LJSpeech-1.1/wavs/'
    wav_ids = [f for f in listdir(root) if isfile(join(root, f))]
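    # Illustrative only: a minimal sketch of how one training example could be
    # prepared with the objects defined above (featurizer, mu_law, CROP, root,
    # wav_ids). The actual dataset/collate code is not part of this listing, and
    # the featurizer call signature is an assumption.
    wav, sr = torchaudio.load(join(root, wav_ids[0]))  # (1, T) waveform in [-1, 1]
    wav = wav[:, :CROP]                                # crop to a fixed window
    mel = featurizer(wav)                              # mel conditioning features
    target = mu_law(wav)                               # integer targets in [0, 255]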
import torch
import torchaudio
import numpy as np

from model import WaveNet, MuLaw
from featurizer_spec import MelSpectrogram, MelSpectrogramConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

mulaw = MuLaw()
featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device)

model = WaveNet()
model.load_state_dict(torch.load('wavenet_15.pth'))
model.to(device)
model.eval()

wav, _ = torchaudio.load("LJ033-0047.wav")
wav = wav.to(device)

with torch.no_grad():
    mels = featurizer(wav)
    prediction = model.inference(mels).squeeze()

result = mulaw.decode(prediction).cpu()
torchaudio.save('result.wav', result, 22050)
def one_hot_encode(data, channels=256):
    data = data.numpy()
    one_hot = np.zeros((data.size, channels), dtype=float)
    one_hot[np.arange(data.size), data] = 1
    return one_hot


seedloader = DataLoader(piano, batch_size=1, shuffle=True)

recp_field = 1276
sample_len = 4000 * 8

model = WaveNet().cuda()
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['state_dict'])

song = 6
with torch.no_grad():
    for seed, _, audio in seedloader:
        seed = seed[:, :, 500:recp_field + 500].float().cuda()
        # sample = Variable(sample.type(torch.FloatTensor)).cuda()
        output = seed
        for index in range(sample_len):
            # print(sample[:, :, -10:].argmax(1))
            new = model(seed)
            p = torch.distributions.categorical.Categorical(logits=new.squeeze())
            new_mag = p.sample()
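            # Illustrative only: one plausible way to close the autoregressive
            # loop above. The sampled class index is turned into a one-hot frame
            # (using one_hot_encode defined earlier), appended to the generated
            # signal, and the conditioning window is shifted by one sample.
            # Shapes and details are assumptions; the rest of the original loop
            # is not shown in this listing.
            new_frame = torch.from_numpy(
                one_hot_encode(new_mag.cpu().view(1), channels=256)
            ).float().cuda().transpose(0, 1).unsqueeze(0)        # (1, 256, 1)
            output = torch.cat([output, new_frame], dim=2)       # grow the generated signal
            seed = torch.cat([seed[:, :, 1:], new_frame], dim=2)  # slide the receptive field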
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader

from model import WaveNet
from data import Piano
import numpy as np

recp_field = 1276  # 5116 for (10, 5)

piano = Piano(data_dir='./Dataset_4s/', length=4)
training_data = DataLoader(piano, batch_size=2, shuffle=True)

model = WaveNet().cuda()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, 500, gamma=0.5)

for epoch in range(20000):
    running_loss = 0.0
    for index, (data, target, _) in enumerate(training_data):
        data = Variable(data.type(torch.FloatTensor)).cuda()
        logits = model(data)
        logits = logits[:, :, :-1]
        y = target[:, :, recp_field:].squeeze(1).cuda()
        loss = F.cross_entropy(logits, y).cuda()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
def train():
    args = get_args()

    # Set context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in {}:{}".format(args.context, args.type_config))
    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)

    data_iterator = data_iterator_librispeech(args.batch_size, args.data_dir)
    _data_source = data_iterator._data_source  # dirty hack...

    # model
    x = nn.Variable(shape=(args.batch_size, data_config.duration, 1))  # (B, T, 1)
    onehot = F.one_hot(x, shape=(data_config.q_bit_len, ))             # (B, T, C)
    wavenet_input = F.transpose(onehot, (0, 2, 1))                     # (B, C, T)

    # speaker embedding
    if args.use_speaker_id:
        s_id = nn.Variable(shape=(args.batch_size, 1))
        with nn.parameter_scope("speaker_embedding"):
            s_emb = PF.embed(s_id,
                             n_inputs=_data_source.n_speaker,
                             n_features=WavenetConfig.speaker_dims)
            s_emb = F.transpose(s_emb, (0, 2, 1))
    else:
        s_emb = None

    net = WaveNet()
    wavenet_output = net(wavenet_input, s_emb)

    pred = F.transpose(wavenet_output, (0, 2, 1))                      # (B, T, C)

    t = nn.Variable(shape=(args.batch_size, data_config.duration, 1))
    loss = F.mean(F.softmax_cross_entropy(pred, t))

    # for generation
    prob = F.softmax(pred)

    # Create Solver.
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=10)

    # setup save env.
    audio_save_path = os.path.join(os.path.abspath(args.model_save_path), "audio_results")
    if audio_save_path and not os.path.exists(audio_save_path):
        os.makedirs(audio_save_path)

    # Training loop.
    for i in range(args.max_iter):
        # todo: validation
        x.d, _speaker, t.d = data_iterator.next()
        if args.use_speaker_id:
            s_id.d = _speaker.reshape(-1, 1)
        solver.zero_grad()
        loss.forward(clear_no_need_grad=True)
        loss.backward(clear_buffer=True)
        solver.update()

        loss.data.cast(np.float32, ctx)
        monitor_loss.add(i, loss.d.copy())

        if i % args.model_save_interval == 0:
            prob.forward()
            audios = mu_law_decode(np.argmax(prob.d, axis=-1),
                                   quantize=data_config.q_bit_len)  # (B, T)
            save_audio(audios, i, audio_save_path)
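# A minimal sketch of the save_audio helper assumed above; the original
# implementation is not part of this listing. It writes each decoded waveform
# in the batch (floats in [-1, 1]) as a 16-bit wav file. The sample rate and
# the use of scipy.io.wavfile are assumptions.
import scipy.io.wavfile

def save_audio(audios, iteration, save_dir, sample_rate=16000):
    for b, audio in enumerate(audios):
        wav = (audio * 32767).astype(np.int16)
        path = os.path.join(save_dir, "iter{}_batch{}.wav".format(iteration, b))
        scipy.io.wavfile.write(path, sample_rate, wav)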
print("residual_channels", args.residual_channels) print("dilation_channels", args.dilation_channels) print("skip_channels", args.skip_channels) print("end_channels", args.end_channels) device = torch.device('cpu') if torch.cuda.is_available() and args.use_cuda: device = torch.device('cuda') print("Device:", device) GLOBAL_CONDITIONING = False if (args.dataset == 'VCTK'): GLOBAL_CONDITIONING = True model = WaveNet(args.blocks, args.layers_per_block, GLOBAL_CONDITIONING, args.speakers, \ output_channels=256, residual_channels=args.residual_channels, dilation_channels=args.dilation_channels, \ skip_channels=args.skip_channels, end_channels=args.end_channels) optimizer = optim.Adam(model.parameters(), lr=0.01) criterion = nn.CrossEntropyLoss() model.to(device) if args.load_path: state = torch.load(args.load_path, map_location='cpu') model.load_state_dict(state['model']) model.to(device) #temp bug fix optimizer = optim.Adam(model.parameters(), lr=0.01) optimizer.load_state_dict(state['optimizer'])
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

from model import WaveNet, GenLSTM, DisLSTM
from data import Piano
import torchaudio
import numpy as np

recp_field = 1276
sample_len = 4000 * 8

netW = WaveNet(dilation_depth=8).cuda()
netG = GenLSTM(stride=500).double().cuda()
netD = DisLSTM().double().cuda()

optimizerD = optim.Adam(netD.parameters(), lr=1e-3)
optimizerG = optim.Adam(netG.parameters(), lr=1e-3)

piano = Piano(data_dir='./Dataset_4s', length=4, sample_rate=4000)
dataloader = DataLoader(piano, batch_size=5, shuffle=True)
seedloader = DataLoader(piano, batch_size=1, shuffle=True)

checkpoint = torch.load('checkpoint.pth')
netW.load_state_dict(checkpoint['state_dict'])

criterion = nn.BCELoss()


def wavenetGen(batch_size=5, sample_len=4, recp_field=1276):
    for i, (seed, _, _) in enumerate(seedloader):
train_dataset = LJSpeechDataset(train)
test_dataset = LJSpeechDataset(test)

train_dataloader = DataLoader(train_dataset, batch_size=6, num_workers=8,
                              shuffle=False, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=6, num_workers=8,
                             pin_memory=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = WaveNet()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
featurizer = MelSpectrogram(MelSpectrogramConfig()).to(device)
model.to(device)

wandb.watch(model, log="all")

N_EPOCHS = 14
for epoch in tqdm(range(N_EPOCHS)):
    train_loss, train_acc = train_model(model, train_dataloader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_dataloader, criterion)
    wandb.log({
print(device)

BATCH_SIZE = 1

torch.backends.cudnn.deterministic = True
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

from dataset import load_dataset
dataloader_train, dataloader_val = load_dataset(featurizer, BATCH_SIZE)

from model import WaveNet
generator = WaveNet(n_mels=80, n_blocks=20).to(device)

from math import exp, log
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import wandb
run = wandb.init(project="DLA_HW5",
                 config={
                     "n_mels": 80,
                     "n_blocks": 20,
                     "learn_rate": 0.001,
                     "batch_size": 1
                 })
config = wandb.config
X_train, X_test = train_test_split(list(df['id']), train_size=.9)

train_dataset = LJSpeech(X_train, train=True)
test_dataset = LJSpeech(X_test, train=False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

model = WaveNet(device, MelSpectrogramConfig())
model = model.to(device)

learning_rate = 0.001
error = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for epoch in range(num_epochs):
    train(epoch, train_loader, model, device, optimizer, error)