def list_data(
    dataset_dir,
    speaker_ids,
    noise_dirs,
    max_files=None,
    shuffle=True,
    augmentation_factor=1,
    oversample_noise=True,
):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_subset = noise_dataset.subset(max_files, shuffle)

    if not oversample_noise:
        n_files = min(len(speech_subset), len(noise_subset))
        speech_entries = speech_subset[:n_files]
        noise_entries = noise_subset[:n_files]
    else:
        speech_and_noise_entries = [
            (s, n) for s, n in zip(speech_subset, itertools.cycle(noise_subset))
        ]
        speech_entries, noise_entries = [
            list(x) for x in zip(*speech_and_noise_entries)
        ]

    # copy, so that the += below extends the new lists instead of also
    # mutating speech_entries / noise_entries (aliasing would double the
    # lists on every iteration)
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_entries)

    for _ in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_entries, len(noise_entries))

    return all_speech_entries, all_noise_file_paths
def list_data(dataset_dir, speaker_ids, noise_dirs, max_files=None,
              shuffle=True, augmentation_factor=1):
    speech_dataset = AudioVisualDataset(dataset_dir)
    speech_subset = speech_dataset.subset(speaker_ids, max_files, shuffle)

    noise_dataset = AudioDataset(noise_dirs)
    noise_file_paths = noise_dataset.subset(max_files, shuffle)

    n_files = min(len(speech_subset), len(noise_file_paths))
    speech_entries = speech_subset[:n_files]
    noise_file_paths = noise_file_paths[:n_files]

    # copy, so that the += below does not also grow the original lists
    all_speech_entries = list(speech_entries)
    all_noise_file_paths = list(noise_file_paths)

    for _ in range(augmentation_factor - 1):
        all_speech_entries += speech_entries
        all_noise_file_paths += random.sample(noise_file_paths,
                                              len(noise_file_paths))

    return all_speech_entries, all_noise_file_paths
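Both list_data variants above lean on the same pairing idiom: itertools.cycle repeats the (usually shorter) noise list so that every speech entry receives a noise partner. A short, self-contained sketch with toy lists, for illustration only:

import itertools

speech = ['s1', 's2', 's3', 's4', 's5']
noise = ['n1', 'n2']

# cycle() restarts the noise list whenever it runs out; zip() stops at
# the end of the (finite) speech list
pairs = list(zip(speech, itertools.cycle(noise)))
print(pairs)
# [('s1', 'n1'), ('s2', 'n2'), ('s3', 'n1'), ('s4', 'n2'), ('s5', 'n1')]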
def main(log_dir, results_path):
    model = AudioDenoiserNet(cfg.SEQUENCE_LENGTH)

    # weight initialization
    model = model.apply(weight_init)
    if torch.cuda.is_available():
        model.cuda()

    batch_size = cfg.BATCH_SIZE
    learning_rate = cfg.LEARNING_RATE
    num_epochs = cfg.NUM_EPOCHS

    train_dataset = AudioDataset(cfg.DATA_DIR, "training")
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size,
                                   shuffle=False, num_workers=2)

    val_dataset = AudioDataset(cfg.DATA_DIR, "validation")
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size,
                                 shuffle=False, num_workers=2)

    # hyperparameter_search(model, train_data_loader, val_data_loader, log_dir, num_epochs)
    train_loss, valid_loss = train_model(model, train_data_loader, val_data_loader,
                                         log_dir, batch_size, learning_rate, num_epochs)
def main(csv_base):
    sequence_length = 8
    model = AudioDenoiserNet(sequence_length)

    # weight initialization
    model = model.apply(weight_init)
    if torch.cuda.is_available():
        model.cuda()

    batch_size = 512
    learning_rate = 0.001
    num_epochs = 30

    data_dir = "/local/mnt2/workspace2/tkuai/cnn_audio_denoiser/pytorch_dataset2/old_data"
    saving_figure = "/local/mnt2/workspace2/tkuai/cnn_audio_denoiser/pytorch_model2/results2/"

    train_dataset = AudioDataset(data_dir, "training")
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size,
                                   shuffle=False, num_workers=2)

    val_dataset = AudioDataset(data_dir, "validation")
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size,
                                 shuffle=False, num_workers=2)

    train_loss, valid_loss = train_model(model, train_data_loader, val_data_loader,
                                         csv_base, batch_size, learning_rate, num_epochs)
    plot_loss(train_loss, valid_loss, saving_figure)
def main(args):
    # initialize dataset
    audio_dataset = AudioDataset(args.manifest, batch_size=args.batch_size,
                                 root_dir=args.root_dir)
    audio_dataloader = audio_dataset.dataloader()

    # initialize attack class
    attack = Attack(args.model_path,
                    batch_size=args.batch_size,
                    lr_stage1=args.lr_stage1,
                    lr_stage2=args.lr_stage2,
                    num_iter_stage1=args.num_iter_stage1,
                    num_iter_stage2=args.num_iter_stage2)
    attack.attack_stage1(audio_dataloader)

    # inspect one batch, then quit
    for idx, batch in enumerate(audio_dataloader):
        # print(idx, batch)
        for key, value in batch.items():
            try:
                print(key, ':', value.shape)
            except AttributeError:
                print(key, ':', value)
        exit()
def test_audio_dataset(self):
    mel_ids = ['small_sample']
    text_dict = {'small_sample': 'Small sample text.'}
    cleaner = lambda x: x.lower()
    symbols = 'abcdefghijklmnopqrstuvwxyz. '
    tokenizer = Tokenizer(cleaners=cleaner, symbols=symbols)
    dataset = AudioDataset(mel_path=self.mel_path, mel_ids=mel_ids,
                           text_dict=text_dict, tokenizer=tokenizer)
    self.assertEqual(1, len(dataset))

    seq, mel, mel_id, mel_len = dataset[0]
    text = tokenizer.decode(seq)
    self.assertEqual('small sample text.', text)
    self.assertEqual((101, 80), mel.shape)
    self.assertEqual('small_sample', mel_id)
    self.assertEqual(101, mel_len)
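The real Tokenizer lives elsewhere in that repo; this is a hedged sketch of the interface the test implies (a cleaner callable plus a symbol table, with encode/decode as inverses), inferred from the assertions rather than taken from the source:

class Tokenizer:  # hypothetical stand-in, inferred from the test above
    def __init__(self, cleaners, symbols):
        self.cleaners = cleaners
        self.symbol_to_id = {s: i for i, s in enumerate(symbols)}
        self.id_to_symbol = {i: s for i, s in enumerate(symbols)}

    def encode(self, text):
        # clean first, then map each known character to its id
        text = self.cleaners(text)
        return [self.symbol_to_id[c] for c in text if c in self.symbol_to_id]

    def decode(self, sequence):
        return ''.join(self.id_to_symbol[i] for i in sequence)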
def test(save_epoch, batchsize, data_path, save_path, modeldir, cls_num):
    # Dataset definition
    dataset = AudioDataset(data_path)
    collator = AudioCollator(cls_num)

    # Model definition
    generator = Generator(cls_num=cls_num)
    generator.load_state_dict(
        torch.load(f"{modeldir}/generator_{save_epoch - 1}.model"))
    generator.eval()  # evaluation mode

    # Data loader
    dataloader = DataLoader(dataset,
                            batch_size=batchsize,
                            shuffle=False,
                            collate_fn=collator,
                            drop_last=True)
    dataloader = tqdm(dataloader)

    output = []

    # Evaluation
    for i, data in enumerate(dataloader):
        x_sp, x_label, y_label = data
        x_to_y = torch.cat([y_label, x_label], dim=1)
        y_to_x = torch.cat([x_label, y_label], dim=1)
        x_to_x = torch.cat([x_label, x_label], dim=1)

        # Generator inference
        y_eval = generator(x_sp, x_to_y)
        y_npy = y_eval.to('cpu').detach().numpy().flatten()

        # Save to list
        output.append(y_npy)

    # Writer: concatenate the per-batch chunks into one 1-D waveform
    # (np.array(output) would yield a 2-D or ragged array, which cannot
    # be written as a mono wav file)
    out_array = np.concatenate(output)
    out_array = 0.8 * out_array / np.max(np.abs(out_array))  # normalization
    path = str(Path(save_path)) + '.wav'
    write_wav(path, out_array, sr=22050)
"n_epochs": 20, "dropout": [0.5, 0.3], "masking": [20, 10], "sample_rate": 22050, "n_mels": 128, "n_fft": 1024, "win_length": 512, "hop_length": 512, "augment": True } train_loader = DataLoader(AudioDataset( path=os.path.join("audio", "train"), sample_rate=config["sample_rate"], n_mels=config["n_mels"], n_fft=config["n_fft"], win_length=config["win_length"], hop_length=config["hop_length"], augment=config["augment"], ), batch_size=config["batch_size"], shuffle=True, pin_memory=True) val_loader = DataLoader(AudioDataset( path=os.path.join("audio", "validation"), sample_rate=config["sample_rate"], n_mels=config["n_mels"], n_fft=config["n_fft"], win_length=config["win_length"], hop_length=config["hop_length"], ),
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from logger import Logger
import torch.utils.data
from dataset import AudioDataset
from model import AudioCycleGAN
from hparams import hparams as opt
import time

# Audio dataset (preprocessed features)
dataset = AudioDataset("./data/preprocess")

# Data loader
data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                          batch_size=2,
                                          shuffle=True)
data_iter = iter(data_loader)
iter_per_epoch = len(data_loader)

# CycleGAN model and logger
model = AudioCycleGAN(opt)
logger = Logger('./logs')

# Loss and optimizer
model.setup(opt)  # regular setup: load and print networks; create schedulers

total_iters = 0  # the total number of training iterations
for epoch in range(
        opt.n_epochs, opt.niter + opt.niter_decay + 1
def experiment(cfg, fold, use_pretrained, predict_on_private):
    print(fold)
    set_seed(cfg.seed)
    device = torch.device(cfg.device)

    datasets = {
        'train': AudioDataset(cfg.data_root, fold['train'], 'train'),
        'valid': AudioDataset(cfg.data_root, fold['valid'], 'valid'),
        'public_test': AudioDataset(cfg.data_root, fold['public_test'], 'public_test'),
    }
    dataloaders = {
        'train': DataLoader(datasets['train'], batch_size=cfg.batch_size,
                            shuffle=True, collate_fn=datasets['train'].collate_fn),
        'valid': DataLoader(datasets['valid'], batch_size=cfg.batch_size,
                            shuffle=False, collate_fn=datasets['valid'].collate_fn),
        'public_test': DataLoader(datasets['public_test'], batch_size=cfg.batch_size,
                                  shuffle=False,
                                  collate_fn=datasets['public_test'].collate_fn),
    }
    if predict_on_private:
        datasets['private_test'] = AudioDataset(cfg.data_root, fold['private_test'],
                                                'private_test')
        dataloaders['private_test'] = DataLoader(
            datasets['private_test'], batch_size=cfg.batch_size, shuffle=False,
            collate_fn=datasets['private_test'].collate_fn)

    model_path = f'./predictions/{cfg.init_time}/{cfg.task}_{"_".join([str(i) for i in fold["train"]])}_pretrained_model.pt'
    if use_pretrained:
        print(f'Using pre-trained model from {model_path}')
        best_metric = -1.0
    else:
        best_metric = train(cfg, datasets, dataloaders, device, model_path)

    # make predictions
    test_predictions = {}
    predict(
        cfg, model_path, dataloaders['train'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_train_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
    )
    predict(
        cfg, model_path, dataloaders['valid'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_valid_{"_".join([str(i) for i in fold["valid"]])}_vggish.csv'
    )
    test_predictions['public_test'] = predict(
        cfg, model_path, dataloaders['public_test'], device,
        f'./predictions/{cfg.init_time}/{cfg.task}_public_test_trained_on_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
    )
    if predict_on_private:
        test_predictions['private_test'] = predict(
            cfg, model_path, dataloaders['private_test'], device,
            f'./predictions/{cfg.init_time}/{cfg.task}_private_test_trained_on_{"_".join([str(i) for i in fold["train"]])}_vggish.csv'
        )

    return best_metric, test_predictions
parser.add_argument("--language", type=str, default="english") # mode = "melgan" # load_vqvae = False # load_melgan = False args = parser.parse_args() data_path = os.path.join(args.datadir, args.language) if not os.path.exists(os.path.join(args.ckpt_path, "logs")): os.makedirs(os.path.join(args.ckpt_path, "logs")) if args.mode == "vqvae": print("[VQVAE] Loading training data...") rec_train_dataset = AudioDataset(audio_files=Path(data_path) / "rec_train_files.txt", segment_length=hps.seg_len, sampling_rate=16000, mode='reconst') num_speaker = rec_train_dataset.get_speaker_num() speaker2id = rec_train_dataset.get_speaker2id() #test_set = AudioDataset(audio_files=Path(data_path) / "test_files.txt", segment_length=22050 * 4, sampling_rate=22050, augment=False) train_data_loader = DataLoader( rec_train_dataset, batch_size=hps.batch_size_vqvae, shuffle=True, num_workers=4, pin_memory=True) #hps.batch_size, num_workers=4) #test_data_loader = DataLoader(test_set, batch_size=1) trainer = Trainer( hps=hps, logger_path=os.path.join(args.ckpt_path, "logs"),
def train(out_dir, inp_txt, num_threads, task, batch_size):
    torch.set_num_threads(num_threads)
    print('Number of threads: ', torch.get_num_threads())

    melspec_dir = os.path.normpath(out_dir) + '/melspec'

    print('Create directory to save models...')
    model_dir = os.path.normpath(out_dir) + '/' + f'{task}_model'
    os.makedirs(model_dir, exist_ok=True)

    print('Reading training list file...')
    ref_labels_dict, (train_fnames, val_fnames, train_labels, val_labels) = \
        get_train_val_data(inp_txt)
    with open(model_dir + '/label_ids.pkl', 'wb') as f:
        pickle.dump(ref_labels_dict, f)

    print('Creating PyTorch datasets...')
    train_dataset = AudioDataset(train_fnames, train_labels, melspec_dir)
    val_dataset = AudioDataset(val_fnames, val_labels, melspec_dir, False,
                               train_dataset.mean, train_dataset.std)

    mean, std = train_dataset.mean, train_dataset.std
    with open(model_dir + '/mean_std.pkl', 'wb') as f:
        pickle.dump((mean, std), f)

    # two independently shuffled loaders over the same dataset supply the
    # random sample pairs for mixup
    train_loader_1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    num_classes = CONFIG[task]['num_classes']
    model = MirexModel(num_classes)

    # Define optimizer, scheduler and loss criteria
    optimizer = optim.Adam(model.parameters(), lr=0.001, amsgrad=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5,
                                                     verbose=True)
    criterion = nn.CrossEntropyLoss()

    cuda = False
    device = torch.device('cuda:0' if cuda else 'cpu')
    print('Device: ', device)
    model = model.to(device)

    epochs = 100
    train_loss_hist = []
    valid_loss_hist = []
    lowest_val_loss = np.inf
    epochs_without_new_lowest = 0

    print('Training...')
    for i in range(epochs):
        start_time = time.time()
        this_epoch_train_loss = 0
        for i1, i2 in zip(train_loader_1, train_loader_2):
            # mixup ---------
            x1, y1 = i1
            x2, y2 = i2
            alpha = 1
            mixup_vals = np.random.beta(alpha, alpha, i1[0].shape[0])
            mvals = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1, 1, 1))
            inputs = (mvals * x1) + ((1 - mvals) * x2)

            y1_onehot = torch.nn.functional.one_hot(y1, num_classes).float()
            y2_onehot = torch.nn.functional.one_hot(y2, num_classes).float()
            mvals = torch.Tensor(mixup_vals.reshape(mixup_vals.shape[0], 1))
            labels = (mvals * y1_onehot) + ((1 - mvals) * y2_onehot)
            # mixup ends ----------

            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            with torch.set_grad_enabled(True):
                model = model.train()
                outputs = model(inputs)
                loss = mixup_cross_entropy_loss(outputs, labels)
                loss.backward()
                optimizer.step()
            this_epoch_train_loss += loss.detach().cpu().numpy()

        this_epoch_valid_loss = 0
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            with torch.set_grad_enabled(False):
                model = model.eval()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
            this_epoch_valid_loss += loss.detach().cpu().numpy()

        this_epoch_train_loss /= len(train_loader_1)
        this_epoch_valid_loss /= len(val_loader)
        train_loss_hist.append(this_epoch_train_loss)
        valid_loss_hist.append(this_epoch_valid_loss)

        if this_epoch_valid_loss < lowest_val_loss:
            lowest_val_loss = this_epoch_valid_loss
            torch.save(model.state_dict(), f'{model_dir}/best_model.pth')
            epochs_without_new_lowest = 0
        else:
            epochs_without_new_lowest += 1

        # early stopping after 25 epochs without improvement
        if epochs_without_new_lowest >= 25:
            break

        print(f'Epoch: {i+1}\ttrain_loss: {this_epoch_train_loss}\t'
              f'val_loss: {this_epoch_valid_loss}\ttime: {(time.time()-start_time):.0f}s')
        scheduler.step(this_epoch_valid_loss)

    return model_dir
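The training loop above calls mixup_cross_entropy_loss, which is defined elsewhere in that repo. A common implementation of cross-entropy against soft (mixed) one-hot targets looks like the sketch below; treat it as an assumption about the helper, not the repo's exact code:

import torch
import torch.nn.functional as F

def mixup_cross_entropy_loss(outputs, soft_targets):
    # cross-entropy with soft labels: -sum(p * log q), averaged over the batch;
    # reduces to ordinary cross-entropy when soft_targets is exactly one-hot
    log_probs = F.log_softmax(outputs, dim=1)
    return -(soft_targets * log_probs).sum(dim=1).mean()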
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

preprocess = DataPreprocess(config['DataPreprocess']['sr'],
                            config['DataPreprocess']['max_length'],
                            config['DataPreprocess']['classes_mapping'])
preprocess.get_train_data()
preprocess.get_test_data()

train_dataset = AudioDataset(path_to_sound_files='data/train/audio',
                             path_to_csv='data/train/meta/train.csv')
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = AudioDataset(path_to_sound_files='data/test/audio',
                            path_to_csv='data/test/meta/test.csv')
# keep the test set in file order; shuffling it serves no purpose at evaluation
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Resnet18Multi(config['Model']['num_classes'])
model.to(device)

pytorch_total_params = sum(p.numel() for p in model.parameters())
logging.info('Total num of model parameters: {}'.format(pytorch_total_params))

optimizer = torch.optim.Adam(model.parameters(), lr=config['Model']['learning_rate'])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_dir', type=str, default='logs',
                        help='output log directory')
    parser.add_argument('--feature', type=str, choices=['melgram', 'mfcc'],
                        default='mfcc', help='feature')
    parser.add_argument('--model_type', type=str,
                        choices=['alex1d', 'alex2d', 'lstm', 'resnet'],
                        default='alex2d', help='convolution type')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='training and valid batch size')
    parser.add_argument('--valid_ratio', type=float, default=0.1,
                        help='the ratio of validation data')
    parser.add_argument('--epochs', type=int, default=32,
                        help='number of epochs to train')
    parser.add_argument('--lr', type=float, default=0.001, help='learning rate')
    parser.add_argument('--seed', type=int, default=1234, help='random seed')
    args = parser.parse_args()

    print('log_dir:', args.log_dir)
    print('feature:', args.feature)
    print('model_type:', args.model_type)
    print('batch_size:', args.batch_size)
    print('valid_ratio:', args.valid_ratio)
    print('epochs:', args.epochs)
    print('lr:', args.lr)
    print('seed:', args.seed)

    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    # load the data lists as DataFrames
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/sample_submission.csv')

    # convert the DataFrame labels to indices
    le = LabelEncoder()
    le.fit(np.unique(train_df.label))
    train_df['label_idx'] = le.transform(train_df['label'])
    num_classes = len(le.classes_)

    # load the datasets
    # with test=True, labels are not loaded
    train_dataset = AudioDataset(train_df, './data/audio_train',
                                 feature=args.feature, model_type=args.model_type)
    test_dataset = AudioDataset(test_df, './data/audio_test', test=True,
                                feature=args.feature, model_type=args.model_type)

    # randomly split the training data into train and validation;
    # the seed is fixed so the split can be reproduced later for CV ensembling
    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(args.valid_ratio * num_train)
    np.random.shuffle(indices)
    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size,
                                               sampler=train_sampler,
                                               num_workers=num_workers)
    # the validation data is a subset of train_dataset
    val_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size,
                                             sampler=valid_sampler,
                                             num_workers=num_workers)
    # the test data must stay in DataFrame order, so shuffle=False
    test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size,
                                              shuffle=False)

    # build model
    if args.model_type == 'alex2d':
        model = AlexNet2d(num_classes).to(device)
    elif args.model_type == 'alex1d':
        model = AlexNet1d(num_classes).to(device)
    elif args.model_type == 'lstm':
        model = ConvLSTM(num_classes).to(device)
    elif args.model_type == 'resnet':
        model = ResNet([2, 2, 2, 2]).to(device)
    else:
        print('Invalid model_type: %s' % args.model_type)
        exit(1)
    print(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01,
                         step_size=10, mode="exp_range")

    # keep a history of the learning rate (for visualization)
    lr_list = []

    best_acc = 0.0
    best_model = None
    writer = SummaryWriter(args.log_dir)

    for epoch in range(1, args.epochs + 1):
        loss, acc = train(train_loader, model, criterion, optimizer)
        val_loss, val_acc = valid(val_loader, model, criterion)

        lr_list.append(scheduler.get_lr()[0])
        scheduler.step()

        # logging
        writer.add_scalar('train/loss', loss, epoch)
        writer.add_scalar('train/acc', acc, epoch)
        writer.add_scalar('valid/loss', val_loss, epoch)
        writer.add_scalar('valid/acc', val_acc, epoch)

        print('Epoch [%d/%d] loss: %.5f acc: %.5f val_loss: %.5f val_acc: %.5f'
              % (epoch, args.epochs, loss, acc, val_loss, val_acc))

        if val_acc > best_acc:
            print('val_acc improved from %.5f to %.5f' % (best_acc, val_acc))
            best_acc = val_acc

            # remove the old model file
            if best_model is not None:
                os.remove(best_model)
            best_model = os.path.join(
                args.log_dir,
                'epoch%03d-%.3f-%.3f.pth' % (epoch, val_loss, val_acc))
            torch.save(model.state_dict(), best_model)

    # evaluate the test data with the best model;
    # the raw model outputs are also saved so they can be ensembled later
    print('best_model:', best_model)
    model.load_state_dict(
        torch.load(best_model, map_location=lambda storage, loc: storage))
    predictions = test(test_loader, model)
    np.save(os.path.join(args.log_dir, 'predictions.npy'),
            predictions.cpu().numpy())

    # convert to the labels with the top-3 outputs
    _, indices = predictions.topk(3)  # (N, 3)

    # convert indices to label strings
    predicted_labels = le.classes_[indices]
    predicted_labels = [' '.join(lst) for lst in predicted_labels]

    test_df['label'] = predicted_labels
    test_df.to_csv(os.path.join(args.log_dir, 'submission.csv'), index=False)
vqvae_model = args.vqvae_model
melgan_model = args.melgan_model
save_path = args.save_path
# os.remove(wav_save_path)
if not os.path.exists(save_path):
    os.makedirs(save_path)

# hps.seg_len = 16000 * 10
eval_mode = "both"

if eval_mode in ['vqvae', 'both']:
    encoding_dataset = AudioDataset(audio_files=Path(data_path) / "eval_files.txt",
                                    segment_length=2048 * 126,
                                    sampling_rate=16000,
                                    mode='reconst',
                                    augment=False,
                                    load_speech_id=True)
    # dataset = AudioDataset(audio_files=Path(data_path) / "eval_files.txt",
    #                        segment_length=2048 * 126, sampling_rate=16000,
    #                        mode=data_mode, augment=False, load_speech_id=True)
    num_src_speaker = AudioDataset(audio_files=Path(data_path) / "rec_train_files.txt",
                                   segment_length=2048 * 126,
                                   sampling_rate=16000,
                                   mode='reconst',
                                   augment=False).get_speaker_num()

if eval_mode in ['melgan', 'both']:
    convert_dataset = AudioDataset(audio_files=Path(data_path) / "synthesis.txt",
                                   segment_length=2048 * 126,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('log_dir', type=str, help='input log directory')
    parser.add_argument('model_file', type=str, help='input model file')
    parser.add_argument('--feature', type=str, choices=['melgram', 'mfcc'],
                        default='mfcc', help='feature')
    parser.add_argument('--model_type', type=str,
                        choices=['alex1d', 'alex2d', 'lstm', 'resnet'],
                        default='alex2d', help='convolution type of the model')
    args = parser.parse_args()

    print('log_dir:', args.log_dir)
    print('model_file:', args.model_file)
    print('feature:', args.feature)
    print('model_type:', args.model_type)

    # load dataset
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/sample_submission.csv')

    le = LabelEncoder()
    le.fit(np.unique(train_df.label))
    train_df['label_idx'] = le.transform(train_df['label'])
    num_classes = len(le.classes_)

    test_dataset = AudioDataset(test_df, './data/audio_test', test=True,
                                feature=args.feature, model_type=args.model_type)
    test_loader = torch.utils.data.DataLoader(test_dataset, 128, shuffle=False)

    # load model
    if args.model_type == 'alex2d':
        model = AlexNet2d(num_classes).to(device)
    elif args.model_type == 'alex1d':
        model = AlexNet1d(num_classes).to(device)
    elif args.model_type == 'lstm':
        model = ConvLSTM(num_classes).to(device)
    elif args.model_type == 'resnet':
        model = ResNet([2, 2, 2, 2]).to(device)
    else:
        print('Invalid model_type: %s' % args.model_type)
        exit(1)
    print(model)

    # load the trained model weights
    model.load_state_dict(
        torch.load(args.model_file, map_location=lambda storage, loc: storage))

    # test time augmentation
    tta_predictions = test_time_augmentation(test_loader, model, num_aug=5)
    np.save(os.path.join(args.log_dir, 'tta_predictions.npy'),
            tta_predictions.cpu().numpy())

    # convert to the labels with the top-3 outputs
    _, indices = tta_predictions.topk(3)
    predicted_labels = le.classes_[indices]
    predicted_labels = [' '.join(lst) for lst in predicted_labels]

    test_df['label'] = predicted_labels
    test_df.to_csv(os.path.join(args.log_dir, 'tta_submission.csv'), index=False)
def train(epochs, batchsize, data_path, modeldir, cls_num, duration):
    # Dataset definition
    dataset = AudioDataset(data_path)
    collator = AudioCollator(cls_num)

    # Model & Optimizer definition
    generator = Generator(cls_num=cls_num)
    generator.cuda()
    generator.train()
    gen_opt = torch.optim.Adam(generator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    discriminator = Discriminator(cls_num)
    discriminator.cuda()
    discriminator.train()
    dis_opt = torch.optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))

    # Writer definition
    writer = tbx.SummaryWriter()

    iterations = 0
    for epoch in range(epochs):
        dataloader = DataLoader(dataset,
                                batch_size=batchsize,
                                shuffle=True,
                                collate_fn=collator,
                                drop_last=True)
        dataloader = tqdm(dataloader)

        for i, data in enumerate(dataloader):
            iterations += 1
            x_sp, x_label, y_label = data
            x_to_y = torch.cat([y_label, x_label], dim=1)
            y_to_x = torch.cat([x_label, y_label], dim=1)
            x_to_x = torch.cat([x_label, x_label], dim=1)

            # Discriminator update
            y_fake = generator(x_sp, x_to_y)

            # Adversarial loss
            dis_loss_real, dis_loss_fake = adversarial_loss_dis(
                discriminator, y_fake, x_sp, x_to_y, y_to_x)
            dis_loss = dis_loss_real + dis_loss_fake

            dis_opt.zero_grad()
            dis_loss.backward()
            dis_opt.step()

            write(writer, "dis_loss_real", dis_loss_real, iterations)
            write(writer, "dis_loss_fake", dis_loss_fake, iterations)

            # Generator update
            y_fake = generator(x_sp, x_to_y)
            x_fake = generator(y_fake, y_to_x)
            x_identity = generator(x_sp, x_to_x)

            # Adversarial loss
            gen_loss_fake = adversarial_loss_gen(discriminator, y_fake, x_to_y)

            # Cycle-consistency loss
            cycle_loss = cycle_consistency_loss(x_fake, x_sp)

            # Identity-mapping loss (only for the first `duration` epochs)
            if epoch < duration:
                identity_loss = identity_mapping_loss(x_identity, x_sp)
            else:
                identity_loss = torch.as_tensor(np.array(0))
                # identity_loss = torch.from_numpy(0)

            gen_loss = gen_loss_fake + cycle_loss + identity_loss

            gen_opt.zero_grad()
            gen_loss.backward()
            gen_opt.step()

            write(writer, "gen_loss_fake", gen_loss_fake, iterations)
            write(writer, "cycle_loss", cycle_loss, iterations)
            write(writer, "identity_loss", identity_loss, iterations)

            print(f"iteration: {iterations}")
            print(f"dis loss real: {dis_loss_real} dis loss fake: {dis_loss_fake}")
            print(f"gen loss fake: {gen_loss_fake} cycle loss: {cycle_loss} "
                  f"identity loss: {identity_loss}")

            if i == 0:
                torch.save(generator.state_dict(),
                           f"{modeldir}/generator_{epoch}.model")
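The helper losses called above (adversarial_loss_dis, adversarial_loss_gen, cycle_consistency_loss, identity_mapping_loss) live elsewhere in that repo. A minimal sketch of the two non-adversarial terms, assuming the usual L1 formulations from CycleGAN-style voice conversion; the weights are illustrative assumptions, not the repo's values:

import torch.nn.functional as F

def cycle_consistency_loss(x_fake, x_real, weight=10.0):
    # ||G(G(x)) - x||_1: the round trip through both conversions should
    # reconstruct the input
    return weight * F.l1_loss(x_fake, x_real)

def identity_mapping_loss(x_identity, x_real, weight=5.0):
    # ||G(x, x->x) - x||_1: converting to the same speaker should change
    # nothing; typically enabled only early in training, as above
    return weight * F.l1_loss(x_identity, x_real)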
    model, optimizer, scheduler = loadModel(model, adam, scheduler, fileName,
                                            stage, startEpoch == 1)
    if startEpoch != 1:
        print(f"Successfully loaded model with last completed epoch as {startEpoch - 1}")
    else:
        raise Exception("No such file exists")
else:
    model = Lipreader(stage)
    adam = optim.Adam(model.parameters(), lr=3e-4, weight_decay=0.)
    scheduler = lrScheduler.LambdaLR(adam, lr_lambda=[updateLRFunc])

trainDataset = AudioDataset("train")
trainDataLoader = DataLoader(trainDataset,
                             batch_size=config.data["batchSize"],
                             shuffle=config.data["shuffle"],
                             num_workers=config.data["workers"])

validationDataset = AudioDataset("val")
validationDataLoader = DataLoader(validationDataset,
                                  batch_size=config.data["batchSize"],
                                  shuffle=config.data["shuffle"],
                                  num_workers=config.data["workers"])

trainCriterion = nn.CrossEntropyLoss() if isinstance(model.Backend, TemporalCNN) \
    else NLLSequenceLoss()
validationCriterion = temporalCNNValidator if isinstance(model.Backend, TemporalCNN) \
    else gruValidator
def recognize(args):
    model, LFR_m, LFR_n = Transformer.load_model(args.model_path)
    print(model)
    model.eval()
    model.cuda()
    char_list, sos_id, eos_id = process_dict(args.dict)
    assert model.decoder.sos_id == sos_id and model.decoder.eos_id == eos_id

    tr_dataset = AudioDataset('test', args.batch_size)
    path_list = tr_dataset.path_lst
    label_list = tr_dataset.han_lst
    num_data = tr_dataset.path_count
    ran_num = random.randint(0, num_data - 1)

    num = args.count
    words_num = 0
    word_error_num = 0
    seq_error = 0
    data = ''

    with torch.no_grad():
        for index in range(num):
            try:
                print('\nthe ', index + 1, 'th example.')
                data += 'the ' + str(index + 1) + 'th example.\n'

                index = (ran_num + index) % num_data
                standard_label = label_list[index]
                feature, label = get_fbank_and_hanzi_data(
                    index, args.feature_dim, char_list, path_list, label_list)
                if len(feature) > 1600:
                    continue

                input = build_LFR_features(feature, args.LFR_m, args.LFR_n)
                input = torch.from_numpy(input).float()
                input_length = torch.tensor([input.size(0)], dtype=torch.int)
                input = input.cuda()

                nbest_hyps = model.recognize(input, input_length, char_list, args)
                pred_label = nbest_hyps[0]['yseq'][1:-1]
                pred_res = ''.join([char_list[index] for index in pred_label])

                print("stand:", standard_label)
                print("pred :", pred_res)
                data += "stand:" + str(standard_label) + '\n'
                data += "pred :" + str(pred_res) + '\n'

                words_n = len(label)
                words_num += words_n
                word_distance = GetEditDistance(pred_label, label)
                if word_distance <= words_n:
                    word_error_num += word_distance
                else:
                    word_error_num += words_n
                if pred_label != label:
                    seq_error += 1
            except ValueError:
                continue

    # word error rate and sentence error rate (seq_error counts the
    # sequences containing at least one error)
    wer = word_error_num / words_num * 100
    ser = seq_error / num * 100
    print('WER = ', wer, '%')
    print('SER = ', ser, '%')
    data += 'WER = ' + str(wer) + '%'
    data += 'SER = ' + str(ser) + '%'

    with open('../../model_log/pred/test_' + str(args.count) + '.txt',
              'w', encoding='utf-8') as f:
        f.writelines(data)
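GetEditDistance above comes from that repo's utilities. A standard Levenshtein distance over token sequences, consistent with how it is called on pred_label and label but not necessarily the repo's exact implementation:

def GetEditDistance(pred, target):
    # dynamic-programming Levenshtein distance: dp[i][j] is the minimum
    # number of insertions, deletions, and substitutions needed to turn
    # pred[:i] into target[:j]
    m, n = len(pred), len(target)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pred[i - 1] == target[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return dp[m][n]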