def load_test_train_data(audio_conf, train_manifest, val_manifest, labels, batch_size=20, augment=False, normalize=True, num_workers=4):
    """Build the training and validation loaders plus the training sampler.

    The training set is read from ``train_manifest`` with the requested
    ``augment`` flag; the validation set is read from ``val_manifest`` and is
    never augmented. Only the training loader is driven by a
    ``BucketingSampler``; the validation loader is given ``batch_size``
    directly.

    Returns:
        A ``(train_loader, test_loader, train_sampler)`` tuple.
    """
    # Options that are identical for both datasets.
    shared_options = dict(audio_conf=audio_conf, labels=labels, normalize=normalize)

    training_set = SpectrogramDataset(
        manifest_filepath=train_manifest, augment=augment, **shared_options)
    validation_set = SpectrogramDataset(
        manifest_filepath=val_manifest, augment=False, **shared_options)

    bucket_sampler = BucketingSampler(training_set, batch_size=batch_size)
    training_loader = AudioDataLoader(
        training_set, num_workers=num_workers, batch_sampler=bucket_sampler)
    validation_loader = AudioDataLoader(
        validation_set, batch_size=batch_size, num_workers=num_workers)

    return (training_loader, validation_loader, bucket_sampler)
def get_data_loaders(labels, cfg):
    """Return ``(train_batch_loader, val_batch_loader)`` built from ``cfg``.

    Reads ``train_manifest``, ``val_manifest``, ``audio_conf``, ``mel_spec``
    and ``batch_size`` from ``cfg``. Both loaders are created the same way and
    differ only in the manifest they read.
    """

    def _batch_loader(manifest):
        # One dataset + one batch loader per manifest.
        dataset = SpectrogramDataset(manifest, cfg.audio_conf, labels, mel_spec=cfg.mel_spec)
        return BatchAudioDataLoader(dataset, batch_size=cfg.batch_size)

    training_loader = _batch_loader(cfg.train_manifest)
    validation_loader = _batch_loader(cfg.val_manifest)
    return training_loader, validation_loader
def test(**kwargs):
    """Evaluate a saved Wav2Letter model on a manifest and print mean CER/WER.

    Keys read from ``kwargs``: ``seed``, ``cuda``, ``model_path``,
    ``test_manifest``, ``decoder``, ``lm_path``, ``beam_search_params``,
    ``print_samples`` and ``print_all``.

    Each dataset item is decoded on its own (batch of one). One randomly
    chosen sample is printed when ``print_samples`` is truthy; every sample is
    printed when ``print_all`` is truthy.
    """
    set_random_seeds(kwargs['seed'])
    print('starting as %s' % time.asctime())

    if kwargs['cuda'] and torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'

    model = Wav2Letter.load_model(kwargs['model_path'])
    model.to(device)
    model.eval()

    dataset = SpectrogramDataset(kwargs['test_manifest'], model.audio_conf, model.labels)
    decoder = get_decoder(
        kwargs['decoder'],
        kwargs['lm_path'],
        model.labels,
        get_beam_search_params(kwargs['beam_search_params']),
    )

    with torch.no_grad():
        total = len(dataset)
        sample_to_show = random.randrange(total)
        cer = np.zeros(total)
        wer = np.zeros(total)

        for position, (features, _targets, _file_paths, reference) in enumerate(dataset):
            # Add a leading batch dimension before the forward pass.
            single_batch = torch.FloatTensor(features).unsqueeze(0).to(device)
            out = model(single_batch)
            out_sizes = torch.IntTensor([out.size(1)])
            hypothesis = decoder.decode(probs=out, sizes=out_sizes)[0]

            cer[position] = decoder.cer_ratio(reference, hypothesis)
            wer[position] = decoder.wer_ratio(reference, hypothesis)

            is_chosen_sample = position == sample_to_show and kwargs['print_samples']
            if not (is_chosen_sample or kwargs['print_all']):
                continue

            best_label_ids = torch.argmax(out.squeeze(), 1)
            print(reference)
            print('Decoder result: ' + hypothesis)
            print('Raw acoustic: ' + ''.join(model.labels[label_id] for label_id in best_label_ids))

    print('CER:%f, WER:%f' % (cer.mean(), wer.mean()))
def main(config: DictConfig) -> None:
    """Evaluate a trained model on the dataset described by ``config.eval``.

    Prints the resolved config, seeds torch / CUDA / numpy / random from
    ``config.eval.seed``, builds the test dataset and loader, loads the model
    and hands everything to ``Evaluator``.
    """
    print(OmegaConf.to_yaml(config))

    eval_cfg = config.eval
    audio_cfg = config.audio

    # Seed every random number generator with the same value.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seed_fn(eval_cfg.seed)

    cuda_enabled = eval_cfg.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda_enabled else 'cpu')

    char2id, id2char = load_label(eval_cfg.label_path, eval_cfg.blank_id)
    audio_paths, transcripts, _, _ = load_dataset(eval_cfg.dataset_path, eval_cfg.mode)

    dataset = SpectrogramDataset(
        eval_cfg.audio_path,
        audio_paths,
        transcripts,
        audio_cfg.sampling_rate,
        audio_cfg.n_mel,
        audio_cfg.frame_length,
        audio_cfg.frame_stride,
        audio_cfg.extension,
        config.train.sos_id,
        config.train.eos_id,
    )
    loader = AudioDataLoader(
        dataset,
        batch_size=eval_cfg.batch_size,
        num_workers=eval_cfg.num_workers,
    )

    model = load_test_model(config, device)

    print('Start Test !!!')

    Evaluator(config, device, loader, id2char).evaluate(model)
def main():
    """Train a DeepSpeech model with CTC loss and validate after every epoch.

    All settings come from the module-level ``parser``. For each epoch the
    function runs one pass over the training loader with SGD, then decodes the
    validation set with ``ArgMaxDecoder`` and prints the average WER and CER
    as percentages. When ``args.visdom`` is set the loss/WER/CER curves are
    plotted; when ``args.epoch_save`` is set a checkpoint is written per
    epoch. A final checkpoint is always written to ``args.final_model_path``.

    Raises:
        OSError: if ``args.save_folder`` cannot be created for a reason other
            than already existing.
    """
    args = parser.parse_args()
    save_folder = args.save_folder

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        loss_results, cer_results, wer_results = torch.Tensor(
            args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
        # torch.range is deprecated; arange excludes its end point, hence +1
        # to keep the x axis running from 1 to args.epochs inclusive.
        epochs = torch.arange(1, args.epochs + 1)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)

    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest,
                                       labels=labels, normalize=True)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest,
                                      labels=labels, normalize=True)
    train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers,
                       num_classes=len(labels))
    decoder = ArgMaxDecoder(labels)
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    print(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # NOTE(review): the loop below uses Variable, loss.data[0] and
    # clip_grad_norm as the original did; they look like legacy PyTorch
    # idioms - confirm the targeted PyTorch version before modernising.
    for epoch in range(args.epochs):
        model.train()
        end = time.time()
        avg_loss = 0
        for i, (data) in enumerate(train_loader):
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs)
            target_sizes = Variable(target_sizes)
            targets = Variable(targets)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            # Scale each utterance's length fraction to the output length.
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                # Only the reported value is zeroed; the backward pass and
                # optimizer step below still run on the original loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), len(train_loader),
                          batch_time=batch_time, data_time=data_time, loss=losses))

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        total_cer, total_wer = 0, 0
        # Switch to inference mode for validation; model.train() at the top
        # of the epoch loop switches back before the next training pass.
        model.eval()
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            # NOTE(review): an empty reference string would divide by zero
            # here - confirm the manifests never contain empty transcripts.
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x], target_strings[x]) / float(
                    len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x], target_strings[x]) / float(
                    len(target_strings[x]))
            total_cer += cer
            total_wer += wer

        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.0f}\t'
              'Average CER {cer:.0f}\t'.format(epoch + 1, wer=wer, cer=cer))

        # 1-based number of the epoch that just finished. Computed once so
        # the checkpoint name no longer depends on whether visdom is enabled
        # (previously `epoch` was only incremented inside the visdom branch).
        finished_epoch = epoch + 1

        if args.visdom:
            loss_results[epoch] = avg_loss
            wer_results[epoch] = wer
            cer_results[epoch] = cer
            x_axis = epochs[0:finished_epoch]
            y_axis = [
                loss_results[0:finished_epoch],
                wer_results[0:finished_epoch],
                cer_results[0:finished_epoch]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    # First epoch: create the plot window.
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    # Later epochs: redraw the existing window.
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.epoch_save:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, finished_epoch)
            torch.save(checkpoint(model, args, len(labels), finished_epoch), file_path)

    torch.save(checkpoint(model, args, len(labels)), args.final_model_path)
def test_dataloader():
    """Load a checkpoint and set up a noise model from the first training batch.

    Reads all settings from the module-level ``parser``. A checkpoint
    (``args.continue_from``) is mandatory: without it the function prints a
    message and exits. After building the training loader, the first batch is
    used to construct ``M_Noise_Deepspeech`` and to compute a reference output
    of its ``deepspeech_net`` sub-module, whose parameters are frozen. An SGD
    optimizer is then created over the parameters that still require
    gradients, and the model is printed.

    Raises:
        ValueError: if mixed precision is requested without CUDA.
    """
    args = parser.parse_args()

    # Set seeds for determinism
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    device = torch.device("cuda" if args.cuda else "cpu")
    if args.mixed_precision and not args.cuda:
        raise ValueError(
            'If using mixed precision training, CUDA must be enabled!')
    args.distributed = args.world_size > 1
    main_proc = True
    # NOTE(review): same assignment as a few lines above; redundant.
    device = torch.device("cuda" if args.cuda else "cpu")

    save_folder = args.save_folder
    os.makedirs(save_folder, exist_ok=True)  # Ensure save folder exists

    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    best_wer = None
    if main_proc and args.visdom:
        visdom_logger = VisdomLogger(args.id, args.epochs)
    if main_proc and args.tensorboard:
        tensorboard_logger = TensorBoardLogger(args.id, args.log_dir, args.log_params)

    avg_loss, start_epoch, start_iter, optim_state = 0, 0, 0, None
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        # The identity map_location keeps loaded tensors on the CPU.
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        if not args.finetune:  # Don't want to restart training
            optim_state = package['optim_dict']
            start_epoch = int(package.get(
                'epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
            else:
                start_iter += 1
            # int() drops any fractional part of the stored average loss.
            avg_loss = int(package.get('avg_loss', 0))
            loss_results, cer_results, wer_results = package['loss_results'], package['cer_results'], \
                package['wer_results']
            if main_proc and args.visdom:  # Add previous scores to visdom graph
                visdom_logger.load_previous_values(start_epoch, package)
            if main_proc and args.tensorboard:  # Previous scores to tensorboard logs
                tensorboard_logger.load_previous_values(start_epoch, package)
        print("Loading label from %s" % args.labels_path)
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))
        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))
    else:
        # `package` is required further down, so a checkpoint is mandatory.
        print("must load model!!!")
        exit()

    # decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)

    # Only the first batch is consumed: the loop ends with `break`.
    for i, (data) in enumerate(train_loader, start=start_iter):
        # Fetch the initial input
        inputs, targets, input_percentages, target_sizes = data
        input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        inputs = inputs.to(device)
        size = inputs.size()
        print(size)

        # Initialise the model
        model = M_Noise_Deepspeech(package, size)
        # Freeze the wrapped DeepSpeech network.
        for para in model.deepspeech_net.parameters():
            para.requires_grad = False
        model = model.to(device)

        # Compute the initial (reference) output
        out_star = model.deepspeech_net(inputs, input_sizes)[0]
        out_star = out_star.transpose(0, 1)  # TxNxH
        float_out_star = out_star.float()
        break

    # NOTE(review): `model` is unbound here if the loader yields no batch.
    # Optimise only the parameters that were not frozen above.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True, weight_decay=1e-5)
    print(model)
def main(config: DictConfig) -> None:
    """Train a model as described by ``config``, saving it on even epochs.

    Silences warnings, prints the resolved config, seeds torch / CUDA / numpy
    / random from ``config.train.seed``, builds bucketed training and
    validation loaders, then calls ``train`` once per epoch. After every
    even-numbered epoch (0, 2, 4, ...) the whole model object is saved under
    the current working directory.
    """
    warnings.filterwarnings('ignore')
    print(OmegaConf.to_yaml(config))

    train_cfg = config.train
    audio_cfg = config.audio

    # Seed every random number generator with the same value.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seed_fn(train_cfg.seed)

    cuda_enabled = train_cfg.cuda and torch.cuda.is_available()
    device = torch.device('cuda' if cuda_enabled else 'cpu')

    char2id, id2char = load_label(train_cfg.label_path, train_cfg.blank_id)
    train_paths, train_texts, valid_paths, valid_texts = load_dataset(
        train_cfg.dataset_path, train_cfg.mode)

    def _bucketed_loader(audio_paths, transcripts):
        # Build dataset -> sampler -> loader for one split.
        if audio_cfg.feature_extraction == 'mfcc':
            feature_size = audio_cfg.n_mfcc
        else:
            feature_size = audio_cfg.n_mel
        dataset = SpectrogramDataset(
            train_cfg.audio_path,
            audio_paths,
            transcripts,
            audio_cfg.sampling_rate,
            feature_size,
            audio_cfg.frame_length,
            audio_cfg.frame_stride,
            audio_cfg.extension,
            audio_cfg.feature_extraction,
            audio_cfg.normalize,
            audio_cfg.spec_augment,
            audio_cfg.freq_mask_parameter,
            audio_cfg.num_time_mask,
            audio_cfg.num_freq_mask,
            train_cfg.sos_id,
            train_cfg.eos_id,
        )
        sampler = BucketingSampler(dataset, batch_size=train_cfg.batch_size)
        loader = AudioDataLoader(
            dataset,
            batch_sampler=sampler,
            num_workers=train_cfg.num_workers,
        )
        return sampler, loader

    train_sampler, train_loader = _bucketed_loader(train_paths, train_texts)
    _valid_sampler, valid_loader = _bucketed_loader(valid_paths, valid_texts)

    model = build_model(config, device)
    optimizer = optim.Adam(model.parameters(), lr=train_cfg.lr)

    print('Start Train !!!')
    for epoch in range(train_cfg.epochs):
        train(config, model, device, train_loader, valid_loader, train_sampler,
              optimizer, epoch, id2char, epoch)

        if epoch % 2:
            continue
        checkpoint_name = train_cfg.model_save_path + str(epoch) + '.pt'
        torch.save(model, os.path.join(os.getcwd(), checkpoint_name))
# from decoder import BeamCTCDecoder # decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, # cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, # beam_width=args.beam_width, num_processes=args.lm_workers) # elif args.decoder == "greedy": # decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) # else: # decoder = None # target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) decoder = MyDecoder(model.labels) target_decoder = MyDecoder(model.labels) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest, metadata_file_path=metadata_path, labels=model.labels, normalize=True) test_loader = AudioDataLoader( test_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True ) # in train, the manifest will be already stratified with only train data so it's ok. accuracy_mean, acccuracy_std, output_data = evaluate( test_loader=test_loader, device=device, model=model, decoder=decoder, target_decoder=target_decoder, save_output=args.save_output,
alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest, is_pinyin=args.pinyin, labels=model.labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) wer, cer, output_data, tmp = evaluate(test_loader=test_loader, device=device, model=model, decoder=decoder, target_decoder=target_decoder, save_output=args.save_output, verbose=args.verbose, half=args.half) pd.DataFrame(tmp).to_csv(os.path.join(os.path.dirname(args.test_manifest), 'test_result.csv'),
def main():
    """Train a DeepSpeech model with CTC loss, validating after every epoch.

    All settings come from the module-level ``parser``. The function can
    resume from a checkpoint (``args.continue_from``), optionally plots to
    visdom and/or logs to TensorBoard, optionally saves per-batch and
    per-epoch checkpoints, divides the learning rate by
    ``args.learning_anneal`` after every epoch, and finally writes the model
    to ``args.final_model_path``.

    Raises:
        OSError: if the log or save directory cannot be created for a reason
            other than already existing.
    """
    args = parser.parse_args()
    save_folder = args.save_folder

    # Per-epoch result buffers; filled in at the end of each epoch.
    loss_results, cer_results, wer_results = torch.Tensor(
        args.epochs), torch.Tensor(args.epochs), torch.Tensor(args.epochs)
    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                # Remove the regular files left in an existing log directory.
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    try:
                        if os.path.isfile(file_path):
                            os.unlink(file_path)
                    except Exception as e:
                        # NOTE(review): catch-and-reraise is a no-op wrapper.
                        raise
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    # Augmentation is applied to the training set only.
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
                                       normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
                                      normalize=True, augment=False)
    train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                momentum=args.momentum, nesterov=True)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        # int() drops any fractional part of the stored average loss.
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package[
            'loss_results'], package['cer_results'], package['wer_results']
        if args.visdom and \
                package['loss_results'] is not None and start_epoch > 0:  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch],
                wer_results[0:start_epoch],
                cer_results[0:start_epoch]
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and \
                package['loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf,
                                                         manifest_filepath=args.train_manifest,
                                                         labels=labels,
                                                         normalize=True, augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            # NOTE(review): swaps the sampler on an already-built loader;
            # confirm AudioDataLoader picks this up for later epochs.
            train_loader.sampler = sampler
    else:
        # Fresh run: start from the first epoch and first batch.
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        # When resuming, batch numbering starts at start_iter.
        for i, (data) in enumerate(train_loader, start=start_iter):
            # Stop once the (offset) batch counter reaches the loader length.
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            # Scale each utterance's length fraction to the output length.
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                # Only the reported value is zeroed; the backward pass and
                # optimizer step below still run on the original loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time,
                          data_time=data_time, loss=losses))
            # Mid-epoch checkpoint every `checkpoint_per_batch` batches.
            if args.checkpoint_per_batch > 0 and i > 0 and (
                    i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(
                    DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i,
                                         loss_results=loss_results,
                                         wer_results=wer_results, cer_results=cer_results,
                                         avg_loss=avg_loss),
                    file_path)
            # Drop references to this batch's tensors before the next one.
            del loss
            del out
        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for i, (data) in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            # Per-utterance error counts, normalised by the reference's word
            # and character counts respectively.
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x], target_strings[x]) / float(
                    len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x], target_strings[x]) / float(
                    len(target_strings[x]))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out
        # Average over the validation set and express as percentages.
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            # epoch += 1
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1],
                wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    # No window yet: create the plot.
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    # Window exists: redraw it with the extended series.
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    # NOTE(review): value.grad may be None for parameters that
                    # received no gradient - confirm to_np handles that.
                    logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results, cer_results=cer_results),
                file_path)
        # anneal lr
        # Only the first parameter group's learning rate is rescaled.
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0][
            'lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        avg_loss = 0
        # After the first epoch of the run, switch to the bucketing sampler.
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf,
                                                         manifest_filepath=args.train_manifest,
                                                         labels=labels,
                                                         normalize=True, augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer), args.final_model_path)
noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler( train_dataset, batch_size=args.batch_size, num_replicas=args.world_size,
help='path to validation manifest csv', default='data/test_manifest.csv') parser.add_argument('--batch-size', default=20, type=int, help='Batch size for training') parser.add_argument('--num-workers', default=4, type=int, help='Number of workers used in dataloading') parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample") parser.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output") args = parser.parse_args() if __name__ == '__main__': torch.set_grad_enabled(False) model, _ = load_model(args.model_path) device = torch.device("cuda" if args.cuda else "cpu") label_decoder = LabelDecoder(model.labels) model.eval() model = model.to(device) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.test_manifest, labels=model.labels) test_sampler = BucketingSampler(test_dataset, batch_size=args.batch_size) test_loader = AudioDataLoader(test_dataset, batch_sampler=test_sampler, num_workers=args.num_workers) test_sampler.shuffle(1) total_wer, total_cer, total_ler, num_words, num_chars, num_labels = 0, 0, 0, 0, 0, 0 output_data = [] for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader), ascii=True): inputs, targets, input_sizes, target_sizes, filenames = data inputs = inputs.to(device) input_sizes = input_sizes.to(device) outputs = model.transcribe(inputs, input_sizes) for i, target in enumerate(targets):
assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) state = TrainingState(model=model) state.init_results_tracking(epochs=args.epochs) # Data setup evaluation_decoder = GreedyDecoder(model.labels) # Decoder used for validation train_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.train_manifest, labels=model.labels, normalize=True, speed_volume_perturb=args.speed_volume_perturb, spec_augment=args.spec_augment) test_dataset = SpectrogramDataset(audio_conf=model.audio_conf, manifest_filepath=args.val_manifest, labels=model.labels, normalize=True, speed_volume_perturb=False, spec_augment=False) if not args.distributed: train_sampler = DSRandomSampler(dataset=train_dataset, batch_size=args.batch_size, start_index=state.training_step) else: train_sampler = DSElasticDistributedSampler(dataset=train_dataset, batch_size=args.batch_size,
train_loss = int(package.get('avg_loss', 0)) for i in range(start_epoch): train_results[i] = package['train_results'][i] val_results[i] = package['val_results'][i] best_wer = float(val_results[:start_epoch].min()) else: with open(args.labels_path) as label_file: labels = json.load(label_file) model = Model(model_conf, audio_conf, labels) model = model.to(device) # Data inputs configuration train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels) val_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels) label_decoder = LabelDecoder(labels) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=batch_size) val_sampler = BucketingSampler(val_dataset, batch_size=batch_size) else: train_sampler = DistributedBucketingSampler( train_dataset, batch_size=batch_size, num_replicas=args.world_size, rank=args.rank)
def init_datasets(audio_conf,labels, kwargs):
    """Create the training dataset, its batch loader and the eval dataset.

    Reads ``train_manifest``, ``val_manifest`` and ``batch_size`` from
    ``kwargs``. Training batches are drawn in sequential order and the last,
    possibly smaller, batch is kept.

    Returns:
        A ``(train_dataset, train_batch_loader, eval_dataset)`` tuple.
    """
    training_set = SpectrogramDataset(kwargs['train_manifest'], audio_conf, labels)

    in_order = SequentialSampler(training_set)
    batches = BatchSampler(in_order, batch_size=kwargs['batch_size'], drop_last=False)
    training_loader = BatchAudioDataLoader(training_set, batch_sampler=batches)

    evaluation_set = SpectrogramDataset(kwargs['val_manifest'], audio_conf, labels)

    return training_set, training_loader, evaluation_set
# Evaluate a saved DeepSpeech model on a fixed validation manifest.
args = parser.parse_args()
# The manifest path and device are hard-coded rather than read from `args`.
val_manifest = "data/dev-clean_manifest.csv"
device = "cuda"
model_path = args.model_path

# The identity map_location keeps loaded tensors on the CPU; the model is
# moved to `device` explicitly below.
package = torch.load(model_path, map_location=lambda storage, loc: storage)
model = DeepSpeech.load_model_package(package)
# Labels and audio settings come from the checkpoint itself.
labels = model.labels
audio_conf = model.audio_conf
model.to(device)

# Both augmentation flags are switched off for evaluation.
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=val_manifest, labels=labels,
                                  normalize=True, speed_volume_perturb=False, spec_augment=False)
test_loader = AudioDataLoader(test_dataset, batch_size=20, num_workers=4)

# Two greedy decoders with '_' as the blank label: one is passed as `decoder`
# and one as `target_decoder`.
decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))
target_decoder = GreedyDecoder(model.labels, blank_index=model.labels.index('_'))

# Gradient tracking is disabled for the evaluation call.
with torch.no_grad():
    evaluate(test_loader, device, model, decoder, target_decoder, verbose=True)
print("made it this far")
def main():
    """Run one timed inference pass of DeepSpeech and export the statistics.

    Reads command-line options from the module-level ``parser`` and model /
    data settings from the module-level ``params``. The model is built,
    optionally restored from ``args.continue_from``, evaluated once with
    ``eval_model_verbose`` and the resulting WER, CER and per-trial timings
    are written to ``inference_bs{bs}_i{hold_idx}_gpu{cuda}.csv`` in the
    current working directory.

    Exits with status 1 when the RNN type / activation combination in
    ``params`` is not supported.

    NOTE(review): the source this was reconstructed from had lost its
    indentation. The resume branch is read here as "load the weights, then
    always restart from epoch 0" (the reset of ``start_epoch`` / ``start_iter``
    / ``avg_loss`` is taken to be unconditional), which makes the checkpoint's
    epoch/iteration bookkeeping and ``args.start_epoch`` dead; confirm against
    the original file.
    """
    banner = "======================================================="

    args = parser.parse_args()
    params.cuda = not bool(args.cpu)
    print("Use cuda: {}".format(params.cuda))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors must be visible to the calling shell, so exit
    # with a non-zero status rather than the default 0.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit(1)
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print(banner)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print(banner)

    save_folder = args.save_folder
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    if args.use_set == 'libri':
        testing_manifest = params.val_manifest + ("_held" if args.hold_idx >= 0 else "")
    else:
        testing_manifest = params.test_manifest

    if args.batch_size_val > 0:
        params.batch_size_val = args.batch_size_val

    print("Testing on: {}".format(testing_manifest))
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=testing_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size_val,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        # Deserialize onto the CPU so GPU-saved checkpoints also load when
        # running with --cpu; the model is moved to the GPU below if needed.
        package = torch.load(args.continue_from,
                             map_location=lambda storage, loc: storage)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
    # Only the weights are taken from the checkpoint: the pass below always
    # starts from epoch 0 (see the NOTE(review) in the docstring).
    start_epoch = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    for epoch in range(start_epoch, params.epochs):
        # The test script only really cares about this section.
        model.eval()
        wer, cer, trials = eval_model_verbose(model, test_loader, decoder,
                                              params.cuda, args.n_trials)
        # Previously never assigned, so the final summary always printed None.
        if best_wer is None or wer < best_wer:
            best_wer = wer

        outfile = osp.join(
            os.getcwd(),
            "inference_bs{}_i{}_gpu{}.csv".format(params.batch_size_val,
                                                  args.hold_idx, params.cuda))
        print("Exporting inference to: {}".format(outfile))
        make_file(outfile)
        write_line(outfile,
                   "batch times pre normalized by hold_sec =,{}\n".format(args.hold_sec))
        write_line(outfile, "wer, {}\n".format(wer))
        write_line(outfile, "cer, {}\n".format(cer))
        write_line(outfile, "bs, {}\n".format(params.batch_size_val))
        write_line(outfile, "hold_idx, {}\n".format(args.hold_idx))
        write_line(outfile, "cuda, {}\n".format(params.cuda))
        write_line(outfile,
                   "avg batch time, {}\n".format(trials.avg / args.hold_sec))
        # Per-sample latency: batch time divided by batch size and hold_sec.
        percentile_50 = np.percentile(
            trials.array, 50) / params.batch_size_val / args.hold_sec
        write_line(outfile, "50%-tile latency, {}\n".format(percentile_50))
        percentile_99 = np.percentile(
            trials.array, 99) / params.batch_size_val / args.hold_sec
        write_line(outfile, "99%-tile latency, {}\n".format(percentile_99))
        write_line(outfile, "through put, {}\n".format(1 / percentile_50))
        write_line(outfile, "data\n")
        for trial in trials.array:
            write_line(outfile, "{}\n".format(trial / args.hold_sec))

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / params.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))
        # A single evaluation pass is all this script needs.
        break

    print(banner)
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print(banner)
def _prepare_batch(batch, args):
    """Unpack one loader batch and return ``(inputs, speaker_labels)``.

    ``batch`` is the 4-tuple ``(inputs, input_percentages, speaker_labels,
    mfccs)`` produced by the loader; ``input_percentages`` is not used. When
    ``args.mfcc == "true"`` the MFCC tensor is used as the model input. Both
    tensors are moved to the GPU only when ``args.cuda`` is set.
    """
    inputs, _input_percentages, speaker_labels, mfccs = batch
    if args.mfcc == "true":
        inputs = mfccs
    speaker_labels = speaker_labels.long()
    if args.cuda:
        inputs = inputs.cuda()
        # `async` is a reserved word since Python 3.7; `non_blocking` is the
        # supported name of the same option.
        speaker_labels = speaker_labels.cuda(non_blocking=True)
    # Collapse feature dimension: merge dims 1 and 2 of the 4-D input.
    sizes = inputs.size()
    inputs = inputs.view(sizes[0], sizes[1] * sizes[2], sizes[3])
    return inputs, speaker_labels


def _select_loss_inputs(out, speaker_labels, loss_type):
    """Choose which time steps of ``out`` (T x N x H) feed the loss.

    Returns ``(logits, targets)`` for ``nn.CrossEntropyLoss``:

    * ``"reg"``  - the middle time step only.
    * ``"mult"`` - the steps at 1/4, 1/2 and 3/4 of the sequence, stacked.
    * ``"sum"``  - the sum over the central half of the sequence.
    * ``"full"`` - every step of the central half, flattened.

    Raises:
        ValueError: if ``loss_type`` is not one of the values above.
    """
    steps = out.size(0)
    # Class count is taken from the model output instead of a literal 48.
    num_classes = out.size(-1)
    middle = round(steps / 2)

    if loss_type == "reg":
        return out[middle], speaker_labels
    if loss_type == "mult":
        picks = [round(steps / 4), middle, round(3 * steps / 4)]
        logits = out.contiguous()[picks].view(-1, num_classes)
        targets = speaker_labels.repeat(steps, 1)[picks].view(-1)
        return logits, targets

    begin = middle - round(steps / 4)
    end = middle + round(steps / 4)
    if loss_type == "sum":
        return torch.sum(out[begin:end], 0), speaker_labels
    if loss_type == "full":
        logits = out.contiguous()[begin:end].view(-1, num_classes)
        targets = speaker_labels.repeat(steps, 1)[begin:end].view(-1)
        return logits, targets
    raise ValueError("unknown loss_type: %r" % (loss_type,))


def main():
    """Train and validate a DeepSpeech-based speaker classifier.

    Options come from the module-level ``parser``. Each epoch trains on a
    random crop (``args.sample_proportion`` of the last input dimension) of
    every batch, validates on the uncropped inputs, prints running and best
    accuracies, and saves the model to ``args.model_path``. After the first
    epoch the train loader is switched to a bucketing sampler unless
    ``args.no_bucketing`` is set.

    Two accuracies are tracked: "reg" (middle output step) and "sum" (outputs
    summed over ``[loss_begin:loss_end]``, derived from ``args.crop_begin`` /
    ``args.crop_end`` and ``args.stride``).

    Raises:
        ValueError: if ``args.loss_type`` is not "reg", "mult", "sum" or "full".
    """
    args = parser.parse_args()
    # Fail before any data is loaded rather than with a NameError mid-epoch.
    if args.loss_type not in ("reg", "mult", "sum", "full"):
        raise ValueError("unknown loss_type: %r" % (args.loss_type,))

    torch.set_printoptions(profile="full")
    criterion = nn.CrossEntropyLoss()
    class_accu_reg = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)
    class_accu_sum = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True)

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       cnn_features=args.cnn_features,
                       kernel=args.kernel,
                       first_layer_type=args.first_layer_type,
                       stride=args.stride,
                       mfcc=args.mfcc)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

    avg_loss = 0
    best_train_accu_reg = 0
    best_train_accu_sum = 0
    best_test_accu_reg = 0
    best_test_accu_sum = 0
    best_avg_loss = float("inf")
    # First epoch (1-based) at which training "reg" accuracy exceeded each
    # percentage threshold; None until reached.
    milestone_epochs = {70: None, 90: None, 95: None, 99: None}

    loss_begin = round(args.crop_begin / (10 * args.stride))
    # `or None` turns a zero crop into an open-ended slice.
    loss_end = -round(args.crop_end / (10 * args.stride)) or None
    print("LOSS BEGIN:", loss_begin)
    print("LOSS END:", loss_end)

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: ", DeepSpeech.get_param_size(model))
    print(args, "\n")

    for epoch in range(args.epochs):
        losses = AverageMeter()
        # NOTE(review): the scheduler is stepped at the start of the epoch,
        # as in the original; whether that matches the intended schedule
        # depends on the installed PyTorch version.
        scheduler.step()
        optim_state_now = optimizer.state_dict()
        print('\nLEARNING RATE: {lr:.6f}'.format(
            lr=optim_state_now['param_groups'][0]['lr']))

        class_accu_reg.reset()
        class_accu_sum.reset()

        model.train()
        for i, batch in enumerate(train_loader):
            inputs, speaker_labels = _prepare_batch(batch, args)

            # Train on a random window covering `sample_proportion` of the
            # last input dimension.
            start = random.randint(
                0, int((inputs.size(2) - 1) * (1 - args.sample_proportion)))
            duration = int((inputs.size(2)) * (args.sample_proportion))
            utterances = inputs[..., start:start + duration]

            out = model(utterances)
            out = out.transpose(0, 1)  # TxNxH

            class_accu_reg.add(out[round(out.size(0) / 2)].data,
                               speaker_labels.data)
            class_accu_sum.add(
                torch.sum(out[loss_begin:loss_end], 0).data,
                speaker_labels.data)

            processed_out, processed_speaker_labels = _select_loss_inputs(
                out, speaker_labels, args.loss_type)

            loss = criterion(processed_out, processed_speaker_labels)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            # .item() replaces loss.data[0], which fails on 0-dim tensors.
            loss_value = loss.item()
            if abs(loss_value) == float("inf"):
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()
            # SGD step
            optimizer.step()

            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss {loss.val:.8f} ({loss.avg:.8f})\t'
                      'CARR {carr:.2f}\t'
                      'CARS {cars:.2f}\t'.format(
                          (epoch + 1), (i + 1), len(train_loader),
                          loss=losses,
                          carr=class_accu_reg.value()[0],
                          cars=class_accu_sum.value()[0]))

            if args.cuda:
                torch.cuda.synchronize()

            # Drop references so the tensors can be freed before the next batch.
            del loss, out, processed_out, speaker_labels, processed_speaker_labels

        avg_loss /= len(train_loader)
        best_avg_loss = min(best_avg_loss, avg_loss)

        train_accu_reg = class_accu_reg.value()[0]
        train_accu_sum = class_accu_sum.value()[0]
        print("\nCURRENT EPOCH AVERAGE LOSS:\t", avg_loss)
        print("\nCURRENT EPOCH TRAINING RESULTS:\t", train_accu_reg, "\t",
              train_accu_sum, "\n")

        best_train_accu_reg = max(best_train_accu_reg, train_accu_reg)
        best_train_accu_sum = max(best_train_accu_sum, train_accu_sum)

        for threshold in milestone_epochs:
            if milestone_epochs[threshold] is None and train_accu_reg > threshold:
                milestone_epochs[threshold] = epoch + 1

        model.eval()
        class_accu_reg.reset()
        class_accu_sum.reset()

        # no_grad replaces the removed Variable(volatile=True) mechanism.
        with torch.no_grad():
            for batch in test_loader:
                inputs, speaker_labels = _prepare_batch(batch, args)

                # Validation uses the whole input, not a random crop.
                out = model(inputs)
                out = out.transpose(0, 1)  # TxNxH

                class_accu_reg.add(out[round(out.size(0) / 2)].data,
                                   speaker_labels.data)
                class_accu_sum.add(
                    torch.sum(out[loss_begin:loss_end], 0).data,
                    speaker_labels.data)

                # NOTE(review): printed per batch (running accuracy); the
                # original nesting of this print was not recoverable - confirm.
                print('Validation Summary Epoch: [{0}]\t'
                      'CARR {carr:.2f}\t'
                      'CARS {cars:.2f}\t'.format(
                          epoch + 1,
                          carr=class_accu_reg.value()[0],
                          cars=class_accu_sum.value()[0]))

                if args.cuda:
                    torch.cuda.synchronize()

                del out

        test_accu_reg = class_accu_reg.value()[0]
        test_accu_sum = class_accu_sum.value()[0]
        print("\nCURRENT EPOCH TEST RESULTS:\t", test_accu_reg, "\t",
              test_accu_sum, "\n")

        best_test_accu_reg = max(best_test_accu_reg, test_accu_reg)
        best_test_accu_sum = max(best_test_accu_sum, test_accu_sum)

        print("\nBEST AVERAGE LOSS:\t\t", best_avg_loss)
        print("\nBEST EPOCH TRAINING RESULTS:\t", best_train_accu_reg, "\t",
              best_train_accu_sum)
        print("\nBEST EPOCH TEST RESULTS:\t", best_test_accu_reg, "\t",
              best_test_accu_sum)
        print("\nEPOCHS 70%, 90%, 95%, 99%:\t", milestone_epochs[70], "\t",
              milestone_epochs[90], "\t", milestone_epochs[95], "\t",
              milestone_epochs[99], "\n")

        torch.save(
            DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch),
            args.model_path)

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(labels, blank_index=labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(labels, blank_index=labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, cache_path=args.cache_dir, labels=labels, normalize=args.norm) # import random;random.shuffle(test_dataset.ids) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 processed_files = [] for i, data in tqdm(enumerate(test_loader), total=len(test_loader)): inputs, targets, filenames, input_percentages, target_sizes = data input_sizes = input_percentages.mul_(int(inputs.size(3))).int() # unflatten targets split_targets = []
def main(): args = parser.parse_args() save_folder = args.save_folder ######## """ loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor( args.epochs) best_wer = None if args.visdom: from visdom import Visdom viz = Visdom() opts = [dict(title=args.visdom_id + ' Loss', ylabel='Loss', xlabel='Epoch'), dict(title=args.visdom_id + ' WER', ylabel='WER', xlabel='Epoch'), dict(title=args.visdom_id + ' CER', ylabel='CER', xlabel='Epoch')] viz_windows = [None, None, None] epochs = torch.arange(1, args.epochs + 1) if args.tensorboard: from logger import TensorBoardLogger try: os.makedirs(args.log_dir) except OSError as e: if e.errno == errno.EEXIST: print('Directory already exists.') for file in os.listdir(args.log_dir): file_path = os.path.join(args.log_dir, file) try: if os.path.isfile(file_path): os.unlink(file_path) except Exception as e: raise else: raise logger = TensorBoardLogger(args.log_dir) try: os.makedirs(save_folder) except OSError as e: if e.errno == errno.EEXIST: print('Directory already exists.') else: raise """ ######## ######## """ criterion = CTCLoss() """ criterion = nn.CrossEntropyLoss() class_accu = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) class_accu_sum = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) class_accu_sum_120 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) class_accu_sum_240 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) class_accu_sum_360 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) class_accu_sum_480 = tnt.meter.ClassErrorMeter(topk=[1], accuracy=True) ######## with open(args.labels_path) as label_file: labels = str(''.join(json.load(label_file))) audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size, window_stride=args.window_stride, window=args.window, noise_dir=args.noise_dir, noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) train_dataset = SpectrogramDataset(audio_conf=audio_conf, 
manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False) train_loader = AudioDataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" ######## """ model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=True, cnn_features=args.cnn_features) """ model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=True, cnn_features=args.cnn_features, kernel=args.kernel, stride=args.stride) ######## ######## #print(list(model.rnns.modules())) #for rnn in model.rnns.modules(): # print(rnn)#.flatten_parameters() #def flat_model(model): # for m in model.modules(): # if isinstance(m, nn.LSTM): # m.flatten_parameters() ######## parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True) ######## #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.learning_rate_decay_epochs, gamma=args.learning_rate_decay_rate) #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) ######## ######## """ decoder = GreedyDecoder(labels) """ ######## ######## """ if args.continue_from: print("Loading checkpoint model %s" % args.continue_from) package = torch.load(args.continue_from) model.load_state_dict(package['state_dict']) optimizer.load_state_dict(package['optim_dict']) start_epoch = int(package.get('epoch', 1)) - 1 # Python index start at 0 for training 
start_iter = package.get('iteration', None) if start_iter is None: start_epoch += 1 # Assume that we saved a model after an epoch finished, so start at the next epoch. start_iter = 0 else: start_iter += 1 avg_loss = int(package.get('avg_loss', 0)) loss_results, cer_results, wer_results = package['loss_results'], package[ 'cer_results'], package['wer_results'] if args.visdom and \ package['loss_results'] is not None and start_epoch > 0: # Add previous scores to visdom graph x_axis = epochs[0:start_epoch] y_axis = [loss_results[0:start_epoch], wer_results[0:start_epoch], cer_results[0:start_epoch]] for x in range(len(viz_windows)): viz_windows[x] = viz.line( X=x_axis, Y=y_axis[x], opts=opts[x], ) if args.tensorboard and \ package['loss_results'] is not None and start_epoch > 0: # Previous scores to tensorboard logs for i in range(start_epoch): info = { 'Avg Train Loss': loss_results[i], 'Avg WER': wer_results[i], 'Avg CER': cer_results[i] } for tag, val in info.items(): logger.scalar_summary(tag, val, i + 1) if not args.no_bucketing and epoch != 0: print("Using bucketing sampler for the following epochs") train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) sampler = BucketingSampler(train_dataset) train_loader.sampler = sampler else: avg_loss = 0 start_epoch = 0 start_iter = 0 """ avg_loss = 0 start_epoch = 0 start_iter = 0 best_train_accu = 0 best_train_accu_sum = 0 best_train_accu_sum_120 = 0 best_train_accu_sum_240 = 0 best_train_accu_sum_360 = 0 best_train_accu_sum_480 = 0 best_test_accu = 0 best_test_accu_sum = 0 best_test_accu_sum_120 = 0 best_test_accu_sum_240 = 0 best_test_accu_sum_360 = 0 best_test_accu_sum_480 = 0 best_avg_loss = float("inf") # sys.float_info.max # 1000000 epoch_70 = None epoch_90 = None epoch_95 = None epoch_99 = None if args.stride == 1: multiplier = 6 if args.stride == 2: multiplier = 3 if args.stride == 3: multiplier = 2 if 
args.stride == 4: multiplier = 1 # (Should be 1.5...) #sample_time_steps = int(args.sample_miliseconds / 10) loss_begin = round(args.crop_begin / (10 * args.stride)) loss_end = -round(args.crop_end / (10 * args.stride)) or None print("LOSS BEGIN:", loss_begin) print("LOSS END:", loss_end) ######## if args.cuda: model = torch.nn.DataParallel(model).cuda() print(model) print("Number of parameters: %d" % DeepSpeech.get_param_size(model)) batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() for epoch in range(start_epoch, args.epochs): ######## #scheduler.step() optim_state_now = optimizer.state_dict() print('\nLEARNING RATE: {lr:.6f}'.format(lr=optim_state_now['param_groups'][0]['lr'])) class_accu.reset() class_accu_sum.reset() class_accu_sum_120.reset() class_accu_sum_240.reset() class_accu_sum_360.reset() class_accu_sum_480.reset() ######## model.train() end = time.time() for i, (data) in enumerate(train_loader, start=start_iter): if i == len(train_loader): break ######## """ inputs, targets, input_percentages, target_sizes = data """ inputs, targets, input_percentages, target_sizes, speaker_labels = data ######## # measure data loading time data_time.update(time.time() - end) inputs = Variable(inputs, requires_grad=False) ######## """ target_sizes = Variable(target_sizes, requires_grad=False) targets = Variable(targets, requires_grad=False) """ speaker_labels = Variable(speaker_labels, requires_grad=False) ######## if args.cuda: inputs = inputs.cuda() ######## """ out = model(inputs) """ #temp_random = random.randint(0, (inputs.size(3)-1)-sample_time_steps) #print("INPUT", inputs[...,temp_random:temp_random+sample_time_steps].size(),temp_random, temp_random+sample_time_steps) #out = model(inputs[...,temp_random:temp_random+sample_time_steps]) #print("OUTPUT", out.size()) start = random.randint(0, int((inputs.size(3)-1)*(1-args.sample_proportion))) print("INPUT", inputs.size(3), 
inputs[...,start:start+int((inputs.size(3))*(args.sample_proportion))].size(),start, start+int((inputs.size(3))*(args.sample_proportion))) out = model(inputs[...,start:start+int((inputs.size(3))*(args.sample_proportion))]) print("OUTPUT", out.size()) ######## out = out.transpose(0, 1) # TxNxH ######## speaker_labels = speaker_labels.cuda(async=True).long() # Prints the output of the model in a sequence of probabilities of char for each audio... torch.set_printoptions(profile="full") ####print("OUT: " + str(out.size()), "SPEAKER LABELS:" + str(speaker_labels.size()), "INPUT PERCENTAGES MEAN: " + str(input_percentages.mean())) """ seq_length = out.size(0) sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False) loss = criterion(out, targets, sizes, target_sizes) """ #print(out[:,:,0]) #print("SPEAKER LABELS: " + str(speaker_labels)) #print(out[0][0]) #softmax_output = F.softmax(out).data # This DOES NOT what I want... #softmax_output_alt = flex_softmax(out, axis=2).data # This is FINE!!! <<<=== #print(softmax_output[0][0]) #print(softmax_output_alt[0][0]) ####new_out = torch.sum(out, 0) ####new_out = torch.sum(out[20:], 0) #print(out.size()) #print(new_out.size()) #print(out[-1].size()) ######## ######## if args.loss_type == "reg": #loss_out = out[-1]; loss_speaker_labels = speaker_labels loss_out = out[round(out.size(0)/2)]; loss_speaker_labels = speaker_labels #print("LOSS TYPE = REGULAR") elif args.loss_type == "sum": loss_out = torch.sum(out[loss_begin:loss_end], 0); loss_speaker_labels = speaker_labels #print("LOSS TYPE = SUM") elif args.loss_type == "full": # Don't know if is ok!!! Don't use!!! => loss_out = out.contiguous().view(-1,48); loss_speaker_labels = speaker_labels.repeat(out.size(0)) #speaker_labels = speaker_labels.expand(20, out.size(0)) # Don't know if is ok!!! Don't use!!! 
=> loss_out = out.contiguous().view(-1,48); loss_speaker_labels = speaker_labels.repeat(1, out.size(0)).squeeze() #speaker_labels = speaker_labels.expand(20, out.size(0)) loss_out = out.contiguous()[loss_begin:loss_end].view(-1,48); loss_speaker_labels = speaker_labels.repeat(out.size(0),1)[loss_begin:loss_end].view(-1) #speaker_labels = speaker_labels.expand(20, out.size(0)) #print("LOSS TYPE = FULL") print("LOSS_OUT: " + str(loss_out.size()), "SPEAKER LABELS:" + str(loss_speaker_labels.size())) loss = criterion(loss_out, loss_speaker_labels) ######## loss = loss / inputs.size(0) # average the loss by minibatch loss_sum = loss.data.sum() inf = float("inf") if loss_sum == inf or loss_sum == -inf: print("WARNING: received an inf loss, setting loss value to 0") loss_value = 0 else: loss_value = loss.data[0] avg_loss += loss_value losses.update(loss_value, inputs.size(0)) ######## #if args.stride == 1: multiplier = 6 #if args.stride == 2: multiplier = 3 #if args.stride == 3: multiplier = 2 #if args.stride == 4: multiplier = 1 #(Should be 1.5...) #if args.stride == 5: multiplier = 1 #(Should be 1.25...) 
class_accu.add(out[round(out.size(0)/2)].data, speaker_labels.data) class_accu_sum.add(torch.sum(out, 0).data, speaker_labels.data) #class_accu_sum_120.add(torch.sum(out[1*multiplier:-1*multiplier], 0).data, speaker_labels.data) #class_accu_sum_240.add(torch.sum(out[2*multiplier:-2*multiplier], 0).data, speaker_labels.data) #class_accu_sum_360.add(torch.sum(out[3*multiplier:-3*multiplier], 0).data, speaker_labels.data) #class_accu_sum_480.add(torch.sum(out[4*multiplier:-4*multiplier], 0).data, speaker_labels.data) ####class_accu_sum_120.add(torch.sum(out[round(out.size(0)/2)-1*multiplier:round(out.size(0)/2)+1*multiplier], 0).data, speaker_labels.data) ####class_accu_sum_240.add(torch.sum(out[round(out.size(0)/2)-2*multiplier:round(out.size(0)/2)+2*multiplier], 0).data, speaker_labels.data) ####class_accu_sum_360.add(torch.sum(out[round(out.size(0)/2)-3*multiplier:round(out.size(0)/2)+3*multiplier], 0).data, speaker_labels.data) ####class_accu_sum_480.add(torch.sum(out[round(out.size(0)/2)-4*multiplier:round(out.size(0)/2)+4*multiplier], 0).data, speaker_labels.data) #accu_out3 = torch.sum(flex_softmax(out[20:], axis=2), 0) #print(classaccu.value()[0], classaccu.value()[1]) # Cross Entropy Loss for a Sequence (Time Series) of Output? 
#output = output.view(-1,29) #target = target.view(-1) #criterion = nn.CrossEntropyLoss() #loss = criterion(output,target) ######## # compute gradient optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm) # SGD step optimizer.step() if args.cuda: torch.cuda.synchronize() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if not args.silent: ######## """ print('Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format( (epoch + 1), (i + 1), len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses)) """ print('Epoch: [{0}][{1}/{2}]\t' # 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' # 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'CAR {car:.3f}\t' 'CAR_SUM {car_sum:.3f}\t' #'CAR_SUM_120 {car_sum_120:.3f}\t' #'CAR_SUM_240 {car_sum_240:.3f}\t' #'CAR_SUM_360 {car_sum_360:.3f}\t' #'CAR_SUM_480 {car_sum_480:.3f}\t' .format((epoch + 1), (i + 1), len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, car=class_accu.value()[0], car_sum=class_accu_sum.value()[0], # car_sum_240=class_accu_sum_240.value()[0], car_sum_120=class_accu_sum_120.value()[0], # car_sum_360=class_accu_sum_360.value()[0], car_sum_480=class_accu_sum_480.value()[0] ) ) ######## ######## """ if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0: file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (save_folder, epoch + 1, i + 1) print("Saving checkpoint model to %s" % file_path) torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i, loss_results=loss_results, wer_results=wer_results, cer_results=cer_results, avg_loss=avg_loss), file_path) """ ######## del loss del out ######## del loss_out del speaker_labels del loss_speaker_labels ######## avg_loss 
/= len(train_loader) ######## """ print('Training Summary Epoch: [{0}]\t' 'Average Loss {loss:.3f}\t'.format( epoch + 1, loss=avg_loss)) """ if (best_avg_loss > avg_loss): best_avg_loss = avg_loss print("\nCURRENT EPOCH TRAINING RESULTS:\t", class_accu.value()[0], "\t", class_accu_sum.value()[0],"\t", #class_accu_sum_120.value()[0], class_accu_sum_240.value()[0], class_accu_sum_360.value()[0], "\t", class_accu_sum_480.value()[0], "\n" ) if (best_train_accu < class_accu.value()[0]): best_train_accu = class_accu.value()[0] if (best_train_accu_sum < class_accu_sum.value()[0]): best_train_accu_sum = class_accu_sum.value()[0] #if (best_train_accu_sum_120 < class_accu_sum_120.value()[0]): best_train_accu_sum_120 = class_accu_sum_120.value()[0] #if (best_train_accu_sum_240 < class_accu_sum_240.value()[0]): best_train_accu_sum_240 = class_accu_sum_240.value()[0] #if (best_train_accu_sum_360 < class_accu_sum_360.value()[0]): best_train_accu_sum_360 = class_accu_sum_360.value()[0] #if (best_train_accu_sum_480 < class_accu_sum_480.value()[0]): best_train_accu_sum_480 = class_accu_sum_480.value()[0] get_70 = ((class_accu.value()[0] > 70) or (class_accu_sum.value()[0] > 70) #or (class_accu_sum_120.value()[0] > 70) or (class_accu_sum_240.value()[0] > 70) #or (class_accu_sum_360.value()[0] > 70) or (class_accu_sum_480.value()[0] > 70) ) if ((epoch_70 is None) and (get_70 == True)): epoch_70 = epoch + 1 get_90 = ((class_accu.value()[0] > 90) or (class_accu_sum.value()[0] > 90) #or (class_accu_sum_120.value()[0] > 90) or (class_accu_sum_240.value()[0] > 90) #or (class_accu_sum_360.value()[0] > 90) or (class_accu_sum_480.value()[0] > 90) ) if ((epoch_90 is None) and (get_90 == True)): epoch_90 = epoch + 1 get_95 = ((class_accu.value()[0] > 95) or (class_accu_sum.value()[0] > 95) #or (class_accu_sum_120.value()[0] > 95) or (class_accu_sum_240.value()[0] > 95) #or (class_accu_sum_360.value()[0] > 95) or (class_accu_sum_480.value()[0] > 95) ) if ((epoch_95 is None) and (get_95 == 
True)): epoch_95 = epoch + 1 get_99 = ((class_accu.value()[0] > 99) or (class_accu_sum.value()[0] > 99) #or (class_accu_sum_120.value()[0] > 99) or (class_accu_sum_240.value()[0] > 99) #or (class_accu_sum_360.value()[0] > 99) or (class_accu_sum_480.value()[0] > 99) ) if ((epoch_99 is None) and (get_99 == True)): epoch_99 = epoch + 1 ######## start_iter = 0 # Reset start iteration for next epoch total_cer, total_wer = 0, 0 model.eval() ######## class_accu.reset() class_accu_sum.reset() class_accu_sum_120.reset() class_accu_sum_240.reset() class_accu_sum_360.reset() class_accu_sum_480.reset() ######## for i, (data) in enumerate(test_loader): # test ######## """ inputs, targets, input_percentages, target_sizes = data """ inputs, targets, input_percentages, target_sizes, speaker_labels = data ######## inputs = Variable(inputs, volatile=True) ######## speaker_labels = Variable(speaker_labels, requires_grad=False) speaker_labels = speaker_labels.cuda(async=True).long() """ # unflatten targets split_targets = [] offset = 0 for size in target_sizes: split_targets.append(targets[offset:offset + size]) offset += size """ ######## if args.cuda: inputs = inputs.cuda() out = model(inputs) out = out.transpose(0, 1) # TxNxH ######## speaker_labels = speaker_labels.cuda(async=True).long() # Prints the output of the model in a sequence of probabilities of char for each audio... torch.set_printoptions(profile="full") ########print("OUT: " + str(out.size()), "NEW OUT:" + str(new_out.size()), "SPEAKER LABELS:" + str(speaker_labels.size()), "INPUT PERCENTAGES MEAN: " + str(input_percentages.mean())) #print(out[:,:,0]) #print("SPEAKER LABELS: " + str(speaker_labels)) #print(out[0][0]) #softmax_output = F.softmax(out).data # This DOES NOT what I want... #softmax_output_alt = flex_softmax(out, axis=2).data # This is FINE!!! 
<<<=== #print(softmax_output[0][0]) #print(softmax_output_alt[0][0]) ######## ######## #if args.stride == 1: multiplier = 6 #if args.stride == 2: multiplier = 3 #if args.stride == 3: multiplier = 2 #if args.stride == 4: multiplier = 1 #(Should be 1.5...) #if args.stride == 5: multiplier = 1 #(Should be 1.25...) class_accu.add(out[round(out.size(0)/2)].data, speaker_labels.data) class_accu_sum.add(torch.sum(out, 0).data, speaker_labels.data) class_accu_sum_120.add(torch.sum(out[1*multiplier:-1*multiplier], 0).data, speaker_labels.data) class_accu_sum_240.add(torch.sum(out[2*multiplier:-2*multiplier], 0).data, speaker_labels.data) class_accu_sum_360.add(torch.sum(out[3*multiplier:-3*multiplier], 0).data, speaker_labels.data) class_accu_sum_480.add(torch.sum(out[4*multiplier:-4*multiplier], 0).data, speaker_labels.data) #class_accu_sum_120.add(torch.sum(out[round(out.size(0)/2)-1*multiplier:round(out.size(0)/2)+1*multiplier], 0).data, speaker_labels.data) #class_accu_sum_240.add(torch.sum(out[round(out.size(0)/2)-2*multiplier:round(out.size(0)/2)+2*multiplier], 0).data, speaker_labels.data) #class_accu_sum_360.add(torch.sum(out[round(out.size(0)/2)-3*multiplier:round(out.size(0)/2)+3*multiplier], 0).data, speaker_labels.data) #class_accu_sum_480.add(torch.sum(out[round(out.size(0)/2)-4*multiplier:round(out.size(0)/2)+4*multiplier], 0).data, speaker_labels.data) #accu_out3 = torch.sum(flex_softmax(out[20:], axis=2), 0) #print(classaccu.value()[0], classaccu.value()[1]) # Cross Entropy Loss for a Sequence (Time Series) of Output? 
#output = output.view(-1,29) #target = target.view(-1) #criterion = nn.CrossEntropyLoss() #loss = criterion(output,target) print('Validation Summary Epoch: [{0}]\t' 'CAR {car:.3f}\t' 'CAR_SUM {car_sum:.3f}\t' 'CAR_SUM_120 {car_sum_120:.3f}\t' 'CAR_SUM_240 {car_sum_240:.3f}\t' 'CAR_SUM_360 {car_sum_360:.3f}\t' 'CAR_SUM_480 {car_sum_480:.3f}\t' .format(epoch + 1, car=class_accu.value()[0], car_sum=class_accu_sum.value()[0], car_sum_240=class_accu_sum_240.value()[0], car_sum_120=class_accu_sum_120.value()[0], car_sum_360=class_accu_sum_360.value()[0], car_sum_480=class_accu_sum_480.value()[0] ) ) """ seq_length = out.size(0) sizes = input_percentages.mul_(int(seq_length)).int() decoded_output = decoder.decode(out.data, sizes) target_strings = decoder.process_strings(decoder.convert_to_strings(split_targets)) wer, cer = 0, 0 for x in range(len(target_strings)): wer += decoder.wer(decoded_output[x], target_strings[x]) / float(len(target_strings[x].split())) cer += decoder.cer(decoded_output[x], target_strings[x]) / float(len(target_strings[x])) total_cer += cer total_wer += wer """ ######## if args.cuda: torch.cuda.synchronize() del out ######## """ wer = total_wer / len(test_loader.dataset) cer = total_cer / len(test_loader.dataset) wer *= 100 cer *= 100 loss_results[epoch] = avg_loss wer_results[epoch] = wer cer_results[epoch] = cer print('Validation Summary Epoch: [{0}]\t' 'Average WER {wer:.3f}\t' 'Average CER {cer:.3f}\t'.format( epoch + 1, wer=wer, cer=cer)) """ ######## ######## print("\nCURRENT EPOCH TEST RESULTS:\t", class_accu.value()[0], "\t", class_accu_sum.value()[0], "\t", class_accu_sum_120.value()[0], "\t", class_accu_sum_240.value()[0], "\t", class_accu_sum_360.value()[0], "\t", class_accu_sum_480.value()[0], "\n") if (best_test_accu < class_accu.value()[0]): best_test_accu = class_accu.value()[0] if (best_test_accu_sum < class_accu_sum.value()[0]): best_test_accu_sum = class_accu_sum.value()[0] if (best_test_accu_sum_120 < 
class_accu_sum_120.value()[0]): best_test_accu_sum_120 = class_accu_sum_120.value()[0] if (best_test_accu_sum_240 < class_accu_sum_240.value()[0]): best_test_accu_sum_240 = class_accu_sum_240.value()[0] if (best_test_accu_sum_360 < class_accu_sum_360.value()[0]): best_test_accu_sum_360 = class_accu_sum_360.value()[0] if (best_test_accu_sum_480 < class_accu_sum_480.value()[0]): best_test_accu_sum_480 = class_accu_sum_480.value()[0] print("\nBEST EPOCH TRAINING RESULTS:\t", best_train_accu, "\t", best_train_accu_sum, "\t", best_train_accu_sum_120, "\t", best_train_accu_sum_240, "\t", best_train_accu_sum_360, "\t", best_train_accu_sum_480) print("\nBEST EPOCH TEST RESULTS:\t", best_test_accu, "\t", best_test_accu_sum, "\t", best_test_accu_sum_120, "\t", best_test_accu_sum_240, "\t", best_test_accu_sum_360, "\t", best_test_accu_sum_480) print("\nEPOCHS 70%, 90%, 95%, 99%:\t", epoch_70, "\t", epoch_90, "\t", epoch_95, "\t", epoch_99) print("\nBEST AVERAGE LOSS:\t", best_avg_loss, "\n") ######## ######## """ if args.visdom: # epoch += 1 x_axis = epochs[0:epoch + 1] y_axis = [loss_results[0:epoch + 1], wer_results[0:epoch + 1], cer_results[0:epoch + 1]] for x in range(len(viz_windows)): if viz_windows[x] is None: viz_windows[x] = viz.line( X=x_axis, Y=y_axis[x], opts=opts[x], ) else: viz.line( X=x_axis, Y=y_axis[x], win=viz_windows[x], update='replace', ) if args.tensorboard: info = { 'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer } for tag, val in info.items(): logger.scalar_summary(tag, val, epoch + 1) if args.log_params: for tag, value in model.named_parameters(): tag = tag.replace('.', '/') logger.histo_summary(tag, to_np(value), epoch + 1) logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1) if args.checkpoint: file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1) torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, wer_results=wer_results, cer_results=cer_results), file_path) # 
anneal lr optim_state = optimizer.state_dict() optim_state['param_groups'][0]['lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal optimizer.load_state_dict(optim_state) print('Learning rate annealed to: {lr:.6f}'.format(lr=optim_state['param_groups'][0]['lr'])) if best_wer is None or best_wer > wer: print("Found better validated model, saving to %s" % args.model_path) torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results, wer_results=wer_results, cer_results=cer_results) , args.model_path) best_wer = wer """ ######## avg_loss = 0 if not args.no_bucketing and epoch == 0: print("Switching to bucketing sampler for following epochs") train_dataset = SpectrogramDatasetWithLength(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment) sampler = BucketingSampler(train_dataset) train_loader.sampler = sampler
def convert(parser):
    """Build a DeepSpeech model, optionally load a checkpoint, and export it to ONNX.

    The model is traced with the first batch yielded by the training loader.
    When ``args.continue_from`` names a checkpoint, the ONNX file is written
    next to it using the checkpoint's base name (text before the first ``.``).
    When no checkpoint is given, the file is written to
    ``<args.save_folder>/deepspeech.onnx``.

    Args:
        parser: Argument parser; ``parse_args()`` must provide ``seed``,
            ``save_folder`` and ``continue_from``.

    Model and data settings are read from the module-level ``params`` object.
    The process exits early if the configured RNN type/activation pair is not
    an accepted combination.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    val_batch_size = min(8, params.batch_size_val)
    print("Using bs={} for validation. Parameter found was {}".format(val_batch_size, params.batch_size_val))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    # NOTE(review): the validation loader is not consumed by the export below;
    # it is still constructed so manifest loading behaves as before.
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=val_batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        # Deserialize onto the CPU so a checkpoint saved on a GPU can also be
        # converted on a machine without one; the model is moved to the GPU
        # below when params.cuda is set.
        package = torch.load(args.continue_from,
                             map_location=lambda storage, loc: storage)
        model.load_state_dict(package['state_dict'])

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    ####################################################
    #  Begin ONNX conversion
    ####################################################
    model.eval()

    # Use the first training batch as the example input for the export.
    inputs, _targets, _input_percentages, _target_sizes = next(iter(train_loader))
    inputs = Variable(inputs, requires_grad=False)
    if params.cuda:
        inputs = inputs.cuda()
    print(inputs.size())

    # Name the output after the checkpoint when there is one; otherwise fall
    # back to the save folder instead of failing on an empty/None path.
    if args.continue_from:
        onnx_dir = osp.dirname(args.continue_from)
        onnx_name = osp.basename(args.continue_from).split('.')[0]
    else:
        onnx_dir = save_folder
        onnx_name = "deepspeech"
    onnx_file_path = osp.join(onnx_dir, onnx_name + ".onnx")

    print("Saving new ONNX model to: {}".format(onnx_file_path))
    torch.onnx.export(model,               # model being run
                      inputs,              # model input (or a tuple for multiple inputs)
                      onnx_file_path,      # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      verbose=False)
def main():
    """Train a DeepSpeech model with CTC loss and validate it after every epoch.

    Command-line options come from the module-level ``parser``; model, data and
    optimisation settings come from the module-level ``params`` object.

    Per epoch the function:
      * runs one pass over the training loader with SGD (Nesterov momentum)
        and gradient-norm clipping,
      * evaluates WER/CER on the validation loader via ``eval_model``,
      * optionally writes a per-epoch checkpoint (``args.checkpoint``),
      * divides the learning rate by ``params.learning_anneal``,
      * saves the model to ``args.model_path`` whenever validation WER improves,
      * stops early when ``params.exit_at_acc`` is set and the best WER is at
        or below ``args.acc``.

    When ``args.continue_from`` is given, model/optimizer state and the result
    history are restored from that checkpoint before training resumes.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit()

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit()

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results = torch.Tensor(params.epochs)
    cer_results = torch.Tensor(params.epochs)
    wer_results = torch.Tensor(params.epochs)
    best_wer = None
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1
        # The running loss total is fractional; keep it as a float rather than
        # truncating it to an integer on resume.
        avg_loss = float(package.get('avg_loss', 0))

        if args.start_epoch != -1:
            start_epoch = args.start_epoch

        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    inf = float("inf")

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        # NOTE(review): start=start_iter only offsets the counter `i`; the
        # loader itself still yields batches from its beginning.
        for i, (data) in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            # backward() below is called without an explicit gradient, which
            # presumes a single-element loss, so its sum is that one value.
            # Converting to a Python float keeps the running totals plain
            # numbers regardless of how the tensor is shaped.
            loss_value = float(loss.data.sum())
            if loss_value == inf or loss_value == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            # NOTE(review): an infinite loss is still back-propagated here;
            # only the logged value above is zeroed.
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1), len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      ctc_time=ctc_time,
                      loss=losses))

            # Drop references so the tensors can be released before the next batch.
            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(
                  epoch + 1,
                  loss=avg_loss,
              ))

        start_iter = 0  # Reset start iteration for next epoch
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / params.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                args.model_path)
            best_wer = wer

        avg_loss = 0

        # If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
model = DeepSpeech(rnn_hidden_size=args.hidden_size, nb_layers=args.hidden_layers, labels=labels, rnn_type=supported_rnns[rnn_type], audio_conf=audio_conf, bidirectional=args.bidirectional) parameters = model.parameters() # optimizer = torch.optim.SGD(parameters, lr=args.lr, # momentum=args.momentum, nesterov=True) optimizer = torch.optim.Adam(parameters, lr=args.lr) decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels, normalize=True, augment=args.augment, pitch=args.pitch, whitenoise=args.whitenoise) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels, normalize=True, augment=False, pitch=False, whitenoise=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler(
labels=labels, rnn_type=rnn_type, audio_conf=audio_conf, bidirectional=args.bidirectional, bnm=args.batch_norm_momentum) parameters = model.parameters() optimizer = build_optimizer(args, parameters) enorm = ENorm(model.named_parameters(), optimizer, c=1) criterion = CTCLoss() decoder = GreedyDecoder(labels) train_dataset = SpectrogramDataset(audio_conf=audio_conf, cache_path=args.cache_dir, manifest_filepath=args.train_manifest, labels=labels, normalize=args.norm, augment=args.augment, curriculum_filepath=args.curriculum) test_dataset = SpectrogramDataset(audio_conf=audio_conf, cache_path=args.cache_dir, manifest_filepath=args.val_manifest, labels=labels, normalize=args.norm, augment=False) if args.reverse_sort: # XXX: A hack to test max memory load. train_dataset.ids.reverse() test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
decoder = BeamCTCDecoder(labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers) elif args.decoder == "greedy": decoder = GreedyDecoder(labels, blank_index=labels.index('_')) else: decoder = None target_decoder = GreedyDecoder(labels, blank_index=labels.index('_')) test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels, normalize=True) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) total_cer, total_wer, num_tokens, num_chars = 0, 0, 0, 0 output_data = [] for i, (data) in tqdm(enumerate(test_loader), total=len(test_loader)): inputs, targets, input_percentages, target_sizes = data input_sizes = input_percentages.mul_(int(inputs.size(3))).int() # unflatten targets split_targets = [] offset = 0 for size in target_sizes: split_targets.append(targets[offset:offset + size])
def train_main(args):
    """Train a DeepSpeech model with CTC loss, validating after every epoch.

    The model is either restored from ``args.continue_from`` or built from
    scratch using the labels file and the audio/RNN options on ``args``.
    Each epoch runs one pass over the training loader, then decodes the
    validation set with a greedy decoder to obtain average WER and CER
    (both reported as percentages).  Results can be pushed to Visdom and/or
    TensorBoard, and checkpoints are written per batch, per epoch and
    whenever the validation WER improves.  In distributed runs only the
    process with ``args.rank == 0`` logs to the dashboards and saves models.

    Args:
        args: Namespace-like object holding the run configuration.  Fields
            read here include ``world_size``, ``rank``, ``gpu_rank``,
            ``dist_backend``, ``dist_url``, ``cuda``, ``epochs``, ``lr``,
            ``momentum``, ``max_norm``, ``learning_anneal``, ``batch_size``,
            ``num_workers``, the manifest/label paths, the audio and RNN
            settings, and the logging/checkpoint switches.  The function
            sets ``args.distributed`` as a side effect.

    Returns:
        None.
    """
    args.distributed = args.world_size > 1
    main_proc = True
    if args.distributed:
        if args.gpu_rank:
            torch.cuda.set_device(int(args.gpu_rank))
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
        main_proc = args.rank == 0  # Only the first proc should save models
    save_folder = args.save_folder

    # Per-epoch history; overwritten below when resuming from a checkpoint.
    loss_results = torch.Tensor(args.epochs)
    cer_results = torch.Tensor(args.epochs)
    wer_results = torch.Tensor(args.epochs)
    best_wer = None

    if args.visdom and main_proc:
        from visdom import Visdom

        viz = Visdom()
        opts = dict(title=args.id, ylabel='', xlabel='Epoch', legend=['Loss', 'WER', 'CER'])
        viz_window = None
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard and main_proc:
        os.makedirs(args.log_dir, exist_ok=True)
        from tensorboardX import SummaryWriter

        tensorboard_writer = SummaryWriter(args.log_dir)
    os.makedirs(save_folder, exist_ok=True)

    avg_loss, start_epoch, start_iter = 0, 0, 0
    if args.continue_from:  # Starting from previous model
        print("Loading checkpoint model %s" % args.continue_from)
        # NOTE: torch.load unpickles the file; only resume from trusted checkpoints.
        package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
        model = DeepSpeech.load_model_package(package)
        labels = DeepSpeech.get_labels(model)
        audio_conf = DeepSpeech.get_audio_conf(model)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)
        if not args.finetune:  # Don't want to restart training
            if args.cuda:
                model.cuda()
            optimizer.load_state_dict(package['optim_dict'])
            start_epoch = int(package.get('epoch', 1)) - 1  # Index start at 0 for training
            start_iter = package.get('iteration', None)
            if start_iter is None:
                start_epoch += 1  # We saved model after epoch finished, start at the next epoch.
                start_iter = 0
            else:
                start_iter += 1
            # Keep the fractional part: this is a running sum of batch losses
            # that is later divided by the number of batches.
            avg_loss = float(package.get('avg_loss', 0))
            loss_results = package['loss_results']
            cer_results = package['cer_results']
            wer_results = package['wer_results']
            has_history = package['loss_results'] is not None and start_epoch > 0
            if main_proc and args.visdom and has_history:
                # Add previous scores to visdom graph
                x_axis = epochs[0:start_epoch]
                y_axis = torch.stack(
                    (loss_results[0:start_epoch], wer_results[0:start_epoch], cer_results[0:start_epoch]),
                    dim=1)
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            if main_proc and args.tensorboard and has_history:
                # Previous scores to tensorboard logs
                for past_epoch in range(start_epoch):
                    values = {
                        'Avg Train Loss': loss_results[past_epoch],
                        'Avg WER': wer_results[past_epoch],
                        'Avg CER': cer_results[past_epoch]
                    }
                    tensorboard_writer.add_scalars(args.id, values, past_epoch + 1)
    else:
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                           nb_layers=args.hidden_layers,
                           labels=labels,
                           rnn_type=supported_rnns[rnn_type],
                           audio_conf=audio_conf,
                           bidirectional=args.bidirectional)
        parameters = model.parameters()
        optimizer = torch.optim.SGD(parameters, lr=args.lr,
                                    momentum=args.momentum, nesterov=True)

    criterion = CTCLoss()
    decoder = GreedyDecoder(labels)
    train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest,
                                       labels=labels, normalize=True, augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest,
                                      labels=labels, normalize=True, augment=False)
    if not args.distributed:
        train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size)
    else:
        train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size,
                                                    num_replicas=args.world_size, rank=args.rank)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers, batch_sampler=train_sampler)
    test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad:
        print("Shuffling batches for the following epochs")
        train_sampler.shuffle(start_epoch)

    if args.cuda:
        model.cuda()
        if args.distributed:
            # NOTE(review): device_ids is selected on args.rank (not args.gpu_rank),
            # so rank 0 always passes None -- confirm this is intended.
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=(int(args.gpu_rank),) if args.rank else None)

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    inf = float("inf")

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        # The batch counter starts at start_iter, so a resumed epoch stops
        # after the remaining number of batches rather than a full pass.
        for i, data in enumerate(train_loader, start=start_iter):
            if i == len(train_sampler):
                break
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)

            if args.cuda:
                inputs = inputs.cuda()

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            loss = criterion(out, targets, output_sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            if args.distributed:
                loss_value = reduce_tensor(loss, args.world_size)[0]
            else:
                loss_value = loss.item()

            if loss_value == inf or loss_value == -inf:
                # Back-propagating a non-finite loss would corrupt the weights,
                # so the update is skipped.  In distributed runs loss_value is
                # the reduced value, hence every rank takes the same branch.
                print("WARNING: received an inf loss, setting loss value to 0 "
                      "and skipping the gradient update")
                loss_value = 0
            else:
                # compute gradient
                optimizer.zero_grad()
                loss.backward()

                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
                # SGD step
                optimizer.step()

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), len(train_sampler), batch_time=batch_time,
                          data_time=data_time, loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (i + 1) % args.checkpoint_per_batch == 0 and main_proc:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth' % (save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, iteration=i,
                                                loss_results=loss_results,
                                                wer_results=wer_results, cer_results=cer_results,
                                                avg_loss=avg_loss),
                           file_path)
            # Drop references so the tensors can be freed before the next batch.
            del loss
            del out
        avg_loss /= len(train_sampler)

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, epoch_time=epoch_time, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        with torch.no_grad():
            for data in tqdm(test_loader, total=len(test_loader)):
                inputs, targets, input_percentages, target_sizes = data
                input_sizes = input_percentages.mul_(int(inputs.size(3))).int()

                # unflatten targets
                split_targets = []
                offset = 0
                for size in target_sizes:
                    split_targets.append(targets[offset:offset + size])
                    offset += size

                if args.cuda:
                    inputs = inputs.cuda()

                out, output_sizes = model(inputs, input_sizes)

                decoded_output, _ = decoder.decode(out.data, output_sizes)
                target_strings = decoder.convert_to_strings(split_targets)
                wer, cer = 0, 0
                for decoded, target in zip(decoded_output, target_strings):
                    transcript, reference = decoded[0], target[0]
                    # Each utterance contributes its own error *rate*.
                    wer += decoder.wer(transcript, reference) / float(len(reference.split()))
                    cer += decoder.cer(transcript, reference) / float(len(reference))
                total_cer += cer
                total_wer += wer
                del out

        # Mean per-utterance error rate over the validation set, as a percentage.
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom and main_proc:
            x_axis = epochs[0:epoch + 1]
            y_axis = torch.stack(
                (loss_results[0:epoch + 1], wer_results[0:epoch + 1], cer_results[0:epoch + 1]), dim=1)
            if viz_window is None:
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            else:
                viz.line(
                    X=x_axis.unsqueeze(0).expand(y_axis.size(1), x_axis.size(0)).transpose(0, 1),  # Visdom fix
                    Y=y_axis,
                    win=viz_window,
                    update='replace',
                )
        if args.tensorboard and main_proc:
            values = {
                'Avg Train Loss': avg_loss,
                'Avg WER': wer,
                'Avg CER': cer
            }
            tensorboard_writer.add_scalars(args.id, values, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    tensorboard_writer.add_histogram(tag, to_np(value), epoch + 1)
                    tensorboard_writer.add_histogram(tag + '/grad', to_np(value.grad), epoch + 1)
        if args.checkpoint and main_proc:
            file_path = '%s/deepspeech_%d.pth' % (save_folder, epoch + 1)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results),
                       file_path)

        # anneal lr -- done on every epoch and on every process, independent of
        # checkpointing, so all ranks keep training with the same learning rate.
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] / args.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=optimizer.param_groups[0]['lr']))

        if (best_wer is None or best_wer > wer) and main_proc:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
                                            wer_results=wer_results, cer_results=cer_results),
                       args.model_path)
            best_wer = wer

        # Start the next epoch's running loss from zero regardless of whether
        # this epoch produced a new best model.
        avg_loss = 0
        if not args.no_shuffle:
            print("Shuffling batches...")
            train_sampler.shuffle(epoch)