batch_size=args.batch_size, num_replicas=args.world_size, rank=args.rank) train_loader = AudioDataLoader(train_dataset, num_workers=args.num_workers, batch_sampler=train_sampler) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad: print("Shuffling batches for the following epochs") train_sampler.shuffle(start_epoch) try: model.load_state_dict(torch.load(args.weights)['state_dict'], strict=False) print('using weights') except: print('not using weighs') model = model.to(device) parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True, weight_decay=1e-5) if optim_state is not None: optimizer.load_state_dict(optim_state) model, optimizer = amp.initialize( model,
def main():
    """Train a DeepSpeech model with CTC loss, validating after every epoch.

    Configuration comes from the module-level ``parser``. The function
    optionally resumes from ``args.continue_from``, plots/logs per-epoch
    loss, WER and CER to Visdom and/or TensorBoard, writes per-batch and
    per-epoch checkpoints into ``args.save_folder``, divides the learning
    rate by ``args.learning_anneal`` after each epoch, and finally serializes
    the model to ``args.final_model_path``.

    Raises:
        OSError: if the log or save directory cannot be created for a reason
            other than it already existing.
        AssertionError: if ``args.rnn_type`` is not a key of ``supported_rnns``.
    """
    args = parser.parse_args()
    save_folder = args.save_folder

    # Per-epoch metric history, indexed by zero-based epoch number.
    loss_results = torch.Tensor(args.epochs)
    cer_results = torch.Tensor(args.epochs)
    wer_results = torch.Tensor(args.epochs)

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()
        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch'),
        ]
        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)

    if args.tensorboard:
        from logger import TensorBoardLogger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            print('Directory already exists.')
            # Reusing an existing log directory: remove the regular files
            # left in it. Errors from unlink propagate to the caller.
            for file_name in os.listdir(args.log_dir):
                file_path = os.path.join(args.log_dir, file_name)
                if os.path.isfile(file_path):
                    os.unlink(file_path)
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        print('Directory already exists.')

    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # Assume that we saved a model after an epoch finished, so start
            # at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1
        # The running loss is a float; int() would silently truncate it.
        avg_loss = float(package.get('avg_loss', 0))

        # The result arrays in a checkpoint may be None. Keep the freshly
        # allocated buffers in that case so the per-epoch assignments below
        # still have something to index into.
        saved_results = (package.get('loss_results'),
                         package.get('cer_results'),
                         package.get('wer_results'))
        has_history = all(result is not None for result in saved_results)
        if has_history:
            loss_results, cer_results, wer_results = saved_results

        if args.visdom and has_history and start_epoch > 0:
            # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch],
                wer_results[0:start_epoch],
                cer_results[0:start_epoch],
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if args.tensorboard and has_history and start_epoch > 0:
            # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i],
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
        if not args.no_bucketing:
            print("Using bucketing sampler for the following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0

    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            # enumerate() starts counting at start_iter, so stop once the
            # counter reaches the number of batches in an epoch.
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                # NOTE(review): only the logged value is zeroed; backward()
                # and step() below still run on the infinite loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (
                    i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(
                    DeepSpeech.serialize(model,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         iteration=i,
                                         loss_results=loss_results,
                                         wer_results=wer_results,
                                         cer_results=cer_results,
                                         avg_loss=avg_loss),
                    file_path)
            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for data in test_loader:  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True)

            # unflatten targets: the batch's targets arrive as one flat
            # sequence, target_sizes gives the length of each utterance.
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = input_percentages.mul_(int(seq_length)).int()

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x, reference in enumerate(target_strings):
                hypothesis = decoded_output[x]
                # Edit distances are normalized by reference length
                # (words for WER, characters for CER).
                wer += decoder.wer(hypothesis, reference) / float(
                    len(reference.split()))
                cer += decoder.cer(hypothesis, reference) / float(
                    len(reference))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out

        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1],
                wer_results[0:epoch + 1],
                cer_results[0:epoch + 1],
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad),
                                         epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / args.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer),
               args.final_model_path)
default='hamming', help='Window type for spectrogram generation') parser.add_argument('--cuda', default=True, type=bool, help='Use cuda to train model') args = parser.parse_args() if __name__ == '__main__': package = torch.load(args.model_path) model = DeepSpeech(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'], num_classes=package['nout']) if args.cuda: model = torch.nn.DataParallel(model).cuda() model.load_state_dict(package['state_dict']) audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size, window_stride=args.window_stride, window=args.window) with open(args.labels_path) as label_file: labels = str(''.join(json.load(label_file))) decoder = ArgMaxDecoder(labels) parser = SpectrogramParser(audio_conf, normalize=True) spect = parser.parse_audio(args.audio_path).contiguous() spect = spect.view(1, 1, spect.size(0), spect.size(1)) out = model(Variable(spect)) out = out.transpose(0, 1) # TxNxH decoded_output = decoder.decode(out.data) print(decoded_output[0])
"""Export a trained DeepSpeech checkpoint to an ONNX file."""
import torch
import torch.onnx

from model import DeepSpeech

# The exporter runs the model on the sample input, so the weights and the
# input must live on the same device. Use the GPU when one is available and
# fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A model class instance (class defined in model.py)
pytorch_model = DeepSpeech(
    labels="_'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrs ",
    rnn_hidden_size=128,
    nb_layers=2)

# Load the checkpoint file; the weights are stored under its "state_dict" key.
state_dict = torch.load("./24-6-2019/deepspeech_final.pth",
                        map_location=device)

# Load the weights into the network architecture defined by the class, then
# move the model next to the dummy input.
pytorch_model.load_state_dict(state_dict["state_dict"])
pytorch_model = pytorch_model.to(device)

# Sample input used to trace the model during export.
dummy_input = torch.randn(8, 1, 3316, 3316, device=device)

torch.onnx.export(pytorch_model, dummy_input, "deepspeech.onnx")
def convert(parser):
    """Build a DeepSpeech model from ``params`` and export it to ONNX.

    The model is optionally initialized from ``args.continue_from``, traced
    with the first batch of the training loader, and written next to the
    checkpoint as ``<checkpoint basename up to the first dot>.onnx``.

    Args:
        parser: argument parser; ``parse_args()`` must provide ``seed``,
            ``save_folder`` and ``continue_from``.

    Raises:
        SystemExit: with status 1 when ``params`` requests an unsupported
            RNN type / activation combination.
        OSError: if ``save_folder`` cannot be created for a reason other than
            it already existing.
        AssertionError: if ``params.rnn_type`` is not in ``supported_rnns``.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors exit with a non-zero status so calling scripts can
    # detect the failure.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit(1)

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder
    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        print('Directory already exists.')

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    # Validation batch size is capped at 8.
    val_batch_size = min(8, params.batch_size_val)
    print("Using bs={} for validation. Parameter found was {}".format(
        val_batch_size, params.batch_size_val))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=val_batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])

    if params.cuda:
        # DataParallel(...).cuda() also moves the wrapped module to the GPU.
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    ####################################################
    #  Begin ONNX conversion
    ####################################################
    model.train(False)

    # Input to the model: the first training batch serves as the trace input.
    inputs, targets, input_percentages, target_sizes = next(iter(train_loader))
    inputs = Variable(inputs, requires_grad=False)

    if params.cuda:
        inputs = inputs.cuda()

    print(inputs.size())

    # Export the model
    # NOTE(review): the output path is derived from args.continue_from, so
    # this assumes a checkpoint path was supplied — confirm with callers.
    onnx_file_path = osp.join(
        osp.dirname(args.continue_from),
        osp.basename(args.continue_from).split('.')[0] + ".onnx")
    print("Saving new ONNX model to: {}".format(onnx_file_path))

    # Hand the exporter the underlying module rather than the DataParallel
    # wrapper.
    export_model = model.module if isinstance(model, torch.nn.DataParallel) else model
    torch.onnx.export(export_model,     # model being run
                      inputs,           # model input (or a tuple for multiple inputs)
                      onnx_file_path,   # where to save the model (can be a file or file-like object)
                      export_params=True,  # store the trained parameter weights inside the model file
                      verbose=False)
def main():
    """Train a unidirectional DeepSpeech model and keep the best checkpoint.

    Command-line options come from the module-level ``parser``;
    hyper-parameters come from the module-level ``params``. After every epoch
    the model is evaluated with ``eval_model``, the learning rate is divided
    by ``params.learning_anneal``, and the model is written to
    ``args.model_path`` whenever the validation WER improves. Training stops
    early when ``params.exit_at_acc`` is set and the best WER reaches
    ``args.acc``.

    Raises:
        SystemExit: with status 1 when ``params`` requests an unsupported
            RNN type / activation combination.
        OSError: if ``args.save_folder`` cannot be created for a reason other
            than it already existing.
        AssertionError: if ``params.rnn_type`` is not in ``supported_rnns``.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors exit with a non-zero status so calling scripts can
    # detect the failure.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print(
            "ERROR: GRU does not currently support activations other than tanh"
        )
        sys.exit(1)

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    # Per-epoch metric history, indexed by zero-based epoch number.
    loss_results = torch.Tensor(params.epochs)
    cer_results = torch.Tensor(params.epochs)
    wer_results = torch.Tensor(params.epochs)
    best_wer = None

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        print('Directory already exists.')

    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # Assume that we saved a model after an epoch finished, so start
            # at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1
        # The running loss is a float; int() would silently truncate it.
        avg_loss = float(package.get('avg_loss', 0))

        # An explicit --start_epoch overrides the epoch stored in the
        # checkpoint (-1 means "not given").
        if args.start_epoch != -1:
            start_epoch = args.start_epoch

        # Copy the history of the already-completed epochs into the buffers.
        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            # enumerate() starts counting at start_iter, so stop once the
            # counter reaches the number of batches in an epoch.
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                # NOTE(review): only the logged value is zeroed; backward()
                # and step() below still run on the infinite loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1), len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      ctc_time=ctc_time,
                      loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(
                  epoch + 1,
                  loss=avg_loss,
              ))

        start_iter = 0  # Reset start iteration for next epoch
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / params.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                args.model_path)
            best_wer = wer

        avg_loss = 0

        # If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
def main():
    """Train a bidirectional DeepSpeech model and keep the best checkpoint.

    Command-line options come from the module-level ``parser``;
    hyper-parameters come from the module-level ``params``. After every epoch
    the model is evaluated with ``eval_model``, the learning rate is divided
    by ``params.learning_anneal``, and the model is written to
    ``args.model_path`` whenever the validation WER improves. Training stops
    early when ``params.exit_at_acc`` is set and the best WER reaches
    ``args.acc``.

    Raises:
        SystemExit: with status 1 when ``params`` requests an unsupported
            RNN type / activation combination.
        OSError: if ``args.save_folder`` cannot be created for a reason other
            than it already existing.
        AssertionError: if ``params.rnn_type`` is not in ``supported_rnns``.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors exit with a non-zero status so calling scripts can
    # detect the failure.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit(1)

    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    # Per-epoch metric history, indexed by zero-based epoch number.
    loss_results = torch.Tensor(params.epochs)
    cer_results = torch.Tensor(params.epochs)
    wer_results = torch.Tensor(params.epochs)
    best_wer = None

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
        print('Directory already exists.')

    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=params.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr,
                                momentum=params.momentum,
                                nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        start_iter = package.get('iteration', None)
        if start_iter is None:
            # Assume that we saved a model after an epoch finished, so start
            # at the next epoch.
            start_epoch += 1
            start_iter = 0
        else:
            start_iter += 1
        # The running loss is a float; int() would silently truncate it.
        avg_loss = float(package.get('avg_loss', 0))

        # An explicit --start_epoch overrides the epoch stored in the
        # checkpoint (-1 means "not given").
        if args.start_epoch != -1:
            start_epoch = args.start_epoch

        # Copy the history of the already-completed epochs into the buffers.
        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i, (data) in enumerate(train_loader, start=start_iter):
            # enumerate() starts counting at start_iter, so stop once the
            # counter reaches the number of batches in an epoch.
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=False)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if params.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                # NOTE(review): only the logged value is zeroed; backward()
                # and step() below still run on the infinite loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1), len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      ctc_time=ctc_time,
                      loss=losses))

            del loss
            del out

        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(
                  epoch + 1,
                  loss=avg_loss,
              ))

        start_iter = 0  # Reset start iteration for next epoch
        model.eval()

        wer, cer = eval_model(model, test_loader, decoder)

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                file_path)

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / params.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        if best_wer is None or best_wer > wer:
            print("Found better validated model, saving to %s" % args.model_path)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results),
                args.model_path)
            best_wer = wer

        avg_loss = 0

        # If set to exit at a given accuracy, exit
        if params.exit_at_acc and (best_wer <= args.acc):
            break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")
normalize=True, augment=False) if not args.distributed: train_sampler = BucketingSampler(train_dataset, batch_size=args.batch_size) else: train_sampler = DistributedBucketingSampler(train_dataset, batch_size=args.batch_size, num_replicas=args.world_size, rank=args.rank) train_loader = AudioDataLoader(train_dataset, num_workers=args.num_workers, batch_sampler=train_sampler) test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) if (not args.no_shuffle and start_epoch != 0) or args.no_sorta_grad: print("Shuffling batches for the following epochs") train_sampler.shuffle(start_epoch) try:model.load_state_dict(torch.load(args.weights)['state_dict'], strict = True) except:pass model = model.to(device) parameters = model.parameters() optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=args.momentum, nesterov=True, weight_decay=1e-5) if optim_state is not None: optimizer.load_state_dict(optim_state) model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) if args.distributed: model = DistributedDataParallel(model) print(model)
def main():
    """Train a DeepSpeech model on pre-batched data and print timing/loss stats.

    Reads CLI options from the module-level ``parser`` and hyper-parameters
    from the module-level ``params``. Batches are pulled from
    ``user_defined_input.Batch`` (a pickle file at a hard-coded path) rather
    than from the SpectrogramDataset / AudioDataLoader pipeline, which is
    not used in this script.

    Side effects: creates ``args.save_folder`` if missing, prints progress
    every 20 batches and a summary after every epoch. Exits with status 1
    when the configured RNN type / activation combination is rejected.
    """
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors exit non-zero so calling scripts can detect them.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print("ERROR: GRU does not currently support activations other than tanh")
        sys.exit(1)
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    # Per-epoch history buffers; only populated here when resuming from a
    # checkpoint (this script runs no validation pass).
    loss_results = torch.Tensor(params.epochs)
    cer_results = torch.Tensor(params.epochs)
    wer_results = torch.Tensor(params.epochs)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    criterion = torch.nn.CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    # audio_conf is None because no audio front-end is configured in this
    # script; inputs arrive already batched from the pickle file.
    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=None,
                       bidirectional=True,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    optimizer = torch.optim.SGD(model.parameters(), lr=params.lr,
                                momentum=params.momentum, nesterov=True,
                                weight_decay=params.l2)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get('epoch', 1)) - 1  # Python index start at 0 for training
        if package.get('iteration', None) is None:
            # Assume that we saved a model after an epoch finished, so start
            # at the next epoch. (A mid-epoch checkpoint re-runs its epoch:
            # the batch loop below always iterates over all numBatches.)
            start_epoch += 1
        avg_loss = int(package.get('avg_loss', 0))
        if args.start_epoch != -1:
            start_epoch = args.start_epoch
        loss_results[:start_epoch] = package['loss_results'][:start_epoch]
        cer_results[:start_epoch] = package['cer_results'][:start_epoch]
        wer_results[:start_epoch] = package['wer_results'][:start_epoch]
        print(loss_results)
    else:
        avg_loss = 0
        start_epoch = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()
    forward_time = AverageMeter()
    backward_time = AverageMeter()

    filename = "/scratch/wu636/Lantern/src/out/PLDI19evaluation/deepspeech2/ds2-pytorch/data/test/deepspeech_train.pickle"
    batchedData = user_defined_input.Batch(filename)

    inf = float("inf")

    for epoch in range(start_epoch, params.epochs):
        model.train()
        end = time.time()
        for i in range(batchedData.numBatches):
            inputs, targets, input_percentages, target_sizes = batchedData.batch()
            # torch.from_numpy yields plain tensors that do not require grad,
            # so the legacy Variable(..., requires_grad=False) wrappers are
            # unnecessary.
            inputs = torch.from_numpy(inputs)
            targets = torch.from_numpy(targets)
            input_percentages = torch.from_numpy(input_percentages)
            target_sizes = torch.from_numpy(target_sizes)

            # measure data loading time
            data_time.update(time.time() - end)

            if params.cuda:
                inputs = inputs.cuda()

            # measure forward pass time
            forward_start_time = time.time()
            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            # Scale the per-sample percentages (in place) by the output
            # sequence length to get integer lengths for the CTC loss.
            sizes = input_percentages.mul_(int(seq_length)).int()

            # measure ctc loss computing time
            ctc_start_time = time.time()
            loss = criterion(out, targets, sizes, target_sizes)
            ctc_time.update(time.time() - ctc_start_time)

            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            if loss_sum == inf or loss_sum == -inf:
                # Only the *reported* value is zeroed; the backward pass
                # below still runs on the infinite loss.
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data.item()

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))
            # Forward timing as recorded here also spans the CTC loss above.
            forward_time.update(time.time() - forward_start_time)

            # measure backward pass time
            backward_start_time = time.time()
            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            # clip_grad_norm_ is the in-place successor of the deprecated
            # clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_norm)
            # SGD step
            optimizer.step()

            if params.cuda:
                # Wait for queued GPU work so the timings below are meaningful.
                torch.cuda.synchronize()
            backward_time.update(time.time() - backward_start_time)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if (i + 1) % 20 == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Forward {forward_time.val:.3f} ({forward_time.avg:.3f})\t'
                      'CTC Time {ctc_time.val:.3f} ({ctc_time.avg:.3f})\t'
                      'Backward {backward_time.val:.3f} ({backward_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1), batchedData.numBatches,
                          batch_time=batch_time, data_time=data_time,
                          forward_time=forward_time, ctc_time=ctc_time,
                          backward_time=backward_time, loss=losses))

            # Drop references to this batch's loss and output before the next
            # iteration.
            del loss
            del out

        avg_loss /= batchedData.numBatches

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        # Start the next epoch's running sum from zero; otherwise the
        # previous epoch's average leaks into the next epoch's average.
        avg_loss = 0
def main():
    """Run one evaluation pass of a DeepSpeech model and export the results.

    Reads CLI options from the module-level ``parser`` and hyper-parameters
    from the module-level ``params`` (``params.cuda`` and, optionally,
    ``params.batch_size_val`` are overridden from the CLI). The model is
    evaluated with ``eval_model_verbose`` and the WER, CER and per-trial
    timings are written to ``inference_bs{bs}_i{hold_idx}_gpu{cuda}.csv`` in
    the current working directory.

    Side effects: creates ``args.save_folder`` if missing, writes the CSV
    file, prints a summary. Exits with status 1 when the configured RNN
    type / activation combination is rejected.
    """
    args = parser.parse_args()

    params.cuda = not bool(args.cpu)
    print("Use cuda: {}".format(params.cuda))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Configuration errors exit non-zero so calling scripts can detect them.
    if params.rnn_type == 'gru' and params.rnn_act_type != 'tanh':
        print(
            "ERROR: GRU does not currently support activations other than tanh"
        )
        sys.exit(1)
    if params.rnn_type == 'rnn' and params.rnn_act_type != 'relu':
        print("ERROR: We should be using ReLU RNNs")
        sys.exit(1)

    print("=======================================================")
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")

    save_folder = args.save_folder

    loss_results = torch.Tensor(params.epochs)
    cer_results = torch.Tensor(params.epochs)
    wer_results = torch.Tensor(params.epochs)
    best_wer = None

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise

    # NOTE(review): criterion, train_dataset and train_loader are built but
    # never used below; kept in case their construction matters - confirm.
    criterion = CTCLoss()

    with open(params.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=params.sample_rate,
                      window_size=params.window_size,
                      window_stride=params.window_stride,
                      window=params.window,
                      noise_dir=params.noise_dir,
                      noise_prob=params.noise_prob,
                      noise_levels=(params.noise_min, params.noise_max))

    # Pick the manifest to evaluate: for 'libri' the validation manifest
    # (with a "_held" suffix when a hold index is given), else the test one.
    if args.use_set == 'libri':
        testing_manifest = params.val_manifest + ("_held" if args.hold_idx >= 0 else "")
    else:
        testing_manifest = params.test_manifest

    if args.batch_size_val > 0:
        params.batch_size_val = args.batch_size_val

    print("Testing on: {}".format(testing_manifest))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=params.val_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=params.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=testing_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=params.batch_size,
                                   num_workers=1)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=params.batch_size_val,
                                  num_workers=1)

    rnn_type = params.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"

    model = DeepSpeech(rnn_hidden_size=params.hidden_size,
                       nb_layers=params.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=False,
                       rnn_activation=params.rnn_act_type,
                       bias=params.bias)

    optimizer = torch.optim.SGD(model.parameters(), lr=params.lr,
                                momentum=params.momentum, nesterov=True,
                                weight_decay=params.l2)
    decoder = GreedyDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])

    # The checkpoint's epoch / iteration / avg_loss bookkeeping is not used:
    # the original code computed those values and then unconditionally
    # overwrote them, so evaluation always starts at epoch 0 with zero loss.
    avg_loss = 0
    start_epoch = 0

    if params.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    # NOTE(review): these meters are never updated in this script.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    ctc_time = AverageMeter()

    # The loop body ends in ``break``: at most one evaluation pass runs
    # (none when params.epochs is 0).
    for epoch in range(start_epoch, params.epochs):
        #######################################################################
        # The test script only really cares about this section.
        #######################################################################
        model.eval()

        wer, cer, trials = eval_model_verbose(model, test_loader, decoder,
                                              params.cuda, args.n_trials)
        # Record the result so the final "Best WER" line reports it instead
        # of always printing None.
        best_wer = wer

        root = os.getcwd()
        outfile = osp.join(
            root,
            "inference_bs{}_i{}_gpu{}.csv".format(params.batch_size_val,
                                                  args.hold_idx, params.cuda))
        print("Exporting inference to: {}".format(outfile))
        make_file(outfile)
        write_line(
            outfile,
            "batch times pre normalized by hold_sec =,{}\n".format(
                args.hold_sec))
        write_line(outfile, "wer, {}\n".format(wer))
        write_line(outfile, "cer, {}\n".format(cer))
        write_line(outfile, "bs, {}\n".format(params.batch_size_val))
        write_line(outfile, "hold_idx, {}\n".format(args.hold_idx))
        write_line(outfile, "cuda, {}\n".format(params.cuda))
        write_line(outfile,
                   "avg batch time, {}\n".format(trials.avg / args.hold_sec))

        # Percentiles of the trial timings, divided by the validation batch
        # size and by hold_sec.
        percentile_50 = np.percentile(
            trials.array, 50) / params.batch_size_val / args.hold_sec
        write_line(outfile, "50%-tile latency, {}\n".format(percentile_50))
        percentile_99 = np.percentile(
            trials.array, 99) / params.batch_size_val / args.hold_sec
        write_line(outfile, "99%-tile latency, {}\n".format(percentile_99))
        write_line(outfile, "through put, {}\n".format(1 / percentile_50))

        write_line(outfile, "data\n")
        for trial in trials.array:
            write_line(outfile, "{}\n".format(trial / args.hold_sec))

        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0]['lr'] = (
            optim_state['param_groups'][0]['lr'] / params.learning_anneal)
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        break

    print("=======================================================")
    print("***Best WER = ", best_wer)
    for arg in vars(args):
        print("***%s = %s " % (arg.ljust(25), getattr(args, arg)))
    print("=======================================================")