Example No. 1
def test(model_name, filename, idx_from, idx_to, logfile):
    print('Read test data')
    image_test, title_test = read_data(filename=filename, limit=idx_to)
    list_index_test = list(range(len(image_test)))[idx_from:]

    net = torch.load(model_name)
    print('Net: \n', net)
    global decoder
    decoder = ArgMaxDecoder(char_dataset, decode_lookup)

    wer, cer = testing(net,
                       image_test,
                       title_test,
                       list_index_test,
                       logfile=logfile)
    print(wer, cer)
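
ArgMaxDecoder performs greedy CTC decoding: take the most likely label at each timestep, merge consecutive repeats, then drop the blank symbol. A minimal sketch of that collapse, assuming the blank label sits at index 0 (char_dataset and decode_lookup are repo-specific lookup tables, not reproduced here):

import torch

def greedy_ctc_decode(probs, labels, blank_index=0):
    # probs: T x C per-timestep scores; labels: index -> character mapping.
    best_path = torch.argmax(probs, dim=1).tolist()
    chars = []
    prev = blank_index
    for idx in best_path:
        # Merge consecutive repeats, then drop blanks.
        if idx != prev and idx != blank_index:
            chars.append(labels[idx])
        prev = idx
    return ''.join(chars)

For a T x N x H batch like the networks below emit, sample n decodes as greedy_ctc_decode(out[:, n, :], labels).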
Example No. 2
def init(model_name, old_epochs):
    global criterion
    criterion = CTCLoss()

    h0 = Variable(torch.rand(nLayer, batch_size, nHidden))
    c0 = Variable(torch.rand(nLayer, batch_size, nHidden))
    global state0
    state0 = (h0, c0)

    global net
    if old_epochs == 0:
        print('Create model ', model_name)
        net = Net(nIn, nHidden, nLayer, nOut)
    else:
        old_modelname = '%s_%02d.pth' % (model_name, old_epochs)
        print('Load ', old_modelname)
        net = torch.load(old_modelname)

    global optimizer
    optimizer = torch.optim.Adadelta(net.parameters())
    # optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.99)
    global decoder
    decoder = ArgMaxDecoder(char_dataset, decode_lookup)
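
The CTCLoss constructed here (in these projects, typically the warp-ctc binding) is fed T x N x H activations, flattened integer targets, and per-sample length tensors by the training loops below. For comparison, a minimal sketch of the same loss using the torch.nn.CTCLoss that later PyTorch releases ship; every shape and value below is illustrative, not taken from this repo:

import torch
import torch.nn as nn

T, N, C = 50, 4, 29  # timesteps, batch size, alphabet size (blank at index 0)
log_probs = torch.randn(T, N, C, requires_grad=True).log_softmax(2)
targets = torch.randint(1, C, (N, 10), dtype=torch.long)  # labels, no blanks
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)

criterion = nn.CTCLoss(blank=0)
loss = criterion(log_probs, targets, input_lengths, target_lengths)
loss.backward()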
Example No. 3
                    default='hamming',
                    help='Window type for spectrogram generation')
parser.add_argument('--cuda',
                    default=True,
                    # argparse's type=bool treats any non-empty string as
                    # True, so parse the flag value explicitly.
                    type=lambda s: str(s).lower() in ('true', '1'),
                    help='Use cuda to train model')
args = parser.parse_args()

if __name__ == '__main__':
    package = torch.load(args.model_path)
    model = DeepSpeech(rnn_hidden_size=package['hidden_size'],
                       nb_layers=package['hidden_layers'],
                       num_classes=package['nout'])
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(package['state_dict'])
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)
    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    decoder = ArgMaxDecoder(labels)
    parser = SpectrogramParser(audio_conf, normalize=True)
    spect = parser.parse_audio(args.audio_path).contiguous()
    spect = spect.view(1, 1, spect.size(0), spect.size(1))
    out = model(Variable(spect))
    out = out.transpose(0, 1)  # TxNxH
    decoded_output = decoder.decode(out.data)
    print(decoded_output[0])
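
Two layout fixups carry this inference path: the spectrogram gains batch and channel dimensions for the convolutional front end, and the output is transposed from batch-major N x T x H to time-major T x N x H before decoding. A toy illustration with made-up dimensions:

import torch

spect = torch.randn(161, 300)  # freq bins x time frames
spect = spect.view(1, 1, spect.size(0), spect.size(1))  # -> 1 x 1 x 161 x 300

out = torch.randn(1, 150, 29)  # model output: N x T x H
out = out.transpose(0, 1)      # -> 150 x 1 x 29, as the decoder expects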
Example No. 4
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    # Allocate result tensors up front so the per-epoch assignments below
    # work even when visdom is disabled.
    loss_results = torch.Tensor(args.epochs)
    cer_results = torch.Tensor(args.epochs)
    wer_results = torch.Tensor(args.epochs)
    if args.visdom:
        from visdom import Visdom
        viz = Visdom(server=args.visdom_server)

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        epochs = torch.arange(1, args.epochs + 1)
    if args.tensorboard:
        from logger import Logger
        try:
            os.makedirs(args.log_dir)
        except OSError as e:
            if e.errno == errno.EEXIST:
                print('Directory already exists.')
                for file in os.listdir(args.log_dir):
                    file_path = os.path.join(args.log_dir, file)
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
            else:
                raise
        logger = TensorBoardLogger(args.log_dir)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))
    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window,
                      noise_dir=args.noise_dir,
                      noise_prob=args.noise_prob,
                      noise_levels=(args.noise_min, args.noise_max))

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True,
                                       augment=args.augment)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True,
                                      augment=False)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    rnn_type = args.rnn_type.lower()
    assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       labels=labels,
                       rnn_type=supported_rnns[rnn_type],
                       audio_conf=audio_conf,
                       bidirectional=True)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)
    decoder = ArgMaxDecoder(labels)

    if args.continue_from:
        print("Loading checkpoint model %s" % args.continue_from)
        package = torch.load(args.continue_from)
        model.load_state_dict(package['state_dict'])
        optimizer.load_state_dict(package['optim_dict'])
        start_epoch = int(package.get(
            'epoch', 1)) - 1  # epochs are 0-indexed internally
        start_iter = package.get('iteration', None)
        if start_iter is None:
            start_epoch += 1  # Assume that we saved a model after an epoch finished, so start at the next epoch.
            start_iter = 0
        else:
            start_iter += 1
        avg_loss = int(package.get('avg_loss', 0))
        loss_results, cer_results, wer_results = package[
            'loss_results'], package['cer_results'], package['wer_results']
        if (args.visdom and package['loss_results'] is not None
                and start_epoch > 0):  # Add previous scores to visdom graph
            x_axis = epochs[0:start_epoch]
            y_axis = [
                loss_results[0:start_epoch], wer_results[0:start_epoch],
                cer_results[0:start_epoch]
            ]
            for x in range(len(viz_windows)):
                viz_windows[x] = viz.line(
                    X=x_axis,
                    Y=y_axis[x],
                    opts=opts[x],
                )
        if (args.tensorboard and package['loss_results'] is not None
                and start_epoch > 0):  # Previous scores to tensorboard logs
            for i in range(start_epoch):
                info = {
                    'Avg Train Loss': loss_results[i],
                    'Avg WER': wer_results[i],
                    'Avg CER': cer_results[i]
                }
                for tag, val in info.items():
                    logger.scalar_summary(tag, val, i + 1)
    else:
        avg_loss = 0
        start_epoch = 0
        start_iter = 0
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    inf = float("inf")

    for epoch in range(start_epoch, args.epochs):
        model.train()
        end = time.time()
        for i, data in enumerate(train_loader, start=start_iter):
            if i == len(train_loader):
                break
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs, requires_grad=True)
            target_sizes = Variable(target_sizes, requires_grad=False)
            targets = Variable(targets, requires_grad=False)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             requires_grad=False)

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()

            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            if args.cuda:
                torch.cuda.synchronize()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))
            if args.checkpoint_per_batch > 0 and i > 0 and (
                    i + 1) % args.checkpoint_per_batch == 0:
                file_path = '%s/deepspeech_checkpoint_epoch_%d_iter_%d.pth.tar' % (
                    save_folder, epoch + 1, i + 1)
                print("Saving checkpoint model to %s" % file_path)
                torch.save(
                    DeepSpeech.serialize(model,
                                         optimizer=optimizer,
                                         epoch=epoch,
                                         iteration=i,
                                         loss_results=loss_results,
                                         wer_results=wer_results,
                                         cer_results=cer_results,
                                         avg_loss=avg_loss), file_path)
            del loss
            del out
        avg_loss /= len(train_loader)

        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        start_iter = 0  # Reset start iteration for next epoch
        total_cer, total_wer = 0, 0
        model.eval()
        for i, data in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs, volatile=True, requires_grad=False)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int(),
                             volatile=True,
                             requires_grad=False)

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x]))
            total_cer += cer
            total_wer += wer

            if args.cuda:
                torch.cuda.synchronize()
            del out
        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100
        loss_results[epoch] = avg_loss
        wer_results[epoch] = wer
        cer_results[epoch] = cer
        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.3f}\t'
              'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            # epoch += 1
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1], wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.tensorboard:
            info = {'Avg Train Loss': avg_loss, 'Avg WER': wer, 'Avg CER': cer}
            for tag, val in info.items():
                logger.scalar_summary(tag, val, epoch + 1)
            if args.log_params:
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, to_np(value), epoch + 1)
                    logger.histo_summary(tag + '/grad', to_np(value.grad),
                                         epoch + 1)
        if args.checkpoint:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(
                DeepSpeech.serialize(model,
                                     optimizer=optimizer,
                                     epoch=epoch,
                                     loss_results=loss_results,
                                     wer_results=wer_results,
                                     cer_results=cer_results), file_path)
        # anneal lr
        optim_state = optimizer.state_dict()
        optim_state['param_groups'][0][
            'lr'] = optim_state['param_groups'][0]['lr'] / args.learning_anneal
        optimizer.load_state_dict(optim_state)
        print('Learning rate annealed to: {lr:.6f}'.format(
            lr=optim_state['param_groups'][0]['lr']))

        avg_loss = 0
        if not args.no_bucketing and epoch == 0:
            print("Switching to bucketing sampler for following epochs")
            train_dataset = SpectrogramDatasetWithLength(
                audio_conf=audio_conf,
                manifest_filepath=args.train_manifest,
                labels=labels,
                normalize=True,
                augment=args.augment)
            sampler = BucketingSampler(train_dataset)
            train_loader.sampler = sampler

    torch.save(DeepSpeech.serialize(model, optimizer=optimizer),
               args.final_model_path)
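
decoder.wer and decoder.cer in the validation loop return raw edit distances, which the loop then normalizes by the reference length in words and characters respectively. A minimal sketch of that metric, assuming a plain Levenshtein distance (these helpers are illustrative, not the repo's implementation; they fold the normalization in):

def levenshtein(a, b):
    # Dynamic-programming edit distance between two sequences.
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (x != y)))  # substitution
        prev = curr
    return prev[-1]

def wer(hypothesis, reference):
    return levenshtein(hypothesis.split(), reference.split()) / float(
        len(reference.split()))

def cer(hypothesis, reference):
    return levenshtein(hypothesis, reference) / float(len(reference))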
Example No. 5
                    default=20,
                    type=int,
                    help='Batch size for training')
parser.add_argument('--num_workers',
                    default=4,
                    type=int,
                    help='Number of workers used in dataloading')
args = parser.parse_args()

if __name__ == '__main__':
    model = DeepSpeech.load_model(args.model_path, cuda=args.cuda)
    model.eval()

    labels = DeepSpeech.get_labels(model)
    audio_conf = DeepSpeech.get_audio_conf(model)
    decoder = ArgMaxDecoder(labels)

    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)
    total_cer, total_wer = 0, 0
    for i, data in enumerate(test_loader):
        inputs, targets, input_percentages, target_sizes = data

        inputs = Variable(inputs)

        # unflatten targets
Example No. 6
def main():
    args = parser.parse_args()
    save_folder = args.save_folder

    if args.visdom:
        from visdom import Visdom
        viz = Visdom()

        opts = [
            dict(title='Loss', ylabel='Loss', xlabel='Epoch'),
            dict(title='WER', ylabel='WER', xlabel='Epoch'),
            dict(title='CER', ylabel='CER', xlabel='Epoch')
        ]

        viz_windows = [None, None, None]
        loss_results = torch.Tensor(args.epochs)
        cer_results = torch.Tensor(args.epochs)
        wer_results = torch.Tensor(args.epochs)
        epochs = torch.arange(1, args.epochs + 1)

    try:
        os.makedirs(save_folder)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Directory already exists.')
        else:
            raise
    criterion = CTCLoss()

    with open(args.labels_path) as label_file:
        labels = str(''.join(json.load(label_file)))

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride,
                      window=args.window)

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       manifest_filepath=args.train_manifest,
                                       labels=labels,
                                       normalize=True)
    test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                      manifest_filepath=args.val_manifest,
                                      labels=labels,
                                      normalize=True)
    train_loader = AudioDataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)
    test_loader = AudioDataLoader(test_dataset,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers)

    model = DeepSpeech(rnn_hidden_size=args.hidden_size,
                       nb_layers=args.hidden_layers,
                       num_classes=len(labels))
    decoder = ArgMaxDecoder(labels)
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()
    print(model)
    parameters = model.parameters()
    optimizer = torch.optim.SGD(parameters,
                                lr=args.lr,
                                momentum=args.momentum,
                                nesterov=True)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(args.epochs):
        model.train()
        end = time.time()
        avg_loss = 0
        for i, data in enumerate(train_loader):
            inputs, targets, input_percentages, target_sizes = data
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = Variable(inputs)
            target_sizes = Variable(target_sizes)
            targets = Variable(targets)

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH

            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            loss = criterion(out, targets, sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch

            loss_sum = loss.data.sum()
            inf = float("inf")
            if loss_sum == inf or loss_sum == -inf:
                print("WARNING: received an inf loss, setting loss value to 0")
                loss_value = 0
            else:
                loss_value = loss.data[0]

            avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # compute gradient
            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), args.max_norm)
            # SGD step
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if not args.silent:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          (epoch + 1), (i + 1),
                          len(train_loader),
                          batch_time=batch_time,
                          data_time=data_time,
                          loss=losses))

        avg_loss /= len(train_loader)
        print('Training Summary Epoch: [{0}]\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1, loss=avg_loss))

        total_cer, total_wer = 0, 0
        for i, data in enumerate(test_loader):  # test
            inputs, targets, input_percentages, target_sizes = data

            inputs = Variable(inputs)

            # unflatten targets
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size

            if args.cuda:
                inputs = inputs.cuda()

            out = model(inputs)
            out = out.transpose(0, 1)  # TxNxH
            seq_length = out.size(0)
            sizes = Variable(input_percentages.mul_(int(seq_length)).int())

            decoded_output = decoder.decode(out.data, sizes)
            target_strings = decoder.process_strings(
                decoder.convert_to_strings(split_targets))
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                wer += decoder.wer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x].split()))
                cer += decoder.cer(decoded_output[x],
                                   target_strings[x]) / float(
                                       len(target_strings[x]))
            total_cer += cer
            total_wer += wer

        wer = total_wer / len(test_loader.dataset)
        cer = total_cer / len(test_loader.dataset)
        wer *= 100
        cer *= 100

        print('Validation Summary Epoch: [{0}]\t'
              'Average WER {wer:.0f}\t'
              'Average CER {cer:.0f}\t'.format(epoch + 1, wer=wer, cer=cer))

        if args.visdom:
            loss_results[epoch] = avg_loss
            wer_results[epoch] = wer
            cer_results[epoch] = cer
            x_axis = epochs[0:epoch + 1]
            y_axis = [
                loss_results[0:epoch + 1], wer_results[0:epoch + 1],
                cer_results[0:epoch + 1]
            ]
            for x in range(len(viz_windows)):
                if viz_windows[x] is None:
                    viz_windows[x] = viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        opts=opts[x],
                    )
                else:
                    viz.line(
                        X=x_axis,
                        Y=y_axis[x],
                        win=viz_windows[x],
                        update='replace',
                    )
        if args.epoch_save:
            file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
            torch.save(checkpoint(model, args, len(labels), epoch), file_path)
    torch.save(checkpoint(model, args, len(labels)), args.final_model_path)
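
Unlike Example No. 4, this variant never anneals the learning rate; Example No. 4 does it by round-tripping the optimizer's state dict. An equivalent, more direct sketch mutates param_groups in place:

# Equivalent to the state-dict round trip in Example No. 4.
for group in optimizer.param_groups:
    group['lr'] /= args.learning_anneal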
Example No. 7
parser.add_argument('--inference_file', default=None,
                    help='saves results in inference_file.')
args = parser.parse_args()

if args.model_file is None:
    raise ArgumentError("A model file is required for evaluation")

if "val" not in args.manifest:
    raise ArgumentError("Please provide an argument of the form:\n" + \
                        "--manifest val:/path/to/validation/manifest.csv")

# Setup parameters for argmax decoder
alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
beam_alphabet = alphabet.lower().replace(' ', '#')
nout = len(alphabet)
argmax_decoder = ArgMaxDecoder(alphabet, space_index=alphabet.index(" "))
beam_decoder = BeamDecoder('./model.kenlm', beam_alphabet,
                           space_index=alphabet.index(" "))

# Initialize our backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# Setup dataloader
eval_manifest = args.manifest['val']
if not os.path.exists(eval_manifest):
    raise IOError("Manifest file {} not found".format(eval_manifest))

# Setup required dataloader parameters
nbands = 13
max_utt_len = 30
max_tscrpt_len = 1300
Example No. 8
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(PBStreamHandler(level=logging.DEBUG))

    # Data parameters
    alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ "
    nout = 29
    frame_stride = 0.01
    max_utt_len = ((int(args.max_length / frame_stride) - 1) // args.str_w) + 1
    # max_lbl_len = 409
    max_lbl_len = (max_utt_len - 1) // 2
    nbands = 13
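    # Worked instance of the arithmetic above, under assumed values
    # (max_length = 7.5 s, str_w = 2; both hypothetical):
    #   frames      = 7.5 / 0.01            -> 750 spectrogram frames
    #   max_utt_len = ((750 - 1) // 2) + 1  -> 375 model output steps
    #   max_lbl_len = (375 - 1) // 2        -> 187 label characters at most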

    # Initialize the decoder
    decoder = ArgMaxDecoder(alphabet=alphabet,
                            blank_index=alphabet.index("_"),
                            space_index=alphabet.index(" "))

    # TODO: Add this back. Right now it's fine to default to smaller datasets
    # if (args.manifest_train is None) or (args.manifest_val is None):
    #     raise ValueError("For real data, you must provide 'manifest_train' "
    #                      "and 'manifest_val'.")

    logger.debug("Getting librispeech data")
    train_manifest = Librispeech(manifest_file=args.manifest_train,
                                 path=args.data_dir,
                                 version="dev-clean").load_data()

    train_set = make_aeon_dataloader(train_manifest,
                                     args.max_length,
                                     max_lbl_len,